Tables and Basic Queries

행: observations (또는 items, records)
열: variables (또는 attributes)

import sqlite3 as db
conn = db.connect('example.db')  # database 연결
c = conn.cursor()  # 명령실행 위한 cursor()를 c로 임의 저장
c.execute("DROP TABLE IF EXISTS Students")  # 이미 존재하는 'Student' 테이블있다면 삭제
c.execute("CREATE TABLE Students (id INTEGER, name TEXT)") # 2개 열 가진 테이블생성
<sqlite3.Cursor at 0x2894e350880>

INSERT INTO

item 추가

c.execute("INSERT INTO Students VALUES (02, 'Andy')")
c.execute("INSERT INTO Students VALUES (09, 'Bill')")
c.execute("INSERT INTO Students VALUES (11, 'Chris')")
c.execute("INSERT INTO Students VALUES (13, 'Dan')")
<sqlite3.Cursor at 0x2894e350880>

excutemany()

item 추가

# 튜플 리스트로 추가하기
add_students = [(14, 'Ethan'),
                 (22, 'Frank'),
                 (31, 'Gavin'),
                 (37, 'Henry'),
                 (42, 'James')]

c.executemany('INSERT INTO Students VALUES (?, ?)', add_students)
conn.commit()  # 변경내용을 database에 반영하겠다고 확정하는 명령어

SELECT

조회

c.execute("SELECT * FROM Students")
results = c.fetchall()
print("\nThe entries of Students:\n", results, "Results:", len(results))
The entries of Students:
 [(2, 'Andy'), (9, 'Bill'), (11, 'Chris'), (13, 'Dan'), (14, 'Ethan'), (22, 'Frank'), (31, 'Gavin'), (37, 'Henry'), (42, 'James')] Results: 9

Join

INNER JOIN(A, B) A와 B 교집합
OUTER JOIN(A, B) A와 B 합집합 (missing value는 Null/SQL, NaN/Pandas로 채움)
LEFT JOIN(A, B) A 기준
RIGHT JOIN(A, B) B 기준

Where

조건

# Grades테이블 생성
c.execute('DROP TABLE IF EXISTS Grades')
c.execute('CREATE TABLE Grades (id INTEGER, course TEXT, grade REAL)')

add_items = [(9, "Arts", 4.0),
           (9, "Ethics", 3.0),
           (9, "History", 1.0),
           (13, "Arts", 4.0),
           (13, "Ethics", 4.0),
           (13, "History", 4.0),
           (31, "Arts", 2.0),
           (31, "History", 3.0)
           ]

c.executemany('INSERT INTO Grades VALUES (?, ?, ?)', add_items)
conn.commit()
# 예제: Grades테이블에서 id 대신 Students테이블의 name을 불러오시오
query = '''
        SELECT Students.name, Grades.course, Grades.grade
        FROM Students, Grades
        WHERE Students.id = Grades.id
'''

for match in c.execute(query):
    print(match)
('Bill', 'Arts', 4.0)
('Bill', 'Ethics', 3.0)
('Bill', 'History', 1.0)
('Dan', 'Arts', 4.0)
('Dan', 'Ethics', 4.0)
('Dan', 'History', 4.0)
('Gavin', 'Arts', 2.0)
('Gavin', 'History', 3.0)
# 예제: History과목의 (name, grade)으로 이루어진 리스트 생성하시오
query = '''
        SELECT Students.name, Grades.grade
        FROM Students, Grades
        WHERE Students.id = Grades.id and Grades.course = 'History'
'''
c.execute(query)
output = c.fetchall()
output
[('Bill', 1.0), ('Dan', 4.0), ('Gavin', 3.0)]
# 예제: Students테이블을 기준으로 LEFT JOIN하여 name, grade 선택하시오
query = '''
        SELECT Students.name, Grades.grade
        FROM Students LEFT JOIN Grades ON
        Students.id = Grades.id
'''
c.execute(query)
matches = c.fetchall()
for index, match in enumerate(matches):
    print(index, "=>", match)
0 => ('Andy', None)
1 => ('Bill', 1.0)
2 => ('Bill', 3.0)
3 => ('Bill', 4.0)
4 => ('Chris', None)
5 => ('Dan', 4.0)
6 => ('Dan', 4.0)
7 => ('Dan', 4.0)
8 => ('Ethan', None)
9 => ('Frank', None)
10 => ('Gavin', 2.0)
11 => ('Gavin', 3.0)
12 => ('Henry', None)
13 => ('James', None)

Aggregations (그룹별 계산)

AVG MIN MAX SUM COUNT

# id별로 평균 grade 계산
query = '''
        SELECT id, AVG(grade)
        FROM Grades
        GROUP BY id
'''
for match in c.execute(query):
    print(match)
(9, 2.6666666666666665)
(13, 4.0)
(31, 2.5)

Clean up

cursor, connection 닫기

c.close()
conn.close()

Data setup

참고 데이터: NYC 311 calls (datasets of complaints filed by residents of New York City via 311 calls)

dataset = db.connect("C:/Users/NYC-311-2M.db", isolation_level=None)  # DB파일 생성 및 연결

LIMIT

부분 조회

query = '''
        SELECT * FROM data
        LIMIT 4               
'''                             # 4개 행만 불러오기
import pandas as pd
df = pd.read_sql_query(query, dataset)
df.head()
index CreatedDate ClosedDate Agency ComplaintType Descriptor City
0 1 2015-09-15 02:14:04.000000 None NYPD Illegal Parking Blocked Hydrant None
1 2 2015-09-15 02:12:49.000000 None NYPD Noise - Street/Sidewalk Loud Talking NEW YORK
2 3 2015-09-15 02:11:19.000000 None NYPD Noise - Street/Sidewalk Loud Talking NEW YORK
3 4 2015-09-15 02:09:46.000000 None NYPD Noise - Commercial Loud Talking BRONX

DISTINCT

unique value 찾기 (고유값)

query = 'SELECT DISTINCT Agency FROM data'
df = pd.read_sql_query(query, dataset)

print("Found {} unique Agencies. The first few are:".format(len(df)))
df.head()
Found 50 unique Agencies. The first few are:
Agency
0 NYPD
1 DHS
2 DOT
3 DOHMH
4 CHALL

GROUP BY

그룹핑

# ComplaintType별로 묶기
query = '''
    SELECT ComplaintType, City, Agency
    FROM data
    GROUP BY ComplaintType
'''
df = pd.read_sql_query(query, dataset)
print(df.shape)
df.head()
(200, 3)
ComplaintType City Agency
0 AGENCY BRONX HPD
1 APPLIANCE BRONX HPD
2 Adopt-A-Basket Springfield Gardens DSNY
3 Agency Issues None DOT
4 Air Quality NEW YORK DEP

그룹별 집합/계산

# ComplaintType별로 카운트한 7개 결과 출력
query = '''
    SELECT ComplaintType, COUNT(*)
    FROM data
    GROUP BY ComplaintType
    LIMIT 7
'''

df = pd.read_sql_query(query, dataset)
df
ComplaintType COUNT(*)
0 AGENCY 2
1 APPLIANCE 11263
2 Adopt-A-Basket 50
3 Agency Issues 7428
4 Air Quality 8151
5 Animal Abuse 10614
6 Animal Facility - No Permit 78

UPPER(), LOWER()

대문자/소문자/혼합형 변경

query = '''
    SELECT UPPER(Agency), LOWER(ComplaintType), LOWER(Descriptor)
    FROM data
    GROUP BY LOWER(ComplaintType)
    LIMIT 10
'''

df = pd.read_sql_query(query, dataset)
df.head()
UPPER(Agency) LOWER(ComplaintType) LOWER(Descriptor)
0 DSNY adopt-a-basket 10a adopt-a-basket
1 HPD agency housing quality standards
2 DOT agency issues bike share
3 DEP air quality air: odor/fumes, vehicle idling (ad3)
4 NYPD animal abuse other (complaint details)

HAVING

그룹화된 필드에 조건을 줄 때
WHERE 기본적인 조건절로서 그룹핑 전 모든 필드에 조건 줄 수 있음

# 혼합형과 대소문자 중복
query1 = '''
      SELECT ComplaintType, COUNT(*)
      FROM (SELECT DISTINCT ComplaintType FROM data)
      GROUP BY LOWER(ComplaintType)
      HAVING COUNT(*) >= 2
'''
df1 = pd.read_sql_query(query1, dataset)
df1
ComplaintType COUNT(*)
0 Elevator 2
1 PLUMBING 2

IN

특정한 레코드 값 가져올때

query = '''
      SELECT DISTINCT ComplaintType
      FROM data
      WHERE LOWER(ComplaintType) IN ("plumbing", "elevator")
'''
df = pd.read_sql_query(query, dataset)
df.head()
ComplaintType
0 PLUMBING
1 Elevator
2 Plumbing
3 ELEVATOR

AS

테이블 열 이름변경

query = '''
    SELECT Agency AS AGC, COUNT(*) AS NumComp
    FROM data
    GROUP BY Agency
'''
df = pd.read_sql_query(query, dataset)
df.head()
AGC NumComp
0 3-1-1 1289
1 ACS 3
2 AJC 6
3 CAU 1
4 CCRB 1

ORDER BY

정렬 (내림차순 -, DESC 사용)

query = '''
    SELECT Agency AS AGC, COUNT(*) AS NumComp
    FROM data
    GROUP BY UPPER(Agency)
    ORDER BY -NumComp
'''
# 또는 ORDER BY NumComplaints DESC 

df = pd.read_sql_query(query, dataset)
df.head()
AGC NumComp
0 HPD 640096
1 NYPD 340694
2 DOT 322969
3 DEP 181121
4 DSNY 152004

LIKE %word%

word가 포함된 value값을 모두 조회

query = '''
    SELECT LOWER(ComplaintType) AS type, COUNT(*) AS freq
    FROM data
    WHERE LOWER(ComplaintType) LIKE '%noise%'
    GROUP BY type
    ORDER BY -freq'''

df_noisy = pd.read_sql_query(query, dataset)
print("'noise' 포함된 {}개 쿼리".format(len(df_noisy)))
df_noisy
'noise' 포함된 8개 쿼리
type freq
0 noise 54165
1 noise - street/sidewalk 48436
2 noise - commercial 42422
3 noise - vehicle 18370
4 noise - park 4020
5 noise - helicopter 1715
6 noise - house of worship 1143
7 collection truck noise 184

<> 특정 레코드값 제외

COLLATE NOCASE

대소문자 구분을 무시

# name = 'None'을 제외하고 City이름은 대소문자 구별하지 않음
query = '''
    SELECT UPPER(City) AS name, COUNT(*) AS freq
    FROM data
    WHERE name <> 'None'
    GROUP BY City COLLATE NOCASE
    ORDER BY -freq
    LIMIT 10
'''
df2 = pd.read_sql_query(query, dataset)
df2
name freq
0 BROOKLYN 579363
1 NEW YORK 385655
2 BRONX 342533
3 STATEN ISLAND 92509
4 JAMAICA 46683
5 FLUSHING 35504
6 ASTORIA 31873
7 RIDGEWOOD 21618
8 WOODSIDE 15932
9 CORONA 15740
top5_cities = list(df2.head(5)['name'])
top5_cities   # complaint 숫자로 본 top5 도시
['BROOKLYN', 'NEW YORK', 'BRONX', 'STATEN ISLAND', 'JAMAICA']

문자들의 list를 하나의 문자로

def strs_to_args(str_list):
    assert type (str_list) is list
    assert all ([type (s) is str for s in str_list])
    ###
    c = str(list(i for i in str_list))
    c= c.replace('[','')
    c= c.replace(']','')
    c= c.replace("'",'"')
    return c

# 테스트
a=['a', 'b', 'c', 'd']
strs_to_args(a)
'"a", "b", "c", "d"'
#예제: top5 도시들의 complaint type별 count를 구하시오
query = '''SELECT LOWER(ComplaintType) AS complaint_type, 
                UPPER(CITY) as city_name, COUNT(*) as complaint_count
            FROM data
            WHERE city_name IN ({})
            GROUP BY city_name, complaint_type            
'''.format(strs_to_args(top5_cities))

df_complaints_by_city = pd.read_sql_query(query, dataset)
df_complaints_by_city
###
display(df_complaints_by_city.head(10))
complaint_type city_name complaint_count
0 adopt-a-basket BRONX 7
1 agency BRONX 1
2 air quality BRONX 666
3 animal abuse BRONX 1973
4 animal facility - no permit BRONX 10
5 animal in a park BRONX 189
6 appliance BRONX 3758
7 asbestos BRONX 264
8 beach/pool/sauna complaint BRONX 13
9 best/site safety BRONX 22
# 예제: 전체도시에서 top10 complaints를 정렬
query = '''Select LOWER(ComplaintType) as type, Count(ComplaintType) as freq
           from data GROUP BY type 
           ORDER BY -freq
            '''
df_complaint_freq = pd.read_sql_query(query, dataset)
top_complaints = df_complaint_freq[:10]
top_complaints
type freq
0 heat/hot water 241430
1 street condition 124347
2 street light condition 98577
3 blocked driveway 95080
4 illegal parking 83961
5 unsanitary condition 81394
6 paint/plaster 69929
7 water system 69209
8 plumbing 60105
9 noise 54165
# 도시별 top10 complaints별 count
df_plot = top_complaints.merge(df_complaints_by_city,
                               left_on=['type'],
                               right_on=['complaint_type'],
                               how='left')
df_plot.dropna(inplace=True)
df_plot.head(3)
type freq complaint_type city_name complaint_count
0 heat/hot water 241430 heat/hot water BRONX 79690
1 heat/hot water 241430 heat/hot water BROOKLYN 72410
2 heat/hot water 241430 heat/hot water JAMAICA 3376
# 예제: 도시별 전체 불만 중 해당 불만이 차지하는 비중을 'frac'으로 나타내라
df_plot['frac']=df_plot['complaint_count']/df_plot['freq']  # 열 추가
df_plot_fraction = df_plot.copy()  # DF 복사
df_plot_fraction = df_plot.drop('complaint_count', axis=1)  # 'count'열 삭제
###
df_plot_fraction.head(2)
type freq complaint_type city_name frac
0 heat/hot water 241430 heat/hot water BRONX 0.330075
1 heat/hot water 241430 heat/hot water BROOKLYN 0.299921

날짜, 시간

query = '''
        SELECT LOWER(ComplaintType), CreatedDate, UPPER(City)
        FROM data
        WHERE CreatedDate >= "2015-09-15 00:00:00.0" 
        and CreatedDate < "2015-09-16 00:00:00.0"
        ORDER BY CreatedDate 
'''    # 2015-09-15 ~ 2015-09-16 날짜 사이 발생한 items 모두
df = pd.read_sql_query (query, dataset)
df.head(2)
LOWER(ComplaintType) CreatedDate UPPER(City)
0 illegal parking 2015-09-15 00:01:23.000000 None
1 blocked driveway 2015-09-15 00:02:29.000000 REGO PARK

strftime()

날짜/시간을 string으로 변환

query = '''
    SELECT CreatedDate, STRFTIME('%H', CreatedDate) AS Hour, LOWER(ComplaintType)
    FROM data
    LIMIT 5
'''    # 시간을 Hour로 저장
df = pd.read_sql_query (query, dataset)
df.head(2)
CreatedDate Hour LOWER(ComplaintType)
0 2015-09-15 02:14:04.000000 02 illegal parking
1 2015-09-15 02:12:49.000000 02 noise - street/sidewalk
# 예제: 시간대별 count 구하기
query = '''
    SELECT STRFTIME('%H', CreatedDate) AS hour, count(*) AS count
    FROM data GROUP BY hour'''
df_complaints_by_hour = pd.read_sql_query(query, dataset)
###
df_complaints_by_hour.head(2)
hour count
0 00 564703
1 01 23489
# 예제: noise들어간 compalint type중에 00:00시 날짜를 제외하고 시간대별 정렬
query='''SELECT STRFTIME('%H', CreatedDate) AS hour, count(*) AS count
         FROM data 
         WHERE LOWER(ComplaintType) LIKE '%noise%' AND
         STRFTIME('%H:%M:%f', CreatedDate) <> '00:00:00.000'
         GROUP BY hour
'''
df_noisy_by_hour = pd.read_sql_query(query, dataset)
df_noisy_by_hour.head(3)
hour count
0 00 15349
1 01 11284
2 02 7170
# 예제: top3 complaint를 시간대별로, count 역순 정렬
top3 = strs_to_args(top_complaints[:3]['type'].tolist())

query='''
    SELECT LOWER(ComplaintType) as complaint_type, strftime('%H', CreatedDate) as hour, count(*) as count
    FROM data
    WHERE (complaint_type IN ({}) and strftime('%H:%M:%f', CreatedDate)<> '00:00:00.000')
    GROUP BY complaint_type, hour
    ORDER BY hour, -count
'''.format(top3)

df_top3_complaints_by_hour = pd.read_sql_query(query, dataset)
df_top3_complaints_by_hour.head(10)
complaint_type hour count
0 street condition 00 1298
1 street light condition 00 891
2 heat/hot water 00 139
3 street condition 01 1093
4 street light condition 01 623
5 heat/hot water 01 89
6 street condition 02 661
7 street light condition 02 385
8 heat/hot water 02 68
9 street condition 03 349
# 시간대별 complaint 수를 line chart로 그리시오
df_pivot = pd.pivot_table(df_top3_complaints_by_hour, values='count', columns='complaint_type', index='hour')

import matplotlib.pyplot as plt
df_pivot.plot(figsize=(10, 8))
plt.xlabel('Hour of Day')
plt.ylabel('Number of Complaints')
plt.show()

Tags: ,

Categories:

Updated: