# 경로 확인 코드
import os
os.getcwd()
# csv 파일로 저장하기
import pandas as pd
sample_df = pd.DataFrame({"col1" : [1, 2], "col2" : [3,4]})
sample_df # 파일명 test.csv, index=False
csv_file_path = './output/test.csv' # 경로 설정
sample_df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')
# csv 파일 불러오기
df = pd.read_csv(filepath_or_buffer = csv_file_path)
df
df_boston = pd.read_csv(filepath_or_buffer = './data/boston.csv') # ./data/boston.csv = 경로
df_boston
▣ 날짜 데이터 핸들링 코드
date_string = "2023-03-02"
# type(date_string) = str
datetime_obj = pd.to_datetime(date_string)
# print(datetime_obj) = 2023-03-02 00:00:00
# print(type(datetime_obj)) = <class 'pandas._libs.tslibs.timestamps.Timestamp'>
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
date_list = ["2023-03-02"]
print(type(date_list))
datetime_obj = pd.to_datetime(date_list)
print(datetime_obj)
print(type(datetime_obj))
-결과값-
<class 'list'>
DatetimeIndex(['2023-03-02'], dtype='datetime64[ns]', freq=None)
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sales = pd.read_csv('data/raw_sales.csv')
sales.info()
-결과값-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29580 entries, 0 to 29579
Data columns (total 5 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 datesold 29580 non-null object
1 postcode 29580 non-null int64
2 price 29580 non-null int64
3 propertyType 29580 non-null object
4 bedrooms 29580 non-null int64
dtypes: int64(3), object(2)
memory usage: 1.1+ MB
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 날짜 데이터 변환
sales.head(1)
# 연도, 월, 일
sales['year'] = sales['datesold'].dt.year
sales['month'] = sales['datesold'].dt.month
sales['day'] = sales['datesold'].dt.day
sales.info()
-결과값-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29580 entries, 0 to 29579
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 datesold 29580 non-null datetime64[ns]
1 postcode 29580 non-null int64
2 price 29580 non-null int64
3 propertyType 29580 non-null object
4 bedrooms 29580 non-null int64
5 year 29580 non-null int32
6 month 29580 non-null int32
7 day 29580 non-null int32
dtypes: datetime64[ns](1), int32(3), int64(3), object(1)
memory usage: 1.5+ MB
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
sales['datesold'] = pd.to_datetime(sales['datesold'])
sales.info()
Selection deleted
sales['datesold'] = pd.to_datetime(sales['datesold'])
sales.info()
-결과값-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29580 entries, 0 to 29579
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 datesold 29580 non-null datetime64[ns]
1 postcode 29580 non-null int64
2 price 29580 non-null int64
3 propertyType 29580 non-null object
4 bedrooms 29580 non-null int64
5 year 29580 non-null int32
6 month 29580 non-null int32
7 day 29580 non-null int32
dtypes: datetime64[ns](1), int32(3), int64(3), object(1)
memory usage: 1.5+ MB
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 연도, 월, 일
sales['year'] = sales['datesold'].dt.year
sales['month'] = sales['datesold'].dt.month
sales['day'] = sales['datesold'].dt.day
sales # 데이터 프레임 출력
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# shift 기능
temp_df = sales.loc[:, ['datesold', 'price']].copy()
temp_df.head(3)
temp_df['shifted_v1'] = temp_df['price'].shift(1, fill_value=0).astype(int)
temp_df['diff'] = temp_df['price'] - temp_df['shifted_v1']
temp_df.head(5)
temp_df = sales.loc[:, ['datesold', 'price']].copy()
temp_df['shifted_v3'] = temp_df['price'].shift(-1, fill_value=0).astype(int)
temp_df['shifted_v4'] = temp_df['price'].shift(-2, fill_value=0).astype(int)
temp_df.tail(5)
sales.info()
-결과값-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29580 entries, 0 to 29579
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 datesold 29580 non-null datetime64[ns]
1 postcode 29580 non-null int64
2 price 29580 non-null int64
3 propertyType 29580 non-null object
4 bedrooms 29580 non-null int64
5 year 29580 non-null int32
6 month 29580 non-null int32
7 day 29580 non-null int32
dtypes: datetime64[ns](1), int32(3), int64(3), object(1)
memory usage: 1.5+ MB
sales['한글날짜'] = sales['datesold'].dt.strftime('%Y년 %m월 %d일')
sales.head(3)
▣ 결측치 이상치 데이터 핸들링
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import statsmodels
import scipy
import seaborn as sns
print(pd.__version__)
print(np.__version__)
print(matplotlib.__version__)
print(statsmodels.__version__)
print(scipy.__version__)
print(sns.__version__)
-결과값-
2.0.3
1.25.2
3.7.2
0.14.0
1.11.1
0.12.2
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
DATA_PATH = './'
covidtotals = pd.read_csv(DATA_PATH + "data/covidtotalswithmissings.csv")
covidtotals.head(5)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 결측치 및 데이터 타입 확인
covidtotals.info()
-결과값-
covidtotals.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 12 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 iso_code 210 non-null object
1 lastdate 210 non-null object
2 location 210 non-null object
3 total_cases 210 non-null int64
4 total_deaths 210 non-null int64
5 total_cases_pm 209 non-null float64
6 total_deaths_pm 209 non-null float64
7 population 210 non-null float64
8 pop_density 198 non-null float64
9 median_age 186 non-null float64
10 gdp_per_capita 182 non-null float64
11 hosp_beds 164 non-null float64
dtypes: float64(7), int64(2), object(3)
memory usage: 19.8+ KB
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
pd.set_option('display.width', 80)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 20)
pd.options.display.float_format = '{:,.2f}'.format
covidtotals = covidtotals.set_index('iso_code')
covidtotals.head()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### 결측치 처리
- 인구통계 컬럼에서 결측치 데이터 확인
- 컬럼 방향으로 결측치의 갯수를 파악함
- 5가지 인구 통계 변수 중에서 3개가 누락된 곳, 4개가 누락된 곳 확인
# totvars = 코비드 바이러스 확진자 관련 변수
totvars = ['location','total_cases','total_deaths','total_cases_pm','total_deaths_pm']
# demovars = 인구수 관련 변수
demovars = ['population','pop_density','median_age','gdp_per_capita', 'hosp_beds']
covidtotals[demovars].isnull().sum(axis=0)
-결과값-
covidtotals[demovars].isnull().sum(axis=0)
population 0
pop_density 12
median_age 24
gdp_per_capita 28
hosp_beds 46
dtype: int64
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
covidtotals.iloc[2:4, :]
demovars_misscnt = covidtotals[demovars].isnull().sum(axis=1)
demovars_misscnt.value_counts()
-결과값-
0 156
1 24
2 12
3 10
4 8
Name: count, dtype: int64
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
### 미션1.
- 결측치가 3개 이상인 데이터를 추출하세요!
- 추출해야할 컬럼은 ['location'] + demovars
covidtotals.loc[demovars_misscnt >= 3, ['location'] + demovars].T
# demovars_misscnt = 결측치 갯수
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Index Alignment
인덱스의 값끼리 연산이 이뤄지는 것을 말함. Pandas에서 제공함
s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 3, 2, 1], index=['d', 'c', 'b', 'a'])
print(s1, "\n", s2)
print(s1 + s2)
-결과값 -
a 1
b 2
c 3
d 4
dtype: int64
d 4
c 3
b 2
a 1
dtype: int64
a 2
b 4
c 6
d 8
dtype: int64
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## 코로나 관련 변수만 추출
- 결측치가 있는지 확인
- 결측치가 존재하는 국가 하나 존재
- totvars 사용
totvars_misscnt = covidtotals[totvars].isnull().sum(axis=1)
totvars_misscnt.value_counts()
ms_country = covidtotals.loc[totvars_misscnt > 1, ['location'] + totvars]
ms_country.T
covidtotalsonly = covidtotals.loc[:, totvars]
covidtotalsonly['total_cases_q'] = pd.qcut(covidtotalsonly['total_cases'],
labels = ['매우낮음', '낮음', '중간', '높음', '매우높음'],
q = 5, precision=0)
covidtotalsonly['total_deaths_q'] = pd.qcut(covidtotalsonly['total_deaths'],
labels = ['매우낮음', '낮음', '중간', '높음', '매우높음'],
q = 5, precision=0)
covidtotalsonly
pd.crosstab(covidtotalsonly['total_cases_q'], covidtotalsonly['total_deaths_q'])
covidtotals.loc[(covidtotalsonly.total_cases_q=="매우높음") & (covidtotalsonly.total_deaths_q=="중간")].T
covidtotals.loc[(covidtotalsonly.total_cases_q=="매우낮음") & (covidtotalsonly.total_deaths_q=="중간")].T
covidtotals.loc[(covidtotalsonly.total_cases_q=="낮음") & (covidtotalsonly.total_deaths_q=="높음")].T
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -