pandas 기본개념 정리

카테고리 없음

by seonjaechoi 2023. 8. 9. 16:44

# 경로 확인 코드
import os 
os.getcwd()

# csv 파일로 저장하기
import pandas as pd

sample_df = pd.DataFrame({"col1" : [1, 2], "col2" : [3,4]})
sample_df # 파일명 test.csv, index=False
csv_file_path = './output/test.csv' # 경로 설정
sample_df.to_csv(csv_file_path, index=False, encoding='utf-8-sig')

# csv 파일 불러오기
df = pd.read_csv(filepath_or_buffer = csv_file_path)
df

df_boston = pd.read_csv(filepath_or_buffer = './data/boston.csv') # ./data/boston.csv = 경로
df_boston

▣ 날짜 데이터 핸들링 코드

date_string = "2023-03-02"
# type(date_string) = str

datetime_obj = pd.to_datetime(date_string)
# print(datetime_obj) = 2023-03-02 00:00:00
# print(type(datetime_obj)) = <class 'pandas._libs.tslibs.timestamps.Timestamp'>
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
date_list = ["2023-03-02"]
print(type(date_list))

datetime_obj = pd.to_datetime(date_list)
print(datetime_obj)
print(type(datetime_obj))

-결과값-
<class 'list'>
DatetimeIndex(['2023-03-02'], dtype='datetime64[ns]', freq=None)
<class 'pandas.core.indexes.datetimes.DatetimeIndex'>
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
sales = pd.read_csv('data/raw_sales.csv')
sales.info()
-결과값-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29580 entries, 0 to 29579
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   datesold      29580 non-null  object
 1   postcode      29580 non-null  int64 
 2   price         29580 non-null  int64 
 3   propertyType  29580 non-null  object
 4   bedrooms      29580 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 1.1+ MB
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
# 날짜 데이터 변환
sales.head(1)

# 연도, 월, 일
sales['year'] = sales['datesold'].dt.year
sales['month'] = sales['datesold'].dt.month
sales['day'] = sales['datesold'].dt.day
sales.info()
-결과값-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29580 entries, 0 to 29579
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   datesold      29580 non-null  datetime64[ns]
 1   postcode      29580 non-null  int64         
 2   price         29580 non-null  int64         
 3   propertyType  29580 non-null  object        
 4   bedrooms      29580 non-null  int64         
 5   year          29580 non-null  int32         
 6   month         29580 non-null  int32         
 7   day           29580 non-null  int32         
dtypes: datetime64[ns](1), int32(3), int64(3), object(1)
memory usage: 1.5+ MB
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
sales['datesold'] = pd.to_datetime(sales['datesold'])
sales.info()
Selection deleted
sales['datesold'] = pd.to_datetime(sales['datesold'])
sales.info()
-결과값-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29580 entries, 0 to 29579
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   datesold      29580 non-null  datetime64[ns]
 1   postcode      29580 non-null  int64         
 2   price         29580 non-null  int64         
 3   propertyType  29580 non-null  object        
 4   bedrooms      29580 non-null  int64         
 5   year          29580 non-null  int32         
 6   month         29580 non-null  int32         
 7   day           29580 non-null  int32         
dtypes: datetime64[ns](1), int32(3), int64(3), object(1)
memory usage: 1.5+ MB
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# 연도, 월, 일
sales['year'] = sales['datesold'].dt.year
sales['month'] = sales['datesold'].dt.month
sales['day'] = sales['datesold'].dt.day
sales # 데이터 프레임 출력
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# shift 기능
temp_df = sales.loc[:, ['datesold', 'price']].copy()
temp_df.head(3)

temp_df['shifted_v1'] = temp_df['price'].shift(1, fill_value=0).astype(int)
temp_df['diff'] = temp_df['price'] - temp_df['shifted_v1']
temp_df.head(5)

temp_df = sales.loc[:, ['datesold', 'price']].copy()
temp_df['shifted_v3'] = temp_df['price'].shift(-1, fill_value=0).astype(int)
temp_df['shifted_v4'] = temp_df['price'].shift(-2, fill_value=0).astype(int)
temp_df.tail(5)

sales.info()
-결과값-
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29580 entries, 0 to 29579
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   datesold      29580 non-null  datetime64[ns]
 1   postcode      29580 non-null  int64         
 2   price         29580 non-null  int64         
 3   propertyType  29580 non-null  object        
 4   bedrooms      29580 non-null  int64         
 5   year          29580 non-null  int32         
 6   month         29580 non-null  int32         
 7   day           29580 non-null  int32         
dtypes: datetime64[ns](1), int32(3), int64(3), object(1)
memory usage: 1.5+ MB

sales['한글날짜'] = sales['datesold'].dt.strftime('%Y년 %m월 %d일')
sales.head(3)

▣ 결측치 이상치 데이터 핸들링

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import statsmodels
import scipy
import seaborn as sns

print(pd.__version__)
print(np.__version__)
print(matplotlib.__version__)
print(statsmodels.__version__)
print(scipy.__version__)
print(sns.__version__)
-결과값-
2.0.3
1.25.2
3.7.2
0.14.0
1.11.1
0.12.2

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

DATA_PATH = './'
covidtotals = pd.read_csv(DATA_PATH + "data/covidtotalswithmissings.csv")
covidtotals.head(5)

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

# 결측치 및 데이터 타입 확인
covidtotals.info()
-결과값-

covidtotals.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 210 entries, 0 to 209
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   iso_code         210 non-null    object 
 1   lastdate         210 non-null    object 
 2   location         210 non-null    object 
 3   total_cases      210 non-null    int64  
 4   total_deaths     210 non-null    int64  
 5   total_cases_pm   209 non-null    float64
 6   total_deaths_pm  209 non-null    float64
 7   population       210 non-null    float64
 8   pop_density      198 non-null    float64
 9   median_age       186 non-null    float64
 10  gdp_per_capita   182 non-null    float64
 11  hosp_beds        164 non-null    float64
dtypes: float64(7), int64(2), object(3)
memory usage: 19.8+ KB

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

pd.set_option('display.width', 80)
pd.set_option('display.max_columns', 20)
pd.set_option('display.max_rows', 20)
pd.options.display.float_format = '{:,.2f}'.format

covidtotals = covidtotals.set_index('iso_code')
covidtotals.head()

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
### 결측치 처리
 - 인구통계 컬럼에서 결측치 데이터 확인
 - 컬럼 방향으로 결측치의 갯수를 파악함
 - 5가지 인구 통계 변수 중에서 3개가 누락된 곳, 4개가 누락된 곳 확인
 
# totvars = 코비드 바이러스 확진자 관련 변수
totvars = ['location','total_cases','total_deaths','total_cases_pm','total_deaths_pm']
# demovars = 인구수 관련 변수
demovars = ['population','pop_density','median_age','gdp_per_capita', 'hosp_beds']

covidtotals[demovars].isnull().sum(axis=0)
-결과값-

covidtotals[demovars].isnull().sum(axis=0)
population         0
pop_density       12
median_age        24
gdp_per_capita    28
hosp_beds         46
dtype: int64

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

covidtotals.iloc[2:4, :]

demovars_misscnt = covidtotals[demovars].isnull().sum(axis=1)
demovars_misscnt.value_counts()
-결과값-
0    156
1     24
2     12
3     10
4      8
Name: count, dtype: int64

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
### 미션1.
 - 결측치가 3개 이상인 데이터를 추출하세요!
 - 추출해야할 컬럼은 ['location'] + demovars

covidtotals.loc[demovars_misscnt >= 3, ['location'] + demovars].T
# demovars_misscnt = 결측치 갯수

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

Index Alignment
인덱스의 값끼리 연산이 이뤄지는 것을 말함. Pandas에서 제공함

s1 = pd.Series([1, 2, 3, 4], index=['a', 'b', 'c', 'd'])
s2 = pd.Series([4, 3, 2, 1], index=['d', 'c', 'b', 'a'])

print(s1, "\n", s2)
print(s1 + s2)
-결과값 -
a    1
b    2
c    3
d    4
dtype: int64 
 d    4
c    3
b    2
a    1
dtype: int64
a    2
b    4
c    6
d    8
dtype: int64

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

## 코로나 관련 변수만 추출
 - 결측치가 있는지 확인
 - 결측치가 존재하는 국가 하나 존재
 - totvars 사용
 
totvars_misscnt = covidtotals[totvars].isnull().sum(axis=1)
totvars_misscnt.value_counts()
ms_country = covidtotals.loc[totvars_misscnt > 1, ['location'] + totvars]
ms_country.T
 
covidtotalsonly = covidtotals.loc[:, totvars]
covidtotalsonly['total_cases_q'] = pd.qcut(covidtotalsonly['total_cases'],
                                          labels = ['매우낮음', '낮음', '중간', '높음', '매우높음'],
                                          q = 5, precision=0)

covidtotalsonly['total_deaths_q'] = pd.qcut(covidtotalsonly['total_deaths'],
                                          labels = ['매우낮음', '낮음', '중간', '높음', '매우높음'],
                                          q = 5, precision=0)
covidtotalsonly

pd.crosstab(covidtotalsonly['total_cases_q'], covidtotalsonly['total_deaths_q'])

covidtotals.loc[(covidtotalsonly.total_cases_q=="매우높음") & (covidtotalsonly.total_deaths_q=="중간")].T
covidtotals.loc[(covidtotalsonly.total_cases_q=="매우낮음") & (covidtotalsonly.total_deaths_q=="중간")].T
covidtotals.loc[(covidtotalsonly.total_cases_q=="낮음") & (covidtotalsonly.total_deaths_q=="높음")].T
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -