본 글은 『파이썬을 활용한 크롤러 개발과 스크레이핑 입문』 교재를 기반으로 수강한 수업 내용을 리뷰합니다.
교재 구매 링크 : https://www.yes24.com/Product/Goods/76488159
2023-08-08 수업 내용 리뷰
## 크롬드라이버 설정 방법
- 크롬 드라이버 다운로드 사이트 : https://chromedriver.chromium.org/downloads
## 주피터 환경에서 크롬 드라이버를 통해 웹페이지 접속 및 크롤링
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## Selenium 가져오기 및 버전 체크
# Import Selenium and print the installed version so it can be checked
# before running the driver examples. `os` is used by the directory-listing
# helper that follows.
import selenium
import os
print(selenium.__version__)
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
def list_files(startpath):
    """Print the directory tree rooted at ``startpath``.

    Each directory is printed as ``name/`` and each file by its bare name,
    indented four spaces per nesting level below ``startpath``.

    Args:
        startpath: Path of the directory to walk. A path that does not
            exist produces no output, because ``os.walk`` yields nothing
            for it.
    """
    for root, _dirs, files in os.walk(startpath):
        # Derive the depth from the path relative to startpath. Stripping
        # startpath with str.replace() miscounts when startpath ends with a
        # separator or when its text reappears deeper in the tree.
        rel = os.path.relpath(root, startpath)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath drops a trailing separator so the basename is not empty.
        name = os.path.basename(os.path.normpath(root))
        print('{}{}/'.format(indent, name))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))


list_files("driver")
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
driver/
windows_110/
chromedriver.exe
LICENSE.chromedriver
● 크롬드라이버 경로를 지정한 후 아래 코드를 실행한다.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Launch Chrome through a manually downloaded ChromeDriver executable.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
# Relative path to the driver executable (matches the tree listed above).
CHROME_DRIVER_PATH = './driver/windows_110/chromedriver.exe'
# The Service is given the driver executable location explicitly.
service = Service(executable_path=CHROME_DRIVER_PATH)
# Default ChromeOptions; no extra browser flags are set here.
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)
# Open the Naver home page in the launched browser.
driver.get('https://www.naver.com/')
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
driver.quit()
● 위 방식은 Chrome 버전이 바뀔 때마다 그에 맞는 ChromeDriver를 다시 다운로드해야 한다.
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## webdriver-manager 라이브러리를 활용한 드라이버 최신화
# Launch Chrome without a hard-coded driver path by using webdriver-manager.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
# The value returned by ChromeDriverManager().install() is handed to
# ChromeService as the driver location, replacing the manual path used in
# the previous example.
driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
# Open the Naver home page in the launched browser.
driver.get('https://www.naver.com/')
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
driver.quit()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## 구글 이미지 다운로드
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.common.exceptions import (
    ElementNotInteractableException,
    NoSuchElementException,
)
import time

CHROME_DRIVER_PATH = './driver/windows_110/chromedriver.exe'


def _scroll_down(target, presses=60, pause=0.1):
    """Send PAGE_DOWN to ``target`` ``presses`` times, sleeping ``pause`` seconds after each."""
    for _ in range(presses):
        target.send_keys(Keys.PAGE_DOWN)
        time.sleep(pause)


service = Service(executable_path=CHROME_DRIVER_PATH)
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

URL = 'https://www.google.co.kr/imghp'
driver.get(url=URL)

# Type the query into the search box and submit it with RETURN.
elem = driver.find_element(By.CSS_SELECTOR, "body > div.L3eUgb > div.o3j99.ikrT4e.om7nvf > form > div:nth-child(1) > div.A8SBwf > div.RNNXgb > div > div.a4bIc > textarea.gLFyf")
elem.send_keys("보라카이")
elem.send_keys(Keys.RETURN)

# Scroll the result page so that more thumbnails get loaded.
elem = driver.find_element(By.TAG_NAME, "body")
_scroll_down(elem)

try:
    # Presumably the "show more results" button; it may be absent or hidden
    # when the result list is short.
    driver.find_element(By.CSS_SELECTOR, "#islmp > div > div > div > div.gBPM8 > div.qvfT1 > div.YstHxe > input").click()
except (NoSuchElementException, ElementNotInteractableException):
    # Best effort: without the button, keep whatever has been loaded so far.
    # Only these two cases are ignored; any other error still surfaces.
    pass
else:
    # The button was clicked, so scroll again to load the extra results.
    _scroll_down(elem)

# Collect the `src` of every thumbnail that has one.
links = []
images = driver.find_elements(By.CSS_SELECTOR, "#islrg > div.islrc > div > a.wXeWr.islib.nfEiy > div.bRMDJf.islir > img")
for image in images:
    src = image.get_attribute('src')
    if src is not None:
        links.append(src)
print(' 찾은 이미지 개수:', len(links))
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
찾은 이미지 개수: 47
driver.quit()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## 이미지 파일 다운로드
import os
import urllib.request

# Folder the images are saved into, relative to the working directory.
# Built with os.path.join so the separator is correct on every platform.
SAVE_DIR = os.path.join(".", "사진다운로드")
# urlretrieve does not create missing folders, so create it up front.
os.makedirs(SAVE_DIR, exist_ok=True)

# `links` is the list of image URLs collected by the previous cell; each
# one is saved as <index>.jpg.
for k, url in enumerate(links):
    urllib.request.urlretrieve(url, os.path.join(SAVE_DIR, str(k) + ".jpg"))
print('다운로드 완료하였습니다.')
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
다운로드 완료하였습니다.
driver.quit()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## 검색어 추출
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
# NOTE(review): the page looks client-rendered (everything hangs off `#app`),
# so the rank elements may not exist immediately after get() returns. With an
# implicit wait, find_elements retries for up to 10 seconds and still returns
# an empty list if nothing ever matches.
driver.implicitly_wait(10)

URL = 'https://signal.bz/news'
driver.get(url=URL)

# Each matched span holds one ranked keyword.
results = driver.find_elements(By.CSS_SELECTOR, "#app > div > main > div > section > div > section > section:nth-child(2) > div:nth-child(2) > div > div > div > a > span.rank-text")
results

# Keep only the visible text of each matched element.
keyword_list = [keyword.text for keyword in results]
keyword_list
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
['1_태풍 카눈 북상에', '2_경상수지 두 달', '3_잼버리', '4_류현진 강습 타구', '5_사고 위장 부사관', '6_잼버리 대원들', '7_서울대공원서 시베리아호랑이 파악', '8_中 윤동주 폐쇄', '9_입추', '10_신성록']
driver.quit()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## 뽐뿌 판매목록 및 링크 크롤링
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
URL = 'https://www.ppomppu.co.kr/zboard/zboard.php?id=ppomppu'
driver.get(url=URL)

# Title text lives in a <font> that is a direct child of the post's <a>.
titles = driver.find_elements(By.CSS_SELECTOR, '#revolution_main_table > tbody > tr > td:nth-child(3) > table > tbody > tr > td:nth-child(2) > div > a > font')
# Take each anchor from its own title element ('./..' is the parent node)
# rather than from a second, independent query. Pairing two separate result
# lists by index goes out of step as soon as one anchor has no <font> child.
urls = [title.find_element(By.XPATH, './..') for title in titles]

title_lists = []
url_lists = []
for title, link in zip(titles, urls):
    text = title.text
    href = link.get_attribute('href')
    print(text)
    title_lists.append(text)
    print(href)
    url_lists.append(href)
#revolution_main_table
#revolution_main_table > tbody > tr:nth-child(9) > td:nth-child(3) > table > tbody > tr > td:nth-child(2) > div > a > font
#revolution_main_table > tbody > tr:nth-child(11) > td:nth-child(3) > table > tbody > tr > td:nth-child(2) > div > a > font
▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
[티-] 바-린 퓨- 젤리 50ml 5개 외 (5,520원/무료)
https://www.ppomppu.co.kr/zboard/view.php?id=ppomppu&page=1&divpage=80&n-
.
.
.
펩-제-슈- 무-벨 -임 300ml 20펫 (12,720 / 무배)
https://www.ppomppu.co.kr/zboard/view.php?id=social&n-
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -