Crawling 다양한 환경에서 실습 해보기

Crawling

by seonjaechoi 2023. 8. 8. 16:46

본 글은 「파이썬을 활용한 크롤러 개발과 스크레이핑 입문」 교재를 기반으로 진행된 수업 내용을 리뷰합니다.

교재 구매 링크 : https://www.yes24.com/Product/Goods/76488159

파이썬을 활용한 크롤러 개발과 스크레이핑 입문 - 예스24

웹 데이터 수집과 분석을 자동으로 처리해 보자. 인공지능, 머신러닝 기술의 발달과 더불어 최근 데이터 분석의 수요가 많아지고 있다. 데이터를 유의미한 자료로 활용하기 위해서는 다양한 데이…

www.yes24.com

2023-08-08 수업 내용 리뷰

## 크롬드라이버 설정 방법
- 크롬 드라이버 다운로드 사이트 : https://chromedriver.chromium.org/downloads

## 주피터 환경에서 크롬 드라이버를 통해 웹페이지 접속 및 크롤링
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

## Selenium 가져오기 및 버전 체크

import selenium
import os

print(selenium.__version__)

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 


def list_files(startpath):
    """Print the directory tree rooted at ``startpath``.

    Every directory is printed as ``name/`` and every file by its bare name,
    indented four spaces per level of depth below ``startpath``.

    Args:
        startpath: Directory to walk. A trailing path separator is accepted.
    """
    for root, dirs, files in os.walk(startpath):
        # Depth is taken from the path relative to startpath. Stripping the
        # startpath text out of root miscounts when startpath ends with a
        # separator or when its text recurs deeper in the tree.
        relative = os.path.relpath(root, startpath)
        level = 0 if relative == os.curdir else relative.count(os.sep) + 1
        indent = ' ' * 4 * level
        # normpath drops a trailing separator, which would otherwise make
        # basename() return '' for the top-level directory.
        print('{}{}/'.format(indent, os.path.basename(os.path.normpath(root))))
        subindent = ' ' * 4 * (level + 1)
        for f in files:
            print('{}{}'.format(subindent, f))
            
list_files("driver")

▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ 

driver/
    windows_110/
        chromedriver.exe
        LICENSE.chromedriver
        
● 크롬 드라이버 경로를 지정한 후 아래 코드를 실행한다.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

from selenium import webdriver
from selenium.webdriver.chrome.service import Service

# Location of the chromedriver binary, given relative to the working directory.
CHROME_DRIVER_PATH = './driver/windows_110/chromedriver.exe'

# Prepare the (default) browser options and the driver service, then start Chrome.
options = webdriver.ChromeOptions()
service = Service(executable_path=CHROME_DRIVER_PATH)
driver = webdriver.Chrome(options=options, service=service)

# Load the Naver home page in the new browser session.
driver.get(url='https://www.naver.com/')

▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ 

driver.quit()

● 위 방식은 Chrome 버전이 바뀔 때마다 해당 버전에 맞는 Chrome Driver를 다시 다운로드해야 함

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

## webdriver-manager 라이브러리를 활용한 드라이버 최신화

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

# Hand the value returned by ChromeDriverManager().install() to the Chrome
# service instead of a hard-coded driver path.
driver_path = ChromeDriverManager().install()
chrome_service = ChromeService(driver_path)
driver = webdriver.Chrome(service=chrome_service)

# Load the Naver home page in the new browser session.
driver.get(url='https://www.naver.com/')

▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ 

driver.quit()

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## 구글 이미지 다운로드

import time

from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

CHROME_DRIVER_PATH = './driver/windows_110/chromedriver.exe'

# CSS selectors copied from the page markup; they break whenever Google
# changes its generated class names.
SEARCH_BOX_SELECTOR = "body > div.L3eUgb > div.o3j99.ikrT4e.om7nvf > form > div:nth-child(1) > div.A8SBwf > div.RNNXgb > div > div.a4bIc > textarea.gLFyf"
MORE_RESULTS_SELECTOR = "#islmp > div > div > div > div.gBPM8 > div.qvfT1 > div.YstHxe > input"
THUMBNAIL_SELECTOR = "#islrg > div.islrc > div > a.wXeWr.islib.nfEiy > div.bRMDJf.islir > img"


def _scroll_down(target, presses=60, pause=0.1):
    """Send PAGE_DOWN to ``target`` ``presses`` times, sleeping ``pause`` seconds after each."""
    for _ in range(presses):
        target.send_keys(Keys.PAGE_DOWN)
        time.sleep(pause)


service = Service(executable_path=CHROME_DRIVER_PATH)
options = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=service, options=options)

URL='https://www.google.co.kr/imghp'
driver.get(url=URL)

# Type the search term into the search box and submit it.
elem = driver.find_element(By.CSS_SELECTOR, SEARCH_BOX_SELECTOR)
elem.send_keys("보라카이")
elem.send_keys(Keys.RETURN)

# Page down repeatedly on the document body before collecting thumbnails.
elem = driver.find_element(By.TAG_NAME, "body")
_scroll_down(elem)

try:
    # Best effort: click the extra-results input if it can be found and clicked.
    driver.find_element(By.CSS_SELECTOR, MORE_RESULTS_SELECTOR).click()
except WebDriverException:
    # Only Selenium failures (element missing, not interactable, ...) are
    # ignored; KeyboardInterrupt and programming errors still surface.
    pass
else:
    _scroll_down(elem)

links = []
images = driver.find_elements(By.CSS_SELECTOR, THUMBNAIL_SELECTOR)

for image in images:
    # Read the attribute once; every get_attribute call goes to the driver.
    src = image.get_attribute('src')
    if src is not None:
        links.append(src)

print(' 찾은 이미지 개수:',len(links))

▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼

찾은 이미지 개수: 47

driver.quit()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## 이미지 파일 다운로드

import os
import urllib.request

# Folder (relative to the working directory) that receives the images.
DOWNLOAD_DIR = "사진다운로드"

# urlretrieve does not create missing folders, so make sure the target exists.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

for k, url in enumerate(links):
    # os.path.join uses the platform's separator; the previous hard-coded
    # ".\\" prefix only formed a folder path on Windows.
    urllib.request.urlretrieve(url, os.path.join(DOWNLOAD_DIR, str(k) + ".jpg"))

print('다운로드 완료하였습니다.')

▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼

다운로드 완료하였습니다.

driver.quit()
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## 검색어 추출

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

# Start Chrome using the value returned by ChromeDriverManager().install().
driver_path = ChromeDriverManager().install()
driver = webdriver.Chrome(service=ChromeService(driver_path))

URL='https://signal.bz/news'
driver.get(url=URL)

# Selector for the span.rank-text elements on the page.
RANK_TEXT_SELECTOR = "#app > div > main > div > section > div > section > section:nth-child(2) > div:nth-child(2) > div > div > div > a > span.rank-text"
results = driver.find_elements(By.CSS_SELECTOR, RANK_TEXT_SELECTOR)
results

# Keep only the visible text of each matched element.
keyword_list = [element.text for element in results]
keyword_list

▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼
['1_태풍 카눈 북상에', '2_경상수지 두 달', '3_잼버리', '4_류현진 강습 타구', '5_사고 위장 부사관', '6_잼버리 대원들', '7_서울대공원서 시베리아호랑이 파악', '8_中 윤동주 폐쇄', '9_입추', '10_신성록']

driver.quit()

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
## 뽐뿌 판매목록 및 링크 크롤링

from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
import time

driver = webdriver.Chrome(service=ChromeService(ChromeDriverManager().install()))
URL= 'https://www.ppomppu.co.kr/zboard/zboard.php?id=ppomppu'
driver.get(url=URL)

# <font> elements that are direct children of the listing anchors.
titles = driver.find_elements(By.CSS_SELECTOR, '#revolution_main_table > tbody > tr > td:nth-child(3) > table > tbody > tr > td:nth-child(2) > div > a > font')
# Take each link from the title's own parent <a>. Querying all anchors
# separately and pairing by index shifts every later pair as soon as one
# anchor has no <font> child.
urls = [title.find_element(By.XPATH, '..') for title in titles]

title_lists = []
url_lists = []

for title, link in zip(titles, urls):
    # Read text and href once each; both are round trips to the driver.
    text = title.text
    href = link.get_attribute('href')
    print(text)
    title_lists.append(text)
    print(href)
    url_lists.append(href)

#revolution_main_table
#revolution_main_table > tbody > tr:nth-child(9) > td:nth-child(3) > table > tbody > tr > td:nth-child(2) > div > a > font
#revolution_main_table > tbody > tr:nth-child(11) > td:nth-child(3) > table > tbody > tr > td:nth-child(2) > div > a > font

▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼ ▼

[티-] 바-린 퓨- 젤리 50ml 5개 외 (5,520원/무료)
https://www.ppomppu.co.kr/zboard/view.php?id=ppomppu&page=1&divpage=80&n-
.
.
.
펩-제-슈- 무-벨 -임 300ml 20펫 (12,720 / 무배)
https://www.ppomppu.co.kr/zboard/view.php?id=social&n-

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Chois

고정 헤더 영역

메뉴 레이어

메뉴 리스트

검색 레이어

검색 영역

상세 컨텐츠

본문 제목

본문

추가 정보

인기글

최신글

티스토리툴바