[Python_bigdata] 파이썬을 활용한 웹 크롤링

#Day05_04_crawling

#!/usr/bin/env python

# coding: utf-8

#크롤링으로 원하는 데이터 수집

#python에서 크롤링을 위해서 필요한 외부 패키지

import requests as req

from bs4 import BeautifulSoup

# 1. Target site (the URL literal must not contain stray whitespace).
target_url = 'https://www.apache.org/'

# 2. Request the page from the target site.
# The timeout keeps the script from blocking forever if the server never answers.
response = req.get(target_url, timeout=10)
# Status 200 means success; raise on 4xx/5xx instead of parsing an error page.
response.raise_for_status()

# 3. Parse the page content held in the response.
# response.text is the raw body as a string; naming 'html.parser' explicitly
# turns it into a navigable HTML tree and avoids bs4's "no parser specified"
# warning.
soup = BeautifulSoup(response.text, 'html.parser')

# 4. Select the part of the parsed HTML to scrape.
# '>' is the CSS child combinator: each step matches direct children only, so
# this collects the <a> tags under <li> under <ul> under the element with
# id "by_category".
categories = soup.select('#by_category>ul>li>a')

Stay Hungry Stay Foolish