[ABC 220906 - 16일차] 오후 실습 과제

갓생사는 공순이가 되고싶은 콩순이 2022. 9. 11. 02:11

<과제>

5가지 공감 랭킹 뉴스 데이터 크롤링

사용자에게 보고 싶은 공감 랭킹 뉴스를 입력받아 [love, congrats, expect, surprise, sad]

워드 클라우드 시각화 진행

링크 :
https://entertain.naver.com/ranking/sympathy/cheer
https://entertain.naver.com/ranking/sympathy/congrats
https://entertain.naver.com/ranking/sympathy/expect
https://entertain.naver.com/ranking/sympathy/surprise
https://entertain.naver.com/ranking/sympathy/sad

한글 깨짐 방지

import matplotlib as mpl
import matplotlib.pyplot as plt
 
%config InlineBackend.figure_format = 'retina'
 
!apt -qq -y install fonts-nanum
 
import matplotlib.font_manager as fm
# Location where the `fonts-nanum` apt package installs NanumBarunGothic.
fontpath = '/usr/share/fonts/truetype/nanum/NanumBarunGothic.ttf'
font = fm.FontProperties(fname=fontpath, size=9)
# Register the freshly installed font through the public font-manager API.
# The previous call to the private `mpl.font_manager._rebuild()` helper is
# not available in newer Matplotlib releases and fails with AttributeError.
fm.fontManager.addfont(fontpath)
# Make the Korean-capable font the default so Hangul labels render correctly.
plt.rc('font', family='NanumBarunGothic')

라이브러리 설치

!pip install selenium
!apt-get update
!apt install chromium-chromedriver
!cp /usr/lib/chromium-browser/chromedriver /usr/bin

뉴스 데이터 크롤링 및 워드 클라우드 시각화 진행

- 일단, 교수님께서 주신 조건 말고도 cheer 항목이 더 있다는 것을 알았음. 그래서 5가지 항목이 아닌, 6가지 항목 [love, cheer, congrats, expect, surprise, sad]로 늘렸음.

- 또한, 사용자가 잘못된 값을 입력하면 경고 메시지를 띄우고, 올바른 값이 입력될 때까지 while문으로 반복해서 다시 입력받도록 함.

from selenium import webdriver
from bs4 import BeautifulSoup

import re
import time
from pytz import timezone
import datetime

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# 1) Empty DataFrame that will hold one row per ranked article.
data = pd.DataFrame(columns=['순위', '공감종류', '기사제목', '기사링크', '기사내용', '공감수', '수집일자'])

# 2) Start a headless Chrome session.
options = webdriver.ChromeOptions()
options.add_argument('--headless')  # run without a visible browser window
# These two flags are typically needed when Chrome runs inside a container
# such as a Colab VM.
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
# The chromedriver binary is resolved from PATH (it is copied to /usr/bin
# during installation), so no path is passed here. Passing the driver path
# as the first positional argument is deprecated in Selenium 4 and rejected
# by later releases; `options=` alone works on both Selenium 3 and 4.
driver = webdriver.Chrome(options=options)
# Ranking pages that can be selected:
# https://entertain.naver.com/ranking/sympathy
# https://entertain.naver.com/ranking/sympathy/cheer
# https://entertain.naver.com/ranking/sympathy/congrats
# https://entertain.naver.com/ranking/sympathy/expect
# https://entertain.naver.com/ranking/sympathy/surprise
# https://entertain.naver.com/ranking/sympathy/sad
sympathy_base_url = 'https://entertain.naver.com/ranking/sympathy'
sympathy_kinds = ('love', 'cheer', 'congrats', 'expect', 'surprise', 'sad')

# Keep prompting until the user enters one of the supported kinds.
# Surrounding whitespace and letter case are ignored so that inputs such as
# " Sad " are accepted instead of being rejected.
while True:
    what_sym = input('보고싶은 공감 종류를 입력하세요. {love, cheer, congrats, expect, surprise, sad} :')
    what_sym = what_sym.strip().lower()

    if what_sym in sympathy_kinds:
        break

    print("잘못 입력하셨습니다. {love, cheer, congrats, expect, surprise, sad} 중에 하나를 선택해주세요")

# 'love' maps to the base URL with no suffix; every other kind is appended
# to the base URL as a path segment.
if what_sym == 'love':
    url = sympathy_base_url
else:
    url = sympathy_base_url + '/' + what_sym

# Load the ranking page and capture its rendered HTML. The browser is always
# shut down afterwards, even if loading fails, so no headless Chrome process
# is left running.
try:
    driver.get(url)
    driver.implicitly_wait(3)
    time.sleep(1.5)

    # 3) Scroll down the page (presumably to trigger lazily loaded items --
    #    TODO confirm) and give it time to settle.
    driver.execute_script('window.scrollTo(0, 800)')
    time.sleep(3)

    # 4) Grab the rendered HTML.
    html_source = driver.page_source
finally:
    driver.quit()

# 5) Parse. `page_source` is already a decoded str, so no `from_encoding`
#    is passed (BeautifulSoup ignores it for str input).
soup = BeautifulSoup(html_source, 'html.parser')

# One <li> per ranked article. The class is matched through `class_`; the
# previous `{'class', '...'}` argument was a set (comma instead of colon),
# which also matched any element whose class is literally "class".
li = soup.find_all('li', class_='_inc_news_lst3_rank_reply')

def _flatten_text(raw):
    """Return *raw* with newlines and tabs turned into spaces, then stripped."""
    return raw.replace("\n", ' ').replace('\t', ' ').strip()


# Extract one record per ranked <li>. Records are gathered in a plain list
# and added to `data` in a single step, because `DataFrame.append` has been
# removed from pandas.
rows = []
for item in li:
    try:
        # Rank
        rank = _flatten_text(item.find('em', class_='blind').text)

        # Article title
        title = _flatten_text(item.find('a', class_='tit').text)

        # Article summary
        summary = _flatten_text(item.find('p', class_='summary').text)

        # Article link (href of the first anchor in the item)
        link = item.find('a').attrs['href']

        # Reaction count: keep the text that follows the first '수'
        count = _flatten_text(item.find('a', class_='likeitnews_item_likeit').text)
        count = count.split('수')[1]
    except (AttributeError, KeyError, IndexError) as err:
        # A missing element (find() returned None), a missing href, or an
        # unexpected count format: skip this item but say so, instead of
        # silently swallowing the error and printing stale values.
        print('Skipped an item that could not be parsed: ' + repr(err))
        continue

    rows.append({'순위': rank,
                 '공감종류': what_sym,
                 '기사제목': title,
                 '기사링크': 'https://entertain.naver.com' + link,
                 '기사내용': summary,
                 '공감수': count,
                 '수집일자': datetime.datetime.now(timezone('Asia/Seoul')).strftime('%Y-%m-%d %H:%M:%S')})

    print('Complets of ' + rank + ' : ' + title)

# Append all collected rows at once, keeping the existing column order.
if rows:
    data = pd.concat([data, pd.DataFrame(rows, columns=data.columns)],
                     ignore_index=True)

# Replace every non-word character in the titles with a space. The result is
# assigned back explicitly: `replace(..., inplace=True)` returns None, so the
# previous assignment of its return value was meaningless. A raw string is
# used so that `\w` is not treated as an invalid string escape.
data['기사제목'] = data['기사제목'].replace(r'[^\w]', ' ', regex=True)

# Join all titles into a single text for the word cloud.
data_text = " ".join(data['기사제목'].astype(str))

if data_text.strip():
    plt.subplots(figsize=(25, 15))
    # `fontpath` points at a Korean-capable font so Hangul words render.
    wordcloud = WordCloud(background_color='white', width=1000, height=700, font_path=fontpath).generate(data_text)
    plt.axis('off')
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.show()
else:
    # WordCloud.generate() raises ValueError when there are no words, which
    # happens when no article could be collected.
    print('수집된 기사 제목이 없어 워드 클라우드를 생성할 수 없습니다.')

LIST