# File: FinGraph/src/graphBuilder/scrapping/finScrapping.py
# Last commit by: dev-yuje
# Commit message: feat: complete finance graph integration and fix isolation
# Commit: 47e7138
import re
import sys
import time
from collections import Counter
from datetime import datetime, timedelta
# ์œˆ๋„์šฐ ์ฝ˜์†” UnicodeEncodeError ์™„์ „ ๋ฐฉ์ง€
if hasattr(sys.stdout, 'reconfigure'):
sys.stdout.reconfigure(encoding='utf-8')
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
# Section name -> ``sid1`` query value for the sections to crawl. Two sections
# are collected so the dual hybrid filter later in the script can treat each
# one differently.
# NOTE(review): the non-ASCII literals in this file look mis-encoded
# (mojibake) in this copy; presumably they are Korean section names and
# keywords. Confirm the file is stored as UTF-8 before relying on them.
categories_sid = {
    "๊ฒฝ์ œ": "101",
    "IT/๊ณผํ•™": "105",
}
NUM_ARTICLES_PER_DATE_CAT = 20 # links per category per date (7 days * 2 categories * 20 = at most 280 links)
# AI and finance/fintech keyword lists used by the cross-domain filter.
AI_KEYWORDS = [
    "AI", "์ธ๊ณต์ง€๋Šฅ", "์ƒ์„ฑํ˜• AI", "๋Œ€๊ทœ๋ชจ์–ธ์–ด๋ชจ๋ธ", "LLM", "GPT",
    "์ œ๋ฏธ๋‚˜์ด", "Gemini", "ํด๋กœ๋“œ", "Claude", "๋จธ์‹ ๋Ÿฌ๋‹", "๋”ฅ๋Ÿฌ๋‹"
]
FIN_KEYWORDS = [
    "ํ•€ํ…Œํฌ", "๊ธˆ์œต", "์€ํ–‰", "์นด๋“œ", "์ฆ๊ถŒ", "ํŽ˜์ด", "์†ก๊ธˆ", "๊ฒฐ์ œ",
    "์ž์‚ฐ๊ด€๋ฆฌ", "์‹ ์šฉํ‰๊ฐ€", "์‹ ์šฉ", "ํˆฌ์ž", "๋งˆ์ด๋ฐ์ดํ„ฐ", "๋กœ๋ณด์–ด๋“œ๋ฐ”์ด์ €",
    "์ธํ„ฐ๋„ท์€ํ–‰", "์ธ์Šˆ์–ดํ…Œํฌ", "์ž์‚ฐ์šด์šฉ", "์นด์นด์˜ค๋ฑ…ํฌ", "ํ† ์Šค๋ฑ…ํฌ",
    "์ผ€์ด๋ฑ…ํฌ", "๋„ค์ด๋ฒ„ํŽ˜์ด", "์นด์นด์˜คํŽ˜์ด", "ํ† ์Šค", "์ฃผ์‹", "๋ฑ…ํ‚น",
    "๋””์ง€ํ„ธ ๊ธˆ์œต", "ST", "ํ† ํฐ์ฆ๊ถŒ", "FDS", "๊ธˆ์œต ์‚ฌ๊ธฐ", "์ด์ƒ๊ฑฐ๋ž˜"
]
FINTECH_AI_KEYWORDS = AI_KEYWORDS + FIN_KEYWORDS # combined list (AI first, then finance) used by the chart
# Start the single Chrome session that the rest of the script reuses.
print("[INIT] ChromeDriver ์ดˆ๊ธฐํ™” ์ค‘...")
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
# The last flag runs Chrome headless (no visible window).
for _chrome_flag in ("--no-sandbox", "--disable-dev-shm-usage", "--headless"):
    options.add_argument(_chrome_flag)
driver = webdriver.Chrome(service=service, options=options)
print("[INIT] [OK] ๋ธŒ๋ผ์šฐ์ € ์‹คํ–‰ ์™„๋ฃŒ")
def _is_article_url(url) -> bool:
    """Return True when *url* is a Naver News article link (comment pages excluded)."""
    return (
        bool(url)
        and "news.naver.com" in url
        and "/article/" in url
        and "/comment/" not in url
    )


def get_article_links(driver, sid: str, target_date: str, num_articles: int) -> list[str]:
    """Collect up to ``num_articles`` unique article URLs from a Naver News list.

    Args:
        driver: Selenium WebDriver; only ``get`` and ``find_elements`` are used.
        sid: Value placed in the ``sid1`` query parameter of the list URL.
        target_date: Value placed in the ``date`` query parameter (the caller
            in this file passes ``YYYYMMDD`` strings).
        num_articles: Maximum number of links to return.

    Returns:
        Article URLs in discovery order, without duplicates, at most
        ``num_articles`` long. Empty when ``num_articles`` is not positive.
    """
    # Nothing requested: skip the page load entirely.
    if num_articles <= 0:
        return []

    article_links: list[str] = []
    seen: set[str] = set()  # O(1) duplicate check instead of scanning the list
    # Page budget: assumes about 20 links per list page, plus one spare page.
    max_pages = (num_articles // 20) + 1
    # Several selectors are tried because the list markup differs between layouts.
    selectors = [
        ".list_body a",
        "ul.type06_headline a",
        "ul.type06 a",
        "a.sa_text_title",
        ".sa_text a",
    ]
    for page in range(1, max_pages + 1):
        page_url = f"https://news.naver.com/main/list.naver?mode=LSD&mid=sec&sid1={sid}&date={target_date}&page={page}"
        print(f" [LINK] ํŽ˜์ด์ง€ ์ด๋™ (Page {page}): {page_url}")
        try:
            driver.get(page_url)
            time.sleep(1.5)  # give the page time to render before querying it
        except Exception as e:
            # Best effort: a page that fails to load is skipped, not fatal.
            print(f" [LINK] โš ๏ธ ํŽ˜์ด์ง€ ๋กœ๋“œ ์˜ค๋ฅ˜ (์Šคํ‚ต): {e}")
            continue

        found_in_page = 0
        for selector in selectors:
            if len(article_links) >= num_articles:
                break
            for element in driver.find_elements(By.CSS_SELECTOR, selector):
                try:
                    url = element.get_attribute("href")
                except Exception:
                    # Reading the attribute failed for this element; skip it.
                    continue
                if not _is_article_url(url) or url in seen:
                    continue
                seen.add(url)
                article_links.append(url)
                found_in_page += 1
                if len(article_links) >= num_articles:
                    break

        print(f" -> Page {page}์—์„œ {found_in_page}๊ฐœ ๊ธฐ์‚ฌ ๋งํฌ ํ™•๋ณด (๋ˆ„์ : {len(article_links)}๊ฐœ)")
        # Stop once the quota is met or a page contributed nothing new.
        if len(article_links) >= num_articles or found_in_page == 0:
            break

    print(f" [LINK] โœ… {target_date} ์ผ์ž ์ด {len(article_links)}๊ฐœ ๋งํฌ ํ™•๋ณด\n")
    return article_links[:num_articles]
def _first_nonempty_text(driver, selectors) -> str:
    """Return the stripped text of the first selector yielding non-empty text, else ""."""
    for selector in selectors:
        try:
            text = driver.find_element(By.CSS_SELECTOR, selector).text.strip()
        except Exception:
            # Selector not usable on this page layout; try the next one.
            continue
        if text:
            return text
    return ""


def parse_article_detail(driver, article_url, category):
    """Open an article page and extract its fields on a best-effort basis.

    Args:
        driver: Selenium WebDriver; ``get`` and ``find_element`` are used.
        article_url: URL of the article page to load.
        category: Stored unchanged under the ``"category"`` key.

    Returns:
        A dict with the keys ``article_id``, ``title``, ``content``, ``url``,
        ``published_date``, ``source``, ``author`` and ``category``. Fields
        that cannot be extracted stay ``""``; ``published_date`` falls back to
        the current local time when its element lookup fails. A page that
        fails to load yields a dict with an empty title and content and does
        not raise.
    """
    article_data = {
        "article_id": "",
        "title": "",
        "content": "",
        "url": article_url,
        "published_date": "",
        "source": "",
        "author": "",
        "category": category,
    }

    # The id comes from the two numeric URL segments after "article/";
    # a timestamp is used when the URL does not have that shape.
    match = re.search(r"article/(\d+)/(\d+)", article_url)
    article_data["article_id"] = (
        f"ART_{match.group(1)}_{match.group(2)}" if match else f"ART_{datetime.now().strftime('%Y%m%d%H%M%S')}"
    )

    try:
        # Loading happens inside the try so one failed page load does not
        # abort the whole crawl.
        driver.get(article_url)
        time.sleep(1.5)  # give the page time to render before querying it

        article_data["title"] = _first_nonempty_text(
            driver,
            [
                "#title_area span",
                "#ct .media_end_head_headline",
                ".media_end_head_headline",
                "h2#title_area",
                ".news_end_title",
            ],
        )
        article_data["content"] = _first_nonempty_text(
            driver,
            [
                "#dic_area",
                "article#dic_area",
                ".go_trans._article_content",
                "._article_body_contents",
            ],
        )

        # Source: prefer the logo image's alt text, else the text logo.
        source = ""
        try:
            logo = driver.find_element(By.CSS_SELECTOR, "a.media_end_head_top_logo img")
            source = (logo.get_attribute("alt") or "").strip()
        except Exception:
            pass  # no image logo on this layout
        if not source:
            source = _first_nonempty_text(driver, [".media_end_head_top_logo_text"])
        article_data["source"] = source

        try:
            el = driver.find_element(
                By.CSS_SELECTOR,
                "span.media_end_head_info_datestamp_time, span[data-date-time]",
            )
            article_data["published_date"] = (el.get_attribute("data-date-time") or el.text).strip()
        except Exception:
            article_data["published_date"] = datetime.now().strftime("%Y-%m-%d %H:%M")

        article_data["author"] = _first_nonempty_text(
            driver,
            ["em.media_end_head_journalist_name, span.byline_s"],
        )
    except Exception as e:
        # ``except Exception`` (not a bare except) keeps Ctrl-C working.
        print(f" [PARSE] [WARN] ํŒŒ์‹ฑ ์˜ค๋ฅ˜: {e}")
    return article_data
# -- Stage 1: collect every article --
all_articles = []
category_stats = {}
# Date strings (YYYYMMDD) for today and the six preceding days.
target_dates = [(datetime.now() - timedelta(days=i)).strftime("%Y%m%d") for i in range(7)]
print(f"[CRAWL] [DATE] ๋Œ€์ƒ ์ˆ˜์ง‘ ๋‚ ์งœ (7์ผ): {target_dates}")

# try/finally guarantees the browser is shut down even when the crawl is
# interrupted or raises; otherwise the Chrome process would be left running.
try:
    for target_date in target_dates:
        print(f"\n{'=' * 60}")
        print(f"[CRAWL] [DATE] {target_date} ์ผ์ž ์ˆ˜์ง‘ ์‹œ์ž‘")
        print(f"{'=' * 60}")
        for category_name, sid in categories_sid.items():
            print(f"\n [CRAWL] [{category_name} - {target_date}] ์นดํ…Œ๊ณ ๋ฆฌ ์ˆ˜์ง‘ ์‹œ์ž‘")
            # Link quota for this date/category pair.
            article_links = get_article_links(driver, sid, target_date, NUM_ARTICLES_PER_DATE_CAT)
            cat_key = f"{category_name}_{target_date}"
            cat_ok, cat_fail = 0, 0
            for idx, article_url in enumerate(article_links, 1):
                print(f" [PARSE] ({idx}/{len(article_links)}) {article_url[:70]}...")
                article_data = parse_article_detail(driver, article_url, category_name)
                if article_data["title"] and article_data["content"]:
                    # A missing date, or one that does not contain "202",
                    # is replaced by the crawl date at 09:00.
                    if not article_data["published_date"] or "202" not in article_data["published_date"]:
                        formatted_date = f"{target_date[:4]}-{target_date[4:6]}-{target_date[6:]} 09:00"
                        article_data["published_date"] = formatted_date
                    all_articles.append(article_data)
                    cat_ok += 1
                    print(f" [OK] {article_data['title'][:40]}...")
                    print(f" ์–ธ๋ก ์‚ฌ: {article_data['source']} | ๋‚ ์งœ: {article_data['published_date']}")
                else:
                    cat_fail += 1
                    # Name whichever of title / body is missing in the log line.
                    missing = [
                        x
                        for x, v in [
                            ("์ œ๋ชฉ", article_data["title"]),
                            ("๋ณธ๋ฌธ", article_data["content"]),
                        ]
                        if not v
                    ]
                    print(f" [FAIL] ํŒŒ์‹ฑ์‹คํŒจ ({', '.join(missing)} ์—†์Œ)")
                time.sleep(0.5)  # short pause between article requests
            category_stats[cat_key] = {"ok": cat_ok, "fail": cat_fail}
            print(f"\n [CRAWL] [{category_name} - {target_date}] ์™„๋ฃŒ: ์„ฑ๊ณต {cat_ok}๊ฐœ / ์‹คํŒจ {cat_fail}๊ฐœ")
finally:
    driver.quit()
    print("\n[DONE] ๋ธŒ๋ผ์šฐ์ € ์ข…๋ฃŒ")

print(f"\n{'=' * 60}")
print("[SUMMARY] ์ˆ˜์ง‘ ๊ฒฐ๊ณผ Summary")
print(f"{'=' * 60}")
for cat_key, s in category_stats.items():
    print(f" {cat_key}: ์„ฑ๊ณต {s['ok']}๊ฑด / ์‹คํŒจ {s['fail']}๊ฑด")
total_ok = sum(stats["ok"] for stats in category_stats.values())
total_fail = sum(stats["fail"] for stats in category_stats.values())
print(f" ์ „์ฒด ์ˆ˜์ง‘: ์„ฑ๊ณต {total_ok}๊ฑด / ์‹คํŒจ {total_fail}๊ฑด")
df_all = pd.DataFrame(all_articles)
# -- Stage 2: dual hybrid filter --
# Articles from the first section pass when they mention an AI keyword;
# articles from the second section pass when they mention a finance keyword.
print(f"\n{'=' * 60}")
print("[FILTER] ๊ธˆ์œต AI ๋“€์–ผ ํ•˜์ด๋ธŒ๋ฆฌ๋“œ ํ•„ํ„ฐ๋ง ์‹œ์ž‘")
print("[FILTER] - ๊ฒฝ์ œ ์„น์…˜ ๊ธฐ์‚ฌ: AI ํ‚ค์›Œ๋“œ ์กด์žฌ ์‹œ ํ†ต๊ณผ")
print("[FILTER] - IT/๊ณผํ•™ ์„น์…˜ ๊ธฐ์‚ฌ: ๊ธˆ์œต ํ‚ค์›Œ๋“œ ์กด์žฌ ์‹œ ํ†ต๊ณผ")
print(f"{'=' * 60}")

# Each keyword paired with its normalized form (lower-cased, spaces removed),
# computed once instead of once per article.
# NOTE(review): matching is a case-insensitive substring test, so short Latin
# keywords such as "AI" or "ST" also match inside unrelated words (e.g.
# "said", "test"). Behavior is kept as-is; consider stricter matching.
_ai_normalized = [(kw, kw.lower().replace(" ", "")) for kw in AI_KEYWORDS]
_fin_normalized = [(kw, kw.lower().replace(" ", "")) for kw in FIN_KEYWORDS]
# Keyword family an article of each section must mention to pass.
_required_keywords = {
    "๊ฒฝ์ œ": _ai_normalized,
    "IT/๊ณผํ•™": _fin_normalized,
}

filtered_articles = []
for _, row in df_all.iterrows():
    candidates = _required_keywords.get(row["category"])
    if candidates is None:
        # Articles from any other category never pass.
        continue
    text = f"{row['title']} {row['content']}"
    text_clean = text.lower().replace(" ", "")
    matched_info = [kw for kw, needle in candidates if needle in text_clean]
    if not matched_info:
        continue
    row_dict = row.to_dict()
    # Keep the matched keywords for the summary below and the chart.
    row_dict["matched_keywords"] = ", ".join(matched_info)
    filtered_articles.append(row_dict)

df_filtered = pd.DataFrame(filtered_articles)
print(f" ์ „์ฒด ์ˆ˜์ง‘: {len(df_all)}๊ฑด")
print(f" AI ํ•€ํ…Œํฌ ๊ต์ฐจ ํ•„ํ„ฐ๋ง ํ†ต๊ณผ: {len(df_filtered)}๊ฑด ({len(df_filtered) / max(len(df_all), 1) * 100:.1f}%)")
print("\n [๋„๋ฉ”์ธ๋ณ„ ๋งค์นญ ์š”์•ฝ]")
all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
kw_counts = Counter(all_kw)
# Per-keyword hit counts, grouped by family; keywords with no hits are omitted.
for _header, _group in (
    (" --- AI ๊ธฐ์ˆ  ํ‚ค์›Œ๋“œ ๋งค์นญ ---", AI_KEYWORDS),
    (" --- ๊ธˆ์œต/ํ•€ํ…Œํฌ ํ‚ค์›Œ๋“œ ๋งค์นญ ---", FIN_KEYWORDS),
):
    print(_header)
    for kw in _group:
        if kw_counts.get(kw, 0) > 0:
            print(f" {kw}: {kw_counts.get(kw, 0)}๊ฑด")
# -- Stage 3: save the filtered articles to an Excel file --
import os

# The directory is relative to the current working directory and is created
# on demand; the file name carries a timestamp.
output_dir = os.path.join("src", "graphBuilder", "scrapping")
os.makedirs(output_dir, exist_ok=True)
output_filename = os.path.join(
    output_dir,
    "Articles_" + datetime.now().strftime("%Y%m%d_%H%M%S") + ".xlsx",
)
df_filtered.to_excel(output_filename, index=False, engine="openpyxl")
print(f"[SAVE] [OK] ์ €์žฅ ์™„๋ฃŒ: {output_filename}")
print(f"[SAVE] - AI ํ•€ํ…Œํฌ ๊ธฐ์‚ฌ: {len(df_filtered)}๊ฑด")
# -- Stage 4: bar chart of keyword frequencies --
try:
    import platform
    from collections import Counter

    import matplotlib.pyplot as plt

    # Font per operating system: Malgun Gothic on Windows, AppleGothic on
    # macOS, NanumGothic everywhere else.
    _font_by_os = {"Windows": "Malgun Gothic", "Darwin": "AppleGothic"}
    plt.rc("font", family=_font_by_os.get(platform.system(), "NanumGothic"))
    plt.rcParams["axes.unicode_minus"] = False

    if filtered_articles:
        # Count how many filtered articles matched each keyword.
        all_kw = [
            keyword
            for article in filtered_articles
            for keyword in article["matched_keywords"].split(", ")
        ]
        kw_counts = Counter(all_kw)
        # Every keyword is plotted in list order, including those with zero hits.
        keywords = FINTECH_AI_KEYWORDS
        counts = [kw_counts.get(keyword, 0) for keyword in keywords]

        plt.figure(figsize=(12, 6))
        bars = plt.bar(keywords, counts, color="skyblue", edgecolor="white")

        # Write each bar's value just above it, centred horizontally.
        _label_style = {
            "ha": "center",
            "va": "bottom",
            "size": 11,
            "fontweight": "bold",
            "color": "black",
        }
        for bar in bars:
            height = bar.get_height()
            x_center = bar.get_x() + bar.get_width() / 2.0
            plt.text(x_center, height, f"{height}", **_label_style)

        plt.title("์ˆ˜์ง‘๋œ AI ํ•€ํ…Œํฌ ๊ธฐ์‚ฌ ํ‚ค์›Œ๋“œ ์ถœํ˜„ ๋นˆ๋„ (์ „์ฒด)", fontsize=15, pad=15)
        plt.xlabel("ํ‚ค์›Œ๋“œ", fontsize=12)
        plt.ylabel("์ถœํ˜„ ํšŸ์ˆ˜ (๊ฑด)", fontsize=12)
        plt.grid(axis="y", linestyle="--", alpha=0.7)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
    else:
        print("์‹œ๊ฐํ™”ํ•  ๋ฐ์ดํ„ฐ๊ฐ€ ์—†์Šต๋‹ˆ๋‹ค.")
except ImportError:
    # The chart is optional: it is skipped when matplotlib cannot be imported.
    print("[INFO] matplotlib ๋ผ์ด๋ธŒ๋Ÿฌ๋ฆฌ๊ฐ€ ์„ค์น˜๋˜์–ด ์žˆ์ง€ ์•Š์•„ ์‹œ๊ฐํ™” ๋‹จ๊ณ„๋ฅผ ๊ฑด๋„ˆ๋œ๋‹ˆ๋‹ค.")