| import re |
| import sys |
| import time |
| from collections import Counter |
| from datetime import datetime, timedelta |
|
|
| |
# Emit UTF-8 on stdout so the non-ASCII log messages printed by this script
# survive consoles whose default encoding cannot represent them. Streams that
# do not expose ``reconfigure`` are left untouched.
if hasattr(sys.stdout, "reconfigure"):
    sys.stdout.reconfigure(encoding="utf-8")
|
|
| import pandas as pd |
| from selenium import webdriver |
| from selenium.webdriver.chrome.service import Service |
| from selenium.webdriver.common.by import By |
| from webdriver_manager.chrome import ChromeDriverManager |
|
|
| |
| categories_sid = { |
| "๊ฒฝ์ ": "101", |
| "IT/๊ณผํ": "105", |
| } |
| NUM_ARTICLES_PER_DATE_CAT = 20 |
|
|
| |
# Keywords that mark an article as AI-related. The literals were previously
# stored mis-decoded (UTF-8 bytes interpreted as a Thai single-byte code page),
# which made every Korean entry unmatchable against real article text; they are
# restored to the intended Hangul here.
AI_KEYWORDS = [
    "AI", "인공지능", "생성형 AI", "대규모언어모델", "LLM", "GPT",
    "제미나이", "Gemini", "클로드", "Claude", "머신러닝", "딥러닝",
]

# Keywords that mark an article as finance/fintech-related.
# NOTE(review): the filter in this file lowercases text and strips spaces before
# substring matching, so short ASCII entries such as "ST" (and "AI" above) match
# inside unrelated words ("first", "email"). Confirm whether "ST" was meant to
# be "STO" or needs boundary-aware matching.
FIN_KEYWORDS = [
    "핀테크", "금융", "은행", "카드", "증권", "페이", "송금", "결제",
    "자산관리", "신용평가", "신용", "투자", "마이데이터", "로보어드바이저",
    "인터넷은행", "인슈어테크", "자산운용", "카카오뱅크", "토스뱅크",
    "케이뱅크", "네이버페이", "카카오페이", "토스", "주식", "뱅킹",
    "디지털 금융", "ST", "토큰증권", "FDS", "금융 사기", "이상거래",
]

# Combined list, AI keywords first.
FINTECH_AI_KEYWORDS = AI_KEYWORDS + FIN_KEYWORDS
|
|
# Start the headless Chrome session shared by the whole crawl.
# ChromeDriverManager().install() supplies the driver path handed to Service.
# The log strings below were stored mis-decoded and are restored to Korean.
print("[INIT] ChromeDriver 초기화 중...")
service = Service(ChromeDriverManager().install())
options = webdriver.ChromeOptions()
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--headless")
driver = webdriver.Chrome(service=service, options=options)
print("[INIT] [OK] 브라우저 실행 완료")
|
|
|
|
def _is_article_url(url) -> bool:
    """Return True when ``url`` looks like a Naver article page (not a comment page)."""
    return bool(
        url
        and "news.naver.com" in url
        and "/article/" in url
        and "/comment/" not in url
    )


def get_article_links(driver, sid: str, target_date: str, num_articles: int) -> list[str]:
    """Collect up to ``num_articles`` unique article URLs from a Naver News list.

    Loads the paginated section list for ``sid`` on ``target_date`` and scans
    each page with several CSS selectors, keeping anchors whose ``href`` points
    at an article page.

    Args:
        driver: Selenium WebDriver used to load pages and query elements.
        sid: Value sent as the ``sid1`` query parameter.
        target_date: Value sent as the ``date`` query parameter (the caller in
            this file passes ``YYYYMMDD``).
        num_articles: Maximum number of links to return.

    Returns:
        Article URLs in discovery order, at most ``num_articles`` long.
        Page-load and element-lookup errors are logged and skipped, never raised.
    """
    if num_articles <= 0:
        return []

    article_links: list[str] = []
    seen: set[str] = set()  # O(1) duplicate check; the list keeps discovery order

    # One page beyond num_articles/20 as headroom
    # (assumes roughly 20 usable links per list page -- TODO confirm).
    max_pages = (num_articles // 20) + 1

    # Several selectors are tried on every page; which one matches depends on
    # the markup actually served.
    selectors = [
        ".list_body a",
        "ul.type06_headline a",
        "ul.type06 a",
        "a.sa_text_title",
        ".sa_text a",
    ]

    for page in range(1, max_pages + 1):
        page_url = (
            "https://news.naver.com/main/list.naver"
            f"?mode=LSD&mid=sec&sid1={sid}&date={target_date}&page={page}"
        )
        print(f" [LINK] 페이지 이동 (Page {page}): {page_url}")
        try:
            driver.get(page_url)
            time.sleep(1.5)  # give the page time to render before querying
        except Exception as e:
            print(f" [LINK] ⚠️ 페이지 로드 오류 (스킵): {e}")
            continue

        found_in_page = 0
        for selector in selectors:
            try:
                elements = driver.find_elements(By.CSS_SELECTOR, selector)
            except Exception as e:
                # A failed lookup for one selector must not abort the whole crawl.
                print(f" [LINK] ⚠️ 셀렉터 조회 오류 (스킵): {selector} - {e}")
                continue

            for element in elements:
                try:
                    url = element.get_attribute("href")
                except Exception:
                    # The element is no longer usable (e.g. it went stale); skip it.
                    continue
                if not _is_article_url(url) or url in seen:
                    continue
                seen.add(url)
                article_links.append(url)
                found_in_page += 1
                if len(article_links) >= num_articles:
                    break
            if len(article_links) >= num_articles:
                break

        print(f" -> Page {page}에서 {found_in_page}개 기사 링크 확보 (누적: {len(article_links)}개)")
        # Stop once enough links are collected or a page yields nothing new.
        if len(article_links) >= num_articles or found_in_page == 0:
            break

    print(f" [LINK] ✅ {target_date} 일자 총 {len(article_links)}개 링크 확보\n")
    return article_links[:num_articles]
|
|
|
|
def _first_nonempty_text(driver, selectors) -> str:
    """Return the stripped text of the first selector that yields non-blank text, else ''."""
    for sel in selectors:
        try:
            text = driver.find_element(By.CSS_SELECTOR, sel).text.strip()
        except Exception:
            # Selector absent on this page layout; try the next candidate.
            continue
        if text:
            return text
    return ""


def parse_article_detail(driver, article_url, category):
    """Load one article page and extract its fields into a dict.

    Args:
        driver: Selenium WebDriver used to load and query the page.
        article_url: URL of the article page.
        category: Category label stored unchanged in the result.

    Returns:
        Dict with keys ``article_id``, ``title``, ``content``, ``url``,
        ``published_date``, ``source``, ``author`` and ``category``. Fields that
        cannot be extracted stay ``""``; the function never raises for
        page-load or lookup failures (they are logged instead), so the caller
        can treat an empty ``title``/``content`` as a failed parse.
    """
    # The id comes from the URL alone, so compute it before touching the browser.
    match = re.search(r"article/(\d+)/(\d+)", article_url)
    article_data = {
        "article_id": (
            f"ART_{match.group(1)}_{match.group(2)}"
            if match
            else f"ART_{datetime.now().strftime('%Y%m%d%H%M%S')}"
        ),
        "title": "",
        "content": "",
        "url": article_url,
        "published_date": "",
        "source": "",
        "author": "",
        "category": category,
    }
    try:
        # Loading inside the try keeps one unreachable article from aborting the
        # whole crawl; the caller sees empty title/content instead.
        driver.get(article_url)
        time.sleep(1.5)

        article_data["title"] = _first_nonempty_text(
            driver,
            [
                "#title_area span",
                "#ct .media_end_head_headline",
                ".media_end_head_headline",
                "h2#title_area",
                ".news_end_title",
            ],
        )
        article_data["content"] = _first_nonempty_text(
            driver,
            [
                "#dic_area",
                "article#dic_area",
                ".go_trans._article_content",
                "._article_body_contents",
            ],
        )

        # Publisher: prefer the logo image's alt text, then the text-only logo.
        source = ""
        try:
            logo = driver.find_element(By.CSS_SELECTOR, "a.media_end_head_top_logo img")
            source = (logo.get_attribute("alt") or "").strip()
        except Exception:
            # No image logo on this page; fall through to the text logo.
            source = ""
        if not source:
            source = _first_nonempty_text(driver, [".media_end_head_top_logo_text"])
        article_data["source"] = source

        # Publish time: the data attribute wins over the rendered text. When no
        # timestamp element exists the field stays empty rather than being set
        # to "now", so the caller can substitute the date it was crawling.
        try:
            stamp = driver.find_element(
                By.CSS_SELECTOR,
                "span.media_end_head_info_datestamp_time, span[data-date-time]",
            )
            article_data["published_date"] = (
                stamp.get_attribute("data-date-time") or stamp.text or ""
            ).strip()
        except Exception:
            article_data["published_date"] = ""

        article_data["author"] = _first_nonempty_text(
            driver,
            ["em.media_end_head_journalist_name, span.byline_s"],
        )
    except Exception as e:
        print(f" [PARSE] [WARN] 파싱 오류: {e}")
    return article_data
|
|
|
|
| |
# Accumulators filled by the crawl loop below.
all_articles = []    # one dict per successfully parsed article
category_stats = {}  # "<category>_<YYYYMMDD>" -> {"ok": int, "fail": int}

# Today and the six preceding days (local clock), newest first, as YYYYMMDD.
target_dates = [(datetime.now() - timedelta(days=i)).strftime("%Y%m%d") for i in range(7)]

print(f"[CRAWL] [DATE] 대상 수집 날짜 (7일): {target_dates}")

try:
    for target_date in target_dates:
        print(f"\n{'=' * 60}")
        print(f"[CRAWL] [DATE] {target_date} 일자 수집 시작")
        print(f"{'=' * 60}")

        for category_name, sid in categories_sid.items():
            print(f"\n [CRAWL] [{category_name} - {target_date}] 카테고리 수집 시작")

            article_links = get_article_links(driver, sid, target_date, NUM_ARTICLES_PER_DATE_CAT)

            cat_key = f"{category_name}_{target_date}"
            cat_ok, cat_fail = 0, 0

            for idx, article_url in enumerate(article_links, 1):
                print(f" [PARSE] ({idx}/{len(article_links)}) {article_url[:70]}...")
                article_data = parse_article_detail(driver, article_url, category_name)

                if article_data["title"] and article_data["content"]:
                    published = article_data["published_date"]
                    # No usable timestamp (empty, or no 20xx year in it): fall
                    # back to the list date being crawled, at a nominal 09:00.
                    if not published or not re.search(r"20\d{2}", published):
                        article_data["published_date"] = (
                            f"{target_date[:4]}-{target_date[4:6]}-{target_date[6:]} 09:00"
                        )

                    all_articles.append(article_data)
                    cat_ok += 1
                    print(f" [OK] {article_data['title'][:40]}...")
                    print(f" 언론사: {article_data['source']} | 날짜: {article_data['published_date']}")
                else:
                    cat_fail += 1
                    missing = [
                        label
                        for label, value in (
                            ("제목", article_data["title"]),
                            ("본문", article_data["content"]),
                        )
                        if not value
                    ]
                    print(f" [FAIL] 파싱실패 ({', '.join(missing)} 없음)")
                # Short pause between article requests.
                time.sleep(0.5)

            category_stats[cat_key] = {"ok": cat_ok, "fail": cat_fail}
            print(f"\n [CRAWL] [{category_name} - {target_date}] 완료: 성공 {cat_ok}개 / 실패 {cat_fail}개")
finally:
    # Always release the browser process, even when the crawl aborts part-way.
    driver.quit()
    print("\n[DONE] 브라우저 종료")
# Per-(category, date) crawl results followed by overall totals.
print(f"\n{'=' * 60}")
print("[SUMMARY] 수집 결과 Summary")
print(f"{'=' * 60}")
for cat_key, s in category_stats.items():
    print(f" {cat_key}: 성공 {s['ok']}건 / 실패 {s['fail']}건")
total_ok = sum(stats["ok"] for stats in category_stats.values())
total_fail = sum(stats["fail"] for stats in category_stats.values())
print(f" 전체 수집: 성공 {total_ok}건 / 실패 {total_fail}건")

# Every successfully parsed article, one row per article.
df_all = pd.DataFrame(all_articles)
|
|
|
|
| |
def _normalize_for_match(text):
    """Lowercase ``text`` and drop spaces so matching ignores case and spacing."""
    return text.lower().replace(" ", "")


def _match_keywords(normalized_text, needles):
    """Return the keywords whose normalized form occurs in ``normalized_text``.

    ``needles`` is a list of ``(keyword, normalized_keyword)`` pairs.
    """
    return [kw for kw, needle in needles if needle in normalized_text]


print(f"\n{'=' * 60}")
print("[FILTER] 금융 AI 듀얼 하이브리드 필터링 시작")
print("[FILTER] - 경제 섹션 기사: AI 키워드 존재 시 통과")
print("[FILTER] - IT/과학 섹션 기사: 금융 키워드 존재 시 통과")
print(f"{'=' * 60}")

# Normalize each keyword once instead of once per article.
ai_needles = [(kw, _normalize_for_match(kw)) for kw in AI_KEYWORDS]
fin_needles = [(kw, _normalize_for_match(kw)) for kw in FIN_KEYWORDS]

# Cross-domain rule keyed by section id: section 101 (economy) articles must
# mention an AI keyword, section 105 (IT/science) articles a finance keyword.
# NOTE(review): the categories_sid keys appear mis-decoded as well; dispatching
# on the section id keeps this filter independent of the key text.
required_needles_by_sid = {"101": ai_needles, "105": fin_needles}

filtered_articles = []
for record in df_all.to_dict("records"):
    needles = required_needles_by_sid.get(categories_sid.get(record["category"]))
    if needles is None:
        # Category without a rule: never passes.
        continue
    text_clean = _normalize_for_match(f"{record['title']} {record['content']}")
    matched_info = _match_keywords(text_clean, needles)
    if matched_info:
        filtered_articles.append({**record, "matched_keywords": ", ".join(matched_info)})

df_filtered = pd.DataFrame(filtered_articles)

print(f" 전체 수집: {len(df_all)}건")
print(f" AI 핀테크 교차 필터링 통과: {len(df_filtered)}건 ({len(df_filtered) / max(len(df_all), 1) * 100:.1f}%)")
print("\n [도메인별 매칭 요약]")
all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
kw_counts = Counter(all_kw)
print(" --- AI 기술 키워드 매칭 ---")
for kw in AI_KEYWORDS:
    if kw_counts.get(kw, 0) > 0:
        print(f" {kw}: {kw_counts.get(kw, 0)}건")
print(" --- 금융/핀테크 키워드 매칭 ---")
for kw in FIN_KEYWORDS:
    if kw_counts.get(kw, 0) > 0:
        print(f" {kw}: {kw_counts.get(kw, 0)}건")

# Bare expression: shows the frame when run as a notebook cell; no effect in a script.
df_filtered
|
|
| |
import os

# Write the filtered articles to a timestamped workbook under
# src/graphBuilder/scrapping, creating the directory when needed.
output_dir = os.path.join("src", "graphBuilder", "scrapping")
os.makedirs(output_dir, exist_ok=True)
output_filename = os.path.join(output_dir, f"Articles_{datetime.now().strftime('%Y%m%d_%H%M%S')}.xlsx")
df_filtered.to_excel(output_filename, index=False, engine="openpyxl")
print(f"[SAVE] [OK] 저장 완료: {output_filename}")
print(f"[SAVE] - AI 핀테크 기사: {len(df_filtered)}건")
|
|
|
|
| |
# Bar chart of how often each keyword matched across the filtered articles.
# The whole step is skipped when matplotlib is not installed.
try:
    import platform
    from collections import Counter

    import matplotlib.pyplot as plt

    # Pick a font by operating system so Hangul labels can be drawn.
    if platform.system() == "Windows":
        plt.rc("font", family="Malgun Gothic")
    elif platform.system() == "Darwin":
        plt.rc("font", family="AppleGothic")
    else:
        plt.rc("font", family="NanumGothic")
    plt.rcParams["axes.unicode_minus"] = False

    if not filtered_articles:
        print("시각화할 데이터가 없습니다.")
    else:
        # Recount from the stored "matched_keywords" column so this step does
        # not depend on variables left over from the filtering step.
        all_kw = [kw for row in filtered_articles for kw in row["matched_keywords"].split(", ")]
        kw_counts = Counter(all_kw)

        # Every configured keyword gets a bar, including those with zero hits.
        keywords = FINTECH_AI_KEYWORDS
        counts = [kw_counts.get(kw, 0) for kw in keywords]

        plt.figure(figsize=(12, 6))
        bars = plt.bar(keywords, counts, color="skyblue", edgecolor="white")

        # Print the count above each bar.
        for bar in bars:
            height = bar.get_height()
            plt.text(
                bar.get_x() + bar.get_width() / 2.0,
                height,
                # Counts are whole numbers; int() keeps the label free of a
                # trailing ".0" should the height come back as a float.
                f"{int(height)}",
                ha="center",
                va="bottom",
                size=11,
                fontweight="bold",
                color="black",
            )

        plt.title("수집된 AI 핀테크 기사 키워드 출현 빈도 (전체)", fontsize=15, pad=15)
        plt.xlabel("키워드", fontsize=12)
        plt.ylabel("출현 횟수 (건)", fontsize=12)
        plt.grid(axis="y", linestyle="--", alpha=0.7)
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()
except ImportError:
    print("[INFO] matplotlib 라이브러리가 설치되어 있지 않아 시각화 단계를 건너뜁니다.")
|
|
|
|