| """ | |
| ๋ด์ค ๊ธฐ์ฌ ์คํฌ๋ํ ๋๊ตฌ (์ต์ ํ ๋ฒ์ ) | |
| ์ฌ์ฉ๋ฒ: | |
| article = extract_article(url) | |
| ๋ฐํ ํ์ (JSON): | |
| { | |
| 'title': '๊ธฐ์ฌ ์ ๋ชฉ', | |
| 'text': '๊ธฐ์ฌ ๋ณธ๋ฌธ ํ ์คํธ', | |
| 'image_url': '๋ํ ์ด๋ฏธ์ง URL' | |
| } | |
| ์์กด์ฑ: pip3 install trafilatura newspaper3k playwright beautifulsoup4 requests fake-useragent extruct | |
| Playwright ์ด๊ธฐ ์ค์น: playwright install chromium | |
| ์ฑ๋ฅ ์ต์ ์์: | |
| 1. Trafilatura (๊ฐ์ฅ ๋น ๋ฅด๊ณ ์ ํ, ์ ์ ์ฝํ ์ธ ) | |
| 2. Newspaper3k (๋น ๋ฅด๊ณ ํ๊ตญ์ด ์ง์ ์ฐ์) | |
| 3. Playwright + Trafilatura (JavaScript ๋ ๋๋ง ํ์์) | |
| 4. Playwright + Newspaper3k (๋์ฒด ๋ฐฉ๋ฒ) | |
| """ | |
import json
import re
import time
from typing import Dict, Optional
from urllib.parse import urljoin

import requests
import trafilatura
from bs4 import BeautifulSoup
from newspaper import Article
# Pick a randomized browser User-Agent when fake-useragent is installed;
# otherwise fall back to a fixed modern Chrome UA string.
try:
    from fake_useragent import UserAgent
    ua = UserAgent()
    USER_AGENT = ua.random
except ImportError:
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

# Playwright is optional: without it, the JavaScript-rendering extractors
# are simply skipped (see extract_article).
try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    print("Warning: Playwright not installed - JavaScript rendering disabled")

# extruct is optional: used only for JSON-LD image extraction.
try:
    import extruct
    EXTRUCT_AVAILABLE = True
except ImportError:
    EXTRUCT_AVAILABLE = False
# Browser-like HTTP request headers (Korean-first Accept-Language) used by
# fetch_with_headers to reduce the chance of bot blocking.
HEADERS = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
def fetch_with_headers(url: str) -> str:
    """Fetch *url* with browser-like headers and return the decoded HTML.

    Args:
        url: Page to download.

    Returns:
        The response body decoded to text.

    Raises:
        requests.HTTPError: on non-2xx responses.
        requests.RequestException: on connection/timeout errors.
    """
    response = requests.get(url, headers=HEADERS, timeout=30)
    response.raise_for_status()
    # Korean news sites frequently omit the charset in Content-Type; requests
    # then assumes ISO-8859-1 and garbles EUC-KR/UTF-8 text. Fall back to the
    # content-sniffed encoding in that case.
    if 'charset' not in response.headers.get('Content-Type', '').lower():
        response.encoding = response.apparent_encoding
    return response.text
def extract_images_from_html(html: str, base_url: str = "") -> Optional[str]:
    """Extract a representative image URL from raw HTML.

    Tries, in order: og:image meta, twitter:image meta, JSON-LD structured
    data (via extruct, when installed), the first <img> inside an article
    container, and finally any sufficiently large generic <img>.

    Args:
        html: Raw HTML document.
        base_url: Base URL used to resolve relative image paths.

    Returns:
        Image URL (resolved against base_url where applicable), or None
        when nothing suitable is found.
    """
    soup = BeautifulSoup(html, 'html.parser')

    # 1. Open Graph image meta tag.
    og_image = soup.find('meta', property='og:image')
    if og_image and og_image.get('content'):
        return og_image.get('content')

    # 2. Twitter card image.
    tw_image = soup.find('meta', attrs={'name': 'twitter:image'})
    if tw_image and tw_image.get('content'):
        return tw_image.get('content')

    # 3. JSON-LD structured data parsed with extruct (Schema.org image field
    #    may be a string, an ImageObject dict, or a list of either).
    if EXTRUCT_AVAILABLE:
        try:
            metadata = extruct.extract(html, base_url=base_url)
            for item in metadata.get('json-ld', []):
                if isinstance(item, dict) and item.get('image'):
                    img = item['image']
                    if isinstance(img, str):
                        return img
                    if isinstance(img, dict) and img.get('url'):
                        return img['url']
                    if isinstance(img, list) and img:
                        first = img[0]
                        if isinstance(first, str):
                            return first
                        if isinstance(first, dict):
                            return first.get('url')
        except Exception:
            # extruct can raise on malformed markup; fall through to the
            # remaining heuristics.
            pass

    # 4. First image inside an article container.
    article_imgs = soup.select('article img[src], .article img[src], #article img[src]')
    if article_imgs:
        src = article_imgs[0].get('src')
        return urljoin(base_url, src) if src else None

    # 5. Any generic <img>, filtering out obvious non-content images.
    # NOTE: the original used a plain substring test, so 'ad' also matched
    # "upload", "header", "gradient", etc. Match segment-bounded tokens only.
    noise_re = re.compile(
        r'(?:^|[/_.\-])(?:logo|icons?|favicon|avatar|profile|ads?|banners?)(?:[/_.\-]|$)'
    )
    for img in soup.find_all('img', src=True):
        src = img.get('src')
        if not src or noise_re.search(src.lower()):
            continue
        # Require a minimum declared size when width/height attributes parse.
        try:
            if int(img.get('width', '0')) >= 200 or int(img.get('height', '0')) >= 200:
                return urljoin(base_url, src)
        except (TypeError, ValueError):
            # Non-numeric size attributes (e.g. "100%"): accept the image
            # anyway, matching the original best-effort behavior.
            return urljoin(base_url, src)
    return None
def extract_trafilatura(url: str) -> Optional[Dict[str, str]]:
    """Extract title/text/image from a news URL using Trafilatura.

    Fastest path; works for statically rendered pages.

    Args:
        url: Article URL.

    Returns:
        Dict with 'title', 'text', 'image_url' keys (values may be None),
        or None when fetching/extraction fails entirely.
    """
    try:
        html = fetch_with_headers(url)
        result = trafilatura.extract(
            html, output_format='json', url=url,
            include_images=True, include_links=True,
        )
        if result:
            data = json.loads(result)
            # Fall back to manual HTML scanning when trafilatura found no
            # lead image of its own.
            image_url = data.get('image') or extract_images_from_html(html, url)
            return {
                'title': data.get('title'),
                'text': data.get('text'),
                'image_url': image_url,
            }
    except Exception as e:
        print(f"Trafilatura failed: {e}")
    return None
def extract_newspaper(url: str) -> Optional[Dict[str, str]]:
    """Extract title/text/image from a news URL using Newspaper3k.

    Args:
        url: Article URL.

    Returns:
        Dict with 'title', 'text', 'image_url' keys, or None on failure.
    """
    try:
        # Fetch ourselves (with our headers) and hand the HTML to newspaper
        # instead of letting it download, so one fetch serves both newspaper
        # and the image fallback below.
        html = fetch_with_headers(url)
        article = Article(url)
        # NOTE(review): browser_user_agent only affects article.download(),
        # which we bypass via set_html(); kept for parity with the original.
        article.config.browser_user_agent = HEADERS['User-Agent']
        article.set_html(html)
        article.parse()
        image_url = article.top_image or extract_images_from_html(html, url)
        return {
            'title': article.title,
            'text': article.text,
            'image_url': image_url,
        }
    except Exception as e:
        print(f"Newspaper3k failed: {e}")
    return None
def get_rendered_html_playwright(url: str, wait: int = 2) -> Optional[str]:
    """Render *url* in headless Chromium and return the final HTML.

    Args:
        url: Page to load.
        wait: Extra seconds to sleep after DOMContentLoaded so client-side
            rendering can finish.

    Returns:
        Rendered page HTML, or None on any Playwright error.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=HEADERS['User-Agent'],
                    viewport={'width': 1920, 'height': 1080},
                )
                page = context.new_page()
                page.goto(url, wait_until='domcontentloaded', timeout=30000)
                time.sleep(wait)
                return page.content()
            finally:
                # Close the browser even when goto()/content() raises;
                # the original leaked it on exceptions.
                browser.close()
    except Exception as e:
        print(f"Playwright error: {e}")
        return None
def extract_playwright_trafilatura(url: str) -> Optional[Dict[str, str]]:
    """Render the page with Playwright, then extract with Trafilatura.

    Used for pages whose content only appears after JavaScript runs.

    Args:
        url: Article URL.

    Returns:
        Dict with 'title', 'text', 'image_url' keys, or None on failure.
    """
    try:
        html = get_rendered_html_playwright(url)
        if html:
            result = trafilatura.extract(
                html, output_format='json', url=url,
                include_images=True, include_links=True,
            )
            if result:
                data = json.loads(result)
                image_url = data.get('image') or extract_images_from_html(html, url)
                return {
                    'title': data.get('title'),
                    'text': data.get('text'),
                    'image_url': image_url,
                }
    except Exception as e:
        print(f"Playwright+Trafilatura failed: {e}")
    return None
def extract_playwright_newspaper(url: str) -> Optional[Dict[str, str]]:
    """Render the page with Playwright, then extract with Newspaper3k.

    Last-resort path when the trafilatura-based extractors fail.

    Args:
        url: Article URL.

    Returns:
        Dict with 'title', 'text', 'image_url' keys, or None on failure.
    """
    try:
        html = get_rendered_html_playwright(url)
        if html:
            # Pass the real URL (the original used url='') so newspaper can
            # resolve relative links/images against the article's origin,
            # matching extract_newspaper's behavior.
            article = Article(url)
            article.set_html(html)
            article.parse()
            image_url = article.top_image or extract_images_from_html(html, url)
            return {
                'title': article.title,
                'text': article.text,
                'image_url': image_url,
            }
    except Exception as e:
        print(f"Playwright+Newspaper3k failed: {e}")
    return None
def extract_article(url: str) -> Optional[Dict[str, str]]:
    """Extract a news article, trying extractors from fastest to slowest.

    Order:
        1. Trafilatura              - fastest / most accurate (static pages)
        2. Newspaper3k              - fast, good Korean support
        3. Playwright + Trafilatura - when JavaScript rendering is needed
        4. Playwright + Newspaper3k - fallback

    Results from successive extractors are merged: each still-missing field
    (title / text / image_url) is filled by the first extractor that finds it.

    Args:
        url: Article URL.

    Returns:
        {'title', 'text', 'image_url'} dict (fields may be None) when at
        least a title or body text was found; None when everything failed.
    """
    print(f"Extracting: {url}")
    result: Dict[str, Optional[str]] = {'title': None, 'text': None, 'image_url': None}

    extractors = [
        ("Trafilatura", extract_trafilatura),
        ("Newspaper3k", extract_newspaper),
    ]
    # JavaScript-rendering extractors only when Playwright is installed.
    if PLAYWRIGHT_AVAILABLE:
        extractors += [
            ("Playwright+Trafilatura", extract_playwright_trafilatura),
            ("Playwright+Newspaper3k", extract_playwright_newspaper),
        ]

    for i, (name, extractor) in enumerate(extractors, 1):
        print(f"  {i}. trying {name}...")
        try:
            data = extractor(url)
        except requests.HTTPError as e:
            # NOTE(review): the extractors swallow their own exceptions and
            # return None, so this branch is defensive. Do NOT abort on
            # 403/429 -- the Playwright extractors exist precisely to work
            # around bot blocking (the original re-raised here, skipping them).
            print(f"  x {name} blocked or failed (HTTP {e.response.status_code})")
            continue
        except Exception as e:
            print(f"  x {name} error: {e}")
            continue

        if not data:
            print(f"  x {name} failed")
            continue

        # Merge: fill only fields we do not have yet.
        filled = [key for key in result if not result[key] and data.get(key)]
        for key in filled:
            result[key] = data[key]
        if filled:
            print(f"  + extracted: {', '.join(filled)}")

        # All three fields present -> done.
        if result['title'] and result['text'] and result['image_url']:
            print(f"  done with {name} (title O, text O, image O)")
            return result

        status = ", ".join(f"{k}: {'O' if result[k] else 'X'}" for k in result)
        if result['title'] and result['text']:
            print(f"  ! no image yet - continuing ({status})")
        else:
            print(f"  ! partial result ({status})")

    if result['title'] or result['text']:
        status = ", ".join(f"{k}: {'O' if result[k] else 'X'}" for k in result)
        print(f"  final partial result ({status})")
        return result
    print("  all methods failed")
    return None
| if __name__ == "__main__": | |
| test_urls = [ | |
| "https://www.chosun.com/national/education/2025/07/19/4OMZBICJSNDGXA567IKPRBUFKA/", | |
| "https://news.nate.com/view/20250521n37437", | |
| "https://www.hani.co.kr/arti/society/society_general/1204840.html" | |
| ] | |
| print(f"์ฌ์ฉ ๊ฐ๋ฅํ ๋๊ตฌ:") | |
| print(f" - Playwright: {'โ ' if PLAYWRIGHT_AVAILABLE else 'โ'}") | |
| print(f" - Extruct: {'โ ' if EXTRUCT_AVAILABLE else 'โ'}") | |
| print(f" - Fake UserAgent: {'โ ' if 'ua' in dir() else 'โ'}\n") | |
| for url in test_urls: | |
| print(f"\n{'='*60}") | |
| try: | |
| article = extract_article(url) | |
| if article: | |
| print(f"\n๐ ์ ๋ชฉ: {article.get('title', 'N/A')[:100]}...") | |
| print(f"๐ ๋ณธ๋ฌธ: {len(article.get('text', ''))}์") | |
| print(f"๐ผ๏ธ ์ด๋ฏธ์ง: {article.get('image_url', 'N/A')[:80]}..." if article.get('image_url') else "๐ผ๏ธ ์ด๋ฏธ์ง: ์์") | |
| else: | |
| print("์ถ์ถ ์คํจ") | |
| except Exception as e: | |
| print(f"์ ์ฒด ์คํจ: {e}") | |
| print("="*60) | |