press_ethics / news_text_scraper.py
jonghhhh's picture
Upload folder using huggingface_hub
3c03221 verified
"""
๋‰ด์Šค ๊ธฐ์‚ฌ ์Šคํฌ๋ž˜ํ•‘ ๋„๊ตฌ (์ตœ์ ํ™” ๋ฒ„์ „)
์‚ฌ์šฉ๋ฒ•:
article = extract_article(url)
๋ฐ˜ํ™˜ ํ˜•์‹ (JSON):
{
'title': '๊ธฐ์‚ฌ ์ œ๋ชฉ',
'text': '๊ธฐ์‚ฌ ๋ณธ๋ฌธ ํ…์ŠคํŠธ',
'image_url': '๋Œ€ํ‘œ ์ด๋ฏธ์ง€ URL'
}
์˜์กด์„ฑ: pip3 install trafilatura newspaper3k playwright beautifulsoup4 requests fake-useragent extruct
Playwright ์ดˆ๊ธฐ ์„ค์น˜: playwright install chromium
์„ฑ๋Šฅ ์ตœ์  ์ˆœ์„œ:
1. Trafilatura (๊ฐ€์žฅ ๋น ๋ฅด๊ณ  ์ •ํ™•, ์ •์  ์ฝ˜ํ…์ธ )
2. Newspaper3k (๋น ๋ฅด๊ณ  ํ•œ๊ตญ์–ด ์ง€์› ์šฐ์ˆ˜)
3. Playwright + Trafilatura (JavaScript ๋ Œ๋”๋ง ํ•„์š”์‹œ)
4. Playwright + Newspaper3k (๋Œ€์ฒด ๋ฐฉ๋ฒ•)
"""
import json
import time
from typing import Optional, Dict
from urllib.parse import urljoin
import requests
import trafilatura
from bs4 import BeautifulSoup
from newspaper import Article

# Prefer a randomized browser User-Agent when fake_useragent is installed;
# otherwise fall back to a fixed, modern Chrome UA string.
try:
    from fake_useragent import UserAgent
    ua = UserAgent()
    USER_AGENT = ua.random
except ImportError:
    USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'

# Playwright is optional: when missing, the JavaScript-rendering extractors
# are skipped (see the PLAYWRIGHT_AVAILABLE checks in extract_article).
try:
    from playwright.sync_api import sync_playwright, TimeoutError as PlaywrightTimeout
    PLAYWRIGHT_AVAILABLE = True
except ImportError:
    PLAYWRIGHT_AVAILABLE = False
    print("⚠️ Playwright 미설치 - JavaScript 렌더링 기능 비활성화")

# extruct is optional: used only to mine JSON-LD metadata for images.
try:
    import extruct
    EXTRUCT_AVAILABLE = True
except ImportError:
    EXTRUCT_AVAILABLE = False

# Default HTTP headers for plain-requests fetches; mimics a desktop browser
# (Korean-preferring Accept-Language) to reduce the chance of being blocked.
HEADERS = {
    'User-Agent': USER_AGENT,
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'ko-KR,ko;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate',
    'DNT': '1',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
}
def fetch_with_headers(url: str) -> str:
    """Fetch *url* with browser-like headers and return the response body.

    Raises:
        requests.HTTPError: on a non-2xx status code.
    """
    resp = requests.get(url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    return resp.text
def extract_images_from_html(html: str, base_url: str = "") -> Optional[str]:
    """Find a representative image URL in raw HTML.

    Tries, in order: og:image meta tag, twitter:image meta tag, JSON-LD
    metadata (via extruct, when installed), the first <img> inside an
    article container, and finally any sufficiently large generic <img>.

    Args:
        html: Raw HTML document text.
        base_url: Page URL used to resolve relative image paths.

    Returns:
        An image URL (resolved against base_url where possible), or None
        when nothing suitable is found.
    """
    import re  # local import: only needed for the ad-token filter below

    soup = BeautifulSoup(html, 'html.parser')

    # 1. Open Graph image (resolve relative URLs against the page URL)
    og_image = soup.find('meta', property='og:image')
    if og_image and og_image.get('content'):
        return urljoin(base_url, og_image.get('content'))

    # 2. Twitter card image
    tw_image = soup.find('meta', attrs={'name': 'twitter:image'})
    if tw_image and tw_image.get('content'):
        return urljoin(base_url, tw_image.get('content'))

    # 3. JSON-LD metadata (Schema.org "image" property)
    if EXTRUCT_AVAILABLE:
        try:
            metadata = extruct.extract(html, base_url=base_url)
            for item in metadata.get('json-ld', []):
                if isinstance(item, dict) and item.get('image'):
                    img = item['image']
                    if isinstance(img, str):
                        return img
                    if isinstance(img, dict) and img.get('url'):
                        return img['url']
                    if isinstance(img, list) and img:
                        return img[0] if isinstance(img[0], str) else img[0].get('url')
        except Exception:
            # extruct can choke on malformed markup; fall through to the
            # remaining strategies instead of failing the whole lookup.
            pass

    # 4. First image inside an article container
    article_imgs = soup.select('article img[src], .article img[src], #article img[src]')
    if article_imgs:
        src = article_imgs[0].get('src')
        return urljoin(base_url, src) if src else None

    # 5. Any generic <img>, skipping obvious page chrome (logos, icons,
    #    ads). "ad"/"ads" is matched as a standalone token so that words
    #    like "header", "upload" or "gradient" are not falsely rejected
    #    (the original used a bare substring test for 'ad').
    ad_token = re.compile(r'(?<![a-z0-9])ads?(?![a-z0-9])')
    for img in soup.find_all('img', src=True):
        src = img.get('src')
        if not src:
            continue
        low = src.lower()
        if any(x in low for x in ('logo', 'icon', 'avatar', 'profile', 'banner')):
            continue
        if ad_token.search(low):
            continue
        # Require a minimum declared size; images whose width/height
        # attributes are absent or non-numeric are accepted as-is.
        try:
            if int(img.get('width', '0')) >= 200 or int(img.get('height', '0')) >= 200:
                return urljoin(base_url, src)
        except (TypeError, ValueError):
            return urljoin(base_url, src)
    return None
def extract_trafilatura(url: str) -> Optional[Dict[str, str]]:
    """Extract an article with Trafilatura.

    Returns a {'title', 'text', 'image_url'} dict, or None when the fetch
    or the extraction fails.
    """
    try:
        html = fetch_with_headers(url)
        extracted = trafilatura.extract(
            html, output_format='json', url=url,
            include_images=True, include_links=True,
        )
        if not extracted:
            return None
        payload = json.loads(extracted)
        return {
            'title': payload.get('title'),
            'text': payload.get('text'),
            # Fall back to generic HTML image mining when Trafilatura
            # reports no image of its own.
            'image_url': payload.get('image') or extract_images_from_html(html, url),
        }
    except Exception as exc:
        print(f"Trafilatura 실패: {exc}")
        return None
def extract_newspaper(url: str) -> Optional[Dict[str, str]]:
    """Extract an article with Newspaper3k.

    Returns a {'title', 'text', 'image_url'} dict, or None when the fetch
    or the parse fails.
    """
    try:
        html = fetch_with_headers(url)
        art = Article(url)
        art.config.browser_user_agent = HEADERS['User-Agent']
        art.set_html(html)
        art.parse()
        return {
            'title': art.title,
            'text': art.text,
            # Fall back to generic HTML image mining when Newspaper3k
            # finds no top image.
            'image_url': art.top_image or extract_images_from_html(html, url),
        }
    except Exception as exc:
        print(f"Newspaper3k 실패: {exc}")
        return None
def get_rendered_html_playwright(url: str, wait: int = 2) -> Optional[str]:
    """Render *url* in headless Chromium and return the resulting HTML.

    Args:
        url: Page to load.
        wait: Extra seconds to sleep after DOMContentLoaded so that
            late-running JavaScript can populate the page.

    Returns:
        The rendered HTML, or None on any Playwright error.
    """
    try:
        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            try:
                context = browser.new_context(
                    user_agent=HEADERS['User-Agent'],
                    viewport={'width': 1920, 'height': 1080},
                )
                page = context.new_page()
                page.goto(url, wait_until='domcontentloaded', timeout=30000)
                time.sleep(wait)  # let client-side rendering settle
                return page.content()
            finally:
                # Always release the browser, even when navigation or
                # content capture raises; the original skipped close()
                # on the error path.
                browser.close()
    except Exception as e:
        print(f"Playwright 오류: {e}")
        return None
def extract_playwright_trafilatura(url: str) -> Optional[Dict[str, str]]:
    """Render the page with Playwright, then extract via Trafilatura.

    Returns a {'title', 'text', 'image_url'} dict, or None on failure.
    """
    try:
        html = get_rendered_html_playwright(url)
        if not html:
            return None
        extracted = trafilatura.extract(
            html, output_format='json', url=url,
            include_images=True, include_links=True,
        )
        if not extracted:
            return None
        payload = json.loads(extracted)
        return {
            'title': payload.get('title'),
            'text': payload.get('text'),
            'image_url': payload.get('image') or extract_images_from_html(html, url),
        }
    except Exception as exc:
        print(f"Playwright+Trafilatura 실패: {exc}")
        return None
def extract_playwright_newspaper(url: str) -> Optional[Dict[str, str]]:
    """Render the page with Playwright, then extract via Newspaper3k.

    Returns a {'title', 'text', 'image_url'} dict, or None on failure.
    """
    try:
        html = get_rendered_html_playwright(url)
        if not html:
            return None
        art = Article(url='')
        art.set_html(html)
        art.parse()
        return {
            'title': art.title,
            'text': art.text,
            'image_url': art.top_image or extract_images_from_html(html, url),
        }
    except Exception as exc:
        print(f"Playwright+Newspaper3k 실패: {exc}")
        return None
def extract_article(url: str) -> Optional[Dict[str, str]]:
    """Extract title/text/image from a news article URL.

    Tries extractors from fastest to slowest — Trafilatura, Newspaper3k,
    then the Playwright-rendered variants when Playwright is installed —
    merging partial results until title, text and image are all found.

    Returns:
        A {'title', 'text', 'image_url'} dict (fields may be None) when at
        least a title or body text was extracted, otherwise None.

    Raises:
        requests.HTTPError: re-raised when the site blocks us (403/429).
    """
    print(f"🔍 추출 시작: {url}")
    result = {'title': None, 'text': None, 'image_url': None}
    # Fast, accurate extractors first; the Playwright variants run a real
    # browser and are an order of magnitude slower, so they come last.
    extractors = [
        ("Trafilatura", extract_trafilatura),
        ("Newspaper3k", extract_newspaper),
    ]
    if PLAYWRIGHT_AVAILABLE:
        extractors.extend([
            ("Playwright+Trafilatura", extract_playwright_trafilatura),
            ("Playwright+Newspaper3k", extract_playwright_newspaper),
        ])
    for i, (name, extractor) in enumerate(extractors, 1):
        print(f" {i}️⃣ {name} 시도...")
        try:
            data = extractor(url)
            if data:
                # Merge: keep the first non-empty value seen per field.
                updated = []
                for key in result:
                    if not result[key] and data.get(key):
                        result[key] = data[key]
                        updated.append(key)
                if updated:
                    print(f" → 추출 성공: {', '.join(updated)}")
                # All three fields present — done.
                if result['title'] and result['text'] and result['image_url']:
                    print(f" ✅ {name} 완료! (제목 O, 본문 O, 이미지 O)")
                    return result
                status = f"제목: {'O' if result['title'] else 'X'}, 본문: {'O' if result['text'] else 'X'}, 이미지: {'O' if result['image_url'] else 'X'}"
                if result['title'] and result['text']:
                    print(f" ⚠️ 이미지 없음 - 다음 단계 계속 ({status})")
                else:
                    print(f" ⚠️ 부분 성공 ({status})")
            else:
                print(f" ❌ {name} 실패")
        except requests.HTTPError as e:
            # NOTE(review): the individual extractors swallow their own
            # exceptions, so this handler is unlikely to fire — kept in
            # case an extractor is later changed to re-raise.
            # HTTPError.response can be None; guard before reading it
            # (the original would crash on status_code here).
            if e.response is not None and e.response.status_code in (403, 429):
                print(f" ❌ {name} 차단됨 (HTTP {e.response.status_code})")
                raise
            print(f" ❌ {name} 오류: {e}")
        except Exception as e:
            print(f" ❌ {name} 오류: {e}")
    # Partial result (no image, or title-only) still counts as success.
    if result['title'] or result['text']:
        print(f" ✅ 최종 결과 - 제목: {'O' if result['title'] else 'X'}, 본문: {'O' if result['text'] else 'X'}, 이미지: {'O' if result['image_url'] else 'X'}")
        return result
    print(" ❌ 모든 방법 실패")
    return None
if __name__ == "__main__":
    # Smoke test against a few Korean news sites.
    test_urls = [
        "https://www.chosun.com/national/education/2025/07/19/4OMZBICJSNDGXA567IKPRBUFKA/",
        "https://news.nate.com/view/20250521n37437",
        "https://www.hani.co.kr/arti/society/society_general/1204840.html"
    ]
    print(f"사용 가능한 도구:")
    print(f" - Playwright: {'✅' if PLAYWRIGHT_AVAILABLE else '❌'}")
    print(f" - Extruct: {'✅' if EXTRUCT_AVAILABLE else '❌'}")
    print(f" - Fake UserAgent: {'✅' if 'ua' in dir() else '❌'}\n")
    for url in test_urls:
        print(f"\n{'='*60}")
        try:
            article = extract_article(url)
            if article:
                # Fields may exist with a None value, so normalize before
                # slicing / len() — the original crashed with a TypeError
                # on dict.get('title', 'N/A')[:100] when title was None.
                title = article.get('title') or 'N/A'
                text = article.get('text') or ''
                image = article.get('image_url')
                print(f"\n📄 제목: {title[:100]}...")
                print(f"📝 본문: {len(text)}자")
                print(f"🖼️ 이미지: {image[:80]}..." if image else "🖼️ 이미지: 없음")
            else:
                print("추출 실패")
        except Exception as e:
            print(f"전체 실패: {e}")
        print("="*60)