Spaces:

NzTama
/

Sentiment

Runtime error

App Files Files Community

Sentiment / services /news.py

NzTama

Initial clean deploy: Sentiment Analysis

fa8ff66 2 months ago

raw

history blame contribute delete

16.5 kB

	"""
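	news.py – News scraper dispatcher.
	Exports: scrape_news(portal, pages, keyword) -> list[dict]

	portal: 'detik', 'radar' (radarcirebon.disway.id), 'antara',
	'cnn' (cnnindonesia.com) or 'radarcirebon' (radarcirebon.id);
	the aliases and domain names listed in _PORTAL_MAP are accepted as well.
	Each result dict has the keys 'judul', 'tanggal', 'tag', 'isi_berita', 'link'.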
	"""
	from __future__ import annotations

	import random
	import re
	import time
	from urllib.parse import quote, quote_plus, urlparse, urlunparse

	import requests
	from bs4 import BeautifulSoup


	# ── Shared HTTP session helpers ────────────────────────────────────────────────

	# Browser-like request headers applied to every requests.Session created
	# by the scrapers in this module.
	_HEADERS = {
	    "User-Agent": (
	        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
	    ),
	    "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
	    # "*/*" is the catch-all media range; a bare "/" is not a valid one.
	    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
	}

	def _get(sess: requests.Session, url: str, retries: int = 3, delay: float = 3.0):
	    """Fetch *url* through *sess*, retrying when the request fails.

	    Args:
	        sess: Session used to issue the GET request.
	        url: Address to fetch.
	        retries: Maximum number of attempts.
	        delay: Seconds to sleep between two consecutive attempts.

	    Returns:
	        The response of the first attempt that completes with a non-error
	        HTTP status, or ``None`` when every attempt failed (or when
	        *retries* is not positive).
	    """
	    for attempt in range(retries):
	        try:
	            resp = sess.get(url, timeout=20, allow_redirects=True)
	            resp.raise_for_status()
	        except requests.RequestException:
	            # Only network/HTTP failures are treated as retryable; anything
	            # else is a programming error and is allowed to propagate.
	            if attempt < retries - 1:
	                time.sleep(delay)
	        else:
	            return resp
	    return None

	def _extract_paragraphs(soup, container_classes: list, min_len: int = 30) -> list:
	    """Collect the paragraph texts of an article page.

	    The first ``<div>`` whose class matches an entry of *container_classes*
	    (tried in order) limits the search; when none is found the whole
	    document is scanned. Paragraphs that are empty, shorter than *min_len*
	    characters, or that start with a "baca juga" / "lihat juga" /
	    "advertisement" marker are dropped.

	    Returns:
	        The remaining paragraph texts, in document order.
	    """
	    skipped_prefixes = ("baca juga", "lihat juga", "advertisement")

	    # Lazily probe the candidate containers and stop at the first hit.
	    candidates = (soup.find("div", class_=name) for name in container_classes)
	    root = next((div for div in candidates if div), None) or soup

	    paragraphs = (node.get_text(" ", strip=True) for node in root.find_all("p"))
	    return [
	        text
	        for text in paragraphs
	        if text
	        and len(text) >= min_len
	        and not text.lower().startswith(skipped_prefixes)
	    ]


	# ── Detik.com ──────────────────────────────────────────────────────────────────

	def _scrape_detik(keyword: str, max_pages: int = 1) -> list:
	    """Scrape detik.com search results for *keyword*.

	    Walks up to *max_pages* search-result pages, opens every listed article
	    and extracts its title, date, tags and body text.

	    Args:
	        keyword: Search phrase; it is URL-encoded before being sent.
	        max_pages: Number of result pages to walk.

	    Returns:
	        One dict per article with the keys ``judul``, ``tanggal``
	        (``YYYY-MM-DD HH:MM`` or empty), ``tag``, ``isi_berita`` and
	        ``link``. Articles that cannot be fetched or parsed are skipped.
	    """
	    import datetime

	    sess = requests.Session()
	    sess.headers.update(_HEADERS)
	    results = []
	    # Encode the keyword so spaces, "&" or "#" cannot corrupt the query string.
	    query = quote_plus(keyword)

	    for page in range(1, max_pages + 1):
	        r = _get(sess, f"https://www.detik.com/search/searchall?query={query}&sortby=time&page={page}&siteid=2")
	        if not r:
	            break
	        soup = BeautifulSoup(r.text, "html.parser")
	        news_list = soup.find_all('div', class_='media')
	        if not news_list:
	            break

	        for news in news_list:
	            try:
	                title_tag = news.find('h3', class_='media__title')
	                if not title_tag:
	                    continue
	                link_tag = title_tag.find('a', class_='media__link')
	                if not link_tag or not link_tag.has_attr('href'):
	                    continue
	                link = link_tag['href']
	                title = link_tag.text.strip()

	                # The listing carries the publication time as an epoch value
	                # in the "d-time" attribute; a malformed value only costs
	                # the date, not the whole article.
	                news_date = None
	                date_tag = news.find('div', class_='media__date')
	                span_tag = date_tag.find('span') if date_tag else None
	                if span_tag and span_tag.has_attr('d-time'):
	                    try:
	                        news_date = datetime.datetime.fromtimestamp(int(span_tag['d-time']))
	                    except (ValueError, OverflowError, OSError):
	                        news_date = None

	                news_resp = _get(sess, link)
	                if not news_resp:
	                    continue
	                news_soup = BeautifulSoup(news_resp.text, 'html.parser')

	                content_div = (
	                    news_soup.find('div', class_='detail__body-text')
	                    or news_soup.find('div', class_='detail_text')
	                )
	                content = ""
	                if content_div:
	                    parts = []
	                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
	                        text = tag.get_text(strip=True)
	                        if text:
	                            # Headings are kept, marked with their level ("H2: ...").
	                            prefix = tag.name.upper() if tag.name.startswith('h') else ''
	                            parts.append(f"{prefix}: {text}" if prefix else text)
	                    content = '\n'.join(parts)

	                nav_div = (
	                    news_soup.find('div', class_='detail_tag')
	                    or news_soup.find('div', class_='tag__list')
	                    or news_soup.find('div', class_='nav')
	                )
	                tags = [a.text.strip() for a in nav_div.find_all('a')] if nav_div else []

	                results.append({
	                    'judul': title,
	                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
	                    'tag': ', '.join(tags),
	                    'isi_berita': content,
	                    'link': link,
	                })
	            except Exception:
	                # Best effort: one unparsable article must not abort the run.
	                continue
	        # NOTE(review): the original indentation was lost; this pause is
	        # assumed to apply once per result page — confirm.
	        time.sleep(2)
	    return results


	# ── Radar ──────────────────────────────────────────────────────────────────────

	def _scrape_radar(keyword: str, max_pages: int = 1) -> list:
	    """Scrape radarcirebon.disway.id search results for *keyword*.

	    Walks up to *max_pages* search-result pages (30 results per page offset),
	    opens every listed article and extracts its title, date, tags and body.

	    Returns:
	        One dict per article with the keys ``judul``, ``tanggal``, ``tag``
	        (``"-"`` when none), ``isi_berita`` and ``link``. Articles that
	        cannot be fetched or parsed are skipped.
	    """
	    BASE_HOST = "https://radarcirebon.disway.id"
	    sess = requests.Session()
	    sess.headers.update(_HEADERS)
	    results = []
	    q = quote_plus(keyword)

	    def _abs(href):
	        """Turn a possibly relative href into an absolute URL (None if empty)."""
	        if not href:
	            return None
	        href = href.strip()
	        return href if href.startswith("http") else BASE_HOST + "/" + href.lstrip("/")

	    for page in range(1, max_pages + 1):
	        offset = (page - 1) * 30
	        if page > 1:
	            url = f"{BASE_HOST}/search/kata/{offset}/{offset}/?c={q}&num="
	        else:
	            url = f"{BASE_HOST}/search/kata/?c={q}&num="

	        r = _get(sess, url)
	        if not r:
	            break
	        soup = BeautifulSoup(r.text, "html.parser")

	        news_list = soup.find_all(class_='media-heading') or soup.find_all('div', class_='media')
	        if not news_list:
	            # No results on this page: later pages will not have any either.
	            break

	        for item in news_list:
	            try:
	                a = item.find('a', href=True)
	                if not a:
	                    continue
	                link = _abs(a.get('href'))
	                if not link:
	                    # An empty href would only burn retries inside _get().
	                    continue
	                title = a.get_text(strip=True)

	                detail_r = _get(sess, link)
	                if not detail_r:
	                    continue
	                detail_soup = BeautifulSoup(detail_r.text, "html.parser")

	                h1 = detail_soup.find('h1', class_='text-black') or detail_soup.find('h1')
	                title_detail = h1.get_text(strip=True) if h1 else title

	                date_detail_tag = detail_soup.find('span', class_='date') or detail_soup.find(class_='date')
	                date_text = date_detail_tag.get_text(strip=True) if date_detail_tag else ""

	                content_container = (
	                    detail_soup.find('div', class_='entry-content')
	                    or detail_soup.find('div', class_='post-content')
	                )
	                content = ""
	                if content_container:
	                    paragraphs = (p.get_text(strip=True) for p in content_container.find_all('p'))
	                    # Drop the "Baca Juga:" cross-links embedded in the body.
	                    content = "\n".join(t for t in paragraphs if 'Baca Juga:' not in t)

	                tag_links = detail_soup.find_all('a', href=lambda href: href and '/listtag/' in href)
	                tags = [a_tag.get('title', '').strip() for a_tag in tag_links if a_tag.get('title')]

	                results.append({
	                    "judul": title_detail,
	                    "tanggal": date_text,
	                    "tag": ", ".join(tags) if tags else "-",
	                    "isi_berita": content,
	                    "link": link,
	                })
	            except Exception:
	                # Best effort: one unparsable article must not abort the run.
	                continue
	        # NOTE(review): the original indentation was lost; this pause is
	        # assumed to apply once per result page — confirm.
	        time.sleep(2)
	    return results

	# ── Antara ─────────────────────────────────────────────────────────────────────

	def _scrape_antara(keyword: str, max_pages: int = 1) -> list:
	    """Scrape antaranews.com search results for *keyword*.

	    Walks up to *max_pages* search-result pages, follows every link whose
	    URL contains ``/berita/`` and extracts title, date, tags and body text.

	    Returns:
	        One dict per article with the keys ``judul``, ``tanggal``, ``tag``
	        (``"-"`` when none), ``isi_berita`` and ``link``, in the order the
	        links appear on the result pages; each article is returned once.
	    """
	    BASE_HOST = "https://www.antaranews.com"
	    sess = requests.Session()
	    sess.headers.update(_HEADERS)
	    results = []
	    q = quote_plus(keyword)
	    seen = set()  # article URLs already scraped, across all pages

	    def _norm(href):
	        """Return an absolute URL without query/fragment, or None if unusable."""
	        if not href:
	            return None
	        href = href.strip()
	        if href.startswith("/"):
	            href = BASE_HOST + href
	        elif not href.startswith("http"):
	            return None
	        return urlunparse(urlparse(href)._replace(query="", fragment="")).rstrip("/")

	    for page in range(1, max_pages + 1):
	        url = f"{BASE_HOST}/search?q={q}" + (f"&page={page}" if page > 1 else "")
	        r = _get(sess, url)
	        if not r:
	            break
	        soup = BeautifulSoup(r.text, "html.parser")

	        anchors = soup.select('a[href*="/berita/"]')
	        # dict.fromkeys de-duplicates while keeping page order, so the
	        # result order is reproducible (a set would iterate arbitrarily).
	        links = dict.fromkeys(_norm(a.get('href')) for a in anchors if a.get('href'))
	        if not links:
	            break

	        for link in links:
	            if not link or link in seen:
	                continue
	            seen.add(link)

	            detail_r = _get(sess, link)
	            if not detail_r:
	                continue
	            detail_soup = BeautifulSoup(detail_r.text, "html.parser")

	            h1 = detail_soup.select_one('div.wrap__article-detail-title h1') or detail_soup.find('h1')
	            title_detail = h1.get_text(strip=True) if h1 else ""

	            # The date is the text of the <li> that holds the calendar icon.
	            date_detail = ""
	            cal_icon = detail_soup.select_one('i.fa-calendar') or detail_soup.select_one('i.fas.fa-calendar')
	            date_item = cal_icon.find_parent('li') if cal_icon else None
	            if date_item:
	                date_detail = date_item.get_text(" ", strip=True)

	            content_parts = _extract_paragraphs(
	                detail_soup, ["wrap__article-detail-content", "detail__body-text"]
	            )

	            tags = []
	            for a in detail_soup.select('a[href*="/tag/"]'):
	                tag_text = a.get('title') or a.get_text(strip=True)
	                if tag_text:
	                    tags.append(tag_text)

	            results.append({
	                "judul": title_detail,
	                "tanggal": date_detail,
	                "tag": ", ".join(dict.fromkeys(tags)) if tags else "-",
	                "isi_berita": "\n".join(content_parts),
	                "link": link,
	            })
	    return results

	# ── CNN ────────────────────────────────────────────────────────────────────────

	def _scrape_cnn(keyword: str, max_pages: int = 1) -> list:
	    """Scrape cnnindonesia.com search results for *keyword*.

	    The search page is loaded through a Selenium driver obtained from
	    ``_create_driver``; the article pages themselves are fetched with a
	    plain ``requests`` session.

	    Returns:
	        One dict per article with the keys ``judul``, ``tanggal``, ``tag``,
	        ``isi_berita`` and ``link`` (``"-"`` for missing values), in the
	        order the links appear on the result pages; each article is
	        returned once.
	    """
	    from selenium.webdriver.common.by import By
	    from selenium.webdriver.support.ui import WebDriverWait
	    from selenium.webdriver.support import expected_conditions as EC
	    from ._driver import _create_driver

	    BASE_HOST = "https://www.cnnindonesia.com"
	    results = []
	    q = quote(keyword)
	    seen = set()  # article URLs already scraped, across all pages

	    # Article URLs look like ".../<14-digit timestamp>-<2-3 digits>-<6+ digits>/...".
	    article_path = re.compile(r'/\d{14}-\d{2,3}-\d{6,}')
	    # Surrounding whitespace is optional so the heading matches whether or
	    # not the markup pads the text.
	    related_topics = re.compile(r'\s*TOPIK TERKAIT\s*')

	    sess = requests.Session()
	    sess.headers.update(_HEADERS)

	    driver = _create_driver(mobile=False)
	    try:
	        for page in range(1, max_pages + 1):
	            url = f"{BASE_HOST}/search?query={q}&result_type=latest" + (f"&page={page}" if page > 1 else "")
	            driver.get(url)

	            if page == 1:
	                # Dismiss the consent dialog when one shows up; its absence is fine.
	                try:
	                    WebDriverWait(driver, 5).until(
	                        EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))
	                    ).click()
	                except Exception:
	                    pass

	            try:
	                WebDriverWait(driver, 15).until(
	                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a"))
	                )
	            except Exception:
	                # No result list rendered for this page; try the next one.
	                continue

	            soup = BeautifulSoup(driver.page_source, "html.parser")
	            # dict.fromkeys de-duplicates while keeping page order.
	            links = dict.fromkeys(
	                a['href']
	                for a in soup.select('div.nhl-list article a[href]')
	                if article_path.search(urlparse(a['href']).path)
	            )

	            for link in links:
	                if link in seen:
	                    continue
	                seen.add(link)

	                html = _get(sess, link)
	                if not html:
	                    continue
	                ds = BeautifulSoup(html.text, "html.parser")

	                title_el = ds.select_one('h1')
	                title = title_el.get_text(strip=True) if title_el else "-"

	                date_el = ds.select_one('div.text-cnn_grey.text-sm')
	                date_text = date_el.get_text(strip=True) if date_el else "-"

	                # Tags are the links in the <div> that follows the
	                # "TOPIK TERKAIT" heading box.
	                tags_list = []
	                tk_header = ds.find('div', class_='title-box', string=related_topics)
	                tag_box = tk_header.find_next_sibling('div') if tk_header else None
	                if tag_box:
	                    tags_list = [t.get_text(strip=True) for t in tag_box.select('a')]

	                content_container = ds.select_one("div.detail-text")
	                if content_container:
	                    paragraphs = (p.get_text(" ", strip=True) for p in content_container.find_all('p'))
	                    content = "\n".join(t for t in paragraphs if not t.lower().startswith("lihat juga"))
	                else:
	                    content = "-"

	                results.append({
	                    "judul": title,
	                    "tanggal": date_text,
	                    "tag": ", ".join(tags_list) if tags_list else "-",
	                    "isi_berita": content,
	                    "link": link,
	                })
	    finally:
	        # Always release the browser, even when scraping fails midway.
	        driver.quit()
	    return results

	# ── RadarCirebonID ─────────────────────────────────────────────────────────────

	def _scrape_radarcirebon(keyword: str, max_pages: int = 1) -> list:
	    """Scrape radarcirebon.id search results for *keyword*.

	    Walks up to *max_pages* search-result pages, follows every post link
	    whose URL contains a ``/YYYY/MM/DD/`` date segment and extracts title,
	    date, tags and body text.

	    Returns:
	        One dict per article with the keys ``judul``, ``tanggal``, ``tag``,
	        ``isi_berita`` and ``link`` (``"-"`` for missing values), in the
	        order the links appear on the result pages; each article is
	        returned once.
	    """
	    BASE_HOST = "https://radarcirebon.id"
	    sess = requests.Session()
	    sess.headers.update(_HEADERS)
	    results = []
	    q = quote(keyword).replace('%20', '+')
	    dated_path = re.compile(r'/\d{4}/\d{2}/\d{2}/')
	    seen = set()  # article URLs already scraped, across all pages

	    for page in range(1, max_pages + 1):
	        url = f"{BASE_HOST}/search/{q}/" + (f"page/{page}/" if page > 1 else "")
	        r = _get(sess, url)
	        if not r:
	            break

	        soup = BeautifulSoup(r.text, "html.parser")
	        # .get() tolerates anchors without an href (a['href'] would raise
	        # KeyError and abort the whole run); dict.fromkeys de-duplicates
	        # while keeping page order.
	        hrefs = (a.get('href', '') for a in soup.select('article .wp-block-latest-posts__post-title a'))
	        links = dict.fromkeys(h for h in hrefs if h and dated_path.search(h))

	        for link in links:
	            if link in seen:
	                continue
	            seen.add(link)

	            detail_r = _get(sess, link)
	            if not detail_r:
	                continue
	            ds = BeautifulSoup(detail_r.text, "html.parser")

	            title_el = ds.select_one('h1.entry-title')
	            date_el = ds.select_one('time.entry-date')

	            c_parts = []
	            cc = ds.select_one('div.entry-content')
	            if cc:
	                for p in cc.select('p'):
	                    # Skip paragraphs that live inside a "read-also" box.
	                    if p.find_parent(class_='read-also'):
	                        continue
	                    t = p.get_text(" ", strip=True)
	                    if t:
	                        c_parts.append(t)

	            tc = ds.select_one('div.wp-block-tag-cloud')
	            tags = [a.get_text(strip=True) for a in tc.select('a')] if tc else []

	            results.append({
	                "judul": title_el.get_text(strip=True) if title_el else "-",
	                "tanggal": date_el.get_text(strip=True) if date_el else "-",
	                "tag": ", ".join(dict.fromkeys(tags)) if tags else "-",
	                "isi_berita": "\n".join(c_parts) if c_parts else "-",
	                "link": link,
	            })

	    return results


	# ── Public API ─────────────────────────────────────────────────────────────────

	# Alias -> scraper lookup table used by scrape_news(). Aliases are grouped
	# per scraper; insertion order is significant because the fuzzy lookup in
	# scrape_news() returns the first alias that matches.
	_PORTAL_MAP = {
	    alias: scraper
	    for scraper, aliases in (
	        (_scrape_detik, ("detik", "detik.com")),
	        (_scrape_radar, ("radar", "radardisway", "radarcirebon.disway.id")),
	        (_scrape_antara, ("antara", "antaranews", "antaranews.com")),
	        (_scrape_cnn, ("cnn", "cnnindonesia", "cnnindonesia.com")),
	        (_scrape_radarcirebon, ("radarcirebon", "radarcirebon.id")),
	    )
	    for alias in aliases
	}


	def _resolve_scraper(portal: str, portal_map: dict):
	    """Return the scraper registered in *portal_map* for *portal*, or None.

	    Resolution order:
	      1. exact alias match on the normalised name;
	      2. host match, when *portal* is a URL or domain: the host must equal
	         an alias or be a sub-domain of a domain-style alias;
	      3. loose substring match in either direction, first alias wins.
	    """
	    portal_key = portal.strip().lower().rstrip("/")
	    if not portal_key:
	        # An empty key is a substring of every alias and must not match.
	        return None
	    if portal_key in portal_map:
	        return portal_map[portal_key]

	    # Without a scheme urlparse() would treat the host as a path, so
	    # prefix "//" to make it parse the text as a network location.
	    try:
	        target = portal_key if "://" in portal_key else "//" + portal_key
	        host = urlparse(target).hostname or ""
	    except ValueError:
	        host = ""
	    if host:
	        for key, fn in portal_map.items():
	            if host == key or ("." in key and host.endswith("." + key)):
	                return fn

	    for key, fn in portal_map.items():
	        if key in portal_key or portal_key in key:
	            return fn
	    return None


	def scrape_news(portal: str, pages: int = 1, keyword: str = "kabupaten cirebon") -> list:
	    """Scrape news articles about *keyword* from the given *portal*.

	    Args:
	        portal: Portal name, alias, domain or URL; resolved against
	            ``_PORTAL_MAP``.
	        pages: Number of search-result pages to walk.
	        keyword: Search phrase.

	    Returns:
	        The list of article dicts produced by the selected scraper, or an
	        empty list when *portal* is blank or unknown or the scraper raised.
	    """
	    if not portal or not portal.strip():
	        return []

	    scraper = _resolve_scraper(portal, _PORTAL_MAP)
	    if scraper is None:
	        print(f"[News] Portal '{portal}' tidak dikenali.")
	        return []

	    print(f"[News] Scraping '{portal}' ({pages} pages, keyword='{keyword}')")
	    try:
	        return scraper(keyword, max_pages=pages)
	    except Exception as e:
	        # Top-level boundary: report the failure and return no results
	        # instead of crashing the caller.
	        print(f"[News] Error saat scraping: {e}")
	        return []