# Sentiment / services / news.py
# NzTama's picture
# Initial clean deploy: Sentiment Analysis
# fa8ff66
"""
news.py – News scraper dispatcher.
Exports: scrape_news(portal, pages, keyword) -> list[dict]
portal: 'detik', 'radar', 'antara', 'cnn', 'radarcirebon'
"""
from __future__ import annotations
import random
import re
import time
from urllib.parse import quote, quote_plus, urlparse, urlunparse
import requests
from bs4 import BeautifulSoup
# ── Shared HTTP session helpers ────────────────────────────────────────────────
# Browser-like default headers installed on every requests.Session below.
_HEADERS = {
    "User-Agent": "".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) ",
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
        ]
    ),
    "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}
def _get(sess: requests.Session, url: str, retries: int = 3, delay: float = 3.0) -> requests.Response | None:
    """Fetch *url* through *sess*, retrying on failure.

    Args:
        sess: Session used to issue the GET request.
        url: Address to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        The response when an attempt succeeds (``raise_for_status`` did not
        raise), otherwise ``None``. Exceptions are never propagated.
    """
    for attempt in range(retries):
        try:
            resp = sess.get(url, timeout=20, allow_redirects=True)
            resp.raise_for_status()
            return resp
        except Exception as exc:
            # HTTP errors raised by raise_for_status carry the response; other
            # failures (timeouts, connection errors) have no status code.
            status = getattr(getattr(exc, "response", None), "status_code", None)
            # A client error will not go away by asking again, except for
            # "request timeout" (408) and "too many requests" (429).
            permanent = (
                isinstance(status, int)
                and 400 <= status < 500
                and status not in (408, 429)
            )
            if permanent or attempt >= retries - 1:
                print(f"[News] Gagal mengambil {url}: {exc}")
                return None
            time.sleep(delay)
    return None
def _extract_paragraphs(soup, container_classes: list, min_len: int = 30) -> list:
    """Return the usable ``<p>`` texts of an article page.

    The first ``div`` whose class matches an entry of ``container_classes``
    (tried in order) limits the search; when none matches, the whole ``soup``
    is searched. A paragraph is kept when it is non-empty, at least
    ``min_len`` characters long and does not start (case-insensitively) with
    "baca juga", "lihat juga" or "advertisement".
    """
    skip_prefixes = ("baca juga", "lihat juga", "advertisement")

    # Lazy lookup: stop querying as soon as one container is found.
    lookups = (soup.find("div", class_=name) for name in container_classes)
    root = next((hit for hit in lookups if hit), None) or soup

    def _keep(text):
        if not text or len(text) < min_len:
            return False
        return not text.lower().startswith(skip_prefixes)

    candidates = (node.get_text(" ", strip=True) for node in root.find_all("p"))
    return [text for text in candidates if _keep(text)]
# ── Detik.com ──────────────────────────────────────────────────────────────────
def _scrape_detik(keyword: str, max_pages: int = 1) -> list:
    """Scrape detik.com search results for *keyword*.

    Visits up to ``max_pages`` search result pages, opens every listed
    article and collects its title, date, tags and body text. Paging stops as
    soon as a result page cannot be fetched or lists no entries.

    Args:
        keyword: Free-text search phrase; it is URL-encoded before sending.
        max_pages: Maximum number of search result pages to visit.

    Returns:
        A list of dicts with the keys ``judul``, ``tanggal``
        (``YYYY-MM-DD HH:MM`` or ``''`` when unknown), ``tag``,
        ``isi_berita`` and ``link``.
    """
    import datetime

    sess = requests.Session()
    sess.headers.update(_HEADERS)
    # Encode once so spaces, '&', '#', ... cannot corrupt the query string.
    query = quote_plus(keyword)
    heading_and_text_tags = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']
    results = []
    for page in range(1, max_pages + 1):
        r = _get(sess, f"https://www.detik.com/search/searchall?query={query}&sortby=time&page={page}&siteid=2")
        if not r:
            break
        soup = BeautifulSoup(r.text, "html.parser")
        news_list = soup.find_all('div', class_='media')
        if not news_list:
            break
        for news in news_list:
            try:
                title_tag = news.find('h3', class_='media__title')
                if not title_tag:
                    continue
                link_tag = title_tag.find('a', class_='media__link')
                if not link_tag or not link_tag.has_attr('href'):
                    continue
                link = link_tag['href']
                title = link_tag.text.strip()

                # The search listing carries the publication time as a Unix
                # timestamp in the span's "d-time" attribute.
                news_date = None
                date_tag = news.find('div', class_='media__date')
                if date_tag:
                    span_tag = date_tag.find('span')
                    if span_tag and span_tag.has_attr('d-time'):
                        # Converted to the host's local timezone.
                        news_date = datetime.datetime.fromtimestamp(int(span_tag['d-time']))

                news_resp = _get(sess, link)
                if not news_resp:
                    continue
                news_soup = BeautifulSoup(news_resp.text, 'html.parser')

                content_div = (
                    news_soup.find('div', class_='detail__body-text')
                    or news_soup.find('div', class_='detail_text')
                )
                content = ""
                if content_div:
                    parts = []
                    for tag in content_div.find_all(heading_and_text_tags):
                        text = tag.get_text(strip=True)
                        if not text:
                            continue
                        # Headings are kept, marked with their level ("H2: ...").
                        if tag.name.startswith('h'):
                            parts.append(f"{tag.name.upper()}: {text}")
                        else:
                            parts.append(text)
                    content = '\n'.join(parts)

                nav_div = (
                    news_soup.find('div', class_='detail_tag')
                    or news_soup.find('div', class_='tag__list')
                    or news_soup.find('div', class_='nav')
                )
                tags = [a.text.strip() for a in nav_div.find_all('a')] if nav_div else []

                results.append({
                    'judul': title,
                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
                    'tag': ', '.join(tags),
                    'isi_berita': content,
                    'link': link,
                })
            except Exception as exc:
                # Best effort: one malformed entry must not abort the run,
                # but the failure is reported instead of silently dropped.
                print(f"[News] detik: artikel dilewati: {exc}")
        # Pause before requesting the next search result page.
        time.sleep(2)
    return results
# ── Radar ──────────────────────────────────────────────────────────────────────
def _scrape_radar(keyword: str, max_pages: int = 1) -> list:
    """Scrape radarcirebon.disway.id search results for *keyword*.

    Visits up to ``max_pages`` search result pages (the site is queried with
    an offset that grows by 30 per page), opens every listed article and
    collects its title, date text, tags and body text. Paging stops when a
    result page cannot be fetched or lists no entries.

    Args:
        keyword: Free-text search phrase; it is URL-encoded before sending.
        max_pages: Maximum number of search result pages to visit.

    Returns:
        A list of dicts with the keys ``judul``, ``tanggal``, ``tag``
        (``"-"`` when no tag was found), ``isi_berita`` and ``link``.
    """
    BASE_HOST = "https://radarcirebon.disway.id"
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    query = quote_plus(keyword)
    results = []

    def _abs(href):
        """Turn a possibly relative href into an absolute URL (None if empty)."""
        if not href:
            return None
        href = href.strip()
        return href if href.startswith("http") else BASE_HOST + "/" + href.lstrip("/")

    for page in range(1, max_pages + 1):
        if page > 1:
            offset = (page - 1) * 30
            url = f"{BASE_HOST}/search/kata/{offset}/{offset}/?c={query}&num="
        else:
            url = f"{BASE_HOST}/search/kata/?c={query}&num="
        r = _get(sess, url)
        if not r:
            break
        soup = BeautifulSoup(r.text, "html.parser")
        news_list = soup.find_all(class_='media-heading') or soup.find_all('div', class_='media')
        if not news_list:
            # Nothing listed here, so later pages cannot list anything either.
            break
        for item in news_list:
            try:
                a = item.find('a', href=True)
                if not a:
                    continue
                link = _abs(a.get('href'))
                if not link:
                    # Empty href: skip instead of running the retry cycle on it.
                    continue
                title = a.get_text(strip=True)

                detail_r = _get(sess, link)
                if not detail_r:
                    continue
                detail_soup = BeautifulSoup(detail_r.text, "html.parser")

                # Prefer the article page headline; fall back to the listing title.
                h1 = detail_soup.find('h1', class_='text-black') or detail_soup.find('h1')
                title_detail = h1.get_text(strip=True) if h1 else title

                date_tag = detail_soup.find('span', class_='date') or detail_soup.find(class_='date')
                date_text = date_tag.get_text(strip=True) if date_tag else ""

                content_container = (
                    detail_soup.find('div', class_='entry-content')
                    or detail_soup.find('div', class_='post-content')
                )
                content = ""
                if content_container:
                    paragraphs = (p.get_text(strip=True) for p in content_container.find_all('p'))
                    # Drop "read also" cross-links embedded in the body.
                    content = "\n".join(text for text in paragraphs if 'Baca Juga:' not in text)

                tag_links = detail_soup.find_all('a', href=lambda href: href and '/listtag/' in href)
                tags = [a_tag.get('title', '').strip() for a_tag in tag_links if a_tag.get('title')]

                results.append({
                    "judul": title_detail,
                    "tanggal": date_text,
                    "tag": ", ".join(tags) if tags else "-",
                    "isi_berita": content,
                    "link": link,
                })
            except Exception as exc:
                # Best effort: report the failure and move on to the next entry.
                print(f"[News] radar: artikel dilewati: {exc}")
        # Pause before requesting the next search result page.
        time.sleep(2)
    return results
# ── Antara ─────────────────────────────────────────────────────────────────────
def _scrape_antara(keyword: str, max_pages: int = 1) -> list:
    """Scrape antaranews.com search results for *keyword*.

    Visits up to ``max_pages`` search result pages, collects every link whose
    href contains ``/berita/`` and opens each article once. Articles are
    returned in the order in which they first appear on the result pages.

    Args:
        keyword: Free-text search phrase; it is URL-encoded before sending.
        max_pages: Maximum number of search result pages to visit.

    Returns:
        A list of dicts with the keys ``judul``, ``tanggal``, ``tag``
        (``"-"`` when no tag was found), ``isi_berita`` and ``link``.
    """
    BASE_HOST = "https://www.antaranews.com"
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    query = quote_plus(keyword)
    results = []
    seen = set()  # normalized article URLs already handled, across all pages

    def _norm(href):
        """Return an absolute URL without query/fragment/trailing slash, or None."""
        if not href:
            return None
        href = href.strip()
        if href.startswith("/"):
            href = BASE_HOST + href
        elif not href.startswith("http"):
            return None
        return urlunparse(urlparse(href)._replace(query="", fragment="")).rstrip("/")

    for page in range(1, max_pages + 1):
        url = f"{BASE_HOST}/search?q={query}" + (f"&page={page}" if page > 1 else "")
        r = _get(sess, url)
        if not r:
            break
        soup = BeautifulSoup(r.text, "html.parser")

        # De-duplicate while keeping document order, so the result order is
        # stable between runs and no article is fetched twice.
        page_links = []
        for anchor in soup.select('a[href*="/berita/"]'):
            link = _norm(anchor.get('href'))
            if link and link not in seen:
                seen.add(link)
                page_links.append(link)

        for link in page_links:
            detail_r = _get(sess, link)
            if not detail_r:
                continue
            detail_soup = BeautifulSoup(detail_r.text, "html.parser")

            h1 = detail_soup.select_one('div.wrap__article-detail-title h1') or detail_soup.find('h1')
            title_detail = h1.get_text(strip=True) if h1 else ""

            # The date is the text of the list item holding the calendar icon.
            date_detail = ""
            cal_icon = detail_soup.select_one('i.fa-calendar') or detail_soup.select_one('i.fas.fa-calendar')
            date_item = cal_icon.find_parent('li') if cal_icon else None
            if date_item:
                date_detail = date_item.get_text(" ", strip=True)

            content_parts = _extract_paragraphs(
                detail_soup, ["wrap__article-detail-content", "detail__body-text"]
            )

            tags = []
            for a in detail_soup.select('a[href*="/tag/"]'):
                tag_text = a.get('title') or a.get_text(strip=True)
                if tag_text:
                    tags.append(tag_text)

            results.append({
                "judul": title_detail,
                "tanggal": date_detail,
                # dict.fromkeys removes repeated tags but keeps their order.
                "tag": ", ".join(dict.fromkeys(tags)) if tags else "-",
                "isi_berita": "\n".join(content_parts),
                "link": link,
            })
    return results
# ── CNN ────────────────────────────────────────────────────────────────────────
def _scrape_cnn(keyword: str, max_pages: int = 1) -> list:
    """Scrape cnnindonesia.com search results for *keyword*.

    The search result pages are loaded through a Selenium driver; the
    articles themselves are fetched with a plain ``requests`` session. The
    driver is always shut down, even when scraping fails part-way.

    Args:
        keyword: Free-text search phrase; it is URL-encoded before sending.
        max_pages: Maximum number of search result pages to visit.

    Returns:
        A list of dicts with the keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``; a missing value is reported as ``"-"``.
    """
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from ._driver import _create_driver

    BASE_HOST = "https://www.cnnindonesia.com"
    # Only URLs whose path matches this pattern are treated as articles.
    article_path = re.compile(r'/\d{14}-\d{2,3}-\d{6,}')
    related_header = re.compile(r'\s*TOPIK TERKAIT\s*')
    query = quote(keyword)
    results = []
    seen = set()  # article URLs already handled, across all pages

    sess = requests.Session()
    sess.headers.update(_HEADERS)
    driver = _create_driver(mobile=False)
    try:
        for page in range(1, max_pages + 1):
            url = f"{BASE_HOST}/search?query={query}&result_type=latest" + (f"&page={page}" if page > 1 else "")
            driver.get(url)
            if page == 1:
                try:
                    WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))
                    ).click()
                except Exception:
                    # The "AGREE" button is optional; carry on without it.
                    pass
            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a"))
                )
            except Exception:
                # No result list showed up in time; try the next page.
                continue
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # De-duplicate while keeping document order, so the result order
            # is stable between runs and no article is fetched twice.
            page_links = []
            for anchor in soup.select('div.nhl-list article a[href]'):
                href = anchor['href']
                if href in seen or not article_path.search(urlparse(href).path):
                    continue
                seen.add(href)
                page_links.append(href)

            for link in page_links:
                html = _get(sess, link)
                if not html:
                    continue
                ds = BeautifulSoup(html.text, "html.parser")

                title_el = ds.select_one('h1')
                title = title_el.get_text(strip=True) if title_el else "-"
                date_el = ds.select_one('div.text-cnn_grey.text-sm')
                date_text = date_el.get_text(strip=True) if date_el else "-"

                # Tags are the links in the div following the "TOPIK TERKAIT" box.
                tags_list = []
                tk_header = ds.find('div', class_='title-box', string=related_header)
                tag_box = tk_header.find_next_sibling('div') if tk_header else None
                if tag_box:
                    tags_list = [t.get_text(strip=True) for t in tag_box.select('a')]

                content_container = ds.select_one("div.detail-text")
                if content_container:
                    paragraphs = (p.get_text(" ", strip=True) for p in content_container.find_all('p'))
                    content = "\n".join(
                        text for text in paragraphs if not text.lower().startswith("lihat juga")
                    )
                else:
                    content = "-"

                results.append({
                    "judul": title,
                    "tanggal": date_text,
                    "tag": ", ".join(tags_list) if tags_list else "-",
                    "isi_berita": content,
                    "link": link,
                })
    finally:
        # Always release the browser, otherwise a failure leaks its process.
        driver.quit()
    return results
# ── RadarCirebonID ─────────────────────────────────────────────────────────────
def _scrape_radarcirebon(keyword: str, max_pages: int = 1) -> list:
    """Scrape radarcirebon.id search results for *keyword*.

    Visits up to ``max_pages`` search result pages, keeps the post links
    whose URL contains a ``/YYYY/MM/DD/`` segment and opens each article
    once. Articles are returned in the order in which they first appear.

    Args:
        keyword: Free-text search phrase; it is URL-encoded before sending.
        max_pages: Maximum number of search result pages to visit.

    Returns:
        A list of dicts with the keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``; a missing value is reported as ``"-"``.
    """
    BASE_HOST = "https://radarcirebon.id"
    dated_path = re.compile(r'/\d{4}/\d{2}/\d{2}/')
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    # The keyword is sent as a URL path segment with '+' standing for spaces.
    query = quote(keyword).replace('%20', '+')
    results = []
    seen = set()  # article URLs already handled, across all pages

    for page in range(1, max_pages + 1):
        url = f"{BASE_HOST}/search/{query}/" + (f"page/{page}/" if page > 1 else "")
        r = _get(sess, url)
        if not r:
            break
        soup = BeautifulSoup(r.text, "html.parser")

        # De-duplicate while keeping document order, so the result order is
        # stable between runs and no article is fetched twice.
        page_links = []
        for anchor in soup.select('article .wp-block-latest-posts__post-title a'):
            href = anchor.get('href', '')  # an anchor may lack href altogether
            if href and href not in seen and dated_path.search(href):
                seen.add(href)
                page_links.append(href)

        for link in page_links:
            detail_r = _get(sess, link)
            if not detail_r:
                continue
            ds = BeautifulSoup(detail_r.text, "html.parser")
            title_el = ds.select_one('h1.entry-title')
            date_el = ds.select_one('time.entry-date')

            c_parts = []
            cc = ds.select_one('div.entry-content')
            if cc:
                for p in cc.select('p'):
                    # Paragraphs inside a "read-also" box are not article text.
                    if p.find_parent(class_='read-also'):
                        continue
                    text = p.get_text(" ", strip=True)
                    if text:
                        c_parts.append(text)

            tc = ds.select_one('div.wp-block-tag-cloud')
            tags = [a.get_text(strip=True) for a in tc.select('a')] if tc else []

            results.append({
                "judul": title_el.get_text(strip=True) if title_el else "-",
                "tanggal": date_el.get_text(strip=True) if date_el else "-",
                # dict.fromkeys removes repeated tags but keeps their order.
                "tag": ", ".join(dict.fromkeys(tags)) if tags else "-",
                "isi_berita": "\n".join(c_parts) if c_parts else "-",
                "link": link,
            })
    return results
# ── Public API ─────────────────────────────────────────────────────────────────
# Alias -> scraper dispatch table. Aliases are grouped per scraper; the
# resulting insertion order is: detik, radar (disway), antara, cnn,
# radarcirebon.id aliases.
_PORTAL_MAP = {
    alias: scraper
    for scraper, aliases in (
        (_scrape_detik, ("detik", "detik.com")),
        (_scrape_radar, ("radar", "radardisway", "radarcirebon.disway.id")),
        (_scrape_antara, ("antara", "antaranews", "antaranews.com")),
        (_scrape_cnn, ("cnn", "cnnindonesia", "cnnindonesia.com")),
        (_scrape_radarcirebon, ("radarcirebon", "radarcirebon.id")),
    )
    for alias in aliases
}
def _closest_portal_key(portal_key, known_keys):
    """Return the known key sharing the longest overlap with *portal_key*.

    A key is a candidate when it is contained in ``portal_key`` or contains
    it. Among candidates the one with the longest shared text wins; ties go
    to the key that comes first in ``known_keys``. Returns ``None`` when
    nothing matches.
    """
    best_key, best_overlap = None, 0
    for key in known_keys:
        if key in portal_key or portal_key in key:
            # One string contains the other, so the shorter one is the overlap.
            overlap = min(len(key), len(portal_key))
            if overlap > best_overlap:
                best_key, best_overlap = key, overlap
    return best_key


def scrape_news(portal: str, pages: int = 1, keyword: str = "kabupaten cirebon") -> list:
    """Scrape news articles from the portal named by *portal*.

    The portal name is normalized (surrounding whitespace removed,
    lower-cased, trailing ``/`` removed) and looked up in ``_PORTAL_MAP``.
    When there is no exact entry, the most specific alias that is contained
    in the name (or contains it) is used, so a full URL such as
    ``https://radarcirebon.id`` resolves to the scraper of that exact site
    rather than to a shorter alias like ``radar``.

    Args:
        portal: Portal alias, domain or URL.
        pages: Number of search result pages to scrape.
        keyword: Search phrase handed to the scraper.

    Returns:
        The list of article dicts produced by the scraper, or ``[]`` when the
        portal is empty or unknown, or when the scraper raises.
    """
    if not portal:
        return []
    portal_key = portal.strip().lower().rstrip("/")
    scraper = _PORTAL_MAP.get(portal_key)
    # An empty key would be "contained" in every alias, so it is never
    # fuzzy-matched and ends up reported as unknown below.
    if scraper is None and portal_key:
        match = _closest_portal_key(portal_key, _PORTAL_MAP)
        if match is not None:
            scraper = _PORTAL_MAP[match]
    if scraper is None:
        print(f"[News] Portal '{portal}' tidak dikenali.")
        return []
    print(f"[News] Scraping '{portal}' ({pages} pages, keyword='{keyword}')")
    try:
        return scraper(keyword, max_pages=pages)
    except Exception as e:
        # Public entry point: report the failure and return an empty result
        # instead of propagating scraper errors to the caller.
        print(f"[News] Error saat scraping: {e}")
        return []