""" news.py – News scraper dispatcher. Exports: scrape_news(portal, pages, keyword) -> list[dict] portal: 'detik', 'radar', 'antara', 'cnn', 'radarcirebon' """ from __future__ import annotations import random import re import time from urllib.parse import quote, quote_plus, urlparse, urlunparse import requests from bs4 import BeautifulSoup # ── Shared HTTP session helpers ──────────────────────────────────────────────── _HEADERS = { "User-Agent": ( "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36" ), "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7", "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8", } def _get(sess: requests.Session, url: str, retries: int = 3, delay: float = 3.0): for attempt in range(retries): try: r = sess.get(url, timeout=20, allow_redirects=True) r.raise_for_status() return r except Exception as e: if attempt < retries - 1: time.sleep(delay) return None def _extract_paragraphs(soup, container_classes: list, min_len: int = 30) -> list: container = None for cls in container_classes: container = soup.find("div", class_=cls) if container: break scope = container if container else soup texts = [] for p in scope.find_all("p"): t = p.get_text(" ", strip=True) if t and len(t) >= min_len and not t.lower().startswith(("baca juga", "lihat juga", "advertisement")): texts.append(t) return texts # ── Detik.com ────────────────────────────────────────────────────────────────── def _scrape_detik(keyword: str, max_pages: int = 1) -> list: import datetime sess = requests.Session() sess.headers.update(_HEADERS) results = [] for page in range(1, max_pages + 1): r = _get(sess, f"https://www.detik.com/search/searchall?query={keyword}&sortby=time&page={page}&siteid=2") if not r: break soup = BeautifulSoup(r.text, "html.parser") news_list = soup.find_all('div', class_='media') if not news_list: break for news in news_list: try: title_tag = news.find('h3', class_='media__title') if not title_tag: continue link_tag = title_tag.find('a', class_='media__link') if not link_tag or not link_tag.has_attr('href'): continue link = link_tag['href'] title = link_tag.text.strip() news_date = None date_tag = news.find('div', class_='media__date') if date_tag: span_tag = date_tag.find('span') if span_tag and span_tag.has_attr('d-time'): timestamp = span_tag['d-time'] news_date = datetime.datetime.fromtimestamp(int(timestamp)) news_resp = _get(sess, link) if not news_resp: continue news_soup = BeautifulSoup(news_resp.text, 'html.parser') content_div = news_soup.find('div', class_='detail__body-text') or news_soup.find('div', class_='detail_text') content = "" if content_div: parts = [] for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']): text = tag.get_text(strip=True) if text: prefix = tag.name.upper() if tag.name.startswith('h') else '' parts.append(f"{prefix}: {text}" if prefix else text) content = '\n'.join(parts) nav_div = news_soup.find('div', class_='detail_tag') or news_soup.find('div', class_='tag__list') or news_soup.find('div', class_='nav') tags = [a.text.strip() for a in nav_div.find_all('a')] if nav_div else [] results.append({ 'judul': title, 'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '', 'tag': ', '.join(tags), 'isi_berita': content, 'link': link }) except Exception: pass time.sleep(2) return results # ── Radar ────────────────────────────────────────────────────────────────────── def _scrape_radar(keyword: str, max_pages: int = 1) -> list: BASE_HOST = "https://radarcirebon.disway.id" sess = requests.Session() sess.headers.update(_HEADERS) results = [] def _abs(href): if not href: return None href = href.strip() return href if href.startswith("http") else BASE_HOST + "/" + href.lstrip("/") for page in range(1, max_pages + 1): q = quote_plus(keyword) offset = (page - 1) * 30 url = f"{BASE_HOST}/search/kata/{offset}/{offset}/?c={q}&num=" if page > 1 else f"{BASE_HOST}/search/kata/?c={q}&num=" r = _get(sess, url) if not r: break soup = BeautifulSoup(r.text, "html.parser") news_list = soup.find_all(class_='media-heading') or soup.find_all('div', class_='media') for item in news_list: try: a = item.find('a', href=True) if not a: continue link = _abs(a.get('href')) title = a.get_text(strip=True) detail_r = _get(sess, link) if not detail_r: continue detail_soup = BeautifulSoup(detail_r.text, "html.parser") h1 = detail_soup.find('h1', class_='text-black') or detail_soup.find('h1') title_detail = h1.get_text(strip=True) if h1 else title date_text = "" date_detail_tag = detail_soup.find('span', class_='date') or detail_soup.find(class_='date') if date_detail_tag: date_text = date_detail_tag.get_text(strip=True) content_container = detail_soup.find('div', class_='entry-content') or detail_soup.find('div', class_='post-content') content = "" if content_container: content = "\n".join([p.get_text(strip=True) for p in content_container.find_all('p') if 'Baca Juga:' not in p.get_text(strip=True)]) tag_links = detail_soup.find_all('a', href=lambda href: href and '/listtag/' in href) tags = [a_tag.get('title', '').strip() for a_tag in tag_links if a_tag.get('title')] results.append({ "judul": title_detail, "tanggal": date_text, "tag": ", ".join(tags) if tags else "-", "isi_berita": content, "link": link }) except Exception: pass time.sleep(2) return results # ── Antara ───────────────────────────────────────────────────────────────────── def _scrape_antara(keyword: str, max_pages: int = 1) -> list: BASE_HOST = "https://www.antaranews.com" sess = requests.Session() sess.headers.update(_HEADERS) results = [] def _norm(href): if not href: return None href = href.strip() if href.startswith("/"): href = BASE_HOST + href elif not href.startswith("http"): return None return urlunparse(urlparse(href)._replace(query="", fragment="")).rstrip("/") for page in range(1, max_pages + 1): q = quote_plus(keyword) url = f"{BASE_HOST}/search?q={q}" + (f"&page={page}" if page > 1 else "") r = _get(sess, url) if not r: break soup = BeautifulSoup(r.text, "html.parser") anchors = soup.select('a[href*="/berita/"]') links = {_norm(a.get('href')) for a in anchors if a.get('href')} for link in links: if not link: continue detail_r = _get(sess, link) if not detail_r: continue detail_soup = BeautifulSoup(detail_r.text, "html.parser") h1 = detail_soup.select_one('div.wrap__article-detail-title h1') or detail_soup.find('h1') title_detail = h1.get_text(strip=True) if h1 else "" date_detail = "" cal_icon = detail_soup.select_one('i.fa-calendar') or detail_soup.select_one('i.fas.fa-calendar') if cal_icon and cal_icon.find_parent('li'): date_detail = cal_icon.find_parent('li').get_text(" ", strip=True) content_parts = _extract_paragraphs(detail_soup, ["wrap__article-detail-content", "detail__body-text"]) tags = [] for a in detail_soup.select('a[href*="/tag/"]'): tag_text = a.get('title') or a.get_text(strip=True) if tag_text: tags.append(tag_text) results.append({ "judul": title_detail, "tanggal": date_detail, "tag": ", ".join(list(dict.fromkeys(tags))) if tags else "-", "isi_berita": "\n".join(content_parts), "link": link }) return results # ── CNN ──────────────────────────────────────────────────────────────────────── def _scrape_cnn(keyword: str, max_pages: int = 1) -> list: from selenium.webdriver.common.by import By from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from ._driver import _create_driver BASE_HOST = "https://www.cnnindonesia.com" results = [] driver = _create_driver(mobile=False) for page in range(1, max_pages + 1): q = quote(keyword) url = f"{BASE_HOST}/search?query={q}&result_type=latest" + (f"&page={page}" if page > 1 else "") driver.get(url) if page == 1: try: WebDriverWait(driver, 5).until(EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))).click() except: pass try: WebDriverWait(driver, 15).until(EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a"))) except: continue soup = BeautifulSoup(driver.page_source, "html.parser") links = {a['href'] for a in soup.select('div.nhl-list article a[href]') if re.search(r'/\d{14}-\d{2,3}-\d{6,}', urlparse(a['href']).path)} sess = requests.Session() sess.headers.update(_HEADERS) for link in links: html = _get(sess, link) if not html: continue ds = BeautifulSoup(html.text, "html.parser") title_el = ds.select_one('h1') title = title_el.get_text(strip=True) if title_el else "-" date_el = ds.select_one('div.text-cnn_grey.text-sm') date_text = date_el.get_text(strip=True) if date_el else "-" tags_list = [] tk_header = ds.find('div', class_='title-box', text=re.compile(r'\s*TOPIK TERKAIT\s*')) if tk_header and tk_header.find_next_sibling('div'): tags_list = [t.get_text(strip=True) for t in tk_header.find_next_sibling('div').select('a')] content_container = ds.select_one("div.detail-text") content = "\n".join([p.get_text(" ", strip=True) for p in content_container.find_all('p') if not p.get_text(" ", strip=True).lower().startswith("lihat juga")]) if content_container else "-" results.append({ "judul": title, "tanggal": date_text, "tag": ", ".join(tags_list) if tags_list else "-", "isi_berita": content, "link": link }) driver.quit() return results # ── RadarCirebonID ───────────────────────────────────────────────────────────── def _scrape_radarcirebon(keyword: str, max_pages: int = 1) -> list: BASE_HOST = "https://radarcirebon.id" sess = requests.Session() sess.headers.update(_HEADERS) results = [] for page in range(1, max_pages + 1): q = quote(keyword).replace('%20', '+') url = f"{BASE_HOST}/search/{q}/" + (f"page/{page}/" if page > 1 else "") r = _get(sess, url) if not r: break soup = BeautifulSoup(r.text, "html.parser") links = {a['href'] for a in soup.select('article .wp-block-latest-posts__post-title a') if re.search(r'/\d{4}/\d{2}/\d{2}/', a['href'])} for link in links: detail_r = _get(sess, link) if not detail_r: continue ds = BeautifulSoup(detail_r.text, "html.parser") title_el = ds.select_one('h1.entry-title') date_el = ds.select_one('time.entry-date') c_parts = [] cc = ds.select_one('div.entry-content') if cc: for p in cc.select('p'): if not p.find_parent(class_='read-also'): t = p.get_text(" ", strip=True) if t: c_parts.append(t) tc = ds.select_one('div.wp-block-tag-cloud') tags = [a.get_text(strip=True) for a in tc.select('a')] if tc else [] results.append({ "judul": title_el.get_text(strip=True) if title_el else "-", "tanggal": date_el.get_text(strip=True) if date_el else "-", "tag": ", ".join(list(dict.fromkeys(tags))) if tags else "-", "isi_berita": "\n".join(c_parts) if c_parts else "-", "link": link }) return results # ── Public API ───────────────────────────────────────────────────────────────── _PORTAL_MAP = { "detik": _scrape_detik, "detik.com": _scrape_detik, "radar": _scrape_radar, "radardisway": _scrape_radar, "radarcirebon.disway.id": _scrape_radar, "antara": _scrape_antara, "antaranews": _scrape_antara, "antaranews.com": _scrape_antara, "cnn": _scrape_cnn, "cnnindonesia": _scrape_cnn, "cnnindonesia.com": _scrape_cnn, "radarcirebon": _scrape_radarcirebon, "radarcirebon.id": _scrape_radarcirebon, } def scrape_news(portal: str, pages: int = 1, keyword: str = "kabupaten cirebon") -> list: if not portal: return [] portal_key = portal.strip().lower().rstrip("/") scraper = _PORTAL_MAP.get(portal_key) if scraper is None: for key, fn in _PORTAL_MAP.items(): if key in portal_key or portal_key in key: scraper = fn break if scraper is None: try: domain = urlparse(portal).netloc or portal_key for key, fn in _PORTAL_MAP.items(): if key in domain: scraper = fn break except Exception: pass if scraper is None: print(f"[News] Portal '{portal}' tidak dikenali.") return [] print(f"[News] Scraping '{portal}' ({pages} pages, keyword='{keyword}')") try: return scraper(keyword, max_pages=pages) except Exception as e: print(f"[News] Error saat scraping: {e}") return []