| """ |
| news.py — News scraper dispatcher. |
| Exports: scrape_news(portal, pages, keyword) -> list[dict] |
| |
| portal: 'detik', 'radar', 'antara', 'cnn', 'radarcirebon' |
| """ |
| from __future__ import annotations |
|
|
| import random |
| import re |
| import time |
| from urllib.parse import quote, quote_plus, urlparse, urlunparse |
|
|
| import requests |
| from bs4 import BeautifulSoup |
|
|
|
|
| |
|
|
# Browser-like request headers; every scraper in this module copies them into
# its requests.Session via ``sess.headers.update(_HEADERS)``.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}
|
|
def _get(sess: requests.Session, url: str, retries: int = 3, delay: float = 3.0):
    """Fetch *url* through *sess*, retrying on failure.

    Args:
        sess: Session used to issue the GET request.
        url: Address to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to sleep between two attempts.

    Returns:
        The response of the first attempt that completes with a non-error
        HTTP status, or ``None`` when every attempt failed.  Request errors
        are reported on stdout and swallowed so callers only have to test
        the return value.
    """
    # A page that is missing or gone will not come back on retry, so these
    # statuses end the loop immediately instead of sleeping for nothing.
    no_retry_statuses = (404, 410)
    last_error = None

    for attempt in range(retries):
        try:
            resp = sess.get(url, timeout=20, allow_redirects=True)
            resp.raise_for_status()
            return resp
        except requests.HTTPError as exc:
            last_error = exc
            if getattr(exc.response, "status_code", None) in no_retry_statuses:
                break
        except Exception as exc:  # deliberate best-effort: never raise to the scrapers
            last_error = exc
        # No sleep after the final attempt.
        if attempt < retries - 1:
            time.sleep(delay)

    if last_error is not None:
        print(f"[News] Gagal mengambil {url}: {last_error}")
    return None
|
|
def _extract_paragraphs(soup, container_classes: list, min_len: int = 30) -> list:
    """Collect paragraph texts from an article page.

    Args:
        soup: Parsed document (anything offering ``find`` / ``find_all``).
        container_classes: CSS class names tried in order; the first
            ``<div>`` found with one of them limits the search.  When none
            is found the whole document is searched.
        min_len: Minimum number of characters a paragraph must have.

    Returns:
        The text of every ``<p>`` in scope that is non-empty, at least
        *min_len* characters long and does not start (case-insensitively)
        with "baca juga", "lihat juga" or "advertisement".
    """
    skip_prefixes = ("baca juga", "lihat juga", "advertisement")

    scope = soup
    for cls in container_classes or ():
        container = soup.find("div", class_=cls)
        if container:
            scope = container
            break

    texts = []
    for p in scope.find_all("p"):
        t = p.get_text(" ", strip=True)
        if t and len(t) >= min_len and not t.lower().startswith(skip_prefixes):
            texts.append(t)
    return texts
|
|
|
|
| |
|
|
def _parse_detik_article(sess: requests.Session, link: str) -> dict | None:
    """Download one detik article and return its body text and tags.

    Returns:
        ``{"isi_berita": str, "tag": str}``, or ``None`` when the page could
        not be fetched.
    """
    resp = _get(sess, link)
    if not resp:
        return None
    page = BeautifulSoup(resp.text, "html.parser")

    body = page.find("div", class_="detail__body-text") or page.find("div", class_="detail_text")
    parts = []
    if body:
        for tag in body.find_all(["h1", "h2", "h3", "h4", "h5", "h6", "p"]):
            text = tag.get_text(strip=True)
            if not text:
                continue
            # Headings are kept but labelled with their level, e.g. "H2: ...".
            parts.append(text if tag.name == "p" else f"{tag.name.upper()}: {text}")

    tag_box = (
        page.find("div", class_="detail_tag")
        or page.find("div", class_="tag__list")
        or page.find("div", class_="nav")
    )
    tags = [a.text.strip() for a in tag_box.find_all("a")] if tag_box else []

    return {"isi_berita": "\n".join(parts), "tag": ", ".join(tags)}


def _scrape_detik(keyword: str, max_pages: int = 1) -> list:
    """Scrape articles from the detik.com search for *keyword*.

    Args:
        keyword: Search phrase; it is URL-encoded before being sent.
        max_pages: Number of search result pages to walk (starting at 1).

    Returns:
        One dict per article with the keys ``judul``, ``tanggal``
        (``YYYY-MM-DD HH:MM`` or ``''``), ``tag``, ``isi_berita`` and
        ``link``.  Each link is scraped at most once.  Paging stops at the
        first page that cannot be fetched or lists no results.
    """
    import datetime

    # Encode once: a raw keyword may contain spaces, '&' or '#', which would
    # otherwise corrupt the query string.
    query = quote_plus(keyword)
    results = []
    seen_links = set()

    with requests.Session() as sess:
        sess.headers.update(_HEADERS)

        for page in range(1, max_pages + 1):
            r = _get(
                sess,
                "https://www.detik.com/search/searchall"
                f"?query={query}&sortby=time&page={page}&siteid=2",
            )
            if not r:
                break
            soup = BeautifulSoup(r.text, "html.parser")
            news_list = soup.find_all("div", class_="media")
            if not news_list:
                break

            for news in news_list:
                link = None
                try:
                    title_tag = news.find("h3", class_="media__title")
                    link_tag = title_tag.find("a", class_="media__link") if title_tag else None
                    if not link_tag or not link_tag.has_attr("href"):
                        continue
                    link = link_tag["href"]
                    if not link or link in seen_links:
                        continue
                    seen_links.add(link)
                    title = link_tag.text.strip()

                    # The listing carries a Unix timestamp in the "d-time"
                    # attribute; a malformed value only blanks the date.
                    news_date = None
                    date_tag = news.find("div", class_="media__date")
                    span_tag = date_tag.find("span") if date_tag else None
                    if span_tag and span_tag.has_attr("d-time"):
                        try:
                            news_date = datetime.datetime.fromtimestamp(int(span_tag["d-time"]))
                        except (ValueError, OverflowError, OSError):
                            news_date = None

                    article = _parse_detik_article(sess, link)
                    if article is None:
                        continue

                    results.append({
                        "judul": title,
                        "tanggal": news_date.strftime("%Y-%m-%d %H:%M") if news_date else "",
                        "tag": article["tag"],
                        "isi_berita": article["isi_berita"],
                        "link": link,
                    })
                except Exception as exc:
                    # Best-effort per article: report and move on to the next one.
                    print(f"[News] Error saat memproses {link}: {exc}")

            # Politeness delay between result pages (not needed after the last).
            if page < max_pages:
                time.sleep(2)

    return results
|
|
|
|
| |
|
|
def _parse_radar_article(sess: requests.Session, link: str, fallback_title: str) -> dict | None:
    """Download one radarcirebon.disway.id article and build its result row.

    Args:
        sess: Session used for the request.
        link: Absolute article URL.
        fallback_title: Title taken from the search listing, used when the
            article page has no ``<h1>``.

    Returns:
        The result dict, or ``None`` when the page could not be fetched.
    """
    detail_r = _get(sess, link)
    if not detail_r:
        return None
    detail_soup = BeautifulSoup(detail_r.text, "html.parser")

    h1 = detail_soup.find("h1", class_="text-black") or detail_soup.find("h1")
    date_tag = detail_soup.find("span", class_="date") or detail_soup.find(class_="date")

    content = ""
    container = (
        detail_soup.find("div", class_="entry-content")
        or detail_soup.find("div", class_="post-content")
    )
    if container:
        paragraphs = (p.get_text(strip=True) for p in container.find_all("p"))
        # Drop the inline "Baca Juga:" cross-links.
        content = "\n".join(t for t in paragraphs if "Baca Juga:" not in t)

    tag_links = detail_soup.find_all("a", href=lambda href: href and "/listtag/" in href)
    tags = [a_tag.get("title", "").strip() for a_tag in tag_links if a_tag.get("title")]

    return {
        "judul": h1.get_text(strip=True) if h1 else fallback_title,
        "tanggal": date_tag.get_text(strip=True) if date_tag else "",
        "tag": ", ".join(tags) if tags else "-",
        "isi_berita": content,
        "link": link,
    }


def _scrape_radar(keyword: str, max_pages: int = 1) -> list:
    """Scrape articles from the radarcirebon.disway.id search for *keyword*.

    Args:
        keyword: Search phrase; it is URL-encoded before being sent.
        max_pages: Number of search result pages to walk (starting at 1).

    Returns:
        One dict per article with the keys ``judul``, ``tanggal``, ``tag``
        (``"-"`` when none), ``isi_berita`` and ``link``.  Each link is
        scraped at most once.  Paging stops at the first page that cannot be
        fetched or lists no results.
    """
    from urllib.parse import urljoin

    BASE_HOST = "https://radarcirebon.disway.id"
    q = quote_plus(keyword)
    results = []
    seen_links = set()

    def _abs(href):
        """Resolve *href* against the site root; None if it is not an http(s) URL."""
        href = (href or "").strip()
        if not href:
            return None
        absolute = urljoin(BASE_HOST + "/", href)
        return absolute if absolute.startswith(("http://", "https://")) else None

    with requests.Session() as sess:
        sess.headers.update(_HEADERS)

        for page in range(1, max_pages + 1):
            # Pages after the first are addressed by a result offset in steps of 30.
            offset = (page - 1) * 30
            if page > 1:
                url = f"{BASE_HOST}/search/kata/{offset}/{offset}/?c={q}&num="
            else:
                url = f"{BASE_HOST}/search/kata/?c={q}&num="

            r = _get(sess, url)
            if not r:
                break
            soup = BeautifulSoup(r.text, "html.parser")

            news_list = soup.find_all(class_="media-heading") or soup.find_all("div", class_="media")
            if not news_list:
                break

            for item in news_list:
                link = None
                try:
                    a = item.find("a", href=True)
                    if not a:
                        continue
                    link = _abs(a.get("href"))
                    # Skip unusable hrefs instead of burning the retry delays on them.
                    if not link or link in seen_links:
                        continue
                    seen_links.add(link)

                    row = _parse_radar_article(sess, link, a.get_text(strip=True))
                    if row is not None:
                        results.append(row)
                except Exception as exc:
                    # Best-effort per article: report and move on to the next one.
                    print(f"[News] Error saat memproses {link}: {exc}")

            # Politeness delay between result pages (not needed after the last).
            if page < max_pages:
                time.sleep(2)

    return results
|
|
| |
|
|
def _parse_antara_article(sess: requests.Session, link: str) -> dict | None:
    """Download one antaranews.com article and build its result row.

    Returns:
        The result dict, or ``None`` when the page could not be fetched.
    """
    detail_r = _get(sess, link)
    if not detail_r:
        return None
    detail_soup = BeautifulSoup(detail_r.text, "html.parser")

    h1 = detail_soup.select_one("div.wrap__article-detail-title h1") or detail_soup.find("h1")

    # The publication date is the text of the <li> that holds the calendar icon.
    date_detail = ""
    cal_icon = detail_soup.select_one("i.fa-calendar")
    date_item = cal_icon.find_parent("li") if cal_icon else None
    if date_item:
        date_detail = date_item.get_text(" ", strip=True)

    content_parts = _extract_paragraphs(
        detail_soup, ["wrap__article-detail-content", "detail__body-text"]
    )

    tags = []
    for a in detail_soup.select('a[href*="/tag/"]'):
        tag_text = a.get("title") or a.get_text(strip=True)
        if tag_text:
            tags.append(tag_text)

    return {
        "judul": h1.get_text(strip=True) if h1 else "",
        "tanggal": date_detail,
        # dict.fromkeys removes repeated tags while keeping their order.
        "tag": ", ".join(dict.fromkeys(tags)) if tags else "-",
        "isi_berita": "\n".join(content_parts),
        "link": link,
    }


def _scrape_antara(keyword: str, max_pages: int = 1) -> list:
    """Scrape articles from the antaranews.com search for *keyword*.

    Args:
        keyword: Search phrase; it is URL-encoded before being sent.
        max_pages: Number of search result pages to walk (starting at 1).

    Returns:
        One dict per article with the keys ``judul``, ``tanggal``, ``tag``
        (``"-"`` when none), ``isi_berita`` and ``link``, in the order the
        links appear on the result pages.  Each link is scraped at most
        once.  Paging stops at the first page that cannot be fetched.
    """
    BASE_HOST = "https://www.antaranews.com"
    q = quote_plus(keyword)
    results = []
    seen_links = set()

    def _norm(href):
        """Make *href* absolute and strip query, fragment and trailing slash."""
        if not href:
            return None
        href = href.strip()
        if href.startswith("/"):
            href = BASE_HOST + href
        elif not href.startswith("http"):
            return None
        return urlunparse(urlparse(href)._replace(query="", fragment="")).rstrip("/")

    with requests.Session() as sess:
        sess.headers.update(_HEADERS)

        for page in range(1, max_pages + 1):
            url = f"{BASE_HOST}/search?q={q}" + (f"&page={page}" if page > 1 else "")
            r = _get(sess, url)
            if not r:
                break
            soup = BeautifulSoup(r.text, "html.parser")

            # Ordered de-duplication keeps the output order reproducible.
            anchors = soup.select('a[href*="/berita/"]')
            page_links = dict.fromkeys(_norm(a.get("href")) for a in anchors)

            for link in page_links:
                if not link or link in seen_links:
                    continue
                seen_links.add(link)
                try:
                    row = _parse_antara_article(sess, link)
                except Exception as exc:
                    # One unparsable article must not discard the whole run.
                    print(f"[News] Error saat memproses {link}: {exc}")
                    continue
                if row is not None:
                    results.append(row)

    return results
|
|
| |
|
|
# Path shape used to pick article links out of the CNN Indonesia search results.
_CNN_ARTICLE_PATH = re.compile(r'/\d{14}-\d{2,3}-\d{6,}')
# Heading text of the box that precedes the related-topic links.
_CNN_TOPIC_HEADER = re.compile(r'\s*TOPIK TERKAIT\s*')


def _parse_cnn_article(sess: requests.Session, link: str) -> dict | None:
    """Download one cnnindonesia.com article and build its result row.

    Returns:
        The result dict (missing fields are ``"-"``), or ``None`` when the
        page could not be fetched.
    """
    resp = _get(sess, link)
    if not resp:
        return None
    ds = BeautifulSoup(resp.text, "html.parser")

    title_el = ds.select_one("h1")
    date_el = ds.select_one("div.text-cnn_grey.text-sm")

    # ``string=`` is the current name of the argument formerly called ``text=``.
    tags_list = []
    tk_header = ds.find("div", class_="title-box", string=_CNN_TOPIC_HEADER)
    tag_box = tk_header.find_next_sibling("div") if tk_header else None
    if tag_box:
        tags_list = [t.get_text(strip=True) for t in tag_box.select("a")]

    content = "-"
    content_container = ds.select_one("div.detail-text")
    if content_container:
        paragraphs = (p.get_text(" ", strip=True) for p in content_container.find_all("p"))
        content = "\n".join(t for t in paragraphs if not t.lower().startswith("lihat juga"))

    return {
        "judul": title_el.get_text(strip=True) if title_el else "-",
        "tanggal": date_el.get_text(strip=True) if date_el else "-",
        "tag": ", ".join(tags_list) if tags_list else "-",
        "isi_berita": content,
        "link": link,
    }


def _scrape_cnn(keyword: str, max_pages: int = 1) -> list:
    """Scrape articles from the cnnindonesia.com search for *keyword*.

    The search page is loaded in a Selenium driver; the article pages
    themselves are fetched with ``requests``.

    Args:
        keyword: Search phrase; it is URL-encoded before being sent.
        max_pages: Number of search result pages to walk (starting at 1).

    Returns:
        One dict per article with the keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link`` (``"-"`` for missing fields), in the
        order the links appear on the result pages.  Each link is scraped at
        most once.
    """
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from ._driver import _create_driver

    BASE_HOST = "https://www.cnnindonesia.com"
    results = []
    if max_pages < 1:
        # Nothing to fetch, so do not start a browser at all.
        return results

    q = quote(keyword)
    seen_links = set()

    driver = _create_driver(mobile=False)
    try:
        with requests.Session() as sess:
            sess.headers.update(_HEADERS)

            for page in range(1, max_pages + 1):
                url = f"{BASE_HOST}/search?query={q}&result_type=latest" + (
                    f"&page={page}" if page > 1 else ""
                )
                driver.get(url)

                if page == 1:
                    # Dismiss the consent dialog when one shows up; its absence is fine.
                    try:
                        WebDriverWait(driver, 5).until(
                            EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))
                        ).click()
                    except Exception:
                        pass

                try:
                    WebDriverWait(driver, 15).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a"))
                    )
                except Exception:
                    # No result list appeared for this page; try the next one.
                    continue

                soup = BeautifulSoup(driver.page_source, "html.parser")
                # Ordered de-duplication keeps the output order reproducible.
                page_links = dict.fromkeys(
                    a["href"]
                    for a in soup.select("div.nhl-list article a[href]")
                    if _CNN_ARTICLE_PATH.search(urlparse(a["href"]).path)
                )

                for link in page_links:
                    if link in seen_links:
                        continue
                    seen_links.add(link)
                    try:
                        row = _parse_cnn_article(sess, link)
                    except Exception as exc:
                        # One unparsable article must not discard the whole run.
                        print(f"[News] Error saat memproses {link}: {exc}")
                        continue
                    if row is not None:
                        results.append(row)
    finally:
        # Always release the browser, even when a page load raises.
        driver.quit()

    return results
|
|
| |
|
|
def _parse_radarcirebon_article(sess: requests.Session, link: str) -> dict | None:
    """Download one radarcirebon.id article and build its result row.

    Returns:
        The result dict (missing fields are ``"-"``), or ``None`` when the
        page could not be fetched.
    """
    detail_r = _get(sess, link)
    if not detail_r:
        return None
    ds = BeautifulSoup(detail_r.text, "html.parser")

    title_el = ds.select_one("h1.entry-title")
    date_el = ds.select_one("time.entry-date")

    c_parts = []
    cc = ds.select_one("div.entry-content")
    if cc:
        for p in cc.select("p"):
            # Paragraphs inside a "read-also" box are cross-links, not body text.
            if p.find_parent(class_="read-also"):
                continue
            t = p.get_text(" ", strip=True)
            if t:
                c_parts.append(t)

    tc = ds.select_one("div.wp-block-tag-cloud")
    tags = [a.get_text(strip=True) for a in tc.select("a")] if tc else []

    return {
        "judul": title_el.get_text(strip=True) if title_el else "-",
        "tanggal": date_el.get_text(strip=True) if date_el else "-",
        # dict.fromkeys removes repeated tags while keeping their order.
        "tag": ", ".join(dict.fromkeys(tags)) if tags else "-",
        "isi_berita": "\n".join(c_parts) if c_parts else "-",
        "link": link,
    }


def _scrape_radarcirebon(keyword: str, max_pages: int = 1) -> list:
    """Scrape articles from the radarcirebon.id search for *keyword*.

    Args:
        keyword: Search phrase; it is URL-encoded before being sent.
        max_pages: Number of search result pages to walk (starting at 1).

    Returns:
        One dict per article with the keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link`` (``"-"`` for missing fields), in the
        order the links appear on the result pages.  Each link is scraped at
        most once.  Paging stops at the first page that cannot be fetched.
    """
    BASE_HOST = "https://radarcirebon.id"
    # The keyword becomes a path segment: spaces turn into '+', and '/' must
    # be escaped as well or it would split the path.
    q = quote_plus(keyword)
    dated_path = re.compile(r'/\d{4}/\d{2}/\d{2}/')
    results = []
    seen_links = set()

    with requests.Session() as sess:
        sess.headers.update(_HEADERS)

        for page in range(1, max_pages + 1):
            url = f"{BASE_HOST}/search/{q}/" + (f"page/{page}/" if page > 1 else "")
            r = _get(sess, url)
            if not r:
                break

            soup = BeautifulSoup(r.text, "html.parser")
            # Only links whose URL contains a /YYYY/MM/DD/ segment are kept;
            # ordered de-duplication keeps the output order reproducible.
            page_links = dict.fromkeys(
                a["href"]
                for a in soup.select("article .wp-block-latest-posts__post-title a")
                if dated_path.search(a["href"])
            )

            for link in page_links:
                if link in seen_links:
                    continue
                seen_links.add(link)
                try:
                    row = _parse_radarcirebon_article(sess, link)
                except Exception as exc:
                    # One unparsable article must not discard the whole run.
                    print(f"[News] Error saat memproses {link}: {exc}")
                    continue
                if row is not None:
                    results.append(row)

    return results
|
|
|
|
| |
|
|
# Portal name / domain -> scraper function.
#
# Lookups that fall back to substring matching walk this dict in insertion
# order, so within each family the most specific key is listed first.  That
# keeps the short key "radar" from capturing inputs that name
# "radarcirebon.id", which belongs to a different scraper.
_PORTAL_MAP = {
    "detik.com": _scrape_detik,
    "detik": _scrape_detik,
    "radarcirebon.disway.id": _scrape_radar,
    "radarcirebon.id": _scrape_radarcirebon,
    "radarcirebon": _scrape_radarcirebon,
    "radardisway": _scrape_radar,
    "radar": _scrape_radar,
    "antaranews.com": _scrape_antara,
    "antaranews": _scrape_antara,
    "antara": _scrape_antara,
    "cnnindonesia.com": _scrape_cnn,
    "cnnindonesia": _scrape_cnn,
    "cnn": _scrape_cnn,
}
|
|
|
|
def _match_portal(portal_key: str):
    """Return the scraper whose registered key best matches *portal_key*, or None."""
    # Keys embedded in the input (typically a full URL).  The longest one
    # wins, so "radarcirebon.id" is not shadowed by the shorter key "radar".
    embedded = [key for key in _PORTAL_MAP if key in portal_key]
    if embedded:
        return _PORTAL_MAP[max(embedded, key=len)]

    # Otherwise treat the input as an abbreviation of a registered key.
    for key, fn in _PORTAL_MAP.items():
        if portal_key in key:
            return fn
    return None


def scrape_news(portal: str, pages: int = 1, keyword: str = "kabupaten cirebon") -> list:
    """Scrape news articles from *portal*.

    Args:
        portal: Portal name, domain or URL.  It is matched case-insensitively,
            first exactly against the registered keys and then by substring.
        pages: Number of search result pages to scrape.
        keyword: Search phrase handed to the portal's scraper.

    Returns:
        The list of article dicts produced by the selected scraper.  An empty
        list is returned when *portal* is empty or not recognised, or when
        the scraper raises.
    """
    if not portal:
        return []

    portal_key = portal.strip().lower().rstrip("/")
    if not portal_key:
        # Whitespace or "/" only.  An empty key is a substring of every
        # registered key, so it must be rejected before matching.
        print(f"[News] Portal '{portal}' tidak dikenali.")
        return []

    scraper = _PORTAL_MAP.get(portal_key)
    if scraper is None:
        scraper = _match_portal(portal_key)

    if scraper is None:
        print(f"[News] Portal '{portal}' tidak dikenali.")
        return []

    print(f"[News] Scraping '{portal}' ({pages} pages, keyword='{keyword}')")
    try:
        return scraper(keyword, max_pages=pages)
    except Exception as e:
        # Top-level boundary: report the failure and hand back an empty result.
        print(f"[News] Error saat scraping: {e}")
        return []