"""
news.py  –  News scraper dispatcher.
Exports: scrape_news(portal, pages, keyword) -> list[dict]

portal: 'detik', 'radar' (radarcirebon.disway.id), 'antara', 'cnn', 'radarcirebon' (radarcirebon.id)
"""
from __future__ import annotations

import random
import re
import time
from urllib.parse import quote, quote_plus, urlparse, urlunparse

import requests
from bs4 import BeautifulSoup


# ── Shared HTTP session helpers ────────────────────────────────────────────────

# Default headers copied onto every requests.Session created in this module:
# a desktop Chrome User-Agent plus Accept / Accept-Language values that list
# Indonesian (id-ID) first.
_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "id-ID,id;q=0.9,en-US;q=0.8,en;q=0.7",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
}

def _get(sess: requests.Session, url: str, retries: int = 3, delay: float = 3.0):
    """Fetch *url* through *sess*, retrying on any failure.

    Args:
        sess: Session used to issue the GET request.
        url: Address to fetch.
        retries: Maximum number of attempts.
        delay: Seconds to sleep between two attempts.

    Returns:
        The response of the first attempt that neither raised nor failed
        ``raise_for_status()``, or ``None`` when every attempt failed.
    """
    for attempts_left in reversed(range(retries)):
        try:
            response = sess.get(url, timeout=20, allow_redirects=True)
            response.raise_for_status()
        except Exception:
            # Any error counts as a failed attempt; pause only if another try follows.
            if attempts_left:
                time.sleep(delay)
        else:
            return response
    return None

def _extract_paragraphs(soup, container_classes: list, min_len: int = 30) -> list:
    """Collect the paragraph texts of an article page.

    Args:
        soup: Parsed document (or any node offering ``find`` / ``find_all``).
        container_classes: CSS class names tried in order; the first ``<div>``
            found with one of them limits the search. Without a hit the whole
            document is searched.
        min_len: Paragraphs shorter than this many characters are dropped.

    Returns:
        Paragraph texts in document order, excluding "baca juga" /
        "lihat juga" / "advertisement" lines.
    """
    skip_prefixes = ("baca juga", "lihat juga", "advertisement")

    scope = soup
    for css_class in container_classes:
        found = soup.find("div", class_=css_class)
        if found:
            scope = found
            break

    paragraphs = (p.get_text(" ", strip=True) for p in scope.find_all("p"))
    return [
        text
        for text in paragraphs
        if text and len(text) >= min_len and not text.lower().startswith(skip_prefixes)
    ]


# ── Detik.com ──────────────────────────────────────────────────────────────────

def _scrape_detik(keyword: str, max_pages: int = 1) -> list:
    """Scrape detik.com search results for *keyword*.

    Visits search-result pages 1..max_pages, opens every listed article and
    collects its title, publish time, tags and body text. Pagination stops at
    the first page that cannot be fetched or lists no articles.

    Args:
        keyword: Search phrase; URL-encoded before it is placed in the query.
        max_pages: Number of search-result pages to visit.

    Returns:
        A list of dicts with the keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``.
    """
    import datetime

    sess = requests.Session()
    sess.headers.update(_HEADERS)
    # Encode once so spaces, '&', '#', ... in the keyword cannot corrupt the query string.
    query = quote_plus(keyword)
    results = []

    for page in range(1, max_pages + 1):
        r = _get(
            sess,
            f"https://www.detik.com/search/searchall?query={query}&sortby=time&page={page}&siteid=2",
        )
        if not r:
            break
        soup = BeautifulSoup(r.text, "html.parser")
        news_list = soup.find_all('div', class_='media')
        if not news_list:
            break

        for news in news_list:
            try:
                title_tag = news.find('h3', class_='media__title')
                if not title_tag:
                    continue
                link_tag = title_tag.find('a', class_='media__link')
                if not link_tag or not link_tag.has_attr('href'):
                    continue
                link = link_tag['href']
                title = link_tag.text.strip()

                # The span's "d-time" attribute is read as a Unix timestamp and
                # rendered in local time; a malformed value only blanks the date.
                news_date = None
                date_tag = news.find('div', class_='media__date')
                span_tag = date_tag.find('span') if date_tag else None
                if span_tag and span_tag.has_attr('d-time'):
                    try:
                        news_date = datetime.datetime.fromtimestamp(int(span_tag['d-time']))
                    except (ValueError, OverflowError, OSError):
                        news_date = None

                news_resp = _get(sess, link)
                if not news_resp:
                    continue
                news_soup = BeautifulSoup(news_resp.text, 'html.parser')

                content_div = (
                    news_soup.find('div', class_='detail__body-text')
                    or news_soup.find('div', class_='detail_text')
                )
                content = ""
                if content_div:
                    parts = []
                    for tag in content_div.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
                        text = tag.get_text(strip=True)
                        if not text:
                            continue
                        # Headings are kept, marked with their level (e.g. "H2: ...").
                        if tag.name.startswith('h'):
                            parts.append(f"{tag.name.upper()}: {text}")
                        else:
                            parts.append(text)
                    content = '\n'.join(parts)

                nav_div = (
                    news_soup.find('div', class_='detail_tag')
                    or news_soup.find('div', class_='tag__list')
                    or news_soup.find('div', class_='nav')
                )
                tags = [a.text.strip() for a in nav_div.find_all('a')] if nav_div else []

                results.append({
                    'judul': title,
                    'tanggal': news_date.strftime('%Y-%m-%d %H:%M') if news_date else '',
                    'tag': ', '.join(tags),
                    'isi_berita': content,
                    'link': link
                })
            except Exception as exc:
                # Best effort: one broken article must not abort the run, but report it.
                print(f"[News] Error saat memproses artikel detik: {exc}")
        time.sleep(2)  # pause between search-result pages
    return results


# ── Radar ──────────────────────────────────────────────────────────────────────

def _scrape_radar(keyword: str, max_pages: int = 1) -> list:
    """Scrape radarcirebon.disway.id search results for *keyword*.

    Visits up to *max_pages* search-result pages, opens every listed article
    and collects its title, date text, tags and body text. Pagination stops at
    the first page that cannot be fetched or lists no articles.

    Args:
        keyword: Search phrase; URL-encoded before it is placed in the query.
        max_pages: Number of search-result pages to visit.

    Returns:
        A list of dicts with the keys ``judul``, ``tanggal``, ``tag``
        (``"-"`` when none), ``isi_berita`` and ``link``.
    """
    BASE_HOST = "https://radarcirebon.disway.id"
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    q = quote_plus(keyword)  # identical for every page, so encode once
    results = []

    def _abs(href):
        """Turn a possibly relative href into an absolute URL (None if empty)."""
        if not href:
            return None
        href = href.strip()
        return href if href.startswith("http") else BASE_HOST + "/" + href.lstrip("/")

    for page in range(1, max_pages + 1):
        # Pages after the first are addressed by an offset in steps of 30.
        offset = (page - 1) * 30
        if page > 1:
            url = f"{BASE_HOST}/search/kata/{offset}/{offset}/?c={q}&num="
        else:
            url = f"{BASE_HOST}/search/kata/?c={q}&num="

        r = _get(sess, url)
        if not r:
            break
        soup = BeautifulSoup(r.text, "html.parser")

        news_list = soup.find_all(class_='media-heading') or soup.find_all('div', class_='media')
        if not news_list:
            # Nothing listed on this page: stop instead of requesting further pages.
            break

        for item in news_list:
            try:
                a = item.find('a', href=True)
                if not a:
                    continue
                link = _abs(a.get('href'))
                if not link:
                    # An empty href would otherwise be retried by _get with sleeps in between.
                    continue
                title = a.get_text(strip=True)

                detail_r = _get(sess, link)
                if not detail_r:
                    continue
                detail_soup = BeautifulSoup(detail_r.text, "html.parser")

                h1 = detail_soup.find('h1', class_='text-black') or detail_soup.find('h1')
                title_detail = h1.get_text(strip=True) if h1 else title

                date_detail_tag = detail_soup.find('span', class_='date') or detail_soup.find(class_='date')
                date_text = date_detail_tag.get_text(strip=True) if date_detail_tag else ""

                content_container = (
                    detail_soup.find('div', class_='entry-content')
                    or detail_soup.find('div', class_='post-content')
                )
                content = ""
                if content_container:
                    paragraphs = (p.get_text(strip=True) for p in content_container.find_all('p'))
                    content = "\n".join(text for text in paragraphs if 'Baca Juga:' not in text)

                tag_links = detail_soup.find_all('a', href=lambda href: href and '/listtag/' in href)
                tags = [a_tag.get('title', '').strip() for a_tag in tag_links if a_tag.get('title')]

                results.append({
                    "judul": title_detail,
                    "tanggal": date_text,
                    "tag": ", ".join(tags) if tags else "-",
                    "isi_berita": content,
                    "link": link
                })
            except Exception as exc:
                # Best effort: one broken article must not abort the run, but report it.
                print(f"[News] Error saat memproses artikel radar: {exc}")
        time.sleep(2)  # pause between search-result pages
    return results

# ── Antara ─────────────────────────────────────────────────────────────────────

def _scrape_antara(keyword: str, max_pages: int = 1) -> list:
    """Scrape antaranews.com search results for *keyword*.

    Visits up to *max_pages* search-result pages, follows every link whose
    href contains ``/berita/`` and collects title, date text, tags and body
    text. Each article is fetched once, in the order its link first appears.

    Args:
        keyword: Search phrase; URL-encoded before it is placed in the query.
        max_pages: Number of search-result pages to visit.

    Returns:
        A list of dicts with the keys ``judul``, ``tanggal``, ``tag``
        (``"-"`` when none), ``isi_berita`` and ``link``.
    """
    BASE_HOST = "https://www.antaranews.com"
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    q = quote_plus(keyword)  # identical for every page, so encode once
    results = []
    # Normalised links already handled. Checking against this set keeps page
    # order and avoids re-scraping an article that shows up on several pages.
    seen = set()

    def _norm(href):
        """Return an absolute URL without query, fragment or trailing slash, or None."""
        if not href:
            return None
        href = href.strip()
        if href.startswith("/"):
            href = BASE_HOST + href
        elif not href.startswith("http"):
            return None
        return urlunparse(urlparse(href)._replace(query="", fragment="")).rstrip("/")

    for page in range(1, max_pages + 1):
        url = f"{BASE_HOST}/search?q={q}" + (f"&page={page}" if page > 1 else "")
        r = _get(sess, url)
        if not r:
            break
        soup = BeautifulSoup(r.text, "html.parser")

        for anchor in soup.select('a[href*="/berita/"]'):
            link = _norm(anchor.get('href'))
            if not link or link in seen:
                continue
            seen.add(link)

            detail_r = _get(sess, link)
            if not detail_r:
                continue
            detail_soup = BeautifulSoup(detail_r.text, "html.parser")

            h1 = detail_soup.select_one('div.wrap__article-detail-title h1') or detail_soup.find('h1')
            title_detail = h1.get_text(strip=True) if h1 else ""

            # The date is the text of the <li> that holds the calendar icon.
            date_detail = ""
            cal_icon = detail_soup.select_one('i.fa-calendar') or detail_soup.select_one('i.fas.fa-calendar')
            date_item = cal_icon.find_parent('li') if cal_icon else None
            if date_item:
                date_detail = date_item.get_text(" ", strip=True)

            content_parts = _extract_paragraphs(
                detail_soup, ["wrap__article-detail-content", "detail__body-text"]
            )

            tags = []
            for a in detail_soup.select('a[href*="/tag/"]'):
                tag_text = a.get('title') or a.get_text(strip=True)
                if tag_text:
                    tags.append(tag_text)

            results.append({
                "judul": title_detail,
                "tanggal": date_detail,
                # dict.fromkeys drops repeated tags while keeping first-seen order.
                "tag": ", ".join(dict.fromkeys(tags)) if tags else "-",
                "isi_berita": "\n".join(content_parts),
                "link": link
            })
    return results

# ── CNN ────────────────────────────────────────────────────────────────────────

def _scrape_cnn(keyword: str, max_pages: int = 1) -> list:
    """Scrape cnnindonesia.com search results for *keyword*.

    Search-result pages are loaded through a Selenium driver obtained from the
    project helper ``_create_driver``; the article pages themselves are fetched
    with ``requests``. Each article is fetched once, in the order its link
    first appears. The driver is always shut down, even when scraping raises.

    Args:
        keyword: Search phrase; URL-encoded before it is placed in the query.
        max_pages: Number of search-result pages to visit.

    Returns:
        A list of dicts with the keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``; missing values are ``"-"``.
    """
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    from ._driver import _create_driver

    BASE_HOST = "https://www.cnnindonesia.com"
    # Only hrefs whose path contains /<14 digits>-<2-3 digits>-<6+ digits> are followed.
    article_path = re.compile(r'/\d{14}-\d{2,3}-\d{6,}')
    q = quote(keyword)
    results = []
    seen = set()  # links already handled, across all pages

    # One HTTP session serves every article request.
    sess = requests.Session()
    sess.headers.update(_HEADERS)

    driver = _create_driver(mobile=False)
    try:
        for page in range(1, max_pages + 1):
            url = f"{BASE_HOST}/search?query={q}&result_type=latest" + (f"&page={page}" if page > 1 else "")
            driver.get(url)

            if page == 1:
                # Click the "AGREE" button if one shows up; its absence is not an error.
                try:
                    WebDriverWait(driver, 5).until(
                        EC.element_to_be_clickable((By.XPATH, "//button[text()='AGREE']"))
                    ).click()
                except Exception:
                    pass

            try:
                WebDriverWait(driver, 15).until(
                    EC.presence_of_element_located((By.CSS_SELECTOR, "div.nhl-list article a"))
                )
            except Exception:
                # No result list appeared in time: skip this page.
                continue

            soup = BeautifulSoup(driver.page_source, "html.parser")

            for anchor in soup.select('div.nhl-list article a[href]'):
                link = anchor['href']
                if link in seen or not article_path.search(urlparse(link).path):
                    continue
                seen.add(link)

                html = _get(sess, link)
                if not html:
                    continue
                ds = BeautifulSoup(html.text, "html.parser")

                title_el = ds.select_one('h1')
                title = title_el.get_text(strip=True) if title_el else "-"

                date_el = ds.select_one('div.text-cnn_grey.text-sm')
                date_text = date_el.get_text(strip=True) if date_el else "-"

                # Tags are the links in the <div> following the "TOPIK TERKAIT" title box.
                tags_list = []
                tk_header = ds.find('div', class_='title-box', text=re.compile(r'\s*TOPIK TERKAIT\s*'))
                tag_box = tk_header.find_next_sibling('div') if tk_header else None
                if tag_box:
                    tags_list = [t.get_text(strip=True) for t in tag_box.select('a')]

                content_container = ds.select_one("div.detail-text")
                if content_container:
                    paragraphs = (p.get_text(" ", strip=True) for p in content_container.find_all('p'))
                    content = "\n".join(
                        text for text in paragraphs if not text.lower().startswith("lihat juga")
                    )
                else:
                    content = "-"

                results.append({
                    "judul": title,
                    "tanggal": date_text,
                    "tag": ", ".join(tags_list) if tags_list else "-",
                    "isi_berita": content,
                    "link": link
                })
    finally:
        # Release the browser even if a page load or parse step raised.
        driver.quit()
    return results

# ── RadarCirebonID ─────────────────────────────────────────────────────────────

def _scrape_radarcirebon(keyword: str, max_pages: int = 1) -> list:
    """Scrape radarcirebon.id search results for *keyword*.

    Visits up to *max_pages* search-result pages, follows every post-title link
    whose URL contains a ``/YYYY/MM/DD/`` segment and collects title, date text,
    tags and body text. Each article is fetched once, in the order its link
    first appears.

    Args:
        keyword: Search phrase; percent-encoded, with spaces turned into ``+``.
        max_pages: Number of search-result pages to visit.

    Returns:
        A list of dicts with the keys ``judul``, ``tanggal``, ``tag``,
        ``isi_berita`` and ``link``; missing values are ``"-"``.
    """
    BASE_HOST = "https://radarcirebon.id"
    dated_path = re.compile(r'/\d{4}/\d{2}/\d{2}/')
    sess = requests.Session()
    sess.headers.update(_HEADERS)
    q = quote(keyword).replace('%20', '+')  # identical for every page, so encode once
    results = []
    seen = set()  # links already handled, across all pages

    for page in range(1, max_pages + 1):
        url = f"{BASE_HOST}/search/{q}/" + (f"page/{page}/" if page > 1 else "")
        r = _get(sess, url)
        if not r:
            break

        soup = BeautifulSoup(r.text, "html.parser")

        for anchor in soup.select('article .wp-block-latest-posts__post-title a'):
            # .get() so that an anchor without href is skipped instead of raising KeyError.
            link = anchor.get('href')
            if not link or link in seen or not dated_path.search(link):
                continue
            seen.add(link)

            detail_r = _get(sess, link)
            if not detail_r:
                continue
            ds = BeautifulSoup(detail_r.text, "html.parser")

            title_el = ds.select_one('h1.entry-title')
            date_el = ds.select_one('time.entry-date')

            c_parts = []
            cc = ds.select_one('div.entry-content')
            if cc:
                for p in cc.select('p'):
                    # Paragraphs inside a "read-also" box are not article text.
                    if p.find_parent(class_='read-also'):
                        continue
                    t = p.get_text(" ", strip=True)
                    if t:
                        c_parts.append(t)

            tc = ds.select_one('div.wp-block-tag-cloud')
            tags = [a.get_text(strip=True) for a in tc.select('a')] if tc else []

            results.append({
                "judul": title_el.get_text(strip=True) if title_el else "-",
                "tanggal": date_el.get_text(strip=True) if date_el else "-",
                # dict.fromkeys drops repeated tags while keeping first-seen order.
                "tag": ", ".join(dict.fromkeys(tags)) if tags else "-",
                "isi_berita": "\n".join(c_parts) if c_parts else "-",
                "link": link
            })

    return results


# ── Public API ─────────────────────────────────────────────────────────────────

# Dispatch table used by scrape_news(): lower-case portal alias -> scraper function.
# Several aliases (short name, site name, domain) point at the same scraper.
_PORTAL_MAP = {
    "detik":                 _scrape_detik,
    "detik.com":             _scrape_detik,
    "radar":                 _scrape_radar,
    "radardisway":           _scrape_radar,
    "radarcirebon.disway.id": _scrape_radar,
    "antara":                _scrape_antara,
    "antaranews":            _scrape_antara,
    "antaranews.com":        _scrape_antara,
    "cnn":                   _scrape_cnn,
    "cnnindonesia":          _scrape_cnn,
    "cnnindonesia.com":      _scrape_cnn,
    "radarcirebon":          _scrape_radarcirebon,
    "radarcirebon.id":       _scrape_radarcirebon,
}


def _longest_key_match(text: str):
    """Return the scraper whose longest registered alias occurs inside *text*, or None."""
    hits = [key for key in _PORTAL_MAP if key in text]
    return _PORTAL_MAP[max(hits, key=len)] if hits else None


def scrape_news(portal: str, pages: int = 1, keyword: str = "kabupaten cirebon") -> list:
    """Scrape news articles from the portal identified by *portal*.

    The portal name is trimmed, lower-cased and stripped of trailing slashes,
    then resolved in this order:

    1. exact alias in ``_PORTAL_MAP``;
    2. the longest alias contained in the name, so a URL such as
       ``https://radarcirebon.id/`` resolves to "radarcirebon.id" rather
       than to the shorter alias "radar";
    3. the first alias that contains the name (abbreviations);
    4. the longest alias contained in the URL's host part.

    Args:
        portal: Alias, domain or URL of the portal.
        pages: Number of search-result pages to scrape.
        keyword: Search phrase handed to the scraper.

    Returns:
        The list produced by the selected scraper. An empty list is returned
        when *portal* is empty or unrecognised, or when the scraper raises.
    """
    if not portal:
        return []
    portal_key = portal.strip().lower().rstrip("/")

    scraper = None
    # An empty key (e.g. "  " or "/") is a substring of every alias and must not match.
    if portal_key:
        scraper = _PORTAL_MAP.get(portal_key)
        if scraper is None:
            scraper = _longest_key_match(portal_key)
        if scraper is None:
            scraper = next((fn for key, fn in _PORTAL_MAP.items() if portal_key in key), None)
        if scraper is None:
            try:
                domain = urlparse(portal).netloc.lower()
            except ValueError:
                # urlparse rejects some malformed hosts; treat that as "no domain".
                domain = ""
            if domain:
                scraper = _longest_key_match(domain)

    if scraper is None:
        print(f"[News] Portal '{portal}' tidak dikenali.")
        return []

    print(f"[News] Scraping '{portal}' ({pages} pages, keyword='{keyword}')")
    try:
        return scraper(keyword, max_pages=pages)
    except Exception as e:
        # Top-level boundary: report the failure and hand back an empty result.
        print(f"[News] Error saat scraping: {e}")
        return []