"""
rappler_scraper.py
------------------
Scrapes fact-check articles from Rappler's Facts First / Fact-Check sections.
(https://www.rappler.com/facts-first/ and https://www.rappler.com/newsbreak/fact-check/)

Respects robots.txt, caches results for 7 days, and never raises on failure.
"""
from __future__ import annotations

import json
import logging
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import Optional
from urllib.parse import urljoin
from urllib.robotparser import RobotFileParser

import requests
from bs4 import BeautifulSoup

from .base import DataSource, NormalizedSample, clean_text, detect_language

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# User-Agent string identifying this scraper. It is sent with every page
# request (through _HEADERS) and is the agent name that _robots_allows()
# checks against the site's robots.txt rules.
_UA = "PhilVerify-Research/1.0 (academic research; contact: research@philverify.ph)"
# Headers passed to every requests.get() call issued by _get().
_HEADERS = {
    "User-Agent": _UA,
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.5",
}

# ---------------------------------------------------------------------------
# Verdict → label mapping
# ---------------------------------------------------------------------------
_VERDICT_MAP: dict[str, int] = {
    # Likely Fake  (label 2)
    "FALSE": 2,
    "FAKE": 2,
    "MISLEADING": 2,
    "DISINFORMATION": 2,
    "FABRICATED": 2,
    # Unverified  (label 1)
    "UNVERIFIED": 1,
    "NEEDS MORE CONTEXT": 1,
    "MISSING CONTEXT": 1,
    "NEEDS CONTEXT": 1,
    "PARTLY TRUE": 1,
    "PARTLY FALSE": 1,
    "HALF TRUE": 1,
    "MIXTURE": 1,
    "UNPROVEN": 1,
    # Credible  (label 0)
    "TRUE": 0,
    "ACCURATE": 0,
    "CORRECT": 0,
    "VERIFIED": 0,
}

# One word-bounded pattern per verdict, kept in _VERDICT_MAP order. The word
# boundaries stop "CORRECT" from matching inside "INCORRECT" and "TRUE" from
# matching inside "UNTRUE".
_VERDICT_PATTERNS = [
    (key, re.compile(r"\b" + re.escape(key) + r"\b")) for key in _VERDICT_MAP
]

_CACHE_TTL_DAYS = 7  # a cache file older than this many days is not reused
_REQUEST_DELAY = 1.5  # seconds between requests


def _resolve_verdict(raw: str) -> Optional[int]:
    """Normalise a raw verdict string to a label int, or None if unrecognised.

    The string is whitespace-collapsed and upper-cased, then resolved in two
    steps:

    1. An exact match against ``_VERDICT_MAP``.
    2. A whole-word search for every known verdict inside the string. When
       several verdicts occur, the one listed first in ``_VERDICT_MAP`` wins,
       unless a longer matched verdict contains it (``"PARTLY FALSE"`` beats
       ``"FALSE"``), in which case the longer, more specific verdict is used.

    Parameters
    ----------
    raw:
        Verdict text as scraped from the page. May contain surrounding words.

    Returns
    -------
    The label (0 credible, 1 unverified, 2 likely fake) or ``None`` when no
    known verdict appears as a whole word.
    """
    if not raw:
        return None

    # Collapse runs of whitespace so multi-word verdicts split across lines or
    # separated by several spaces still match.
    normalised = " ".join(raw.split()).upper()

    exact = _VERDICT_MAP.get(normalised)
    if exact is not None:
        return exact

    matched = [key for key, pattern in _VERDICT_PATTERNS if pattern.search(normalised)]
    if not matched:
        return None

    # Keep the map's priority order, but upgrade to the longest matched verdict
    # that embeds the first hit so qualifiers such as "PARTLY" are not lost.
    first = matched[0]
    most_specific = max((key for key in matched if first in key), key=len)
    return _VERDICT_MAP[most_specific]


def _robots_allows(base_url: str, path: str) -> bool:
    """Check the site's robots.txt for permission to fetch *path*.

    robots.txt is downloaded from the root of *base_url* and queried with the
    project User-Agent. If the file cannot be read at all, access is assumed
    to be allowed and a warning is logged.

    Returns True when fetching is permitted (or robots.txt was unreadable),
    False when robots.txt disallows the resolved URL.
    """
    robots_url = urljoin(base_url, "/robots.txt")
    parser = RobotFileParser(robots_url)
    try:
        parser.read()
    except Exception as exc:
        # Fail open: an unreachable robots.txt does not block the scrape.
        logger.warning("Could not read robots.txt (%s): %s — proceeding with caution", robots_url, exc)
        return True

    target = urljoin(base_url, path)
    if parser.can_fetch(_UA, target):
        return True

    logger.warning("robots.txt disallows scraping %s", target)
    return False


def _get(url: str, timeout: int = 20) -> Optional[requests.Response]:
    """Fetch *url* with the project headers.

    Returns the response on success. Any ``requests`` error, including an
    HTTP error status, is logged as a warning and reported as None.
    """
    try:
        response = requests.get(url, headers=_HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException as exc:
        logger.warning("GET %s failed: %s", url, exc)
        return None
    return response


def _cache_fresh(cache_path: Path) -> bool:
    """Report whether *cache_path* exists and is younger than the cache TTL.

    Age is taken from the file's modification time and counted in whole days.
    """
    if cache_path.exists():
        written_at = datetime.fromtimestamp(cache_path.stat().st_mtime, tz=timezone.utc)
        elapsed = datetime.now(tz=timezone.utc) - written_at
        return elapsed.days < _CACHE_TTL_DAYS
    return False


class RapplerScraper(DataSource):
    """Scrape fact-check articles from Rappler and return NormalizedSample list.

    Crawls every archive section that robots.txt allows and merges the results:
    - https://www.rappler.com/facts-first/
    - https://www.rappler.com/newsbreak/fact-check/

    Parameters
    ----------
    max_pages:
        Maximum number of listing pages to iterate per section. Defaults to 10.
    """

    BASE_URL = "https://www.rappler.com"

    # Archive sections to crawl, in order. ``fetch`` visits every section that
    # robots.txt allows and merges the de-duplicated article links.
    ARCHIVE_PATHS = [
        "/facts-first/",
        "/newsbreak/fact-check/",
    ]

    def __init__(self, max_pages: int = 10) -> None:
        """Store the page limit and make sure the cache directory exists."""
        self.max_pages = max_pages
        self.cache_file: Path = (
            Path(__file__).parent.parent / "data" / "raw" / "rappler_cache.json"
        )
        self.cache_file.parent.mkdir(parents=True, exist_ok=True)

    # ------------------------------------------------------------------
    # DataSource interface
    # ------------------------------------------------------------------

    @property
    def source_name(self) -> str:
        """Identifier written into the ``source`` field of every sample."""
        return "rappler_factcheck"

    def fetch(self) -> list[NormalizedSample]:
        """Fetch and return normalised samples from Rappler.

        Loads from local cache when available and fresh; otherwise scrapes
        the live site and persists results to cache. A fresh cache file that
        cannot be read (or holds no samples) is ignored and the site is
        scraped again.

        Returns
        -------
        The labelled samples, or an empty list when robots.txt forbids every
        archive path or no article could be collected.
        """
        # 1. Try cache first
        if _cache_fresh(self.cache_file):
            logger.info("Loading Rappler data from cache: %s", self.cache_file)
            cached = self._load_cache()
            if cached:
                return cached
            # _save_cache is only called with a non-empty sample list, so an
            # empty result here means the file is corrupt or unreadable.
            logger.warning("Rappler cache yielded no samples — re-scraping")

        # 2. Respect robots.txt (check each section path)
        allowed_paths = [
            path for path in self.ARCHIVE_PATHS
            if _robots_allows(self.BASE_URL, path)
        ]
        if not allowed_paths:
            logger.error("robots.txt forbids all Rappler fact-check paths — returning []")
            return []

        logger.info("Scraping Rappler (paths: %s, max %d pages each)…", allowed_paths, self.max_pages)

        # 3. Collect article URLs across all allowed archive sections
        article_urls: list[str] = []
        for archive_path in allowed_paths:
            section_urls = self._collect_article_urls(archive_path)
            logger.info("Section %s: found %d article links", archive_path, len(section_urls))
            article_urls.extend(section_urls)
            time.sleep(_REQUEST_DELAY)

        # De-duplicate across sections, keeping first-seen order
        unique_urls = list(dict.fromkeys(article_urls))

        if not unique_urls:
            logger.warning("No article URLs collected from Rappler — returning []")
            return []

        # 4. Scrape individual articles
        samples: list[NormalizedSample] = []
        for idx, url in enumerate(unique_urls, start=1):
            logger.debug("[%d/%d] Scraping %s", idx, len(unique_urls), url)
            sample = self._scrape_article(url)
            if sample is not None:
                samples.append(sample)
            time.sleep(_REQUEST_DELAY)

        logger.info("Rappler: collected %d labelled samples", len(samples))

        # 5. Persist to cache
        if samples:
            self._save_cache(samples)

        return samples

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _collect_article_urls(self, archive_path: str) -> list[str]:
        """Return the distinct article URLs found across the listing pages of *archive_path*.

        Pagination stops at ``max_pages``, at the first page without links,
        or at the first page that contributes no link that was not already
        seen (which happens when the site ignores the page parameter and keeps
        serving the same listing).
        """
        urls: list[str] = []
        seen: set[str] = set()
        for page_num in range(1, self.max_pages + 1):
            page_urls = self._get_article_urls_from_page(archive_path, page_num)
            if not page_urls:
                logger.info(
                    "No articles on page %d of %s — stopping pagination",
                    page_num,
                    archive_path,
                )
                break

            new_urls = [url for url in page_urls if url not in seen]
            if not new_urls:
                logger.info(
                    "Page %d of %s only repeats earlier links — stopping pagination",
                    page_num,
                    archive_path,
                )
                break

            logger.info("  %s page %d: %d links", archive_path, page_num, len(new_urls))
            seen.update(new_urls)
            urls.extend(new_urls)
            time.sleep(_REQUEST_DELAY)
        return urls

    def _listing_page_candidates(self, archive_path: str, page_num: int) -> list[str]:
        """Return concrete URLs to try for a given archive path + page number.

        The section root is offered only for page 1; for later pages it would
        always return the first page again and hide the real paginated URLs.
        """
        base = f"{self.BASE_URL}{archive_path}".rstrip("/")
        candidates = [
            f"{base}/page/{page_num}/",       # WordPress-style
            f"{base}?page={page_num}",        # query-param style
            f"{base}?paged={page_num}",
        ]
        if page_num == 1:
            candidates.insert(0, f"{base}/")  # page 1 root
        return candidates

    def _get_article_urls_from_page(self, archive_path: str, page_num: int) -> list[str]:
        """Fetch one listing page and return article URLs found on it.

        Each candidate URL is tried in turn; the links of the first candidate
        that loads and contains any are returned. Returns an empty list when
        no candidate yields links.
        """
        for url in self._listing_page_candidates(archive_path, page_num):
            resp = _get(url)
            if resp is None:
                time.sleep(0.5)
                continue

            soup = BeautifulSoup(resp.text, "lxml")
            links = self._parse_article_links(soup)
            if links:
                return links
            # If the page loaded but had no links, try next candidate
            time.sleep(0.3)

        return []

    def _parse_article_links(self, soup: BeautifulSoup) -> list[str]:
        """Extract article hrefs from a listing-page soup object.

        Selectors are tried from most to least specific; the first selector
        that produces at least one usable link decides the result. Only
        absolute ``http`` links and root-relative links are kept, and only if
        the resulting URL mentions ``rappler.com``.
        """
        links: list[str] = []

        selectors = [
            "article h2 a",
            "article h3 a",
            ".entry-title a",
            "h2.entry-title a",
            ".story-card__title a",
            ".article-title a",
            ".post-title a",
            "h2 a[href*='fact-check']",
            "h3 a[href*='fact-check']",
            "h2 a[href*='facts-first']",
            "h3 a[href*='facts-first']",
            "h2 a",
        ]
        for selector in selectors:
            for node in soup.select(selector):
                href = node.get("href", "")
                if not href:
                    continue
                if href.startswith("http"):
                    full = href
                elif href.startswith("/"):
                    full = urljoin(self.BASE_URL, href)
                else:
                    continue
                # Only keep URLs that look like Rappler articles
                if "rappler.com" in full:
                    links.append(full)
            if links:
                break

        # De-duplicate preserving order
        return list(dict.fromkeys(links))

    def _scrape_article(self, url: str) -> Optional[NormalizedSample]:
        """Fetch a single Rappler article page and return a NormalizedSample or None.

        None is returned when the page cannot be fetched, when no verdict can
        be extracted or mapped to a label, or when no text is left after
        cleaning.
        """
        resp = _get(url)
        if resp is None:
            return None

        soup = BeautifulSoup(resp.text, "lxml")

        # --- Verdict ---
        raw_verdict = self._extract_verdict(soup)
        if raw_verdict is None:
            logger.debug("No recognisable verdict in %s — skipping", url)
            return None

        label = _resolve_verdict(raw_verdict)
        if label is None:
            logger.debug("Unknown verdict %r at %s — skipping", raw_verdict, url)
            return None

        # --- Headline ---
        headline = ""
        h1 = soup.find("h1")
        if h1:
            headline = h1.get_text(separator=" ", strip=True)

        # --- Body / summary text (the headline is the last resort) ---
        body_text = self._extract_body_text(soup) or headline
        if not body_text:
            return None

        text = clean_text(body_text)
        if not text:
            return None

        lang = detect_language(text)

        return NormalizedSample(
            text=text,
            label=label,
            source=self.source_name,
            language=lang,
            original_label=raw_verdict,
            confidence=1.0,
        )

    @staticmethod
    def _find_verdict_keyword(text: str) -> Optional[str]:
        """Return the verdict keyword that occurs as a whole word in *text*, or None.

        When several keywords occur, the one listed first in ``_VERDICT_MAP``
        wins, unless a longer matched keyword contains it ("PARTLY FALSE"
        beats "FALSE"), in which case the longer keyword is returned.
        """
        upper = text.upper()
        matched = [
            key for key in _VERDICT_MAP
            if re.search(r"\b" + re.escape(key) + r"\b", upper)
        ]
        if not matched:
            return None
        first = matched[0]
        return max((key for key in matched if first in key), key=len)

    @staticmethod
    def _jsonld_nodes(data: object) -> list[dict]:
        """Flatten a decoded JSON-LD payload into its candidate object nodes.

        Accepts a single object or a top-level array, and also includes the
        members of each object's ``@graph`` list.
        """
        nodes: list[dict] = []
        roots = data if isinstance(data, list) else [data]
        for root in roots:
            if not isinstance(root, dict):
                continue
            nodes.append(root)
            graph = root.get("@graph")
            if isinstance(graph, list):
                nodes.extend(member for member in graph if isinstance(member, dict))
        return nodes

    def _verdict_from_jsonld(self, soup: BeautifulSoup) -> Optional[str]:
        """Return the ``reviewRating.alternateName`` of a ClaimReview node, or None.

        Only values that ``_resolve_verdict`` recognises are returned.
        Malformed JSON and unexpectedly shaped nodes are skipped.
        """
        for script in soup.find_all("script", type="application/ld+json"):
            try:
                data = json.loads(script.string or "{}")
            except (json.JSONDecodeError, TypeError):
                continue
            for node in self._jsonld_nodes(data):
                node_type = node.get("@type")
                types = node_type if isinstance(node_type, list) else [node_type]
                if "ClaimReview" not in types:
                    continue
                rating = node.get("reviewRating")
                if not isinstance(rating, dict):
                    continue
                alt_name = rating.get("alternateName")
                if (
                    isinstance(alt_name, str)
                    and alt_name
                    and _resolve_verdict(alt_name) is not None
                ):
                    return alt_name
        return None

    @staticmethod
    def _find_content_div(soup: BeautifulSoup):
        """Return the first <div> whose class mentions the article body, or None.

        Class names are tried from most to least specific.
        """
        return (
            soup.find("div", class_=lambda c: c and "article-body" in c)
            or soup.find("div", class_=lambda c: c and "entry-content" in c)
            or soup.find("div", class_=lambda c: c and "content" in c)
        )

    def _extract_verdict(self, soup: BeautifulSoup) -> Optional[str]:
        """Try several heuristics to extract the verdict string from a Rappler article.

        The heuristics run in order and the first hit is returned:
        verdict/rating elements, ``<meta>`` contents, JSON-LD ClaimReview
        data, emphasised text in the article body, the ``<h1>`` headline and
        finally the ``<title>`` tag. Returns None when none of them matches.
        """
        # 1. Dedicated verdict / rating blocks — Rappler uses coloured label boxes
        verdict_selectors = [
            ".verdict",
            ".rating",
            ".label",
            ".fact-check-label",
            ".fc-label",
            "[class*='verdict']",
            "[class*='rating']",
            "[class*='label-']",
            ".wp-block-group",
            ".rappler-verdict",
        ]
        for sel in verdict_selectors:
            for node in soup.select(sel):
                raw = node.get_text(separator=" ", strip=True)
                if _resolve_verdict(raw) is not None:
                    return raw.strip()

        # 2. Meta tags (Open Graph / Twitter card descriptions often carry the verdict)
        for meta in soup.find_all("meta"):
            content = meta.get("content", "")
            if not content:
                continue
            keyword = self._find_verdict_keyword(content)
            if keyword is not None:
                return keyword

        # 3. Structured data / JSON-LD (schema.org ClaimReview)
        jsonld_verdict = self._verdict_from_jsonld(soup)
        if jsonld_verdict is not None:
            return jsonld_verdict

        # 4. Bold/strong within article body
        article_body = self._find_content_div(soup)
        if article_body:
            for tag in article_body.find_all(["strong", "b", "em", "span"]):
                raw = tag.get_text(strip=True)
                if _resolve_verdict(raw) is not None:
                    return raw

        # 5. Headline heuristic (e.g. "FACT CHECK: … is FALSE"), then
        # 6. the page <title> tag
        for tag_name in ("h1", "title"):
            node = soup.find(tag_name)
            if node:
                keyword = self._find_verdict_keyword(node.get_text(strip=True))
                if keyword is not None:
                    return keyword

        return None

    def _extract_body_text(self, soup: BeautifulSoup) -> str:
        """Extract the best representative text (claim + summary) from the article.

        Sources are tried in priority order: a claim/summary element, the
        first long paragraph of the article body, the Open Graph description
        and the plain meta description. Returns an empty string when none of
        them is present.
        """
        # Priority 1: claim box or summary paragraph
        claim_selectors = [
            ".claim",
            ".claim-text",
            ".fact-check-claim",
            ".article-summary",
            ".entry-summary",
            "blockquote",
        ]
        for sel in claim_selectors:
            node = soup.select_one(sel)
            if node:
                text = node.get_text(separator=" ", strip=True)
                if len(text) > 20:
                    return text

        # Priority 2: first substantive paragraph in article body
        body = self._find_content_div(soup)
        if body:
            for p in body.find_all("p"):
                text = p.get_text(separator=" ", strip=True)
                if len(text) > 40:
                    return text

        # Priority 3: OG description
        og_desc = soup.find("meta", property="og:description")
        if og_desc:
            content = og_desc.get("content", "")
            if len(content) > 20:
                return content

        # Priority 4: meta description
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            return meta_desc.get("content", "")

        return ""

    # ------------------------------------------------------------------
    # Cache helpers
    # ------------------------------------------------------------------

    def _save_cache(self, samples: list[NormalizedSample]) -> None:
        """Write *samples* to the cache file as JSON.

        The payload is written to a sibling temporary file and then moved
        over the cache file, so a failed write does not leave a half-written
        file at the cache path. I/O errors are logged, never raised.
        """
        payload = {
            "timestamp": datetime.now(tz=timezone.utc).isoformat(),
            "source": self.source_name,
            "samples": [
                {
                    "text": s.text,
                    "label": s.label,
                    "source": s.source,
                    "language": s.language,
                    "original_label": s.original_label,
                    "confidence": s.confidence,
                }
                for s in samples
            ],
        }
        tmp_path = self.cache_file.with_name(self.cache_file.name + ".tmp")
        try:
            tmp_path.write_text(
                json.dumps(payload, ensure_ascii=False, indent=2),
                encoding="utf-8",
            )
            tmp_path.replace(self.cache_file)
            logger.info("Rappler cache saved: %s (%d samples)", self.cache_file, len(samples))
        except OSError as exc:
            logger.error("Failed to write cache file %s: %s", self.cache_file, exc)
            # Best-effort cleanup of the partial temp file; nothing more can
            # be done if removing it fails as well.
            try:
                tmp_path.unlink()
            except OSError:
                pass

    def _load_cache(self) -> list[NormalizedSample]:
        """Read the cache file and rebuild the sample list.

        Returns an empty list (after logging the error) when the file cannot
        be read or decoded, or when its content does not have the expected
        structure.
        """
        try:
            payload = json.loads(self.cache_file.read_text(encoding="utf-8"))
            samples = [
                NormalizedSample(
                    text=item["text"],
                    label=item["label"],
                    source=item["source"],
                    language=item["language"],
                    original_label=item["original_label"],
                    confidence=item.get("confidence", 1.0),
                )
                for item in payload.get("samples", [])
            ]
            logger.info("Loaded %d samples from Rappler cache", len(samples))
            return samples
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as exc:
            # ValueError covers both invalid JSON and undecodable bytes;
            # TypeError/AttributeError cover a payload of the wrong shape.
            logger.error("Cache load failed (%s): %s — will re-scrape", self.cache_file, exc)
            return []


# ---------------------------------------------------------------------------
# Quick smoke-test
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(name)s: %(message)s")
    # Two listing pages per section keep the smoke test short.
    smoke_samples = RapplerScraper(max_pages=2).fetch()
    print(f"\nTotal samples: {len(smoke_samples)}")
    # Preview at most the first five samples.
    for item in smoke_samples[:5]:
        print(f"  [{item.label}] ({item.original_label}) {item.text[:120]!r}")