"""
PhilVerify — Claim Extractor
Extracts the key falsifiable claim from noisy social media text.

Strategy: sentence scoring based on presence of named entities,
verbs, dates, and numbers — no heavy model required.

Filipino fake news headlines almost always embed the checkworthy
assertion in a sentence that contains a specific number/date + person/org
name + an attribution verb (sinabi, ayon, announced, confirmed, etc.).
Scoring these signals finds the right sentence faster and more reliably
than a summarization model that was trained on English news compression.
"""
import re
import logging
from dataclasses import dataclass

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Sentence boundary: a run of whitespace that directly follows '.', '!' or '?'.
# The lookbehind is zero-width, so the punctuation stays attached to the
# sentence that precedes the split point.
# NOTE(review): text separated only by line breaks (no terminal punctuation)
# is NOT split, while abbreviations such as "Dr. " ARE split — confirm this
# is acceptable for the inputs this module receives.
_SENTENCE_SPLIT = re.compile(r"(?<=[.!?])\s+")

# Numbers, percentages, or month names signal a specific, verifiable claim.
#
# Alternatives, in order:
#   1. A digit followed by any run of digits, commas, dots or percent signs.
#   2. "May" / "March", matched case-SENSITIVELY (title case only). Lowercase
#      "may" is an English modal verb and a very common Tagalog particle, and
#      lowercase "march" is a common verb; matching them case-insensitively
#      would mark ordinary sentences as containing a date. Dates written in
#      other casings normally carry a day or year number, which alternative 1
#      still catches.
#   3. Every other English and Filipino month name, case-insensitively via a
#      scoped (?i:...) group.
_DATE_OR_NUM = re.compile(
    r"\b(\d[\d,.%]*"
    r"|(?:May|March)"
    r"|(?i:January|February|April|June|July|August|"
    r"September|October|November|December"
    r"|Enero|Pebrero|Marso|Abril|Mayo|Hunyo|Hulyo|Agosto|"
    r"Setyembre|Oktubre|Nobyembre|Disyembre))\b"
)

# Attribution / assertion verbs in English and Filipino, matched as whole
# words regardless of case. The alternation is assembled from the word list
# below so each group of verbs can be read (and extended) on its own line.
_VERB_PATTERN = re.compile(
    r"\b("
    + "|".join(
        (
            # English copulas and auxiliaries
            "is", "are", "was", "were", "has", "have", "had", "will", "would",
            # English attribution verbs
            "said", "says", "announced", "confirmed", "reported", "claims",
            "showed",
            # English event / action verbs
            "found", "revealed", "arrested", "killed", "died", "signed",
            "approved", "ordered",
            # Filipino attribution verbs and markers
            "sinabi", "ipinahayag", "inanunsyo", "kinumpirma", "ayon",
            "nagpahayag", "inihayag", "iniutos", "nagsabi", "ipinag-utos",
        )
    )
    + r")\b",
    re.IGNORECASE,
)


@dataclass
class ClaimResult:
    """Result of a claim extraction: the selected text and how it was selected."""

    # The extracted claim text (may be an empty string for empty input).
    claim: str
    # Which path produced `claim`; one of the values listed on the right.
    method: str   # "passthrough" | "sentence_scoring" | "sentence_heuristic"


def _score_sentence(sent: str) -> float:
    """Rate how likely *sent* is to carry a falsifiable claim.

    The score is the sum of three independent signals:

    * 2.0 if the sentence contains a number or month name,
    * 1.0 per attribution/assertion verb, capped at 3.0,
    * 1.0 if the sentence is longer than 30 characters.

    Returns:
        A float between 0.0 and 6.0.
    """
    specificity_points = 2.0 if _DATE_OR_NUM.search(sent) else 0.0

    # Cap the verb contribution so a long, verb-heavy sentence cannot
    # outweigh the other signals.
    verb_hits = len(_VERB_PATTERN.findall(sent))
    verb_points = min(3.0, verb_hits * 1.0)

    length_points = 1.0 if len(sent) > 30 else 0.0

    return specificity_points + verb_points + length_points


class ClaimExtractor:
    """
    Extracts the single most falsifiable claim from input text using
    sentence scoring. Relies only on the module's compiled regexes — no
    model is loaded or called here.

    The highest-scoring sentence (by date/number presence, verb count and
    length) is returned as the claim.
    """

    def extract(self, text: str) -> ClaimResult:
        """Pick the most check-worthy sentence out of *text*.

        Args:
            text: Raw input text. ``None`` and empty/whitespace-only strings
                are tolerated and treated as empty input.

        Returns:
            A ``ClaimResult`` whose ``method`` tells which path was taken:

            * ``"passthrough"`` — the stripped input is shorter than 20
              characters and is returned unchanged.
            * ``"sentence_scoring"`` — the highest-scoring sentence (the
              first one on ties) is returned.
            * ``"sentence_heuristic"`` — no sentence was usable or every
              score was zero; a prefix of the input or the first two
              candidate sentences is returned instead.
        """
        # `text or ""` makes the guard safe for None as well as "": calling
        # .strip() directly on None would raise AttributeError.
        cleaned = (text or "").strip()
        if len(cleaned) < 20:
            return ClaimResult(claim=cleaned, method="passthrough")

        sentences = [s.strip() for s in _SENTENCE_SPLIT.split(cleaned)]
        # Very short fragments are dropped before scoring.
        candidates = [s for s in sentences if len(s) > 15]

        if not candidates:
            # Slice the already-stripped text so leading whitespace does not
            # consume part of the 200-character budget.
            return ClaimResult(claim=cleaned[:200].strip(), method="sentence_heuristic")

        scored = [(s, _score_sentence(s)) for s in candidates]
        # max() keeps the earliest sentence when several share the top score.
        best_sent, best_score = max(scored, key=lambda pair: pair[1])

        if best_score > 0:
            return ClaimResult(claim=best_sent, method="sentence_scoring")

        # All scores zero — fall back to first two sentences
        return ClaimResult(
            claim=" ".join(candidates[:2]),
            method="sentence_heuristic",
        )