from __future__ import annotations

import re
from dataclasses import dataclass
from typing import List, Optional

from loguru import logger

from config import settings
from models.model_loader import get_model_loader

# Substrings that identify a classifier label as the "fake" class. They are
# matched against the lower-cased label name in _scores_to_classification.
FAKE_TOKENS = (
    "fake",
    "false",
    "unreliable",
    "misinformation",
)

# Headline-style hooks as (regex, tag) pairs. score_sensationalism searches each
# regex case-insensitively and counts how many of the patterns occur at least
# once in the text.
CLICKBAIT_PATTERNS = [
    (regex, "clickbait")
    for regex in (
        r"\byou won'?t believe\b",
        r"\bbreaking\s*:",
        r"\bshocking\s*:",
        r"\bexclusive\s*:",
        r"\bjust\s+in\s*:",
        r"\burgent\s*:",
        r"\bwhat\s+happens\s+next\b",
        r"\bthis\s+will\s+change\b",
        r"\b(?:everyone|nobody)\s+(?:is|was)\s+talking\b",
    )
]

# Emotionally loaded vocabulary. score_sensationalism lower-cases each
# whitespace-separated word, strips the characters ".,!?;:" from both ends and
# counts the words found in this set.
EMOTIONAL_WORDS = {
    "amazing", "blasted", "bombshell", "chaos",
    "crisis", "destroyed", "devastating", "disgusting",
    "explosive", "exposed", "furious", "heartbreaking",
    "horrifying", "incredible", "jaw-dropping", "outrage",
    "revealed", "scandal", "shocking", "slammed",
    "terrifying", "unbelievable",
}

# Superlative and extreme-degree words. score_sensationalism counts the
# lower-cased, punctuation-stripped words of the text that appear in this set.
SUPERLATIVES = {
    "best",
    "biggest",
    "deadliest",
    "fastest",
    "greatest",
    "largest",
    "least",
    "most",
    "smallest",
    "ultimate",
    "worst",
}

# Rhetorical red flags as (regex, pattern_type, severity, description) tuples.
# detect_manipulation_indicators runs every regex case-insensitively over the
# text and emits one ManipulationIndicator per match.
MANIPULATION_PATTERNS = [
    # --- Claims attributed to vague or unnamed sources ---
    (r"\bsources?\s+(?:say|said|claim|report)\b", "unverified_claim", "medium",
     "Unverified source attribution without specific citation"),
    (r"\ballegedly\b", "unverified_claim", "low",
     "Hedging language suggests unverified information"),
    (r"\breports?\s+suggest\b", "unverified_claim", "medium",
     "Vague report attribution"),
    (r"\baccording\s+to\s+(?:some|many|several)\b", "unverified_claim", "medium",
     "Non-specific source attribution"),
    # The regex used to read r"\brunconfirmed\b" (a stray "r" after the word
    # boundary), which could never match the word "unconfirmed".
    (r"\bunconfirmed\b", "unverified_claim", "medium",
     "Explicitly unconfirmed information"),
    # --- Language aimed at provoking an emotional reaction ---
    (r"\boutrage\b", "emotional_manipulation", "medium",
     "Emotional trigger word designed to provoke reaction"),
    (r"\bshocking\s+truth\b", "emotional_manipulation", "high",
     "Sensationalist phrase designed to manipulate reader emotion"),
    (r"\bwake\s+up\b", "emotional_manipulation", "medium",
     "Call-to-action implying hidden knowledge"),
    (r"\bthey\s+don'?t\s+want\s+you\s+to\s+know\b", "emotional_manipulation", "high",
     "Conspiracy framing language"),
    (r"\bopen\s+your\s+eyes\b", "emotional_manipulation", "medium",
     "Implies audience ignorance"),
    # --- Appeals to unnamed authorities or "common knowledge" ---
    (r"\bexperts?\s+(?:confirm|say|agree|warn)\b", "false_authority", "medium",
     "Unnamed expert citation without specific attribution"),
    (r"\bscientists?\s+(?:confirm|prove|say)\b", "false_authority", "medium",
     "Unnamed scientist citation"),
    (r"\bstudies?\s+(?:show|prove|confirm)\b", "false_authority", "low",
     "Vague study reference without citation"),
    (r"\beveryone\s+knows\b", "false_authority", "medium",
     "Appeal to common knowledge fallacy"),
    (r"\bit'?s\s+(?:a\s+)?(?:well-?known|proven)\s+fact\b", "false_authority", "medium",
     "Assertion of fact without evidence"),
]

# Entity labels whose text extract_entities ranks ahead of everything else.
_NER_PREFERRED = {
    "PERSON",
    "ORG",
    "GPE",
    "EVENT",
    "PRODUCT",
    "NORP",
}
# Numeric entity labels (counts, amounts). extract_entities ranks these right
# after the preferred entities, but only when the text is digit-only (commas
# ignored) and at most 6 characters long.
_NER_NUMERIC = {
    "CARDINAL",
    "MONEY",
    "QUANTITY",
}


@dataclass
class TextClassification:
    """Outcome of classifying one piece of text as fake or real."""

    # Top-scoring model label, or one of the sentinels this module sets itself:
    # "unknown" (empty text), "unsupported_language", "uncertain_label_mapping".
    label: str
    # Score of the top label; 0.0 whenever `label` is one of the sentinels.
    confidence: float
    # Score attributed to the "fake" class. 0.5 when it cannot be determined
    # (unsupported language, unknown label mapping); 0.0 for empty text.
    fake_prob: float
    # Label -> score mapping from the model; {} when no model was run.
    all_scores: dict[str, float]


@dataclass
class SensationalismResult:
    """Sensationalism score for a text plus the raw counts behind it."""

    # Overall score, clamped to the range 0-100.
    score: int
    # "Low" (score < 30), "Medium" (score < 60) or "High".
    level: str
    # Number of "!" characters in the text.
    exclamation_count: int
    # Whitespace-separated words that are fully upper-case and longer than 2 chars.
    caps_word_count: int
    # Number of CLICKBAIT_PATTERNS that matched at least once.
    clickbait_matches: int
    # Words (lower-cased, edge punctuation stripped) found in EMOTIONAL_WORDS.
    emotional_word_count: int
    # Words (lower-cased, edge punctuation stripped) found in SUPERLATIVES.
    superlative_count: int


@dataclass
class ManipulationIndicator:
    """One match of a MANIPULATION_PATTERNS regex inside an analysed text."""

    # Category tag from the pattern table, e.g. "unverified_claim",
    # "emotional_manipulation" or "false_authority".
    pattern_type: str
    # The exact substring that matched (original casing preserved).
    matched_text: str
    # Character offset where the match starts.
    start_pos: int
    # Character offset just past the end of the match.
    end_pos: int
    # Severity tag from the pattern table: "low", "medium" or "high".
    severity: str
    # Human-readable explanation taken from the pattern table.
    description: str


def detect_language(text: str) -> str:
    """Guess the language code of *text*, falling back to ``"en"``.

    Empty input, or input shorter than 10 characters once stripped, is reported
    as ``"en"`` without running the detector. Otherwise ``langdetect`` is
    imported lazily and asked for a code. A missing package or any detection
    error also yields ``"en"``.
    """
    candidate = text.strip() if text else ""
    if len(candidate) < 10:
        return "en"

    try:
        from langdetect import detect  # type: ignore

        detected = detect(candidate)
        logger.info(f"Language detected: {detected}")
        return detected
    except ImportError:
        # Optional dependency is absent: treat everything as English.
        logger.debug("langdetect not installed - defaulting to 'en'")
        return "en"
    except Exception as exc:
        logger.debug(f"Language detection failed: {exc} - defaulting to 'en'")
        return "en"


def _scores_to_classification(items, *, allow_label0_fallback: bool = True) -> TextClassification:
    """Turn text-classification pipeline output into a TextClassification.

    ``items`` is an iterable of mappings with ``"label"`` and ``"score"`` keys.
    The fake probability is resolved in this order:

    1. The highest score among labels whose lower-cased name contains one of
       FAKE_TOKENS.
    2. The ``LABEL_0`` score, if ``allow_label0_fallback`` is true and both
       ``LABEL_0`` and ``LABEL_1`` are present. The bundled jy46604790 model
       uses LABEL_0=fake/LABEL_1=real, but replacement models may not.
    3. Otherwise a warning is logged and the result is marked
       ``"uncertain_label_mapping"`` with confidence 0.0 and fake_prob 0.5.
    """
    scores = {}
    for entry in items:
        name = entry["label"]
        scores[name] = float(entry["score"])

    top_label = max(scores, key=scores.get)
    top_conf = scores[top_label]

    fake_candidates = [
        prob
        for name, prob in scores.items()
        if any(token in name.lower() for token in FAKE_TOKENS)
    ]

    if fake_candidates:
        fake_prob = max(fake_candidates)
    elif allow_label0_fallback and "LABEL_0" in scores and "LABEL_1" in scores:
        fake_prob = scores["LABEL_0"]
    else:
        # No way to tell which label means "fake": report a neutral result.
        logger.warning(f"Could not infer fake label from text model labels: {list(scores)}")
        top_label = "uncertain_label_mapping"
        top_conf = 0.0
        fake_prob = 0.5

    return TextClassification(top_label, top_conf, fake_prob, scores)


def classify_text(text: str, language: Optional[str] = None) -> TextClassification:
    """Classify *text* with the configured text model.

    Args:
        text: Text to classify. Only the first 2000 characters of the stripped
            text are passed to the model.
        language: Language code of the text. ``None``, ``""`` and ``"en"``
            select the default model; anything else selects the multilingual
            model.

    Returns:
        The classification. Empty text yields label ``"unknown"``. A
        non-English language with no ``settings.TEXT_MULTILANG_MODEL_ID``
        configured yields ``"unsupported_language"`` with fake_prob 0.5.
    """
    cleaned = (text or "").strip()
    if not cleaned:
        return TextClassification("unknown", 0.0, 0.0, {})

    loader = get_model_loader()
    non_english = bool(language and language != "en")

    if non_english:
        if not settings.TEXT_MULTILANG_MODEL_ID:
            logger.warning(f"No multilingual text model configured for language={language}; returning uncertain score")
            return TextClassification("unsupported_language", 0.0, 0.5, {})
        pipe = loader.load_multilang_text_model()
    else:
        pipe = loader.load_text_model()

    raw_output = pipe(cleaned[:2000], truncation=True, top_k=None)
    # The pipeline may wrap the per-label scores in an outer list; unwrap it.
    first = raw_output[0]
    items = first if isinstance(first, list) else raw_output

    # The LABEL_0 == fake convention is only trusted for the default model.
    result = _scores_to_classification(items, allow_label0_fallback=not non_english)
    logger.info(
        f"Text classify [{language or 'en'}] -> {result.label} @ {result.confidence:.3f} "
        f"fake_p={result.fake_prob:.3f}"
    )
    return result


def score_sensationalism(text: str) -> SensationalismResult:
    """Score how sensationalist *text* reads on a 0-100 scale.

    Five signals contribute, each capped: exclamation marks (8 points each,
    max 25), share of all-caps words longer than two characters (max 25),
    distinct clickbait patterns (12 each, max 25), emotional words (6 each,
    max 15) and superlatives (5 each, max 10). The level is "Low" below 30,
    "Medium" below 60 and "High" otherwise. Empty text scores 0 / "Low".
    """
    if not text:
        return SensationalismResult(0, "Low", 0, 0, 0, 0, 0)

    tokens = text.split()
    # Lower-cased tokens with edge punctuation removed, for vocabulary lookups.
    bare_tokens = [tok.lower().strip(".,!?;:") for tok in tokens]

    exclamations = text.count("!")
    shouted = len([tok for tok in tokens if tok.isupper() and len(tok) > 2])

    clickbait_hits = 0
    for regex, _tag in CLICKBAIT_PATTERNS:
        if re.search(regex, text, re.IGNORECASE):
            clickbait_hits += 1

    emotional_hits = sum(1 for tok in bare_tokens if tok in EMOTIONAL_WORDS)
    superlative_hits = sum(1 for tok in bare_tokens if tok in SUPERLATIVES)

    # Guard against division by zero for whitespace-only text.
    word_total = max(len(tokens), 1)
    raw = (
        min(exclamations * 8, 25)
        + min(shouted / word_total * 200, 25)
        + min(clickbait_hits * 12, 25)
        + min(emotional_hits * 6, 15)
        + min(superlative_hits * 5, 10)
    )
    score = int(min(100, max(0, raw)))

    if score < 30:
        level = "Low"
    elif score < 60:
        level = "Medium"
    else:
        level = "High"

    logger.info(
        f"Sensationalism -> {score} ({level}) excl={exclamations} caps={shouted} "
        f"cb={clickbait_hits} emo={emotional_hits}"
    )
    return SensationalismResult(
        score, level, exclamations, shouted, clickbait_hits, emotional_hits, superlative_hits
    )


def detect_manipulation_indicators(text: str) -> List[ManipulationIndicator]:
    """Find every MANIPULATION_PATTERNS match in *text*.

    Each regex is applied case-insensitively and every match becomes one
    ManipulationIndicator. The result is ordered by start offset; matches that
    start at the same offset keep the order of the pattern table. Empty input
    returns an empty list.
    """
    if not text:
        return []

    hits: List[ManipulationIndicator] = [
        ManipulationIndicator(
            pattern_type=kind,
            matched_text=match.group(),
            start_pos=match.start(),
            end_pos=match.end(),
            severity=level,
            description=note,
        )
        for regex, kind, level, note in MANIPULATION_PATTERNS
        for match in re.finditer(regex, text, re.IGNORECASE)
    ]
    # list.sort is stable, so ties keep pattern-table order.
    hits.sort(key=lambda hit: hit.start_pos)

    logger.info(f"Manipulation indicators -> {len(hits)} found")
    return hits


def extract_entities(text: str, max_k: int = 6) -> List[str]:
    """Return up to *max_k* entity or keyword strings describing *text*.

    Named entities come from the spaCy pipeline supplied by the model loader,
    run on the first 5000 characters. They are de-duplicated
    case-insensitively and ranked in three groups: preferred labels, short
    digit-only numeric entities, then everything else. If that yields fewer
    than *max_k* items, frequency keywords not already seen are appended.

    Frequency extraction alone is used when the text is empty or shorter than
    20 characters once stripped, when no spaCy pipeline is available, or when
    NER raises.
    """
    if not text or len(text.strip()) < 20:
        return _extract_keywords_freq(text, max_k)

    nlp = get_model_loader().load_spacy_nlp()
    if nlp is None:
        return _extract_keywords_freq(text, max_k)

    try:
        seen: set[str] = set()
        ranked = {"preferred": [], "numeric": [], "other": []}

        for ent in nlp(text[:5000]).ents:
            surface = ent.text.strip()
            key = surface.lower()
            if len(surface) < 2 or key in seen:
                continue
            seen.add(key)

            label = ent.label_
            if label in _NER_PREFERRED:
                group = "preferred"
            elif label in _NER_NUMERIC and surface.replace(",", "").isdigit() and len(surface) <= 6:
                # Short plain numbers (e.g. "38", "55") are treated as key facts.
                group = "numeric"
            else:
                group = "other"
            ranked[group].append(surface)

        entities = ranked["preferred"] + ranked["numeric"] + ranked["other"]

        if len(entities) < max_k:
            # Top up with frequency keywords that NER did not already produce.
            for keyword in _extract_keywords_freq(text, max_k * 2):
                keyword_key = keyword.lower()
                if keyword_key in seen:
                    continue
                entities.append(keyword)
                seen.add(keyword_key)

        top = entities[:max_k]
        logger.info(f"NER extracted {len(top)} entities: {top}")
        return top
    except Exception as exc:
        logger.warning(f"spaCy NER failed: {exc} - falling back to frequency extraction")
        return _extract_keywords_freq(text, max_k)


# Function words ignored by _extract_keywords_freq. Built once at import time
# instead of on every call.
_KEYWORD_STOPWORDS = frozenset({
    "the", "a", "an", "is", "are", "was", "were", "be", "been", "being", "to", "of", "and", "or", "but",
    "in", "on", "at", "for", "with", "by", "from", "as", "that", "this", "it", "its", "has", "have", "had",
    "will", "would", "can", "could", "should", "may", "might", "do", "does", "did", "not", "no", "so",
    "than", "then", "there", "their", "they", "them", "we", "our", "you", "your", "he", "she", "his", "her",
    "during", "several", "also", "about", "which", "who", "whom", "what", "where", "when", "why", "how",
    "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "only", "own", "same", "very",
    "these", "those", "into", "through", "after", "before", "over", "under", "between", "out", "against",
})


def _extract_keywords_freq(text: str, max_k: int = 6) -> List[str]:
    """Return up to *max_k* lower-cased keywords ranked by frequency.

    A token is either an ASCII word of at least three characters (inner
    hyphens and apostrophes allowed) or a standalone number of 1-5 digits.
    Stop words are skipped. Ties in frequency are broken alphabetically.

    Args:
        text: Text to scan; ``None`` or ``""`` yields an empty list.
        max_k: Maximum number of keywords; values <= 0 yield an empty list.

    Returns:
        Keywords ordered from most to least frequent.
    """
    if max_k <= 0:
        # A negative slice bound would otherwise drop items from the end
        # instead of returning nothing.
        return []

    freq: dict[str, int] = {}
    for raw in re.findall(r"[A-Za-z][A-Za-z\-']{2,}|\b\d{1,5}\b", text or ""):
        # The regex lets a closing quote or dangling hyphen ride along
        # ("vaccine'" / "vaccine-"); trim it so all spellings count as one word.
        token = raw.lower().rstrip("-'")
        if len(token) < 3 and not token.isdigit():
            # Trimming left a word fragment below the three-character minimum.
            continue
        if token in _KEYWORD_STOPWORDS:
            continue
        freq[token] = freq.get(token, 0) + 1

    ranked = sorted(freq.items(), key=lambda kv: (-kv[1], kv[0]))
    return [word for word, _ in ranked[:max_k]]


# Alias: `extract_keywords` is bound to the same function object as
# `extract_entities`. NOTE(review): presumably kept so callers using the older
# name keep working - confirm before removing.
extract_keywords = extract_entities