from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from transformers import (
    pipeline,
    AutoTokenizer,
)
from sentence_transformers import SentenceTransformer
import torch
import re
import hashlib
import logging
import spacy
from langdetect import detect, LangDetectException

# ── Logging ────────────────────────────────────────────────────────────────
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ── App ────────────────────────────────────────────────────────────────────
app = FastAPI(
    title="CivicPulse NLP API",
    description="NLP microservice for Civic Pulse Engine — Municipality of Pulilan, Bulacan",
    version="1.7.0",
)

# ═══════════════════════════════════════════════════════════════════════════
# MODEL LOADING
# All models loaded once at startup — never inside endpoint functions.
# ═══════════════════════════════════════════════════════════════════════════

# HF pipelines: 0 selects the first CUDA device, -1 means CPU.
DEVICE = 0 if torch.cuda.is_available() else -1

# ── 1. Sentiment Model ─────────────────────────────────────────────────────
# CHANGED in v1.7.0: Switched from tabularisai/multilingual-sentiment-analysis
# to rayubaldo44/civicpulse-sentiment-v2 (Stage 1 fine-tuned model)
# Base: dost-asti/RoBERTa-tl-sentiment-analysis, fine-tuned on:
#   - scaredmeow/shopee-reviews-tl-stars (15K Tagalog reviews, star→3-class mapped)
#   - legacy-datasets/hate_speech_filipino (10K election tweets, negative detection)
# Result: Direct 3-class output (negative/neutral/positive) — no aggregation needed.
# Fixes: "Salamat sa bagong street lights" now correctly classified as positive
# (was neutral at 0.99 with tabularisai). 6/6 civic smoke test passed.
logger.info("Loading sentiment model: rayubaldo44/civicpulse-sentiment-v2...")
sentiment_pipeline = pipeline(
    task="text-classification",
    model="rayubaldo44/civicpulse-sentiment-v2",
    # Tokenizer comes from the base model the fine-tune was derived from.
    tokenizer="dost-asti/RoBERTa-tl-sentiment-analysis",
    device=DEVICE,
    top_k=None,  # return scores for ALL classes, not just the argmax
)
logger.info("Sentiment model loaded.")

# ── 2.
# Claim Detection Tokenizer ────────────────────────────────────────────────
# Heuristic regex mode until the claim model is fine-tuned; the tokenizer is
# still loaded now so /claim-detection can report token counts.
CLAIM_DETECTION_MODE = "heuristic"  # switch to "model" after fine-tuning
logger.info("Loading claim detection tokenizer: jcblaise/roberta-tagalog-large...")
claim_tokenizer = AutoTokenizer.from_pretrained("jcblaise/roberta-tagalog-large")
logger.info("Claim tokenizer loaded. (Heuristic mode until fine-tuned.)")

# ── 3. Topic Classification Model ──────────────────────────────────────────
# CHANGED in v1.5.0: Switched from cross-encoder/nli-MiniLM2-L6-H768
# (English-only, 82M params) to MoritzLaurer/bge-m3-zeroshot-v2.0
# (multilingual 100+ langs, 568M params, newest v2.0 architecture)
# Reason: Previous model could not classify Taglish text — nearly all comments
# fell into "Other". bge-m3-zeroshot-v2.0 is the most accurate multilingual
# zero-shot model available (2024).
# NOTE v1.7.0: Kept as-is. Stage 1 fine-tuned topic model collapsed to single
# class due to insufficient civic topic diversity in public datasets.
# Zero-shot remains superior until Stage 2 fine-tuning with annotated civic data.
logger.info("Loading topic model: MoritzLaurer/bge-m3-zeroshot-v2.0...")
topic_pipeline_model = pipeline(
    task="zero-shot-classification",
    model="MoritzLaurer/bge-m3-zeroshot-v2.0",
    device=DEVICE,
)
logger.info("Topic classification model loaded.")

# ── 4. spaCy NER (for /preprocess PII masking) ─────────────────────────────
# NOTE(review): en_core_web_sm is an English model — PERSON recall on
# Tagalog/Taglish names may be limited; confirm against production samples.
logger.info("Loading spaCy NER model: en_core_web_sm...")
nlp_spacy = spacy.load("en_core_web_sm")
logger.info("spaCy NER model loaded.")

# ── 5.
# Embedding Model (for RAG pipeline) ───────────────────────────────────────
logger.info("Loading embedding model: meedan/paraphrase-filipino-mpnet-base-v2...")
embedding_model = SentenceTransformer("meedan/paraphrase-filipino-mpnet-base-v2")
logger.info("Embedding model loaded.")

# ═══════════════════════════════════════════════════════════════════════════
# CONSTANTS
# ═══════════════════════════════════════════════════════════════════════════

# Sentiment predictions below this confidence go to the human review queue.
CONFIDENCE_THRESHOLD = 0.65
# Topic predictions below this confidence are flagged needs_human_review.
TOPIC_REVIEW_THRESHOLD = 0.50

# CHANGED in v1.5.0: Descriptive labels for better zero-shot matching on
# Taglish civic text.
CIVIC_TOPICS_DESCRIPTIVE = [
    "government infrastructure projects, roads, bridges, flood control, buildings, and construction",
    "healthcare, hospitals, medical services, clinics, and public health programs",
    "waste management, garbage collection, recycling, and environmental sanitation",
    "public safety, police, crime, drugs, peace and order, and law enforcement",
    "other government services, civic matters, elections, and general topics",
]

# Map descriptive labels back to clean database-friendly labels
TOPIC_LABEL_MAP = {
    "government infrastructure projects, roads, bridges, flood control, buildings, and construction": "Infrastructure and Public Works",
    "healthcare, hospitals, medical services, clinics, and public health programs": "Healthcare and Medical Services",
    "waste management, garbage collection, recycling, and environmental sanitation": "Waste Management and Sanitation",
    "public safety, police, crime, drugs, peace and order, and law enforcement": "Public Safety and Peace and Order",
    "other government services, civic matters, elections, and general topics": "Other",
}

# Reverse map for custom_labels fallback (keeps backward compatibility)
CIVIC_TOPICS = list(TOPIC_LABEL_MAP.values())

# Claim detection heuristic patterns — Filipino/Taglish signals of a verifiable claim.
# Intentionally broad — RAG + Claude Haiku filters false positives later.
CLAIM_PATTERNS = [
    # Reported-speech / hearsay markers (Filipino)
    r"\bsinabi\b", r"\bayon\b", r"\bdaw\b", r"\braw\b", r"\bdiba\b",
    r"\btotoo\b", r"\bkatotohanan\b", r"\bbalita\b", r"\bnews\b",
    r"\bconfirmed\b", r"\bofficial\b", r"\bpahayag\b", r"\bannounced\b",
    r"\bnagsabi\b", r"\bsinabi ng\b", r"\bayon sa\b", r"\bpumirma\b",
    # Quantified statements (amounts, distances, durations)
    r"\b\d+\s*(piso|million|billion|porsyento|%|metro|km|kilometro)\b",
    r"\b\d+\s*(beses|taon|buwan|araw|oras)\b",
    # Official actor + action verb prefix (e.g. "mayor nagsabi", "barangay aprubado")
    r"\b(mayor|gobernador|konseho|lgu|barangay|kapitan)\b.*\b(nag|mag|ipa|sini|apro)\w+",
    # English attribution / reporting markers
    r"\baccording to\b",
    r"\breport(ed|s)?\b",
    r"\bstatement\b",
    r"\bproject\b.*\b(million|billion|piso)\b",
    r"\b(budget|funds|pondo)\b.*\b\d+\b",
]
# Compile once at import time — reused on every /claim-detection request.
COMPILED_PATTERNS = [re.compile(p, re.IGNORECASE) for p in CLAIM_PATTERNS]

# Taglish SMS abbreviation map — high-frequency only.
# Empty-string values delete filler tokens (omg/lol/haha/hehe) outright.
TAGLISH_MAP = {
    r"\bkc\b": "kasi", r"\bksi\b": "kasi", r"\bdk\b": "di ko",
    r"\bnman\b": "naman", r"\bsna\b": "sana", r"\bkau\b": "kayo",
    r"\bkaw\b": "ikaw", r"\bnyo\b": "ninyo", r"\bbaket\b": "bakit",
    r"\bpede\b": "pwede", r"\bpls\b": "please", r"\bplz\b": "please",
    r"\bomg\b": "", r"\blol\b": "", r"\bhaha+\b": "", r"\bhehe+\b": "",
}


# ═══════════════════════════════════════════════════════════════════════════
# PREPROCESS HELPERS
# ═══════════════════════════════════════════════════════════════════════════

def hash_user_id(user_id: str) -> str:
    """Return the SHA-256 hex digest of user_id (one-way PII anonymization)."""
    return hashlib.sha256(user_id.encode()).hexdigest()


def normalize_platform(text: str) -> str:
    """Remove URLs, anonymize @mentions, strip # symbol but keep the word."""
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "[USER]", text)
    text = re.sub(r"#(\w+)", r"\1", text)  # "#baha" → "baha"
    # Collapse runs of whitespace left behind by the removals.
    return re.sub(r"\s+", " ", text).strip()


def normalize_taglish(text: str) -> str:
    """Expand high-frequency Taglish SMS abbreviations."""
    for pattern, replacement in TAGLISH_MAP.items():
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return re.sub(r"\s+", " ", text).strip()


def mask_named_entities(text: str) -> str:
    """Mask PERSON entities only — keep ORG/GPE so civic context survives."""
    doc = nlp_spacy(text)
    masked = text
    for ent in reversed(doc.ents):  # reversed so char offsets stay valid
        if ent.label_ == "PERSON":
            masked = masked[:ent.start_char] + "[NAME]" + masked[ent.end_char:]
    return masked


def detect_language(text: str) -> str:
    """Return 'tl', 'en', or 'tl-en' (Taglish). Falls back to 'unknown'."""
    try:
        lang = detect(text)
        if lang == "tl":
            return "tl"
        if lang == "en":
            # langdetect labels Taglish as "en"; if the text still carries
            # >= 2 common Tagalog function words, call it code-switched.
            tagalog_markers = {"ang", "ng", "mga", "na", "sa", "si", "ko", "ka", "po", "ba", "ay"}
            words = set(text.lower().split())
            if len(words & tagalog_markers) >= 2:
                return "tl-en"
            return "en"
        return lang
    except LangDetectException:
        return "unknown"


def is_spam(text: str) -> bool:
    """True if text is too short or contains no real words."""
    if len(text.strip()) < 5:
        return True
    # Accept Latin letters incl. accented ranges; digits/emoji alone = spam.
    return len(re.findall(r"[a-zA-ZÀ-ÿ\u0100-\u024F]+", text)) == 0


# ═══════════════════════════════════════════════════════════════════════════
# SCHEMAS
# ═══════════════════════════════════════════════════════════════════════════

class PreprocessRequest(BaseModel):
    text: str
    user_id: str
    comment_id: str | None = None


class PreprocessResponse(BaseModel):
    comment_id: str | None
    hashed_user_id: str
    cleaned_text: str
    language: str
    is_spam: bool


class EmbedRequest(BaseModel):
    text: str


class EmbedResponse(BaseModel):
    embedding: list[float]
    dimensions: int


class SentimentRequest(BaseModel):
    text: str
    comment_id: str | None = None


class SentimentScore(BaseModel):
    label: str
    score: float


class SentimentResponse(BaseModel):
    comment_id: str | None
    sentiment: str
    confidence: float
    all_scores: list[SentimentScore]
    needs_human_review: bool
    model: str


class ClaimRequest(BaseModel):
    text: str
    comment_id: str | None = None


class ClaimResponse(BaseModel):
    comment_id: str | None
    has_claim: bool
    confidence: float
    detection_mode: str
    matched_patterns: list[str]
    token_count: int
    model: str


class TopicRequest(BaseModel):
    text: str
    comment_id: str | None = None
    custom_labels: list[str] | None = None


class TopicScore(BaseModel):
    label: str
    score: float


class TopicResponse(BaseModel):
    comment_id: str | None
    topic: str
    confidence: float
    all_scores: list[TopicScore]
    needs_human_review: bool
    model: str


# ═══════════════════════════════════════════════════════════════════════════
# ENDPOINTS
# ═══════════════════════════════════════════════════════════════════════════

@app.get("/")
def root():
    """Service metadata and endpoint directory."""
    return {
        "service": "CivicPulse NLP API",
        "version": "1.7.0",
        "status": "running",
        "endpoints": ["/preprocess", "/embed", "/sentiment", "/claim-detection", "/topic-classification", "/health"],
    }


@app.get("/health")
def health():
    """Keep-alive endpoint. GitHub Actions pings this every 25 min."""
    return {
        "status": "ok",
        "models_loaded": ["sentiment", "claim-tokenizer", "topic-classification", "spacy-ner", "embedding"],
        "claim_detection_mode": CLAIM_DETECTION_MODE,
    }


# ── Pre-Processing ─────────────────────────────────────────────────────────
@app.post("/preprocess", response_model=PreprocessResponse)
def preprocess(request: PreprocessRequest):
    """
    Cleans and anonymizes a raw comment before NLP processing.
    Call this first — pass cleaned_text to the three NLP endpoints.

    Steps (in order):
      1. SHA-256 hash user_id (PII strip)
      2. Remove URLs, anonymize @mentions, strip # symbol
      3. Expand Taglish SMS abbreviations
      4. spaCy NER masks PERSON entities → [NAME]
      5. Detect language: tl / en / tl-en
      6. Spam check (too short or no real words)
    """
    text = request.text.strip()
    if not text:
        raise HTTPException(status_code=422, detail="text field cannot be empty.")
    hashed_uid = hash_user_id(request.user_id)
    text = normalize_platform(text)
    text = normalize_taglish(text)
    text = mask_named_entities(text)
    language = detect_language(text)
    spam = is_spam(text)
    return PreprocessResponse(
        comment_id=request.comment_id,
        hashed_user_id=hashed_uid,
        cleaned_text=text,
        language=language,
        is_spam=spam,
    )


# ── Embed ──────────────────────────────────────────────────────────────────
@app.post("/embed", response_model=EmbedResponse)
def embed(request: EmbedRequest):
    """
    Generate a 768-dimension dense vector embedding for a text string.
    Used for: (1) embedding lgu_documents into pgvector, and
    (2) embedding flagged claims for cosine similarity search.
    Model: meedan/paraphrase-filipino-mpnet-base-v2
    """
    text = request.text.strip()
    if not text:
        raise HTTPException(status_code=422, detail="text field cannot be empty.")
    try:
        # normalize_embeddings=True → unit vectors, so dot product == cosine sim.
        vector = embedding_model.encode(text, normalize_embeddings=True).tolist()
        return EmbedResponse(embedding=vector, dimensions=len(vector))
    except Exception as e:
        logger.error(f"Embedding error: {e}")
        raise HTTPException(status_code=500, detail=f"Embedding error: {str(e)}")


# ── Sentiment ──────────────────────────────────────────────────────────────
@app.post("/sentiment", response_model=SentimentResponse)
def analyze_sentiment(request: SentimentRequest):
    """
    Classify a comment as positive, negative, or neutral.

    v1.7.0: Uses rayubaldo44/civicpulse-sentiment-v2 (Stage 1 fine-tuned).
    Base: dost-asti/RoBERTa-tl-sentiment-analysis fine-tuned on Shopee reviews
    (15K Tagalog) + hate speech Filipino (10K election tweets).
    Direct 3-class output — no aggregation needed.
    Confidence below 0.65 is routed to human review queue.
    """
    text = request.text.strip()
    if not text:
        raise HTTPException(status_code=422, detail="text field cannot be empty.")
    if len(text) > 1000:
        logger.warning(f"Long text ({len(text)} chars), tokenizer will truncate.")
    try:
        raw_results = sentiment_pipeline(text, truncation=True, max_length=512)
    except Exception as e:
        logger.error(f"Sentiment inference error: {e}")
        raise HTTPException(status_code=500, detail=f"Model inference error: {str(e)}")
    # v1.7.0: Model outputs 3 classes directly — negative, neutral, positive.
    # With top_k=None the pipeline returns a list per input; take input 0.
    scores = raw_results[0]
    # Build all_scores list
    all_scores = [
        SentimentScore(label=s["label"], score=round(s["score"], 4))
        for s in scores
    ]
    # Top prediction
    top = max(scores, key=lambda s: s["score"])
    top_label = top["label"]
    top_confidence = round(top["score"], 4)
    needs_review = top_confidence < CONFIDENCE_THRESHOLD
    return SentimentResponse(
        comment_id=request.comment_id,
        # Low-confidence predictions are labeled "review" instead of a class.
        sentiment="review" if needs_review else top_label,
        confidence=top_confidence,
        all_scores=all_scores,
        needs_human_review=needs_review,
        model="rayubaldo44/civicpulse-sentiment-v2",
    )


# ── Claim Detection ────────────────────────────────────────────────────────
@app.post("/claim-detection", response_model=ClaimResponse)
def detect_claim(request: ClaimRequest):
    """
    Detect whether a comment contains a verifiable factual claim.
    has_claim = True means the comment will be passed to the RAG pipeline.
    """
    text = request.text.strip()
    if not text:
        raise HTTPException(status_code=422, detail="text field cannot be empty.")
    # Tokenize only to report token_count; heuristic matching is regex-based.
    tokens = claim_tokenizer(
        text,
        truncation=True,
        max_length=512,
        return_tensors=None,
    )
    token_count = len(tokens["input_ids"])
    if CLAIM_DETECTION_MODE == "model":
        raise HTTPException(
            status_code=501,
            detail="Model mode not yet available. Fine-tune jcblaise/roberta-tagalog-large first."
        )
    matched = []
    for pattern in COMPILED_PATTERNS:
        match = pattern.search(text)
        if match:
            matched.append(match.group(0).lower())
    return ClaimResponse(
        comment_id=request.comment_id,
        has_claim=len(matched) > 0,
        # Heuristic mode is binary: any pattern hit → 1.0, none → 0.0.
        confidence=1.0 if matched else 0.0,
        detection_mode="heuristic",
        matched_patterns=list(set(matched)),
        token_count=token_count,
        model="jcblaise/roberta-tagalog-large (tokenizer only — pending fine-tune)",
    )


# ── Topic Classification ───────────────────────────────────────────────────
@app.post("/topic-classification", response_model=TopicResponse)
def classify_topic(request: TopicRequest):
    """
    Classify a comment into one of five civic topic areas using zero-shot NLI.
    No training data required — labels passed at runtime.
    Confidence below 0.50 is flagged for human review.

    v1.5.0: Switched to bge-m3-zeroshot-v2.0 (568M params, 100+ languages)
    for most accurate multilingual Taglish classification.
    Uses descriptive candidate labels mapped back to clean database categories.
    """
    text = request.text.strip()
    if not text:
        raise HTTPException(status_code=422, detail="text field cannot be empty.")
    # If custom labels are provided, use them directly (no mapping)
    if request.custom_labels:
        labels = request.custom_labels
        use_mapping = False
    else:
        labels = CIVIC_TOPICS_DESCRIPTIVE
        use_mapping = True
    if len(labels) < 2:
        raise HTTPException(status_code=422, detail="At least 2 candidate labels required.")
    try:
        result = topic_pipeline_model(
            text,
            candidate_labels=labels,
            truncation=True,
            max_length=512,
        )
    except Exception as e:
        logger.error(f"Topic classification error: {e}")
        raise HTTPException(status_code=500, detail=f"Model inference error: {str(e)}")
    # Map descriptive labels back to clean database labels
    if use_mapping:
        mapped_labels = [TOPIC_LABEL_MAP.get(l, l) for l in result["labels"]]
    else:
        mapped_labels = result["labels"]
    all_scores = [
        TopicScore(label=label, score=round(score, 4))
        for label, score in zip(mapped_labels, result["scores"])
    ]
    # Pipeline returns labels sorted by score descending — index 0 is the top.
    top_label = mapped_labels[0]
    top_score = round(result["scores"][0], 4)
    return TopicResponse(
        comment_id=request.comment_id,
        topic=top_label,
        confidence=top_score,
        all_scores=all_scores,
        needs_human_review=top_score < TOPIC_REVIEW_THRESHOLD,
        model="MoritzLaurer/bge-m3-zeroshot-v2.0",
    )