"""Gradio app: word frequency, sentiment, and named-entity analysis.

Two modes:
  * Fast     -> spaCy small model only (quick, less accurate NER)
  * Accurate -> spaCy transformer + DeBERTa token-classification NER
"""

import re
from collections import Counter

import gradio as gr
import spacy
from textblob import TextBlob
from transformers import pipeline

# ---------------------------------------------------------
# LOAD MODELS
# ---------------------------------------------------------

# Accurate mode model (spaCy transformer)
nlp_trf = spacy.load("en_core_web_trf")

# Fast mode model (spaCy small)
nlp_sm = spacy.load("en_core_web_sm")

# DeBERTa NER (used only in Accurate mode)
deberta_ner = pipeline(
    "token-classification",
    model="geckos/deberta-base-fine-tuned-ner",
    aggregation_strategy="simple",
)

stopwords = nlp_trf.Defaults.stop_words

# ---------------------------------------------------------
# HELPERS
# ---------------------------------------------------------


def clean_text(text):
    """Normalize raw user input (currently just trims surrounding whitespace)."""
    return text.strip()


def get_word_freq(text):
    """Return the 10 most common non-stopword tokens as 'word: count' lines.

    Tokenization is a simple \\b\\w+\\b regex over the lower-cased text;
    stopwords come from the spaCy transformer model's defaults.
    """
    words = re.findall(r"\b\w+\b", text.lower())
    words = [w for w in words if w not in stopwords]
    counts = Counter(words).most_common(10)
    if not counts:
        return "No words found."
    return "\n".join(f"{w}: {c}" for w, c in counts)


def get_sentiment(text):
    """Return TextBlob polarity/subjectivity formatted for display."""
    sentiment = TextBlob(text).sentiment
    return (
        f"Polarity: {sentiment.polarity:.3f}\n"
        f"Subjectivity: {sentiment.subjectivity:.3f}"
    )


def run_spacy_entities(doc):
    """Extract entities from a spaCy Doc as a list of {'text', 'label'} dicts."""
    return [{"text": ent.text, "label": ent.label_} for ent in doc.ents]


def run_deberta_batched(text):
    """Split text into sentences and batch them for faster NER.

    FIX: the original called the pipeline once per sentence inside a Python
    loop, so despite the docstring nothing was actually batched. Passing the
    whole sentence list in a single call lets the HF pipeline batch the
    forward passes internally.
    """
    sentences = [s.strip() for s in re.split(r"(?<=[.!?])\s+", text) if s.strip()]
    if not sentences:
        return []
    # One pipeline call with list input -> one result list per sentence.
    outputs = deberta_ner(sentences)
    results = []
    for sent_out in outputs:
        for r in sent_out:
            results.append({"text": r["word"], "label": r["entity_group"]})
    return results


def categorize_entities(spacy_ents, deberta_ents):
    """Merge spaCy and DeBERTa entities into a display string.

    spaCy labels PERSON/ORG/GPE/LOC and DeBERTa labels PER/ORG/LOC are mapped
    into People / Organizations / Countries-Locations buckets; anything else
    lands in Misc. Duplicates are removed via sets, and each section is
    sorted case-insensitively.
    """
    people = set()
    orgs = set()
    locations = set()
    misc = set()

    def norm(t):
        return t.strip()

    # spaCy mapping
    for ent in spacy_ents:
        text = norm(ent["text"])
        label = ent["label"]
        if label == "PERSON":
            people.add(text)
        elif label == "ORG":
            orgs.add(text)
        elif label in ("GPE", "LOC"):
            locations.add(text)
        else:
            misc.add(text)

    # DeBERTa mapping (CoNLL-style tags)
    for ent in deberta_ents:
        text = norm(ent["text"])
        label = ent["label"]
        if label == "PER":
            people.add(text)
        elif label == "ORG":
            orgs.add(text)
        elif label == "LOC":
            locations.add(text)
        else:
            misc.add(text)

    def fmt(title, items):
        if not items:
            return f"{title}:\n (none)"
        items = sorted(items, key=lambda x: x.lower())
        return f"{title}:\n - " + "\n - ".join(items)

    return "\n\n".join(
        [
            fmt("People", people),
            fmt("Organizations", orgs),
            fmt("Countries/Locations", locations),
            fmt("Misc", misc),
        ]
    )


# ---------------------------------------------------------
# MAIN ANALYSIS
# ---------------------------------------------------------


def analyze_text(text, mode):
    """Run frequency, sentiment, and NER analysis; returns a 3-tuple of strings.

    mode == "Fast" uses the small spaCy model only; any other mode runs the
    spaCy transformer plus batched DeBERTa NER.
    """
    text = clean_text(text)
    if not text:
        return ("No words found.", "No sentiment detected.", "No entities detected.")

    # Word frequency
    word_freq_str = get_word_freq(text)

    # Sentiment
    sentiment_str = get_sentiment(text)

    # Fast mode -> spaCy small only
    if mode == "Fast":
        doc = nlp_sm(text)
        spacy_ents = run_spacy_entities(doc)
        entities_str = categorize_entities(spacy_ents, [])
        return (word_freq_str, sentiment_str, entities_str)

    # Accurate mode -> spaCy transformer + DeBERTa (batched)
    doc = nlp_trf(text)
    spacy_ents = run_spacy_entities(doc)
    deberta_ents = run_deberta_batched(text)
    entities_str = categorize_entities(spacy_ents, deberta_ents)
    return (word_freq_str, sentiment_str, entities_str)


# ---------------------------------------------------------
# UI
# ---------------------------------------------------------

with gr.Blocks(title="🗳️ Text & Speech Analyzer") as demo:
    gr.Markdown("## 🗳️ Text & Speech Analyzer (Fast + Accurate Modes)")
    gr.Markdown(
        "Analyze political speeches, news, or press releases.\n\n"
        "**Fast Mode** → spaCy small (1–2 seconds)\n\n"
        "**Accurate Mode** → spaCy transformer + DeBERTa (8–12 seconds)"
    )

    mode = gr.Radio(["Fast", "Accurate"], value="Accurate", label="Choose Mode")

    input_box = gr.Textbox(
        lines=12,
        label="Paste text here",
        placeholder="Enter a speech, article, or paragraph...",
    )

    with gr.Tabs():
        with gr.Tab("Word Frequency"):
            out_words = gr.Textbox(lines=10, label="Most Common Words")
        with gr.Tab("Sentiment"):
            out_sent = gr.Textbox(lines=3, label="Sentiment")
        with gr.Tab("Entities"):
            out_ents = gr.Textbox(lines=10, label="Entities (People / Orgs / Locations)")

    analyze_btn = gr.Button("Analyze")

    analyze_btn.click(
        analyze_text,
        inputs=[input_box, mode],
        outputs=[out_words, out_sent, out_ents],
    )

# Guard the launch so importing this module (e.g. for tests) does not
# start a server; `python app.py` behaves exactly as before.
if __name__ == "__main__":
    demo.launch()