import gradio as gr
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity


# =====================================================
# TOP-K HELPER
# =====================================================
def get_top_k(similarity_scores, texts, k=5):
    """Return the ``k`` highest-scoring texts, best match first.

    Args:
        similarity_scores: 1-D sequence of scores, where
            ``similarity_scores[i]`` is the score for ``texts[i]``.
        texts: Sequence indexable by integer position.
        k: Maximum number of results to return. Zero or a negative value
            yields an empty list; a value larger than the number of scores
            returns every entry.

    Returns:
        A list of ``(text, score)`` tuples ordered by descending score, with
        each score converted to a built-in ``float``.
    """
    # A bare ``[-k:]`` slice with k == 0 is ``[0:]`` and would return every
    # entry instead of none, so non-positive k is handled up front.
    if k <= 0:
        return []

    # Accept plain lists as well as NumPy arrays.
    scores = np.asarray(similarity_scores)

    # argsort is ascending: keep the last k indices, then reverse them so the
    # best match comes first.
    idx = scores.argsort()[-k:][::-1]
    return [(texts[i], float(scores[i])) for i in idx]


def format_results(results, max_chars=200):
    """Render ranked results as a numbered, blank-line-separated string.

    Args:
        results: Iterable of ``(text, score)`` pairs, already in rank order.
            The score is not shown in the output.
        max_chars: Maximum number of characters of each text to display.

    Returns:
        One ``"<rank>. <text>"`` entry per result, joined by blank lines.
        Texts longer than ``max_chars`` are cut and suffixed with ``"..."``;
        shorter texts are shown as-is, so the ellipsis only appears when
        something was actually truncated. An empty ``results`` gives ``""``.
    """
    return "\n\n".join(
        f"{rank}. {txt[:max_chars]}{'...' if len(txt) > max_chars else ''}"
        for rank, (txt, _score) in enumerate(results, start=1)
    )


# =====================================================
# MAIN PIPELINE
# =====================================================
def pipeline(text):
    """Classify a query with three models and retrieve similar documents.

    Relies on objects that are not defined in this block and must exist at
    module level when the function is called: ``preprocess``, ``vectorizer``,
    ``baseline_clf``, ``bert_encoder``, ``advanced_clf``, ``tokenizer``,
    ``transformer_model``, ``tfidf_matrix``, ``doc_embeddings``,
    ``distilbert_doc_embeddings``, ``documents``, ``get_distilbert_embedding``,
    ``get_top_k`` and ``format_results``.

    Args:
        text: Raw review / query string entered by the user.

    Returns:
        A ``(classification_output, retrieval_output)`` tuple of strings: the
        three sentiment predictions, and the top-5 retrieved documents for
        each of the three representations. Blank or ``None`` input returns a
        short prompt message and an empty retrieval string instead.
    """
    # Blank input carries no signal: skip the models rather than showing
    # predictions and "matches" for an empty query.
    if text is None or not text.strip():
        return "Please enter some text to analyze.", ""

    processed = preprocess(text)

    # NOTE(review): assumes every classifier emits 0/1 class ids — confirm
    # against how the models were trained.
    labels = {0: "Negative", 1: "Positive"}

    # =========================
    # QUERY REPRESENTATIONS
    # =========================
    # Each representation is computed once and shared by the classification
    # and retrieval steps below (assumes transform/encode return the same
    # result for the same input).
    q_vec = vectorizer.transform([processed])
    q_emb = bert_encoder.encode([text])

    # =========================
    # TF-IDF CLASSIFICATION
    # =========================
    tfidf_pred = baseline_clf.predict(q_vec)[0]

    # =========================
    # BERT CLASSIFICATION
    # =========================
    bert_pred = advanced_clf.predict(q_emb)[0]

    # =========================
    # DISTILBERT CLASSIFICATION
    # =========================
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = transformer_model(**inputs)
        logits = outputs.logits
        pred = torch.argmax(logits, dim=1).item()
        probs = F.softmax(logits, dim=1)
        # Highest softmax probability, i.e. that of the predicted class for
        # this single-example batch.
        confidence = probs.max().item()

    distilbert_label = labels[pred]

    # =========================
    # TF-IDF TOP-K
    # =========================
    tfidf_sim = cosine_similarity(q_vec, tfidf_matrix).flatten()
    tfidf_topk = get_top_k(tfidf_sim, documents)

    # =========================
    # BERT TOP-K
    # =========================
    bert_sim = cosine_similarity(q_emb, doc_embeddings).flatten()
    bert_topk = get_top_k(bert_sim, documents)

    # =========================
    # DISTILBERT TOP-K
    # =========================
    inputs_emb = get_distilbert_embedding(text)
    distil_sim = cosine_similarity(inputs_emb, distilbert_doc_embeddings).flatten()
    distil_topk = get_top_k(distil_sim, documents)

    # =========================
    # OUTPUT
    # =========================
    classification_output = f"""
TF-IDF Prediction: {labels[tfidf_pred]}
BERT Prediction: {labels[bert_pred]}
DistilBERT Prediction: {distilbert_label} ({confidence*100:.2f}%)
"""

    retrieval_output = f"""
🔹 TF-IDF TOP-5
{format_results(tfidf_topk)}

----------------------------

🔹 BERT TOP-5
{format_results(bert_topk)}

----------------------------

🔹 DistilBERT TOP-5
{format_results(distil_topk)}
"""

    return classification_output, retrieval_output


# =====================================================
# GRADIO UI
# =====================================================
# Web UI: one text input feeding ``pipeline``, whose two returned strings
# are shown in the two output text boxes, in order.
demo = gr.Interface(

    fn=pipeline,

    inputs=gr.Textbox(
        label="Enter Review / Query",
        lines=3,
        placeholder="late delivery problem..."
    ),

    outputs=[
        gr.Textbox(label="🔹 Sentiment Classification"),
        gr.Textbox(label="🔹 Top-5 Retrieval Results")
    ],

    title="NLP Project: Classification + Semantic Search",

    description="""
TF-IDF + BERT + DistilBERT comparison system.
Shows both sentiment classification and semantic retrieval.
""",

    # Clickable sample queries shown under the input box.
    examples=[
        ["late delivery problem"],
        ["refund not given"],
        ["bad customer service"],
        ["product arrived damaged"]
    ]
)

# Start the server only when executed as a script (or notebook cell), so
# importing this module elsewhere does not launch a web server as a side
# effect.
if __name__ == "__main__":
    demo.launch()