# NLP_PROJECT / app.py
# Omnia-cy's picture
# Create app.py
# 55a7827 verified
import gradio as gr
import torch
import torch.nn.functional as F
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# =====================================================
# TOP-K HELPER
# =====================================================
def get_top_k(similarity_scores, texts, k=5):
    """Return the ``k`` highest-scoring texts, best match first.

    Args:
        similarity_scores: One-dimensional NumPy array (or array-like) where
            ``similarity_scores[i]`` is the score belonging to ``texts[i]``.
        texts: Sequence indexable by integer position.
        k: Maximum number of results to return. If ``k`` exceeds the number
            of scores, every entry is returned. A non-positive ``k`` yields
            an empty list.

    Returns:
        A list of ``(text, score)`` tuples in descending score order, with
        each score converted to a built-in ``float``.
    """
    if k <= 0:
        # ``scores[-0:]`` is the *whole* array, so without this guard k=0
        # would return every document instead of none.
        return []

    scores = np.asarray(similarity_scores)
    # argsort is ascending: keep the last k positions, then reverse them so
    # the best match comes first.
    best_first = scores.argsort()[-k:][::-1]
    return [(texts[i], float(scores[i])) for i in best_first]
def format_results(results, max_chars=200):
    """Render ranked retrieval results as a numbered, blank-line-separated list.

    Args:
        results: Iterable of ``(text, score)`` pairs, already in display
            order. The score is not shown.
        max_chars: Maximum number of characters of each text to display.
            Longer texts are cut at this length and suffixed with ``...``.

    Returns:
        A single string with one ``"<rank>. <text>"`` entry per result,
        entries separated by an empty line. Empty input gives ``""``.
    """
    # Only mark an entry with an ellipsis when it was actually shortened;
    # otherwise short texts would look truncated when they are complete.
    return "\n\n".join(
        f"{rank}. {text[:max_chars]}{'...' if len(text) > max_chars else ''}"
        for rank, (text, _score) in enumerate(results, start=1)
    )
# =====================================================
# MAIN PIPELINE
# =====================================================
def pipeline(text):
    """Classify ``text`` with three models and retrieve similar documents.

    The query is scored by a TF-IDF classifier, a classifier over BERT
    encoder embeddings, and a DistilBERT sequence classifier. The same query
    is then compared by cosine similarity against three precomputed document
    representations, and the top matches for each are listed.

    NOTE(review): this function relies on module-level objects that are not
    defined in the visible part of the file (``preprocess``, ``vectorizer``,
    ``baseline_clf``, ``bert_encoder``, ``advanced_clf``, ``tokenizer``,
    ``transformer_model``, ``tfidf_matrix``, ``doc_embeddings``,
    ``distilbert_doc_embeddings``, ``documents``,
    ``get_distilbert_embedding``) - confirm they are loaded before the UI
    starts.

    Args:
        text: Raw query / review string entered by the user.

    Returns:
        A ``(classification_output, retrieval_output)`` tuple of strings:
        the three predictions (with the DistilBERT confidence as a
        percentage), and the three top-k result lists.
    """
    processed = preprocess(text)
    labels = {0: "Negative", 1: "Positive"}

    # --- TF-IDF classification -------------------------------------------
    # The TF-IDF vector is reused for retrieval below, so it is computed
    # only once.
    tfidf_vec = vectorizer.transform([processed])
    tfidf_pred = baseline_clf.predict(tfidf_vec)[0]

    # --- BERT classification ---------------------------------------------
    # Likewise, the encoder embedding serves both the classifier and the
    # retrieval step.
    bert_emb = bert_encoder.encode([text])
    bert_pred = advanced_clf.predict(bert_emb)[0]

    # --- DistilBERT classification ---------------------------------------
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    # Inference only: no gradients are needed.
    with torch.no_grad():
        logits = transformer_model(**inputs).logits
    pred = torch.argmax(logits, dim=1).item()
    # Confidence is the largest softmax probability for the single query.
    confidence = F.softmax(logits, dim=1).max().item()
    distilbert_label = labels[pred]

    # --- Top-k retrieval for each representation -------------------------
    tfidf_sim = cosine_similarity(tfidf_vec, tfidf_matrix).flatten()
    tfidf_topk = get_top_k(tfidf_sim, documents)

    bert_sim = cosine_similarity(bert_emb, doc_embeddings).flatten()
    bert_topk = get_top_k(bert_sim, documents)

    distil_emb = get_distilbert_embedding(text)
    distil_sim = cosine_similarity(distil_emb, distilbert_doc_embeddings).flatten()
    distil_topk = get_top_k(distil_sim, documents)

    # --- Output ----------------------------------------------------------
    classification_output = (
        "\n"
        f"TF-IDF Prediction: {labels[tfidf_pred]}\n"
        f"BERT Prediction: {labels[bert_pred]}\n"
        f"DistilBERT Prediction: {distilbert_label} ({confidence*100:.2f}%)\n"
    )

    separator = "----------------------------"
    retrieval_output = (
        "\n"
        "🔹 TF-IDF TOP-5\n"
        f"{format_results(tfidf_topk)}\n"
        f"{separator}\n"
        "🔹 BERT TOP-5\n"
        f"{format_results(bert_topk)}\n"
        f"{separator}\n"
        "🔹 DistilBERT TOP-5\n"
        f"{format_results(distil_topk)}\n"
    )
    return classification_output, retrieval_output
# =====================================================
# GRADIO UI
# =====================================================
# Single free-text input for the review / query to analyse.
_query_box = gr.Textbox(
    label="Enter Review / Query",
    lines=3,
    placeholder="late delivery problem...",
)

# Two text outputs, matching the two strings returned by `pipeline`.
_result_boxes = [
    gr.Textbox(label="🔹 Sentiment Classification"),
    gr.Textbox(label="🔹 Top-5 Retrieval Results"),
]

_description = (
    "\n"
    "TF-IDF + BERT + DistilBERT comparison system.\n"
    "Shows both sentiment classification and semantic retrieval.\n"
)

# Each example is a one-element row because the interface has one input.
_example_rows = [
    [query]
    for query in (
        "late delivery problem",
        "refund not given",
        "bad customer service",
        "product arrived damaged",
    )
]

demo = gr.Interface(
    fn=pipeline,
    inputs=_query_box,
    outputs=_result_boxes,
    title="NLP Project: Classification + Semantic Search",
    description=_description,
    examples=_example_rows,
)

# Start the web UI as soon as the module is executed.
demo.launch()