import gradio as gr
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics.pairwise import cosine_similarity

# NOTE(review): this section uses names that are not defined here and must
# already exist in the surrounding module/session before `pipeline` runs:
# preprocess, vectorizer, baseline_clf, bert_encoder, advanced_clf,
# tokenizer, transformer_model, tfidf_matrix, doc_embeddings,
# distilbert_doc_embeddings, documents, get_distilbert_embedding.


# =====================================================
# TOP-K HELPER
# =====================================================
def get_top_k(similarity_scores, texts, k=5):
    """Return the ``k`` highest-scoring texts, best first.

    Args:
        similarity_scores: 1-D array-like of scores, one per entry of
            ``texts``.
        texts: Collection indexable by the integer positions of
            ``similarity_scores``.
        k: Maximum number of results. Values <= 0 yield an empty list;
            values larger than the number of scores return all of them.

    Returns:
        A list of ``(text, score)`` tuples sorted by descending score, with
        each score converted to a plain ``float``.
    """
    # Guard explicitly: ``[-0:]`` would select the *whole* array, not nothing.
    if k <= 0:
        return []
    scores = np.asarray(similarity_scores)
    # argsort is ascending, so take the tail and reverse it.
    top_idx = scores.argsort()[-k:][::-1]
    return [(texts[i], float(scores[i])) for i in top_idx]


def format_results(results, max_chars=200):
    """Render ``(text, score)`` pairs as a numbered, blank-line separated list.

    Args:
        results: Iterable of ``(text, score)`` tuples; the score is not shown.
        max_chars: Texts longer than this are cut and suffixed with ``...``.

    Returns:
        One string with entries numbered from 1, separated by blank lines.
    """
    entries = []
    for rank, (txt, _score) in enumerate(results, start=1):
        snippet = txt[:max_chars]
        # Only mark truncation when something was actually cut off.
        if len(txt) > max_chars:
            snippet += "..."
        entries.append(f"{rank}. {snippet}")
    return "\n\n".join(entries)


# =====================================================
# MAIN PIPELINE
# =====================================================
def pipeline(text):
    """Classify ``text`` with three models and retrieve similar documents.

    Runs the TF-IDF, BERT-embedding and DistilBERT classifiers on the input,
    then ranks ``documents`` by cosine similarity to the input in each of the
    three representations.

    Args:
        text: Raw review / query string entered by the user.

    Returns:
        A ``(classification_output, retrieval_output)`` tuple of display
        strings. For empty or whitespace-only input, a prompt message and an
        empty string are returned without running any model.
    """
    if text is None or not text.strip():
        return "Please enter some text to analyze.", ""

    processed = preprocess(text)
    labels = {0: "Negative", 1: "Positive"}

    # =========================
    # TF-IDF CLASSIFICATION
    # =========================
    # The same query vector is reused below for TF-IDF retrieval.
    q_vec = vectorizer.transform([processed])
    tfidf_pred = baseline_clf.predict(q_vec)[0]

    # =========================
    # BERT CLASSIFICATION
    # =========================
    # The same query embedding is reused below for BERT retrieval.
    q_emb = bert_encoder.encode([text])
    bert_pred = advanced_clf.predict(q_emb)[0]

    # =========================
    # DISTILBERT CLASSIFICATION
    # =========================
    # NOTE(review): inputs are not moved to a device here; presumably
    # transformer_model lives on CPU -- confirm.
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=128,
    )
    with torch.no_grad():
        outputs = transformer_model(**inputs)

    logits = outputs.logits
    pred = torch.argmax(logits, dim=1).item()
    probs = F.softmax(logits, dim=1)
    confidence = probs.max().item()
    distilbert_label = labels[pred]

    # =========================
    # TF-IDF TOP-K
    # =========================
    tfidf_sim = cosine_similarity(q_vec, tfidf_matrix).flatten()
    tfidf_topk = get_top_k(tfidf_sim, documents)

    # =========================
    # BERT TOP-K
    # =========================
    bert_sim = cosine_similarity(q_emb, doc_embeddings).flatten()
    bert_topk = get_top_k(bert_sim, documents)

    # =========================
    # DISTILBERT TOP-K
    # =========================
    # NOTE(review): assumes get_distilbert_embedding returns a 2-D array
    # accepted by cosine_similarity -- confirm against its definition.
    inputs_emb = get_distilbert_embedding(text)
    distil_sim = cosine_similarity(
        inputs_emb, distilbert_doc_embeddings
    ).flatten()
    distil_topk = get_top_k(distil_sim, documents)

    # =========================
    # OUTPUT
    # =========================
    classification_output = f"""
TF-IDF Prediction: {labels[tfidf_pred]}
BERT Prediction: {labels[bert_pred]}
DistilBERT Prediction: {distilbert_label} ({confidence*100:.2f}%)
"""

    retrieval_output = f"""
🔹 TF-IDF TOP-5
{format_results(tfidf_topk)}

----------------------------

🔹 BERT TOP-5
{format_results(bert_topk)}

----------------------------

🔹 DistilBERT TOP-5
{format_results(distil_topk)}
"""

    return classification_output, retrieval_output


# =====================================================
# GRADIO UI
# =====================================================
demo = gr.Interface(
    fn=pipeline,
    inputs=gr.Textbox(
        label="Enter Review / Query",
        lines=3,
        placeholder="late delivery problem...",
    ),
    outputs=[
        gr.Textbox(label="🔹 Sentiment Classification"),
        gr.Textbox(label="🔹 Top-5 Retrieval Results"),
    ],
    title="NLP Project: Classification + Semantic Search",
    description="""
TF-IDF + BERT + DistilBERT comparison system.
Shows both sentiment classification and semantic retrieval.
""",
    examples=[
        ["late delivery problem"],
        ["refund not given"],
        ["bad customer service"],
        ["product arrived damaged"],
    ],
)

# Launched unconditionally, as in the original script.
demo.launch()