Spaces:

Omnia-cy
/

NLP_PROJECT

Configuration error

App Files Files Community

NLP_PROJECT / app.py

Omnia-cy

Create app.py

55a7827 verified 21 days ago

raw

history blame contribute delete

3.76 kB

	import gradio as gr
	import torch
	import torch.nn.functional as F
	import numpy as np
	from sklearn.metrics.pairwise import cosine_similarity


	# =====================================================
	# TOP-K HELPER
	# =====================================================
	def get_top_k(similarity_scores, texts, k=5):
	    """Return the ``k`` highest-scoring texts, best match first.

	    Args:
	        similarity_scores: 1-D sequence of scores (NumPy array or plain
	            list); entry ``i`` is the score of ``texts[i]``.
	        texts: Collection indexed in parallel with ``similarity_scores``.
	        k: Maximum number of results. A value larger than the number of
	            scores returns every entry; ``k <= 0`` returns an empty list.

	    Returns:
	        A list of ``(text, score)`` tuples with scores as Python floats,
	        ordered from highest to lowest score.
	    """
	    # A slice of ``[-0:]`` selects the *whole* array, so k == 0 has to be
	    # handled before slicing; negative k is treated the same way.
	    if k <= 0:
	        return []

	    # Coerce so that plain Python lists work as well as NumPy arrays.
	    scores = np.asarray(similarity_scores)

	    # argsort is ascending: keep the last k indices, then flip them so the
	    # best match comes first.
	    best_first = scores.argsort()[-k:][::-1]
	    return [(texts[i], float(scores[i])) for i in best_first]


	def format_results(results):
	    """Render ranked retrieval results as a numbered list.

	    Args:
	        results: Iterable of ``(text, score)`` pairs, already in display
	            order. The score is ignored; only the text is shown.

	    Returns:
	        One string with entries of the form ``"<rank>. <text>"`` separated
	        by blank lines. Each text is cut to 200 characters and receives a
	        trailing ``"..."`` only when something was actually cut off. An
	        empty ``results`` yields ``""``.
	    """
	    max_chars = 200
	    entries = []
	    for rank, (txt, _) in enumerate(results, start=1):
	        # Mark truncation only when it happened, so short texts are not
	        # displayed as if they had been shortened.
	        suffix = "..." if len(txt) > max_chars else ""
	        entries.append(f"{rank}. {txt[:max_chars]}{suffix}")
	    return "\n\n".join(entries)


	# =====================================================
	# MAIN PIPELINE
	# =====================================================
	def pipeline(text):
	    """Classify one query with three models and retrieve similar documents.

	    The function relies on objects that are not defined in this block and
	    must exist at module level: ``preprocess``, ``vectorizer``,
	    ``baseline_clf``, ``bert_encoder``, ``advanced_clf``, ``tokenizer``,
	    ``transformer_model``, ``tfidf_matrix``, ``doc_embeddings``,
	    ``get_distilbert_embedding``, ``distilbert_doc_embeddings`` and
	    ``documents``.

	    Args:
	        text: Raw query string entered by the user.

	    Returns:
	        A ``(classification_output, retrieval_output)`` tuple of strings:
	        the first lists the three predicted labels (with a confidence
	        percentage for the DistilBERT one), the second lists the top
	        matches found by each of the three similarity searches. For blank
	        or ``None`` input a short prompt and an empty string are returned
	        without running any model.
	    """
	    # Blank input carries no signal: skip the models instead of reporting
	    # an arbitrary label and arbitrary "similar" documents.
	    if text is None or not text.strip():
	        return "Please enter some text to analyze.", ""

	    processed = preprocess(text)

	    # Class-index -> display label.
	    # NOTE(review): assumes every classifier emits 0/1 class ids - confirm.
	    labels = {0: "Negative", 1: "Positive"}

	    # =========================
	    # TF-IDF CLASSIFICATION
	    # =========================
	    # The same TF-IDF vector is reused for retrieval further down.
	    query_tfidf = vectorizer.transform([processed])
	    tfidf_pred = baseline_clf.predict(query_tfidf)[0]

	    # =========================
	    # BERT CLASSIFICATION
	    # =========================
	    # Encode once; the embedding is reused for retrieval further down.
	    query_emb = bert_encoder.encode([text])
	    bert_pred = advanced_clf.predict(query_emb)[0]

	    # =========================
	    # DISTILBERT CLASSIFICATION
	    # =========================
	    inputs = tokenizer(
	        text,
	        return_tensors="pt",
	        truncation=True,
	        padding=True,
	        max_length=128,
	    )

	    # Inference only: no gradients are needed.
	    with torch.no_grad():
	        outputs = transformer_model(**inputs)
	        logits = outputs.logits
	        pred = torch.argmax(logits, dim=1).item()
	        probs = F.softmax(logits, dim=1)
	        # Highest softmax probability is reported as the confidence.
	        confidence = probs.max().item()

	    distilbert_label = labels[pred]

	    # =========================
	    # TF-IDF TOP-K
	    # =========================
	    tfidf_sim = cosine_similarity(query_tfidf, tfidf_matrix).flatten()
	    tfidf_topk = get_top_k(tfidf_sim, documents)

	    # =========================
	    # BERT TOP-K
	    # =========================
	    bert_sim = cosine_similarity(query_emb, doc_embeddings).flatten()
	    bert_topk = get_top_k(bert_sim, documents)

	    # =========================
	    # DISTILBERT TOP-K
	    # =========================
	    inputs_emb = get_distilbert_embedding(text)
	    distil_sim = cosine_similarity(inputs_emb, distilbert_doc_embeddings).flatten()
	    distil_topk = get_top_k(distil_sim, documents)

	    # =========================
	    # OUTPUT
	    # =========================
	    classification_output = f"""
	TF-IDF Prediction: {labels[tfidf_pred]}
	BERT Prediction: {labels[bert_pred]}
	DistilBERT Prediction: {distilbert_label} ({confidence*100:.2f}%)
	"""

	    retrieval_output = f"""
	🔹 TF-IDF TOP-5
	{format_results(tfidf_topk)}

	----------------------------

	🔹 BERT TOP-5
	{format_results(bert_topk)}

	----------------------------

	🔹 DistilBERT TOP-5
	{format_results(distil_topk)}
	"""

	    return classification_output, retrieval_output


	# =====================================================
	# GRADIO UI
	# =====================================================
	# Widgets are created in the same order as before: the query box first,
	# then the two result boxes.
	_query_box = gr.Textbox(
	    label="Enter Review / Query",
	    lines=3,
	    placeholder="late delivery problem...",
	)
	_result_boxes = [
	    gr.Textbox(label="🔹 Sentiment Classification"),
	    gr.Textbox(label="🔹 Top-5 Retrieval Results"),
	]

	# Sample queries passed to the interface as examples.
	_sample_queries = [
	    ["late delivery problem"],
	    ["refund not given"],
	    ["bad customer service"],
	    ["product arrived damaged"],
	]

	# One text input, two text outputs, all driven by ``pipeline``.
	demo = gr.Interface(
	    fn=pipeline,
	    inputs=_query_box,
	    outputs=_result_boxes,
	    title="NLP Project: Classification + Semantic Search",
	    description="""
	TF-IDF + BERT + DistilBERT comparison system.
	Shows both sentiment classification and semantic retrieval.
	""",
	    examples=_sample_queries,
	)

	# Launch the app as soon as the module is executed.
	demo.launch()