# llm-eval-tool / app.py
# Hugging Face Spaces header (commit cb99e82, "Update app.py" by HarshitaSuri)
# --- Imports ---
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
from detoxify import Detoxify
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from PIL import Image
import io
import en_core_web_sm
# --- Setup ---
# One-time model initialisation at import time; each call downloads its
# model/data on first run, so module import can be slow on a cold start.
nltk.download("punkt")  # tokenizer data needed by nltk.word_tokenize below
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # sentence embeddings for semantic similarity
toxicity_model = Detoxify("original")  # toxicity probability scorer
nlp = en_core_web_sm.load()  # spaCy English pipeline for NER (hallucination/bias checks)
# --- Core Evaluation Function ---
def evaluate(reference, responses_dict):
    """Score each model response against a single reference answer.

    Args:
        reference: Ground-truth answer text.
        responses_dict: Mapping of model name -> response text. Empty or
            whitespace-only responses are skipped entirely.

    Returns:
        Dict of model name -> {"BLEU", "Semantic Similarity", "Toxicity",
        "Final Score"}, each value rounded to 3 decimal places.
    """
    results = {}
    smoothie = SmoothingFunction().method4
    # Hoisted out of the loop: reference tokenization and embedding are
    # identical for every model, so compute them once instead of per response.
    ref_tokens = [nltk.word_tokenize(reference)]
    ref_emb = embedder.encode(reference, convert_to_tensor=True)
    for model_name, response in responses_dict.items():
        if not response.strip():
            continue
        resp_tokens = nltk.word_tokenize(response)
        bleu = sentence_bleu(ref_tokens, resp_tokens, smoothing_function=smoothie)
        resp_emb = embedder.encode(response, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(ref_emb, resp_emb).item()
        try:
            tox = toxicity_model.predict(response)["toxicity"]
        except Exception:
            # Best-effort: a toxicity-model failure counts as "not toxic".
            tox = 0.0
        # Equal-weight average of the three signals; toxicity is inverted
        # so that a higher final score is always better.
        final_score = (bleu + cosine_sim + (1 - tox)) / 3
        results[model_name] = {
            "BLEU": round(bleu, 3),
            "Semantic Similarity": round(cosine_sim, 3),
            "Toxicity": round(tox, 3),
            "Final Score": round(final_score, 3),
        }
    return results
# --- Single Response Evaluation ---
def evaluate_single(reference, response):
    """Score a single response against a reference answer.

    Returns a (semantic_similarity, bleu, toxicity) tuple with each value
    rounded to three decimal places.
    """
    smoother = SmoothingFunction().method4
    bleu_score = sentence_bleu(
        [nltk.word_tokenize(reference)],
        nltk.word_tokenize(response),
        smoothing_function=smoother,
    )
    similarity = util.pytorch_cos_sim(
        embedder.encode(reference, convert_to_tensor=True),
        embedder.encode(response, convert_to_tensor=True),
    ).item()
    try:
        toxicity = toxicity_model.predict(response)["toxicity"]
    except Exception:
        toxicity = 0.0  # best-effort: scorer failure counts as non-toxic
    return round(similarity, 3), round(bleu_score, 3), round(toxicity, 3)
# --- Hallucination Detection ---
def detect_hallucination(reference, response):
    """Flag named entities in the response that never appear in the reference.

    Returns (score, entities): score is hallucinated-entity count divided by
    total response-entity count (epsilon-guarded against division by zero),
    and entities is a list of (lowercased text, spaCy label) pairs.
    """
    reference_entities = {(e.text.lower(), e.label_) for e in nlp(reference).ents}
    response_entities = {(e.text.lower(), e.label_) for e in nlp(response).ents}
    unsupported = response_entities - reference_entities
    score = len(unsupported) / (len(response_entities) + 1e-5)
    return round(score, 3), list(unsupported)
def detect_bias(response):
    """Report simple lexical bias signals found in a response.

    Counts male/female gendered terms, race/ethnicity mentions, and
    PERSON/NORP named entities. "Gender Bias Score" is
    (#male terms) - (#female terms); positive means male-skewed.
    """
    doc = nlp(response)
    male_lexicon = {"he", "him", "his", "man", "male", "father", "boy"}
    female_lexicon = {"she", "her", "hers", "woman", "female", "mother", "girl"}
    race_lexicon = {"white", "black", "asian", "latino", "indian", "african", "european"}
    lowered_tokens = {token.text.lower() for token in doc}
    gender_hits = {
        "male_terms": list(lowered_tokens & male_lexicon),
        "female_terms": list(lowered_tokens & female_lexicon),
    }
    return {
        "Gender Bias Score": len(gender_hits["male_terms"]) - len(gender_hits["female_terms"]),
        "Gender Terms": gender_hits,
        "Race Mentions": list(lowered_tokens & race_lexicon),
        "Named Entities": [
            (ent.text, ent.label_) for ent in doc.ents if ent.label_ in {"PERSON", "NORP"}
        ],
    }
# --- Plot Results ---
def plot_results(results):
    """Render a grouped bar chart of per-model metrics as a PIL image.

    Returns None when there are no results to plot.
    """
    if not results:
        return None
    frame = pd.DataFrame(results).T
    figure, axes = plt.subplots(figsize=(7, 4))
    frame.plot(kind="bar", ax=axes)
    axes.set_title("Model Evaluation Metrics")
    axes.set_ylabel("Score")
    plt.xticks(rotation=45)
    figure.tight_layout()
    png_buffer = io.BytesIO()
    figure.savefig(png_buffer, format="png")
    png_buffer.seek(0)
    plt.close(figure)  # free the figure so repeated calls don't leak memory
    return Image.open(png_buffer)
# --- Export CSV ---
def export_csv(results):
    """Write results (model name -> metrics dict) to a CSV report.

    Returns the path of the written file (fixed name in the working dir).
    """
    report_path = "llm_evaluation_report.csv"
    pd.DataFrame(results).T.to_csv(report_path)
    return report_path
# --- Dataset Evaluation ---
def evaluate_dataset(file):
    """Evaluate an uploaded CSV/JSON dataset and average scores per model.

    The file must provide 'reference', 'model_name', and 'response' columns
    (one row per response). Returns {model_name: {metric: mean, 3 dp}}.

    Raises:
        ValueError: if a required column is missing.
    """
    if file.name.endswith(".csv"):
        frame = pd.read_csv(file.name)
    else:
        frame = pd.read_json(file.name)
    frame.columns = [column.strip() for column in frame.columns]
    for col in ["reference", "model_name", "response"]:
        if col not in frame.columns:
            raise ValueError(f"Column '{col}' not found. Columns found: {frame.columns.tolist()}")
    metric_names = ("BLEU", "Semantic Similarity", "Toxicity", "Final Score")
    per_model = {}
    for _, row in frame.iterrows():
        model = row["model_name"]
        scores = evaluate(row["reference"], {model: row["response"]})
        bucket = per_model.setdefault(model, {name: [] for name in metric_names})
        for name in metric_names:
            bucket[name].append(scores[model][name])
    return {
        model: {name: round(sum(values) / len(values), 3) for name, values in buckets.items()}
        for model, buckets in per_model.items()
    }
# --- Multimodel Wrapper ---
def single_evaluation(ref, a, b, c):
    """Evaluate three model responses against one reference.

    Returns (results dict, comparison plot image).
    """
    scores = evaluate(ref, {"Model A": a, "Model B": b, "Model C": c})
    return scores, plot_results(scores)
# --- Gradio UI ---
# NOTE: component creation order is significant in Gradio — it determines layout.
with gr.Blocks() as demo:
    gr.Markdown("# πŸ§ͺ LLM Evaluation Dashboard")
    # Shared across tabs: latest multimodel/dataset results, read by the Export tab.
    state_results = gr.State({})
    # --- Single Response Analysis ---
    with gr.Tab("πŸ” Single Response Analysis"):
        ref_input = gr.Textbox(label="Reference Answer")
        resp_input = gr.Textbox(label="Response")
        analyze_btn = gr.Button("Analyze Response")
        sim_out = gr.Number(label="Semantic Similarity")
        bleu_out = gr.Number(label="BLEU Score")
        tox_out = gr.Number(label="Toxicity Score")
        analyze_btn.click(evaluate_single, inputs=[ref_input, resp_input], outputs=[sim_out, bleu_out, tox_out])
    # --- Multimodel Evaluation ---
    with gr.Tab("πŸ“Š Multimodel Evaluation"):
        reference = gr.Textbox(label="Reference Answer")
        with gr.Row():
            resp1 = gr.Textbox(label="Model A Response")
            resp2 = gr.Textbox(label="Model B Response")
            resp3 = gr.Textbox(label="Model C Response")
        evaluate_btn = gr.Button("Evaluate")
        results_json = gr.JSON(label="Results")
        results_plot = gr.Image(label="Comparison Graph")
        def multi_eval_store(ref, a, b, c):
            # Evaluate and also stash the results in state_results for the Export tab.
            res, plot = single_evaluation(ref, a, b, c)
            return res, plot, res
        evaluate_btn.click(multi_eval_store, inputs=[reference, resp1, resp2, resp3], outputs=[results_json, results_plot, state_results])
    # --- Hallucination Detection ---
    with gr.Tab("🧠 Hallucination Detection"):
        ref_hall = gr.Textbox(label="Reference Text")
        resp_hall = gr.Textbox(label="Model Response")
        detect_btn = gr.Button("Detect Hallucinations")
        hall_score = gr.Number(label="Hallucination Score")
        hall_entities = gr.JSON(label="Hallucinated Entities")
        detect_btn.click(detect_hallucination, inputs=[ref_hall, resp_hall], outputs=[hall_score, hall_entities])
    # --- Bias Detection Tab ---
    with gr.Tab("βš–οΈ Bias Detection"):
        bias_input = gr.Textbox(label="Model Response")
        bias_btn = gr.Button("Analyze Bias")
        bias_json = gr.JSON(label="Bias Analysis")
        bias_btn.click(detect_bias, inputs=[bias_input], outputs=[bias_json])
    # --- Dataset Evaluation ---
    with gr.Tab("πŸ“‚ Dataset Evaluation"):
        dataset_file = gr.File(label="Upload CSV/JSON", file_types=[".csv", ".json"])
        dataset_btn = gr.Button("Evaluate Dataset")
        dataset_json = gr.JSON(label="Average Scores")
        def dataset_eval_store(file):
            # Evaluate the dataset and also stash averages in state_results for export.
            res = evaluate_dataset(file)
            return res, res
        dataset_btn.click(dataset_eval_store, inputs=[dataset_file], outputs=[dataset_json, state_results])
    # --- Export Report ---
    with gr.Tab("πŸ“€ Export Report"):
        export_btn = gr.Button("Export as CSV")
        export_file = gr.File(label="Download CSV")
        export_btn.click(export_csv, inputs=[state_results], outputs=[export_file])
# --- API Wrappers for Chrome Extension ---
def api_evaluate_single(reference, response):
    """JSON-friendly wrapper around evaluate_single for the extension API."""
    similarity, bleu_score, toxicity = evaluate_single(reference, response)
    return {
        "Semantic Similarity": similarity,
        "BLEU": bleu_score,
        "Toxicity": toxicity,
    }
def api_detect_hallucination(reference, response):
    """JSON-friendly wrapper around detect_hallucination for the extension API."""
    hallucination_score, hallucinated = detect_hallucination(reference, response)
    return {"Hallucination Score": hallucination_score, "Hallucinated Entities": hallucinated}
def api_detect_bias(response):
    """JSON-friendly wrapper around detect_bias for the extension API."""
    return detect_bias(response)
# This exposes API endpoints without affecting the UI
# Each gr.Interface wraps one API function and is launched on its own port
# below, giving the Chrome extension stable JSON endpoints separate from
# the dashboard.
single_api = gr.Interface(
    fn=api_evaluate_single,
    inputs=["text", "text"],  # reference, response
    outputs="json",
    title="Single Response Evaluation API"
)
hallucination_api = gr.Interface(
    fn=api_detect_hallucination,
    inputs=["text", "text"],  # reference, response
    outputs="json",
    title="Hallucination Detection API"
)
bias_api = gr.Interface(
    fn=api_detect_bias,
    inputs="text",  # response only
    outputs="json",
    title="Bias Detection API"
)
import threading

# --- Launch UI and APIs together ---
def _launch_in_thread(app, port):
    """Start a Gradio app on the given port in a background thread.

    launch() blocks, so each app gets its own (non-daemon) thread; those
    threads keep the process alive after the main thread finishes.
    Extracted to remove four identical copies of the launch boilerplate.
    """
    thread = threading.Thread(target=lambda: app.launch(
        server_name="0.0.0.0",
        server_port=port,
        share=True,
        inline=False,
    ))
    thread.start()
    return thread

# Main dashboard on 7860; one extra port per Chrome-extension API endpoint.
ui_thread = _launch_in_thread(demo, 7860)
single_api_thread = _launch_in_thread(single_api, 7861)
halluc_api_thread = _launch_in_thread(hallucination_api, 7862)
bias_api_thread = _launch_in_thread(bias_api, 7863)