# llm-eval-tool / app.py
# Hugging Face Spaces header (commit cb99e82, "Update app.py" by HarshitaSuri)
# --- Imports ---
import gradio as gr
import pandas as pd
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
from detoxify import Detoxify
import nltk
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from PIL import Image
import io
import en_core_web_sm
# --- Setup ---
# One-time model initialisation at import time; each call downloads its
# model/data on first run, so module import can be slow on a cold start.
nltk.download("punkt")  # tokenizer data needed by nltk.word_tokenize below
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # sentence embeddings for semantic similarity
toxicity_model = Detoxify("original")  # toxicity probability scorer
nlp = en_core_web_sm.load()  # spaCy English pipeline for NER (hallucination/bias checks)
# --- Core Evaluation Function ---
def evaluate(reference, responses_dict):
    """Score each model response against a single reference answer.

    Args:
        reference: Ground-truth answer text.
        responses_dict: Mapping of model name -> response text. Empty or
            whitespace-only responses are skipped entirely.

    Returns:
        Dict of model name -> {"BLEU", "Semantic Similarity", "Toxicity",
        "Final Score"}, each value rounded to 3 decimal places.
    """
    results = {}
    smoothie = SmoothingFunction().method4
    # Hoisted out of the loop: reference tokenization and embedding are
    # identical for every model, so compute them once instead of per response.
    ref_tokens = [nltk.word_tokenize(reference)]
    ref_emb = embedder.encode(reference, convert_to_tensor=True)
    for model_name, response in responses_dict.items():
        if not response.strip():
            continue
        resp_tokens = nltk.word_tokenize(response)
        bleu = sentence_bleu(ref_tokens, resp_tokens, smoothing_function=smoothie)
        resp_emb = embedder.encode(response, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(ref_emb, resp_emb).item()
        try:
            tox = toxicity_model.predict(response)["toxicity"]
        except Exception:
            # Best-effort: a toxicity-model failure counts as "not toxic".
            tox = 0.0
        # Equal-weight average of the three signals; toxicity is inverted
        # so that a higher final score is always better.
        final_score = (bleu + cosine_sim + (1 - tox)) / 3
        results[model_name] = {
            "BLEU": round(bleu, 3),
            "Semantic Similarity": round(cosine_sim, 3),
            "Toxicity": round(tox, 3),
            "Final Score": round(final_score, 3),
        }
    return results
# --- Single Response Evaluation ---
def evaluate_single(reference, response):
    """Score a single response against a reference answer.

    Returns a (semantic_similarity, bleu, toxicity) tuple with each value
    rounded to three decimal places.
    """
    smoother = SmoothingFunction().method4
    bleu_score = sentence_bleu(
        [nltk.word_tokenize(reference)],
        nltk.word_tokenize(response),
        smoothing_function=smoother,
    )
    similarity = util.pytorch_cos_sim(
        embedder.encode(reference, convert_to_tensor=True),
        embedder.encode(response, convert_to_tensor=True),
    ).item()
    try:
        toxicity = toxicity_model.predict(response)["toxicity"]
    except Exception:
        toxicity = 0.0  # best-effort: scorer failure counts as non-toxic
    return round(similarity, 3), round(bleu_score, 3), round(toxicity, 3)
# --- Hallucination Detection ---
def detect_hallucination(reference, response):
    """Flag named entities in the response that never appear in the reference.

    Returns (score, entities): score is hallucinated-entity count divided by
    total response-entity count (epsilon-guarded against division by zero),
    and entities is a list of (lowercased text, spaCy label) pairs.
    """
    reference_entities = {(e.text.lower(), e.label_) for e in nlp(reference).ents}
    response_entities = {(e.text.lower(), e.label_) for e in nlp(response).ents}
    unsupported = response_entities - reference_entities
    score = len(unsupported) / (len(response_entities) + 1e-5)
    return round(score, 3), list(unsupported)
def detect_bias(response):
    """Report simple lexical bias signals found in a response.

    Counts male/female gendered terms, race/ethnicity mentions, and
    PERSON/NORP named entities. "Gender Bias Score" is
    (#male terms) - (#female terms); positive means male-skewed.
    """
    doc = nlp(response)
    male_lexicon = {"he", "him", "his", "man", "male", "father", "boy"}
    female_lexicon = {"she", "her", "hers", "woman", "female", "mother", "girl"}
    race_lexicon = {"white", "black", "asian", "latino", "indian", "african", "european"}
    lowered_tokens = {token.text.lower() for token in doc}
    gender_hits = {
        "male_terms": list(lowered_tokens & male_lexicon),
        "female_terms": list(lowered_tokens & female_lexicon),
    }
    return {
        "Gender Bias Score": len(gender_hits["male_terms"]) - len(gender_hits["female_terms"]),
        "Gender Terms": gender_hits,
        "Race Mentions": list(lowered_tokens & race_lexicon),
        "Named Entities": [
            (ent.text, ent.label_) for ent in doc.ents if ent.label_ in {"PERSON", "NORP"}
        ],
    }
# --- Plot Results ---
def plot_results(results):
    """Render a grouped bar chart of per-model metrics as a PIL image.

    Returns None when there are no results to plot.
    """
    if not results:
        return None
    frame = pd.DataFrame(results).T
    figure, axes = plt.subplots(figsize=(7, 4))
    frame.plot(kind="bar", ax=axes)
    axes.set_title("Model Evaluation Metrics")
    axes.set_ylabel("Score")
    plt.xticks(rotation=45)
    figure.tight_layout()
    png_buffer = io.BytesIO()
    figure.savefig(png_buffer, format="png")
    png_buffer.seek(0)
    plt.close(figure)  # free the figure so repeated calls don't leak memory
    return Image.open(png_buffer)
# --- Export CSV ---
def export_csv(results):
    """Write results (model name -> metrics dict) to a CSV report.

    Returns the path of the written file (fixed name in the working dir).
    """
    report_path = "llm_evaluation_report.csv"
    pd.DataFrame(results).T.to_csv(report_path)
    return report_path
# --- Dataset Evaluation ---
def evaluate_dataset(file):
    """Evaluate an uploaded CSV/JSON dataset and average scores per model.

    The file must provide 'reference', 'model_name', and 'response' columns
    (one row per response). Returns {model_name: {metric: mean, 3 dp}}.

    Raises:
        ValueError: if a required column is missing.
    """
    if file.name.endswith(".csv"):
        frame = pd.read_csv(file.name)
    else:
        frame = pd.read_json(file.name)
    frame.columns = [column.strip() for column in frame.columns]
    for col in ["reference", "model_name", "response"]:
        if col not in frame.columns:
            raise ValueError(f"Column '{col}' not found. Columns found: {frame.columns.tolist()}")
    metric_names = ("BLEU", "Semantic Similarity", "Toxicity", "Final Score")
    per_model = {}
    for _, row in frame.iterrows():
        model = row["model_name"]
        scores = evaluate(row["reference"], {model: row["response"]})
        bucket = per_model.setdefault(model, {name: [] for name in metric_names})
        for name in metric_names:
            bucket[name].append(scores[model][name])
    return {
        model: {name: round(sum(values) / len(values), 3) for name, values in buckets.items()}
        for model, buckets in per_model.items()
    }
# --- Multimodel Wrapper ---
def single_evaluation(ref, a, b, c):
    """Evaluate three model responses against one reference.

    Returns (results dict, comparison plot image).
    """
    scores = evaluate(ref, {"Model A": a, "Model B": b, "Model C": c})
    return scores, plot_results(scores)
# --- Gradio UI ---
# NOTE: component creation order is significant in Gradio — it determines layout.
with gr.Blocks() as demo:
    gr.Markdown("# πŸ§ͺ LLM Evaluation Dashboard")
    # Shared across tabs: latest multimodel/dataset results, read by the Export tab.
    state_results = gr.State({})
    # --- Single Response Analysis ---
    with gr.Tab("πŸ” Single Response Analysis"):
        ref_input = gr.Textbox(label="Reference Answer")
        resp_input = gr.Textbox(label="Response")
        analyze_btn = gr.Button("Analyze Response")
        sim_out = gr.Number(label="Semantic Similarity")
        bleu_out = gr.Number(label="BLEU Score")
        tox_out = gr.Number(label="Toxicity Score")
        analyze_btn.click(evaluate_single, inputs=[ref_input, resp_input], outputs=[sim_out, bleu_out, tox_out])
    # --- Multimodel Evaluation ---
    with gr.Tab("πŸ“Š Multimodel Evaluation"):
        reference = gr.Textbox(label="Reference Answer")
        with gr.Row():
            resp1 = gr.Textbox(label="Model A Response")
            resp2 = gr.Textbox(label="Model B Response")
            resp3 = gr.Textbox(label="Model C Response")
        evaluate_btn = gr.Button("Evaluate")
        results_json = gr.JSON(label="Results")
        results_plot = gr.Image(label="Comparison Graph")
        def multi_eval_store(ref, a, b, c):
            # Evaluate and also stash the results in state_results for the Export tab.
            res, plot = single_evaluation(ref, a, b, c)
            return res, plot, res
        evaluate_btn.click(multi_eval_store, inputs=[reference, resp1, resp2, resp3], outputs=[results_json, results_plot, state_results])
    # --- Hallucination Detection ---
    with gr.Tab("🧠 Hallucination Detection"):
        ref_hall = gr.Textbox(label="Reference Text")
        resp_hall = gr.Textbox(label="Model Response")
        detect_btn = gr.Button("Detect Hallucinations")
        hall_score = gr.Number(label="Hallucination Score")
        hall_entities = gr.JSON(label="Hallucinated Entities")
        detect_btn.click(detect_hallucination, inputs=[ref_hall, resp_hall], outputs=[hall_score, hall_entities])
    # --- Bias Detection Tab ---
    with gr.Tab("βš–οΈ Bias Detection"):
        bias_input = gr.Textbox(label="Model Response")
        bias_btn = gr.Button("Analyze Bias")
        bias_json = gr.JSON(label="Bias Analysis")
        bias_btn.click(detect_bias, inputs=[bias_input], outputs=[bias_json])
    # --- Dataset Evaluation ---
    with gr.Tab("πŸ“‚ Dataset Evaluation"):
        dataset_file = gr.File(label="Upload CSV/JSON", file_types=[".csv", ".json"])
        dataset_btn = gr.Button("Evaluate Dataset")
        dataset_json = gr.JSON(label="Average Scores")
        def dataset_eval_store(file):
            # Evaluate the dataset and also stash averages in state_results for export.
            res = evaluate_dataset(file)
            return res, res
        dataset_btn.click(dataset_eval_store, inputs=[dataset_file], outputs=[dataset_json, state_results])
    # --- Export Report ---
    with gr.Tab("πŸ“€ Export Report"):
        export_btn = gr.Button("Export as CSV")
        export_file = gr.File(label="Download CSV")
        export_btn.click(export_csv, inputs=[state_results], outputs=[export_file])
# --- API Wrappers for Chrome Extension ---
def api_evaluate_single(reference, response):
    """JSON-friendly wrapper around evaluate_single for the extension API."""
    similarity, bleu_score, toxicity = evaluate_single(reference, response)
    return {
        "Semantic Similarity": similarity,
        "BLEU": bleu_score,
        "Toxicity": toxicity,
    }
def api_detect_hallucination(reference, response):
    """JSON-friendly wrapper around detect_hallucination for the extension API."""
    hallucination_score, hallucinated = detect_hallucination(reference, response)
    return {"Hallucination Score": hallucination_score, "Hallucinated Entities": hallucinated}
def api_detect_bias(response):
    """JSON-friendly wrapper around detect_bias for the extension API."""
    return detect_bias(response)
# This exposes API endpoints without affecting the UI
# Each gr.Interface wraps one API function and is launched on its own port
# below, giving the Chrome extension stable JSON endpoints separate from
# the dashboard.
single_api = gr.Interface(
    fn=api_evaluate_single,
    inputs=["text", "text"],  # reference, response
    outputs="json",
    title="Single Response Evaluation API"
)
hallucination_api = gr.Interface(
    fn=api_detect_hallucination,
    inputs=["text", "text"],  # reference, response
    outputs="json",
    title="Hallucination Detection API"
)
bias_api = gr.Interface(
    fn=api_detect_bias,
    inputs="text",  # response only
    outputs="json",
    title="Bias Detection API"
)
import threading

# --- Launch UI and APIs together ---
def _launch_in_thread(app, port):
    """Start a Gradio app on the given port in a background thread.

    launch() blocks, so each app gets its own (non-daemon) thread; those
    threads keep the process alive after the main thread finishes.
    Extracted to remove four identical copies of the launch boilerplate.
    """
    thread = threading.Thread(target=lambda: app.launch(
        server_name="0.0.0.0",
        server_port=port,
        share=True,
        inline=False,
    ))
    thread.start()
    return thread

# Main dashboard on 7860; one extra port per Chrome-extension API endpoint.
ui_thread = _launch_in_thread(demo, 7860)
single_api_thread = _launch_in_thread(single_api, 7861)
halluc_api_thread = _launch_in_thread(hallucination_api, 7862)
bias_api_thread = _launch_in_thread(bias_api, 7863)