Spaces:
Sleeping
Sleeping
| # --- Imports --- | |
| import gradio as gr | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| from sentence_transformers import SentenceTransformer, util | |
| from detoxify import Detoxify | |
| import nltk | |
| from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction | |
| from PIL import Image | |
| import io | |
| import en_core_web_sm | |
# --- Setup ---
# One-time model loading at import time. These are module-level singletons
# shared by every evaluation function below; loading them once avoids
# re-initializing heavy models on each request.
nltk.download("punkt")  # tokenizer data for nltk.word_tokenize; NOTE(review): newer nltk versions may also need "punkt_tab" — confirm against the pinned nltk release
embedder = SentenceTransformer("all-MiniLM-L6-v2")  # sentence embeddings used for semantic similarity
toxicity_model = Detoxify("original")  # toxicity classifier (returns per-label scores)
nlp = en_core_web_sm.load()  # spaCy English pipeline for NER (hallucination/bias detection)
# --- Core Evaluation Function ---
def evaluate(reference, responses_dict):
    """Score each model response against a single reference answer.

    Args:
        reference: Ground-truth answer text.
        responses_dict: Mapping of model name -> response text. Blank or
            whitespace-only responses are skipped.

    Returns:
        Dict of model name -> {"BLEU", "Semantic Similarity", "Toxicity",
        "Final Score"}, each value rounded to 3 decimals.
    """
    results = {}
    smoothie = SmoothingFunction().method4
    # The reference is identical for every response: tokenize and embed it
    # once instead of once per model (the embedding call is the expensive
    # part, so this was accidental repeated work in the loop).
    ref_tokens = [nltk.word_tokenize(reference)]
    ref_emb = embedder.encode(reference, convert_to_tensor=True)
    for model_name, response in responses_dict.items():
        if not response.strip():
            continue
        resp_tokens = nltk.word_tokenize(response)
        bleu = sentence_bleu(ref_tokens, resp_tokens, smoothing_function=smoothie)
        resp_emb = embedder.encode(response, convert_to_tensor=True)
        cosine_sim = util.pytorch_cos_sim(ref_emb, resp_emb).item()
        try:
            tox = toxicity_model.predict(response)["toxicity"]
        except Exception:
            # Best-effort: treat an unscorable response as non-toxic rather
            # than failing the whole evaluation run.
            tox = 0.0
        # Equal-weight average; toxicity enters as (1 - tox) so that lower
        # toxicity raises the final score.
        final_score = (bleu + cosine_sim + (1 - tox)) / 3
        results[model_name] = {
            "BLEU": round(bleu, 3),
            "Semantic Similarity": round(cosine_sim, 3),
            "Toxicity": round(tox, 3),
            "Final Score": round(final_score, 3),
        }
    return results
# --- Single Response Evaluation ---
def evaluate_single(reference, response):
    """Score one response against a reference.

    Returns a (semantic similarity, BLEU, toxicity) tuple, each value
    rounded to 3 decimals.
    """
    smoother = SmoothingFunction().method4
    bleu_score = sentence_bleu(
        [nltk.word_tokenize(reference)],
        nltk.word_tokenize(response),
        smoothing_function=smoother,
    )
    emb_ref = embedder.encode(reference, convert_to_tensor=True)
    emb_resp = embedder.encode(response, convert_to_tensor=True)
    similarity = util.pytorch_cos_sim(emb_ref, emb_resp).item()
    try:
        toxicity = toxicity_model.predict(response)["toxicity"]
    except Exception:
        # Best-effort: an unscorable response counts as non-toxic.
        toxicity = 0.0
    return round(similarity, 3), round(bleu_score, 3), round(toxicity, 3)
# --- Hallucination Detection ---
def detect_hallucination(reference, response):
    """Flag named entities in the response that never appear in the reference.

    Returns (score, entities): score is the fraction of response entities
    absent from the reference (rounded to 3 decimals); entities is a list
    of (lowercased text, label) pairs considered hallucinated.
    """
    ref_ents = {(ent.text.lower(), ent.label_) for ent in nlp(reference).ents}
    resp_ents = {(ent.text.lower(), ent.label_) for ent in nlp(response).ents}
    unseen = resp_ents - ref_ents
    # Epsilon keeps the division defined when the response has no entities
    # (score is then ~0 rather than a ZeroDivisionError).
    score = len(unseen) / (len(resp_ents) + 1e-5)
    return round(score, 3), list(unseen)
def detect_bias(response):
    """Surface simple lexical gender/race signals in a response.

    Returns a dict with a gender-term imbalance score (male term count
    minus female term count), the matched gender terms, any race/ethnicity
    words present, and PERSON/NORP named entities found by spaCy.
    """
    doc = nlp(response)
    male_words = {"he", "him", "his", "man", "male", "father", "boy"}
    female_words = {"she", "her", "hers", "woman", "female", "mother", "girl"}
    race_words = {"white", "black", "asian", "latino", "indian", "african", "european"}
    lowered = {tok.text.lower() for tok in doc}
    male_hits = list(lowered & male_words)
    female_hits = list(lowered & female_words)
    named = [(ent.text, ent.label_) for ent in doc.ents if ent.label_ in {"PERSON", "NORP"}]
    return {
        "Gender Bias Score": len(male_hits) - len(female_hits),
        "Gender Terms": {"male_terms": male_hits, "female_terms": female_hits},
        "Race Mentions": list(lowered & race_words),
        "Named Entities": named,
    }
# --- Plot Results ---
def plot_results(results):
    """Render a grouped bar chart of per-model metrics as a PIL image.

    Args:
        results: Dict of model name -> metric dict (as produced by evaluate()).

    Returns:
        A PIL.Image containing the chart, or None when results is empty.
    """
    if not results:
        return None
    df = pd.DataFrame(results).T
    fig, ax = plt.subplots(figsize=(7, 4))
    df.plot(kind="bar", ax=ax)
    # Use the figure/axes object API rather than pyplot's implicit "current
    # figure": Gradio handlers can run concurrently and the pyplot global
    # state is shared process-wide.
    ax.set_title("Model Evaluation Metrics")
    ax.set_ylabel("Score")
    ax.tick_params(axis="x", rotation=45)
    fig.tight_layout()
    buf = io.BytesIO()
    fig.savefig(buf, format="png")
    buf.seek(0)
    plt.close(fig)  # release the figure so repeated calls don't accumulate memory
    return Image.open(buf)
# --- Export CSV ---
def export_csv(results):
    """Write the results mapping to a CSV report and return its file path."""
    report_path = "llm_evaluation_report.csv"
    pd.DataFrame(results).T.to_csv(report_path)
    return report_path
# --- Dataset Evaluation ---
def evaluate_dataset(file):
    """Evaluate an uploaded CSV/JSON dataset and average the scores per model.

    The file must contain 'reference', 'model_name' and 'response' columns.
    Returns a dict of model name -> averaged metric dict (3 decimals).

    Raises:
        ValueError: when a required column is missing.
    """
    if file.name.endswith(".csv"):
        df = pd.read_csv(file.name)
    else:
        df = pd.read_json(file.name)
    df.columns = [c.strip() for c in df.columns]
    for col in ("reference", "model_name", "response"):
        if col not in df.columns:
            raise ValueError(f"Column '{col}' not found. Columns found: {df.columns.tolist()}")
    metric_names = ("BLEU", "Semantic Similarity", "Toxicity", "Final Score")
    grouped = {}
    # Score every row individually, then accumulate per-model metric lists.
    for _, row in df.iterrows():
        model = row["model_name"]
        scores = evaluate(row["reference"], {model: row["response"]})
        per_model = grouped.setdefault(model, {name: [] for name in metric_names})
        for name in metric_names:
            per_model[name].append(scores[model][name])
    return {
        model: {name: round(sum(vals) / len(vals), 3) for name, vals in metrics.items()}
        for model, metrics in grouped.items()
    }
# --- Multimodel Wrapper ---
def single_evaluation(ref, a, b, c):
    """Evaluate three model responses against a reference.

    Returns a (results dict, chart image) pair.
    """
    scores = evaluate(ref, {"Model A": a, "Model B": b, "Model C": c})
    return scores, plot_results(scores)
# --- Gradio UI ---
# Five-tab dashboard. Component construction order inside each Tab defines
# the on-screen layout, so statements here are order-sensitive.
with gr.Blocks() as demo:
    gr.Markdown("# π§ͺ LLM Evaluation Dashboard")
    # Shared state holding the most recent results dict so the Export tab
    # can write a CSV without re-running an evaluation.
    state_results = gr.State({})
    # --- Single Response Analysis ---
    with gr.Tab("π Single Response Analysis"):
        ref_input = gr.Textbox(label="Reference Answer")
        resp_input = gr.Textbox(label="Response")
        analyze_btn = gr.Button("Analyze Response")
        sim_out = gr.Number(label="Semantic Similarity")
        bleu_out = gr.Number(label="BLEU Score")
        tox_out = gr.Number(label="Toxicity Score")
        # evaluate_single returns (similarity, bleu, toxicity) in this order.
        analyze_btn.click(evaluate_single, inputs=[ref_input, resp_input], outputs=[sim_out, bleu_out, tox_out])
    # --- Multimodel Evaluation ---
    with gr.Tab("π Multimodel Evaluation"):
        reference = gr.Textbox(label="Reference Answer")
        with gr.Row():
            resp1 = gr.Textbox(label="Model A Response")
            resp2 = gr.Textbox(label="Model B Response")
            resp3 = gr.Textbox(label="Model C Response")
        evaluate_btn = gr.Button("Evaluate")
        results_json = gr.JSON(label="Results")
        results_plot = gr.Image(label="Comparison Graph")
        def multi_eval_store(ref, a, b, c):
            # Returns the results twice: once for the JSON display and once
            # to persist in state_results for later CSV export.
            res, plot = single_evaluation(ref, a, b, c)
            return res, plot, res
        evaluate_btn.click(multi_eval_store, inputs=[reference, resp1, resp2, resp3], outputs=[results_json, results_plot, state_results])
    # --- Hallucination Detection ---
    with gr.Tab("π§  Hallucination Detection"):
        ref_hall = gr.Textbox(label="Reference Text")
        resp_hall = gr.Textbox(label="Model Response")
        detect_btn = gr.Button("Detect Hallucinations")
        hall_score = gr.Number(label="Hallucination Score")
        hall_entities = gr.JSON(label="Hallucinated Entities")
        detect_btn.click(detect_hallucination, inputs=[ref_hall, resp_hall], outputs=[hall_score, hall_entities])
    # --- Bias Detection Tab ---
    with gr.Tab("βοΈ Bias Detection"):
        bias_input = gr.Textbox(label="Model Response")
        bias_btn = gr.Button("Analyze Bias")
        bias_json = gr.JSON(label="Bias Analysis")
        bias_btn.click(detect_bias, inputs=[bias_input], outputs=[bias_json])
    # --- Dataset Evaluation ---
    with gr.Tab("π Dataset Evaluation"):
        dataset_file = gr.File(label="Upload CSV/JSON", file_types=[".csv", ".json"])
        dataset_btn = gr.Button("Evaluate Dataset")
        dataset_json = gr.JSON(label="Average Scores")
        def dataset_eval_store(file):
            # Persist averaged results in state_results for the Export tab.
            res = evaluate_dataset(file)
            return res, res
        dataset_btn.click(dataset_eval_store, inputs=[dataset_file], outputs=[dataset_json, state_results])
    # --- Export Report ---
    with gr.Tab("π€ Export Report"):
        export_btn = gr.Button("Export as CSV")
        export_file = gr.File(label="Download CSV")
        # Exports whatever evaluation last populated state_results.
        export_btn.click(export_csv, inputs=[state_results], outputs=[export_file])
# --- API Wrappers for Chrome Extension ---
def api_evaluate_single(reference, response):
    """JSON-friendly wrapper around evaluate_single()."""
    similarity, bleu, toxicity = evaluate_single(reference, response)
    return {"Semantic Similarity": similarity, "BLEU": bleu, "Toxicity": toxicity}
def api_detect_hallucination(reference, response):
    """JSON-friendly wrapper around detect_hallucination()."""
    hall_score, hall_entities = detect_hallucination(reference, response)
    return {"Hallucination Score": hall_score, "Hallucinated Entities": hall_entities}
def api_detect_bias(response):
    """JSON-friendly wrapper around detect_bias() (already returns a dict)."""
    analysis = detect_bias(response)
    return analysis
# This exposes API endpoints without affecting the UI
# Each Interface is a standalone Gradio app launched on its own port below,
# so the Chrome extension can POST to a dedicated endpoint per feature.
single_api = gr.Interface(
    fn=api_evaluate_single,
    inputs=["text", "text"],  # (reference, response)
    outputs="json",
    title="Single Response Evaluation API"
)
hallucination_api = gr.Interface(
    fn=api_detect_hallucination,
    inputs=["text", "text"],  # (reference, response)
    outputs="json",
    title="Hallucination Detection API"
)
bias_api = gr.Interface(
    fn=api_detect_bias,
    inputs="text",  # response only
    outputs="json",
    title="Bias Detection API"
)
import threading
# --- Launch UI and APIs together ---
# Each app gets its own server port (7860-7863) and its own thread so all
# four can serve at once. The threads are non-daemon, which keeps the
# process alive after module execution ends.
# NOTE(review): launching Gradio apps from background threads relies on
# launch() blocking inside each thread — confirm against the pinned gradio
# version (some versions expect launch() on the main thread, or need
# prevent_thread_lock semantics). Also verify that four simultaneous
# share=True tunnels are intended rather than local-only serving.
ui_thread = threading.Thread(target=lambda: demo.launch(
    server_name="0.0.0.0",
    server_port=7860,
    share=True,
    inline=False
))
ui_thread.start()
single_api_thread = threading.Thread(target=lambda: single_api.launch(
    server_name="0.0.0.0",
    server_port=7861,
    share=True,
    inline=False
))
single_api_thread.start()
halluc_api_thread = threading.Thread(target=lambda: hallucination_api.launch(
    server_name="0.0.0.0",
    server_port=7862,
    share=True,
    inline=False
))
halluc_api_thread.start()
bias_api_thread = threading.Thread(target=lambda: bias_api.launch(
    server_name="0.0.0.0",
    server_port=7863,
    share=True,
    inline=False
))
bias_api_thread.start()