Files changed (1) hide show
  1. evaluator.py +326 -157
evaluator.py CHANGED
@@ -1,18 +1,278 @@
1
- """
2
- Evaluation module: loads models, computes metrics, and creates visualizations.
3
- Lightweight, CPU-friendly, no Java required.
4
- """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
  import re
7
- import math
8
- import uuid
9
- from typing import List, Dict, Tuple
10
-
11
- import numpy as np
12
  import pandas as pd
13
  import matplotlib.pyplot as plt
14
  import seaborn as sns
15
- import torch
 
16
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
17
  from sentence_transformers import SentenceTransformer, util
18
 
@@ -109,158 +369,67 @@ def check_accuracy(reference: str, response: str) -> float:
109
 
110
 
111
  # --------------------------
112
- # SCORING PIPELINE
113
  # --------------------------
114
- def compute_row_scores(prompt, response, reference) -> Dict:
115
- instr = check_instruction_following(prompt, response)
116
- halluc = check_hallucination(reference, response)
117
- assum = check_assumption(response)
118
- coh = check_coherence(response)
119
- acc = check_accuracy(reference, response)
120
-
121
- # Final score: average
122
- components = [instr, halluc, assum, coh, acc]
123
- final = round(float(sum(components) / len(components)), 3)
124
-
125
- return {
126
- "InstructionFollowing": instr,
127
- "Hallucination": halluc,
128
- "AssumptionControl": assum,
129
- "Coherence": coh,
130
- "Accuracy": acc,
131
- "FinalScore": final,
132
  }
133
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- # --------------------------
136
- # VISUALIZATION HELPERS
137
- # --------------------------
138
- # def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
139
- # """Radar chart for multiple agents."""
140
- # N = len(labels)
141
- # angles = [n / float(N) * 2 * math.pi for n in range(N)]
142
- # angles += angles[:1]
143
-
144
- # fig = plt.figure(figsize=(6.5, 6.5))
145
- # ax = plt.subplot(111, polar=True)
146
- # ax.set_xticks(angles[:-1])
147
- # ax.set_xticklabels(labels, fontsize=9)
148
- # ax.set_ylim(0, 100)
149
- # ax.set_yticks([0, 25, 50, 75, 100])
150
-
151
- # for r in rows:
152
- # values = r["values"]
153
- # values_closed = values + values[:1]
154
- # ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
155
- # ax.fill(angles, values_closed, alpha=fill_alpha)
156
-
157
- # ax.set_title(title, y=1.08, fontsize=12)
158
- # ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
159
- # return fig
160
-
161
-
162
- # def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
163
- # fig, ax = plt.subplots(figsize=(7, 5))
164
- # sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
165
- # ax.set_title(title)
166
- # return fig
167
 
168
-
169
- # --------------------------
170
- # HIGH-LEVEL EVALUATION
171
- # --------------------------
172
- def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
173
- """
174
- df must contain: prompt, response, task, agent, reference
175
- Returns: metrics_df, [(image_path, caption)], leaderboard_df
176
- """
177
- df = df.rename(columns={c: c.strip() for c in df.columns})
178
-
179
- rows = []
180
- for _, r in df.iterrows():
181
- prompt = r.get("prompt", "")
182
- response = r.get("response", "")
183
- reference = r.get("reference", "")
184
- agent = r.get("agent", "Unknown")
185
- task = r.get("task", "Unknown")
186
-
187
- scores = compute_row_scores(prompt, response, reference)
188
- entry = {
189
- "Task": str(task).strip(),
190
- "Agent": str(agent),
191
- "Prompt": prompt,
192
- "Response": response,
193
- "Reference": reference,
194
- }
195
- entry.update(scores)
196
- rows.append(entry)
197
-
198
- metrics_df = pd.DataFrame(rows)
199
-
200
- # Visualization artifacts
201
- images = []
202
- metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"]
203
-
204
- # Per-task radar and bar charts
205
- for task, g in metrics_df.groupby("Task"):
206
- series = []
207
- for a in g["Agent"].unique():
208
- subset = g[g["Agent"] == a]
209
- vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
210
- series.append({"name": a, "values": vals})
211
- if series:
212
- fig = spider_net_multi(metric_labels, series, title=f"{task} β€” Agent Comparison")
213
- fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
214
- fig.savefig(fname, bbox_inches="tight")
215
- plt.close(fig)
216
- images.append((fname, f"{task} - radar"))
217
-
218
- fig2, ax = plt.subplots(figsize=(8, 4))
219
- avg = g.groupby("Agent")[metric_labels].mean()
220
- avg.plot(kind="bar", ax=ax)
221
- ax.set_title(f"{task} β€” Average Metrics by Agent")
222
- ax.set_ylabel("Score (0-1)")
223
- plt.xticks(rotation=45)
224
- fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png"
225
- fig2.savefig(fname2, bbox_inches="tight")
226
- plt.close(fig2)
227
- images.append((fname2, f"{task} - bar"))
228
-
229
- # Global heatmap
230
- metric_cols = metric_labels + ["FinalScore"]
231
- figh = heatmap_plot(metrics_df, metric_cols)
232
- fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
233
- figh.savefig(fnameh, bbox_inches="tight")
234
- plt.close(figh)
235
- images.append((fnameh, "Metric Correlations Heatmap"))
236
 
237
  # Leaderboard
238
- lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
239
- lb = lb.sort_values(["FinalScore"], ascending=False)
 
 
 
240
 
241
- return metrics_df, images, lb
242
-
243
-
244
- # --------------------------
245
- # DEMO USAGE
246
- # --------------------------
247
- if __name__ == "__main__":
248
- # Sample dataset
249
- data = [
250
- {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"},
251
- {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"},
252
- {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."},
253
- ]
254
- df = pd.DataFrame(data)
255
-
256
- metrics_df, images, leaderboard = evaluate_dataframe(df)
257
-
258
- print("\n=== Metrics per response ===")
259
- print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]])
260
-
261
- print("\n=== Leaderboard (average per task & agent) ===")
262
- print(leaderboard)
263
-
264
- print("\nVisualization files saved in /tmp/:")
265
- for path, caption in images:
266
- print(f"{caption}: {path}")
 
1
+ # """
2
+ # Evaluation module: loads models, computes metrics, and creates visualizations.
3
+ # Lightweight, CPU-friendly, no Java required.
4
+ # """
5
+
6
+ # import re
7
+ # import math
8
+ # import uuid
9
+ # from typing import List, Dict, Tuple
10
+
11
+ # import numpy as np
12
+ # import pandas as pd
13
+ # import matplotlib.pyplot as plt
14
+ # import seaborn as sns
15
+ # import torch
16
+ # from transformers import AutoTokenizer, AutoModelForSequenceClassification
17
+ # from sentence_transformers import SentenceTransformer, util
18
+
19
+ # # --------------------------
20
+ # # MODEL LOADING
21
+ # # --------------------------
22
+ # NLI_MODEL = "textattack/roberta-base-MNLI"
23
+ # EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
24
+
25
+ # # Load NLI model & tokenizer
26
+ # nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
27
+ # nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
28
+ # nli_model.to("cpu")
29
+ # nli_model.eval()
30
+
31
+ # # Load embedding model
32
+ # embed_model = SentenceTransformer(EMBED_MODEL)
33
+
34
+ # # Label mapping from config
35
+ # id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
36
+
37
+
38
+ # # --------------------------
39
+ # # METRIC FUNCTIONS
40
+ # # --------------------------
41
+ # def check_instruction_following(prompt: str, response: str) -> float:
42
+ # """Embedding-based similarity between prompt and response."""
43
+ # if not prompt or not response:
44
+ # return 0.0
45
+ # p_emb = embed_model.encode(prompt, convert_to_tensor=True)
46
+ # r_emb = embed_model.encode(response, convert_to_tensor=True)
47
+ # sim = float(util.cos_sim(p_emb, r_emb).item())
48
+ # return round(max(0.0, min(1.0, sim)), 3)
49
+
50
+
51
+ # def check_hallucination(reference: str, response: str) -> float:
52
+ # """
53
+ # Single hallucination score:
54
+ # Entailment prob - Contradiction prob (normalized to [0,1]).
55
+ # Higher = less hallucination.
56
+ # """
57
+ # if not reference or not response:
58
+ # return 0.0
59
+ # with torch.no_grad():
60
+ # inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
61
+ # outputs = nli_model(**inputs)
62
+ # probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
63
+
64
+ # entail_prob, contra_prob = 0.0, 0.0
65
+ # for idx, p in enumerate(probs):
66
+ # label = id2label.get(idx, "")
67
+ # if "ENTAIL" in label:
68
+ # entail_prob = float(p)
69
+ # elif "CONTRA" in label:
70
+ # contra_prob = float(p)
71
+
72
+ # score = entail_prob - contra_prob
73
+ # score = (score + 1) / 2 # normalize [-1,1] β†’ [0,1]
74
+ # return round(max(0.0, min(1.0, score)), 3)
75
+
76
+
77
+ # def check_assumption(response: str) -> float:
78
+ # """Detect speculative/hedging terms."""
79
+ # if not response:
80
+ # return 0.0
81
+ # speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
82
+ # count = sum(1 for t in speculative_terms if t in response.lower())
83
+ # score = 1.0 - min(count / 5.0, 1.0) # smoother decay
84
+ # return round(score, 3)
85
+
86
+
87
+ # def check_coherence(response: str) -> float:
88
+ # """Heuristic coherence metric: penalizes very short/long, rewards sentence balance."""
89
+ # if not response:
90
+ # return 0.0
91
+ # words = len(re.findall(r"\w+", response))
92
+ # sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
93
+ # if words < 5:
94
+ # return 0.3
95
+ # if words > 200:
96
+ # return 0.5
97
+ # base = min(1.0, (words / 50.0) + (sents / 5.0))
98
+ # return round(max(0.4, min(base, 0.95)), 3)
99
+
100
+
101
+ # def check_accuracy(reference: str, response: str) -> float:
102
+ # """Semantic similarity between reference and response via embeddings (cosine)."""
103
+ # if not reference or not response:
104
+ # return 0.0
105
+ # ref_emb = embed_model.encode(reference, convert_to_tensor=True)
106
+ # resp_emb = embed_model.encode(response, convert_to_tensor=True)
107
+ # sim = float(util.cos_sim(ref_emb, resp_emb).item())
108
+ # return round(max(0.0, min(1.0, sim)), 3)
109
+
110
+
111
+ # # --------------------------
112
+ # # SCORING PIPELINE
113
+ # # --------------------------
114
+ # def compute_row_scores(prompt, response, reference) -> Dict:
115
+ # instr = check_instruction_following(prompt, response)
116
+ # halluc = check_hallucination(reference, response)
117
+ # assum = check_assumption(response)
118
+ # coh = check_coherence(response)
119
+ # acc = check_accuracy(reference, response)
120
+
121
+ # # Final score: average
122
+ # components = [instr, halluc, assum, coh, acc]
123
+ # final = round(float(sum(components) / len(components)), 3)
124
+
125
+ # return {
126
+ # "InstructionFollowing": instr,
127
+ # "Hallucination": halluc,
128
+ # "AssumptionControl": assum,
129
+ # "Coherence": coh,
130
+ # "Accuracy": acc,
131
+ # "FinalScore": final,
132
+ # }
133
+
134
+
135
+ # # --------------------------
136
+ # # VISUALIZATION HELPERS
137
+ # # --------------------------
138
+ # # def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
139
+ # # """Radar chart for multiple agents."""
140
+ # # N = len(labels)
141
+ # # angles = [n / float(N) * 2 * math.pi for n in range(N)]
142
+ # # angles += angles[:1]
143
+
144
+ # # fig = plt.figure(figsize=(6.5, 6.5))
145
+ # # ax = plt.subplot(111, polar=True)
146
+ # # ax.set_xticks(angles[:-1])
147
+ # # ax.set_xticklabels(labels, fontsize=9)
148
+ # # ax.set_ylim(0, 100)
149
+ # # ax.set_yticks([0, 25, 50, 75, 100])
150
+
151
+ # # for r in rows:
152
+ # # values = r["values"]
153
+ # # values_closed = values + values[:1]
154
+ # # ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
155
+ # # ax.fill(angles, values_closed, alpha=fill_alpha)
156
+
157
+ # # ax.set_title(title, y=1.08, fontsize=12)
158
+ # # ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
159
+ # # return fig
160
+
161
+
162
+ # # def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
163
+ # # fig, ax = plt.subplots(figsize=(7, 5))
164
+ # # sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
165
+ # # ax.set_title(title)
166
+ # # return fig
167
+
168
+
169
+ # # --------------------------
170
+ # # HIGH-LEVEL EVALUATION
171
+ # # --------------------------
172
+ # def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
173
+ # """
174
+ # df must contain: prompt, response, task, agent, reference
175
+ # Returns: metrics_df, [(image_path, caption)], leaderboard_df
176
+ # """
177
+ # df = df.rename(columns={c: c.strip() for c in df.columns})
178
+
179
+ # rows = []
180
+ # for _, r in df.iterrows():
181
+ # prompt = r.get("prompt", "")
182
+ # response = r.get("response", "")
183
+ # reference = r.get("reference", "")
184
+ # agent = r.get("agent", "Unknown")
185
+ # task = r.get("task", "Unknown")
186
+
187
+ # scores = compute_row_scores(prompt, response, reference)
188
+ # entry = {
189
+ # "Task": str(task).strip(),
190
+ # "Agent": str(agent),
191
+ # "Prompt": prompt,
192
+ # "Response": response,
193
+ # "Reference": reference,
194
+ # }
195
+ # entry.update(scores)
196
+ # rows.append(entry)
197
+
198
+ # metrics_df = pd.DataFrame(rows)
199
+
200
+ # # Visualization artifacts
201
+ # images = []
202
+ # metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"]
203
+
204
+ # # Per-task radar and bar charts
205
+ # for task, g in metrics_df.groupby("Task"):
206
+ # series = []
207
+ # for a in g["Agent"].unique():
208
+ # subset = g[g["Agent"] == a]
209
+ # vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
210
+ # series.append({"name": a, "values": vals})
211
+ # if series:
212
+ # fig = spider_net_multi(metric_labels, series, title=f"{task} β€” Agent Comparison")
213
+ # fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
214
+ # fig.savefig(fname, bbox_inches="tight")
215
+ # plt.close(fig)
216
+ # images.append((fname, f"{task} - radar"))
217
+
218
+ # fig2, ax = plt.subplots(figsize=(8, 4))
219
+ # avg = g.groupby("Agent")[metric_labels].mean()
220
+ # avg.plot(kind="bar", ax=ax)
221
+ # ax.set_title(f"{task} β€” Average Metrics by Agent")
222
+ # ax.set_ylabel("Score (0-1)")
223
+ # plt.xticks(rotation=45)
224
+ # fname2 = f"/tmp/{uuid.uuid4().hex}_{task}_bar.png"
225
+ # fig2.savefig(fname2, bbox_inches="tight")
226
+ # plt.close(fig2)
227
+ # images.append((fname2, f"{task} - bar"))
228
+
229
+ # # Global heatmap
230
+ # metric_cols = metric_labels + ["FinalScore"]
231
+ # figh = heatmap_plot(metrics_df, metric_cols)
232
+ # fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
233
+ # figh.savefig(fnameh, bbox_inches="tight")
234
+ # plt.close(figh)
235
+ # images.append((fnameh, "Metric Correlations Heatmap"))
236
+
237
+ # # Leaderboard
238
+ # lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
239
+ # lb = lb.sort_values(["FinalScore"], ascending=False)
240
+
241
+ # return metrics_df, images, lb
242
+
243
+
244
+ # # --------------------------
245
+ # # DEMO USAGE
246
+ # # --------------------------
247
+ # if __name__ == "__main__":
248
+ # # Sample dataset
249
+ # data = [
250
+ # {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"},
251
+ # {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"},
252
+ # {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."},
253
+ # ]
254
+ # df = pd.DataFrame(data)
255
+
256
+ # metrics_df, images, leaderboard = evaluate_dataframe(df)
257
+
258
+ # print("\n=== Metrics per response ===")
259
+ # print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]])
260
+
261
+ # print("\n=== Leaderboard (average per task & agent) ===")
262
+ # print(leaderboard)
263
+
264
+ # print("\nVisualization files saved in /tmp/:")
265
+ # for path, caption in images:
266
+ # print(f"{caption}: {path}")
267
 
268
  import re
269
+ import json
270
+ import torch
 
 
 
271
  import pandas as pd
272
  import matplotlib.pyplot as plt
273
  import seaborn as sns
274
+ import os
275
+ import uuid
276
  from transformers import AutoTokenizer, AutoModelForSequenceClassification
277
  from sentence_transformers import SentenceTransformer, util
278
 
 
369
 
370
 
371
  # --------------------------
372
+ # ROW & DF EVALUATION
373
  # --------------------------
374
def evaluate_row(row):
    """Score one evaluation record and return its metric dict.

    Expects a mapping (e.g. a pandas Series) with optional keys
    ``prompt``, ``response``, ``reference``, ``task_id`` and ``agent``;
    missing keys default to the empty string.

    Returns a dict holding the identifying fields, the five component
    metric scores, and a weighted-average ``final_score`` rounded to
    three decimals.
    """
    prompt = row.get("prompt", "")
    response = row.get("response", "")
    reference = row.get("reference", "")

    metrics = {
        "task_id": row.get("task_id", ""),
        "agent": row.get("agent", ""),
        "instruction_following": check_instruction_following(prompt, response),
        "hallucination": check_hallucination(reference, response),
        "assumption": check_assumption(response),
        "coherence": check_coherence(response),
        "accuracy": check_accuracy(reference, response),
    }

    # Weighted average of the components; weights sum to 1.0 and can be
    # tuned here. Insertion order matches the original summation order.
    weights = {
        "instruction_following": 0.25,
        "accuracy": 0.25,
        "hallucination": 0.2,
        "coherence": 0.15,
        "assumption": 0.15,
    }
    metrics["final_score"] = round(
        sum(w * metrics[name] for name, w in weights.items()), 3
    )
    return metrics
399
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
 
401
def evaluate_dataframe(df: pd.DataFrame, out_dir: str = "/tmp/plots"):
    """Evaluate every row of *df* and produce summary artifacts.

    Args:
        df: DataFrame whose rows carry ``prompt``, ``response``,
            ``reference``, ``task_id`` and ``agent`` columns
            (missing fields default to "" in :func:`evaluate_row`).
        out_dir: Directory where plot PNGs are written (created if
            absent). Defaults to the previous hard-coded ``/tmp/plots``.

    Returns:
        metrics_df: one row of metric scores per input row.
        images: list of ``(path, caption)`` tuples for the saved plots.
        leaderboard: mean ``final_score`` per (agent, task_id),
            sorted best-first.
    """
    metrics_df = df.apply(evaluate_row, axis=1, result_type="expand")

    # Leaderboard: mean final score per agent/task. Fix: sort descending
    # so the strongest agent is first — previously the groupby order
    # (alphabetical by agent) was returned, which is not a leaderboard.
    leaderboard = (
        metrics_df.groupby(["agent", "task_id"])["final_score"]
        .mean()
        .reset_index()
        .sort_values("final_score", ascending=False)
        .reset_index(drop=True)
    )

    # Plots
    images = []
    os.makedirs(out_dir, exist_ok=True)

    # Histogram of final scores across all evaluated rows.
    plt.figure(figsize=(6, 4))
    sns.histplot(metrics_df["final_score"], bins=10, kde=False)
    plt.title("Distribution of Final Scores")
    hist_path = os.path.join(out_dir, f"hist_{uuid.uuid4().hex}.png")
    # bbox_inches="tight" matches the savefig style used elsewhere in
    # this module and avoids clipped axis labels.
    plt.savefig(hist_path, bbox_inches="tight")
    plt.close()
    images.append((hist_path, "Final Score Distribution"))

    # Mean final score per agent.
    plt.figure(figsize=(6, 4))
    agent_scores = metrics_df.groupby("agent")["final_score"].mean().reset_index()
    sns.barplot(data=agent_scores, x="agent", y="final_score")
    plt.title("Average Final Score per Agent")
    bar_path = os.path.join(out_dir, f"bar_{uuid.uuid4().hex}.png")
    plt.savefig(bar_path, bbox_inches="tight")
    plt.close()
    images.append((bar_path, "Average Score per Agent"))

    return metrics_df, images, leaderboard