Spaces:

Supastrikas-004
/

evaluation-framework

Runtime error

App Files Files Community

Update evaluator.py

by manayporwal07 - opened Sep 13, 2025

base: refs/heads/main

←

from: refs/pr/1

Discussion Files changed

+130

-135

Files changed (1) hide show

evaluator.py +130 -135

evaluator.py CHANGED Viewed

@@ -1,13 +1,11 @@
-# evaluator.py
 """
-Evaluation module: loads models (lightweight), computes metrics, and creates visualizations.
-No Java required.
 """
 import re
 import math
 import uuid
-import os
 from typing import List, Dict, Tuple
 import numpy as np
@@ -19,13 +17,12 @@ from transformers import AutoTokenizer, AutoModelForSequenceClassification
 from sentence_transformers import SentenceTransformer, util
 # --------------------------
-# MODEL LOADING (CPU-friendly)
 # --------------------------
-# Use small/medium models appropriate for Spaces.
 NLI_MODEL = "textattack/roberta-base-MNLI"
 EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
-# Load NLI model & tokenizer (on CPU)
 nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
 nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
 nli_model.to("cpu")
@@ -34,66 +31,72 @@ nli_model.eval()
 # Load embedding model
 embed_model = SentenceTransformer(EMBED_MODEL)
-# get label mapping from model config (e.g., {0: 'CONTRADICTION', 1:'NEUTRAL', 2:'ENTAILMENT'})
 id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
 # --------------------------
 # METRIC FUNCTIONS
 # --------------------------
 def check_instruction_following(prompt: str, response: str) -> float:
-    """Keyword-overlap heuristic (normalized)."""
-    prompt = (prompt or "").lower()
-    response = (response or "").lower()
-    keywords = re.findall(r"\b\w+\b", prompt)
-    if len(keywords) == 0:
         return 0.0
-    matches = sum(1 for k in set(keywords) if k in response)
-    return round(matches / len(set(keywords)), 3)
-def check_hallucination(reference: str, response: str) -> Tuple[float, float]:
     """
-    Use NLI to get entailment and contradiction probabilities.
-    Returns (entail_prob, contra_prob) in [0,1].
-    If no reference provided, returns (0.0, 0.0).
     """
     if not reference or not response:
-        return 0.0, 0.0
     with torch.no_grad():
         inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
         outputs = nli_model(**inputs)
         probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
-    # Map probabilities to labels using id2label
-    entail_prob = 0.0
-    contra_prob = 0.0
     for idx, p in enumerate(probs):
-        label = id2label.get(idx, "").upper()
         if "ENTAIL" in label:
             entail_prob = float(p)
-        if "CONTRA" in label or "CONTRADICTION" in label:
             contra_prob = float(p)
-    return round(entail_prob, 3), round(contra_prob, 3)
 def check_assumption(response: str) -> float:
-    """Penalize speculative language (hedges)."""
     if not response:
         return 0.0
     speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
     count = sum(1 for t in speculative_terms if t in response.lower())
-    score = 1.0 - min(count / 3.0, 1.0)
     return round(score, 3)
 def check_coherence(response: str) -> float:
-    """Placeholder coherence metric based on a simple word- and sentence-count heuristic.
-       Replace with grammar/perplexity later. Returns 0.0 for an empty response, otherwise a value in [0.5, 0.98]."""
     if not response:
         return 0.0
-    # simple heuristic: longer responses that have many sentences get slightly higher
     sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
-    words = max(1, len(re.findall(r"\w+", response)))
     base = min(1.0, (words / 50.0) + (sents / 5.0))
-    # clamp to [0.5, 0.98] to avoid extreme
-    val = max(0.5, min(base * 0.9, 0.98))
-    return round(val, 3)
 def check_accuracy(reference: str, response: str) -> float:
     """Semantic similarity between reference and response via embeddings (cosine)."""
@@ -102,146 +105,118 @@ def check_accuracy(reference: str, response: str) -> float:
     ref_emb = embed_model.encode(reference, convert_to_tensor=True)
     resp_emb = embed_model.encode(response, convert_to_tensor=True)
     sim = float(util.cos_sim(ref_emb, resp_emb).item())
-    # cosine similarity in [-1,1] but for sentences usually [0,1]
-    sim = max(0.0, min(1.0, sim))
-    return round(sim, 3)
 # --------------------------
-# AGGREGATION & SCORING
 # --------------------------
 def compute_row_scores(prompt, response, reference) -> Dict:
     instr = check_instruction_following(prompt, response)
-    entail, contra = check_hallucination(reference, response)
     assum = check_assumption(response)
     coh = check_coherence(response)
     acc = check_accuracy(reference, response)
-    # Combine hallucination metrics into single positive metric: entail good, contra bad
-    hyst = entail * (1 - contra)
-    hyst = round(max(0.0, min(1.0, hyst)), 3)
-    # final_score: simple average of the five components (all in [0,1])
-    components = [instr, hyst, assum, coh, acc]
     final = round(float(sum(components) / len(components)), 3)
     return {
         "InstructionFollowing": instr,
-        "Hallucination_Entail": entail,
-        "Hallucination_Contra": contra,
-        "Hallucination_Metric": hyst,
         "AssumptionControl": assum,
         "Coherence": coh,
         "Accuracy": acc,
-        "FinalScore": final
     }
 # --------------------------
 # VISUALIZATION HELPERS
 # --------------------------
-def spider_net_multi(labels: List[str], rows: List[Dict], title: str = "Spider (Radar) Chart", fill_alpha: float = 0.12):
-    """
-    Create and return Matplotlib figure for radar chart.
-    rows: list of {"name": str, "values": [v1,...,vN]}; values are assumed to be on a 0-100 scale for visibility.
-    """
-    N = len(labels)
-    angles = [n / float(N) * 2 * math.pi for n in range(N)]
-    angles += angles[:1]
-    fig = plt.figure(figsize=(6.5, 6.5))
-    ax = plt.subplot(111, polar=True)
-    ax.set_xticks(angles[:-1])
-    ax.set_xticklabels(labels, fontsize=9)
-    # radial limits: 0 to 100
-    ax.set_ylim(0, 100)
-    ax.set_yticks([0, 25, 50, 75, 100])
-    for r in rows:
-        values = r["values"]
-        values_closed = values + values[:1]
-        ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
-        ax.fill(angles, values_closed, alpha=fill_alpha)
-    ax.set_title(title, y=1.08, fontsize=12)
-    ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
-    return fig
-def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
-    fig, ax = plt.subplots(figsize=(7, 5))
-    sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
-    ax.set_title(title)
-    return fig
-def bar_plot_avg(df: pd.DataFrame, metric_cols: List[str], title: str = "Average Metric Scores per Agent"):
-    agg = df.groupby("Agent")[metric_cols].mean().reset_index()
-    fig, ax = plt.subplots(figsize=(10, 5))
-    agg.set_index("Agent")[metric_cols].plot(kind="bar", ax=ax)
-    ax.set_title(title)
-    ax.set_ylabel("Score (0 - 1)")
-    plt.xticks(rotation=45)
-    plt.tight_layout()
-    return fig
 # --------------------------
-# HIGH-LEVEL EVALUATION (batch)
 # --------------------------
-def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str,str]]]:
     """
-    df must contain columns: prompt, response, task, agent, reference (reference optional)
-    Returns: metrics_df, list of (image_path, caption) for visualizations
     """
-    # Normalize columns
     df = df.rename(columns={c: c.strip() for c in df.columns})
-    # try to extract agent from metadata if not present
-    if "agent" not in df.columns and "metadata" in df.columns:
-        df["agent"] = df["metadata"].apply(lambda m: m.get("agent") if isinstance(m, dict) else None)
     rows = []
     for _, r in df.iterrows():
         prompt = r.get("prompt", "")
         response = r.get("response", "")
-        reference = r.get("reference", "") if "reference" in r else ""
         agent = r.get("agent", "Unknown")
         task = r.get("task", "Unknown")
         scores = compute_row_scores(prompt, response, reference)
         entry = {
             "Task": str(task).strip(),
             "Agent": str(agent),
             "Prompt": prompt,
             "Response": response,
-            "Reference": reference
         }
         entry.update(scores)
         rows.append(entry)
     metrics_df = pd.DataFrame(rows)
     # Visualization artifacts
     images = []
-    # Per-task spider charts
-    metric_labels = ["InstructionFollowing", "Hallucination_Metric", "AssumptionControl", "Coherence", "Accuracy"]
     for task, g in metrics_df.groupby("Task"):
-        agents = g["Agent"].unique().tolist()
         series = []
-        for a in agents:
             subset = g[g["Agent"] == a]
-            vals = []
-            # convert to 0-100 scale for plot
-            for m in metric_labels:
-                vals.append(round(float(subset[m].mean()) * 100, 2))
             series.append({"name": a, "values": vals})
-        if len(series) == 0:
-            continue
-        fig = spider_net_multi(metric_labels, series, title=f"{task} — Agent Comparison")
-        fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
-        fig.savefig(fname, bbox_inches="tight")
-        plt.close(fig)
-        images.append((fname, f"{task} - radar"))
-        # also bar plot (averages) per task
-        try:
             fig2, ax = plt.subplots(figsize=(8, 4))
-            avg = g.groupby("Agent")[["InstructionFollowing", "Hallucination_Metric", "AssumptionControl", "Coherence", "Accuracy"]].mean()
             avg.plot(kind="bar", ax=ax)
             ax.set_title(f"{task} — Average Metrics by Agent")
             ax.set_ylabel("Score (0-1)")
@@ -250,22 +225,42 @@ def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str,s
             fig2.savefig(fname2, bbox_inches="tight")
             plt.close(fig2)
             images.append((fname2, f"{task} - bar"))
-        except Exception:
-            pass
     # Global heatmap
-    metric_cols = ["InstructionFollowing", "Hallucination_Metric", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]
-    try:
-        figh = heatmap_plot(metrics_df, metric_cols)
-        fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
-        figh.savefig(fnameh, bbox_inches="tight")
-        plt.close(figh)
-        images.append((fnameh, "Metric Correlations Heatmap"))
-    except Exception:
-        pass
-    # Leaderboard: average final score per (agent, task) pair, sorted best-first
     lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
     lb = lb.sort_values(["FinalScore"], ascending=False)
     return metrics_df, images, lb

 """
+Evaluation module: loads models, computes metrics, and creates visualizations.
+Lightweight, CPU-friendly, no Java required.
 """
 import re
 import math
 import uuid
 from typing import List, Dict, Tuple
 import numpy as np
 from sentence_transformers import SentenceTransformer, util
 # --------------------------
+# MODEL LOADING
 # --------------------------
 NLI_MODEL = "textattack/roberta-base-MNLI"
 EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+# Load NLI model & tokenizer
 nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
 nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
 nli_model.to("cpu")
 # Load embedding model
 embed_model = SentenceTransformer(EMBED_MODEL)
+# Label mapping from config
 id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
 # --------------------------
 # METRIC FUNCTIONS
 # --------------------------
 def check_instruction_following(prompt: str, response: str) -> float:
+    """Embedding-based similarity between prompt and response."""
+    if not prompt or not response:
         return 0.0
+    p_emb = embed_model.encode(prompt, convert_to_tensor=True)
+    r_emb = embed_model.encode(response, convert_to_tensor=True)
+    sim = float(util.cos_sim(p_emb, r_emb).item())
+    return round(max(0.0, min(1.0, sim)), 3)
+def check_hallucination(reference: str, response: str) -> float:
     """
+    Single hallucination score:
+    (entailment prob - contradiction prob + 1) / 2, i.e. the difference rescaled from [-1,1] to [0,1].
+    Higher = less hallucination. Returns 0.0 if reference or response is empty.
     """
     if not reference or not response:
+        return 0.0
     with torch.no_grad():
         inputs = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
         outputs = nli_model(**inputs)
         probs = torch.softmax(outputs.logits, dim=-1).cpu().numpy()[0]
+    entail_prob, contra_prob = 0.0, 0.0
     for idx, p in enumerate(probs):
+        label = id2label.get(idx, "")
         if "ENTAIL" in label:
             entail_prob = float(p)
+        elif "CONTRA" in label:
             contra_prob = float(p)
+    score = entail_prob - contra_prob
+    score = (score + 1) / 2  # normalize [-1,1] → [0,1]
+    return round(max(0.0, min(1.0, score)), 3)
 def check_assumption(response: str) -> float:
+    """Detect speculative/hedging terms."""
     if not response:
         return 0.0
     speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
     count = sum(1 for t in speculative_terms if t in response.lower())
+    score = 1.0 - min(count / 5.0, 1.0)  # smoother decay
     return round(score, 3)
 def check_coherence(response: str) -> float:
+    """Heuristic coherence metric: fixed low scores for very short (<5 words) or very long (>200 words) responses; otherwise grows with word and sentence count, clamped to [0.4, 0.95]."""
     if not response:
         return 0.0
+    words = len(re.findall(r"\w+", response))
     sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
+    if words < 5:
+        return 0.3
+    if words > 200:
+        return 0.5
     base = min(1.0, (words / 50.0) + (sents / 5.0))
+    return round(max(0.4, min(base, 0.95)), 3)
 def check_accuracy(reference: str, response: str) -> float:
     """Semantic similarity between reference and response via embeddings (cosine)."""
     ref_emb = embed_model.encode(reference, convert_to_tensor=True)
     resp_emb = embed_model.encode(response, convert_to_tensor=True)
     sim = float(util.cos_sim(ref_emb, resp_emb).item())
+    return round(max(0.0, min(1.0, sim)), 3)
 # --------------------------
+# SCORING PIPELINE
 # --------------------------
 def compute_row_scores(prompt, response, reference) -> Dict:
     instr = check_instruction_following(prompt, response)
+    halluc = check_hallucination(reference, response)
     assum = check_assumption(response)
     coh = check_coherence(response)
     acc = check_accuracy(reference, response)
+    # Final score: average
+    components = [instr, halluc, assum, coh, acc]
     final = round(float(sum(components) / len(components)), 3)
     return {
         "InstructionFollowing": instr,
+        "Hallucination": halluc,
         "AssumptionControl": assum,
         "Coherence": coh,
         "Accuracy": acc,
+        "FinalScore": final,
     }
 # --------------------------
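 # VISUALIZATION HELPERS
 # --------------------------
 # NOTE(review): spider_net_multi and heatmap_plot are commented out below, but
 # evaluate_dataframe still calls both; restore them or remove the calls, otherwise
 # evaluate_dataframe raises NameError at runtime.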
+# def spider_net_multi(labels: List[str], rows: List[Dict], title: str, fill_alpha: float = 0.12):
+#     """Radar chart for multiple agents."""
+#     N = len(labels)
+#     angles = [n / float(N) * 2 * math.pi for n in range(N)]
+#     angles += angles[:1]
+#     fig = plt.figure(figsize=(6.5, 6.5))
+#     ax = plt.subplot(111, polar=True)
+#     ax.set_xticks(angles[:-1])
+#     ax.set_xticklabels(labels, fontsize=9)
+#     ax.set_ylim(0, 100)
+#     ax.set_yticks([0, 25, 50, 75, 100])
+#     for r in rows:
+#         values = r["values"]
+#         values_closed = values + values[:1]
+#         ax.plot(angles, values_closed, linewidth=1.5, label=r["name"])
+#         ax.fill(angles, values_closed, alpha=fill_alpha)
+#     ax.set_title(title, y=1.08, fontsize=12)
+#     ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
+#     return fig
+# def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
+#     fig, ax = plt.subplots(figsize=(7, 5))
+#     sns.heatmap(df[metric_cols].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
+#     ax.set_title(title)
+#     return fig
 # --------------------------
+# HIGH-LEVEL EVALUATION
 # --------------------------
+def evaluate_dataframe(df: pd.DataFrame) -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
     """
+    df must contain: prompt, response, task, agent, reference
+    Returns: metrics_df, [(image_path, caption)], leaderboard_df
     """
     df = df.rename(columns={c: c.strip() for c in df.columns})
     rows = []
     for _, r in df.iterrows():
         prompt = r.get("prompt", "")
         response = r.get("response", "")
+        reference = r.get("reference", "")
         agent = r.get("agent", "Unknown")
         task = r.get("task", "Unknown")
         scores = compute_row_scores(prompt, response, reference)
         entry = {
             "Task": str(task).strip(),
             "Agent": str(agent),
             "Prompt": prompt,
             "Response": response,
+            "Reference": reference,
         }
         entry.update(scores)
         rows.append(entry)
     metrics_df = pd.DataFrame(rows)
     # Visualization artifacts
     images = []
+    metric_labels = ["InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy"]
+    # Per-task radar and bar charts
     for task, g in metrics_df.groupby("Task"):
         series = []
+        for a in g["Agent"].unique():
             subset = g[g["Agent"] == a]
+            vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
             series.append({"name": a, "values": vals})
+        if series:
+            fig = spider_net_multi(metric_labels, series, title=f"{task} — Agent Comparison")
+            fname = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
+            fig.savefig(fname, bbox_inches="tight")
+            plt.close(fig)
+            images.append((fname, f"{task} - radar"))
             fig2, ax = plt.subplots(figsize=(8, 4))
+            avg = g.groupby("Agent")[metric_labels].mean()
             avg.plot(kind="bar", ax=ax)
             ax.set_title(f"{task} — Average Metrics by Agent")
             ax.set_ylabel("Score (0-1)")
             fig2.savefig(fname2, bbox_inches="tight")
             plt.close(fig2)
             images.append((fname2, f"{task} - bar"))
     # Global heatmap
+    metric_cols = metric_labels + ["FinalScore"]
+    figh = heatmap_plot(metrics_df, metric_cols)
+    fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
+    figh.savefig(fnameh, bbox_inches="tight")
+    plt.close(figh)
+    images.append((fnameh, "Metric Correlations Heatmap"))
+    # Leaderboard
     lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
     lb = lb.sort_values(["FinalScore"], ascending=False)
     return metrics_df, images, lb
+# --------------------------
+# DEMO USAGE
+# --------------------------
+if __name__ == "__main__":
+    # Sample dataset
+    data = [
+        {"task": "Math QA", "agent": "AgentA", "prompt": "What is 2+2?", "response": "The answer is 4.", "reference": "2+2=4"},
+        {"task": "Math QA", "agent": "AgentB", "prompt": "What is 2+2?", "response": "It might be 5, but usually 4.", "reference": "2+2=4"},
+        {"task": "Summarization", "agent": "AgentA", "prompt": "Summarize: 'The cat sat on the mat. The dog barked.'", "response": "A cat sat while a dog barked.", "reference": "Cat on mat, dog barking."},
+    ]
+    df = pd.DataFrame(data)
+    metrics_df, images, leaderboard = evaluate_dataframe(df)
+    print("\n=== Metrics per response ===")
+    print(metrics_df[["Task", "Agent", "InstructionFollowing", "Hallucination", "AssumptionControl", "Coherence", "Accuracy", "FinalScore"]])
+    print("\n=== Leaderboard (average per task & agent) ===")
+    print(leaderboard)
+    print("\nVisualization files saved in /tmp/:")
+    for path, caption in images:
+        print(f"{caption}: {path}")