Spaces:

Supastrikas-004
/

evaluation-framework

Runtime error

App Files Files Community

Supastrikas-004 committed on Sep 13, 2025

Commit

a095974

verified ·

1 Parent(s): badc270

Update evaluator.py

Browse files

Files changed (1) hide show

evaluator.py +270 -37

evaluator.py CHANGED Viewed

@@ -1,38 +1,271 @@
 import pandas as pd
-import textstat
-import language_tool_python
-# Use Public API version (does NOT require Java)
-tool = language_tool_python.LanguageToolPublicAPI('en-US')
-def evaluate_responses(df, use_llm_judge=False):
-    scores = []
-    for _, row in df.iterrows():
-        response = row["response"]
-        # Rule-based metrics
-        grammar_matches = len(tool.check(response))
-        readability = textstat.flesch_reading_ease(response)
-        # Simple scoring
-        instruction_follow = 1 if row["instruction"].lower() in response.lower() else 0
-        coherence = 1 if readability > 40 else 0
-        grammar_score = max(0, 1 - grammar_matches / 10)
-        final_score = (instruction_follow + coherence + grammar_score) / 3
-        # Optional LLM judge (stubbed for now, can hook Hugging Face API later)
-        if use_llm_judge:
-            final_score = (final_score + 0.8) / 2  # Example: trust LLM judge
-        scores.append({
-            "agent": row["agent"],
-            "instruction": row["instruction"],
-            "response": response,
-            "score_instruction": instruction_follow,
-            "score_coherence": coherence,
-            "score_grammar": grammar_score,
-            "final_score": final_score
-        })
-    return pd.DataFrame(scores)

+# evaluator.py
+"""
+Evaluation module: loads models (lightweight), computes metrics, and creates visualizations.
+No Java required.
+"""
+import re
+import math
+import uuid
+import os
+from typing import List, Dict, Tuple
+import numpy as np
 import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from sentence_transformers import SentenceTransformer, util
+# --------------------------
+# MODEL LOADING (CPU-friendly)
+# --------------------------
+# Everything in this section runs at import time: importing this module
+# executes the from_pretrained / SentenceTransformer calls below.
+# Use small/medium models appropriate for Spaces.
+NLI_MODEL = "textattack/roberta-base-MNLI"
+EMBED_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
+# Load NLI model & tokenizer (on CPU)
+nli_tokenizer = AutoTokenizer.from_pretrained(NLI_MODEL)
+nli_model = AutoModelForSequenceClassification.from_pretrained(NLI_MODEL)
+nli_model.to("cpu")
+# Put the module in evaluation mode; it is only used for inference here.
+nli_model.eval()
+# Load embedding model
+embed_model = SentenceTransformer(EMBED_MODEL)
+# Upper-cased {class index: label} mapping taken from the model config
+# (e.g., {0: 'CONTRADICTION', 1:'NEUTRAL', 2:'ENTAILMENT'}).
+# check_hallucination picks classes by the substrings "ENTAIL" / "CONTRA".
+# NOTE(review): confirm this checkpoint's config provides descriptive label
+# names; generic names such as LABEL_0 would never match those substrings.
+id2label = {int(k): v.upper() for k, v in nli_model.config.id2label.items()}
+# --------------------------
+# METRIC FUNCTIONS
+# --------------------------
+def check_instruction_following(prompt: str, response: str) -> float:
+    """Return the share of distinct prompt words that the response also uses.
+
+    Both texts are lower-cased and split into word tokens. Matching is done
+    on whole tokens, so a short prompt word such as "a" is not counted just
+    because that letter occurs inside some longer response word.
+
+    Args:
+        prompt: Instruction text; None or "" yields a score of 0.0.
+        response: Model output; None is treated as "".
+
+    Returns:
+        matched_unique_prompt_words / unique_prompt_words, rounded to three
+        decimals, in [0, 1].
+    """
+    prompt_words = set(re.findall(r"\b\w+\b", (prompt or "").lower()))
+    if not prompt_words:
+        return 0.0
+    response_words = set(re.findall(r"\b\w+\b", (response or "").lower()))
+    # Set intersection gives whole-word overlap in a single pass.
+    matched = len(prompt_words & response_words)
+    return round(matched / len(prompt_words), 3)
+def check_hallucination(reference: str, response: str) -> Tuple[float, float]:
+    """
+    Score the (reference, response) pair with the NLI classifier.
+    The reference is passed as the first sequence and the response as the
+    second; class probabilities come from a softmax over the logits.
+    Returns (entail_prob, contra_prob), each rounded to three decimals.
+    Returns (0.0, 0.0) when either text is empty or missing.
+    """
+    if not (reference and response):
+        return 0.0, 0.0
+    with torch.no_grad():
+        encoded = nli_tokenizer.encode_plus(reference, response, return_tensors="pt", truncation=True)
+        logits = nli_model(**encoded).logits
+        class_probs = torch.softmax(logits, dim=-1).cpu().numpy()[0]
+    # Pick the two classes of interest by label name rather than fixed index.
+    entail_prob, contra_prob = 0.0, 0.0
+    for class_idx, prob in enumerate(class_probs):
+        name = id2label.get(class_idx, "").upper()
+        if "ENTAIL" in name:
+            entail_prob = float(prob)
+        # "CONTRA" also covers the full word "CONTRADICTION".
+        if "CONTRA" in name:
+            contra_prob = float(prob)
+    return round(entail_prob, 3), round(contra_prob, 3)
+def check_assumption(response: str) -> float:
+    """Score how free the response is of speculative (hedging) language.
+
+    Each hedge term counts at most once, however often it appears. Terms are
+    matched on word boundaries, so "mighty" is not mistaken for "might".
+
+    Args:
+        response: Model output; empty or None yields 0.0.
+
+    Returns:
+        1.0 with no hedge terms, dropping by one third per distinct term
+        found and bottoming out at 0.0 from three terms on (rounded to
+        three decimals).
+    """
+    if not response:
+        return 0.0
+    speculative_terms = ["maybe", "probably", "might", "perhaps", "i guess", "seems", "could"]
+    text = response.lower()
+    count = sum(
+        1
+        for term in speculative_terms
+        if re.search(r"\b" + re.escape(term) + r"\b", text)
+    )
+    score = 1.0 - min(count / 3.0, 1.0)
+    return round(score, 3)
+def check_coherence(response: str) -> float:
+    """Length-based stand-in for a real coherence metric.
+
+    The score grows with the number of words and of sentence-terminator runs
+    (".", "!", "?"), is scaled by 0.9 and clamped to [0.5, 0.98]. Because the
+    unscaled value is capped at 1.0, the result never actually exceeds 0.9.
+    Meant to be replaced by a grammar/perplexity measure later.
+
+    Returns 0.0 for an empty response, otherwise the clamped score rounded
+    to three decimals.
+    """
+    if not response:
+        return 0.0
+    # Pieces after splitting minus one, with a floor of one sentence.
+    sentence_count = len(re.split(r"[.!?]+", response)) - 1
+    if sentence_count < 1:
+        sentence_count = 1
+    word_count = len(re.findall(r"\w+", response))
+    if word_count < 1:
+        word_count = 1
+    raw = (word_count / 50.0) + (sentence_count / 5.0)
+    if raw > 1.0:
+        raw = 1.0
+    scaled = raw * 0.9
+    bounded = max(0.5, min(scaled, 0.98))
+    return round(bounded, 3)
+def check_accuracy(reference: str, response: str) -> float:
+    """Cosine similarity of the reference and response sentence embeddings.
+
+    Returns 0.0 when either text is empty or missing; otherwise the
+    similarity clipped to [0, 1] and rounded to three decimals.
+    """
+    if not (reference and response):
+        return 0.0
+    reference_vec = embed_model.encode(reference, convert_to_tensor=True)
+    response_vec = embed_model.encode(response, convert_to_tensor=True)
+    similarity = float(util.cos_sim(reference_vec, response_vec).item())
+    # Cosine similarity can be negative; treat that as "no similarity".
+    clipped = max(0.0, min(1.0, similarity))
+    return round(clipped, 3)
+# --------------------------
+# AGGREGATION & SCORING
+# --------------------------
+def compute_row_scores(prompt, response, reference) -> Dict:
+    """Run every metric for one prompt/response/reference triple.
+
+    Returns a dict with the individual metric values, the raw entailment and
+    contradiction probabilities, and "FinalScore": the plain average of the
+    five component scores (instruction following, combined hallucination
+    metric, assumption control, coherence, accuracy), rounded to three
+    decimals.
+    """
+    instruction_score = check_instruction_following(prompt, response)
+    entail, contra = check_hallucination(reference, response)
+    assumption_score = check_assumption(response)
+    coherence_score = check_coherence(response)
+    accuracy_score = check_accuracy(reference, response)
+    # One positive hallucination score: high entailment is good, high
+    # contradiction is bad.
+    hallucination_score = round(max(0.0, min(1.0, entail * (1 - contra))), 3)
+    result = {
+        "InstructionFollowing": instruction_score,
+        "Hallucination_Entail": entail,
+        "Hallucination_Contra": contra,
+        "Hallucination_Metric": hallucination_score,
+        "AssumptionControl": assumption_score,
+        "Coherence": coherence_score,
+        "Accuracy": accuracy_score,
+    }
+    # FinalScore: simple average of the five components (all in [0,1]).
+    averaged_keys = ("InstructionFollowing", "Hallucination_Metric", "AssumptionControl", "Coherence", "Accuracy")
+    component_values = [result[key] for key in averaged_keys]
+    result["FinalScore"] = round(float(sum(component_values) / len(component_values)), 3)
+    return result
+# --------------------------
+# VISUALIZATION HELPERS
+# --------------------------
+def spider_net_multi(labels: List[str], rows: List[Dict], title: str = "Spider (Radar) Chart", fill_alpha: float = 0.12):
+    """
+    Draw one radar (spider) chart with a polygon per entry in rows.
+    labels: axis names, one spoke each, spread evenly around the circle.
+    rows: list of {"name": str, "values": [v1,...,vN]}; values are lists on
+    the 0-100 scale used by the radial axis, in the same order as labels.
+    Returns the Matplotlib figure; the caller saves and closes it.
+    """
+    spoke_count = len(labels)
+    spoke_angles = [i / float(spoke_count) * 2 * math.pi for i in range(spoke_count)]
+    # Repeat the first angle so every outline ends where it started.
+    closed_angles = spoke_angles + spoke_angles[:1]
+    fig = plt.figure(figsize=(6.5, 6.5))
+    ax = fig.add_subplot(111, polar=True)
+    ax.set_xticks(closed_angles[:-1])
+    ax.set_xticklabels(labels, fontsize=9)
+    # Radial axis fixed to the 0-100 range.
+    ax.set_ylim(0, 100)
+    ax.set_yticks([0, 25, 50, 75, 100])
+    for entry in rows:
+        polygon = entry["values"] + entry["values"][:1]
+        ax.plot(closed_angles, polygon, linewidth=1.5, label=entry["name"])
+        ax.fill(closed_angles, polygon, alpha=fill_alpha)
+    ax.set_title(title, y=1.08, fontsize=12)
+    ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))
+    return fig
+def heatmap_plot(df: pd.DataFrame, metric_cols: List[str], title: str = "Metric Correlations"):
+    """Return a figure showing the annotated correlation matrix of metric_cols in df."""
+    correlations = df[metric_cols].corr()
+    fig, ax = plt.subplots(figsize=(7, 5))
+    sns.heatmap(correlations, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
+    ax.set_title(title)
+    return fig
+def bar_plot_avg(df: pd.DataFrame, metric_cols: List[str], title: str = "Average Metric Scores per Agent"):
+    """Return a grouped bar chart of the mean of each metric per "Agent" in df."""
+    # groupby already leaves "Agent" as the index, which is what the bar
+    # plot uses for the x axis.
+    per_agent_mean = df.groupby("Agent")[metric_cols].mean()
+    fig, ax = plt.subplots(figsize=(10, 5))
+    per_agent_mean[metric_cols].plot(kind="bar", ax=ax)
+    ax.set_title(title)
+    ax.set_ylabel("Score (0 - 1)")
+    plt.xticks(rotation=45)
+    plt.tight_layout()
+    return fig
+# --------------------------
+# HIGH-LEVEL EVALUATION (batch)
+# --------------------------
+def _clean_text(value) -> str:
+    """Return value as text, turning missing cells (None/NaN/NA) into ""."""
+    if isinstance(value, str):
+        return value
+    try:
+        missing = bool(pd.isna(value))
+    except (TypeError, ValueError):
+        # Containers make pd.isna return an array with no single truth value.
+        missing = False
+    return "" if missing else str(value)
+def _safe_filename_part(text: str) -> str:
+    """Reduce text to characters that are safe inside a file name."""
+    return re.sub(r"[^A-Za-z0-9._-]+", "_", text).strip("._") or "task"
+def _task_bar_figure(group: pd.DataFrame, metric_labels: List[str], task: str):
+    """Build the per-agent average bar chart for one task and return the figure."""
+    fig, ax = plt.subplots(figsize=(8, 4))
+    try:
+        group.groupby("Agent")[metric_labels].mean().plot(kind="bar", ax=ax)
+        ax.set_title(f"{task} — Average Metrics by Agent")
+        ax.set_ylabel("Score (0-1)")
+        plt.xticks(rotation=45)
+    except Exception:
+        plt.close(fig)  # do not leak a half-built figure
+        raise
+    return fig
+def _render_chart(path: str, caption: str, build, *args, **kwargs) -> bool:
+    """Build a figure with build(*args, **kwargs), save it to path, close it.
+
+    Charts are best-effort: a failure is logged with its traceback and
+    reported as False instead of aborting the whole evaluation.
+    """
+    import logging  # local import keeps this block self-contained
+    try:
+        fig = build(*args, **kwargs)
+        try:
+            fig.savefig(path, bbox_inches="tight")
+        finally:
+            plt.close(fig)
+    except Exception:
+        logging.getLogger(__name__).exception("Could not render chart: %s", caption)
+        return False
+    return True
+def evaluate_dataframe(df: pd.DataFrame, output_dir: str = "/tmp") -> Tuple[pd.DataFrame, List[Tuple[str, str]], pd.DataFrame]:
+    """
+    Score every row of df and render comparison charts.
+
+    df columns used: prompt, response, task, agent, reference. Any of them may
+    be absent; missing cells (None/NaN) are treated as empty text, and a
+    missing agent or task becomes "Unknown". If there is no "agent" column
+    but a "metadata" column holds dicts, the agent is read from
+    metadata["agent"].
+
+    output_dir: directory that receives the PNG files (created if needed).
+
+    Returns (metrics_df, images, leaderboard):
+      metrics_df  - one row per input row with Task, Agent, Prompt, Response,
+                    Reference and all metric columns.
+      images      - list of (image_path, caption) for the charts that were
+                    rendered successfully.
+      leaderboard - mean FinalScore per (Agent, Task), best first.
+    For an empty df, metrics_df and leaderboard are empty and images is [].
+    """
+    # Normalize column names (only strings can carry stray whitespace).
+    df = df.rename(columns={c: c.strip() if isinstance(c, str) else c for c in df.columns})
+    # try to extract agent from metadata if not present
+    if "agent" not in df.columns and "metadata" in df.columns:
+        df["agent"] = df["metadata"].apply(lambda m: m.get("agent") if isinstance(m, dict) else None)
+    rows = []
+    for _, r in df.iterrows():
+        # Coerce to text up front: NaN is truthy and would otherwise slip
+        # past the "empty input" guards inside the metric functions.
+        prompt = _clean_text(r.get("prompt", ""))
+        response = _clean_text(r.get("response", ""))
+        reference = _clean_text(r.get("reference", ""))
+        agent = _clean_text(r.get("agent", "Unknown")) or "Unknown"
+        task = _clean_text(r.get("task", "Unknown")).strip() or "Unknown"
+        entry = {
+            "Task": task,
+            "Agent": agent,
+            "Prompt": prompt,
+            "Response": response,
+            "Reference": reference,
+        }
+        entry.update(compute_row_scores(prompt, response, reference))
+        rows.append(entry)
+    metrics_df = pd.DataFrame(rows)
+    images = []
+    if metrics_df.empty:
+        # Nothing to group or plot; hand back well-formed empty results.
+        return metrics_df, images, pd.DataFrame(columns=["Agent", "Task", "FinalScore"])
+    os.makedirs(output_dir, exist_ok=True)
+    metric_labels = ["InstructionFollowing", "Hallucination_Metric", "AssumptionControl", "Coherence", "Accuracy"]
+    # Per-task charts: one radar and one bar chart comparing agents.
+    for task, g in metrics_df.groupby("Task"):
+        # The task name is free text; never put it into a path unfiltered.
+        safe_task = _safe_filename_part(str(task))
+        series = []
+        for a in g["Agent"].unique().tolist():
+            subset = g[g["Agent"] == a]
+            # convert to 0-100 scale for plot
+            vals = [round(float(subset[m].mean()) * 100, 2) for m in metric_labels]
+            series.append({"name": a, "values": vals})
+        if not series:
+            continue
+        radar_path = os.path.join(output_dir, f"{uuid.uuid4().hex}_{safe_task}_radar.png")
+        radar_caption = f"{task} - radar"
+        if _render_chart(radar_path, radar_caption, spider_net_multi, metric_labels, series, title=f"{task} — Agent Comparison"):
+            images.append((radar_path, radar_caption))
+        bar_path = os.path.join(output_dir, f"{uuid.uuid4().hex}_{safe_task}_bar.png")
+        bar_caption = f"{task} - bar"
+        if _render_chart(bar_path, bar_caption, _task_bar_figure, g, metric_labels, task):
+            images.append((bar_path, bar_caption))
+    # Global heatmap of metric correlations.
+    heatmap_path = os.path.join(output_dir, f"{uuid.uuid4().hex}_heatmap.png")
+    heatmap_caption = "Metric Correlations Heatmap"
+    if _render_chart(heatmap_path, heatmap_caption, heatmap_plot, metrics_df, metric_labels + ["FinalScore"]):
+        images.append((heatmap_path, heatmap_caption))
+    # Leaderboard: average final score per (agent, task) pair, best first.
+    lb = metrics_df.groupby(["Agent", "Task"])["FinalScore"].mean().reset_index()
+    lb = lb.sort_values(["FinalScore"], ascending=False)
+    return metrics_df, images, lb