Update evaluator.py
Browse files- evaluator.py +537 -167
evaluator.py
CHANGED
|
@@ -1,31 +1,484 @@
|
|
| 1 |
-
# evaluator.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import re
|
| 3 |
import math
|
| 4 |
-
import os
|
| 5 |
import numpy as np
|
| 6 |
import pandas as pd
|
| 7 |
-
import textstat
|
| 8 |
from typing import Tuple, Dict
|
| 9 |
|
| 10 |
-
#
|
| 11 |
import language_tool_python
|
| 12 |
try:
|
| 13 |
tool = language_tool_python.LanguageToolPublicAPI('en-US')
|
| 14 |
except Exception:
|
| 15 |
-
#
|
| 16 |
-
tool = None
|
| 17 |
|
| 18 |
-
#
|
| 19 |
HALLUCINATION_AVAILABLE = True
|
| 20 |
try:
|
| 21 |
-
|
| 22 |
-
from unieval.metric.evaluator import get_evaluator # optional
|
| 23 |
-
import evaluate # required by hallucination detector
|
| 24 |
import torch
|
| 25 |
-
from transformers import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
from sentence_transformers import SentenceTransformer, util
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
except Exception:
|
| 28 |
HALLUCINATION_AVAILABLE = False
|
|
|
|
|
|
|
| 29 |
|
| 30 |
# -------------------------
|
| 31 |
# Rule-based metrics
|
|
@@ -40,11 +493,7 @@ def check_instruction_following(prompt: str, response: str) -> float:
|
|
| 40 |
return round(matches / len(set(keywords)), 3)
|
| 41 |
|
| 42 |
def check_grammar(response: str) -> Tuple[int, float]:
|
| 43 |
-
"""
|
| 44 |
-
Returns (num_matches, grammar_score_in_0_1)
|
| 45 |
-
grammar_score = 1 - num_matches/10 clipped
|
| 46 |
-
If language tool unavailable, returns (0, 0.8) as a coarse default.
|
| 47 |
-
"""
|
| 48 |
if not response:
|
| 49 |
return 0, 0.0
|
| 50 |
if tool is None:
|
|
@@ -67,29 +516,21 @@ def check_coherence(response: str) -> float:
|
|
| 67 |
return round(val, 3)
|
| 68 |
|
| 69 |
def check_accuracy_embeddings(reference: str, response: str, embed_model=None) -> float:
|
| 70 |
-
"""
|
| 71 |
-
If embed_model passed and reference provided, compute cosine sim.
|
| 72 |
-
Otherwise return 0 or a neutral value.
|
| 73 |
-
"""
|
| 74 |
if not reference or not response or embed_model is None:
|
| 75 |
return 0.0
|
| 76 |
try:
|
| 77 |
ref_emb = embed_model.encode(reference, convert_to_tensor=True)
|
| 78 |
resp_emb = embed_model.encode(response, convert_to_tensor=True)
|
| 79 |
sim = float(util.cos_sim(ref_emb, resp_emb))
|
| 80 |
-
|
| 81 |
-
return round(sim, 3)
|
| 82 |
except Exception:
|
| 83 |
return 0.0
|
| 84 |
|
|
|
|
| 85 |
# -------------------------
|
| 86 |
-
# Hallucination Detector
|
| 87 |
# -------------------------
|
| 88 |
class HallucinationDetectorWrapper:
|
| 89 |
-
"""
|
| 90 |
-
Wraps the ComprehensiveHallucinationDetector logic. Loads heavy models lazily and sets
|
| 91 |
-
DETECTOR_AVAILABLE flag depending on success. If loading fails, methods return neutral stubs.
|
| 92 |
-
"""
|
| 93 |
def __init__(self):
|
| 94 |
self.ready = False
|
| 95 |
self._init_detector()
|
|
@@ -97,45 +538,37 @@ class HallucinationDetectorWrapper:
|
|
| 97 |
def _init_detector(self):
|
| 98 |
global HALLUCINATION_AVAILABLE
|
| 99 |
if not HALLUCINATION_AVAILABLE:
|
| 100 |
-
self.ready = False
|
| 101 |
return
|
| 102 |
try:
|
| 103 |
-
# Import inside to isolate errors
|
| 104 |
-
import evaluate
|
| 105 |
-
import torch
|
| 106 |
-
from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
|
| 107 |
-
from unieval.metric.evaluator import get_evaluator
|
| 108 |
-
# Minimal lightweight choices could be substituted here if you want smaller models
|
| 109 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 110 |
|
| 111 |
-
#
|
| 112 |
self.rouge = evaluate.load('rouge')
|
| 113 |
self.sacrebleu = evaluate.load('sacrebleu')
|
| 114 |
self.bertscore = evaluate.load('bertscore')
|
| 115 |
|
| 116 |
-
#
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
self.ready = False
|
| 139 |
except Exception:
|
| 140 |
self.ready = False
|
| 141 |
|
|
@@ -143,36 +576,23 @@ class HallucinationDetectorWrapper:
|
|
| 143 |
return self.ready
|
| 144 |
|
| 145 |
def detect(self, prompt: str, output: str) -> Dict:
|
| 146 |
-
"""
|
| 147 |
-
If ready, run the comprehensive detector and return dict of metrics.
|
| 148 |
-
If not ready, return neutral placeholder dict.
|
| 149 |
-
"""
|
| 150 |
if not self.ready:
|
| 151 |
-
# Neutral placeholders (so hallucination_score = 0.5 later)
|
| 152 |
return {
|
| 153 |
-
"
|
| 154 |
-
"rouge_l": 0.0,
|
| 155 |
-
"sacrebleu": 0.0,
|
| 156 |
-
"bertscore_f1": 0.0,
|
| 157 |
"unieval_consistency": 0.0,
|
| 158 |
"q_squared_nli_contradiction": 0.5,
|
| 159 |
"critic_contradiction": 0.5
|
| 160 |
}
|
| 161 |
-
# Actual detection implementation (mirrors the code you provided)
|
| 162 |
try:
|
| 163 |
-
# generate knowledge source using judge model
|
| 164 |
input_text = f"Provide a factual answer: {prompt}"
|
| 165 |
input_ids = self.judge_tokenizer(input_text, return_tensors="pt").input_ids.to(self.device)
|
| 166 |
outputs = self.judge_model.generate(input_ids, max_length=384, num_beams=5, early_stopping=True)
|
| 167 |
knowledge_source = self.judge_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 168 |
|
| 169 |
-
# n-gram & semantic
|
| 170 |
rouge_l = self.rouge.compute(predictions=[output], references=[knowledge_source])['rougeL']
|
| 171 |
sacre = self.sacrebleu.compute(predictions=[output], references=[[knowledge_source]])['score'] / 100.0
|
| 172 |
-
|
| 173 |
-
bert_f1 = np.mean(bert_results.get('f1', [0.0]))
|
| 174 |
|
| 175 |
-
# unieval
|
| 176 |
if self.unieval_evaluator:
|
| 177 |
try:
|
| 178 |
ue = self.unieval_evaluator.evaluate([{'source': knowledge_source, 'system_output': output}])[0]['consistency']
|
|
@@ -181,65 +601,23 @@ class HallucinationDetectorWrapper:
|
|
| 181 |
else:
|
| 182 |
ue = 0.0
|
| 183 |
|
| 184 |
-
# q^2
|
| 185 |
-
qg_input = f"generate question: {output}"
|
| 186 |
-
qg_input_ids = self.qg_tokenizer(qg_input, return_tensors="pt").input_ids.to(self.device)
|
| 187 |
-
qg_out = self.qg_model.generate(qg_input_ids, max_length=64, num_beams=4)
|
| 188 |
-
question = self.qg_tokenizer.decode(qg_out[0], skip_special_tokens=True)
|
| 189 |
-
if not question:
|
| 190 |
-
q2_contra = 0.5
|
| 191 |
-
else:
|
| 192 |
-
try:
|
| 193 |
-
qa_inputs = self.qa_tokenizer(question, knowledge_source, return_tensors="pt").to(self.device)
|
| 194 |
-
with torch.no_grad():
|
| 195 |
-
qa_output = self.qa_model(**qa_inputs)
|
| 196 |
-
answer_start = torch.argmax(qa_output.start_logits)
|
| 197 |
-
answer_end = torch.argmax(qa_output.end_logits) + 1
|
| 198 |
-
answer_from_knowledge = self.qa_tokenizer.decode(qa_inputs["input_ids"][0][answer_start:answer_end])
|
| 199 |
-
if not answer_from_knowledge:
|
| 200 |
-
q2_contra = 0.5
|
| 201 |
-
else:
|
| 202 |
-
# NLI: output vs answer_from_knowledge
|
| 203 |
-
tokenized = self.nli_tokenizer(output, answer_from_knowledge, return_tensors='pt', truncation=True, max_length=512).to(self.device)
|
| 204 |
-
with torch.no_grad():
|
| 205 |
-
out = self.nli_model(**tokenized)
|
| 206 |
-
probs = torch.softmax(out.logits, dim=1)[0].tolist()
|
| 207 |
-
q2_contra = probs[0] # contradiction prob
|
| 208 |
-
except Exception:
|
| 209 |
-
q2_contra = 0.5
|
| 210 |
-
|
| 211 |
-
# critic contradiction
|
| 212 |
-
try:
|
| 213 |
-
tokenized2 = self.nli_tokenizer(knowledge_source, output, return_tensors='pt', truncation=True, max_length=512).to(self.device)
|
| 214 |
-
with torch.no_grad():
|
| 215 |
-
out2 = self.nli_model(**tokenized2)
|
| 216 |
-
probs2 = torch.softmax(out2.logits, dim=1)[0].tolist()
|
| 217 |
-
critic_contra = probs2[0]
|
| 218 |
-
except Exception:
|
| 219 |
-
critic_contra = 0.5
|
| 220 |
-
|
| 221 |
return {
|
| 222 |
-
"knowledge_source": knowledge_source,
|
| 223 |
"rouge_l": rouge_l,
|
| 224 |
"sacrebleu": sacre,
|
| 225 |
"bertscore_f1": bert_f1,
|
| 226 |
"unieval_consistency": ue,
|
| 227 |
-
"q_squared_nli_contradiction":
|
| 228 |
-
"critic_contradiction":
|
| 229 |
}
|
| 230 |
except Exception:
|
| 231 |
-
# On any runtime failure, return neutral placeholders
|
| 232 |
return {
|
| 233 |
-
"
|
| 234 |
-
"rouge_l": 0.0,
|
| 235 |
-
"sacrebleu": 0.0,
|
| 236 |
-
"bertscore_f1": 0.0,
|
| 237 |
"unieval_consistency": 0.0,
|
| 238 |
"q_squared_nli_contradiction": 0.5,
|
| 239 |
"critic_contradiction": 0.5
|
| 240 |
}
|
| 241 |
|
| 242 |
-
# Singleton
|
| 243 |
_DETECTOR = None
|
| 244 |
def get_detector():
|
| 245 |
global _DETECTOR
|
|
@@ -258,38 +636,36 @@ def hallucination_score(prompt: str, output: str) -> float:
|
|
| 258 |
}
|
| 259 |
total = sum(weights.values())
|
| 260 |
weights = {k: v/total for k, v in weights.items()}
|
| 261 |
-
|
| 262 |
final = 0.0
|
| 263 |
for m, w in weights.items():
|
| 264 |
v = res.get(m, 0.0)
|
| 265 |
-
if m in
|
| 266 |
v = 1 - v
|
| 267 |
final += w * v
|
| 268 |
-
# final is in [0,1], higher -> more hallucination (worse)
|
| 269 |
return float(final)
|
| 270 |
|
|
|
|
| 271 |
# -------------------------
|
| 272 |
-
# Main evaluation
|
| 273 |
# -------------------------
|
| 274 |
def evaluate_dataframe(df: pd.DataFrame, use_llm_judge: bool = False) -> Tuple[pd.DataFrame, list, pd.DataFrame]:
|
| 275 |
"""
|
| 276 |
-
Input: df with columns prompt
|
| 277 |
-
Returns: metrics_df
|
| 278 |
"""
|
| 279 |
-
# Normalize
|
| 280 |
df = df.rename(columns={c: c.strip() for c in df.columns})
|
| 281 |
-
# Accept alternate column names
|
| 282 |
if "instruction" not in df.columns and "prompt" in df.columns:
|
| 283 |
df = df.rename(columns={"prompt": "instruction"})
|
| 284 |
if "response" not in df.columns and "output" in df.columns:
|
| 285 |
df = df.rename(columns={"output": "response"})
|
| 286 |
if "agent" not in df.columns:
|
| 287 |
-
df["agent"] =
|
| 288 |
|
| 289 |
-
#
|
| 290 |
embed_model = None
|
| 291 |
try:
|
| 292 |
-
from sentence_transformers import SentenceTransformer, util
|
| 293 |
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 294 |
except Exception:
|
| 295 |
embed_model = None
|
|
@@ -303,16 +679,15 @@ def evaluate_dataframe(df: pd.DataFrame, use_llm_judge: bool = False) -> Tuple[p
|
|
| 303 |
task = r.get("task", "Unknown")
|
| 304 |
|
| 305 |
inst_score = check_instruction_following(instr, response)
|
| 306 |
-
|
| 307 |
coh_score = check_coherence(response)
|
| 308 |
acc_emb = check_accuracy_embeddings(reference, response, embed_model)
|
| 309 |
|
| 310 |
-
|
| 311 |
-
base_final = float(sum(base_components) / max(1, len(base_components)))
|
| 312 |
|
| 313 |
row_entry = {
|
| 314 |
-
"Task":
|
| 315 |
-
"Agent":
|
| 316 |
"Instruction": instr,
|
| 317 |
"Response": response,
|
| 318 |
"Reference": reference,
|
|
@@ -323,34 +698,30 @@ def evaluate_dataframe(df: pd.DataFrame, use_llm_judge: bool = False) -> Tuple[p
|
|
| 323 |
"base_final_score": round(base_final, 4)
|
| 324 |
}
|
| 325 |
|
| 326 |
-
# optional LLM judge: compute hallucination_score
|
| 327 |
if use_llm_judge:
|
| 328 |
try:
|
| 329 |
h = hallucination_score(instr, response)
|
| 330 |
-
|
| 331 |
-
|
| 332 |
-
row_entry["score_llm_consistency"] = consistency
|
| 333 |
-
# combine base_final and consistency (simple averaging)
|
| 334 |
-
final_score = round((base_final + consistency) / 2.0, 4)
|
| 335 |
-
row_entry["final_score"] = final_score
|
| 336 |
except Exception:
|
| 337 |
-
# fallback
|
| 338 |
row_entry["score_llm_consistency"] = 0.5
|
| 339 |
-
row_entry["final_score"] =
|
| 340 |
else:
|
| 341 |
row_entry["score_llm_consistency"] = np.nan
|
| 342 |
-
row_entry["final_score"] =
|
| 343 |
|
| 344 |
rows.append(row_entry)
|
| 345 |
|
| 346 |
metrics_df = pd.DataFrame(rows)
|
| 347 |
|
| 348 |
-
#
|
| 349 |
-
images = []
|
| 350 |
import matplotlib.pyplot as plt
|
| 351 |
import seaborn as sns
|
| 352 |
import uuid
|
| 353 |
-
|
|
|
|
|
|
|
|
|
|
| 354 |
try:
|
| 355 |
lb = metrics_df.groupby("Agent")["final_score"].mean().reset_index().sort_values("final_score", ascending=False)
|
| 356 |
fname = f"/tmp/{uuid.uuid4().hex}_leaderboard.png"
|
|
@@ -358,29 +729,27 @@ def evaluate_dataframe(df: pd.DataFrame, use_llm_judge: bool = False) -> Tuple[p
|
|
| 358 |
ax.barh(lb["Agent"], lb["final_score"], color="tab:blue")
|
| 359 |
ax.invert_yaxis()
|
| 360 |
ax.set_xlabel("Average final score")
|
| 361 |
-
ax.set_title("Leaderboard
|
| 362 |
plt.tight_layout()
|
| 363 |
fig.savefig(fname, bbox_inches="tight")
|
| 364 |
plt.close(fig)
|
| 365 |
-
images.append((fname, "Leaderboard
|
| 366 |
except Exception:
|
| 367 |
pass
|
| 368 |
|
| 369 |
-
#
|
| 370 |
try:
|
| 371 |
metric_cols = ["score_instruction", "score_coherence", "score_grammar", "score_accuracy"]
|
| 372 |
if use_llm_judge:
|
| 373 |
metric_cols.append("score_llm_consistency")
|
| 374 |
agg = metrics_df.groupby("Agent")[metric_cols].mean().reset_index()
|
| 375 |
-
labels = [c.replace("score_", "").
|
| 376 |
-
# Build rows as required
|
| 377 |
rows_for_plot = []
|
| 378 |
for _, row in agg.iterrows():
|
| 379 |
-
vals = [float(row[c])
|
| 380 |
rows_for_plot.append({"name": row["Agent"], "values": vals})
|
| 381 |
-
|
| 382 |
-
def spider_net_multi(labels, rows, title="
|
| 383 |
-
import math
|
| 384 |
N = len(labels)
|
| 385 |
angles = [n / float(N) * 2 * math.pi for n in range(N)]
|
| 386 |
angles += angles[:1]
|
|
@@ -392,19 +761,20 @@ def evaluate_dataframe(df: pd.DataFrame, use_llm_judge: bool = False) -> Tuple[p
|
|
| 392 |
for r in rows:
|
| 393 |
v = r["values"] + r["values"][:1]
|
| 394 |
ax.plot(angles, v, label=r["name"])
|
| 395 |
-
ax.fill(angles, v, alpha=0.
|
| 396 |
ax.set_title(title)
|
| 397 |
ax.legend(loc="upper right", bbox_to_anchor=(1.3,1.1))
|
| 398 |
return fig
|
| 399 |
-
|
|
|
|
| 400 |
fname2 = f"/tmp/{uuid.uuid4().hex}_radar.png"
|
| 401 |
fig.savefig(fname2, bbox_inches="tight")
|
| 402 |
plt.close(fig)
|
| 403 |
-
images.append((fname2, "All agents radar
|
| 404 |
except Exception:
|
| 405 |
pass
|
| 406 |
|
| 407 |
-
# Per-task
|
| 408 |
try:
|
| 409 |
for task, subset in metrics_df.groupby("Task"):
|
| 410 |
agg = subset.groupby("Agent")[metric_cols].mean().reset_index()
|
|
@@ -412,17 +782,17 @@ def evaluate_dataframe(df: pd.DataFrame, use_llm_judge: bool = False) -> Tuple[p
|
|
| 412 |
continue
|
| 413 |
rows_for_plot = []
|
| 414 |
for _, row in agg.iterrows():
|
| 415 |
-
vals = [float(row[c])
|
| 416 |
rows_for_plot.append({"name": row["Agent"], "values": vals})
|
| 417 |
-
fig = spider_net_multi(labels, rows_for_plot,
|
| 418 |
fname3 = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
|
| 419 |
fig.savefig(fname3, bbox_inches="tight")
|
| 420 |
plt.close(fig)
|
| 421 |
-
images.append((fname3, f"{task}
|
| 422 |
except Exception:
|
| 423 |
pass
|
| 424 |
|
| 425 |
-
#
|
| 426 |
try:
|
| 427 |
metric_cols2 = ["score_instruction", "score_coherence", "score_grammar", "score_accuracy", "final_score"]
|
| 428 |
if use_llm_judge:
|
|
@@ -437,7 +807,7 @@ def evaluate_dataframe(df: pd.DataFrame, use_llm_judge: bool = False) -> Tuple[p
|
|
| 437 |
except Exception:
|
| 438 |
pass
|
| 439 |
|
| 440 |
-
|
| 441 |
-
leaderboard_df = metrics_df.groupby(["Agent", "Task"])["final_score"].mean().reset_index().sort_values("final_score", ascending=False)
|
| 442 |
|
| 443 |
return metrics_df, images, leaderboard_df
|
|
|
|
|
|
| 1 |
+
# # evaluator.py
|
| 2 |
+
# import re
|
| 3 |
+
# import math
|
| 4 |
+
# import os
|
| 5 |
+
# import numpy as np
|
| 6 |
+
# import pandas as pd
|
| 7 |
+
# import textstat
|
| 8 |
+
# from typing import Tuple, Dict
|
| 9 |
+
|
| 10 |
+
# # Use LanguageTool public API to avoid Java dependency in Spaces
|
| 11 |
+
# import language_tool_python
|
| 12 |
+
# try:
|
| 13 |
+
# tool = language_tool_python.LanguageToolPublicAPI('en-US')
|
| 14 |
+
# except Exception:
|
| 15 |
+
# # final fallback: simple grammar placeholder if network issue
|
| 16 |
+
# tool = None
|
| 17 |
+
|
| 18 |
+
# # Import heavy dependencies lazily inside the hallucination detector to avoid startup OOM
|
| 19 |
+
# HALLUCINATION_AVAILABLE = True
|
| 20 |
+
# try:
|
| 21 |
+
# # 'unieval' import may fail if package not installed; guard it
|
| 22 |
+
# from unieval.metric.evaluator import get_evaluator # optional
|
| 23 |
+
# import evaluate # required by hallucination detector
|
| 24 |
+
# import torch
|
| 25 |
+
# from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
|
| 26 |
+
# from sentence_transformers import SentenceTransformer, util
|
| 27 |
+
# except Exception:
|
| 28 |
+
# HALLUCINATION_AVAILABLE = False
|
| 29 |
+
|
| 30 |
+
# # -------------------------
|
| 31 |
+
# # Rule-based metrics
|
| 32 |
+
# # -------------------------
|
| 33 |
+
# def check_instruction_following(prompt: str, response: str) -> float:
|
| 34 |
+
# prompt = (prompt or "").lower()
|
| 35 |
+
# response = (response or "").lower()
|
| 36 |
+
# keywords = re.findall(r"\b\w+\b", prompt)
|
| 37 |
+
# if not keywords:
|
| 38 |
+
# return 0.0
|
| 39 |
+
# matches = sum(1 for k in set(keywords) if k in response)
|
| 40 |
+
# return round(matches / len(set(keywords)), 3)
|
| 41 |
+
|
| 42 |
+
# def check_grammar(response: str) -> Tuple[int, float]:
|
| 43 |
+
# """
|
| 44 |
+
# Returns (num_matches, grammar_score_in_0_1)
|
| 45 |
+
# grammar_score = 1 - num_matches/10 clipped
|
| 46 |
+
# If language tool unavailable, returns (0, 0.8) as a coarse default.
|
| 47 |
+
# """
|
| 48 |
+
# if not response:
|
| 49 |
+
# return 0, 0.0
|
| 50 |
+
# if tool is None:
|
| 51 |
+
# return 0, 0.8
|
| 52 |
+
# try:
|
| 53 |
+
# matches = tool.check(response)
|
| 54 |
+
# num = len(matches)
|
| 55 |
+
# score = max(0.0, 1 - num / 10)
|
| 56 |
+
# return num, round(score, 3)
|
| 57 |
+
# except Exception:
|
| 58 |
+
# return 0, 0.8
|
| 59 |
+
|
| 60 |
+
# def check_coherence(response: str) -> float:
|
| 61 |
+
# if not response:
|
| 62 |
+
# return 0.0
|
| 63 |
+
# sents = max(1, len(re.split(r"[.!?]+", response)) - 1)
|
| 64 |
+
# words = max(1, len(re.findall(r"\w+", response)))
|
| 65 |
+
# base = min(1.0, (words / 50.0) + (sents / 5.0))
|
| 66 |
+
# val = max(0.5, min(base * 0.9, 0.98))
|
| 67 |
+
# return round(val, 3)
|
| 68 |
+
|
| 69 |
+
# def check_accuracy_embeddings(reference: str, response: str, embed_model=None) -> float:
|
| 70 |
+
# """
|
| 71 |
+
# If embed_model passed and reference provided, compute cosine sim.
|
| 72 |
+
# Otherwise return 0 or a neutral value.
|
| 73 |
+
# """
|
| 74 |
+
# if not reference or not response or embed_model is None:
|
| 75 |
+
# return 0.0
|
| 76 |
+
# try:
|
| 77 |
+
# ref_emb = embed_model.encode(reference, convert_to_tensor=True)
|
| 78 |
+
# resp_emb = embed_model.encode(response, convert_to_tensor=True)
|
| 79 |
+
# sim = float(util.cos_sim(ref_emb, resp_emb))
|
| 80 |
+
# sim = max(0.0, min(1.0, sim))
|
| 81 |
+
# return round(sim, 3)
|
| 82 |
+
# except Exception:
|
| 83 |
+
# return 0.0
|
| 84 |
+
|
| 85 |
+
# # -------------------------
|
| 86 |
+
# # Hallucination Detector wrapper
|
| 87 |
+
# # -------------------------
|
| 88 |
+
# class HallucinationDetectorWrapper:
|
| 89 |
+
# """
|
| 90 |
+
# Wraps the ComprehensiveHallucinationDetector logic. Loads heavy models lazily and sets
|
| 91 |
+
# DETECTOR_AVAILABLE flag depending on success. If loading fails, methods return neutral stubs.
|
| 92 |
+
# """
|
| 93 |
+
# def __init__(self):
|
| 94 |
+
# self.ready = False
|
| 95 |
+
# self._init_detector()
|
| 96 |
+
|
| 97 |
+
# def _init_detector(self):
|
| 98 |
+
# global HALLUCINATION_AVAILABLE
|
| 99 |
+
# if not HALLUCINATION_AVAILABLE:
|
| 100 |
+
# self.ready = False
|
| 101 |
+
# return
|
| 102 |
+
# try:
|
| 103 |
+
# # Import inside to isolate errors
|
| 104 |
+
# import evaluate
|
| 105 |
+
# import torch
|
| 106 |
+
# from transformers import AutoTokenizer, T5ForConditionalGeneration, AutoModelForQuestionAnswering, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM
|
| 107 |
+
# from unieval.metric.evaluator import get_evaluator
|
| 108 |
+
# # Minimal lightweight choices could be substituted here if you want smaller models
|
| 109 |
+
# self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 110 |
+
|
| 111 |
+
# # Load metrics
|
| 112 |
+
# self.rouge = evaluate.load('rouge')
|
| 113 |
+
# self.sacrebleu = evaluate.load('sacrebleu')
|
| 114 |
+
# self.bertscore = evaluate.load('bertscore')
|
| 115 |
+
|
| 116 |
+
# # load unieval if available
|
| 117 |
+
# try:
|
| 118 |
+
# self.unieval_evaluator = get_evaluator('fact')
|
| 119 |
+
# except Exception:
|
| 120 |
+
# self.unieval_evaluator = None
|
| 121 |
+
|
| 122 |
+
# # Load QG / QA / NLI / knowledge gen models
|
| 123 |
+
# # Note: These models may be large; this is inside try/except
|
| 124 |
+
# try:
|
| 125 |
+
# self.qg_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation")
|
| 126 |
+
# self.qg_model = T5ForConditionalGeneration.from_pretrained("mrm8488/t5-base-finetuned-question-generation").to(self.device)
|
| 127 |
+
# self.qa_tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
|
| 128 |
+
# self.qa_model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2").to(self.device)
|
| 129 |
+
# nli_model_name = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
|
| 130 |
+
# self.nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
|
| 131 |
+
# self.nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(self.device)
|
| 132 |
+
# judge_model_name = "google/flan-t5-large"
|
| 133 |
+
# self.judge_tokenizer = AutoTokenizer.from_pretrained(judge_model_name)
|
| 134 |
+
# self.judge_model = AutoModelForSeq2SeqLM.from_pretrained(judge_model_name).to(self.device)
|
| 135 |
+
# self.ready = True
|
| 136 |
+
# except Exception:
|
| 137 |
+
# # If any heavy-model loading fails, disable the detector
|
| 138 |
+
# self.ready = False
|
| 139 |
+
# except Exception:
|
| 140 |
+
# self.ready = False
|
| 141 |
+
|
| 142 |
+
# def is_ready(self):
|
| 143 |
+
# return self.ready
|
| 144 |
+
|
| 145 |
+
# def detect(self, prompt: str, output: str) -> Dict:
|
| 146 |
+
# """
|
| 147 |
+
# If ready, run the comprehensive detector and return dict of metrics.
|
| 148 |
+
# If not ready, return neutral placeholder dict.
|
| 149 |
+
# """
|
| 150 |
+
# if not self.ready:
|
| 151 |
+
# # Neutral placeholders (so hallucination_score = 0.5 later)
|
| 152 |
+
# return {
|
| 153 |
+
# "knowledge_source": "",
|
| 154 |
+
# "rouge_l": 0.0,
|
| 155 |
+
# "sacrebleu": 0.0,
|
| 156 |
+
# "bertscore_f1": 0.0,
|
| 157 |
+
# "unieval_consistency": 0.0,
|
| 158 |
+
# "q_squared_nli_contradiction": 0.5,
|
| 159 |
+
# "critic_contradiction": 0.5
|
| 160 |
+
# }
|
| 161 |
+
# # Actual detection implementation (mirrors the code you provided)
|
| 162 |
+
# try:
|
| 163 |
+
# # generate knowledge source using judge model
|
| 164 |
+
# input_text = f"Provide a factual answer: {prompt}"
|
| 165 |
+
# input_ids = self.judge_tokenizer(input_text, return_tensors="pt").input_ids.to(self.device)
|
| 166 |
+
# outputs = self.judge_model.generate(input_ids, max_length=384, num_beams=5, early_stopping=True)
|
| 167 |
+
# knowledge_source = self.judge_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 168 |
+
|
| 169 |
+
# # n-gram & semantic
|
| 170 |
+
# rouge_l = self.rouge.compute(predictions=[output], references=[knowledge_source])['rougeL']
|
| 171 |
+
# sacre = self.sacrebleu.compute(predictions=[output], references=[[knowledge_source]])['score'] / 100.0
|
| 172 |
+
# bert_results = self.bertscore.compute(predictions=[output], references=[knowledge_source], lang='en')
|
| 173 |
+
# bert_f1 = np.mean(bert_results.get('f1', [0.0]))
|
| 174 |
+
|
| 175 |
+
# # unieval
|
| 176 |
+
# if self.unieval_evaluator:
|
| 177 |
+
# try:
|
| 178 |
+
# ue = self.unieval_evaluator.evaluate([{'source': knowledge_source, 'system_output': output}])[0]['consistency']
|
| 179 |
+
# except Exception:
|
| 180 |
+
# ue = 0.0
|
| 181 |
+
# else:
|
| 182 |
+
# ue = 0.0
|
| 183 |
+
|
| 184 |
+
# # q^2
|
| 185 |
+
# qg_input = f"generate question: {output}"
|
| 186 |
+
# qg_input_ids = self.qg_tokenizer(qg_input, return_tensors="pt").input_ids.to(self.device)
|
| 187 |
+
# qg_out = self.qg_model.generate(qg_input_ids, max_length=64, num_beams=4)
|
| 188 |
+
# question = self.qg_tokenizer.decode(qg_out[0], skip_special_tokens=True)
|
| 189 |
+
# if not question:
|
| 190 |
+
# q2_contra = 0.5
|
| 191 |
+
# else:
|
| 192 |
+
# try:
|
| 193 |
+
# qa_inputs = self.qa_tokenizer(question, knowledge_source, return_tensors="pt").to(self.device)
|
| 194 |
+
# with torch.no_grad():
|
| 195 |
+
# qa_output = self.qa_model(**qa_inputs)
|
| 196 |
+
# answer_start = torch.argmax(qa_output.start_logits)
|
| 197 |
+
# answer_end = torch.argmax(qa_output.end_logits) + 1
|
| 198 |
+
# answer_from_knowledge = self.qa_tokenizer.decode(qa_inputs["input_ids"][0][answer_start:answer_end])
|
| 199 |
+
# if not answer_from_knowledge:
|
| 200 |
+
# q2_contra = 0.5
|
| 201 |
+
# else:
|
| 202 |
+
# # NLI: output vs answer_from_knowledge
|
| 203 |
+
# tokenized = self.nli_tokenizer(output, answer_from_knowledge, return_tensors='pt', truncation=True, max_length=512).to(self.device)
|
| 204 |
+
# with torch.no_grad():
|
| 205 |
+
# out = self.nli_model(**tokenized)
|
| 206 |
+
# probs = torch.softmax(out.logits, dim=1)[0].tolist()
|
| 207 |
+
# q2_contra = probs[0] # contradiction prob
|
| 208 |
+
# except Exception:
|
| 209 |
+
# q2_contra = 0.5
|
| 210 |
+
|
| 211 |
+
# # critic contradiction
|
| 212 |
+
# try:
|
| 213 |
+
# tokenized2 = self.nli_tokenizer(knowledge_source, output, return_tensors='pt', truncation=True, max_length=512).to(self.device)
|
| 214 |
+
# with torch.no_grad():
|
| 215 |
+
# out2 = self.nli_model(**tokenized2)
|
| 216 |
+
# probs2 = torch.softmax(out2.logits, dim=1)[0].tolist()
|
| 217 |
+
# critic_contra = probs2[0]
|
| 218 |
+
# except Exception:
|
| 219 |
+
# critic_contra = 0.5
|
| 220 |
+
|
| 221 |
+
# return {
|
| 222 |
+
# "knowledge_source": knowledge_source,
|
| 223 |
+
# "rouge_l": rouge_l,
|
| 224 |
+
# "sacrebleu": sacre,
|
| 225 |
+
# "bertscore_f1": bert_f1,
|
| 226 |
+
# "unieval_consistency": ue,
|
| 227 |
+
# "q_squared_nli_contradiction": q2_contra,
|
| 228 |
+
# "critic_contradiction": critic_contra
|
| 229 |
+
# }
|
| 230 |
+
# except Exception:
|
| 231 |
+
# # On any runtime failure, return neutral placeholders
|
| 232 |
+
# return {
|
| 233 |
+
# "knowledge_source": "",
|
| 234 |
+
# "rouge_l": 0.0,
|
| 235 |
+
# "sacrebleu": 0.0,
|
| 236 |
+
# "bertscore_f1": 0.0,
|
| 237 |
+
# "unieval_consistency": 0.0,
|
| 238 |
+
# "q_squared_nli_contradiction": 0.5,
|
| 239 |
+
# "critic_contradiction": 0.5
|
| 240 |
+
# }
|
| 241 |
+
|
| 242 |
+
# # Singleton detector instance
|
| 243 |
+
# _DETECTOR = None
|
| 244 |
+
# def get_detector():
|
| 245 |
+
# global _DETECTOR
|
| 246 |
+
# if _DETECTOR is None:
|
| 247 |
+
# _DETECTOR = HallucinationDetectorWrapper()
|
| 248 |
+
# return _DETECTOR
|
| 249 |
+
|
| 250 |
+
# def hallucination_score(prompt: str, output: str) -> float:
|
| 251 |
+
# d = get_detector()
|
| 252 |
+
# res = d.detect(prompt, output)
|
| 253 |
+
# weights = {
|
| 254 |
+
# "rouge_l": 0.2, "sacrebleu": 0.05, "bertscore_f1": 0.25,
|
| 255 |
+
# "unieval_consistency": 0.25,
|
| 256 |
+
# "q_squared_nli_contradiction": 0.15,
|
| 257 |
+
# "critic_contradiction": 0.10
|
| 258 |
+
# }
|
| 259 |
+
# total = sum(weights.values())
|
| 260 |
+
# weights = {k: v/total for k, v in weights.items()}
|
| 261 |
+
# invert_metrics = {"rouge_l", "sacrebleu", "bertscore_f1", "unieval_consistency"}
|
| 262 |
+
# final = 0.0
|
| 263 |
+
# for m, w in weights.items():
|
| 264 |
+
# v = res.get(m, 0.0)
|
| 265 |
+
# if m in invert_metrics:
|
| 266 |
+
# v = 1 - v
|
| 267 |
+
# final += w * v
|
| 268 |
+
# # final is in [0,1], higher -> more hallucination (worse)
|
| 269 |
+
# return float(final)
|
| 270 |
+
|
| 271 |
+
# # -------------------------
|
| 272 |
+
# # Main evaluation function (integrate hallucination as complementary metric)
|
| 273 |
+
# # -------------------------
|
| 274 |
+
# def evaluate_dataframe(df: pd.DataFrame, use_llm_judge: bool = False) -> Tuple[pd.DataFrame, list, pd.DataFrame]:
|
| 275 |
+
# """
|
| 276 |
+
# Input: df with columns prompt (or instruction), response, task, agent, reference (opt)
|
| 277 |
+
# Returns: metrics_df (per row), list of visualization image paths (path, caption), leaderboard_df
|
| 278 |
+
# """
|
| 279 |
+
# # Normalize column names
|
| 280 |
+
# df = df.rename(columns={c: c.strip() for c in df.columns})
|
| 281 |
+
# # Accept alternate column names
|
| 282 |
+
# if "instruction" not in df.columns and "prompt" in df.columns:
|
| 283 |
+
# df = df.rename(columns={"prompt": "instruction"})
|
| 284 |
+
# if "response" not in df.columns and "output" in df.columns:
|
| 285 |
+
# df = df.rename(columns={"output": "response"})
|
| 286 |
+
# if "agent" not in df.columns:
|
| 287 |
+
# df["agent"] = df.get("metadata", {}).apply(lambda x: x.get("agent") if isinstance(x, dict) else "Unknown")
|
| 288 |
+
|
| 289 |
+
# # optional embed model for accuracy: lazy load sentence-transformers if available
|
| 290 |
+
# embed_model = None
|
| 291 |
+
# try:
|
| 292 |
+
# from sentence_transformers import SentenceTransformer, util
|
| 293 |
+
# embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 294 |
+
# except Exception:
|
| 295 |
+
# embed_model = None
|
| 296 |
+
|
| 297 |
+
# rows = []
|
| 298 |
+
# for _, r in df.iterrows():
|
| 299 |
+
# instr = str(r.get("instruction", ""))
|
| 300 |
+
# response = str(r.get("response", ""))
|
| 301 |
+
# reference = str(r.get("reference", "")) if "reference" in r else ""
|
| 302 |
+
# agent = r.get("agent", "Unknown")
|
| 303 |
+
# task = r.get("task", "Unknown")
|
| 304 |
+
|
| 305 |
+
# inst_score = check_instruction_following(instr, response)
|
| 306 |
+
# num_matches, grammar_score = check_grammar(response)
|
| 307 |
+
# coh_score = check_coherence(response)
|
| 308 |
+
# acc_emb = check_accuracy_embeddings(reference, response, embed_model)
|
| 309 |
+
|
| 310 |
+
# base_components = [inst_score, coh_score, grammar_score, acc_emb]
|
| 311 |
+
# base_final = float(sum(base_components) / max(1, len(base_components)))
|
| 312 |
+
|
| 313 |
+
# row_entry = {
|
| 314 |
+
# "Task": str(task),
|
| 315 |
+
# "Agent": str(agent),
|
| 316 |
+
# "Instruction": instr,
|
| 317 |
+
# "Response": response,
|
| 318 |
+
# "Reference": reference,
|
| 319 |
+
# "score_instruction": inst_score,
|
| 320 |
+
# "score_grammar": grammar_score,
|
| 321 |
+
# "score_coherence": coh_score,
|
| 322 |
+
# "score_accuracy": acc_emb,
|
| 323 |
+
# "base_final_score": round(base_final, 4)
|
| 324 |
+
# }
|
| 325 |
+
|
| 326 |
+
# # optional LLM judge: compute hallucination_score
|
| 327 |
+
# if use_llm_judge:
|
| 328 |
+
# try:
|
| 329 |
+
# h = hallucination_score(instr, response)
|
| 330 |
+
# # convert to consistency (higher is better): 1 - hallucination
|
| 331 |
+
# consistency = round(1.0 - float(h), 4)
|
| 332 |
+
# row_entry["score_llm_consistency"] = consistency
|
| 333 |
+
# # combine base_final and consistency (simple averaging)
|
| 334 |
+
# final_score = round((base_final + consistency) / 2.0, 4)
|
| 335 |
+
# row_entry["final_score"] = final_score
|
| 336 |
+
# except Exception:
|
| 337 |
+
# # fallback
|
| 338 |
+
# row_entry["score_llm_consistency"] = 0.5
|
| 339 |
+
# row_entry["final_score"] = round(base_final, 4)
|
| 340 |
+
# else:
|
| 341 |
+
# row_entry["score_llm_consistency"] = np.nan
|
| 342 |
+
# row_entry["final_score"] = round(base_final, 4)
|
| 343 |
+
|
| 344 |
+
# rows.append(row_entry)
|
| 345 |
+
|
| 346 |
+
# metrics_df = pd.DataFrame(rows)
|
| 347 |
+
|
| 348 |
+
# # Create visualizations (saved to /tmp)
|
| 349 |
+
# images = []
|
| 350 |
+
# import matplotlib.pyplot as plt
|
| 351 |
+
# import seaborn as sns
|
| 352 |
+
# import uuid
|
| 353 |
+
# # Leaderboard (avg final score per agent)
|
| 354 |
+
# try:
|
| 355 |
+
# lb = metrics_df.groupby("Agent")["final_score"].mean().reset_index().sort_values("final_score", ascending=False)
|
| 356 |
+
# fname = f"/tmp/{uuid.uuid4().hex}_leaderboard.png"
|
| 357 |
+
# fig, ax = plt.subplots(figsize=(8, max(4, len(lb)*0.4)))
|
| 358 |
+
# ax.barh(lb["Agent"], lb["final_score"], color="tab:blue")
|
| 359 |
+
# ax.invert_yaxis()
|
| 360 |
+
# ax.set_xlabel("Average final score")
|
| 361 |
+
# ax.set_title("Leaderboard: Avg final score per agent")
|
| 362 |
+
# plt.tight_layout()
|
| 363 |
+
# fig.savefig(fname, bbox_inches="tight")
|
| 364 |
+
# plt.close(fig)
|
| 365 |
+
# images.append((fname, "Leaderboard (horizontal bar)"))
|
| 366 |
+
# except Exception:
|
| 367 |
+
# pass
|
| 368 |
+
|
| 369 |
+
# # Combined spider / radar : compare all agents across metrics
|
| 370 |
+
# try:
|
| 371 |
+
# metric_cols = ["score_instruction", "score_coherence", "score_grammar", "score_accuracy"]
|
| 372 |
+
# if use_llm_judge:
|
| 373 |
+
# metric_cols.append("score_llm_consistency")
|
| 374 |
+
# agg = metrics_df.groupby("Agent")[metric_cols].mean().reset_index()
|
| 375 |
+
# labels = [c.replace("score_", "").replace("_", " ").capitalize() for c in metric_cols]
|
| 376 |
+
# # Build rows as required
|
| 377 |
+
# rows_for_plot = []
|
| 378 |
+
# for _, row in agg.iterrows():
|
| 379 |
+
# vals = [float(row[c]) * 100 for c in metric_cols] # scale to 0-100
|
| 380 |
+
# rows_for_plot.append({"name": row["Agent"], "values": vals})
|
| 381 |
+
# # draw radar using a small internal function
|
| 382 |
+
# def spider_net_multi(labels, rows, title="Spider Chart"):
|
| 383 |
+
# import math
|
| 384 |
+
# N = len(labels)
|
| 385 |
+
# angles = [n / float(N) * 2 * math.pi for n in range(N)]
|
| 386 |
+
# angles += angles[:1]
|
| 387 |
+
# fig = plt.figure(figsize=(6.5,6.5))
|
| 388 |
+
# ax = plt.subplot(111, polar=True)
|
| 389 |
+
# ax.set_xticks(angles[:-1])
|
| 390 |
+
# ax.set_xticklabels(labels)
|
| 391 |
+
# ax.set_ylim(0, 100)
|
| 392 |
+
# for r in rows:
|
| 393 |
+
# v = r["values"] + r["values"][:1]
|
| 394 |
+
# ax.plot(angles, v, label=r["name"])
|
| 395 |
+
# ax.fill(angles, v, alpha=0.12)
|
| 396 |
+
# ax.set_title(title)
|
| 397 |
+
# ax.legend(loc="upper right", bbox_to_anchor=(1.3,1.1))
|
| 398 |
+
# return fig
|
| 399 |
+
# fig = spider_net_multi(labels, rows_for_plot, title="All Agents Comparison (Radar)")
|
| 400 |
+
# fname2 = f"/tmp/{uuid.uuid4().hex}_radar.png"
|
| 401 |
+
# fig.savefig(fname2, bbox_inches="tight")
|
| 402 |
+
# plt.close(fig)
|
| 403 |
+
# images.append((fname2, "All agents radar chart"))
|
| 404 |
+
# except Exception:
|
| 405 |
+
# pass
|
| 406 |
+
|
| 407 |
+
# # Per-task spider charts
|
| 408 |
+
# try:
|
| 409 |
+
# for task, subset in metrics_df.groupby("Task"):
|
| 410 |
+
# agg = subset.groupby("Agent")[metric_cols].mean().reset_index()
|
| 411 |
+
# if agg.shape[0] == 0:
|
| 412 |
+
# continue
|
| 413 |
+
# rows_for_plot = []
|
| 414 |
+
# for _, row in agg.iterrows():
|
| 415 |
+
# vals = [float(row[c]) * 100 for c in metric_cols]
|
| 416 |
+
# rows_for_plot.append({"name": row["Agent"], "values": vals})
|
| 417 |
+
# fig = spider_net_multi(labels, rows_for_plot, title=f"{task} Agents (Radar)")
|
| 418 |
+
# fname3 = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
|
| 419 |
+
# fig.savefig(fname3, bbox_inches="tight")
|
| 420 |
+
# plt.close(fig)
|
| 421 |
+
# images.append((fname3, f"{task} - radar"))
|
| 422 |
+
# except Exception:
|
| 423 |
+
# pass
|
| 424 |
+
|
| 425 |
+
# # Heatmap for metric correlations
|
| 426 |
+
# try:
|
| 427 |
+
# metric_cols2 = ["score_instruction", "score_coherence", "score_grammar", "score_accuracy", "final_score"]
|
| 428 |
+
# if use_llm_judge:
|
| 429 |
+
# metric_cols2.append("score_llm_consistency")
|
| 430 |
+
# fig, ax = plt.subplots(figsize=(7,6))
|
| 431 |
+
# sns.heatmap(metrics_df[metric_cols2].corr(), annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
|
| 432 |
+
# ax.set_title("Metric correlations")
|
| 433 |
+
# fnameh = f"/tmp/{uuid.uuid4().hex}_heatmap.png"
|
| 434 |
+
# fig.savefig(fnameh, bbox_inches="tight")
|
| 435 |
+
# plt.close(fig)
|
| 436 |
+
# images.append((fnameh, "Metric correlations"))
|
| 437 |
+
# except Exception:
|
| 438 |
+
# pass
|
| 439 |
+
|
| 440 |
+
# # Leaderboard df return
|
| 441 |
+
# leaderboard_df = metrics_df.groupby(["Agent", "Task"])["final_score"].mean().reset_index().sort_values("final_score", ascending=False)
|
| 442 |
+
|
| 443 |
+
# return metrics_df, images, leaderboard_df
|
| 444 |
+
|
| 445 |
+
|
| 446 |
import re
|
| 447 |
import math
|
|
|
|
| 448 |
import numpy as np
|
| 449 |
import pandas as pd
|
|
|
|
| 450 |
from typing import Tuple, Dict
|
| 451 |
|
| 452 |
+
# Grammar checker
|
| 453 |
import language_tool_python
|
| 454 |
try:
|
| 455 |
tool = language_tool_python.LanguageToolPublicAPI('en-US')
|
| 456 |
except Exception:
|
| 457 |
+
tool = None # fallback if API not available
|
|
|
|
| 458 |
|
| 459 |
+
# Heavy dependencies – guard unieval
|
| 460 |
HALLUCINATION_AVAILABLE = True
|
| 461 |
try:
|
| 462 |
+
import evaluate
|
|
|
|
|
|
|
| 463 |
import torch
|
| 464 |
+
from transformers import (
|
| 465 |
+
AutoTokenizer,
|
| 466 |
+
T5ForConditionalGeneration,
|
| 467 |
+
AutoModelForQuestionAnswering,
|
| 468 |
+
AutoModelForSequenceClassification,
|
| 469 |
+
AutoModelForSeq2SeqLM
|
| 470 |
+
)
|
| 471 |
from sentence_transformers import SentenceTransformer, util
|
| 472 |
+
try:
|
| 473 |
+
from unieval.metric.evaluator import get_evaluator # optional
|
| 474 |
+
UNIEVAL_AVAILABLE = True
|
| 475 |
+
except ImportError:
|
| 476 |
+
print("[Warning] UniEval not installed – skipping UniEval metrics.")
|
| 477 |
+
UNIEVAL_AVAILABLE = False
|
| 478 |
except Exception:
|
| 479 |
HALLUCINATION_AVAILABLE = False
|
| 480 |
+
UNIEVAL_AVAILABLE = False
|
| 481 |
+
|
| 482 |
|
| 483 |
# -------------------------
|
| 484 |
# Rule-based metrics
|
|
|
|
| 493 |
return round(matches / len(set(keywords)), 3)
|
| 494 |
|
| 495 |
def check_grammar(response: str) -> Tuple[int, float]:
|
| 496 |
+
"""Returns (num_matches, grammar_score)."""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 497 |
if not response:
|
| 498 |
return 0, 0.0
|
| 499 |
if tool is None:
|
|
|
|
| 516 |
return round(val, 3)
|
| 517 |
|
| 518 |
def check_accuracy_embeddings(reference: str, response: str, embed_model=None) -> float:
    """Semantic-accuracy score: cosine similarity of reference vs. response embeddings.

    Encodes both texts with *embed_model* (a sentence-transformers model) and
    returns their cosine similarity clamped to [0, 1], rounded to 3 decimals.
    Returns 0.0 when either text is empty, when no model is supplied, or when
    encoding / similarity computation fails for any reason (best-effort metric).
    """
    # Guard clauses: without both texts and a model there is nothing to score.
    if embed_model is None or not reference or not response:
        return 0.0
    try:
        ref_vec, resp_vec = (
            embed_model.encode(text, convert_to_tensor=True)
            for text in (reference, response)
        )
        similarity = float(util.cos_sim(ref_vec, resp_vec))
    except Exception:
        # Deliberate best-effort fallback: any encoding/similarity failure -> 0.0
        return 0.0
    # Cosine similarity can be negative; clamp into [0, 1] before rounding.
    return round(min(1.0, max(0.0, similarity)), 3)
|
| 528 |
|
| 529 |
+
|
| 530 |
# -------------------------
|
| 531 |
+
# Hallucination Detector
|
| 532 |
# -------------------------
|
| 533 |
class HallucinationDetectorWrapper:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 534 |
def __init__(self):
|
| 535 |
self.ready = False
|
| 536 |
self._init_detector()
|
|
|
|
| 538 |
def _init_detector(self):
|
| 539 |
global HALLUCINATION_AVAILABLE
|
| 540 |
if not HALLUCINATION_AVAILABLE:
|
|
|
|
| 541 |
return
|
| 542 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
self.device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 544 |
|
| 545 |
+
# metrics
|
| 546 |
self.rouge = evaluate.load('rouge')
|
| 547 |
self.sacrebleu = evaluate.load('sacrebleu')
|
| 548 |
self.bertscore = evaluate.load('bertscore')
|
| 549 |
|
| 550 |
+
# UniEval if available
|
| 551 |
+
self.unieval_evaluator = None
|
| 552 |
+
if UNIEVAL_AVAILABLE:
|
| 553 |
+
try:
|
| 554 |
+
from unieval.metric.evaluator import get_evaluator
|
| 555 |
+
self.unieval_evaluator = get_evaluator('fact')
|
| 556 |
+
except Exception:
|
| 557 |
+
self.unieval_evaluator = None
|
| 558 |
+
|
| 559 |
+
# load smaller models
|
| 560 |
+
self.qg_tokenizer = AutoTokenizer.from_pretrained("mrm8488/t5-base-finetuned-question-generation")
|
| 561 |
+
self.qg_model = T5ForConditionalGeneration.from_pretrained("mrm8488/t5-base-finetuned-question-generation").to(self.device)
|
| 562 |
+
self.qa_tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
|
| 563 |
+
self.qa_model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2").to(self.device)
|
| 564 |
+
nli_model_name = "ynie/roberta-large-snli_mnli_fever_anli_R1_R2_R3-nli"
|
| 565 |
+
self.nli_tokenizer = AutoTokenizer.from_pretrained(nli_model_name)
|
| 566 |
+
self.nli_model = AutoModelForSequenceClassification.from_pretrained(nli_model_name).to(self.device)
|
| 567 |
+
judge_model_name = "google/flan-t5-large"
|
| 568 |
+
self.judge_tokenizer = AutoTokenizer.from_pretrained(judge_model_name)
|
| 569 |
+
self.judge_model = AutoModelForSeq2SeqLM.from_pretrained(judge_model_name).to(self.device)
|
| 570 |
+
|
| 571 |
+
self.ready = True
|
|
|
|
| 572 |
except Exception:
|
| 573 |
self.ready = False
|
| 574 |
|
|
|
|
| 576 |
return self.ready
|
| 577 |
|
| 578 |
def detect(self, prompt: str, output: str) -> Dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 579 |
if not self.ready:
|
|
|
|
| 580 |
return {
|
| 581 |
+
"rouge_l": 0.0, "sacrebleu": 0.0, "bertscore_f1": 0.0,
|
|
|
|
|
|
|
|
|
|
| 582 |
"unieval_consistency": 0.0,
|
| 583 |
"q_squared_nli_contradiction": 0.5,
|
| 584 |
"critic_contradiction": 0.5
|
| 585 |
}
|
|
|
|
| 586 |
try:
|
|
|
|
| 587 |
input_text = f"Provide a factual answer: {prompt}"
|
| 588 |
input_ids = self.judge_tokenizer(input_text, return_tensors="pt").input_ids.to(self.device)
|
| 589 |
outputs = self.judge_model.generate(input_ids, max_length=384, num_beams=5, early_stopping=True)
|
| 590 |
knowledge_source = self.judge_tokenizer.decode(outputs[0], skip_special_tokens=True)
|
| 591 |
|
|
|
|
| 592 |
rouge_l = self.rouge.compute(predictions=[output], references=[knowledge_source])['rougeL']
|
| 593 |
sacre = self.sacrebleu.compute(predictions=[output], references=[[knowledge_source]])['score'] / 100.0
|
| 594 |
+
bert_f1 = np.mean(self.bertscore.compute(predictions=[output], references=[knowledge_source], lang='en')['f1'])
|
|
|
|
| 595 |
|
|
|
|
| 596 |
if self.unieval_evaluator:
|
| 597 |
try:
|
| 598 |
ue = self.unieval_evaluator.evaluate([{'source': knowledge_source, 'system_output': output}])[0]['consistency']
|
|
|
|
| 601 |
else:
|
| 602 |
ue = 0.0
|
| 603 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 604 |
return {
|
|
|
|
| 605 |
"rouge_l": rouge_l,
|
| 606 |
"sacrebleu": sacre,
|
| 607 |
"bertscore_f1": bert_f1,
|
| 608 |
"unieval_consistency": ue,
|
| 609 |
+
"q_squared_nli_contradiction": 0.5,
|
| 610 |
+
"critic_contradiction": 0.5
|
| 611 |
}
|
| 612 |
except Exception:
|
|
|
|
| 613 |
return {
|
| 614 |
+
"rouge_l": 0.0, "sacrebleu": 0.0, "bertscore_f1": 0.0,
|
|
|
|
|
|
|
|
|
|
| 615 |
"unieval_consistency": 0.0,
|
| 616 |
"q_squared_nli_contradiction": 0.5,
|
| 617 |
"critic_contradiction": 0.5
|
| 618 |
}
|
| 619 |
|
| 620 |
+
# Singleton
|
| 621 |
_DETECTOR = None
|
| 622 |
def get_detector():
|
| 623 |
global _DETECTOR
|
|
|
|
| 636 |
}
|
| 637 |
total = sum(weights.values())
|
| 638 |
weights = {k: v/total for k, v in weights.items()}
|
| 639 |
+
invert = {"rouge_l", "sacrebleu", "bertscore_f1", "unieval_consistency"}
|
| 640 |
final = 0.0
|
| 641 |
for m, w in weights.items():
|
| 642 |
v = res.get(m, 0.0)
|
| 643 |
+
if m in invert:
|
| 644 |
v = 1 - v
|
| 645 |
final += w * v
|
|
|
|
| 646 |
return float(final)
|
| 647 |
|
| 648 |
+
|
| 649 |
# -------------------------
|
| 650 |
+
# Main evaluation
|
| 651 |
# -------------------------
|
| 652 |
def evaluate_dataframe(df: pd.DataFrame, use_llm_judge: bool = False) -> Tuple[pd.DataFrame, list, pd.DataFrame]:
|
| 653 |
"""
|
| 654 |
+
Input: df with columns [prompt, response, task, agent, reference (opt)]
|
| 655 |
+
Returns: (metrics_df, images, leaderboard_df)
|
| 656 |
"""
|
| 657 |
+
# Normalize colnames
|
| 658 |
df = df.rename(columns={c: c.strip() for c in df.columns})
|
|
|
|
| 659 |
if "instruction" not in df.columns and "prompt" in df.columns:
|
| 660 |
df = df.rename(columns={"prompt": "instruction"})
|
| 661 |
if "response" not in df.columns and "output" in df.columns:
|
| 662 |
df = df.rename(columns={"output": "response"})
|
| 663 |
if "agent" not in df.columns:
|
| 664 |
+
df["agent"] = "Unknown"
|
| 665 |
|
| 666 |
+
# sentence-transformers model for accuracy
|
| 667 |
embed_model = None
|
| 668 |
try:
|
|
|
|
| 669 |
embed_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 670 |
except Exception:
|
| 671 |
embed_model = None
|
|
|
|
| 679 |
task = r.get("task", "Unknown")
|
| 680 |
|
| 681 |
inst_score = check_instruction_following(instr, response)
|
| 682 |
+
_, grammar_score = check_grammar(response)
|
| 683 |
coh_score = check_coherence(response)
|
| 684 |
acc_emb = check_accuracy_embeddings(reference, response, embed_model)
|
| 685 |
|
| 686 |
+
base_final = float(np.mean([inst_score, grammar_score, coh_score, acc_emb]))
|
|
|
|
| 687 |
|
| 688 |
row_entry = {
|
| 689 |
+
"Task": task,
|
| 690 |
+
"Agent": agent,
|
| 691 |
"Instruction": instr,
|
| 692 |
"Response": response,
|
| 693 |
"Reference": reference,
|
|
|
|
| 698 |
"base_final_score": round(base_final, 4)
|
| 699 |
}
|
| 700 |
|
|
|
|
| 701 |
if use_llm_judge:
|
| 702 |
try:
|
| 703 |
h = hallucination_score(instr, response)
|
| 704 |
+
row_entry["score_llm_consistency"] = round(1.0 - h, 4)
|
| 705 |
+
row_entry["final_score"] = round((base_final + (1.0 - h)) / 2, 4)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 706 |
except Exception:
|
|
|
|
| 707 |
row_entry["score_llm_consistency"] = 0.5
|
| 708 |
+
row_entry["final_score"] = base_final
|
| 709 |
else:
|
| 710 |
row_entry["score_llm_consistency"] = np.nan
|
| 711 |
+
row_entry["final_score"] = base_final
|
| 712 |
|
| 713 |
rows.append(row_entry)
|
| 714 |
|
| 715 |
metrics_df = pd.DataFrame(rows)
|
| 716 |
|
| 717 |
+
# ---------- Visualizations ----------
|
|
|
|
| 718 |
import matplotlib.pyplot as plt
|
| 719 |
import seaborn as sns
|
| 720 |
import uuid
|
| 721 |
+
|
| 722 |
+
images = []
|
| 723 |
+
|
| 724 |
+
# Leaderboard
|
| 725 |
try:
|
| 726 |
lb = metrics_df.groupby("Agent")["final_score"].mean().reset_index().sort_values("final_score", ascending=False)
|
| 727 |
fname = f"/tmp/{uuid.uuid4().hex}_leaderboard.png"
|
|
|
|
| 729 |
ax.barh(lb["Agent"], lb["final_score"], color="tab:blue")
|
| 730 |
ax.invert_yaxis()
|
| 731 |
ax.set_xlabel("Average final score")
|
| 732 |
+
ax.set_title("Leaderboard")
|
| 733 |
plt.tight_layout()
|
| 734 |
fig.savefig(fname, bbox_inches="tight")
|
| 735 |
plt.close(fig)
|
| 736 |
+
images.append((fname, "Leaderboard"))
|
| 737 |
except Exception:
|
| 738 |
pass
|
| 739 |
|
| 740 |
+
# Radar chart (all agents)
|
| 741 |
try:
|
| 742 |
metric_cols = ["score_instruction", "score_coherence", "score_grammar", "score_accuracy"]
|
| 743 |
if use_llm_judge:
|
| 744 |
metric_cols.append("score_llm_consistency")
|
| 745 |
agg = metrics_df.groupby("Agent")[metric_cols].mean().reset_index()
|
| 746 |
+
labels = [c.replace("score_", "").capitalize() for c in metric_cols]
|
|
|
|
| 747 |
rows_for_plot = []
|
| 748 |
for _, row in agg.iterrows():
|
| 749 |
+
vals = [float(row[c])*100 for c in metric_cols]
|
| 750 |
rows_for_plot.append({"name": row["Agent"], "values": vals})
|
| 751 |
+
|
| 752 |
+
def spider_net_multi(labels, rows, title="Radar"):
|
|
|
|
| 753 |
N = len(labels)
|
| 754 |
angles = [n / float(N) * 2 * math.pi for n in range(N)]
|
| 755 |
angles += angles[:1]
|
|
|
|
| 761 |
for r in rows:
|
| 762 |
v = r["values"] + r["values"][:1]
|
| 763 |
ax.plot(angles, v, label=r["name"])
|
| 764 |
+
ax.fill(angles, v, alpha=0.1)
|
| 765 |
ax.set_title(title)
|
| 766 |
ax.legend(loc="upper right", bbox_to_anchor=(1.3,1.1))
|
| 767 |
return fig
|
| 768 |
+
|
| 769 |
+
fig = spider_net_multi(labels, rows_for_plot, "All Agents Comparison")
|
| 770 |
fname2 = f"/tmp/{uuid.uuid4().hex}_radar.png"
|
| 771 |
fig.savefig(fname2, bbox_inches="tight")
|
| 772 |
plt.close(fig)
|
| 773 |
+
images.append((fname2, "All agents radar"))
|
| 774 |
except Exception:
|
| 775 |
pass
|
| 776 |
|
| 777 |
+
# Per-task radar
|
| 778 |
try:
|
| 779 |
for task, subset in metrics_df.groupby("Task"):
|
| 780 |
agg = subset.groupby("Agent")[metric_cols].mean().reset_index()
|
|
|
|
| 782 |
continue
|
| 783 |
rows_for_plot = []
|
| 784 |
for _, row in agg.iterrows():
|
| 785 |
+
vals = [float(row[c])*100 for c in metric_cols]
|
| 786 |
rows_for_plot.append({"name": row["Agent"], "values": vals})
|
| 787 |
+
fig = spider_net_multi(labels, rows_for_plot, f"{task} Agents")
|
| 788 |
fname3 = f"/tmp/{uuid.uuid4().hex}_{task}_radar.png"
|
| 789 |
fig.savefig(fname3, bbox_inches="tight")
|
| 790 |
plt.close(fig)
|
| 791 |
+
images.append((fname3, f"{task} radar"))
|
| 792 |
except Exception:
|
| 793 |
pass
|
| 794 |
|
| 795 |
+
# Correlation heatmap
|
| 796 |
try:
|
| 797 |
metric_cols2 = ["score_instruction", "score_coherence", "score_grammar", "score_accuracy", "final_score"]
|
| 798 |
if use_llm_judge:
|
|
|
|
| 807 |
except Exception:
|
| 808 |
pass
|
| 809 |
|
| 810 |
+
leaderboard_df = metrics_df.groupby(["Agent","Task"])["final_score"].mean().reset_index().sort_values("final_score", ascending=False)
|
|
|
|
| 811 |
|
| 812 |
return metrics_df, images, leaderboard_df
|
| 813 |
+
|