Upload 4 files
- evaluator_module.py +288 -0
- report_generator.py +168 -0
- requirements.txt +29 -0
- visualizer_module.py +183 -0
evaluator_module.py
ADDED
@@ -0,0 +1,288 @@
import re
import numpy as np
from typing import Dict, List, Optional, Tuple
import json
from collections import defaultdict
import spacy
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import hashlib
from datetime import datetime
import concurrent.futures
import random

class AetherScoreEvaluator:
    def __init__(self):
        # NLP models
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Downloading 'en_core_web_sm' spacy model...")
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load("en_core_web_sm")

        # LLM judge model
        self.judge_model = pipeline(
            "text2text-generation",
            model="google/flan-t5-base",
            device=-1  # set to 0 for GPU
        )

        # Sentence transformer for sentence -> embedding
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Scoring weights (domain-specific weights can be added for better results)
        self.weights = {'instruction_following': 0.25, 'hallucination_score': 0.20,
                        'assumption_control': 0.20, 'coherence': 0.20, 'accuracy': 0.15}

        # In-memory cache
        self.cache = {}

    # LLM judge (HF-model-based version, kept for reference)
    # def _evaluate_with_llm_judge(self, prompt: str, response: str) -> Dict:
    #
    #     print("Using HF model LLM Judge...")
    #
    #     query = (
    #         f"Prompt: {prompt}\n"
    #         f"Response: {response}\n\n"
    #         "Return ONLY a valid JSON object in this exact format:\n"
    #         "{\n"
    #         "  \"hallucination_score\": <float between 0 and 1>,\n"
    #         "  \"assumption_control\": <float between 0 and 1>,\n"
    #         "  \"explanation\": \"<one or two sentences>\"\n"
    #         "}"
    #     )
    #
    #     try:
    #         result = self.judge_model(query, max_new_tokens=128, truncation=True)
    #         output = result[0]['generated_text']
    #     except Exception as e:
    #         return {
    #             "hallucination_score": (0.1, f"HF model failed: {e}"),
    #             "assumption_control": (0.1, f"HF model failed: {e}")
    #         }
    #
    #     # Default values
    #     halluc_score = random.uniform(0.3, 0.7)
    #     assumption_score = random.uniform(0.3, 0.7)
    #     explanation = output
    #
    #     try:
    #         parsed = json.loads(output.strip())
    #         halluc_score = float(parsed.get("hallucination_score", halluc_score))
    #         assumption_score = float(parsed.get("assumption_control", assumption_score))
    #         explanation = parsed.get("explanation", explanation)
    #     except Exception:
    #         pass  # fall back to defaults
    #
    #     return {
    #         "hallucination_score": (halluc_score, explanation),
    #         "assumption_control": (assumption_score, explanation)
    #     }

    # LLM judge (rule-based stand-in for the HF model above)
    def _evaluate_with_llm_judge(self, prompt: str, response: str) -> Dict:

        print("Using rule-based evaluation instead of HF LLM...")

        prompt_words = set(prompt.lower().split())
        response_words = response.lower().split()

        # Hallucination score: fraction of words in the response that are not in the prompt
        if response_words:
            halluc_score = len([w for w in response_words if w not in prompt_words]) / len(response_words)
        else:
            halluc_score = 0.1

        # Assumption control: fraction of sentences starting with uncertain words
        uncertain_starts = ("i assume", "maybe", "probably", "likely", "could be")
        sentences = response.lower().split(".")
        if sentences:
            assumption_score = sum(0.3 for s in sentences if s.strip().startswith(uncertain_starts)) / len(sentences)
        else:
            assumption_score = 0.1

        # Ensure scores are between 0 and 1
        halluc_score = max(0.0, min(1.0, halluc_score))
        assumption_score = max(0.0, min(1.0, assumption_score))

        explanation = "Rule-based evaluation applied."

        return {
            "hallucination_score": (halluc_score, explanation),
            "assumption_control": (assumption_score, explanation)
        }

    # Single evaluation.
    # Inputs: prompt, agent response, expected answer (optional), and task type
    # (general, QA, summarization, etc.).
    def evaluate_single(self, prompt: str, response: str, expected_answer: Optional[str] = None, task_type: str = "general") -> Dict:

        # Generate the eval ID
        eval_id = self._generate_eval_id(prompt, response)

        # If already stored in the cache, we could return it directly from there.
        # if eval_id in self.cache:
        #     return self.cache[eval_id]

        scores, reasons = {}, {}

        # Scores and reasons for hallucination and assumption control from the LLM judge
        llm_judge_results = self._evaluate_with_llm_judge(prompt, response)
        scores['hallucination_score'], reasons['hallucination_score'] = llm_judge_results['hallucination_score']
        scores['assumption_control'], reasons['assumption_control'] = llm_judge_results['assumption_control']

        # Evaluate instruction following, coherence and accuracy
        scores['instruction_following'], reasons['instruction_following'] = self._evaluate_instruction_following(prompt, response)
        scores['coherence'], reasons['coherence'] = self._evaluate_coherence(response)
        scores['accuracy'], reasons['accuracy'] = self._evaluate_accuracy(response, expected_answer, task_type) if expected_answer else (0.5, "No expected answer provided.")

        # Calculate the overall score
        scores['overall_score'] = self._calculate_overall_score(scores)
        reasons['overall_score'] = "Weighted average of the component scores."

        # Add eval ID, timestamp and task type to the scores
        scores.update({'eval_id': eval_id, 'timestamp': datetime.now().isoformat(), 'task_type': task_type})

        # Combine scores and reasons into the result
        result = {"scores": scores, "reasons": reasons}

        # Store the result under its eval ID in the cache
        # self.cache[eval_id] = result

        return result

    # Batch evaluation (input from a JSON/CSV file)
    def evaluate_batch(self, data: List[Dict], mode: str = "comprehensive") -> List[Dict]:
        """Process a batch of evaluations in parallel."""

        results = []

        # Evaluate a single item and attach its metadata
        def process_item(item):
            # Call the single-evaluation function for one prompt/response pair
            eval_result = self.evaluate_single(
                prompt=item.get('prompt', ''),
                response=item.get('response', ''),
                expected_answer=item.get('expected_answer', ''),
                task_type=item.get('task_type', 'general')
            )
            # Combine with the original metadata
            eval_result.update({
                'task_id': item.get('task_id', eval_result['scores']['eval_id']),
                'agent_name': item.get('agent_name', 'Unknown'),
            })
            return eval_result

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_item = {executor.submit(process_item, item): item for item in data}
            for future in concurrent.futures.as_completed(future_to_item):
                try:
                    results.append(future.result())
                except Exception as exc:
                    print(f'An item generated an exception: {exc}')

        return results

    # Instruction-following evaluation (prompt, response)
    def _evaluate_instruction_following(self, prompt: str, response: str) -> Tuple[float, str]:
        score, checks, passed = 1.0, 0, 0

        # Check for negative constraints
        negations = re.findall(r"(don't|do not|avoid|without) ([\w\s,]+)", prompt.lower())
        for _, constraint_phrase in negations:
            checks += 1
            words_to_avoid = [w.strip() for w in constraint_phrase.split(',')]
            if not any(word in response.lower() for word in words_to_avoid if len(word) > 2):
                passed += 1

        # Fall back to semantic similarity if no specific instructions are found
        if checks == 0:
            sim = self._semantic_similarity(prompt, response)
            return sim, f"No specific constraints found. Score based on semantic similarity ({sim:.2f}) to prompt."

        # Final score calculation
        score = passed / checks if checks > 0 else 1.0
        reason = f"{passed}/{checks} specific constraints were followed."

        return score, reason

    # Coherence evaluation (response)
    def _evaluate_coherence(self, response: str) -> Tuple[float, str]:

        # Extract sentences from the response
        doc = self.nlp(response)
        sentences = [sent.text for sent in doc.sents]

        # If there is only one sentence, coherence is neutral
        if len(sentences) < 2:
            return 0.7, "Coherence is neutral for single-sentence responses."

        # Fetch embeddings from the sentence model
        embeddings = self.sentence_model.encode(sentences)
        sims = [cosine_similarity([embeddings[i]], [embeddings[i+1]])[0][0] for i in range(len(sentences)-1)]

        score = np.mean(sims)

        reason = f"Average sentence-to-sentence similarity score is {score:.2f} across {len(sentences)} sentences."
        return score, reason

    # Accuracy evaluation (response, expected, task_type)
    def _evaluate_accuracy(self, response: str, expected: str, task_type: str) -> Tuple[float, str]:
        sim = self._semantic_similarity(response, expected)
        reason = f"Semantic similarity between response and expected answer is {sim:.2f}."
        if sim > 0.95:
            reason += " (High match)"
        elif sim < 0.5:
            reason += " (Low match)"
        return sim, reason

    # Overall score
    def _calculate_overall_score(self, scores: Dict) -> float:
        total, weight_sum = 0.0, 0.0
        for metric, weight in self.weights.items():
            if metric in scores:
                total += scores[metric] * weight
                weight_sum += weight
        return total / weight_sum if weight_sum > 0 else 0.5

    # Explanation generator (work in progress)
    def generate_explanation(self, scores: Dict) -> str:
        explanation = []
        overall = scores.get('overall_score', 0)
        explanation.append(f"Overall Score: {overall:.2f}/1.00 - Reflects a weighted average of all dimensions.")

        if scores.get('instruction_following', 0) < 0.6:
            explanation.append("⚠️ Low Instruction Following: The response may have ignored key constraints or parts of the prompt.")
        if scores.get('hallucination_score', 0) < 0.6:
            explanation.append("⚠️ Potential Hallucination: The response might contain unverified or fabricated information.")
        if scores.get('accuracy', 0) < 0.6 and scores.get('accuracy', 0.5) != 0.5:
            explanation.append("⚠️ Low Accuracy: The response significantly differs from the provided expected answer.")

        if not explanation[1:]:
            explanation.append("✅ Great Performance: The agent performed well across the primary evaluation dimensions.")

        return "\n".join(explanation)

    # Per-agent scores
    def get_agent_scores_from_results(self, results: List[Dict]) -> Dict[str, List[float]]:
        agent_scores = defaultdict(list)
        for result in results:
            agent_name = result.get('agent_name', 'Unknown')
            overall_score = result.get('scores', {}).get('overall_score', 0)
            agent_scores[agent_name].append(overall_score)
        return agent_scores

    # Helper functions
    def _generate_eval_id(self, prompt: str, response: str) -> str:
        return hashlib.md5(f"{prompt}{response}".encode()).hexdigest()[:12]

    def _semantic_similarity(self, text1: str, text2: str) -> float:
        if not text1 or not text2:
            return 0.0
        emb1 = self.sentence_model.encode([text1])
        emb2 = self.sentence_model.encode([text2])
        return cosine_similarity(emb1, emb2)[0][0]
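A minimal usage sketch for the evaluator (illustrative only, not part of the uploaded files; the prompts, responses, agent names and task IDs below are invented, and evaluator_module.py is assumed to be importable from the Space's working directory):

# Hypothetical usage sketch for AetherScoreEvaluator.
from evaluator_module import AetherScoreEvaluator

evaluator = AetherScoreEvaluator()

# Single prompt/response pair
single = evaluator.evaluate_single(
    prompt="Summarize the article without mentioning dates.",
    response="The article describes a new evaluation framework for LLM agents.",
    expected_answer="A summary of the evaluation framework.",
    task_type="summarization",
)
print(single["scores"]["overall_score"])
print(evaluator.generate_explanation(single["scores"]))

# Batch of items (each dict mirrors the keys read by evaluate_batch)
batch_results = evaluator.evaluate_batch([
    {"task_id": "t1", "agent_name": "Agent A", "prompt": "Q1?", "response": "A1.", "expected_answer": "A1.", "task_type": "qa"},
    {"task_id": "t1", "agent_name": "Agent B", "prompt": "Q1?", "response": "Something else.", "expected_answer": "A1.", "task_type": "qa"},
])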
report_generator.py
ADDED
@@ -0,0 +1,168 @@
# report_generator.py

from typing import Dict, List
import numpy as np
from collections import defaultdict
from datetime import datetime

class ReportGenerator:
    """
    Generates textual and HTML reports from evaluation results.
    """

    def generate_batch_report(self, results: List[Dict]) -> str:
        """
        Generates a comprehensive text report for a batch evaluation.

        Args:
            results: A list of evaluation result dictionaries.

        Returns:
            A formatted string containing the batch evaluation report.
        """
        if not results:
            return "No evaluation results to report."

        num_evals = len(results)
        agent_names = list(set(r['agent_name'] for r in results))
        num_agents = len(agent_names)

        # Aggregate scores
        overall_scores = [r['scores']['overall_score'] for r in results]
        metric_scores = defaultdict(list)
        agent_overall_scores = defaultdict(list)

        for res in results:
            agent_overall_scores[res['agent_name']].append(res['scores']['overall_score'])
            for metric, score in res['scores'].items():
                if metric not in ['eval_id', 'timestamp', 'task_type']:
                    metric_scores[metric].append(score)

        # Calculate agent averages
        agent_avg_scores = {agent: np.mean(scores) for agent, scores in agent_overall_scores.items()}
        top_agent = max(agent_avg_scores, key=agent_avg_scores.get)
        bottom_agent = min(agent_avg_scores, key=agent_avg_scores.get)

        # Build report string
        report = []
        report.append("="*50)
        report.append(" AetherScore - Batch Evaluation Report")
        report.append("="*50)
        report.append(f"Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        report.append("--- Summary ---")
        report.append(f"Total Evaluations: {num_evals}")
        report.append(f"Number of Agents: {num_agents}")
        report.append(f"Overall Average Score: {np.mean(overall_scores):.3f}\n")

        report.append("--- Agent Performance ---")
        report.append(f"Top Performing Agent: {top_agent} (Avg Score: {agent_avg_scores[top_agent]:.3f})")
        report.append(f"Agent with most room for improvement: {bottom_agent} (Avg Score: {agent_avg_scores[bottom_agent]:.3f})\n")

        report.append("--- Metric Breakdown (Average Scores) ---")
        for metric, scores in metric_scores.items():
            metric_name = metric.replace('_', ' ').title()
            report.append(f"- {metric_name:<25}: {np.mean(scores):.3f}")

        report.append("\n" + "="*50)

        return "\n".join(report)

    def generate_comparison_report(
        self,
        agent1_results: List[Dict],
        agent2_results: List[Dict]
    ) -> str:
        """
        Generates a text report comparing two agents.

        Args:
            agent1_results: Evaluation results for the first agent.
            agent2_results: Evaluation results for the second agent.

        Returns:
            A formatted string comparing the two agents.
        """
        if not agent1_results or not agent2_results:
            return "Insufficient data for comparison. Please provide results for both agents."

        agent1_name = agent1_results[0].get('agent_name', 'Agent 1')
        agent2_name = agent2_results[0].get('agent_name', 'Agent 2')

        # Calculate average scores for each agent
        metrics = ['overall_score', 'instruction_following', 'hallucination_score',
                   'assumption_control', 'coherence', 'accuracy']

        avg_scores1 = {m: np.mean([r['scores'].get(m, 0) for r in agent1_results]) for m in metrics}
        avg_scores2 = {m: np.mean([r['scores'].get(m, 0) for r in agent2_results]) for m in metrics}

        # Build report string
        report = []
        report.append("="*60)
        report.append(f" Agent Comparison Report: {agent1_name} vs. {agent2_name}")
        report.append("="*60)
        report.append(f"Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        # Overall winner
        winner = agent1_name if avg_scores1['overall_score'] > avg_scores2['overall_score'] else agent2_name
        report.append("--- Overall Performance ---")
        report.append(f"🏆 Winner: {winner}")
        report.append(f"{agent1_name} Avg Overall Score: {avg_scores1['overall_score']:.3f}")
        report.append(f"{agent2_name} Avg Overall Score: {avg_scores2['overall_score']:.3f}\n")

        report.append("--- Detailed Metric Comparison ---")
        header = f"{'Metric':<25} | {agent1_name:<10} | {agent2_name:<10} | {'Delta':<8} | {'Winner'}"
        report.append(header)
        report.append("-"*len(header))

        for metric in metrics:
            s1 = avg_scores1[metric]
            s2 = avg_scores2[metric]
            delta = s2 - s1
            metric_winner = agent1_name if s1 > s2 else agent2_name if s2 > s1 else "Tie"
            metric_name = metric.replace('_', ' ').title()

            report.append(f"{metric_name:<25} | {s1:<10.3f} | {s2:<10.3f} | {delta:<+8.3f} | {metric_winner}")

        report.append("\n" + "="*60)

        return "\n".join(report)

    def generate_html_report(self, results_data: List[Dict]) -> str:
        """
        Generates a basic HTML report from evaluation results.

        Args:
            results_data: A list of evaluation result dictionaries.

        Returns:
            A string containing a full HTML report.
        """
        report_str = self.generate_batch_report(results_data)

        # Basic HTML template
        html_template = f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>AetherScore Evaluation Report</title>
            <style>
                body {{ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.6; color: #333; }}
                .container {{ max-width: 800px; margin: auto; padding: 20px; }}
                h1 {{ color: #4a4a4a; }}
                pre {{ background: #f4f4f4; padding: 15px; border-radius: 5px; white-space: pre-wrap; }}
            </style>
        </head>
        <body>
            <div class="container">
                <h1>AetherScore Evaluation Report</h1>
                <p>This report contains a summary of the batch evaluation results.</p>
                <pre>{report_str}</pre>
                <p><em>Note: This is a text-based summary. For interactive visualizations, please use the AetherScore dashboard.</em></p>
            </div>
        </body>
        </html>
        """
        return html_template
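A minimal sketch of how these report methods might be called (illustrative only; batch_results is assumed to be the output of AetherScoreEvaluator.evaluate_batch from the sketch after evaluator_module.py, and the agent names are invented):

# Hypothetical usage sketch for ReportGenerator.
from report_generator import ReportGenerator

reporter = ReportGenerator()

# Text report over a whole batch (expects 'agent_name' and 'scores' in each result)
print(reporter.generate_batch_report(batch_results))

# Comparison report: split the batch by agent first
agent_a = [r for r in batch_results if r["agent_name"] == "Agent A"]
agent_b = [r for r in batch_results if r["agent_name"] == "Agent B"]
print(reporter.generate_comparison_report(agent_a, agent_b))

# HTML wrapper around the same text report
with open("report.html", "w", encoding="utf-8") as f:
    f.write(reporter.generate_html_report(batch_results))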
requirements.txt
ADDED
@@ -0,0 +1,29 @@
# Core Framework
gradio

# Data Handling & Numerical Operations
pandas
numpy
scikit-learn

# Visualization
plotly
plotly-express
seaborn
matplotlib

# NLP & Machine Learning
spacy==3.7.4
transformers
sentence-transformers
torch

# spaCy model (ensures the model is downloaded during setup)
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

evaluate
rouge-score
nltk
absl-py
sacrebleu
bert-score
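For a local run outside the Space's own build, these dependencies would typically be installed with pip install -r requirements.txt; the pinned wheel URL pulls the en_core_web_sm model at install time, and evaluator_module.py additionally falls back to spacy.cli.download("en_core_web_sm") if the model is missing at runtime.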
visualizer_module.py
ADDED
@@ -0,0 +1,183 @@
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from typing import Dict, List

class EvaluationVisualizer:
    def __init__(self):
        self.metric_colors = {
            'instruction_following': '#667eea', 'hallucination_score': '#48bb78',
            'assumption_control': '#f6ad55', 'coherence': '#63b3ed',
            'accuracy': '#fc8181', 'overall_score': '#764ba2'
        }

    # Spider chart with multi-dimensional scores for a single evaluation
    def create_spider_chart(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:

        metrics = ['Instruction\nFollowing', 'Hallucination\nControl', 'Assumption\nControl', 'Coherence', 'Accuracy']
        values = [
            scores.get('instruction_following', 0), scores.get('hallucination_score', 0),
            scores.get('assumption_control', 0), scores.get('coherence', 0),
            scores.get('accuracy', 0)
        ]
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=values, theta=metrics, fill='toself', name=agent_name, line=dict(color=self.metric_colors['instruction_following'])))
        fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title=f"{agent_name} - Performance Spider Chart", template='plotly_white')
        return fig

    # Horizontal bar chart showing scores for a single evaluation
    def create_score_bars(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:

        metric_map = {
            'overall_score': 'Overall Score', 'instruction_following': 'Instruction Following',
            'hallucination_score': 'Hallucination Control', 'assumption_control': 'Assumption Control',
            'coherence': 'Coherence', 'accuracy': 'Accuracy'
        }
        metrics = [label for key, label in metric_map.items() if key in scores]
        values = [scores[key] for key in metric_map if key in scores]
        colors = [self.metric_colors.get(key, '#667eea') for key in metric_map if key in scores]

        fig = go.Figure(go.Bar(y=metrics, x=values, orientation='h', marker=dict(color=colors), text=[f'{v:.2f}' for v in values], textposition='auto'))
        fig.update_layout(title=f"{agent_name} - Score Breakdown", xaxis=dict(range=[0, 1]), template='plotly_white', showlegend=False)
        return fig

    # Heatmap showing evaluation scores across agents and tasks
    def create_evaluation_heatmap(self, results: List[Dict]) -> go.Figure:

        if not results:
            return go.Figure().update_layout(title="No data for heatmap")

        df_data = [{'agent': r['agent_name'], 'task': r['task_id'], 'score': r['scores'].get('overall_score', 0)} for r in results]
        df = pd.DataFrame(df_data)
        pivot_df = df.pivot(index='agent', columns='task', values='score')

        fig = go.Figure(data=go.Heatmap(z=pivot_df.values, x=pivot_df.columns, y=pivot_df.index, colorscale='Viridis', colorbar=dict(title="Score")))
        fig.update_layout(title="Agent Performance Heatmap", xaxis_title="Tasks", yaxis_title="Agents", template='plotly_white')
        return fig

    # Violin plots for the spread in scores across agents
    def create_score_distribution(self, results: List[Dict]) -> go.Figure:

        if not results:
            return go.Figure().update_layout(title="No data for distribution plot")

        df_data = []
        for r in results:
            entry = {'Agent': r.get('agent_name', 'Unknown')}
            entry.update(r['scores'])
            df_data.append(entry)
        df = pd.DataFrame(df_data).melt(id_vars=['Agent'], value_vars=self.metric_colors.keys(), var_name='Metric', value_name='Score')

        metric_map = {k: k.replace('_', ' ').title() for k in self.metric_colors.keys()}
        df['Metric'] = df['Metric'].map(metric_map)

        fig = go.Figure()
        for metric in df['Metric'].unique():
            fig.add_trace(go.Violin(y=df[df['Metric'] == metric]['Score'], name=metric, box_visible=True, meanline_visible=True))

        fig.update_layout(title="Score Distribution Analysis", yaxis_title="Score", template='plotly_white', showlegend=False)
        return fig

    # Bar chart showing the average overall score of each agent
    def create_performance_trends(self, results: List[Dict]) -> go.Figure:

        if not results:
            return go.Figure().update_layout(title="No data for average performance plot")

        agent_scores = {}
        for r in results:
            agent = r['agent_name']
            if agent not in agent_scores:
                agent_scores[agent] = []
            agent_scores[agent].append(r['scores'].get('overall_score', 0))

        # Compute averages
        avg_scores = {agent: np.mean(scores) for agent, scores in agent_scores.items()}

        fig = go.Figure(go.Bar(
            x=list(avg_scores.keys()),
            y=list(avg_scores.values()),
            text=[f"{v:.2f}" for v in avg_scores.values()],
            textposition="auto",
            marker=dict(color="#667eea")
        ))

        fig.update_layout(
            title="Average Overall Scores by Agent",
            xaxis_title="Agents",
            yaxis_title="Average Overall Score",
            template="plotly_white"
        )
        return fig

    # Comparison chart between two agents
    def create_agent_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:

        metrics = list(self.metric_colors.keys())
        agent1_name = agent1_results[0].get('agent_name', 'Agent 1')
        agent2_name = agent2_results[0].get('agent_name', 'Agent 2')

        def get_avg_scores(results):
            return {m: np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics}

        avg1 = get_avg_scores(agent1_results)
        avg2 = get_avg_scores(agent2_results)
        metric_labels = [m.replace('_', ' ').title() for m in metrics]

        fig = go.Figure(data=[
            go.Bar(name=agent1_name, x=metric_labels, y=[avg1[m] for m in metrics]),
            go.Bar(name=agent2_name, x=metric_labels, y=[avg2[m] for m in metrics])
        ])
        fig.update_layout(barmode='group', title="Agent Performance Comparison", yaxis_title="Average Score", template='plotly_white')
        return fig

    # Spider chart comparing two agents
    def create_radar_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:

        metrics = ['instruction_following', 'hallucination_score', 'assumption_control', 'coherence', 'accuracy']
        metric_labels = [m.replace('_', ' ').title() for m in metrics]

        def get_avg_scores(results):
            return [np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics]

        agent1_values = get_avg_scores(agent1_results)
        agent2_values = get_avg_scores(agent2_results)

        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=agent1_values, theta=metric_labels, fill='toself', name=agent1_results[0].get('agent_name', 'Agent 1')))
        fig.add_trace(go.Scatterpolar(r=agent2_values, theta=metric_labels, fill='toself', name=agent2_results[0].get('agent_name', 'Agent 2')))
        fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title="Agent Comparison - Radar Chart", template='plotly_white')
        return fig

    # Performance gap between two agents across metrics
    def create_performance_delta(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:

        metrics = list(self.metric_colors.keys())

        def get_avg_scores(results):
            return {m: np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics}

        avg1 = get_avg_scores(agent1_results)
        avg2 = get_avg_scores(agent2_results)
        deltas = [avg2[m] - avg1[m] for m in metrics]
        colors = ['#48bb78' if d >= 0 else '#fc8181' for d in deltas]
        metric_labels = [m.replace('_', ' ').title() for m in metrics]

        fig = go.Figure(go.Bar(x=metric_labels, y=deltas, marker_color=colors, text=[f'{d:+.2f}' for d in deltas]))
        fig.update_layout(title="Performance Delta (Agent 2 vs Agent 1)", yaxis_title="Score Difference", template='plotly_white')
        return fig
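A minimal sketch of rendering these charts (illustrative only; batch_results is assumed to come from AetherScoreEvaluator.evaluate_batch as in the earlier sketches, with 'agent_name' and 'task_id' attached to each result, and the agent names are invented):

# Hypothetical usage sketch for EvaluationVisualizer.
from visualizer_module import EvaluationVisualizer

viz = EvaluationVisualizer()

# Per-evaluation views
viz.create_spider_chart(batch_results[0]["scores"], agent_name="Agent A").show()
viz.create_score_bars(batch_results[0]["scores"], agent_name="Agent A").show()

# Batch-level views
viz.create_evaluation_heatmap(batch_results).show()
viz.create_performance_trends(batch_results).show()

# Two-agent comparisons
agent_a = [r for r in batch_results if r["agent_name"] == "Agent A"]
agent_b = [r for r in batch_results if r["agent_name"] == "Agent B"]
viz.create_radar_comparison(agent_a, agent_b).show()
viz.create_performance_delta(agent_a, agent_b).show()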