aaditya-raj committed
Commit e3a1a9e · verified · 1 Parent(s): 14709ff

Upload 4 files

Files changed (4):
  1. evaluator_module.py +288 -0
  2. report_generator.py +168 -0
  3. requirements.txt +29 -0
  4. visualizer_module.py +183 -0
evaluator_module.py ADDED
@@ -0,0 +1,288 @@
import re
import numpy as np
from typing import Dict, List, Optional, Tuple
import json
from collections import defaultdict
import spacy
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import hashlib
from datetime import datetime
import concurrent.futures
import random  # used only by the disabled LLM-judge fallback below


class AetherScoreEvaluator:
    def __init__(self):
        # NLP models
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Downloading 'en_core_web_sm' spacy model...")
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load("en_core_web_sm")

        # LLM judge model
        self.judge_model = pipeline(
            "text2text-generation",
            model="google/flan-t5-base",
            device=-1  # 0 for GPU
        )

        # Sentence transformer for sentence -> embedding
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Scoring weights; domain-specific weights can be added for better results
        self.weights = {'instruction_following': 0.25, 'hallucination_score': 0.20,
                        'assumption_control': 0.20, 'coherence': 0.20, 'accuracy': 0.15}

        # In-memory cache
        self.cache = {}

    # LLM judge (disabled in favor of the rule-based evaluation below)
    # def _evaluate_with_llm_judge(self, prompt: str, response: str) -> Dict:
    #     print("Using HF model LLM Judge...")
    #
    #     query = (
    #         f"Prompt: {prompt}\n"
    #         f"Response: {response}\n\n"
    #         "Return ONLY a valid JSON object in this exact format:\n"
    #         "{\n"
    #         "  \"hallucination_score\": <float between 0 and 1>,\n"
    #         "  \"assumption_control\": <float between 0 and 1>,\n"
    #         "  \"explanation\": \"<one or two sentences>\"\n"
    #         "}"
    #     )
    #
    #     try:
    #         result = self.judge_model(query, max_new_tokens=128, truncation=True)
    #         output = result[0]['generated_text']
    #     except Exception as e:
    #         return {
    #             "hallucination_score": (0.1, f"HF model failed: {e}"),
    #             "assumption_control": (0.1, f"HF model failed: {e}")
    #         }
    #
    #     # Default values
    #     halluc_score = random.uniform(0.3, 0.7)
    #     assumption_score = random.uniform(0.3, 0.7)
    #     explanation = output
    #
    #     try:
    #         parsed = json.loads(output.strip())
    #         halluc_score = float(parsed.get("hallucination_score", halluc_score))
    #         assumption_score = float(parsed.get("assumption_control", assumption_score))
    #         explanation = parsed.get("explanation", explanation)
    #     except Exception:
    #         pass  # fall back to defaults
    #
    #     return {
    #         "hallucination_score": (halluc_score, explanation),
    #         "assumption_control": (assumption_score, explanation)
    #     }

    def _evaluate_with_llm_judge(self, prompt: str, response: str) -> Dict:
        print("Using rule-based evaluation instead of HF LLM...")

        prompt_words = set(prompt.lower().split())
        response_words = response.lower().split()

        # Hallucination score (higher = better grounding): share of response words
        # that also appear in the prompt. The rest of the pipeline (weights,
        # explanation thresholds, "Hallucination Control" chart labels) treats a
        # higher value as better, so the score rises as the response stays closer
        # to the prompt.
        if response_words:
            novel_fraction = len([w for w in response_words if w not in prompt_words]) / len(response_words)
            halluc_score = 1.0 - novel_fraction
        else:
            halluc_score = 0.1

        # Assumption control: fraction of sentences starting with uncertainty markers
        uncertain_starts = ("i assume", "maybe", "probably", "likely", "could be")
        sentences = response.lower().split(".")
        if sentences:
            assumption_score = sum(0.3 for s in sentences if s.strip().startswith(uncertain_starts)) / len(sentences)
        else:
            assumption_score = 0.1

        # Ensure scores are between 0 and 1
        halluc_score = max(0.0, min(1.0, halluc_score))
        assumption_score = max(0.0, min(1.0, assumption_score))

        explanation = "Rule-based evaluation applied."

        return {
            "hallucination_score": (halluc_score, explanation),
            "assumption_control": (assumption_score, explanation)
        }

    # Single evaluation.
    # Inputs: prompt, agent response, expected answer (optional), and task type
    # (general, QA, summarization, etc.).
    def evaluate_single(self, prompt: str, response: str, expected_answer: Optional[str] = None, task_type: str = "general") -> Dict:
        # Generate an evaluation ID
        eval_id = self._generate_eval_id(prompt, response)

        # If the result is already cached, it can be returned directly.
        # if eval_id in self.cache:
        #     return self.cache[eval_id]

        scores, reasons = {}, {}

        # Hallucination and assumption-control scores (with reasons) from the judge
        llm_judge_results = self._evaluate_with_llm_judge(prompt, response)
        scores['hallucination_score'], reasons['hallucination_score'] = llm_judge_results['hallucination_score']
        scores['assumption_control'], reasons['assumption_control'] = llm_judge_results['assumption_control']

        # Instruction following, coherence and accuracy
        scores['instruction_following'], reasons['instruction_following'] = self._evaluate_instruction_following(prompt, response)
        scores['coherence'], reasons['coherence'] = self._evaluate_coherence(response)
        scores['accuracy'], reasons['accuracy'] = self._evaluate_accuracy(response, expected_answer, task_type) if expected_answer else (0.5, "No expected answer provided.")

        # Overall score
        scores['overall_score'] = self._calculate_overall_score(scores)
        reasons['overall_score'] = "Weighted average based on component scores."

        # Attach eval ID, timestamp and task type to the scores
        scores.update({'eval_id': eval_id, 'timestamp': datetime.now().isoformat(), 'task_type': task_type})

        # Bundle scores and per-metric reasons into the result
        result = {"scores": scores, "reasons": reasons}

        # Store the result in the cache under its eval ID
        # self.cache[eval_id] = result

        return result

    # Batch evaluation over items loaded from a JSON/CSV file
    def evaluate_batch(self, data: List[Dict], mode: str = "comprehensive") -> List[Dict]:
        """Process a batch of evaluations in parallel."""
        results = []

        def process_item(item):
            # Run the single-pair evaluation for one prompt/response item
            eval_result = self.evaluate_single(
                prompt=item.get('prompt', ''),
                response=item.get('response', ''),
                expected_answer=item.get('expected_answer', ''),
                task_type=item.get('task_type', 'general')
            )
            # Combine with the original metadata
            eval_result.update({
                'task_id': item.get('task_id', eval_result['scores']['eval_id']),
                'agent_name': item.get('agent_name', 'Unknown'),
            })
            return eval_result

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_item = {executor.submit(process_item, item): item for item in data}
            for future in concurrent.futures.as_completed(future_to_item):
                try:
                    results.append(future.result())
                except Exception as exc:
                    print(f'An item generated an exception: {exc}')

        return results

    # Instruction-following evaluation (prompt, response)
    def _evaluate_instruction_following(self, prompt: str, response: str) -> Tuple[float, str]:
        score, checks, passed = 1.0, 0, 0

        # Check for negative constraints
        negations = re.findall(r"(don't|do not|avoid|without) ([\w\s,]+)", prompt.lower())
        for _, constraint_phrase in negations:
            checks += 1
            words_to_avoid = [w.strip() for w in constraint_phrase.split(',')]
            if not any(word in response.lower() for word in words_to_avoid if len(word) > 2):
                passed += 1

        # Fall back to semantic similarity if no specific instructions are found
        if checks == 0:
            sim = self._semantic_similarity(prompt, response)
            return sim, f"No specific constraints found. Score based on semantic similarity ({sim:.2f}) to prompt."

        # Final score calculation
        score = passed / checks if checks > 0 else 1.0
        reason = f"{passed}/{checks} specific constraints were followed."

        return score, reason

    # Coherence evaluation (response)
    def _evaluate_coherence(self, response: str) -> Tuple[float, str]:
        # Extract sentences from the response
        doc = self.nlp(response)
        sentences = [sent.text for sent in doc.sents]

        # With only one sentence, coherence is neutral
        if len(sentences) < 2:
            return 0.7, "Coherence is neutral for single-sentence responses."

        # Fetch embeddings from the sentence model
        embeddings = self.sentence_model.encode(sentences)
        sims = [cosine_similarity([embeddings[i]], [embeddings[i + 1]])[0][0] for i in range(len(sentences) - 1)]

        score = np.mean(sims)

        reason = f"Average sentence-to-sentence similarity score is {score:.2f} across {len(sentences)} sentences."
        return score, reason

    # Accuracy evaluation (response, expected, task_type)
    def _evaluate_accuracy(self, response: str, expected: str, task_type: str) -> Tuple[float, str]:
        sim = self._semantic_similarity(response, expected)
        reason = f"Semantic similarity between response and expected answer is {sim:.2f}."
        if sim > 0.95:
            reason += " (High match)"
        elif sim < 0.5:
            reason += " (Low match)"
        return sim, reason

    # Overall score
    def _calculate_overall_score(self, scores: Dict) -> float:
        total, weight_sum = 0.0, 0.0
        for metric, weight in self.weights.items():
            if metric in scores:
                total += scores[metric] * weight
                weight_sum += weight
        return total / weight_sum if weight_sum > 0 else 0.5

    # Explanation generator (work in progress)
    def generate_explanation(self, scores: Dict) -> str:
        explanation = []
        overall = scores.get('overall_score', 0)
        explanation.append(f"Overall Score: {overall:.2f}/1.00 - Reflects a weighted average of all dimensions.")

        if scores.get('instruction_following', 0) < 0.6:
            explanation.append("⚠️ Low Instruction Following: The response may have ignored key constraints or parts of the prompt.")
        if scores.get('hallucination_score', 0) < 0.6:
            explanation.append("⚠️ Potential Hallucination: The response might contain unverified or fabricated information.")
        if scores.get('accuracy', 0) < 0.6 and scores.get('accuracy', 0.5) != 0.5:
            explanation.append("⚠️ Low Accuracy: The response significantly differs from the provided expected answer.")

        if not explanation[1:]:
            explanation.append("✅ Great Performance: The agent performed well across the primary evaluation dimensions.")

        return "\n".join(explanation)

    # Per-agent scores
    def get_agent_scores_from_results(self, results: List[Dict]) -> Dict[str, List[float]]:
        agent_scores = defaultdict(list)
        for result in results:
            agent_name = result.get('agent_name', 'Unknown')
            overall_score = result.get('scores', {}).get('overall_score', 0)
            agent_scores[agent_name].append(overall_score)
        return agent_scores

    # Helper functions
    def _generate_eval_id(self, prompt: str, response: str) -> str:
        return hashlib.md5(f"{prompt}{response}".encode()).hexdigest()[:12]

    def _semantic_similarity(self, text1: str, text2: str) -> float:
        if not text1 or not text2:
            return 0.0
        emb1 = self.sentence_model.encode([text1])
        emb2 = self.sentence_model.encode([text2])
        return cosine_similarity(emb1, emb2)[0][0]
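
For orientation, a minimal usage sketch (not part of this commit): it assumes the file above is importable as evaluator_module and that the spaCy and Hugging Face models can download on first run. It shows the two entry points, evaluate_single and evaluate_batch, and the {"scores", "reasons"} dictionaries they return; the example prompts, agent names and task IDs are made up.

# hypothetical usage sketch for AetherScoreEvaluator (not part of this commit)
from evaluator_module import AetherScoreEvaluator

evaluator = AetherScoreEvaluator()

# Single prompt/response pair
single = evaluator.evaluate_single(
    prompt="Summarize the meeting notes in two sentences. Do not mention pricing.",
    response="The team agreed on a Q3 launch. Hiring for two backend roles starts next month.",
    task_type="summarization",
)
print(single["scores"]["overall_score"])
print(evaluator.generate_explanation(single["scores"]))

# Batch of items, e.g. rows loaded from a JSON/CSV file
batch = [
    {"task_id": "t1", "agent_name": "agent-a",
     "prompt": "What is the capital of France?",
     "response": "The capital of France is Paris.",
     "expected_answer": "Paris", "task_type": "qa"},
    {"task_id": "t1", "agent_name": "agent-b",
     "prompt": "What is the capital of France?",
     "response": "I assume you mean the political capital. It is probably Paris.",
     "expected_answer": "Paris", "task_type": "qa"},
]
results = evaluator.evaluate_batch(batch)
print(evaluator.get_agent_scores_from_results(results))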
report_generator.py ADDED
@@ -0,0 +1,168 @@
# report_generator.py

from typing import Dict, List
import numpy as np
from collections import defaultdict
from datetime import datetime


class ReportGenerator:
    """
    Generates textual and HTML reports from evaluation results.
    """

    def generate_batch_report(self, results: List[Dict]) -> str:
        """
        Generates a comprehensive text report for a batch evaluation.

        Args:
            results: A list of evaluation result dictionaries.

        Returns:
            A formatted string containing the batch evaluation report.
        """
        if not results:
            return "No evaluation results to report."

        num_evals = len(results)
        agent_names = list(set(r['agent_name'] for r in results))
        num_agents = len(agent_names)

        # Aggregate scores
        overall_scores = [r['scores']['overall_score'] for r in results]
        metric_scores = defaultdict(list)
        agent_overall_scores = defaultdict(list)

        for res in results:
            agent_overall_scores[res['agent_name']].append(res['scores']['overall_score'])
            for metric, score in res['scores'].items():
                if metric not in ['eval_id', 'timestamp', 'task_type']:
                    metric_scores[metric].append(score)

        # Calculate agent averages
        agent_avg_scores = {agent: np.mean(scores) for agent, scores in agent_overall_scores.items()}
        top_agent = max(agent_avg_scores, key=agent_avg_scores.get)
        bottom_agent = min(agent_avg_scores, key=agent_avg_scores.get)

        # Build report string
        report = []
        report.append("=" * 50)
        report.append(" AetherScore - Batch Evaluation Report")
        report.append("=" * 50)
        report.append(f"Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        report.append("--- Summary ---")
        report.append(f"Total Evaluations: {num_evals}")
        report.append(f"Number of Agents: {num_agents}")
        report.append(f"Overall Average Score: {np.mean(overall_scores):.3f}\n")

        report.append("--- Agent Performance ---")
        report.append(f"Top Performing Agent: {top_agent} (Avg Score: {agent_avg_scores[top_agent]:.3f})")
        report.append(f"Agent with most room for improvement: {bottom_agent} (Avg Score: {agent_avg_scores[bottom_agent]:.3f})\n")

        report.append("--- Metric Breakdown (Average Scores) ---")
        for metric, scores in metric_scores.items():
            metric_name = metric.replace('_', ' ').title()
            report.append(f"- {metric_name:<25}: {np.mean(scores):.3f}")

        report.append("\n" + "=" * 50)

        return "\n".join(report)

    def generate_comparison_report(
        self,
        agent1_results: List[Dict],
        agent2_results: List[Dict]
    ) -> str:
        """
        Generates a text report comparing two agents.

        Args:
            agent1_results: Evaluation results for the first agent.
            agent2_results: Evaluation results for the second agent.

        Returns:
            A formatted string comparing the two agents.
        """
        if not agent1_results or not agent2_results:
            return "Insufficient data for comparison. Please provide results for both agents."

        agent1_name = agent1_results[0].get('agent_name', 'Agent 1')
        agent2_name = agent2_results[0].get('agent_name', 'Agent 2')

        # Calculate average scores for each agent
        metrics = ['overall_score', 'instruction_following', 'hallucination_score',
                   'assumption_control', 'coherence', 'accuracy']

        avg_scores1 = {m: np.mean([r['scores'].get(m, 0) for r in agent1_results]) for m in metrics}
        avg_scores2 = {m: np.mean([r['scores'].get(m, 0) for r in agent2_results]) for m in metrics}

        # Build report string
        report = []
        report.append("=" * 60)
        report.append(f" Agent Comparison Report: {agent1_name} vs. {agent2_name}")
        report.append("=" * 60)
        report.append(f"Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        # Overall winner
        winner = agent1_name if avg_scores1['overall_score'] > avg_scores2['overall_score'] else agent2_name
        report.append("--- Overall Performance ---")
        report.append(f"🏆 Winner: {winner}")
        report.append(f"{agent1_name} Avg Overall Score: {avg_scores1['overall_score']:.3f}")
        report.append(f"{agent2_name} Avg Overall Score: {avg_scores2['overall_score']:.3f}\n")

        report.append("--- Detailed Metric Comparison ---")
        header = f"{'Metric':<25} | {agent1_name:<10} | {agent2_name:<10} | {'Delta':<8} | {'Winner'}"
        report.append(header)
        report.append("-" * len(header))

        for metric in metrics:
            s1 = avg_scores1[metric]
            s2 = avg_scores2[metric]
            delta = s2 - s1
            metric_winner = agent1_name if s1 > s2 else agent2_name if s2 > s1 else "Tie"
            metric_name = metric.replace('_', ' ').title()

            report.append(f"{metric_name:<25} | {s1:<10.3f} | {s2:<10.3f} | {delta:<+8.3f} | {metric_winner}")

        report.append("\n" + "=" * 60)

        return "\n".join(report)

    def generate_html_report(self, results_data: List[Dict]) -> str:
        """
        Generates a basic HTML report from evaluation results.

        Args:
            results_data: A list of evaluation result dictionaries.

        Returns:
            A string containing a full HTML report.
        """
        report_str = self.generate_batch_report(results_data)

        # Basic HTML template
        html_template = f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>AetherScore Evaluation Report</title>
            <style>
                body {{ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.6; color: #333; }}
                .container {{ max-width: 800px; margin: auto; padding: 20px; }}
                h1 {{ color: #4a4a4a; }}
                pre {{ background: #f4f4f4; padding: 15px; border-radius: 5px; white-space: pre-wrap; }}
            </style>
        </head>
        <body>
            <div class="container">
                <h1>AetherScore Evaluation Report</h1>
                <p>This report contains a summary of the batch evaluation results.</p>
                <pre>{report_str}</pre>
                <p><em>Note: This is a text-based summary. For interactive visualizations, please use the AetherScore dashboard.</em></p>
            </div>
        </body>
        </html>
        """
        return html_template
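
A hedged example of how the report generator might consume evaluate_batch() output (not part of this commit; the agent names, prompts and the aetherscore_report.html output path are assumptions):

# hypothetical glue code feeding evaluator results into ReportGenerator
from evaluator_module import AetherScoreEvaluator
from report_generator import ReportGenerator

evaluator = AetherScoreEvaluator()
results = evaluator.evaluate_batch([
    {"task_id": "t1", "agent_name": "agent-a",
     "prompt": "List three primary colors.",
     "response": "Red, blue, and yellow."},
    {"task_id": "t1", "agent_name": "agent-b",
     "prompt": "List three primary colors.",
     "response": "Maybe red and blue. It could be that green counts too."},
])

reporter = ReportGenerator()
print(reporter.generate_batch_report(results))

# Two-agent comparison and an HTML export
agent_a = [r for r in results if r["agent_name"] == "agent-a"]
agent_b = [r for r in results if r["agent_name"] == "agent-b"]
print(reporter.generate_comparison_report(agent_a, agent_b))

with open("aetherscore_report.html", "w", encoding="utf-8") as f:
    f.write(reporter.generate_html_report(results))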
requirements.txt ADDED
@@ -0,0 +1,29 @@
# Core Framework
gradio

# Data Handling & Numerical Operations
pandas
numpy
scikit-learn

# Visualization
plotly
plotly-express
seaborn
matplotlib

# NLP & Machine Learning
spacy==3.7.4
transformers
sentence-transformers
torch

# spaCy model (ensures the model is downloaded during setup)
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

evaluate
rouge-score
nltk
absl-py
sacrebleu
bert-score
visualizer_module.py ADDED
@@ -0,0 +1,183 @@
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from typing import Dict, List


class EvaluationVisualizer:
    def __init__(self):
        self.metric_colors = {
            'instruction_following': '#667eea', 'hallucination_score': '#48bb78',
            'assumption_control': '#f6ad55', 'coherence': '#63b3ed',
            'accuracy': '#fc8181', 'overall_score': '#764ba2'
        }

    # Spider chart with multi-dimensional scores for a single evaluation
    def create_spider_chart(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:
        metrics = ['Instruction\nFollowing', 'Hallucination\nControl', 'Assumption\nControl', 'Coherence', 'Accuracy']
        values = [
            scores.get('instruction_following', 0), scores.get('hallucination_score', 0),
            scores.get('assumption_control', 0), scores.get('coherence', 0),
            scores.get('accuracy', 0)
        ]
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=values, theta=metrics, fill='toself', name=agent_name, line=dict(color=self.metric_colors['instruction_following'])))
        fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title=f"{agent_name} - Performance Spider Chart", template='plotly_white')
        return fig

    # Horizontal bar chart showing scores for a single evaluation
    def create_score_bars(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:
        metric_map = {
            'overall_score': 'Overall Score', 'instruction_following': 'Instruction Following',
            'hallucination_score': 'Hallucination Control', 'assumption_control': 'Assumption Control',
            'coherence': 'Coherence', 'accuracy': 'Accuracy'
        }
        metrics = [label for key, label in metric_map.items() if key in scores]
        values = [scores[key] for key in metric_map if key in scores]
        colors = [self.metric_colors.get(key, '#667eea') for key in metric_map if key in scores]

        fig = go.Figure(go.Bar(y=metrics, x=values, orientation='h', marker=dict(color=colors), text=[f'{v:.2f}' for v in values], textposition='auto'))
        fig.update_layout(title=f"{agent_name} - Score Breakdown", xaxis=dict(range=[0, 1]), template='plotly_white', showlegend=False)
        return fig

    # Heatmap showing evaluation scores across agents and tasks
    def create_evaluation_heatmap(self, results: List[Dict]) -> go.Figure:
        if not results:
            return go.Figure().update_layout(title="No data for heatmap")

        df_data = [{'agent': r['agent_name'], 'task': r['task_id'], 'score': r['scores'].get('overall_score', 0)} for r in results]
        df = pd.DataFrame(df_data)
        pivot_df = df.pivot(index='agent', columns='task', values='score')

        fig = go.Figure(data=go.Heatmap(z=pivot_df.values, x=pivot_df.columns, y=pivot_df.index, colorscale='Viridis', colorbar=dict(title="Score")))
        fig.update_layout(title="Agent Performance Heatmap", xaxis_title="Tasks", yaxis_title="Agents", template='plotly_white')
        return fig

    # Violin plots for the spread in scores across agents
    def create_score_distribution(self, results: List[Dict]) -> go.Figure:
        if not results:
            return go.Figure().update_layout(title="No data for distribution plot")

        df_data = []
        for r in results:
            entry = {'Agent': r.get('agent_name', 'Unknown')}
            entry.update(r['scores'])
            df_data.append(entry)
        df = pd.DataFrame(df_data).melt(id_vars=['Agent'], value_vars=list(self.metric_colors.keys()), var_name='Metric', value_name='Score')

        metric_map = {k: k.replace('_', ' ').title() for k in self.metric_colors.keys()}
        df['Metric'] = df['Metric'].map(metric_map)

        fig = go.Figure()
        for metric in df['Metric'].unique():
            fig.add_trace(go.Violin(y=df[df['Metric'] == metric]['Score'], name=metric, box_visible=True, meanline_visible=True))

        fig.update_layout(title="Score Distribution Analysis", yaxis_title="Score", template='plotly_white', showlegend=False)
        return fig

    # Bar chart showing the average overall score of each agent
    def create_performance_trends(self, results: List[Dict]) -> go.Figure:
        if not results:
            return go.Figure().update_layout(title="No data for average performance plot")

        agent_scores = {}
        for r in results:
            agent = r['agent_name']
            if agent not in agent_scores:
                agent_scores[agent] = []
            agent_scores[agent].append(r['scores'].get('overall_score', 0))

        # Compute averages
        avg_scores = {agent: np.mean(scores) for agent, scores in agent_scores.items()}

        fig = go.Figure(go.Bar(
            x=list(avg_scores.keys()),
            y=list(avg_scores.values()),
            text=[f"{v:.2f}" for v in avg_scores.values()],
            textposition="auto",
            marker=dict(color="#667eea")
        ))

        fig.update_layout(
            title="Average Overall Scores by Agent",
            xaxis_title="Agents",
            yaxis_title="Average Overall Score",
            template="plotly_white"
        )
        return fig

    # Comparison chart between two agents
    def create_agent_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:
        metrics = list(self.metric_colors.keys())
        agent1_name = agent1_results[0].get('agent_name', 'Agent 1')
        agent2_name = agent2_results[0].get('agent_name', 'Agent 2')

        def get_avg_scores(results):
            return {m: np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics}

        avg1 = get_avg_scores(agent1_results)
        avg2 = get_avg_scores(agent2_results)
        metric_labels = [m.replace('_', ' ').title() for m in metrics]

        fig = go.Figure(data=[
            go.Bar(name=agent1_name, x=metric_labels, y=[avg1[m] for m in metrics]),
            go.Bar(name=agent2_name, x=metric_labels, y=[avg2[m] for m in metrics])
        ])
        fig.update_layout(barmode='group', title="Agent Performance Comparison", yaxis_title="Average Score", template='plotly_white')
        return fig

    # Spider chart comparing two agents
    def create_radar_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:
        metrics = ['instruction_following', 'hallucination_score', 'assumption_control', 'coherence', 'accuracy']
        metric_labels = [m.replace('_', ' ').title() for m in metrics]

        def get_avg_scores(results):
            return [np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics]

        agent1_values = get_avg_scores(agent1_results)
        agent2_values = get_avg_scores(agent2_results)

        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=agent1_values, theta=metric_labels, fill='toself', name=agent1_results[0].get('agent_name', 'Agent 1')))
        fig.add_trace(go.Scatterpolar(r=agent2_values, theta=metric_labels, fill='toself', name=agent2_results[0].get('agent_name', 'Agent 2')))
        fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title="Agent Comparison - Radar Chart", template='plotly_white')
        return fig

    # Performance gap between two agents across metrics
    def create_performance_delta(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:
        metrics = list(self.metric_colors.keys())

        def get_avg_scores(results):
            return {m: np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics}

        avg1 = get_avg_scores(agent1_results)
        avg2 = get_avg_scores(agent2_results)
        deltas = [avg2[m] - avg1[m] for m in metrics]
        colors = ['#48bb78' if d >= 0 else '#fc8181' for d in deltas]
        metric_labels = [m.replace('_', ' ').title() for m in metrics]

        fig = go.Figure(go.Bar(x=metric_labels, y=deltas, marker_color=colors, text=[f'{d:+.2f}' for d in deltas]))
        fig.update_layout(title="Performance Delta (Agent 2 vs Agent 1)", yaxis_title="Score Difference", template='plotly_white')
        return fig
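
A hedged example of rendering the figures defined above (not part of this commit; the prompts, agent names and HTML output file names are illustrative):

# hypothetical rendering of EvaluationVisualizer figures to standalone HTML files
from evaluator_module import AetherScoreEvaluator
from visualizer_module import EvaluationVisualizer

evaluator = AetherScoreEvaluator()
results = evaluator.evaluate_batch([
    {"task_id": "t1", "agent_name": "agent-a",
     "prompt": "Explain photosynthesis briefly.",
     "response": "Plants use sunlight, water, and carbon dioxide to make glucose and release oxygen."},
    {"task_id": "t1", "agent_name": "agent-b",
     "prompt": "Explain photosynthesis briefly.",
     "response": "I assume you mean in plants. It is probably how leaves turn light into food."},
])

viz = EvaluationVisualizer()

# Per-evaluation views
viz.create_spider_chart(results[0]["scores"], results[0]["agent_name"]).write_html("spider_chart.html")
viz.create_score_bars(results[0]["scores"], results[0]["agent_name"]).write_html("score_bars.html")

# Cross-agent views
viz.create_evaluation_heatmap(results).write_html("heatmap.html")
viz.create_performance_trends(results).write_html("agent_averages.html")
viz.create_performance_delta(
    [r for r in results if r["agent_name"] == "agent-a"],
    [r for r in results if r["agent_name"] == "agent-b"],
).write_html("performance_delta.html")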