Upload 4 files
- evaluator_module.py +288 -0
- report_generator.py +168 -0
- requirements.txt +29 -0
- visualizer_module.py +183 -0
evaluator_module.py
ADDED
@@ -0,0 +1,288 @@
import re
import numpy as np
from typing import Dict, List, Optional, Tuple
import json
from collections import defaultdict
import spacy
from transformers import pipeline
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import hashlib
from datetime import datetime
import concurrent.futures
import random

class AetherScoreEvaluator:
    def __init__(self):
        # NLP models
        try:
            self.nlp = spacy.load("en_core_web_sm")
        except OSError:
            print("Downloading 'en_core_web_sm' spacy model...")
            spacy.cli.download("en_core_web_sm")
            self.nlp = spacy.load("en_core_web_sm")

        # LLM judge model
        self.judge_model = pipeline(
            "text2text-generation",
            model="google/flan-t5-base",
            device=-1  # set to 0 for GPU
        )

        # Sentence transformer for sentence -> embedding
        self.sentence_model = SentenceTransformer('all-MiniLM-L6-v2')

        # Scoring weights (domain-specific weights can be added for better results)
        self.weights = {'instruction_following': 0.25, 'hallucination_score': 0.20,
                        'assumption_control': 0.20, 'coherence': 0.20, 'accuracy': 0.15}

        # In-memory cache
        self.cache = {}

    # LLM judge (HF-model-based version, kept for reference)
    # def _evaluate_with_llm_judge(self, prompt: str, response: str) -> Dict:
    #
    #     print("Using HF model LLM Judge...")
    #
    #     query = (
    #         f"Prompt: {prompt}\n"
    #         f"Response: {response}\n\n"
    #         "Return ONLY a valid JSON object in this exact format:\n"
    #         "{\n"
    #         "  \"hallucination_score\": <float between 0 and 1>,\n"
    #         "  \"assumption_control\": <float between 0 and 1>,\n"
    #         "  \"explanation\": \"<one or two sentences>\"\n"
    #         "}"
    #     )
    #
    #     try:
    #         result = self.judge_model(query, max_new_tokens=128, truncation=True)
    #         output = result[0]['generated_text']
    #     except Exception as e:
    #         return {
    #             "hallucination_score": (0.1, f"HF model failed: {e}"),
    #             "assumption_control": (0.1, f"HF model failed: {e}")
    #         }
    #
    #     # Default values
    #     halluc_score = random.uniform(0.3, 0.7)
    #     assumption_score = random.uniform(0.3, 0.7)
    #     explanation = output
    #
    #     try:
    #         parsed = json.loads(output.strip())
    #         halluc_score = float(parsed.get("hallucination_score", halluc_score))
    #         assumption_score = float(parsed.get("assumption_control", assumption_score))
    #         explanation = parsed.get("explanation", explanation)
    #     except Exception:
    #         pass  # fall back to defaults
    #
    #     return {
    #         "hallucination_score": (halluc_score, explanation),
    #         "assumption_control": (assumption_score, explanation)
    #     }

    # LLM judge (rule-based stand-in for the HF model above)
    def _evaluate_with_llm_judge(self, prompt: str, response: str) -> Dict:

        print("Using rule-based evaluation instead of HF LLM...")

        prompt_words = set(prompt.lower().split())
        response_words = response.lower().split()

        # Hallucination score: fraction of words in the response that are not in the prompt
        if response_words:
            halluc_score = len([w for w in response_words if w not in prompt_words]) / len(response_words)
        else:
            halluc_score = 0.1

        # Assumption control: fraction of sentences starting with uncertain words
        uncertain_starts = ("i assume", "maybe", "probably", "likely", "could be")
        sentences = response.lower().split(".")
        if sentences:
            assumption_score = sum(0.3 for s in sentences if s.strip().startswith(uncertain_starts)) / len(sentences)
        else:
            assumption_score = 0.1

        # Ensure scores are between 0 and 1
        halluc_score = max(0.0, min(1.0, halluc_score))
        assumption_score = max(0.0, min(1.0, assumption_score))

        explanation = "Rule-based evaluation applied."

        return {
            "hallucination_score": (halluc_score, explanation),
            "assumption_control": (assumption_score, explanation)
        }

    # Single evaluation.
    # Inputs: prompt, agent response, expected answer (optional), and task type
    # (general, QA, summarization, etc.).
    def evaluate_single(self, prompt: str, response: str, expected_answer: Optional[str] = None, task_type: str = "general") -> Dict:

        # Generate the eval ID
        eval_id = self._generate_eval_id(prompt, response)

        # If already stored in the cache, we could return it directly from there.
        # if eval_id in self.cache:
        #     return self.cache[eval_id]

        scores, reasons = {}, {}

        # Scores and reasons for hallucination and assumption control from the LLM judge
        llm_judge_results = self._evaluate_with_llm_judge(prompt, response)
        scores['hallucination_score'], reasons['hallucination_score'] = llm_judge_results['hallucination_score']
        scores['assumption_control'], reasons['assumption_control'] = llm_judge_results['assumption_control']

        # Evaluate instruction following, coherence and accuracy
        scores['instruction_following'], reasons['instruction_following'] = self._evaluate_instruction_following(prompt, response)
        scores['coherence'], reasons['coherence'] = self._evaluate_coherence(response)
        scores['accuracy'], reasons['accuracy'] = self._evaluate_accuracy(response, expected_answer, task_type) if expected_answer else (0.5, "No expected answer provided.")

        # Calculate the overall score
        scores['overall_score'] = self._calculate_overall_score(scores)
        reasons['overall_score'] = "Weighted average of the component scores."

        # Add eval ID, timestamp and task type to the scores
        scores.update({'eval_id': eval_id, 'timestamp': datetime.now().isoformat(), 'task_type': task_type})

        # Combine scores and reasons into the result
        result = {"scores": scores, "reasons": reasons}

        # Store the result under its eval ID in the cache
        # self.cache[eval_id] = result

        return result

    # Batch evaluation (input from a JSON/CSV file)
    def evaluate_batch(self, data: List[Dict], mode: str = "comprehensive") -> List[Dict]:
        """Process a batch of evaluations in parallel."""

        results = []

        # Evaluate a single item and attach its metadata
        def process_item(item):
            # Call the single-evaluation function for one prompt/response pair
            eval_result = self.evaluate_single(
                prompt=item.get('prompt', ''),
                response=item.get('response', ''),
                expected_answer=item.get('expected_answer', ''),
                task_type=item.get('task_type', 'general')
            )
            # Combine with the original metadata
            eval_result.update({
                'task_id': item.get('task_id', eval_result['scores']['eval_id']),
                'agent_name': item.get('agent_name', 'Unknown'),
            })
            return eval_result

        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_item = {executor.submit(process_item, item): item for item in data}
            for future in concurrent.futures.as_completed(future_to_item):
                try:
                    results.append(future.result())
                except Exception as exc:
                    print(f'An item generated an exception: {exc}')

        return results

    # Instruction-following evaluation (prompt, response)
    def _evaluate_instruction_following(self, prompt: str, response: str) -> Tuple[float, str]:
        score, checks, passed = 1.0, 0, 0

        # Check for negative constraints
        negations = re.findall(r"(don't|do not|avoid|without) ([\w\s,]+)", prompt.lower())
        for _, constraint_phrase in negations:
            checks += 1
            words_to_avoid = [w.strip() for w in constraint_phrase.split(',')]
            if not any(word in response.lower() for word in words_to_avoid if len(word) > 2):
                passed += 1

        # Fall back to semantic similarity if no specific instructions are found
        if checks == 0:
            sim = self._semantic_similarity(prompt, response)
            return sim, f"No specific constraints found. Score based on semantic similarity ({sim:.2f}) to prompt."

        # Final score calculation
        score = passed / checks if checks > 0 else 1.0
        reason = f"{passed}/{checks} specific constraints were followed."

        return score, reason

    # Coherence evaluation (response)
    def _evaluate_coherence(self, response: str) -> Tuple[float, str]:

        # Extract sentences from the response
        doc = self.nlp(response)
        sentences = [sent.text for sent in doc.sents]

        # If there is only one sentence, coherence is neutral
        if len(sentences) < 2:
            return 0.7, "Coherence is neutral for single-sentence responses."

        # Fetch embeddings from the sentence model
        embeddings = self.sentence_model.encode(sentences)
        sims = [cosine_similarity([embeddings[i]], [embeddings[i+1]])[0][0] for i in range(len(sentences)-1)]

        score = np.mean(sims)

        reason = f"Average sentence-to-sentence similarity score is {score:.2f} across {len(sentences)} sentences."
        return score, reason

    # Accuracy evaluation (response, expected, task_type)
    def _evaluate_accuracy(self, response: str, expected: str, task_type: str) -> Tuple[float, str]:
        sim = self._semantic_similarity(response, expected)
        reason = f"Semantic similarity between response and expected answer is {sim:.2f}."
        if sim > 0.95:
            reason += " (High match)"
        elif sim < 0.5:
            reason += " (Low match)"
        return sim, reason

    # Overall score
    def _calculate_overall_score(self, scores: Dict) -> float:
        total, weight_sum = 0.0, 0.0
        for metric, weight in self.weights.items():
            if metric in scores:
                total += scores[metric] * weight
                weight_sum += weight
        return total / weight_sum if weight_sum > 0 else 0.5

    # Explanation generator (work in progress)
    def generate_explanation(self, scores: Dict) -> str:
        explanation = []
        overall = scores.get('overall_score', 0)
        explanation.append(f"Overall Score: {overall:.2f}/1.00 - Reflects a weighted average of all dimensions.")

        if scores.get('instruction_following', 0) < 0.6:
            explanation.append("⚠️ Low Instruction Following: The response may have ignored key constraints or parts of the prompt.")
        if scores.get('hallucination_score', 0) < 0.6:
            explanation.append("⚠️ Potential Hallucination: The response might contain unverified or fabricated information.")
        if scores.get('accuracy', 0) < 0.6 and scores.get('accuracy', 0.5) != 0.5:
            explanation.append("⚠️ Low Accuracy: The response significantly differs from the provided expected answer.")

        if not explanation[1:]:
            explanation.append("✅ Great Performance: The agent performed well across the primary evaluation dimensions.")

        return "\n".join(explanation)

    # Per-agent scores
    def get_agent_scores_from_results(self, results: List[Dict]) -> Dict[str, List[float]]:
        agent_scores = defaultdict(list)
        for result in results:
            agent_name = result.get('agent_name', 'Unknown')
            overall_score = result.get('scores', {}).get('overall_score', 0)
            agent_scores[agent_name].append(overall_score)
        return agent_scores

    # Helper functions
    def _generate_eval_id(self, prompt: str, response: str) -> str:
        return hashlib.md5(f"{prompt}{response}".encode()).hexdigest()[:12]

    def _semantic_similarity(self, text1: str, text2: str) -> float:
        if not text1 or not text2:
            return 0.0
        emb1 = self.sentence_model.encode([text1])
        emb2 = self.sentence_model.encode([text2])
        return cosine_similarity(emb1, emb2)[0][0]
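A minimal usage sketch for the evaluator (illustrative only, not part of the uploaded files; the prompts, responses, agent names and task IDs below are invented, and evaluator_module.py is assumed to be importable from the Space's working directory):

# Hypothetical usage sketch for AetherScoreEvaluator.
from evaluator_module import AetherScoreEvaluator

evaluator = AetherScoreEvaluator()

# Single prompt/response pair
single = evaluator.evaluate_single(
    prompt="Summarize the article without mentioning dates.",
    response="The article describes a new evaluation framework for LLM agents.",
    expected_answer="A summary of the evaluation framework.",
    task_type="summarization",
)
print(single["scores"]["overall_score"])
print(evaluator.generate_explanation(single["scores"]))

# Batch of items (each dict mirrors the keys read by evaluate_batch)
batch_results = evaluator.evaluate_batch([
    {"task_id": "t1", "agent_name": "Agent A", "prompt": "Q1?", "response": "A1.", "expected_answer": "A1.", "task_type": "qa"},
    {"task_id": "t1", "agent_name": "Agent B", "prompt": "Q1?", "response": "Something else.", "expected_answer": "A1.", "task_type": "qa"},
])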
report_generator.py
ADDED
@@ -0,0 +1,168 @@
# report_generator.py

from typing import Dict, List
import numpy as np
from collections import defaultdict
from datetime import datetime

class ReportGenerator:
    """
    Generates textual and HTML reports from evaluation results.
    """

    def generate_batch_report(self, results: List[Dict]) -> str:
        """
        Generates a comprehensive text report for a batch evaluation.

        Args:
            results: A list of evaluation result dictionaries.

        Returns:
            A formatted string containing the batch evaluation report.
        """
        if not results:
            return "No evaluation results to report."

        num_evals = len(results)
        agent_names = list(set(r['agent_name'] for r in results))
        num_agents = len(agent_names)

        # Aggregate scores
        overall_scores = [r['scores']['overall_score'] for r in results]
        metric_scores = defaultdict(list)
        agent_overall_scores = defaultdict(list)

        for res in results:
            agent_overall_scores[res['agent_name']].append(res['scores']['overall_score'])
            for metric, score in res['scores'].items():
                if metric not in ['eval_id', 'timestamp', 'task_type']:
                    metric_scores[metric].append(score)

        # Calculate agent averages
        agent_avg_scores = {agent: np.mean(scores) for agent, scores in agent_overall_scores.items()}
        top_agent = max(agent_avg_scores, key=agent_avg_scores.get)
        bottom_agent = min(agent_avg_scores, key=agent_avg_scores.get)

        # Build report string
        report = []
        report.append("="*50)
        report.append(" AetherScore - Batch Evaluation Report")
        report.append("="*50)
        report.append(f"Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        report.append("--- Summary ---")
        report.append(f"Total Evaluations: {num_evals}")
        report.append(f"Number of Agents: {num_agents}")
        report.append(f"Overall Average Score: {np.mean(overall_scores):.3f}\n")

        report.append("--- Agent Performance ---")
        report.append(f"Top Performing Agent: {top_agent} (Avg Score: {agent_avg_scores[top_agent]:.3f})")
        report.append(f"Agent with most room for improvement: {bottom_agent} (Avg Score: {agent_avg_scores[bottom_agent]:.3f})\n")

        report.append("--- Metric Breakdown (Average Scores) ---")
        for metric, scores in metric_scores.items():
            metric_name = metric.replace('_', ' ').title()
            report.append(f"- {metric_name:<25}: {np.mean(scores):.3f}")

        report.append("\n" + "="*50)

        return "\n".join(report)

    def generate_comparison_report(
        self,
        agent1_results: List[Dict],
        agent2_results: List[Dict]
    ) -> str:
        """
        Generates a text report comparing two agents.

        Args:
            agent1_results: Evaluation results for the first agent.
            agent2_results: Evaluation results for the second agent.

        Returns:
            A formatted string comparing the two agents.
        """
        if not agent1_results or not agent2_results:
            return "Insufficient data for comparison. Please provide results for both agents."

        agent1_name = agent1_results[0].get('agent_name', 'Agent 1')
        agent2_name = agent2_results[0].get('agent_name', 'Agent 2')

        # Calculate average scores for each agent
        metrics = ['overall_score', 'instruction_following', 'hallucination_score',
                   'assumption_control', 'coherence', 'accuracy']

        avg_scores1 = {m: np.mean([r['scores'].get(m, 0) for r in agent1_results]) for m in metrics}
        avg_scores2 = {m: np.mean([r['scores'].get(m, 0) for r in agent2_results]) for m in metrics}

        # Build report string
        report = []
        report.append("="*60)
        report.append(f" Agent Comparison Report: {agent1_name} vs. {agent2_name}")
        report.append("="*60)
        report.append(f"Report generated on: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")

        # Overall winner
        winner = agent1_name if avg_scores1['overall_score'] > avg_scores2['overall_score'] else agent2_name
        report.append("--- Overall Performance ---")
        report.append(f"🏆 Winner: {winner}")
        report.append(f"{agent1_name} Avg Overall Score: {avg_scores1['overall_score']:.3f}")
        report.append(f"{agent2_name} Avg Overall Score: {avg_scores2['overall_score']:.3f}\n")

        report.append("--- Detailed Metric Comparison ---")
        header = f"{'Metric':<25} | {agent1_name:<10} | {agent2_name:<10} | {'Delta':<8} | {'Winner'}"
        report.append(header)
        report.append("-"*len(header))

        for metric in metrics:
            s1 = avg_scores1[metric]
            s2 = avg_scores2[metric]
            delta = s2 - s1
            metric_winner = agent1_name if s1 > s2 else agent2_name if s2 > s1 else "Tie"
            metric_name = metric.replace('_', ' ').title()

            report.append(f"{metric_name:<25} | {s1:<10.3f} | {s2:<10.3f} | {delta:<+8.3f} | {metric_winner}")

        report.append("\n" + "="*60)

        return "\n".join(report)

    def generate_html_report(self, results_data: List[Dict]) -> str:
        """
        Generates a basic HTML report from evaluation results.

        Args:
            results_data: A list of evaluation result dictionaries.

        Returns:
            A string containing a full HTML report.
        """
        report_str = self.generate_batch_report(results_data)

        # Basic HTML template
        html_template = f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <meta name="viewport" content="width=device-width, initial-scale=1.0">
            <title>AetherScore Evaluation Report</title>
            <style>
                body {{ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; line-height: 1.6; color: #333; }}
                .container {{ max-width: 800px; margin: auto; padding: 20px; }}
                h1 {{ color: #4a4a4a; }}
                pre {{ background: #f4f4f4; padding: 15px; border-radius: 5px; white-space: pre-wrap; }}
            </style>
        </head>
        <body>
            <div class="container">
                <h1>AetherScore Evaluation Report</h1>
                <p>This report contains a summary of the batch evaluation results.</p>
                <pre>{report_str}</pre>
                <p><em>Note: This is a text-based summary. For interactive visualizations, please use the AetherScore dashboard.</em></p>
            </div>
        </body>
        </html>
        """
        return html_template
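A minimal sketch of how these report methods might be called (illustrative only; batch_results is assumed to be the output of AetherScoreEvaluator.evaluate_batch from the sketch after evaluator_module.py, and the agent names are invented):

# Hypothetical usage sketch for ReportGenerator.
from report_generator import ReportGenerator

reporter = ReportGenerator()

# Text report over a whole batch (expects 'agent_name' and 'scores' in each result)
print(reporter.generate_batch_report(batch_results))

# Comparison report: split the batch by agent first
agent_a = [r for r in batch_results if r["agent_name"] == "Agent A"]
agent_b = [r for r in batch_results if r["agent_name"] == "Agent B"]
print(reporter.generate_comparison_report(agent_a, agent_b))

# HTML wrapper around the same text report
with open("report.html", "w", encoding="utf-8") as f:
    f.write(reporter.generate_html_report(batch_results))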
requirements.txt
ADDED
@@ -0,0 +1,29 @@
# Core Framework
gradio

# Data Handling & Numerical Operations
pandas
numpy
scikit-learn

# Visualization
plotly
plotly-express
seaborn
matplotlib

# NLP & Machine Learning
spacy==3.7.4
transformers
sentence-transformers
torch

# spaCy model (ensures the model is downloaded during setup)
https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl

evaluate
rouge-score
nltk
absl-py
sacrebleu
bert-score
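For a local run outside the Space's own build, these dependencies would typically be installed with pip install -r requirements.txt; the pinned wheel URL pulls the en_core_web_sm model at install time, and evaluator_module.py additionally falls back to spacy.cli.download("en_core_web_sm") if the model is missing at runtime.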
visualizer_module.py
ADDED
@@ -0,0 +1,183 @@
import plotly.graph_objects as go
import pandas as pd
import numpy as np
from typing import Dict, List

class EvaluationVisualizer:
    def __init__(self):
        self.metric_colors = {
            'instruction_following': '#667eea', 'hallucination_score': '#48bb78',
            'assumption_control': '#f6ad55', 'coherence': '#63b3ed',
            'accuracy': '#fc8181', 'overall_score': '#764ba2'
        }

    # Spider chart with multi-dimensional scores for a single evaluation
    def create_spider_chart(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:

        metrics = ['Instruction\nFollowing', 'Hallucination\nControl', 'Assumption\nControl', 'Coherence', 'Accuracy']
        values = [
            scores.get('instruction_following', 0), scores.get('hallucination_score', 0),
            scores.get('assumption_control', 0), scores.get('coherence', 0),
            scores.get('accuracy', 0)
        ]
        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=values, theta=metrics, fill='toself', name=agent_name, line=dict(color=self.metric_colors['instruction_following'])))
        fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title=f"{agent_name} - Performance Spider Chart", template='plotly_white')
        return fig

    # Horizontal bar chart showing scores for a single evaluation
    def create_score_bars(self, scores: Dict, agent_name: str = "Agent") -> go.Figure:

        metric_map = {
            'overall_score': 'Overall Score', 'instruction_following': 'Instruction Following',
            'hallucination_score': 'Hallucination Control', 'assumption_control': 'Assumption Control',
            'coherence': 'Coherence', 'accuracy': 'Accuracy'
        }
        metrics = [label for key, label in metric_map.items() if key in scores]
        values = [scores[key] for key in metric_map if key in scores]
        colors = [self.metric_colors.get(key, '#667eea') for key in metric_map if key in scores]

        fig = go.Figure(go.Bar(y=metrics, x=values, orientation='h', marker=dict(color=colors), text=[f'{v:.2f}' for v in values], textposition='auto'))
        fig.update_layout(title=f"{agent_name} - Score Breakdown", xaxis=dict(range=[0, 1]), template='plotly_white', showlegend=False)
        return fig

    # Heatmap showing evaluation scores across agents and tasks
    def create_evaluation_heatmap(self, results: List[Dict]) -> go.Figure:

        if not results:
            return go.Figure().update_layout(title="No data for heatmap")

        df_data = [{'agent': r['agent_name'], 'task': r['task_id'], 'score': r['scores'].get('overall_score', 0)} for r in results]
        df = pd.DataFrame(df_data)
        pivot_df = df.pivot(index='agent', columns='task', values='score')

        fig = go.Figure(data=go.Heatmap(z=pivot_df.values, x=pivot_df.columns, y=pivot_df.index, colorscale='Viridis', colorbar=dict(title="Score")))
        fig.update_layout(title="Agent Performance Heatmap", xaxis_title="Tasks", yaxis_title="Agents", template='plotly_white')
        return fig

    # Violin plots for the spread in scores across agents
    def create_score_distribution(self, results: List[Dict]) -> go.Figure:

        if not results:
            return go.Figure().update_layout(title="No data for distribution plot")

        df_data = []
        for r in results:
            entry = {'Agent': r.get('agent_name', 'Unknown')}
            entry.update(r['scores'])
            df_data.append(entry)
        df = pd.DataFrame(df_data).melt(id_vars=['Agent'], value_vars=self.metric_colors.keys(), var_name='Metric', value_name='Score')

        metric_map = {k: k.replace('_', ' ').title() for k in self.metric_colors.keys()}
        df['Metric'] = df['Metric'].map(metric_map)

        fig = go.Figure()
        for metric in df['Metric'].unique():
            fig.add_trace(go.Violin(y=df[df['Metric'] == metric]['Score'], name=metric, box_visible=True, meanline_visible=True))

        fig.update_layout(title="Score Distribution Analysis", yaxis_title="Score", template='plotly_white', showlegend=False)
        return fig

    # Bar chart showing the average overall score of each agent
    def create_performance_trends(self, results: List[Dict]) -> go.Figure:

        if not results:
            return go.Figure().update_layout(title="No data for average performance plot")

        agent_scores = {}
        for r in results:
            agent = r['agent_name']
            if agent not in agent_scores:
                agent_scores[agent] = []
            agent_scores[agent].append(r['scores'].get('overall_score', 0))

        # Compute averages
        avg_scores = {agent: np.mean(scores) for agent, scores in agent_scores.items()}

        fig = go.Figure(go.Bar(
            x=list(avg_scores.keys()),
            y=list(avg_scores.values()),
            text=[f"{v:.2f}" for v in avg_scores.values()],
            textposition="auto",
            marker=dict(color="#667eea")
        ))

        fig.update_layout(
            title="Average Overall Scores by Agent",
            xaxis_title="Agents",
            yaxis_title="Average Overall Score",
            template="plotly_white"
        )
        return fig

    # Comparison chart between two agents
    def create_agent_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:

        metrics = list(self.metric_colors.keys())
        agent1_name = agent1_results[0].get('agent_name', 'Agent 1')
        agent2_name = agent2_results[0].get('agent_name', 'Agent 2')

        def get_avg_scores(results):
            return {m: np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics}

        avg1 = get_avg_scores(agent1_results)
        avg2 = get_avg_scores(agent2_results)
        metric_labels = [m.replace('_', ' ').title() for m in metrics]

        fig = go.Figure(data=[
            go.Bar(name=agent1_name, x=metric_labels, y=[avg1[m] for m in metrics]),
            go.Bar(name=agent2_name, x=metric_labels, y=[avg2[m] for m in metrics])
        ])
        fig.update_layout(barmode='group', title="Agent Performance Comparison", yaxis_title="Average Score", template='plotly_white')
        return fig

    # Spider chart comparing two agents
    def create_radar_comparison(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:

        metrics = ['instruction_following', 'hallucination_score', 'assumption_control', 'coherence', 'accuracy']
        metric_labels = [m.replace('_', ' ').title() for m in metrics]

        def get_avg_scores(results):
            return [np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics]

        agent1_values = get_avg_scores(agent1_results)
        agent2_values = get_avg_scores(agent2_results)

        fig = go.Figure()
        fig.add_trace(go.Scatterpolar(r=agent1_values, theta=metric_labels, fill='toself', name=agent1_results[0].get('agent_name', 'Agent 1')))
        fig.add_trace(go.Scatterpolar(r=agent2_values, theta=metric_labels, fill='toself', name=agent2_results[0].get('agent_name', 'Agent 2')))
        fig.update_layout(polar=dict(radialaxis=dict(visible=True, range=[0, 1])), title="Agent Comparison - Radar Chart", template='plotly_white')
        return fig

    # Performance gap between two agents across metrics
    def create_performance_delta(self, agent1_results: List[Dict], agent2_results: List[Dict]) -> go.Figure:

        metrics = list(self.metric_colors.keys())

        def get_avg_scores(results):
            return {m: np.mean([r['scores'].get(m, 0) for r in results]) for m in metrics}

        avg1 = get_avg_scores(agent1_results)
        avg2 = get_avg_scores(agent2_results)
        deltas = [avg2[m] - avg1[m] for m in metrics]
        colors = ['#48bb78' if d >= 0 else '#fc8181' for d in deltas]
        metric_labels = [m.replace('_', ' ').title() for m in metrics]

        fig = go.Figure(go.Bar(x=metric_labels, y=deltas, marker_color=colors, text=[f'{d:+.2f}' for d in deltas]))
        fig.update_layout(title="Performance Delta (Agent 2 vs Agent 1)", yaxis_title="Score Difference", template='plotly_white')
        return fig
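A minimal sketch of rendering these charts (illustrative only; batch_results is assumed to come from AetherScoreEvaluator.evaluate_batch as in the earlier sketches, with 'agent_name' and 'task_id' attached to each result, and the agent names are invented):

# Hypothetical usage sketch for EvaluationVisualizer.
from visualizer_module import EvaluationVisualizer

viz = EvaluationVisualizer()

# Per-evaluation views
viz.create_spider_chart(batch_results[0]["scores"], agent_name="Agent A").show()
viz.create_score_bars(batch_results[0]["scores"], agent_name="Agent A").show()

# Batch-level views
viz.create_evaluation_heatmap(batch_results).show()
viz.create_performance_trends(batch_results).show()

# Two-agent comparisons
agent_a = [r for r in batch_results if r["agent_name"] == "Agent A"]
agent_b = [r for r in batch_results if r["agent_name"] == "Agent B"]
viz.create_radar_comparison(agent_a, agent_b).show()
viz.create_performance_delta(agent_a, agent_b).show()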