from __future__ import annotations import gradio as gr import pandas as pd import numpy as np import json import plotly.graph_objects as go from typing import Dict, List, Optional import gc import traceback # Import evaluation modules from evaluator_module import AetherScoreEvaluator from visualizer_module import EvaluationVisualizer from report_generator import ReportGenerator # --- Global Components & Storage --- evaluator = None visualizer = None report_gen = None # In-memory storage for explainability feature evaluation_storage: Dict[str, Dict] = {} def get_evaluator(): """Get or create evaluator instance""" global evaluator if evaluator is None: evaluator = AetherScoreEvaluator() return evaluator def get_visualizer(): """Get or create visualizer instance""" global visualizer if visualizer is None: visualizer = EvaluationVisualizer() return visualizer def get_report_generator(): """Get or create report generator instance""" global report_gen if report_gen is None: report_gen = ReportGenerator() return report_gen # CSS for better styling custom_css = """ .gradio-container { font-family: 'Inter', sans-serif; } .metric-card { background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; color: white; margin: 10px 0; } """ def process_single_evaluation( prompt: str, response: str, expected_answer: Optional[str] = None, agent_name: str = "Agent-1", task_type: str = "general" ) -> tuple[Dict, go.Figure, go.Figure, str]: """Process single evaluation with better error handling""" try: # Input validation if not prompt or not response: empty_fig = go.Figure() empty_fig.update_layout(title="No data to display") return {}, empty_fig, empty_fig, "Please provide both prompt and response." # Get evaluator instance eval_instance = get_evaluator() vis_instance = get_visualizer() # Evaluate the response eval_result = eval_instance.evaluate_single( prompt=prompt, response=response, expected_answer=expected_answer if expected_answer.strip() else None, task_type=task_type ) scores = eval_result.get("scores", {}) if not scores or scores.get('overall_score', 0) == 0: empty_fig = go.Figure() empty_fig.update_layout(title="Evaluation failed") return {}, empty_fig, empty_fig, "Evaluation failed. Please check your inputs." # Generate visualizations try: spider_chart = vis_instance.create_spider_chart(scores, agent_name) score_bars = vis_instance.create_score_bars(scores, agent_name) except Exception as viz_error: print(f"Visualization error: {viz_error}") empty_fig = go.Figure() empty_fig.update_layout(title="Visualization failed") spider_chart = score_bars = empty_fig # Generate explanation try: explanation = eval_instance.generate_explanation(scores) except Exception as exp_error: explanation = f"Explanation generation failed: {str(exp_error)}" # Format scores for display scores_display = {} for k, v in scores.items(): if isinstance(v, float): scores_display[k] = f"{v:.3f}" else: scores_display[k] = str(v) return scores_display, spider_chart, score_bars, explanation except Exception as e: error_msg = f"Single evaluation failed: {str(e)}" print(f"Error: {error_msg}") print(traceback.format_exc()) empty_fig = go.Figure() empty_fig.update_layout(title="Error occurred") return {}, empty_fig, empty_fig, error_msg def process_batch_evaluation( file_input, evaluation_mode: str = "comprehensive" ) -> tuple[go.Figure, go.Figure, go.Figure, str, pd.DataFrame]: """Process batch evaluation with robust error handling""" empty_fig = go.Figure() empty_fig.update_layout(title="No data available") empty_df = pd.DataFrame() try: # File validation if file_input is None: return empty_fig, empty_fig, empty_fig, "Please upload a file.", empty_df # Load data with error handling try: if file_input.name.endswith('.json'): with open(file_input.name, 'r', encoding='utf-8') as f: data = json.load(f) elif file_input.name.endswith('.jsonl'): data = [] with open(file_input.name, 'r', encoding='utf-8') as f: for line_num, line in enumerate(f, 1): try: if line.strip(): data.append(json.loads(line)) except json.JSONDecodeError as jde: print(f"JSON error on line {line_num}: {jde}") continue else: return empty_fig, empty_fig, empty_fig, "Unsupported file format. Please upload JSON or JSONL.", empty_df except Exception as file_error: return empty_fig, empty_fig, empty_fig, f"File reading error: {str(file_error)}", empty_df if not data: return empty_fig, empty_fig, empty_fig, "No valid data found in file.", empty_df # Validate data structure required_fields = ['prompt', 'response'] valid_data = [] for i, item in enumerate(data): if not isinstance(item, dict): print(f"Item {i} is not a dictionary, skipping") continue if all(field in item for field in required_fields): valid_data.append(item) else: print(f"Item {i} missing required fields, skipping") if not valid_data: return empty_fig, empty_fig, empty_fig, "No valid items found. Each item must have 'prompt' and 'response' fields.", empty_df print(f"Processing {len(valid_data)} valid items...") # Get instances eval_instance = get_evaluator() vis_instance = get_visualizer() report_instance = get_report_generator() # Process batch evaluation results = eval_instance.evaluate_batch(valid_data, mode=evaluation_mode) if not results: return empty_fig, empty_fig, empty_fig, "Batch evaluation produced no results.", empty_df print(f"Successfully evaluated {len(results)} items") # Generate visualizations with error handling try: heatmap = vis_instance.create_evaluation_heatmap(results) except Exception as e: print(f"Heatmap creation failed: {e}") heatmap = empty_fig try: distribution = vis_instance.create_score_distribution(results) except Exception as e: print(f"Distribution creation failed: {e}") distribution = empty_fig try: trends = vis_instance.create_performance_trends(results) except Exception as e: print(f"Trends creation failed: {e}") trends = empty_fig try: report = report_instance.generate_batch_report(results) except Exception as e: print(f"Report generation failed: {e}") report = f"Report generation failed: {str(e)}" try: leaderboard = create_leaderboard(results) except Exception as e: print(f"Leaderboard creation failed: {e}") leaderboard = empty_df # Cleanup gc.collect() return heatmap, distribution, trends, report, leaderboard except Exception as e: error_msg = f"Batch evaluation failed: {str(e)}" print(f"Error: {error_msg}") print(traceback.format_exc()) return empty_fig, empty_fig, empty_fig, error_msg, empty_df def create_leaderboard(results: List[Dict]) -> pd.DataFrame: """Create a leaderboard from evaluation results with robust error handling""" try: if not results: return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations']) eval_instance = get_evaluator() agent_scores = eval_instance.get_agent_scores_from_results(results) if not agent_scores: return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations']) leaderboard_data = [] for agent, scores in agent_scores.items(): if not scores: # Skip agents with no valid scores continue # Filter out invalid scores valid_scores = [s for s in scores if isinstance(s, (int, float)) and not np.isnan(s)] if not valid_scores: continue leaderboard_data.append({ 'Rank': 0, 'Agent': str(agent), 'Avg Score': np.mean(valid_scores), 'Max Score': np.max(valid_scores), 'Min Score': np.min(valid_scores), 'Std Dev': np.std(valid_scores) if len(valid_scores) > 1 else 0.0, 'Evaluations': len(valid_scores) }) if not leaderboard_data: return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations']) df = pd.DataFrame(leaderboard_data) # Sort by average score df = df.sort_values('Avg Score', ascending=False) df['Rank'] = range(1, len(df) + 1) # Format numeric columns for col in ['Avg Score', 'Max Score', 'Min Score', 'Std Dev']: if col in df.columns: df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else "N/A") return df except Exception as e: print(f"Leaderboard creation error: {e}") return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations']) def compare_agents( agent1_file, agent2_file, ) -> tuple[go.Figure, go.Figure, go.Figure, str]: """Compare two agents' performance with error handling""" empty_fig = go.Figure() empty_fig.update_layout(title="No data available") try: if not agent1_file or not agent2_file: return empty_fig, empty_fig, empty_fig, "Please upload files for both agents." def load_agent_data(file): try: if file.name.endswith('.json'): with open(file.name, 'r', encoding='utf-8') as f: return json.load(f) elif file.name.endswith('.jsonl'): data = [] with open(file.name, 'r', encoding='utf-8') as f: for line in f: if line.strip(): data.append(json.loads(line)) return data else: raise ValueError("Unsupported file format") except Exception as e: raise ValueError(f"Error loading file {file.name}: {str(e)}") eval_instance = get_evaluator() vis_instance = get_visualizer() report_instance = get_report_generator() # Load data for both agents agent1_data = load_agent_data(agent1_file) agent2_data = load_agent_data(agent2_file) # Validate data if not agent1_data or not agent2_data: return empty_fig, empty_fig, empty_fig, "One or both agent files contain no valid data." # Evaluate both agents agent1_results = eval_instance.evaluate_batch(agent1_data, mode="comprehensive") agent2_results = eval_instance.evaluate_batch(agent2_data, mode="comprehensive") if not agent1_results or not agent2_results: return empty_fig, empty_fig, empty_fig, "Failed to evaluate one or both agents." # Generate comparison visualizations try: comparison_chart = vis_instance.create_agent_comparison(agent1_results, agent2_results) except Exception as e: print(f"Comparison chart creation failed: {e}") comparison_chart = empty_fig try: performance_diff = vis_instance.create_performance_delta(agent1_results, agent2_results) except Exception as e: print(f"Performance difference chart creation failed: {e}") performance_diff = empty_fig try: statistical_analysis = vis_instance.create_radar_comparison(agent1_results, agent2_results) except Exception as e: print(f"Statistical analysis chart creation failed: {e}") statistical_analysis = empty_fig # Generate comparison report try: comparison_report = report_instance.generate_comparison_report(agent1_results, agent2_results) except Exception as e: print(f"Comparison report generation failed: {e}") comparison_report = f"Comparison report generation failed: {str(e)}" return comparison_chart, performance_diff, statistical_analysis, comparison_report except Exception as e: error_msg = f"Agent comparison failed: {str(e)}" print(f"Error: {error_msg}") print(traceback.format_exc()) return empty_fig, empty_fig, empty_fig, error_msg # --- Gradio Interface Setup --- def create_gradio_interface(): """Create and return the Gradio interface""" with gr.Blocks(css=custom_css, title="AetherScore Evaluation Dashboard") as demo: gr.Markdown(""" # 🎯 AetherScore Evaluation Dashboard Advanced AI response evaluation system with comprehensive metrics and visualizations. """) with gr.Tabs(): # Single Evaluation Tab with gr.TabItem("πŸ” Single Evaluation"): with gr.Row(): with gr.Column(scale=1): prompt_input = gr.Textbox( label="Prompt", placeholder="Enter the prompt/question here...", lines=3 ) response_input = gr.Textbox( label="AI Response", placeholder="Enter the AI response to evaluate...", lines=5 ) expected_input = gr.Textbox( label="Expected Answer (Optional)", placeholder="Enter expected answer for accuracy comparison...", lines=2 ) with gr.Row(): agent_name_input = gr.Textbox( label="Agent Name", value="Agent-1", scale=1 ) task_type_input = gr.Dropdown( label="Task Type", choices=["general", "reasoning", "creative", "factual"], value="general", scale=1 ) evaluate_btn = gr.Button("πŸ” Evaluate", variant="primary") with gr.Column(scale=2): scores_display = gr.JSON(label="πŸ“Š Evaluation Scores") explanation_output = gr.Textbox( label="πŸ’‘ Detailed Explanation", lines=4, interactive=False ) with gr.Row(): spider_chart = gr.Plot(label="πŸ•ΈοΈ Performance Spider Chart") score_bars = gr.Plot(label="πŸ“Š Score Breakdown") evaluate_btn.click( fn=process_single_evaluation, inputs=[prompt_input, response_input, expected_input, agent_name_input, task_type_input], outputs=[scores_display, spider_chart, score_bars, explanation_output] ) # Batch Evaluation Tab with gr.TabItem("πŸ“ Batch Evaluation"): with gr.Row(): with gr.Column(scale=1): file_input = gr.File( label="Upload Evaluation Data", file_types=[".json", ".jsonl"], type="filepath" ) eval_mode = gr.Dropdown( label="Evaluation Mode", choices=["comprehensive", "fast"], value="comprehensive" ) batch_btn = gr.Button("πŸš€ Start Batch Evaluation", variant="primary") with gr.Column(scale=2): batch_report = gr.Textbox( label="πŸ“‹ Evaluation Report", lines=8, interactive=False ) with gr.Row(): heatmap_plot = gr.Plot(label="πŸ”₯ Performance Heatmap") distribution_plot = gr.Plot(label="πŸ“ˆ Score Distribution") with gr.Row(): trends_plot = gr.Plot(label="πŸ“Š Performance Trends") leaderboard_df = gr.Dataframe(label="πŸ† Leaderboard") batch_btn.click( fn=process_batch_evaluation, inputs=[file_input, eval_mode], outputs=[heatmap_plot, distribution_plot, trends_plot, batch_report, leaderboard_df] ) # Agent Comparison Tab with gr.TabItem("βš”οΈ Agent Comparison"): with gr.Row(): with gr.Column(): agent1_file = gr.File( label="Agent 1 Data", file_types=[".json", ".jsonl"], type="filepath" ) with gr.Column(): agent2_file = gr.File( label="Agent 2 Data", file_types=[".json", ".jsonl"], type="filepath" ) compare_btn = gr.Button("πŸ” Compare Agents", variant="primary") with gr.Row(): comparison_report = gr.Textbox( label="πŸ“Š Comparison Report", lines=10, interactive=False ) with gr.Row(): comparison_chart = gr.Plot(label="πŸ“Š Agent Comparison") performance_diff = gr.Plot(label="πŸ“ˆ Performance Delta") with gr.Row(): radar_comparison = gr.Plot(label="πŸ•ΈοΈ Radar Comparison") compare_btn.click( fn=compare_agents, inputs=[agent1_file, agent2_file], outputs=[comparison_chart, performance_diff, radar_comparison, comparison_report] ) # Help & Documentation Tab with gr.TabItem("❓ Help & Documentation"): gr.Markdown(""" ## πŸ“– How to Use AetherScore ### Single Evaluation 1. Enter your prompt and AI response 2. Optionally provide an expected answer for accuracy comparison 3. Choose agent name and task type 4. Click "Evaluate" to get comprehensive scores ### Batch Evaluation 1. Upload a JSON/JSONL file with evaluation data 2. Each item should have: `prompt`, `response`, optional `expected_answer`, `agent_name`, `task_id` 3. Choose evaluation mode and start processing 4. View results in charts and leaderboard ### Agent Comparison 1. Upload evaluation data files for two different agents 2. Click "Compare Agents" to see detailed performance analysis 3. Review comparison charts and statistical analysis ### Evaluation Metrics - **Instruction Following**: How well the response follows prompt constraints - **Hallucination Score**: Detection of fabricated or unverified information - **Assumption Control**: Management of uncertain or speculative content - **Coherence**: Logical flow and consistency of the response - **Accuracy**: Similarity to expected answer (when provided) - **Overall Score**: Weighted combination of all metrics ### Data Format Example ```json { "prompt": "Explain quantum computing", "response": "Quantum computing uses quantum bits...", "expected_answer": "Quantum computing leverages quantum mechanics...", "agent_name": "GPT-4", "task_id": "task_001", "task_type": "factual" } ``` """) return demo # Create and launch the application if __name__ == "__main__": demo = create_gradio_interface() demo.launch( share=True, server_name="0.0.0.0", server_port=7860, show_error=True )