Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import gradio as gr | |
| import pandas as pd | |
| import numpy as np | |
| import json | |
| import plotly.graph_objects as go | |
| from typing import Dict, List, Optional | |
| import gc | |
| import traceback | |
| # Import evaluation modules | |
| from evaluator_module import AetherScoreEvaluator | |
| from visualizer_module import EvaluationVisualizer | |
| from report_generator import ReportGenerator | |
# --- Global Components & Storage ---
# Lazily-initialized singletons; populated on first use by the get_*()
# accessors below so that importing this module stays cheap.
evaluator: Optional[AetherScoreEvaluator] = None
visualizer: Optional[EvaluationVisualizer] = None
report_gen: Optional[ReportGenerator] = None
# In-memory storage for explainability feature
# NOTE(review): never read or written anywhere in this file — confirm the
# explainability feature still needs it before removing.
evaluation_storage: Dict[str, Dict] = {}
def get_evaluator():
    """Lazy accessor for the module-wide AetherScoreEvaluator singleton."""
    global evaluator
    if evaluator is not None:
        return evaluator
    evaluator = AetherScoreEvaluator()
    return evaluator
def get_visualizer():
    """Lazy accessor for the module-wide EvaluationVisualizer singleton."""
    global visualizer
    if visualizer is not None:
        return visualizer
    visualizer = EvaluationVisualizer()
    return visualizer
def get_report_generator():
    """Lazy accessor for the module-wide ReportGenerator singleton."""
    global report_gen
    if report_gen is not None:
        return report_gen
    report_gen = ReportGenerator()
    return report_gen
# CSS for better styling
# Injected into gr.Blocks(css=...) below; ".metric-card" is a reusable class
# for gradient metric panels (not referenced by the components in this file).
custom_css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.metric-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 20px;
    border-radius: 10px;
    color: white;
    margin: 10px 0;
}
"""
def process_single_evaluation(
    prompt: str,
    response: str,
    expected_answer: Optional[str] = None,
    agent_name: str = "Agent-1",
    task_type: str = "general"
) -> tuple[Dict, go.Figure, go.Figure, str]:
    """Evaluate one prompt/response pair and build the single-eval tab outputs.

    Args:
        prompt: The question/instruction that was given to the agent.
        response: The AI response to score.
        expected_answer: Optional reference answer for accuracy comparison;
            None or blank means "no reference available".
        agent_name: Label used in the generated charts.
        task_type: Task category forwarded to the evaluator.

    Returns:
        (scores_display, spider_chart, score_bars, explanation). Every failure
        path degrades to an empty dict, placeholder figures and a message so
        the Gradio callback never raises.
    """
    try:
        # Input validation: both fields are required.
        if not prompt or not response:
            empty_fig = go.Figure()
            empty_fig.update_layout(title="No data to display")
            return {}, empty_fig, empty_fig, "Please provide both prompt and response."
        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        # BUG FIX: expected_answer defaults to None, but the old code called
        # expected_answer.strip() unconditionally — an AttributeError that the
        # outer handler silently turned into a generic failure message.
        eval_result = eval_instance.evaluate_single(
            prompt=prompt,
            response=response,
            expected_answer=expected_answer if expected_answer and expected_answer.strip() else None,
            task_type=task_type
        )
        scores = eval_result.get("scores", {})
        # NOTE(review): an overall_score of exactly 0 is treated as failure —
        # confirm the evaluator can never legitimately produce 0.
        if not scores or scores.get('overall_score', 0) == 0:
            empty_fig = go.Figure()
            empty_fig.update_layout(title="Evaluation failed")
            return {}, empty_fig, empty_fig, "Evaluation failed. Please check your inputs."
        # Both charts degrade together to a single placeholder figure.
        try:
            spider_chart = vis_instance.create_spider_chart(scores, agent_name)
            score_bars = vis_instance.create_score_bars(scores, agent_name)
        except Exception as viz_error:
            print(f"Visualization error: {viz_error}")
            empty_fig = go.Figure()
            empty_fig.update_layout(title="Visualization failed")
            spider_chart = score_bars = empty_fig
        # The explanation is best-effort; a failure becomes part of the output.
        try:
            explanation = eval_instance.generate_explanation(scores)
        except Exception as exp_error:
            explanation = f"Explanation generation failed: {str(exp_error)}"
        # Pretty-print floats to three decimals for the JSON display component.
        scores_display = {}
        for k, v in scores.items():
            if isinstance(v, float):
                scores_display[k] = f"{v:.3f}"
            else:
                scores_display[k] = str(v)
        return scores_display, spider_chart, score_bars, explanation
    except Exception as e:
        error_msg = f"Single evaluation failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        empty_fig = go.Figure()
        empty_fig.update_layout(title="Error occurred")
        return {}, empty_fig, empty_fig, error_msg
def process_batch_evaluation(
    file_input,
    evaluation_mode: str = "comprehensive"
) -> tuple[go.Figure, go.Figure, go.Figure, str, pd.DataFrame]:
    """Run batch evaluation over an uploaded JSON/JSONL file.

    Args:
        file_input: Upload from gr.File — a str path (type="filepath") or a
            legacy tempfile wrapper exposing `.name`.
        evaluation_mode: Mode string forwarded to the evaluator
            ("comprehensive" or "fast").

    Returns:
        (heatmap, distribution, trends, report, leaderboard). Every failure
        path returns placeholder figures / an error message instead of
        raising, so the Gradio callback stays alive.
    """
    empty_fig = go.Figure()
    empty_fig.update_layout(title="No data available")
    empty_df = pd.DataFrame()
    try:
        if file_input is None:
            return empty_fig, empty_fig, empty_fig, "Please upload a file.", empty_df
        # BUG FIX: the wired gr.File component uses type="filepath", which
        # passes a plain str — the old code always read `file_input.name`
        # and crashed with AttributeError on every upload. Accept both the
        # str form and the legacy wrapper form.
        file_path = file_input if isinstance(file_input, str) else file_input.name
        # Load data with error handling.
        try:
            if file_path.endswith('.json'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            elif file_path.endswith('.jsonl'):
                data = []
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line_num, line in enumerate(f, 1):
                        try:
                            if line.strip():
                                data.append(json.loads(line))
                        except json.JSONDecodeError as jde:
                            # Skip malformed lines but keep the rest.
                            print(f"JSON error on line {line_num}: {jde}")
                            continue
            else:
                return empty_fig, empty_fig, empty_fig, "Unsupported file format. Please upload JSON or JSONL.", empty_df
        except Exception as file_error:
            return empty_fig, empty_fig, empty_fig, f"File reading error: {str(file_error)}", empty_df
        if not data:
            return empty_fig, empty_fig, empty_fig, "No valid data found in file.", empty_df
        # Keep only dict items carrying both required fields.
        required_fields = ['prompt', 'response']
        valid_data = []
        for i, item in enumerate(data):
            if not isinstance(item, dict):
                print(f"Item {i} is not a dictionary, skipping")
                continue
            if all(field in item for field in required_fields):
                valid_data.append(item)
            else:
                print(f"Item {i} missing required fields, skipping")
        if not valid_data:
            return empty_fig, empty_fig, empty_fig, "No valid items found. Each item must have 'prompt' and 'response' fields.", empty_df
        print(f"Processing {len(valid_data)} valid items...")
        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        report_instance = get_report_generator()
        results = eval_instance.evaluate_batch(valid_data, mode=evaluation_mode)
        if not results:
            return empty_fig, empty_fig, empty_fig, "Batch evaluation produced no results.", empty_df
        print(f"Successfully evaluated {len(results)} items")
        # Each output below degrades independently to its placeholder.
        try:
            heatmap = vis_instance.create_evaluation_heatmap(results)
        except Exception as e:
            print(f"Heatmap creation failed: {e}")
            heatmap = empty_fig
        try:
            distribution = vis_instance.create_score_distribution(results)
        except Exception as e:
            print(f"Distribution creation failed: {e}")
            distribution = empty_fig
        try:
            trends = vis_instance.create_performance_trends(results)
        except Exception as e:
            print(f"Trends creation failed: {e}")
            trends = empty_fig
        try:
            report = report_instance.generate_batch_report(results)
        except Exception as e:
            print(f"Report generation failed: {e}")
            report = f"Report generation failed: {str(e)}"
        try:
            leaderboard = create_leaderboard(results)
        except Exception as e:
            print(f"Leaderboard creation failed: {e}")
            leaderboard = empty_df
        # Free evaluation intermediates before returning to the UI thread.
        gc.collect()
        return heatmap, distribution, trends, report, leaderboard
    except Exception as e:
        error_msg = f"Batch evaluation failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        return empty_fig, empty_fig, empty_fig, error_msg, empty_df
def create_leaderboard(results: List[Dict]) -> pd.DataFrame:
    """Build a ranked agent leaderboard from batch evaluation results.

    Aggregates per-agent score lists into mean/max/min/std rows, ranks them
    by average score, and formats the numeric columns to three decimals.
    Returns an empty DataFrame with the standard columns whenever no usable
    scores exist; errors are logged and never propagated.
    """
    columns = ['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations']
    empty_board = pd.DataFrame(columns=columns)
    try:
        if not results:
            return empty_board
        agent_scores = get_evaluator().get_agent_scores_from_results(results)
        if not agent_scores:
            return empty_board
        rows = []
        for agent, raw_scores in agent_scores.items():
            # Drop non-numeric and NaN entries before aggregating.
            usable = [s for s in raw_scores if isinstance(s, (int, float)) and not np.isnan(s)]
            if not usable:
                continue
            rows.append({
                'Rank': 0,  # placeholder, filled in after sorting
                'Agent': str(agent),
                'Avg Score': np.mean(usable),
                'Max Score': np.max(usable),
                'Min Score': np.min(usable),
                'Std Dev': np.std(usable) if len(usable) > 1 else 0.0,
                'Evaluations': len(usable),
            })
        if not rows:
            return empty_board
        board = pd.DataFrame(rows).sort_values('Avg Score', ascending=False)
        board['Rank'] = range(1, len(board) + 1)
        # Render statistics as fixed-precision strings for display.
        for col in ('Avg Score', 'Max Score', 'Min Score', 'Std Dev'):
            board[col] = board[col].apply(lambda v: f"{v:.3f}" if pd.notna(v) else "N/A")
        return board
    except Exception as e:
        print(f"Leaderboard creation error: {e}")
        return empty_board
def compare_agents(
    agent1_file,
    agent2_file,
) -> tuple[go.Figure, go.Figure, go.Figure, str]:
    """Compare two agents' uploaded result files head-to-head.

    Args:
        agent1_file: Upload for agent 1 — str path (gr.File type="filepath")
            or a legacy tempfile wrapper exposing `.name`.
        agent2_file: Same, for agent 2.

    Returns:
        (comparison_chart, performance_delta, radar_comparison, report);
        every failure path returns placeholder figures and a message so the
        Gradio callback never raises.
    """
    empty_fig = go.Figure()
    empty_fig.update_layout(title="No data available")
    try:
        if not agent1_file or not agent2_file:
            return empty_fig, empty_fig, empty_fig, "Please upload files for both agents."

        def load_agent_data(file):
            # BUG FIX: the wired gr.File components use type="filepath",
            # which passes a plain str — the old code always read
            # `file.name` and crashed with AttributeError. Accept both the
            # str form and the legacy wrapper form.
            path = file if isinstance(file, str) else file.name
            try:
                if path.endswith('.json'):
                    with open(path, 'r', encoding='utf-8') as f:
                        return json.load(f)
                elif path.endswith('.jsonl'):
                    data = []
                    with open(path, 'r', encoding='utf-8') as f:
                        for line in f:
                            if line.strip():
                                data.append(json.loads(line))
                    return data
                else:
                    raise ValueError("Unsupported file format")
            except Exception as e:
                raise ValueError(f"Error loading file {path}: {str(e)}")

        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        report_instance = get_report_generator()
        # Load data for both agents.
        agent1_data = load_agent_data(agent1_file)
        agent2_data = load_agent_data(agent2_file)
        if not agent1_data or not agent2_data:
            return empty_fig, empty_fig, empty_fig, "One or both agent files contain no valid data."
        # Evaluate both agents with the full metric set.
        agent1_results = eval_instance.evaluate_batch(agent1_data, mode="comprehensive")
        agent2_results = eval_instance.evaluate_batch(agent2_data, mode="comprehensive")
        if not agent1_results or not agent2_results:
            return empty_fig, empty_fig, empty_fig, "Failed to evaluate one or both agents."
        # Each chart degrades independently to the placeholder figure.
        try:
            comparison_chart = vis_instance.create_agent_comparison(agent1_results, agent2_results)
        except Exception as e:
            print(f"Comparison chart creation failed: {e}")
            comparison_chart = empty_fig
        try:
            performance_diff = vis_instance.create_performance_delta(agent1_results, agent2_results)
        except Exception as e:
            print(f"Performance difference chart creation failed: {e}")
            performance_diff = empty_fig
        try:
            statistical_analysis = vis_instance.create_radar_comparison(agent1_results, agent2_results)
        except Exception as e:
            print(f"Statistical analysis chart creation failed: {e}")
            statistical_analysis = empty_fig
        # The report is best-effort; a failure becomes part of the output.
        try:
            comparison_report = report_instance.generate_comparison_report(agent1_results, agent2_results)
        except Exception as e:
            print(f"Comparison report generation failed: {e}")
            comparison_report = f"Comparison report generation failed: {str(e)}"
        return comparison_chart, performance_diff, statistical_analysis, comparison_report
    except Exception as e:
        error_msg = f"Agent comparison failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        return empty_fig, empty_fig, empty_fig, error_msg
# --- Gradio Interface Setup ---
def create_gradio_interface():
    """Create and return the Gradio interface.

    Builds a four-tab Blocks app (single evaluation, batch evaluation, agent
    comparison, help) and wires each action button to the handler functions
    defined above. The caller is responsible for invoking `.launch()`.

    NOTE(review): the emoji in the labels/markdown below look mojibake'd
    (e.g. "π―") — likely an encoding artifact; confirm the intended UTF-8
    characters before shipping. Left byte-identical here.
    """
    with gr.Blocks(css=custom_css, title="AetherScore Evaluation Dashboard") as demo:
        gr.Markdown("""
        # π― AetherScore Evaluation Dashboard
        Advanced AI response evaluation system with comprehensive metrics and visualizations.
        """)
        with gr.Tabs():
            # --- Tab 1: evaluate a single prompt/response pair ---
            with gr.TabItem("π Single Evaluation"):
                with gr.Row():
                    with gr.Column(scale=1):
                        prompt_input = gr.Textbox(
                            label="Prompt",
                            placeholder="Enter the prompt/question here...",
                            lines=3
                        )
                        response_input = gr.Textbox(
                            label="AI Response",
                            placeholder="Enter the AI response to evaluate...",
                            lines=5
                        )
                        expected_input = gr.Textbox(
                            label="Expected Answer (Optional)",
                            placeholder="Enter expected answer for accuracy comparison...",
                            lines=2
                        )
                        with gr.Row():
                            agent_name_input = gr.Textbox(
                                label="Agent Name",
                                value="Agent-1",
                                scale=1
                            )
                            task_type_input = gr.Dropdown(
                                label="Task Type",
                                choices=["general", "reasoning", "creative", "factual"],
                                value="general",
                                scale=1
                            )
                        evaluate_btn = gr.Button("π Evaluate", variant="primary")
                    with gr.Column(scale=2):
                        scores_display = gr.JSON(label="π Evaluation Scores")
                        explanation_output = gr.Textbox(
                            label="π‘ Detailed Explanation",
                            lines=4,
                            interactive=False
                        )
                with gr.Row():
                    spider_chart = gr.Plot(label="πΈοΈ Performance Spider Chart")
                    score_bars = gr.Plot(label="π Score Breakdown")
                # Outputs map 1:1 onto process_single_evaluation's return tuple.
                evaluate_btn.click(
                    fn=process_single_evaluation,
                    inputs=[prompt_input, response_input, expected_input, agent_name_input, task_type_input],
                    outputs=[scores_display, spider_chart, score_bars, explanation_output]
                )
            # --- Tab 2: batch evaluation from an uploaded JSON/JSONL file ---
            with gr.TabItem("π Batch Evaluation"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # type="filepath" hands the handler a plain str path.
                        file_input = gr.File(
                            label="Upload Evaluation Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                        eval_mode = gr.Dropdown(
                            label="Evaluation Mode",
                            choices=["comprehensive", "fast"],
                            value="comprehensive"
                        )
                        batch_btn = gr.Button("π Start Batch Evaluation", variant="primary")
                    with gr.Column(scale=2):
                        batch_report = gr.Textbox(
                            label="π Evaluation Report",
                            lines=8,
                            interactive=False
                        )
                with gr.Row():
                    heatmap_plot = gr.Plot(label="π₯ Performance Heatmap")
                    distribution_plot = gr.Plot(label="π Score Distribution")
                with gr.Row():
                    trends_plot = gr.Plot(label="π Performance Trends")
                    leaderboard_df = gr.Dataframe(label="π Leaderboard")
                # Outputs map 1:1 onto process_batch_evaluation's return tuple.
                batch_btn.click(
                    fn=process_batch_evaluation,
                    inputs=[file_input, eval_mode],
                    outputs=[heatmap_plot, distribution_plot, trends_plot, batch_report, leaderboard_df]
                )
            # --- Tab 3: head-to-head comparison of two agents ---
            with gr.TabItem("βοΈ Agent Comparison"):
                with gr.Row():
                    with gr.Column():
                        agent1_file = gr.File(
                            label="Agent 1 Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                    with gr.Column():
                        agent2_file = gr.File(
                            label="Agent 2 Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                compare_btn = gr.Button("π Compare Agents", variant="primary")
                with gr.Row():
                    comparison_report = gr.Textbox(
                        label="π Comparison Report",
                        lines=10,
                        interactive=False
                    )
                with gr.Row():
                    comparison_chart = gr.Plot(label="π Agent Comparison")
                    performance_diff = gr.Plot(label="π Performance Delta")
                with gr.Row():
                    radar_comparison = gr.Plot(label="πΈοΈ Radar Comparison")
                # Outputs map 1:1 onto compare_agents' return tuple.
                compare_btn.click(
                    fn=compare_agents,
                    inputs=[agent1_file, agent2_file],
                    outputs=[comparison_chart, performance_diff, radar_comparison, comparison_report]
                )
            # --- Tab 4: static usage documentation ---
            with gr.TabItem("β Help & Documentation"):
                gr.Markdown("""
                ## π How to Use AetherScore
                ### Single Evaluation
                1. Enter your prompt and AI response
                2. Optionally provide an expected answer for accuracy comparison
                3. Choose agent name and task type
                4. Click "Evaluate" to get comprehensive scores
                ### Batch Evaluation
                1. Upload a JSON/JSONL file with evaluation data
                2. Each item should have: `prompt`, `response`, optional `expected_answer`, `agent_name`, `task_id`
                3. Choose evaluation mode and start processing
                4. View results in charts and leaderboard
                ### Agent Comparison
                1. Upload evaluation data files for two different agents
                2. Click "Compare Agents" to see detailed performance analysis
                3. Review comparison charts and statistical analysis
                ### Evaluation Metrics
                - **Instruction Following**: How well the response follows prompt constraints
                - **Hallucination Score**: Detection of fabricated or unverified information
                - **Assumption Control**: Management of uncertain or speculative content
                - **Coherence**: Logical flow and consistency of the response
                - **Accuracy**: Similarity to expected answer (when provided)
                - **Overall Score**: Weighted combination of all metrics
                ### Data Format Example
                ```json
                {
                    "prompt": "Explain quantum computing",
                    "response": "Quantum computing uses quantum bits...",
                    "expected_answer": "Quantum computing leverages quantum mechanics...",
                    "agent_name": "GPT-4",
                    "task_id": "task_001",
                    "task_type": "factual"
                }
                ```
                """)
    return demo
# Create and launch the application
if __name__ == "__main__":
    demo = create_gradio_interface()
    # NOTE(review): share=True opens a public Gradio tunnel — confirm this is
    # intended for deployments outside Hugging Face Spaces.
    demo.launch(
        share=True,
        server_name="0.0.0.0",  # bind all interfaces (container-friendly)
        server_port=7860,       # default Gradio / Spaces port
        show_error=True         # surface handler tracebacks in the UI
    )