# e6test / app.py — Hugging Face Space application file.
# (Scraped page header: author aaditya-raj, commit "Update app.py",
# revision 672a8ff, verified.)
from __future__ import annotations
import gradio as gr
import pandas as pd
import numpy as np
import json
import plotly.graph_objects as go
from typing import Dict, List, Optional
import gc
import traceback
# Import evaluation modules
from evaluator_module import AetherScoreEvaluator
from visualizer_module import EvaluationVisualizer
from report_generator import ReportGenerator
# --- Global Components & Storage ---
# Lazily-constructed singletons; populated on first use by get_evaluator(),
# get_visualizer() and get_report_generator() so importing this module stays cheap.
evaluator: Optional[AetherScoreEvaluator] = None
visualizer: Optional[EvaluationVisualizer] = None
report_gen: Optional[ReportGenerator] = None
# In-memory storage for explainability feature
# NOTE(review): never read or written anywhere in this file — presumably
# reserved for a planned explainability feature; confirm before removing.
evaluation_storage: Dict[str, Dict] = {}
def get_evaluator():
    """Return the process-wide AetherScoreEvaluator, building it on first call."""
    global evaluator
    if evaluator is not None:
        return evaluator
    evaluator = AetherScoreEvaluator()
    return evaluator
def get_visualizer():
    """Return the shared EvaluationVisualizer, creating it lazily."""
    global visualizer
    if visualizer is not None:
        return visualizer
    visualizer = EvaluationVisualizer()
    return visualizer
def get_report_generator():
    """Return the shared ReportGenerator, creating it lazily."""
    global report_gen
    if report_gen is not None:
        return report_gen
    report_gen = ReportGenerator()
    return report_gen
# CSS for better styling
custom_css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 20px;
border-radius: 10px;
color: white;
margin: 10px 0;
}
"""
def process_single_evaluation(
    prompt: str,
    response: str,
    expected_answer: Optional[str] = None,
    agent_name: str = "Agent-1",
    task_type: str = "general"
) -> tuple[Dict, go.Figure, go.Figure, str]:
    """Evaluate one prompt/response pair and build its visualizations.

    Args:
        prompt: The question/instruction the agent was given.
        response: The agent's answer to evaluate.
        expected_answer: Optional reference answer for accuracy scoring;
            None and whitespace-only strings are treated as "not provided".
        agent_name: Label used in the chart titles.
        task_type: Task category forwarded to the evaluator.

    Returns:
        (scores-for-display dict, spider chart, bar chart, explanation text).
        On any failure the figures are empty placeholders and the last
        element carries the error/help message instead.
    """
    def placeholder(title: str) -> go.Figure:
        # Empty figure shown whenever there is nothing meaningful to plot.
        fig = go.Figure()
        fig.update_layout(title=title)
        return fig

    try:
        # Input validation: both fields are mandatory.
        if not prompt or not response:
            fig = placeholder("No data to display")
            return {}, fig, fig, "Please provide both prompt and response."
        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        # BUG FIX: expected_answer defaults to None, and None.strip() raised
        # AttributeError; treat None and blank strings both as "absent".
        has_expected = bool(expected_answer and expected_answer.strip())
        eval_result = eval_instance.evaluate_single(
            prompt=prompt,
            response=response,
            expected_answer=expected_answer if has_expected else None,
            task_type=task_type
        )
        scores = eval_result.get("scores", {})
        # An all-zero overall score is treated as an evaluator failure.
        if not scores or scores.get('overall_score', 0) == 0:
            fig = placeholder("Evaluation failed")
            return {}, fig, fig, "Evaluation failed. Please check your inputs."
        # Visualizations are best-effort: fall back to a placeholder on error.
        try:
            spider_chart = vis_instance.create_spider_chart(scores, agent_name)
            score_bars = vis_instance.create_score_bars(scores, agent_name)
        except Exception as viz_error:
            print(f"Visualization error: {viz_error}")
            spider_chart = score_bars = placeholder("Visualization failed")
        # Explanation is also best-effort.
        try:
            explanation = eval_instance.generate_explanation(scores)
        except Exception as exp_error:
            explanation = f"Explanation generation failed: {str(exp_error)}"
        # Round floats to 3 decimals for the gr.JSON display component.
        scores_display = {
            k: f"{v:.3f}" if isinstance(v, float) else str(v)
            for k, v in scores.items()
        }
        return scores_display, spider_chart, score_bars, explanation
    except Exception as e:
        error_msg = f"Single evaluation failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        fig = placeholder("Error occurred")
        return {}, fig, fig, error_msg
def process_batch_evaluation(
    file_input,
    evaluation_mode: str = "comprehensive"
) -> tuple[go.Figure, go.Figure, go.Figure, str, pd.DataFrame]:
    """Evaluate every record in an uploaded JSON/JSONL file.

    Args:
        file_input: Upload from gr.File — either a str path (type="filepath")
            or a file-like object exposing a .name attribute.
        evaluation_mode: "comprehensive" or "fast", forwarded to the evaluator.

    Returns:
        (heatmap, distribution, trends, report text, leaderboard DataFrame);
        placeholder figures / empty frame plus an error message on failure.
    """
    empty_fig = go.Figure()
    empty_fig.update_layout(title="No data available")
    empty_df = pd.DataFrame()
    try:
        if file_input is None:
            return empty_fig, empty_fig, empty_fig, "Please upload a file.", empty_df
        # BUG FIX: the UI declares gr.File(type="filepath"), which delivers a
        # plain str path with no .name attribute; accept both str paths and
        # file-like objects.
        file_path = file_input if isinstance(file_input, str) else file_input.name
        # Load data with error handling
        try:
            if file_path.endswith('.json'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            elif file_path.endswith('.jsonl'):
                # JSONL: one record per line; bad lines are logged and skipped.
                data = []
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line_num, line in enumerate(f, 1):
                        try:
                            if line.strip():
                                data.append(json.loads(line))
                        except json.JSONDecodeError as jde:
                            print(f"JSON error on line {line_num}: {jde}")
                            continue
            else:
                return empty_fig, empty_fig, empty_fig, "Unsupported file format. Please upload JSON or JSONL.", empty_df
        except Exception as file_error:
            return empty_fig, empty_fig, empty_fig, f"File reading error: {str(file_error)}", empty_df
        if not data:
            return empty_fig, empty_fig, empty_fig, "No valid data found in file.", empty_df
        # Validate data structure: every record must be a dict carrying at
        # least 'prompt' and 'response'; anything else is logged and dropped.
        required_fields = ['prompt', 'response']
        valid_data = []
        for i, item in enumerate(data):
            if not isinstance(item, dict):
                print(f"Item {i} is not a dictionary, skipping")
                continue
            if all(field in item for field in required_fields):
                valid_data.append(item)
            else:
                print(f"Item {i} missing required fields, skipping")
        if not valid_data:
            return empty_fig, empty_fig, empty_fig, "No valid items found. Each item must have 'prompt' and 'response' fields.", empty_df
        print(f"Processing {len(valid_data)} valid items...")
        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        report_instance = get_report_generator()
        results = eval_instance.evaluate_batch(valid_data, mode=evaluation_mode)
        if not results:
            return empty_fig, empty_fig, empty_fig, "Batch evaluation produced no results.", empty_df
        print(f"Successfully evaluated {len(results)} items")

        def best_effort(label, producer, fallback):
            # Post-processing steps are optional: log the failure and fall
            # back instead of discarding the whole batch result.
            try:
                return producer()
            except Exception as step_error:
                print(f"{label} failed: {step_error}")
                return fallback

        heatmap = best_effort("Heatmap creation",
                              lambda: vis_instance.create_evaluation_heatmap(results), empty_fig)
        distribution = best_effort("Distribution creation",
                                   lambda: vis_instance.create_score_distribution(results), empty_fig)
        trends = best_effort("Trends creation",
                             lambda: vis_instance.create_performance_trends(results), empty_fig)
        # The report fallback carries the error text into the UI, so it is
        # handled explicitly rather than through best_effort().
        try:
            report = report_instance.generate_batch_report(results)
        except Exception as e:
            print(f"Report generation failed: {e}")
            report = f"Report generation failed: {str(e)}"
        leaderboard = best_effort("Leaderboard creation",
                                  lambda: create_leaderboard(results), empty_df)
        # Free evaluator intermediates before handing large figures back.
        gc.collect()
        return heatmap, distribution, trends, report, leaderboard
    except Exception as e:
        error_msg = f"Batch evaluation failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        return empty_fig, empty_fig, empty_fig, error_msg, empty_df
def create_leaderboard(results: List[Dict]) -> pd.DataFrame:
    """Build a per-agent leaderboard DataFrame from evaluation results.

    Args:
        results: Per-item evaluation result dicts, aggregated per agent via
            the evaluator's get_agent_scores_from_results().

    Returns:
        DataFrame with columns Rank/Agent/Avg Score/Max Score/Min Score/
        Std Dev/Evaluations, sorted by average score; numeric columns are
        pre-formatted as 3-decimal strings for display. On any failure or
        when there is no usable data, an empty frame with the same columns.
    """
    # Single source of truth for the schema (was repeated four times).
    columns = ['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations']
    try:
        if not results:
            return pd.DataFrame(columns=columns)
        eval_instance = get_evaluator()
        agent_scores = eval_instance.get_agent_scores_from_results(results)
        if not agent_scores:
            return pd.DataFrame(columns=columns)
        rows = []
        for agent, scores in agent_scores.items():
            # Keep only real numbers; drop NaNs and non-numeric entries.
            valid_scores = [s for s in scores if isinstance(s, (int, float)) and not np.isnan(s)]
            if not valid_scores:
                continue
            rows.append({
                'Rank': 0,  # placeholder; assigned after sorting below
                'Agent': str(agent),
                'Avg Score': np.mean(valid_scores),
                'Max Score': np.max(valid_scores),
                'Min Score': np.min(valid_scores),
                # std of a single sample is defined as 0 here
                'Std Dev': np.std(valid_scores) if len(valid_scores) > 1 else 0.0,
                'Evaluations': len(valid_scores)
            })
        if not rows:
            return pd.DataFrame(columns=columns)
        df = pd.DataFrame(rows)
        # Best average first; rank follows the sorted order.
        df = df.sort_values('Avg Score', ascending=False)
        df['Rank'] = range(1, len(df) + 1)
        # Format numeric columns as fixed-precision strings for display.
        for col in ['Avg Score', 'Max Score', 'Min Score', 'Std Dev']:
            if col in df.columns:
                df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else "N/A")
        return df
    except Exception as e:
        print(f"Leaderboard creation error: {e}")
        return pd.DataFrame(columns=columns)
def compare_agents(
    agent1_file,
    agent2_file,
) -> tuple[go.Figure, go.Figure, go.Figure, str]:
    """Evaluate two uploaded datasets and produce side-by-side comparisons.

    Args:
        agent1_file: gr.File upload for agent 1 — a str path (type="filepath")
            or a file-like object with a .name attribute.
        agent2_file: Same, for agent 2.

    Returns:
        (comparison chart, performance delta, radar comparison, report text);
        placeholder figures and an error message when anything fails.
    """
    empty_fig = go.Figure()
    empty_fig.update_layout(title="No data available")
    try:
        if not agent1_file or not agent2_file:
            return empty_fig, empty_fig, empty_fig, "Please upload files for both agents."

        def load_agent_data(file):
            """Load one agent's records from a JSON or JSONL upload."""
            # BUG FIX: gr.File(type="filepath") passes a plain str path which
            # has no .name attribute; accept both str paths and file objects.
            path = file if isinstance(file, str) else file.name
            try:
                if path.endswith('.json'):
                    with open(path, 'r', encoding='utf-8') as f:
                        return json.load(f)
                elif path.endswith('.jsonl'):
                    data = []
                    with open(path, 'r', encoding='utf-8') as f:
                        for line in f:
                            if line.strip():
                                data.append(json.loads(line))
                    return data
                else:
                    raise ValueError("Unsupported file format")
            except Exception as e:
                raise ValueError(f"Error loading file {path}: {str(e)}")

        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        report_instance = get_report_generator()
        # Load data for both agents
        agent1_data = load_agent_data(agent1_file)
        agent2_data = load_agent_data(agent2_file)
        if not agent1_data or not agent2_data:
            return empty_fig, empty_fig, empty_fig, "One or both agent files contain no valid data."
        # Evaluate both agents with the full metric suite.
        agent1_results = eval_instance.evaluate_batch(agent1_data, mode="comprehensive")
        agent2_results = eval_instance.evaluate_batch(agent2_data, mode="comprehensive")
        if not agent1_results or not agent2_results:
            return empty_fig, empty_fig, empty_fig, "Failed to evaluate one or both agents."
        # Each visualization / report step is best-effort: log and fall back.
        try:
            comparison_chart = vis_instance.create_agent_comparison(agent1_results, agent2_results)
        except Exception as e:
            print(f"Comparison chart creation failed: {e}")
            comparison_chart = empty_fig
        try:
            performance_diff = vis_instance.create_performance_delta(agent1_results, agent2_results)
        except Exception as e:
            print(f"Performance difference chart creation failed: {e}")
            performance_diff = empty_fig
        try:
            statistical_analysis = vis_instance.create_radar_comparison(agent1_results, agent2_results)
        except Exception as e:
            print(f"Statistical analysis chart creation failed: {e}")
            statistical_analysis = empty_fig
        try:
            comparison_report = report_instance.generate_comparison_report(agent1_results, agent2_results)
        except Exception as e:
            print(f"Comparison report generation failed: {e}")
            comparison_report = f"Comparison report generation failed: {str(e)}"
        return comparison_chart, performance_diff, statistical_analysis, comparison_report
    except Exception as e:
        error_msg = f"Agent comparison failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        return empty_fig, empty_fig, empty_fig, error_msg
# --- Gradio Interface Setup ---
def create_gradio_interface():
    """Create and return the Gradio Blocks app.

    Layout: global CSS plus four tabs — single evaluation, batch evaluation,
    agent comparison, and help/documentation. All callbacks are wired here.

    ENCODING FIX: the emoji in labels/markdown were UTF-8 mojibake
    ("πŸ”", "πŸš€", "βš”οΈ", ...); restored to the intended characters,
    consistent with the intact 🎯/❓ elsewhere in the file.
    """
    with gr.Blocks(css=custom_css, title="AetherScore Evaluation Dashboard") as demo:
        gr.Markdown("""
# 🎯 AetherScore Evaluation Dashboard
Advanced AI response evaluation system with comprehensive metrics and visualizations.
""")
        with gr.Tabs():
            # Single Evaluation Tab
            with gr.TabItem("🔍 Single Evaluation"):
                with gr.Row():
                    # Left column: all inputs for one prompt/response pair.
                    with gr.Column(scale=1):
                        prompt_input = gr.Textbox(
                            label="Prompt",
                            placeholder="Enter the prompt/question here...",
                            lines=3
                        )
                        response_input = gr.Textbox(
                            label="AI Response",
                            placeholder="Enter the AI response to evaluate...",
                            lines=5
                        )
                        expected_input = gr.Textbox(
                            label="Expected Answer (Optional)",
                            placeholder="Enter expected answer for accuracy comparison...",
                            lines=2
                        )
                        with gr.Row():
                            agent_name_input = gr.Textbox(
                                label="Agent Name",
                                value="Agent-1",
                                scale=1
                            )
                            task_type_input = gr.Dropdown(
                                label="Task Type",
                                choices=["general", "reasoning", "creative", "factual"],
                                value="general",
                                scale=1
                            )
                        evaluate_btn = gr.Button("🔍 Evaluate", variant="primary")
                    # Right column: score JSON and textual explanation.
                    with gr.Column(scale=2):
                        scores_display = gr.JSON(label="📊 Evaluation Scores")
                        explanation_output = gr.Textbox(
                            label="💡 Detailed Explanation",
                            lines=4,
                            interactive=False
                        )
                with gr.Row():
                    spider_chart = gr.Plot(label="🕸️ Performance Spider Chart")
                    score_bars = gr.Plot(label="📊 Score Breakdown")
                evaluate_btn.click(
                    fn=process_single_evaluation,
                    inputs=[prompt_input, response_input, expected_input, agent_name_input, task_type_input],
                    outputs=[scores_display, spider_chart, score_bars, explanation_output]
                )
            # Batch Evaluation Tab
            with gr.TabItem("📁 Batch Evaluation"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # NOTE: type="filepath" delivers a str path to the callback.
                        file_input = gr.File(
                            label="Upload Evaluation Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                        eval_mode = gr.Dropdown(
                            label="Evaluation Mode",
                            choices=["comprehensive", "fast"],
                            value="comprehensive"
                        )
                        batch_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary")
                    with gr.Column(scale=2):
                        batch_report = gr.Textbox(
                            label="📋 Evaluation Report",
                            lines=8,
                            interactive=False
                        )
                with gr.Row():
                    heatmap_plot = gr.Plot(label="🔥 Performance Heatmap")
                    distribution_plot = gr.Plot(label="📈 Score Distribution")
                with gr.Row():
                    trends_plot = gr.Plot(label="📊 Performance Trends")
                    leaderboard_df = gr.Dataframe(label="🏆 Leaderboard")
                batch_btn.click(
                    fn=process_batch_evaluation,
                    inputs=[file_input, eval_mode],
                    outputs=[heatmap_plot, distribution_plot, trends_plot, batch_report, leaderboard_df]
                )
            # Agent Comparison Tab
            with gr.TabItem("⚔️ Agent Comparison"):
                with gr.Row():
                    with gr.Column():
                        agent1_file = gr.File(
                            label="Agent 1 Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                    with gr.Column():
                        agent2_file = gr.File(
                            label="Agent 2 Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                compare_btn = gr.Button("🔍 Compare Agents", variant="primary")
                with gr.Row():
                    comparison_report = gr.Textbox(
                        label="📊 Comparison Report",
                        lines=10,
                        interactive=False
                    )
                with gr.Row():
                    comparison_chart = gr.Plot(label="📊 Agent Comparison")
                    performance_diff = gr.Plot(label="📈 Performance Delta")
                with gr.Row():
                    radar_comparison = gr.Plot(label="🕸️ Radar Comparison")
                compare_btn.click(
                    fn=compare_agents,
                    inputs=[agent1_file, agent2_file],
                    outputs=[comparison_chart, performance_diff, radar_comparison, comparison_report]
                )
            # Help & Documentation Tab
            with gr.TabItem("❓ Help & Documentation"):
                gr.Markdown("""
## 📖 How to Use AetherScore
### Single Evaluation
1. Enter your prompt and AI response
2. Optionally provide an expected answer for accuracy comparison
3. Choose agent name and task type
4. Click "Evaluate" to get comprehensive scores
### Batch Evaluation
1. Upload a JSON/JSONL file with evaluation data
2. Each item should have: `prompt`, `response`, optional `expected_answer`, `agent_name`, `task_id`
3. Choose evaluation mode and start processing
4. View results in charts and leaderboard
### Agent Comparison
1. Upload evaluation data files for two different agents
2. Click "Compare Agents" to see detailed performance analysis
3. Review comparison charts and statistical analysis
### Evaluation Metrics
- **Instruction Following**: How well the response follows prompt constraints
- **Hallucination Score**: Detection of fabricated or unverified information
- **Assumption Control**: Management of uncertain or speculative content
- **Coherence**: Logical flow and consistency of the response
- **Accuracy**: Similarity to expected answer (when provided)
- **Overall Score**: Weighted combination of all metrics
### Data Format Example
```json
{
    "prompt": "Explain quantum computing",
    "response": "Quantum computing uses quantum bits...",
    "expected_answer": "Quantum computing leverages quantum mechanics...",
    "agent_name": "GPT-4",
    "task_id": "task_001",
    "task_type": "factual"
}
```
""")
    return demo
# Create and launch the application
if __name__ == "__main__":
    demo = create_gradio_interface()
    # share=True requests a public gradio.live tunnel; binding 0.0.0.0:7860
    # is the standard configuration for Hugging Face Spaces containers.
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )