# e6test / app.py — Hugging Face Space application file.
# (Scraped page header: author aaditya-raj, commit "Update app.py",
# revision 672a8ff, verified.)
from __future__ import annotations
import gradio as gr
import pandas as pd
import numpy as np
import json
import plotly.graph_objects as go
from typing import Dict, List, Optional
import gc
import traceback
# Import evaluation modules
from evaluator_module import AetherScoreEvaluator
from visualizer_module import EvaluationVisualizer
from report_generator import ReportGenerator
# --- Global Components & Storage ---
# Lazily-constructed singletons; populated on first use by get_evaluator(),
# get_visualizer() and get_report_generator() so importing this module stays cheap.
evaluator: Optional[AetherScoreEvaluator] = None
visualizer: Optional[EvaluationVisualizer] = None
report_gen: Optional[ReportGenerator] = None
# In-memory storage for explainability feature
# NOTE(review): never read or written anywhere in this file — presumably
# reserved for a planned explainability feature; confirm before removing.
evaluation_storage: Dict[str, Dict] = {}
def get_evaluator():
    """Return the process-wide AetherScoreEvaluator, building it on first call."""
    global evaluator
    if evaluator is not None:
        return evaluator
    evaluator = AetherScoreEvaluator()
    return evaluator
def get_visualizer():
    """Return the shared EvaluationVisualizer, creating it lazily."""
    global visualizer
    if visualizer is not None:
        return visualizer
    visualizer = EvaluationVisualizer()
    return visualizer
def get_report_generator():
    """Return the shared ReportGenerator, creating it lazily."""
    global report_gen
    if report_gen is not None:
        return report_gen
    report_gen = ReportGenerator()
    return report_gen
# CSS for better styling
custom_css = """
.gradio-container {
font-family: 'Inter', sans-serif;
}
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 20px;
border-radius: 10px;
color: white;
margin: 10px 0;
}
"""
def process_single_evaluation(
    prompt: str,
    response: str,
    expected_answer: Optional[str] = None,
    agent_name: str = "Agent-1",
    task_type: str = "general"
) -> tuple[Dict, go.Figure, go.Figure, str]:
    """Evaluate one prompt/response pair and build its visualizations.

    Args:
        prompt: The question/instruction the agent was given.
        response: The agent's answer to evaluate.
        expected_answer: Optional reference answer for accuracy scoring;
            None and whitespace-only strings are treated as "not provided".
        agent_name: Label used in the chart titles.
        task_type: Task category forwarded to the evaluator.

    Returns:
        (scores-for-display dict, spider chart, bar chart, explanation text).
        On any failure the figures are empty placeholders and the last
        element carries the error/help message instead.
    """
    def placeholder(title: str) -> go.Figure:
        # Empty figure shown whenever there is nothing meaningful to plot.
        fig = go.Figure()
        fig.update_layout(title=title)
        return fig

    try:
        # Input validation: both fields are mandatory.
        if not prompt or not response:
            fig = placeholder("No data to display")
            return {}, fig, fig, "Please provide both prompt and response."
        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        # BUG FIX: expected_answer defaults to None, and None.strip() raised
        # AttributeError; treat None and blank strings both as "absent".
        has_expected = bool(expected_answer and expected_answer.strip())
        eval_result = eval_instance.evaluate_single(
            prompt=prompt,
            response=response,
            expected_answer=expected_answer if has_expected else None,
            task_type=task_type
        )
        scores = eval_result.get("scores", {})
        # An all-zero overall score is treated as an evaluator failure.
        if not scores or scores.get('overall_score', 0) == 0:
            fig = placeholder("Evaluation failed")
            return {}, fig, fig, "Evaluation failed. Please check your inputs."
        # Visualizations are best-effort: fall back to a placeholder on error.
        try:
            spider_chart = vis_instance.create_spider_chart(scores, agent_name)
            score_bars = vis_instance.create_score_bars(scores, agent_name)
        except Exception as viz_error:
            print(f"Visualization error: {viz_error}")
            spider_chart = score_bars = placeholder("Visualization failed")
        # Explanation is also best-effort.
        try:
            explanation = eval_instance.generate_explanation(scores)
        except Exception as exp_error:
            explanation = f"Explanation generation failed: {str(exp_error)}"
        # Round floats to 3 decimals for the gr.JSON display component.
        scores_display = {
            k: f"{v:.3f}" if isinstance(v, float) else str(v)
            for k, v in scores.items()
        }
        return scores_display, spider_chart, score_bars, explanation
    except Exception as e:
        error_msg = f"Single evaluation failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        fig = placeholder("Error occurred")
        return {}, fig, fig, error_msg
def process_batch_evaluation(
    file_input,
    evaluation_mode: str = "comprehensive"
) -> tuple[go.Figure, go.Figure, go.Figure, str, pd.DataFrame]:
    """Evaluate every record in an uploaded JSON/JSONL file.

    Args:
        file_input: Upload from gr.File — either a str path (type="filepath")
            or a file-like object exposing a .name attribute.
        evaluation_mode: "comprehensive" or "fast", forwarded to the evaluator.

    Returns:
        (heatmap, distribution, trends, report text, leaderboard DataFrame);
        placeholder figures / empty frame plus an error message on failure.
    """
    empty_fig = go.Figure()
    empty_fig.update_layout(title="No data available")
    empty_df = pd.DataFrame()
    try:
        if file_input is None:
            return empty_fig, empty_fig, empty_fig, "Please upload a file.", empty_df
        # BUG FIX: the UI declares gr.File(type="filepath"), which delivers a
        # plain str path with no .name attribute; accept both str paths and
        # file-like objects.
        file_path = file_input if isinstance(file_input, str) else file_input.name
        # Load data with error handling
        try:
            if file_path.endswith('.json'):
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
            elif file_path.endswith('.jsonl'):
                # JSONL: one record per line; bad lines are logged and skipped.
                data = []
                with open(file_path, 'r', encoding='utf-8') as f:
                    for line_num, line in enumerate(f, 1):
                        try:
                            if line.strip():
                                data.append(json.loads(line))
                        except json.JSONDecodeError as jde:
                            print(f"JSON error on line {line_num}: {jde}")
                            continue
            else:
                return empty_fig, empty_fig, empty_fig, "Unsupported file format. Please upload JSON or JSONL.", empty_df
        except Exception as file_error:
            return empty_fig, empty_fig, empty_fig, f"File reading error: {str(file_error)}", empty_df
        if not data:
            return empty_fig, empty_fig, empty_fig, "No valid data found in file.", empty_df
        # Validate data structure: every record must be a dict carrying at
        # least 'prompt' and 'response'; anything else is logged and dropped.
        required_fields = ['prompt', 'response']
        valid_data = []
        for i, item in enumerate(data):
            if not isinstance(item, dict):
                print(f"Item {i} is not a dictionary, skipping")
                continue
            if all(field in item for field in required_fields):
                valid_data.append(item)
            else:
                print(f"Item {i} missing required fields, skipping")
        if not valid_data:
            return empty_fig, empty_fig, empty_fig, "No valid items found. Each item must have 'prompt' and 'response' fields.", empty_df
        print(f"Processing {len(valid_data)} valid items...")
        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        report_instance = get_report_generator()
        results = eval_instance.evaluate_batch(valid_data, mode=evaluation_mode)
        if not results:
            return empty_fig, empty_fig, empty_fig, "Batch evaluation produced no results.", empty_df
        print(f"Successfully evaluated {len(results)} items")

        def best_effort(label, producer, fallback):
            # Post-processing steps are optional: log the failure and fall
            # back instead of discarding the whole batch result.
            try:
                return producer()
            except Exception as step_error:
                print(f"{label} failed: {step_error}")
                return fallback

        heatmap = best_effort("Heatmap creation",
                              lambda: vis_instance.create_evaluation_heatmap(results), empty_fig)
        distribution = best_effort("Distribution creation",
                                   lambda: vis_instance.create_score_distribution(results), empty_fig)
        trends = best_effort("Trends creation",
                             lambda: vis_instance.create_performance_trends(results), empty_fig)
        # The report fallback carries the error text into the UI, so it is
        # handled explicitly rather than through best_effort().
        try:
            report = report_instance.generate_batch_report(results)
        except Exception as e:
            print(f"Report generation failed: {e}")
            report = f"Report generation failed: {str(e)}"
        leaderboard = best_effort("Leaderboard creation",
                                  lambda: create_leaderboard(results), empty_df)
        # Free evaluator intermediates before handing large figures back.
        gc.collect()
        return heatmap, distribution, trends, report, leaderboard
    except Exception as e:
        error_msg = f"Batch evaluation failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        return empty_fig, empty_fig, empty_fig, error_msg, empty_df
def create_leaderboard(results: List[Dict]) -> pd.DataFrame:
    """Build a per-agent leaderboard DataFrame from evaluation results.

    Args:
        results: Per-item evaluation result dicts, aggregated per agent via
            the evaluator's get_agent_scores_from_results().

    Returns:
        DataFrame with columns Rank/Agent/Avg Score/Max Score/Min Score/
        Std Dev/Evaluations, sorted by average score; numeric columns are
        pre-formatted as 3-decimal strings for display. On any failure or
        when there is no usable data, an empty frame with the same columns.
    """
    # Single source of truth for the schema (was repeated four times).
    columns = ['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations']
    try:
        if not results:
            return pd.DataFrame(columns=columns)
        eval_instance = get_evaluator()
        agent_scores = eval_instance.get_agent_scores_from_results(results)
        if not agent_scores:
            return pd.DataFrame(columns=columns)
        rows = []
        for agent, scores in agent_scores.items():
            # Keep only real numbers; drop NaNs and non-numeric entries.
            valid_scores = [s for s in scores if isinstance(s, (int, float)) and not np.isnan(s)]
            if not valid_scores:
                continue
            rows.append({
                'Rank': 0,  # placeholder; assigned after sorting below
                'Agent': str(agent),
                'Avg Score': np.mean(valid_scores),
                'Max Score': np.max(valid_scores),
                'Min Score': np.min(valid_scores),
                # std of a single sample is defined as 0 here
                'Std Dev': np.std(valid_scores) if len(valid_scores) > 1 else 0.0,
                'Evaluations': len(valid_scores)
            })
        if not rows:
            return pd.DataFrame(columns=columns)
        df = pd.DataFrame(rows)
        # Best average first; rank follows the sorted order.
        df = df.sort_values('Avg Score', ascending=False)
        df['Rank'] = range(1, len(df) + 1)
        # Format numeric columns as fixed-precision strings for display.
        for col in ['Avg Score', 'Max Score', 'Min Score', 'Std Dev']:
            if col in df.columns:
                df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else "N/A")
        return df
    except Exception as e:
        print(f"Leaderboard creation error: {e}")
        return pd.DataFrame(columns=columns)
def compare_agents(
    agent1_file,
    agent2_file,
) -> tuple[go.Figure, go.Figure, go.Figure, str]:
    """Evaluate two uploaded datasets and produce side-by-side comparisons.

    Args:
        agent1_file: gr.File upload for agent 1 — a str path (type="filepath")
            or a file-like object with a .name attribute.
        agent2_file: Same, for agent 2.

    Returns:
        (comparison chart, performance delta, radar comparison, report text);
        placeholder figures and an error message when anything fails.
    """
    empty_fig = go.Figure()
    empty_fig.update_layout(title="No data available")
    try:
        if not agent1_file or not agent2_file:
            return empty_fig, empty_fig, empty_fig, "Please upload files for both agents."

        def load_agent_data(file):
            """Load one agent's records from a JSON or JSONL upload."""
            # BUG FIX: gr.File(type="filepath") passes a plain str path which
            # has no .name attribute; accept both str paths and file objects.
            path = file if isinstance(file, str) else file.name
            try:
                if path.endswith('.json'):
                    with open(path, 'r', encoding='utf-8') as f:
                        return json.load(f)
                elif path.endswith('.jsonl'):
                    data = []
                    with open(path, 'r', encoding='utf-8') as f:
                        for line in f:
                            if line.strip():
                                data.append(json.loads(line))
                    return data
                else:
                    raise ValueError("Unsupported file format")
            except Exception as e:
                raise ValueError(f"Error loading file {path}: {str(e)}")

        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        report_instance = get_report_generator()
        # Load data for both agents
        agent1_data = load_agent_data(agent1_file)
        agent2_data = load_agent_data(agent2_file)
        if not agent1_data or not agent2_data:
            return empty_fig, empty_fig, empty_fig, "One or both agent files contain no valid data."
        # Evaluate both agents with the full metric suite.
        agent1_results = eval_instance.evaluate_batch(agent1_data, mode="comprehensive")
        agent2_results = eval_instance.evaluate_batch(agent2_data, mode="comprehensive")
        if not agent1_results or not agent2_results:
            return empty_fig, empty_fig, empty_fig, "Failed to evaluate one or both agents."
        # Each visualization / report step is best-effort: log and fall back.
        try:
            comparison_chart = vis_instance.create_agent_comparison(agent1_results, agent2_results)
        except Exception as e:
            print(f"Comparison chart creation failed: {e}")
            comparison_chart = empty_fig
        try:
            performance_diff = vis_instance.create_performance_delta(agent1_results, agent2_results)
        except Exception as e:
            print(f"Performance difference chart creation failed: {e}")
            performance_diff = empty_fig
        try:
            statistical_analysis = vis_instance.create_radar_comparison(agent1_results, agent2_results)
        except Exception as e:
            print(f"Statistical analysis chart creation failed: {e}")
            statistical_analysis = empty_fig
        try:
            comparison_report = report_instance.generate_comparison_report(agent1_results, agent2_results)
        except Exception as e:
            print(f"Comparison report generation failed: {e}")
            comparison_report = f"Comparison report generation failed: {str(e)}"
        return comparison_chart, performance_diff, statistical_analysis, comparison_report
    except Exception as e:
        error_msg = f"Agent comparison failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        return empty_fig, empty_fig, empty_fig, error_msg
# --- Gradio Interface Setup ---
def create_gradio_interface():
    """Create and return the Gradio Blocks app.

    Layout: global CSS plus four tabs — single evaluation, batch evaluation,
    agent comparison, and help/documentation. All callbacks are wired here.

    ENCODING FIX: the emoji in labels/markdown were UTF-8 mojibake
    ("πŸ”", "πŸš€", "βš”οΈ", ...); restored to the intended characters,
    consistent with the intact 🎯/❓ elsewhere in the file.
    """
    with gr.Blocks(css=custom_css, title="AetherScore Evaluation Dashboard") as demo:
        gr.Markdown("""
# 🎯 AetherScore Evaluation Dashboard
Advanced AI response evaluation system with comprehensive metrics and visualizations.
""")
        with gr.Tabs():
            # Single Evaluation Tab
            with gr.TabItem("🔍 Single Evaluation"):
                with gr.Row():
                    # Left column: all inputs for one prompt/response pair.
                    with gr.Column(scale=1):
                        prompt_input = gr.Textbox(
                            label="Prompt",
                            placeholder="Enter the prompt/question here...",
                            lines=3
                        )
                        response_input = gr.Textbox(
                            label="AI Response",
                            placeholder="Enter the AI response to evaluate...",
                            lines=5
                        )
                        expected_input = gr.Textbox(
                            label="Expected Answer (Optional)",
                            placeholder="Enter expected answer for accuracy comparison...",
                            lines=2
                        )
                        with gr.Row():
                            agent_name_input = gr.Textbox(
                                label="Agent Name",
                                value="Agent-1",
                                scale=1
                            )
                            task_type_input = gr.Dropdown(
                                label="Task Type",
                                choices=["general", "reasoning", "creative", "factual"],
                                value="general",
                                scale=1
                            )
                        evaluate_btn = gr.Button("🔍 Evaluate", variant="primary")
                    # Right column: score JSON and textual explanation.
                    with gr.Column(scale=2):
                        scores_display = gr.JSON(label="📊 Evaluation Scores")
                        explanation_output = gr.Textbox(
                            label="💡 Detailed Explanation",
                            lines=4,
                            interactive=False
                        )
                with gr.Row():
                    spider_chart = gr.Plot(label="🕸️ Performance Spider Chart")
                    score_bars = gr.Plot(label="📊 Score Breakdown")
                evaluate_btn.click(
                    fn=process_single_evaluation,
                    inputs=[prompt_input, response_input, expected_input, agent_name_input, task_type_input],
                    outputs=[scores_display, spider_chart, score_bars, explanation_output]
                )
            # Batch Evaluation Tab
            with gr.TabItem("📁 Batch Evaluation"):
                with gr.Row():
                    with gr.Column(scale=1):
                        # NOTE: type="filepath" delivers a str path to the callback.
                        file_input = gr.File(
                            label="Upload Evaluation Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                        eval_mode = gr.Dropdown(
                            label="Evaluation Mode",
                            choices=["comprehensive", "fast"],
                            value="comprehensive"
                        )
                        batch_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary")
                    with gr.Column(scale=2):
                        batch_report = gr.Textbox(
                            label="📋 Evaluation Report",
                            lines=8,
                            interactive=False
                        )
                with gr.Row():
                    heatmap_plot = gr.Plot(label="🔥 Performance Heatmap")
                    distribution_plot = gr.Plot(label="📈 Score Distribution")
                with gr.Row():
                    trends_plot = gr.Plot(label="📊 Performance Trends")
                    leaderboard_df = gr.Dataframe(label="🏆 Leaderboard")
                batch_btn.click(
                    fn=process_batch_evaluation,
                    inputs=[file_input, eval_mode],
                    outputs=[heatmap_plot, distribution_plot, trends_plot, batch_report, leaderboard_df]
                )
            # Agent Comparison Tab
            with gr.TabItem("⚔️ Agent Comparison"):
                with gr.Row():
                    with gr.Column():
                        agent1_file = gr.File(
                            label="Agent 1 Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                    with gr.Column():
                        agent2_file = gr.File(
                            label="Agent 2 Data",
                            file_types=[".json", ".jsonl"],
                            type="filepath"
                        )
                compare_btn = gr.Button("🔍 Compare Agents", variant="primary")
                with gr.Row():
                    comparison_report = gr.Textbox(
                        label="📊 Comparison Report",
                        lines=10,
                        interactive=False
                    )
                with gr.Row():
                    comparison_chart = gr.Plot(label="📊 Agent Comparison")
                    performance_diff = gr.Plot(label="📈 Performance Delta")
                with gr.Row():
                    radar_comparison = gr.Plot(label="🕸️ Radar Comparison")
                compare_btn.click(
                    fn=compare_agents,
                    inputs=[agent1_file, agent2_file],
                    outputs=[comparison_chart, performance_diff, radar_comparison, comparison_report]
                )
            # Help & Documentation Tab
            with gr.TabItem("❓ Help & Documentation"):
                gr.Markdown("""
## 📖 How to Use AetherScore
### Single Evaluation
1. Enter your prompt and AI response
2. Optionally provide an expected answer for accuracy comparison
3. Choose agent name and task type
4. Click "Evaluate" to get comprehensive scores
### Batch Evaluation
1. Upload a JSON/JSONL file with evaluation data
2. Each item should have: `prompt`, `response`, optional `expected_answer`, `agent_name`, `task_id`
3. Choose evaluation mode and start processing
4. View results in charts and leaderboard
### Agent Comparison
1. Upload evaluation data files for two different agents
2. Click "Compare Agents" to see detailed performance analysis
3. Review comparison charts and statistical analysis
### Evaluation Metrics
- **Instruction Following**: How well the response follows prompt constraints
- **Hallucination Score**: Detection of fabricated or unverified information
- **Assumption Control**: Management of uncertain or speculative content
- **Coherence**: Logical flow and consistency of the response
- **Accuracy**: Similarity to expected answer (when provided)
- **Overall Score**: Weighted combination of all metrics
### Data Format Example
```json
{
    "prompt": "Explain quantum computing",
    "response": "Quantum computing uses quantum bits...",
    "expected_answer": "Quantum computing leverages quantum mechanics...",
    "agent_name": "GPT-4",
    "task_id": "task_001",
    "task_type": "factual"
}
```
""")
    return demo
# Create and launch the application
if __name__ == "__main__":
    demo = create_gradio_interface()
    # share=True requests a public gradio.live tunnel; binding 0.0.0.0:7860
    # is the standard configuration for Hugging Face Spaces containers.
    demo.launch(
        share=True,
        server_name="0.0.0.0",
        server_port=7860,
        show_error=True
    )