Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,3 @@
|
|
| 1 |
-
|
| 2 |
from __future__ import annotations
|
| 3 |
|
| 4 |
import gradio as gr
|
|
@@ -236,6 +235,142 @@ def process_batch_evaluation(
|
|
| 236 |
error_msg = f"Batch evaluation failed: {str(e)}"
|
| 237 |
print(f"Error: {error_msg}")
|
| 238 |
print(traceback.format_exc())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
return empty_fig, empty_fig, empty_fig, error_msg
|
| 240 |
|
| 241 |
# --- Gradio Interface Setup ---
|
|
@@ -253,7 +388,7 @@ def create_gradio_interface():
|
|
| 253 |
|
| 254 |
with gr.Tabs():
|
| 255 |
# Single Evaluation Tab
|
| 256 |
-
with gr.TabItem("
|
| 257 |
with gr.Row():
|
| 258 |
with gr.Column(scale=1):
|
| 259 |
prompt_input = gr.Textbox(
|
|
@@ -431,140 +566,4 @@ if __name__ == "__main__":
|
|
| 431 |
server_name="0.0.0.0",
|
| 432 |
server_port=7860,
|
| 433 |
show_error=True
|
| 434 |
-
)
|
| 435 |
-
|
| 436 |
-
def create_leaderboard(results: List[Dict]) -> pd.DataFrame:
    """Build a ranked per-agent leaderboard from evaluation results.

    Per-agent score lists come from
    ``get_evaluator().get_agent_scores_from_results(results)``. Entries that
    are not real numbers, or that are NaN, are ignored; agents left with no
    usable score are omitted from the table.

    Args:
        results: Evaluation result records, passed unchanged to the evaluator.

    Returns:
        A DataFrame with the columns ``Rank``, ``Agent``, ``Avg Score``,
        ``Max Score``, ``Min Score``, ``Std Dev`` and ``Evaluations``, sorted
        by average score (highest first) and ranked from 1. The four score
        statistics are rendered as strings with three decimals. An empty
        frame with the same columns is returned when there are no results,
        no usable scores, or when any exception occurs (the exception is
        printed, not raised).
    """
    import math
    import numbers

    columns = ['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations']

    def empty_board() -> pd.DataFrame:
        # Single definition of the fallback frame used by every early exit.
        return pd.DataFrame(columns=columns)

    try:
        if not results:
            return empty_board()

        agent_scores = get_evaluator().get_agent_scores_from_results(results)
        if not agent_scores:
            return empty_board()

        rows = []
        for agent, scores in agent_scores.items():
            if not scores:  # Skip agents with no scores at all
                continue

            # numbers.Real also matches NumPy scalar types (np.float32,
            # np.int64, ...), which are not subclasses of the builtin
            # int/float and would otherwise be silently discarded.
            valid_scores = [
                s for s in scores
                if isinstance(s, numbers.Real) and not math.isnan(s)
            ]
            if not valid_scores:
                continue

            rows.append({
                'Rank': 0,  # placeholder; real rank is assigned after sorting
                'Agent': str(agent),
                'Avg Score': np.mean(valid_scores),
                'Max Score': np.max(valid_scores),
                'Min Score': np.min(valid_scores),
                'Std Dev': np.std(valid_scores) if len(valid_scores) > 1 else 0.0,
                'Evaluations': len(valid_scores),
            })

        if not rows:
            return empty_board()

        df = pd.DataFrame(rows, columns=columns)

        # Best average first, then number the rows 1..n in that order.
        df = df.sort_values('Avg Score', ascending=False)
        df['Rank'] = range(1, len(df) + 1)

        # Render the statistics as fixed-precision text for display.
        for col in ['Avg Score', 'Max Score', 'Min Score', 'Std Dev']:
            df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else "N/A")

        return df

    except Exception as e:
        # Best-effort: the UI gets an empty table rather than a crash.
        print(f"Leaderboard creation error: {e}")
        return empty_board()
|
| 488 |
-
|
| 489 |
-
def compare_agents(
    agent1_file,
    agent2_file,
) -> tuple[go.Figure, go.Figure, go.Figure, str]:
    """Evaluate two uploaded agent files and compare their results.

    Each file is loaded as JSON (``.json``) or JSON Lines (``.jsonl``),
    evaluated with ``evaluate_batch(..., mode="comprehensive")``, and the two
    result sets are passed to the visualizer and report generator.

    Args:
        agent1_file: Upload for the first agent. Either an object whose
            ``.name`` attribute is the file path, or a path given as
            ``str`` / ``os.PathLike``.
        agent2_file: Upload for the second agent, same forms as above.

    Returns:
        ``(comparison_chart, performance_diff, statistical_analysis, report)``.
        A chart that fails to build is replaced by a placeholder figure
        titled "No data available"; a report that fails to build is replaced
        by its error text. On missing uploads, empty data, empty evaluation
        results, or any other exception, all three figures are the
        placeholder and the fourth element is the explanatory message.
        This function does not raise.
    """
    import os

    empty_fig = go.Figure()
    empty_fig.update_layout(title="No data available")

    def failure(message):
        # Uniform "nothing to show" result carrying only a status message.
        return empty_fig, empty_fig, empty_fig, message

    def load_agent_data(file):
        # Accept both upload objects (path in ``.name``) and plain paths.
        # Path-like objects are checked first because ``pathlib.Path.name``
        # is only the final component, not an openable path.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name

        try:
            lowered = path.lower()  # extension match ignores case
            if lowered.endswith('.json'):
                with open(path, 'r', encoding='utf-8') as f:
                    return json.load(f)
            if lowered.endswith('.jsonl'):
                with open(path, 'r', encoding='utf-8') as f:
                    # One JSON document per non-blank line.
                    return [json.loads(line) for line in f if line.strip()]
            raise ValueError("Unsupported file format")
        except Exception as e:
            # Keep the original cause attached for the printed traceback.
            raise ValueError(f"Error loading file {path}: {str(e)}") from e

    try:
        if not agent1_file or not agent2_file:
            return failure("Please upload files for both agents.")

        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        report_instance = get_report_generator()

        # Load data for both agents
        agent1_data = load_agent_data(agent1_file)
        agent2_data = load_agent_data(agent2_file)
        if not agent1_data or not agent2_data:
            return failure("One or both agent files contain no valid data.")

        # Evaluate both agents
        agent1_results = eval_instance.evaluate_batch(agent1_data, mode="comprehensive")
        agent2_results = eval_instance.evaluate_batch(agent2_data, mode="comprehensive")
        if not agent1_results or not agent2_results:
            return failure("Failed to evaluate one or both agents.")

        def build_chart(label, method_name):
            # Each chart is independent: one failing must not hide the others.
            try:
                return getattr(vis_instance, method_name)(agent1_results, agent2_results)
            except Exception as e:
                print(f"{label} creation failed: {e}")
                return empty_fig

        comparison_chart = build_chart("Comparison chart", "create_agent_comparison")
        performance_diff = build_chart("Performance difference chart", "create_performance_delta")
        statistical_analysis = build_chart("Statistical analysis chart", "create_radar_comparison")

        # Generate comparison report; on failure show the error in its place.
        try:
            comparison_report = report_instance.generate_comparison_report(agent1_results, agent2_results)
        except Exception as e:
            print(f"Comparison report generation failed: {e}")
            comparison_report = f"Comparison report generation failed: {str(e)}"

        return comparison_chart, performance_diff, statistical_analysis, comparison_report

    except Exception as e:
        error_msg = f"Agent comparison failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        return failure(error_msg)
|
|
|
|
|
|
|
| 1 |
from __future__ import annotations
|
| 2 |
|
| 3 |
import gradio as gr
|
|
|
|
| 235 |
error_msg = f"Batch evaluation failed: {str(e)}"
|
| 236 |
print(f"Error: {error_msg}")
|
| 237 |
print(traceback.format_exc())
|
| 238 |
+
return empty_fig, empty_fig, empty_fig, error_msg, empty_df
|
| 239 |
+
|
| 240 |
+
def create_leaderboard(results: List[Dict]) -> pd.DataFrame:
    """Build a ranked per-agent leaderboard from evaluation results.

    Per-agent score lists come from
    ``get_evaluator().get_agent_scores_from_results(results)``. Entries that
    are not real numbers, or that are NaN, are ignored; agents left with no
    usable score are omitted from the table.

    Args:
        results: Evaluation result records, passed unchanged to the evaluator.

    Returns:
        A DataFrame with the columns ``Rank``, ``Agent``, ``Avg Score``,
        ``Max Score``, ``Min Score``, ``Std Dev`` and ``Evaluations``, sorted
        by average score (highest first) and ranked from 1. The four score
        statistics are rendered as strings with three decimals. An empty
        frame with the same columns is returned when there are no results,
        no usable scores, or when any exception occurs (the exception is
        printed, not raised).
    """
    import math
    import numbers

    columns = ['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations']

    def empty_board() -> pd.DataFrame:
        # Single definition of the fallback frame used by every early exit.
        return pd.DataFrame(columns=columns)

    try:
        if not results:
            return empty_board()

        agent_scores = get_evaluator().get_agent_scores_from_results(results)
        if not agent_scores:
            return empty_board()

        rows = []
        for agent, scores in agent_scores.items():
            if not scores:  # Skip agents with no scores at all
                continue

            # numbers.Real also matches NumPy scalar types (np.float32,
            # np.int64, ...), which are not subclasses of the builtin
            # int/float and would otherwise be silently discarded.
            valid_scores = [
                s for s in scores
                if isinstance(s, numbers.Real) and not math.isnan(s)
            ]
            if not valid_scores:
                continue

            rows.append({
                'Rank': 0,  # placeholder; real rank is assigned after sorting
                'Agent': str(agent),
                'Avg Score': np.mean(valid_scores),
                'Max Score': np.max(valid_scores),
                'Min Score': np.min(valid_scores),
                'Std Dev': np.std(valid_scores) if len(valid_scores) > 1 else 0.0,
                'Evaluations': len(valid_scores),
            })

        if not rows:
            return empty_board()

        df = pd.DataFrame(rows, columns=columns)

        # Best average first, then number the rows 1..n in that order.
        df = df.sort_values('Avg Score', ascending=False)
        df['Rank'] = range(1, len(df) + 1)

        # Render the statistics as fixed-precision text for display.
        for col in ['Avg Score', 'Max Score', 'Min Score', 'Std Dev']:
            df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else "N/A")

        return df

    except Exception as e:
        # Best-effort: the UI gets an empty table rather than a crash.
        print(f"Leaderboard creation error: {e}")
        return empty_board()
|
| 292 |
+
|
| 293 |
+
def compare_agents(
    agent1_file,
    agent2_file,
) -> tuple[go.Figure, go.Figure, go.Figure, str]:
    """Evaluate two uploaded agent files and compare their results.

    Each file is loaded as JSON (``.json``) or JSON Lines (``.jsonl``),
    evaluated with ``evaluate_batch(..., mode="comprehensive")``, and the two
    result sets are passed to the visualizer and report generator.

    Args:
        agent1_file: Upload for the first agent. Either an object whose
            ``.name`` attribute is the file path, or a path given as
            ``str`` / ``os.PathLike``.
        agent2_file: Upload for the second agent, same forms as above.

    Returns:
        ``(comparison_chart, performance_diff, statistical_analysis, report)``.
        A chart that fails to build is replaced by a placeholder figure
        titled "No data available"; a report that fails to build is replaced
        by its error text. On missing uploads, empty data, empty evaluation
        results, or any other exception, all three figures are the
        placeholder and the fourth element is the explanatory message.
        This function does not raise.
    """
    import os

    empty_fig = go.Figure()
    empty_fig.update_layout(title="No data available")

    def failure(message):
        # Uniform "nothing to show" result carrying only a status message.
        return empty_fig, empty_fig, empty_fig, message

    def load_agent_data(file):
        # Accept both upload objects (path in ``.name``) and plain paths.
        # Path-like objects are checked first because ``pathlib.Path.name``
        # is only the final component, not an openable path.
        if isinstance(file, (str, os.PathLike)):
            path = os.fspath(file)
        else:
            path = file.name

        try:
            lowered = path.lower()  # extension match ignores case
            if lowered.endswith('.json'):
                with open(path, 'r', encoding='utf-8') as f:
                    return json.load(f)
            if lowered.endswith('.jsonl'):
                with open(path, 'r', encoding='utf-8') as f:
                    # One JSON document per non-blank line.
                    return [json.loads(line) for line in f if line.strip()]
            raise ValueError("Unsupported file format")
        except Exception as e:
            # Keep the original cause attached for the printed traceback.
            raise ValueError(f"Error loading file {path}: {str(e)}") from e

    try:
        if not agent1_file or not agent2_file:
            return failure("Please upload files for both agents.")

        eval_instance = get_evaluator()
        vis_instance = get_visualizer()
        report_instance = get_report_generator()

        # Load data for both agents
        agent1_data = load_agent_data(agent1_file)
        agent2_data = load_agent_data(agent2_file)
        if not agent1_data or not agent2_data:
            return failure("One or both agent files contain no valid data.")

        # Evaluate both agents
        agent1_results = eval_instance.evaluate_batch(agent1_data, mode="comprehensive")
        agent2_results = eval_instance.evaluate_batch(agent2_data, mode="comprehensive")
        if not agent1_results or not agent2_results:
            return failure("Failed to evaluate one or both agents.")

        def build_chart(label, method_name):
            # Each chart is independent: one failing must not hide the others.
            try:
                return getattr(vis_instance, method_name)(agent1_results, agent2_results)
            except Exception as e:
                print(f"{label} creation failed: {e}")
                return empty_fig

        comparison_chart = build_chart("Comparison chart", "create_agent_comparison")
        performance_diff = build_chart("Performance difference chart", "create_performance_delta")
        statistical_analysis = build_chart("Statistical analysis chart", "create_radar_comparison")

        # Generate comparison report; on failure show the error in its place.
        try:
            comparison_report = report_instance.generate_comparison_report(agent1_results, agent2_results)
        except Exception as e:
            print(f"Comparison report generation failed: {e}")
            comparison_report = f"Comparison report generation failed: {str(e)}"

        return comparison_chart, performance_diff, statistical_analysis, comparison_report

    except Exception as e:
        error_msg = f"Agent comparison failed: {str(e)}"
        print(f"Error: {error_msg}")
        print(traceback.format_exc())
        return failure(error_msg)
|
| 375 |
|
| 376 |
# --- Gradio Interface Setup ---
|
|
|
|
| 388 |
|
| 389 |
with gr.Tabs():
|
| 390 |
# Single Evaluation Tab
|
| 391 |
+
with gr.TabItem("🔍 Single Evaluation"):
|
| 392 |
with gr.Row():
|
| 393 |
with gr.Column(scale=1):
|
| 394 |
prompt_input = gr.Textbox(
|
|
|
|
| 566 |
server_name="0.0.0.0",
|
| 567 |
server_port=7860,
|
| 568 |
show_error=True
|
| 569 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|