aaditya-raj committed on
Commit
672a8ff
·
verified ·
1 Parent(s): 164484f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +138 -139
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  from __future__ import annotations
3
 
4
  import gradio as gr
@@ -236,6 +235,142 @@ def process_batch_evaluation(
236
  error_msg = f"Batch evaluation failed: {str(e)}"
237
  print(f"Error: {error_msg}")
238
  print(traceback.format_exc())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  return empty_fig, empty_fig, empty_fig, error_msg
240
 
241
  # --- Gradio Interface Setup ---
@@ -253,7 +388,7 @@ def create_gradio_interface():
253
 
254
  with gr.Tabs():
255
  # Single Evaluation Tab
256
- with gr.TabItem("📝 Single Evaluation"):
257
  with gr.Row():
258
  with gr.Column(scale=1):
259
  prompt_input = gr.Textbox(
@@ -431,140 +566,4 @@ if __name__ == "__main__":
431
  server_name="0.0.0.0",
432
  server_port=7860,
433
  show_error=True
434
- ), empty_df
435
-
436
- def create_leaderboard(results: List[Dict]) -> pd.DataFrame:
437
- """Create a leaderboard from evaluation results with robust error handling"""
438
- try:
439
- if not results:
440
- return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations'])
441
-
442
- eval_instance = get_evaluator()
443
- agent_scores = eval_instance.get_agent_scores_from_results(results)
444
-
445
- if not agent_scores:
446
- return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations'])
447
-
448
- leaderboard_data = []
449
- for agent, scores in agent_scores.items():
450
- if not scores: # Skip agents with no valid scores
451
- continue
452
-
453
- # Filter out invalid scores
454
- valid_scores = [s for s in scores if isinstance(s, (int, float)) and not np.isnan(s)]
455
-
456
- if not valid_scores:
457
- continue
458
-
459
- leaderboard_data.append({
460
- 'Rank': 0,
461
- 'Agent': str(agent),
462
- 'Avg Score': np.mean(valid_scores),
463
- 'Max Score': np.max(valid_scores),
464
- 'Min Score': np.min(valid_scores),
465
- 'Std Dev': np.std(valid_scores) if len(valid_scores) > 1 else 0.0,
466
- 'Evaluations': len(valid_scores)
467
- })
468
-
469
- if not leaderboard_data:
470
- return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations'])
471
-
472
- df = pd.DataFrame(leaderboard_data)
473
-
474
- # Sort by average score
475
- df = df.sort_values('Avg Score', ascending=False)
476
- df['Rank'] = range(1, len(df) + 1)
477
-
478
- # Format numeric columns
479
- for col in ['Avg Score', 'Max Score', 'Min Score', 'Std Dev']:
480
- if col in df.columns:
481
- df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else "N/A")
482
-
483
- return df
484
-
485
- except Exception as e:
486
- print(f"Leaderboard creation error: {e}")
487
- return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations'])
488
-
489
- def compare_agents(
490
- agent1_file,
491
- agent2_file,
492
- ) -> tuple[go.Figure, go.Figure, go.Figure, str]:
493
- """Compare two agents' performance with error handling"""
494
-
495
- empty_fig = go.Figure()
496
- empty_fig.update_layout(title="No data available")
497
-
498
- try:
499
- if not agent1_file or not agent2_file:
500
- return empty_fig, empty_fig, empty_fig, "Please upload files for both agents."
501
-
502
- def load_agent_data(file):
503
- try:
504
- if file.name.endswith('.json'):
505
- with open(file.name, 'r', encoding='utf-8') as f:
506
- return json.load(f)
507
- elif file.name.endswith('.jsonl'):
508
- data = []
509
- with open(file.name, 'r', encoding='utf-8') as f:
510
- for line in f:
511
- if line.strip():
512
- data.append(json.loads(line))
513
- return data
514
- else:
515
- raise ValueError("Unsupported file format")
516
- except Exception as e:
517
- raise ValueError(f"Error loading file {file.name}: {str(e)}")
518
-
519
- eval_instance = get_evaluator()
520
- vis_instance = get_visualizer()
521
- report_instance = get_report_generator()
522
-
523
- # Load data for both agents
524
- agent1_data = load_agent_data(agent1_file)
525
- agent2_data = load_agent_data(agent2_file)
526
-
527
- # Validate data
528
- if not agent1_data or not agent2_data:
529
- return empty_fig, empty_fig, empty_fig, "One or both agent files contain no valid data."
530
-
531
- # Evaluate both agents
532
- agent1_results = eval_instance.evaluate_batch(agent1_data, mode="comprehensive")
533
- agent2_results = eval_instance.evaluate_batch(agent2_data, mode="comprehensive")
534
-
535
- if not agent1_results or not agent2_results:
536
- return empty_fig, empty_fig, empty_fig, "Failed to evaluate one or both agents."
537
-
538
- # Generate comparison visualizations
539
- try:
540
- comparison_chart = vis_instance.create_agent_comparison(agent1_results, agent2_results)
541
- except Exception as e:
542
- print(f"Comparison chart creation failed: {e}")
543
- comparison_chart = empty_fig
544
-
545
- try:
546
- performance_diff = vis_instance.create_performance_delta(agent1_results, agent2_results)
547
- except Exception as e:
548
- print(f"Performance difference chart creation failed: {e}")
549
- performance_diff = empty_fig
550
-
551
- try:
552
- statistical_analysis = vis_instance.create_radar_comparison(agent1_results, agent2_results)
553
- except Exception as e:
554
- print(f"Statistical analysis chart creation failed: {e}")
555
- statistical_analysis = empty_fig
556
-
557
- # Generate comparison report
558
- try:
559
- comparison_report = report_instance.generate_comparison_report(agent1_results, agent2_results)
560
- except Exception as e:
561
- print(f"Comparison report generation failed: {e}")
562
- comparison_report = f"Comparison report generation failed: {str(e)}"
563
-
564
- return comparison_chart, performance_diff, statistical_analysis, comparison_report
565
-
566
- except Exception as e:
567
- error_msg = f"Agent comparison failed: {str(e)}"
568
- print(f"Error: {error_msg}")
569
- print(traceback.format_exc())
570
- return empty_fig, empty_fig, empty_fig, error_msg
 
 
1
  from __future__ import annotations
2
 
3
  import gradio as gr
 
235
  error_msg = f"Batch evaluation failed: {str(e)}"
236
  print(f"Error: {error_msg}")
237
  print(traceback.format_exc())
238
+ return empty_fig, empty_fig, empty_fig, error_msg, empty_df
239
+
240
+ def create_leaderboard(results: List[Dict]) -> pd.DataFrame:
241
+ """Create a leaderboard from evaluation results with robust error handling"""
242
+ try:
243
+ if not results:
244
+ return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations'])
245
+
246
+ eval_instance = get_evaluator()
247
+ agent_scores = eval_instance.get_agent_scores_from_results(results)
248
+
249
+ if not agent_scores:
250
+ return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations'])
251
+
252
+ leaderboard_data = []
253
+ for agent, scores in agent_scores.items():
254
+ if not scores: # Skip agents with no valid scores
255
+ continue
256
+
257
+ # Filter out invalid scores
258
+ valid_scores = [s for s in scores if isinstance(s, (int, float)) and not np.isnan(s)]
259
+
260
+ if not valid_scores:
261
+ continue
262
+
263
+ leaderboard_data.append({
264
+ 'Rank': 0,
265
+ 'Agent': str(agent),
266
+ 'Avg Score': np.mean(valid_scores),
267
+ 'Max Score': np.max(valid_scores),
268
+ 'Min Score': np.min(valid_scores),
269
+ 'Std Dev': np.std(valid_scores) if len(valid_scores) > 1 else 0.0,
270
+ 'Evaluations': len(valid_scores)
271
+ })
272
+
273
+ if not leaderboard_data:
274
+ return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations'])
275
+
276
+ df = pd.DataFrame(leaderboard_data)
277
+
278
+ # Sort by average score
279
+ df = df.sort_values('Avg Score', ascending=False)
280
+ df['Rank'] = range(1, len(df) + 1)
281
+
282
+ # Format numeric columns
283
+ for col in ['Avg Score', 'Max Score', 'Min Score', 'Std Dev']:
284
+ if col in df.columns:
285
+ df[col] = df[col].apply(lambda x: f"{x:.3f}" if pd.notna(x) else "N/A")
286
+
287
+ return df
288
+
289
+ except Exception as e:
290
+ print(f"Leaderboard creation error: {e}")
291
+ return pd.DataFrame(columns=['Rank', 'Agent', 'Avg Score', 'Max Score', 'Min Score', 'Std Dev', 'Evaluations'])
292
+
293
+ def compare_agents(
294
+ agent1_file,
295
+ agent2_file,
296
+ ) -> tuple[go.Figure, go.Figure, go.Figure, str]:
297
+ """Compare two agents' performance with error handling"""
298
+
299
+ empty_fig = go.Figure()
300
+ empty_fig.update_layout(title="No data available")
301
+
302
+ try:
303
+ if not agent1_file or not agent2_file:
304
+ return empty_fig, empty_fig, empty_fig, "Please upload files for both agents."
305
+
306
+ def load_agent_data(file):
307
+ try:
308
+ if file.name.endswith('.json'):
309
+ with open(file.name, 'r', encoding='utf-8') as f:
310
+ return json.load(f)
311
+ elif file.name.endswith('.jsonl'):
312
+ data = []
313
+ with open(file.name, 'r', encoding='utf-8') as f:
314
+ for line in f:
315
+ if line.strip():
316
+ data.append(json.loads(line))
317
+ return data
318
+ else:
319
+ raise ValueError("Unsupported file format")
320
+ except Exception as e:
321
+ raise ValueError(f"Error loading file {file.name}: {str(e)}")
322
+
323
+ eval_instance = get_evaluator()
324
+ vis_instance = get_visualizer()
325
+ report_instance = get_report_generator()
326
+
327
+ # Load data for both agents
328
+ agent1_data = load_agent_data(agent1_file)
329
+ agent2_data = load_agent_data(agent2_file)
330
+
331
+ # Validate data
332
+ if not agent1_data or not agent2_data:
333
+ return empty_fig, empty_fig, empty_fig, "One or both agent files contain no valid data."
334
+
335
+ # Evaluate both agents
336
+ agent1_results = eval_instance.evaluate_batch(agent1_data, mode="comprehensive")
337
+ agent2_results = eval_instance.evaluate_batch(agent2_data, mode="comprehensive")
338
+
339
+ if not agent1_results or not agent2_results:
340
+ return empty_fig, empty_fig, empty_fig, "Failed to evaluate one or both agents."
341
+
342
+ # Generate comparison visualizations
343
+ try:
344
+ comparison_chart = vis_instance.create_agent_comparison(agent1_results, agent2_results)
345
+ except Exception as e:
346
+ print(f"Comparison chart creation failed: {e}")
347
+ comparison_chart = empty_fig
348
+
349
+ try:
350
+ performance_diff = vis_instance.create_performance_delta(agent1_results, agent2_results)
351
+ except Exception as e:
352
+ print(f"Performance difference chart creation failed: {e}")
353
+ performance_diff = empty_fig
354
+
355
+ try:
356
+ statistical_analysis = vis_instance.create_radar_comparison(agent1_results, agent2_results)
357
+ except Exception as e:
358
+ print(f"Statistical analysis chart creation failed: {e}")
359
+ statistical_analysis = empty_fig
360
+
361
+ # Generate comparison report
362
+ try:
363
+ comparison_report = report_instance.generate_comparison_report(agent1_results, agent2_results)
364
+ except Exception as e:
365
+ print(f"Comparison report generation failed: {e}")
366
+ comparison_report = f"Comparison report generation failed: {str(e)}"
367
+
368
+ return comparison_chart, performance_diff, statistical_analysis, comparison_report
369
+
370
+ except Exception as e:
371
+ error_msg = f"Agent comparison failed: {str(e)}"
372
+ print(f"Error: {error_msg}")
373
+ print(traceback.format_exc())
374
  return empty_fig, empty_fig, empty_fig, error_msg
375
 
376
  # --- Gradio Interface Setup ---
 
388
 
389
  with gr.Tabs():
390
  # Single Evaluation Tab
391
+ with gr.TabItem("🔍 Single Evaluation"):
392
  with gr.Row():
393
  with gr.Column(scale=1):
394
  prompt_input = gr.Textbox(
 
566
  server_name="0.0.0.0",
567
  server_port=7860,
568
  show_error=True
569
+ )