aaditya-raj committed
Commit 164484f · verified · 1 Parent(s): 497d06a

Update app.py

Files changed (1): app.py +196 -1
app.py CHANGED
@@ -236,7 +236,202 @@ def process_batch_evaluation(
         error_msg = f"Batch evaluation failed: {str(e)}"
         print(f"Error: {error_msg}")
         print(traceback.format_exc())
-        return empty_fig, empty_fig, empty_fig, error_msg, empty_df
+        return empty_fig, empty_fig, empty_fig, error_msg
+
+# --- Gradio Interface Setup ---
+
+def create_gradio_interface():
+    """Create and return the Gradio interface"""
+
+    with gr.Blocks(css=custom_css, title="AetherScore Evaluation Dashboard") as demo:
+
+        gr.Markdown("""
+        # 🎯 AetherScore Evaluation Dashboard
+
+        Advanced AI response evaluation system with comprehensive metrics and visualizations.
+        """)
+
+        with gr.Tabs():
+            # Single Evaluation Tab
+            with gr.TabItem("📝 Single Evaluation"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        prompt_input = gr.Textbox(
+                            label="Prompt",
+                            placeholder="Enter the prompt/question here...",
+                            lines=3
+                        )
+                        response_input = gr.Textbox(
+                            label="AI Response",
+                            placeholder="Enter the AI response to evaluate...",
+                            lines=5
+                        )
+                        expected_input = gr.Textbox(
+                            label="Expected Answer (Optional)",
+                            placeholder="Enter expected answer for accuracy comparison...",
+                            lines=2
+                        )
+                        with gr.Row():
+                            agent_name_input = gr.Textbox(
+                                label="Agent Name",
+                                value="Agent-1",
+                                scale=1
+                            )
+                            task_type_input = gr.Dropdown(
+                                label="Task Type",
+                                choices=["general", "reasoning", "creative", "factual"],
+                                value="general",
+                                scale=1
+                            )
+                        evaluate_btn = gr.Button("🔍 Evaluate", variant="primary")
+
+                    with gr.Column(scale=2):
+                        scores_display = gr.JSON(label="📊 Evaluation Scores")
+                        explanation_output = gr.Textbox(
+                            label="💡 Detailed Explanation",
+                            lines=4,
+                            interactive=False
+                        )
+
+                with gr.Row():
+                    spider_chart = gr.Plot(label="🕸️ Performance Spider Chart")
+                    score_bars = gr.Plot(label="📊 Score Breakdown")
+
+                evaluate_btn.click(
+                    fn=process_single_evaluation,
+                    inputs=[prompt_input, response_input, expected_input, agent_name_input, task_type_input],
+                    outputs=[scores_display, spider_chart, score_bars, explanation_output]
+                )
+
+            # Batch Evaluation Tab
+            with gr.TabItem("📁 Batch Evaluation"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        file_input = gr.File(
+                            label="Upload Evaluation Data",
+                            file_types=[".json", ".jsonl"],
+                            type="filepath"
+                        )
+                        eval_mode = gr.Dropdown(
+                            label="Evaluation Mode",
+                            choices=["comprehensive", "fast"],
+                            value="comprehensive"
+                        )
+                        batch_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary")
+
+                    with gr.Column(scale=2):
+                        batch_report = gr.Textbox(
+                            label="📋 Evaluation Report",
+                            lines=8,
+                            interactive=False
+                        )
+
+                with gr.Row():
+                    heatmap_plot = gr.Plot(label="🔥 Performance Heatmap")
+                    distribution_plot = gr.Plot(label="📈 Score Distribution")
+
+                with gr.Row():
+                    trends_plot = gr.Plot(label="📊 Performance Trends")
+                    leaderboard_df = gr.Dataframe(label="🏆 Leaderboard")
+
+                batch_btn.click(
+                    fn=process_batch_evaluation,
+                    inputs=[file_input, eval_mode],
+                    outputs=[heatmap_plot, distribution_plot, trends_plot, batch_report, leaderboard_df]
+                )
+
+            # Agent Comparison Tab
+            with gr.TabItem("⚔️ Agent Comparison"):
+                with gr.Row():
+                    with gr.Column():
+                        agent1_file = gr.File(
+                            label="Agent 1 Data",
+                            file_types=[".json", ".jsonl"],
+                            type="filepath"
+                        )
+                    with gr.Column():
+                        agent2_file = gr.File(
+                            label="Agent 2 Data",
+                            file_types=[".json", ".jsonl"],
+                            type="filepath"
+                        )
+
+                compare_btn = gr.Button("🔍 Compare Agents", variant="primary")
+
+                with gr.Row():
+                    comparison_report = gr.Textbox(
+                        label="📊 Comparison Report",
+                        lines=10,
+                        interactive=False
+                    )
+
+                with gr.Row():
+                    comparison_chart = gr.Plot(label="📊 Agent Comparison")
+                    performance_diff = gr.Plot(label="📈 Performance Delta")
+
+                with gr.Row():
+                    radar_comparison = gr.Plot(label="🕸️ Radar Comparison")
+
+                compare_btn.click(
+                    fn=compare_agents,
+                    inputs=[agent1_file, agent2_file],
+                    outputs=[comparison_chart, performance_diff, radar_comparison, comparison_report]
+                )
+
+            # Help & Documentation Tab
+            with gr.TabItem("❓ Help & Documentation"):
+                gr.Markdown("""
+                ## 📖 How to Use AetherScore
+
+                ### Single Evaluation
+                1. Enter your prompt and AI response
+                2. Optionally provide an expected answer for accuracy comparison
+                3. Choose agent name and task type
+                4. Click "Evaluate" to get comprehensive scores
+
+                ### Batch Evaluation
+                1. Upload a JSON/JSONL file with evaluation data
+                2. Each item should have: `prompt`, `response`, optional `expected_answer`, `agent_name`, `task_id`
+                3. Choose evaluation mode and start processing
+                4. View results in charts and leaderboard
+
+                ### Agent Comparison
+                1. Upload evaluation data files for two different agents
+                2. Click "Compare Agents" to see detailed performance analysis
+                3. Review comparison charts and statistical analysis
+
+                ### Evaluation Metrics
+                - **Instruction Following**: How well the response follows prompt constraints
+                - **Hallucination Score**: Detection of fabricated or unverified information
+                - **Assumption Control**: Management of uncertain or speculative content
+                - **Coherence**: Logical flow and consistency of the response
+                - **Accuracy**: Similarity to expected answer (when provided)
+                - **Overall Score**: Weighted combination of all metrics
+
+                ### Data Format Example
+                ```json
+                {
+                    "prompt": "Explain quantum computing",
+                    "response": "Quantum computing uses quantum bits...",
+                    "expected_answer": "Quantum computing leverages quantum mechanics...",
+                    "agent_name": "GPT-4",
+                    "task_id": "task_001",
+                    "task_type": "factual"
+                }
+                ```
+                """)
+
+    return demo
+
+# Create and launch the application
+if __name__ == "__main__":
+    demo = create_gradio_interface()
+    demo.launch(
+        share=True,
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    ), empty_df
 
 def create_leaderboard(results: List[Dict]) -> pd.DataFrame:
     """Create a leaderboard from evaluation results with robust error handling"""