Update app.py
app.py
CHANGED
@@ -236,7 +236,202 @@ def process_batch_evaluation(
         error_msg = f"Batch evaluation failed: {str(e)}"
         print(f"Error: {error_msg}")
         print(traceback.format_exc())
-        return empty_fig, empty_fig, empty_fig, error_msg
+        return empty_fig, empty_fig, empty_fig, error_msg, empty_df
+
+# --- Gradio Interface Setup ---
+
+def create_gradio_interface():
+    """Create and return the Gradio interface"""
+
+    with gr.Blocks(css=custom_css, title="AetherScore Evaluation Dashboard") as demo:
+
+        gr.Markdown("""
+        # AetherScore Evaluation Dashboard
+
+        Advanced AI response evaluation system with comprehensive metrics and visualizations.
+        """)
+
+        with gr.Tabs():
+            # Single Evaluation Tab
+            with gr.TabItem("Single Evaluation"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        prompt_input = gr.Textbox(
+                            label="Prompt",
+                            placeholder="Enter the prompt/question here...",
+                            lines=3
+                        )
+                        response_input = gr.Textbox(
+                            label="AI Response",
+                            placeholder="Enter the AI response to evaluate...",
+                            lines=5
+                        )
+                        expected_input = gr.Textbox(
+                            label="Expected Answer (Optional)",
+                            placeholder="Enter expected answer for accuracy comparison...",
+                            lines=2
+                        )
+                        with gr.Row():
+                            agent_name_input = gr.Textbox(
+                                label="Agent Name",
+                                value="Agent-1",
+                                scale=1
+                            )
+                            task_type_input = gr.Dropdown(
+                                label="Task Type",
+                                choices=["general", "reasoning", "creative", "factual"],
+                                value="general",
+                                scale=1
+                            )
+                        evaluate_btn = gr.Button("Evaluate", variant="primary")
+
+                    with gr.Column(scale=2):
+                        scores_display = gr.JSON(label="Evaluation Scores")
+                        explanation_output = gr.Textbox(
+                            label="Detailed Explanation",
+                            lines=4,
+                            interactive=False
+                        )
+
+                with gr.Row():
+                    spider_chart = gr.Plot(label="Performance Spider Chart")
+                    score_bars = gr.Plot(label="Score Breakdown")
+
+                evaluate_btn.click(
+                    fn=process_single_evaluation,
+                    inputs=[prompt_input, response_input, expected_input, agent_name_input, task_type_input],
+                    outputs=[scores_display, spider_chart, score_bars, explanation_output]
+                )
+
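Note: the wiring above requires `process_single_evaluation` to return one value per entry in `outputs` — a score dict for the `gr.JSON` component, two plots, and an explanation string. The function itself is defined earlier in app.py and is not part of this diff; the following is only a minimal sketch of a compatible handler with placeholder scores.

```python
# Minimal sketch of a handler matching the click wiring above.
# The real process_single_evaluation in app.py is not shown in this diff;
# the scores and figures here are placeholders.
import plotly.graph_objects as go

def process_single_evaluation(prompt, response, expected, agent_name, task_type):
    scores = {"instruction_following": 0.8, "coherence": 0.9, "overall_score": 0.85}
    spider = go.Figure(go.Scatterpolar(
        r=list(scores.values()), theta=list(scores.keys()), fill="toself"))
    bars = go.Figure(go.Bar(x=list(scores.keys()), y=list(scores.values())))
    explanation = f"{agent_name} scored {scores['overall_score']:.2f} on a {task_type} task."
    return scores, spider, bars, explanation
```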
+            # Batch Evaluation Tab
+            with gr.TabItem("Batch Evaluation"):
+                with gr.Row():
+                    with gr.Column(scale=1):
+                        file_input = gr.File(
+                            label="Upload Evaluation Data",
+                            file_types=[".json", ".jsonl"],
+                            type="filepath"
+                        )
+                        eval_mode = gr.Dropdown(
+                            label="Evaluation Mode",
+                            choices=["comprehensive", "fast"],
+                            value="comprehensive"
+                        )
+                        batch_btn = gr.Button("Start Batch Evaluation", variant="primary")
+
+                    with gr.Column(scale=2):
+                        batch_report = gr.Textbox(
+                            label="Evaluation Report",
+                            lines=8,
+                            interactive=False
+                        )
+
+                with gr.Row():
+                    heatmap_plot = gr.Plot(label="Performance Heatmap")
+                    distribution_plot = gr.Plot(label="Score Distribution")
+
+                with gr.Row():
+                    trends_plot = gr.Plot(label="Performance Trends")
+                    leaderboard_df = gr.Dataframe(label="Leaderboard")
+
+                batch_btn.click(
+                    fn=process_batch_evaluation,
+                    inputs=[file_input, eval_mode],
+                    outputs=[heatmap_plot, distribution_plot, trends_plot, batch_report, leaderboard_df]
+                )
+
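Because `file_input` uses `type="filepath"`, `process_batch_evaluation` receives a path string rather than file contents. A plausible loader for the accepted `.json`/`.jsonl` formats (an illustration, not the parsing code actually used in app.py) is:

```python
import json
from typing import Dict, List

def load_eval_records(path: str) -> List[Dict]:
    """Read batch evaluation records from a .json or .jsonl file."""
    with open(path, "r", encoding="utf-8") as f:
        if path.endswith(".jsonl"):
            # JSON Lines: one record per line.
            return [json.loads(line) for line in f if line.strip()]
        data = json.load(f)
        # Accept either a list of records or a single record object.
        return data if isinstance(data, list) else [data]
```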
+            # Agent Comparison Tab
+            with gr.TabItem("Agent Comparison"):
+                with gr.Row():
+                    with gr.Column():
+                        agent1_file = gr.File(
+                            label="Agent 1 Data",
+                            file_types=[".json", ".jsonl"],
+                            type="filepath"
+                        )
+                    with gr.Column():
+                        agent2_file = gr.File(
+                            label="Agent 2 Data",
+                            file_types=[".json", ".jsonl"],
+                            type="filepath"
+                        )
+
+                compare_btn = gr.Button("Compare Agents", variant="primary")
+
+                with gr.Row():
+                    comparison_report = gr.Textbox(
+                        label="Comparison Report",
+                        lines=10,
+                        interactive=False
+                    )
+
+                with gr.Row():
+                    comparison_chart = gr.Plot(label="Agent Comparison")
+                    performance_diff = gr.Plot(label="Performance Delta")
+
+                with gr.Row():
+                    radar_comparison = gr.Plot(label="Radar Comparison")
+
+                compare_btn.click(
+                    fn=compare_agents,
+                    inputs=[agent1_file, agent2_file],
+                    outputs=[comparison_chart, performance_diff, radar_comparison, comparison_report]
+                )
+
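`compare_agents` receives two file paths and must return three plots plus a report string to match the `outputs` list above. One natural way to pair records from the two uploads, assuming both carry the `task_id` field described in the help tab, is a join on that key (a sketch, not the actual implementation):

```python
from typing import Dict, List, Tuple

def align_by_task_id(agent1_records: List[Dict], agent2_records: List[Dict]) -> List[Tuple[Dict, Dict]]:
    """Pair records from two agents that evaluated the same task_id."""
    by_id = {r["task_id"]: r for r in agent2_records if "task_id" in r}
    return [(r, by_id[r["task_id"]]) for r in agent1_records if r.get("task_id") in by_id]
```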
+            # Help & Documentation Tab
+            with gr.TabItem("Help & Documentation"):
+                gr.Markdown("""
+                ## How to Use AetherScore
+
+                ### Single Evaluation
+                1. Enter your prompt and AI response
+                2. Optionally provide an expected answer for accuracy comparison
+                3. Choose agent name and task type
+                4. Click "Evaluate" to get comprehensive scores
+
+                ### Batch Evaluation
+                1. Upload a JSON/JSONL file with evaluation data
+                2. Each item should have: `prompt`, `response`, optional `expected_answer`, `agent_name`, `task_id`
+                3. Choose evaluation mode and start processing
+                4. View results in charts and leaderboard
+
+                ### Agent Comparison
+                1. Upload evaluation data files for two different agents
+                2. Click "Compare Agents" to see detailed performance analysis
+                3. Review comparison charts and statistical analysis
+
+                ### Evaluation Metrics
+                - **Instruction Following**: How well the response follows prompt constraints
+                - **Hallucination Score**: Detection of fabricated or unverified information
+                - **Assumption Control**: Management of uncertain or speculative content
+                - **Coherence**: Logical flow and consistency of the response
+                - **Accuracy**: Similarity to expected answer (when provided)
+                - **Overall Score**: Weighted combination of all metrics
+
+                ### Data Format Example
+                ```json
+                {
+                    "prompt": "Explain quantum computing",
+                    "response": "Quantum computing uses quantum bits...",
+                    "expected_answer": "Quantum computing leverages quantum mechanics...",
+                    "agent_name": "GPT-4",
+                    "task_id": "task_001",
+                    "task_type": "factual"
+                }
+                ```
+                """)
+
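A batch or comparison upload can be sanity-checked against the fields documented above before it is submitted; the helper below is an assumed convenience, not part of app.py.

```python
# Assumed pre-upload check for the record schema documented above; not part of app.py.
REQUIRED_FIELDS = {"prompt", "response"}
OPTIONAL_FIELDS = {"expected_answer", "agent_name", "task_id", "task_type"}

def validate_record(record: dict) -> list:
    """Return a list of problems found in one evaluation record."""
    problems = [f"missing field: {name}" for name in REQUIRED_FIELDS if name not in record]
    problems += [f"unexpected field: {name}" for name in record
                 if name not in REQUIRED_FIELDS | OPTIONAL_FIELDS]
    return problems
```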
+    return demo
+
+# Create and launch the application
+if __name__ == "__main__":
+    demo = create_gradio_interface()
+    demo.launch(
+        share=True,
+        server_name="0.0.0.0",
+        server_port=7860,
+        show_error=True
+    )
 
 def create_leaderboard(results: List[Dict]) -> pd.DataFrame:
     """Create a leaderboard from evaluation results with robust error handling"""
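With the launch settings in this commit the app binds to all interfaces on port 7860, the port a hosted Space expects, and also requests a public Gradio share link. For purely local development the share tunnel could be skipped (a suggested variant, not part of this commit):

```python
if __name__ == "__main__":
    demo = create_gradio_interface()
    # Local-only variant: bind to localhost and skip the public share link.
    demo.launch(server_name="127.0.0.1", server_port=7860, share=False, show_error=True)
```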