Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import json | |
| import os | |
| from datetime import datetime | |
| from agent import GAIAAgent | |
| from evaluate import evaluate_agent, create_sample_dataset | |
| import traceback | |
| def run_evaluation(): | |
| """Run the GAIA evaluation and return results.""" | |
| try: | |
| print("Starting GAIA Agent Evaluation...") | |
| print("=" * 50) | |
| # Initialize agent | |
| agent = GAIAAgent() | |
| # Test API connection first | |
| print("Testing xAI API connection...") | |
| test_response = agent.test_grok() | |
| print(f"API Test Response: {test_response}") | |
| # Run evaluation on sample dataset (since we don't have the full GAIA dataset) | |
| print("\nRunning evaluation on sample tasks...") | |
| score = evaluate_agent(dataset_path=None, max_tasks=10) | |
| # Read submission file if it exists | |
| submission_content = "" | |
| if os.path.exists("submission.jsonl"): | |
| with open("submission.jsonl", "r") as f: | |
| submission_content = f.read() | |
| # Format results | |
| timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S") | |
| results = f""" | |
| # GAIA Agent Evaluation Results | |
| **Timestamp:** {timestamp} | |
| **Final Score:** {score:.2f}% | |
| **Certificate Status:** {'β ACHIEVED (β₯30%)' if score >= 30 else 'β NOT ACHIEVED (<30%)'} | |
| ## API Connection Status | |
| {test_response} | |
| ## Submission File Preview | |
| ```json | |
| {submission_content[:500]}{'...' if len(submission_content) > 500 else ''} | |
| ``` | |
| ## Next Steps | |
| {'π Congratulations! You can now claim your Certificate of Excellence!' if score >= 30 else 'πͺ Keep improving your agent to reach the 30% threshold.'} | |
| """ | |
| return results, score | |
| except Exception as e: | |
| error_msg = f""" | |
| # Evaluation Error | |
| **Error:** {str(e)} | |
| **Traceback:** | |
| ``` | |
| {traceback.format_exc()} | |
| ``` | |
| Please check the logs and fix any issues before retrying. | |
| """ | |
| return error_msg, 0.0 | |
| def create_interface(): | |
| """Create the Gradio interface.""" | |
| with gr.Blocks(title="GAIA Agent Evaluation", theme=gr.themes.Soft()) as demo: | |
| gr.Markdown(""" | |
| # π€ GAIA Agent Evaluation | |
| This is your GAIA benchmark agent for the Hugging Face Agents Course Certificate of Excellence. | |
| **Goal:** Achieve β₯30% score on GAIA benchmark tasks | |
| Click the button below to run the evaluation and submit your answers. | |
| β οΈ **Note:** This may take several minutes to complete. Please be patient. | |
| """) | |
| with gr.Row(): | |
| run_btn = gr.Button("π Run Evaluation & Submit All Answers", variant="primary", size="lg") | |
| with gr.Row(): | |
| with gr.Column(): | |
| gr.Markdown("## Run Status / Submission Result") | |
| results_output = gr.Markdown("Click the button above to start evaluation...") | |
| with gr.Column(): | |
| gr.Markdown("## Score") | |
| score_output = gr.Number(label="Final Score (%)", value=0.0, interactive=False) | |
| # Event handler | |
| run_btn.click( | |
| fn=run_evaluation, | |
| inputs=[], | |
| outputs=[results_output, score_output], | |
| show_progress=True | |
| ) | |
| gr.Markdown(""" | |
| --- | |
| ## About This Agent | |
| - **API:** xAI Grok for reasoning | |
| - **Tools:** Web search, file handling, math calculations | |
| - **Fallbacks:** Local knowledge for common questions | |
| - **Target:** 30% accuracy for certificate eligibility | |
| ## Troubleshooting | |
| If you encounter issues: | |
| 1. Check the container logs in the "Logs" tab | |
| 2. Verify API credentials and internet connectivity | |
| 3. Ensure all dependencies are installed | |
| **Good luck! π** | |
| """) | |
| return demo | |
| if __name__ == "__main__": | |
| # Create and launch the interface | |
| demo = create_interface() | |
| demo.launch( | |
| server_name="0.0.0.0", | |
| server_port=7860, | |
| show_error=True, | |
| show_api=False | |
| ) |