import os
import json
import traceback

import gradio as gr
import pandas as pd
import requests

from agent import BasicAgent

DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

HF_TOKEN = os.getenv("HF_TOKEN_HERE")
if not HF_TOKEN:
    raise ValueError("HF_TOKEN_HERE is missing in Secrets!")

HEADERS = {
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json"
}

VALIDATION_URL = "https://huggingface.co/datasets/gaia-benchmark/GAIA/resolve/main/2023/validation/metadata.jsonl"


def fetch_validation_questions():
    """Fetch Level 1 validation questions from the GAIA metadata file."""
    try:
        response = requests.get(VALIDATION_URL, headers=HEADERS, timeout=15)
        response.raise_for_status()
        questions = []
        for line in response.text.splitlines():
            if not line.strip():
                continue
            try:
                row = json.loads(line)
                if row.get("Level") == 1:
                    questions.append({
                        "task_id": row.get("task_id", ""),
                        "question": row.get("Question", ""),
                        "file_name": row.get("file_name", "")
                    })
            except json.JSONDecodeError as e:
                print(f"Error parsing line: {line[:50]}... Error: {e}")
                continue
        print(f"Fetched {len(questions)} Level 1 validation questions.")
        return questions[:20]  # Limit to 20 for testing
    except Exception as e:
        print(f"Error fetching validation questions: {e}")
        print(f"Traceback: {traceback.format_exc()}")
        return []


def run_and_submit_all(use_validation: bool, profile: gr.OAuthProfile | None = None):
    """Run the agent on all questions and, in test mode, submit the answers."""
    space_id = os.getenv("SPACE_ID") or "saandip5/Final_Assignment_Template"

    if profile:
        username = profile.username
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    print(f"Agent code link: {agent_code}")

    # Initialize agent with error handling
    try:
        agent = BasicAgent()
        print("Agent initialized successfully")
    except Exception as e:
        error_msg = f"Error initializing agent: {e}\n{traceback.format_exc()}"
        print(error_msg)
        return error_msg, None

    # Fetch questions
    if use_validation:
        print("Using validation dataset...")
        questions_data = fetch_validation_questions()
    else:
        print(f"Fetching test questions from: {questions_url}")
        try:
            response = requests.get(questions_url, headers=HEADERS, timeout=15)
            response.raise_for_status()
            questions_data = response.json()
            print(f"Fetched {len(questions_data)} test questions.")
        except requests.exceptions.RequestException as e:
            error_msg = f"Error fetching questions: {e}"
            print(error_msg)
            return error_msg, None
        except json.JSONDecodeError as e:
            error_msg = f"Error decoding JSON response: {e}"
            print(error_msg)
            return error_msg, None

    if not questions_data:
        error_msg = "Fetched questions list is empty."
        print(error_msg)
        return error_msg, None
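
    # Each item is expected to carry at least the fields read in the loop
    # below; a hypothetical example (values invented for illustration):
    #   {"task_id": "abc-123", "question": "What is ...?", "file_name": "data.xlsx"}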

    # Process questions
    results_log = []
    answers_payload = []
    successful_answers = 0

    print(f"\n{'=' * 60}")
    print(f"STARTING EVALUATION ON {len(questions_data)} QUESTIONS")
    print(f"{'=' * 60}")

    for i, item in enumerate(questions_data, 1):
        task_id = item.get("task_id")
        question_text = item.get("question")
        file_name = item.get("file_name", "")

        print(f"\n[{i}/{len(questions_data)}] Processing task: {task_id}")

        if not task_id or question_text is None:
            print(f"Skipping item with missing data: {item}")
            continue

        # Truncate long questions so the results table stays readable
        display_question = question_text[:100] + "..." if len(question_text) > 100 else question_text

        try:
            # Call the agent; it is expected to return a string answer,
            # or "unknown" when it cannot answer
            submitted_answer = agent(question_text, task_id, file_name)

            if submitted_answer and submitted_answer != "unknown":
                successful_answers += 1
                print(f"  Answer: {submitted_answer}")
            else:
                print("  No answer found")

            answers_payload.append({
                "task_id": task_id,
                "submitted_answer": submitted_answer
            })
            results_log.append({
                "Task ID": task_id,
                "Question": display_question,
                "File": file_name,
                "Submitted Answer": submitted_answer,
                "Status": "Success" if submitted_answer != "unknown" else "❓ Unknown"
            })
        except Exception as e:
            error_msg = f"AGENT ERROR: {e}"
            print(f"  Error processing task {task_id}: {e}")
            print(f"Traceback: {traceback.format_exc()}")
            results_log.append({
                "Task ID": task_id,
                "Question": display_question,
                "File": file_name,
                "Submitted Answer": error_msg,
                "Status": "Error"
            })

    print(f"\n{'=' * 60}")
    print("EVALUATION COMPLETE")
    print(f"Total questions: {len(questions_data)}")
    print(f"Successful answers: {successful_answers}")
    print(f"Success rate: {successful_answers / len(questions_data) * 100:.1f}%")
    print(f"{'=' * 60}")

    if not answers_payload:
        error_msg = "Agent did not produce any answers to submit."
        print(error_msg)
        return error_msg, pd.DataFrame(results_log)

    # Save results log for offline debugging
    try:
        with open("results_log.json", "w") as f:
            json.dump(results_log, f, indent=2)
        print("Saved results_log.json")
    except Exception as e:
        print(f"Error saving results_log.json: {e}")

    # Prepare submission
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload
    }
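
    # Illustrative payload shape (values are placeholders); the keys mirror
    # exactly what is built above and POSTed to the /submit endpoint:
    #   {
    #       "username": "your-hf-username",
    #       "agent_code": "https://huggingface.co/spaces/<space_id>/tree/main",
    #       "answers": [{"task_id": "abc-123", "submitted_answer": "42"}]
    #   }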

    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    print(status_update)

    # Submit or return results
    if not use_validation:
        print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
        try:
            response = requests.post(submit_url, json=submission_data, headers=HEADERS, timeout=60)
            response.raise_for_status()
            result_data = response.json()
            final_status = (
                "Submission Successful!\n"
                f"User: {result_data.get('username')}\n"
                f"Overall Score: {result_data.get('score', 'N/A')}% "
                f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
                f"Message: {result_data.get('message', 'No message received.')}\n\n"
                "Processing Summary:\n"
                f"• Total questions processed: {len(questions_data)}\n"
                f"• Answers found (non-'unknown'): {successful_answers}\n"
                f"• Processing success rate: {successful_answers / len(questions_data) * 100:.1f}%"
            )
            print("Submission successful.")
            return final_status, pd.DataFrame(results_log)
        except requests.exceptions.HTTPError as e:
            error_detail = f"Server responded with status {e.response.status_code}."
            try:
                error_json = e.response.json()
                error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
            except Exception:
                error_detail += f" Response: {e.response.text[:500]}"
            status_message = f"Submission Failed: {error_detail}"
            print(status_message)
            return status_message, pd.DataFrame(results_log)
        except Exception as e:
            status_message = f"Submission Failed: {e}\n{traceback.format_exc()}"
            print(status_message)
            return status_message, pd.DataFrame(results_log)
    else:
        print("Validation mode: Skipping submission, returning results.")
        validation_summary = (
            "Validation Run Complete\n\n"
            "Summary:\n"
            f"• Total questions processed: {len(questions_data)}\n"
            f"• Answers found (non-'unknown'): {successful_answers}\n"
            f"• Processing success rate: {successful_answers / len(questions_data) * 100:.1f}%\n\n"
            "This gives you an estimate of potential performance.\n"
            "Check the results table below for a detailed breakdown."
        )
        return validation_summary, pd.DataFrame(results_log)


# Gradio Interface
with gr.Blocks(title="GAIA Benchmark Agent Evaluation", theme=gr.themes.Soft()) as demo:
    gr.Markdown("# GAIA Benchmark Agent Evaluation")
    gr.Markdown(
        """
        ### Instructions:
        1. **Setup**: Ensure `HF_TOKEN_HERE` is set in the Space Secrets
        2. **Development**: Clone this Space and modify `agent.py` with your logic
        3. **Authentication**: Log in to Hugging Face below
        4. **Testing**: Check 'Use Validation' for local testing, or leave it unchecked to submit to the test set
        5. **Run**: Click 'Run Evaluation & Submit All Answers' to process the questions and submit

        ### Important Notes:
        - **Validation Mode**: Use this to test your agent on known questions before submitting
        - **Test Mode**: Submits to the actual benchmark (limited submissions per day)
        - **Processing Time**: May take several minutes depending on the number of questions
        - **Debugging**: Check `results_log.json` if you need to debug failures

        ### Current Goal: Improve accuracy
        """
    )

    gr.LoginButton()

    with gr.Row():
        use_validation = gr.Checkbox(
            label="🧪 Use Validation Set for Testing",
            value=True,  # Default to validation for safety
            info="Recommended: test on the validation set before submitting to the test set"
        )

    run_button = gr.Button(
        "🚀 Run Evaluation & Submit All Answers",
        variant="primary",
        size="lg"
    )

    status_output = gr.Textbox(
        label="Run Status / Submission Result",
        lines=10,
        interactive=False,
        show_copy_button=True
    )

    results_table = gr.DataFrame(
        label="Detailed Results: Questions and Agent Answers",
        wrap=True,
        interactive=False
    )

    run_button.click(
        fn=run_and_submit_all,
        inputs=[use_validation],
        outputs=[status_output, results_table]
    )

if __name__ == "__main__":
    print("\n" + "=" * 70)
    print(" GAIA BENCHMARK AGENT - STARTING UP ")
    print("=" * 70)

    space_host = os.getenv("SPACE_HOST")
    space_id = os.getenv("SPACE_ID") or "saandip5/Final_Assignment_Template"

    if space_host:
        print(f"SPACE_HOST found: {space_host}")
        print(f"Runtime URL: https://{space_host}")
    else:
        print("SPACE_HOST not found (running locally?)")

    if space_id:
        print(f"SPACE_ID found: {space_id}")
        print(f"Repo URL: https://huggingface.co/spaces/{space_id}")
        print(f"Repo Tree URL: https://huggingface.co/spaces/{space_id}/tree/main")
    else:
        print("SPACE_ID not found (running locally?)")

    print("=" * 70)
    print("Launching Gradio Interface...")
    print("=" * 70 + "\n")

    demo.launch(debug=True, share=False)
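
# Local usage sketch (assumes this script is saved as app.py, a hypothetical
# filename, and that the HF_TOKEN_HERE secret is exported in your shell):
#   export HF_TOKEN_HERE=hf_xxxxxxxxxxxx
#   python app.py
# With share=False, Gradio serves the UI locally (http://127.0.0.1:7860 by default).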