"""Gradio app that runs a LangGraph-based agent against the GAIA benchmark API.

Fetches questions (and their attached files) from the scoring API, runs the
GAIAAgent on them, validates answer formatting, and submits results to the
course leaderboard.
"""

import json
import logging
import os
import tempfile
from typing import Optional

import gradio as gr
import pandas as pd
import requests

# Import the optimized agent from the separate module
from agent_enhanced import GAIAAgent

# ============ CONFIGURATION ============
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


# ============ API INTERACTION ============

def fetch_questions(api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> list:
    """Fetch all questions from the GAIA API with retry logic.

    Raises the last ``requests`` exception if every attempt fails.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{api_url}/questions", timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            logger.warning("Attempt %d failed: %s", attempt + 1, e)
            if attempt == max_retries - 1:
                raise
    return []


def fetch_random_question(api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> dict:
    """Fetch a random question from the GAIA API with retry logic.

    Raises the last ``requests`` exception if every attempt fails.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{api_url}/random-question", timeout=30)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            logger.warning("Attempt %d failed: %s", attempt + 1, e)
            if attempt == max_retries - 1:
                raise
    return {}


def fetch_file(task_id: str, api_url: str = DEFAULT_API_URL,
               max_retries: int = 3) -> Optional[str]:
    """Fetch the file associated with a task, with retry logic.

    Returns the local path of the downloaded file, or None when the task has
    no file (404) or all retries fail. Never raises.
    """
    for attempt in range(max_retries):
        try:
            response = requests.get(f"{api_url}/files/{task_id}", timeout=30)
            if response.status_code == 200:
                # Derive a filename from the Content-Disposition header when present.
                content_disposition = response.headers.get('content-disposition', '')
                filename = f"task_{task_id}_file"
                if 'filename=' in content_disposition:
                    # basename() guards against path traversal via a hostile header
                    # (e.g. filename="../../etc/passwd").
                    filename = os.path.basename(
                        content_disposition.split('filename=')[1].strip('"')
                    )
                temp_dir = tempfile.mkdtemp()
                file_path = os.path.join(temp_dir, filename)
                with open(file_path, 'wb') as f:
                    f.write(response.content)
                logger.info("Downloaded file: %s", file_path)
                return file_path
            elif response.status_code == 404:
                logger.info("No file found for task %s", task_id)
                return None
            else:
                # Unexpected status (e.g. 5xx): raise so the retry loop engages
                # instead of silently looping without logging.
                response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.warning("File fetch attempt %d failed: %s", attempt + 1, e)
            if attempt == max_retries - 1:
                logger.error("Failed to fetch file for task %s: %s", task_id, e)
                return None
    return None


def submit_answers(username: str, agent_code: str, answers: list,
                   api_url: str = DEFAULT_API_URL, max_retries: int = 3) -> dict:
    """Submit answers to the GAIA API with retry logic.

    Raises the last ``requests`` exception if every attempt fails.
    """
    payload = {
        "username": username,
        "agent_code": agent_code,
        "answers": answers,
    }
    for attempt in range(max_retries):
        try:
            response = requests.post(f"{api_url}/submit", json=payload, timeout=60)
            response.raise_for_status()
            return response.json()
        except requests.exceptions.RequestException as e:
            logger.warning("Submission attempt %d failed: %s", attempt + 1, e)
            if attempt == max_retries - 1:
                raise
    return {}


# ============ ANSWER VALIDATION ============

def validate_answer_format(answer: str) -> tuple[bool, str]:
    """Validate answer format and return (is_valid, warning_message).

    GAIA answers are matched exactly, so prefixes and explanations are
    flagged as likely scorer rejections.
    """
    if not answer or answer.strip() == "":
        return False, "Warning: Answer is empty"

    # Common prefixes the scorer would reject; matching is case-insensitive,
    # so each prefix only needs to be listed once.
    prefixes = ["FINAL ANSWER:", "The answer is:", "Answer:"]
    answer_lower = answer.lower()
    for prefix in prefixes:
        if answer_lower.startswith(prefix.lower()):
            return False, (f"Warning: Answer contains prefix '{prefix}' "
                           "which will be removed. Consider removing it.")

    # Heuristic: multiple sentences or causal connectives suggest the answer
    # contains an explanation rather than the bare value.
    if answer.count('.') > 1 or 'because' in answer or 'since' in answer:
        return False, ("Warning: Answer may contain explanations. "
                       "Only the answer should be submitted.")
    return True, ""


def _cleanup_temp_file(file_path: Optional[str]) -> None:
    """Best-effort removal of a downloaded temp file and its directory."""
    if not file_path or not os.path.exists(file_path):
        return
    try:
        os.remove(file_path)
        temp_dir = os.path.dirname(file_path)
        if os.path.exists(temp_dir):
            try:
                os.rmdir(temp_dir)
            except OSError:
                pass  # directory not empty or already gone - fine
    except OSError as e:
        logger.warning("Failed to cleanup file %s: %s", file_path, e)


# ============ GRADIO INTERFACE ============

def run_agent_on_questions(openai_api_key: str, progress=gr.Progress()):
    """Run the agent on all GAIA questions.

    Returns (results DataFrame or error string, answers list or None).
    """
    if not openai_api_key:
        return "Please provide your OpenAI API key.", None
    try:
        # Initialize agent
        progress(0, desc="Initializing agent...")
        agent = GAIAAgent(api_key=openai_api_key)

        # Fetch questions
        progress(0.05, desc="Fetching questions from API...")
        questions = fetch_questions()
        if not questions:
            return "Error: Failed to fetch questions from API. Please try again.", None

        total_questions = len(questions)
        results = []
        answers_for_submission = []

        for i, q in enumerate(questions):
            progress((i + 1) / total_questions,
                     desc=f"Processing question {i+1}/{total_questions}...")
            task_id = q.get("task_id", "")
            question_text = q.get("question", "")

            # Download the attached file, if the question has one
            file_path = None
            if q.get("file_name"):
                progress((i + 0.5) / total_questions,
                         desc=f"Downloading file for question {i+1}...")
                file_path = fetch_file(task_id)

            # Run agent; a per-question failure becomes an "Error:" answer
            # instead of aborting the whole run.
            try:
                progress((i + 0.7) / total_questions,
                         desc=f"Agent reasoning for question {i+1}...")
                answer = agent.run(question_text, task_id, file_path)

                # Validate answer format
                is_valid, warning = validate_answer_format(answer)
                if not is_valid:
                    logger.warning("Question %d (%s): %s", i + 1, task_id, warning)
            except Exception as e:
                logger.error("Error processing question %d (%s): %s", i + 1, task_id, e)
                answer = f"Error: {str(e)}"

            results.append({
                "Task ID": task_id,
                "Question": (question_text[:100] + "..."
                             if len(question_text) > 100 else question_text),
                "Answer": answer,
                "Status": "✓" if answer and not answer.startswith("Error:") else "✗",
            })
            answers_for_submission.append({
                "task_id": task_id,
                "submitted_answer": answer,
            })

            _cleanup_temp_file(file_path)

        df = pd.DataFrame(results)
        progress(1.0, desc="Complete!")
        return df, answers_for_submission
    except Exception as e:
        logger.error("Error in run_agent_on_questions: %s", e)
        return f"Error: {str(e)}", None


def submit_to_leaderboard(username: str, space_url: str, answers_json: str):
    """Submit answers to the leaderboard and format the result as Markdown."""
    if not username or not space_url or not answers_json:
        return "Please fill in all fields and run the agent first."
    try:
        answers = (json.loads(answers_json)
                   if isinstance(answers_json, str) else answers_json)
        if not isinstance(answers, list) or len(answers) == 0:
            return "Error: Answers must be a non-empty list. Please run the agent first."

        # Validate answer format before submission
        warnings = []
        for ans in answers:
            if "task_id" not in ans or "submitted_answer" not in ans:
                return ("Error: Invalid answer format. Each answer must have "
                        "'task_id' and 'submitted_answer'.")
            is_valid, warning = validate_answer_format(ans.get("submitted_answer", ""))
            if not is_valid:
                warnings.append(f"Task {ans.get('task_id')}: {warning}")

        # Ensure space URL ends with /tree/main
        if not space_url.endswith("/tree/main"):
            space_url = space_url.rstrip("/") + "/tree/main"

        # Submit to API
        result = submit_answers(username, space_url, answers)
        # NOTE(review): display assumes the API returns score as a fraction
        # (0-1); verify against the scoring API response schema.
        score = result.get("score", 0)
        correct = result.get("correct_count", 0)
        total = result.get("total_attempted", 0)

        warning_text = ""
        if warnings:
            warning_text = ("\n\n⚠️ **Warnings:**\n"
                            + "\n".join(f"- {w}" for w in warnings[:5]))
            if len(warnings) > 5:
                warning_text += f"\n- ... and {len(warnings) - 5} more warnings"

        return f"""
## Submission Successful! 🎉

**Score:** {score:.1%}
**Correct:** {correct}/{total}

{'🏆 Congratulations! You passed the 30% threshold!' if score >= 0.3 else '📈 Keep improving! You need 30% to earn your certificate.'}
{warning_text}

Check the [leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard) to see your ranking!
"""
    except json.JSONDecodeError as e:
        return f"Error: Invalid JSON format. Please run the agent first.\nDetails: {str(e)}"
    except Exception as e:
        logger.error("Submission error: %s", e)
        return f"Submission error: {str(e)}"


def test_single_question(openai_api_key: str):
    """Test the agent on a single random question.

    Returns (question, answer, task_id, validation_status) for the UI.
    """
    if not openai_api_key:
        return "Please provide your OpenAI API key.", "", "", ""
    try:
        agent = GAIAAgent(api_key=openai_api_key)
        question_data = fetch_random_question()
        if not question_data:
            return "Error: Failed to fetch question from API.", "", "", ""

        task_id = question_data.get("task_id", "")
        question_text = question_data.get("question", "")

        file_path = None
        if question_data.get("file_name"):
            file_path = fetch_file(task_id)

        answer = agent.run(question_text, task_id, file_path)

        # Validate answer format
        is_valid, warning = validate_answer_format(answer)
        validation_status = "✓ Valid format" if is_valid else f"⚠️ {warning}"

        _cleanup_temp_file(file_path)

        return question_text, answer, task_id, validation_status
    except Exception as e:
        logger.error("Error in test_single_question: %s", e)
        return f"Error: {str(e)}", "", "", ""


# ============ BUILD GRADIO APP ============

with gr.Blocks(title="GAIA Agent - LangGraph", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🤖 GAIA Benchmark Agent (LangGraph)

    This agent uses **LangGraph** to solve GAIA benchmark questions. It has access to:
    - 🔍 Web Search (DuckDuckGo)
    - 📚 Wikipedia Search
    - 🐍 Python Code Execution
    - 📄 File Reading (PDF, Text, Excel)
    - 🔢 Calculator

    ## Instructions
    1. Enter your OpenAI API key
    2. Test with a single question or run on all questions
    3. Submit your answers to the leaderboard
    """)

    with gr.Row():
        openai_key = gr.Textbox(
            label="OpenAI API Key",
            type="password",
            placeholder="sk-...",
            info="Required for GPT-4o"
        )

    with gr.Tabs():
        with gr.TabItem("🧪 Test Single Question"):
            test_btn = gr.Button("Fetch & Solve Random Question", variant="primary")
            test_question = gr.Textbox(label="Question", lines=5, interactive=False)
            test_answer = gr.Textbox(label="Agent's Answer", lines=3, interactive=False)
            test_task_id = gr.Textbox(label="Task ID", interactive=False)
            test_validation = gr.Textbox(label="Answer Validation", interactive=False)

            test_btn.click(
                test_single_question,
                inputs=[openai_key],
                outputs=[test_question, test_answer, test_task_id, test_validation]
            )

        with gr.TabItem("🚀 Run Full Benchmark"):
            run_btn = gr.Button("Run Agent on All Questions", variant="primary")
            results_table = gr.Dataframe(label="Results")
            answers_state = gr.State()

            run_btn.click(
                run_agent_on_questions,
                inputs=[openai_key],
                outputs=[results_table, answers_state]
            )

        with gr.TabItem("📤 Submit to Leaderboard"):
            gr.Markdown("""
            ### Submit Your Results
            After running the full benchmark, fill in your details and submit to the leaderboard.

            **Requirements:**
            - Your HuggingFace username
            - Your Space URL (must end with `/tree/main`)
            - Answers will be auto-filled after running the benchmark
            """)

            with gr.Row():
                username_input = gr.Textbox(
                    label="HuggingFace Username",
                    placeholder="your-username",
                    info="Your HuggingFace account username"
                )
                space_url_input = gr.Textbox(
                    label="Your Space URL",
                    placeholder="https://huggingface.co/spaces/your-username/your-space",
                    info="Full URL to your Space (will auto-append /tree/main if needed)"
                )

            answers_input = gr.Textbox(
                label="Answers JSON (auto-filled after running benchmark)",
                lines=10,
                placeholder="Run the full benchmark first...",
                info="This will be automatically populated after running the benchmark"
            )

            submit_btn = gr.Button("Submit to Leaderboard", variant="primary")
            submit_result = gr.Markdown()

            # Auto-fill answers when benchmark completes
            def format_answers(answers):
                if answers:
                    return json.dumps(answers, indent=2)
                return ""

            answers_state.change(format_answers,
                                 inputs=[answers_state],
                                 outputs=[answers_input])

            submit_btn.click(
                submit_to_leaderboard,
                inputs=[username_input, space_url_input, answers_input],
                outputs=[submit_result]
            )

    gr.Markdown("""
    ---
    ### 📋 Tips for Better Scores

    **Answer Formatting:**
    - Answers are matched **exactly** (character-for-character), so precision is critical
    - Do NOT include prefixes like "FINAL ANSWER:" or "The answer is:"
    - For lists: use comma-separated format with NO spaces (e.g., "item1,item2,item3")
    - For numbers: just the number, no units unless specified
    - Check the validation status in the test tab

    **Agent Capabilities:**
    - Uses GPT-4o for optimal reasoning
    - Automatically reads files (PDFs, Excel, text) when available
    - Web search for current information
    - Wikipedia for factual lookups
    - Python execution for calculations

    **Best Practices:**
    1. Test with a single question first to verify the agent works
    2. Run the full benchmark (takes ~10-15 minutes)
    3. Review answers before submission
    4. Ensure your Space is public for verification

    ### 🔗 Links
    - [GAIA Benchmark](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
    - [Student Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
    - [Course Unit 4](https://huggingface.co/learn/agents-course/en/unit4/hands-on)
    - [API Documentation](https://agents-course-unit4-scoring.hf.space/docs)
    """)

if __name__ == "__main__":
    # For HuggingFace Spaces, use share=False
    # For local development, you can use share=True to get a public link
    demo.launch(server_name="0.0.0.0", server_port=7860)