"""Gradio Space that runs a smolagents CodeAgent on the GAIA Level 1
benchmark questions and submits the answers to the course scoring service."""

import os
import traceback

import gradio as gr
import pandas as pd
import requests
from smolagents import CodeAgent, InferenceClientModel, OpenAIModel

# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

GAIA_SYSTEM_PROMPT = """You are solving GAIA level 1 questions with extreme precision.

CRITICAL RULES:
1. Return ONLY the final answer - no explanations, no context, no preamble
2. For numbers: just the number (no units unless explicitly requested)
3. For strings: just the answer (no articles like "the" or "a")
4. For lists: format as "item1, item2, item3" (no quotes, no brackets)

STRATEGY:
- Use web search liberally - search multiple times with different keywords
- Visit actual webpages to get complete information
- Cross-reference multiple sources
- Think step-by-step but output only the final answer
- If you find relevant info but not the complete answer, search again with more specific terms

NEVER output:
- "FINAL ANSWER:"
- "The answer is:"
- Explanations or reasoning
- "No information found" (keep searching!)

Examples of correct outputs:
Question: "How many studio albums?" → Answer: "7"
Question: "What is the capital?" → Answer: "Paris"
Question: "List the winners" → Answer: "John, Mary, Bob"
"""


class SmolGaiaAgent:
    """Premium agent optimized for maximum accuracy on GAIA Level 1."""

    def __init__(self):
        """Build the model and the CodeAgent, with a fallback for older
        smolagents versions that do not accept ``system_prompt``."""
        print("Initializing Premium SmolGaiaAgent...")

        # Use the most capable model available.
        self.model = OpenAIModel(
            model_id="gpt-4.1",
            api_key=os.getenv("OPENAI_API_KEY"),
        )
        # Alternative: another provider via InferenceClientModel, e.g.
        # self.model = InferenceClientModel(
        #     model_id="anthropic/claude-3-5-sonnet",
        #     api_key=os.getenv("ANTHROPIC_API_KEY"),
        # )

        # More steps = better accuracy (but slower). Older smolagents
        # releases raise TypeError on the `system_prompt` kwarg; in that
        # case we prepend the prompt to every task instead (see __call__).
        try:
            self.agent = CodeAgent(
                tools=[],
                add_base_tools=True,
                model=self.model,
                max_steps=12,  # increased from 6 to 12 for thorough reasoning
                system_prompt=GAIA_SYSTEM_PROMPT,
            )
            print("Agent initialized with system_prompt parameter")
            self.use_task_prefix = False
        except TypeError as e:
            print(f"system_prompt not supported, using task prefix: {e}")
            self.agent = CodeAgent(
                tools=[],
                add_base_tools=True,
                model=self.model,
                max_steps=12,
            )
            self.use_task_prefix = True

    def __call__(self, question: str) -> str:
        """Run the CodeAgent on one question and return the cleaned answer.

        Returns the literal string "Error processing question" on failure so
        the submission loop never crashes on a single bad question.
        """
        print(f"[Premium Agent] Question: {question[:80]}...")

        if self.use_task_prefix:
            # Fallback path: inline the system prompt into the task itself.
            task = f"{GAIA_SYSTEM_PROMPT}\n\nTask: {question}"
        else:
            task = question

        try:
            answer = str(self.agent.run(task)).strip()
            answer = self.aggressive_clean_answer(answer)
            print(f"[Premium Agent] Final Answer: {answer}")
            return answer
        except Exception as e:
            print(f"[Premium Agent] Error: {e}")
            traceback.print_exc()
            return "Error processing question"

    def aggressive_clean_answer(self, answer: str) -> str:
        """Aggressively strip boilerplate so only the bare answer remains.

        GAIA scoring does exact matching, so prefixes like "Answer:",
        surrounding quotes, trailing periods and leading articles all cost
        points and are removed here.
        """
        original = answer

        # Remove common prefixes (case insensitive).
        prefixes_to_remove = [
            "final answer:",
            "the final answer is:",
            "answer:",
            "the answer is:",
            "the answer is",
            "result:",
            "solution:",
            "output:",
        ]
        answer_lower = answer.lower()
        for prefix in prefixes_to_remove:
            if answer_lower.startswith(prefix):
                answer = answer[len(prefix):].strip()
                answer_lower = answer.lower()

        # Remove surrounding quotes.
        if (answer.startswith('"') and answer.endswith('"')) or \
           (answer.startswith("'") and answer.endswith("'")):
            answer = answer[1:-1].strip()

        # If the answer embeds "is:" / "are:", keep only what follows.
        if " is:" in answer.lower():
            parts = answer.split("is:")
            if len(parts) > 1:
                answer = parts[-1].strip()
        if " are:" in answer.lower():
            parts = answer.split("are:")
            if len(parts) > 1:
                answer = parts[-1].strip()

        # Remove a trailing period unless it terminates a decimal number.
        # Length guard: a bare "." must not raise IndexError on answer[-2].
        if answer.endswith('.') and (len(answer) < 2 or not answer[-2].isdigit()):
            answer = answer[:-1].strip()

        # Drop a leading "The " when it precedes a likely proper noun.
        if answer.startswith("The ") and len(answer) > 4:
            words = answer.split()
            next_word = words[1] if len(words) > 1 else ""
            if next_word and next_word[0].isupper():
                answer = answer[4:].strip()

        # Drop leading articles "a " / "an ".
        if answer.lower().startswith("a "):
            answer = answer[2:].strip()
        elif answer.lower().startswith("an "):
            answer = answer[3:].strip()

        print(f"[Cleaning] Original: '{original}' → Cleaned: '{answer}'")
        return answer


def _encouragement(score) -> str:
    """Pick the cheer line for the final status message.

    Tolerates a non-numeric score (the API default is the string 'N/A'),
    which previously crashed the success path with a ValueError.
    """
    try:
        value = float(score)
    except (TypeError, ValueError):
        return "👍 Good job!"
    if value >= 80:
        return "🏆 EXCELLENT!"
    if value >= 50:
        return "👍 Good job!"
    return "💪 Keep improving!"


def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Fetch all questions, run the Premium Agent on each, submit answers.

    Returns a ``(status_message, results_dataframe)`` pair for the Gradio
    outputs; the dataframe is ``None`` when processing never started.
    """
    space_id = os.getenv("SPACE_ID")

    if profile is None:
        return "Please Login to Hugging Face with the button.", None
    try:
        username = profile.username
        print(f"User logged in: {username}")
    except AttributeError:
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # 1. Instantiate Agent
    print("\n" + "=" * 70)
    print("INITIALIZING PREMIUM AGENT")
    print("=" * 70)
    try:
        agent = SmolGaiaAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        traceback.print_exc()
        return f"Error initializing agent: {e}", None
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    # 2. Fetch Questions
    print(f"\nFetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            return "Fetched questions list is empty or invalid format.", None
        print(f"✓ Fetched {len(questions_data)} questions.")
    except Exception as e:
        return f"Error fetching questions: {e}", None

    # 3. Run Agent with detailed progress tracking
    results_log = []
    answers_payload = []
    total = len(questions_data)
    print("\n" + "=" * 70)
    print(f"PROCESSING {total} QUESTIONS")
    print("=" * 70 + "\n")

    for idx, item in enumerate(questions_data, 1):
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print("⚠ Skipping item with missing task_id or question")
            continue

        print(f"\n{'=' * 70}")
        print(f"QUESTION {idx}/{total}")
        print(f"Task ID: {task_id}")
        print(f"Question: {question_text[:100]}...")
        print('=' * 70)

        try:
            submitted_answer = agent(question_text)
            answers_payload.append(
                {"task_id": task_id, "submitted_answer": submitted_answer}
            )
            results_log.append({
                "Task ID": task_id,
                "Question": question_text,
                "Submitted Answer": submitted_answer,
            })
            print(f"✓ Answer recorded: {submitted_answer}")
        except Exception as e:
            print(f"✗ Error processing question: {e}")
            traceback.print_exc()
            # Record the failure so the results table stays complete.
            results_log.append({
                "Task ID": task_id,
                "Question": question_text,
                "Submitted Answer": f"AGENT ERROR: {e}",
            })

    if not answers_payload:
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4. Submit
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload,
    }
    print("\n" + "=" * 70)
    print(f"SUBMITTING {len(answers_payload)} ANSWERS")
    print("=" * 70)
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        score = result_data.get('score', 'N/A')
        correct = result_data.get('correct_count', '?')
        total_attempted = result_data.get('total_attempted', '?')
        final_status = (
            f"🎉 Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {score}% ({correct}/{total_attempted} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}\n\n"
            f"{_encouragement(score)}"
        )
        print(f"\n✓ Submission successful! Score: {score}%")
        results_df = pd.DataFrame(results_log)
        return final_status, results_df
    except Exception as e:
        print(f"✗ Submission error: {e}")
        results_df = pd.DataFrame(results_log)
        return f"Submission Failed: {e}", results_df


# --- Build Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# 🏆 Premium Agent - Optimized for Maximum Accuracy")
    gr.Markdown(
        """
        **Current Configuration:**
        - 🧠 Model: gpt-4.1 via the OpenAI API (most capable)
        - 🔄 Max Steps: 12 (thorough reasoning)
        - 🧹 Enhanced answer cleaning
        - 📊 Detailed progress logging

        **Target Performance:**
        - ⏱️ Time: ~20-25 minutes for 20 questions
        - 🎯 Target Score: 60-80% (realistic for Level 1)
        - 🏆 Stretch Goal: 80%+ with optimal configuration

        **To Reach 100%:**
        Getting 100% on GAIA Level 1 is extremely difficult. The benchmark shows:
        - GPT-4 achieves ~70-80%
        - Claude 3.5 achieves ~75-85%
        - Human experts achieve ~90-95%

        For the best possible score:
        1. ✅ Use this premium configuration (12 steps, top-tier model)
        2. 🔍 Manually review failed questions and add custom logic
        3. 🛠️ Create specialized tools for specific question types
        4. 🧪 Test and iterate on difficult questions
        """
    )

    gr.LoginButton()
    run_button = gr.Button("🚀 Run Premium Evaluation & Submit")
    status_output = gr.Textbox(
        label="Run Status / Submission Result", lines=7, interactive=False
    )
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # The gr.OAuthProfile annotation on run_and_submit_all is auto-injected
    # by Gradio's OAuth support, so no `inputs=` is needed here.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table],
    )


if __name__ == "__main__":
    print("\n" + "=" * 70)
    print("PREMIUM AGENT STARTING")
    print("=" * 70)
    space_host = os.getenv("SPACE_HOST")
    space_id = os.getenv("SPACE_ID")
    if space_host:
        print(f"✓ Runtime URL: https://{space_host}.hf.space")
    if space_id:
        print(f"✓ Repo URL: https://huggingface.co/spaces/{space_id}/tree/main")
    print("=" * 70 + "\n")
    demo.launch(debug=True, share=False)