# Hugging Face Space: premium GAIA Level 1 agent (Gradio app)
| import os | |
| import gradio as gr | |
| import requests | |
| import pandas as pd | |
| from smolagents import CodeAgent, InferenceClientModel, OpenAIModel | |
# --- Constants ---
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# System prompt steering the agent toward GAIA's strict scoring format:
# bare answers only, no prefixes, no explanations.
GAIA_SYSTEM_PROMPT = """You are solving GAIA level 1 questions with extreme precision.
CRITICAL RULES:
1. Return ONLY the final answer - no explanations, no context, no preamble
2. For numbers: just the number (no units unless explicitly requested)
3. For strings: just the answer (no articles like "the" or "a")
4. For lists: format as "item1, item2, item3" (no quotes, no brackets)
STRATEGY:
- Use web search liberally - search multiple times with different keywords
- Visit actual webpages to get complete information
- Cross-reference multiple sources
- Think step-by-step but output only the final answer
- If you find relevant info but not the complete answer, search again with more specific terms
NEVER output:
- "FINAL ANSWER:"
- "The answer is:"
- Explanations or reasoning
- "No information found" (keep searching!)
Examples of correct outputs:
Question: "How many studio albums?" → Answer: "7"
Question: "What is the capital?" → Answer: "Paris"
Question: "List the winners" → Answer: "John, Mary, Bob"
"""
class SmolGaiaAgent:
    """
    Premium agent optimized for maximum accuracy on GAIA Level 1.

    Wraps a smolagents ``CodeAgent`` around an OpenAI chat model and
    aggressively post-processes the raw model output so that only the
    bare answer (GAIA's expected format) is returned.
    """

    def __init__(self):
        """Build the model and the agent.

        Falls back to prepending the system prompt to each task when the
        installed smolagents version does not accept a ``system_prompt``
        keyword on ``CodeAgent``.
        """
        print("Initializing Premium SmolGaiaAgent...")
        # Use the most capable model available
        # Option 1: OpenAI gpt-4.1 (current choice)
        self.model = OpenAIModel(
            model_id="gpt-4.1",
            api_key=os.getenv("OPENAI_API_KEY"),
        )
        # Option 2: Try Claude via InferenceClientModel if available
        # self.model = InferenceClientModel(
        #     model_id="anthropic/claude-3-5-sonnet",
        #     api_key=os.getenv("ANTHROPIC_API_KEY"),
        # )
        # MORE STEPS = Better accuracy (but slower)
        try:
            self.agent = CodeAgent(
                tools=[],
                add_base_tools=True,
                model=self.model,
                max_steps=12,  # INCREASED from 6 to 12 for thorough reasoning
                system_prompt=GAIA_SYSTEM_PROMPT,
            )
            print("Agent initialized with system_prompt parameter")
            self.use_task_prefix = False
        except TypeError as e:
            # Older smolagents: no system_prompt kwarg, so the instructions
            # are prepended to every task instead (see __call__).
            print(f"system_prompt not supported, using task prefix: {e}")
            self.agent = CodeAgent(
                tools=[],
                add_base_tools=True,
                model=self.model,
                max_steps=12,
            )
            self.use_task_prefix = True

    def __call__(self, question: str) -> str:
        """
        Runs the CodeAgent on one question with enhanced answer extraction.

        Returns the cleaned answer string, or a fixed error message when
        the agent raises.
        """
        print(f"[Premium Agent] Question: {question[:80]}...")
        if self.use_task_prefix:
            task = f"{GAIA_SYSTEM_PROMPT}\n\nTask: {question}"
        else:
            task = question
        try:
            answer = self.agent.run(task)
            answer = str(answer).strip()
            # Enhanced answer cleaning
            answer = self.aggressive_clean_answer(answer)
            print(f"[Premium Agent] Final Answer: {answer}")
            return answer
        except Exception as e:
            print(f"[Premium Agent] Error: {e}")
            import traceback
            traceback.print_exc()
            return "Error processing question"

    def aggressive_clean_answer(self, answer: str) -> str:
        """
        Aggressively clean the answer to extract just the answer.

        Strips boilerplate prefixes ("final answer:", ...), surrounding
        quotes, anything before a trailing "is:"/"are:", a trailing period
        (unless it follows a digit, e.g. "3."), and leading articles.
        """
        original = answer
        # Remove common prefixes (case insensitive)
        prefixes_to_remove = [
            "final answer:",
            "the final answer is:",
            "answer:",
            "the answer is:",
            "the answer is",
            "result:",
            "solution:",
            "output:",
        ]
        answer_lower = answer.lower()
        for prefix in prefixes_to_remove:
            if answer_lower.startswith(prefix):
                answer = answer[len(prefix):].strip()
                answer_lower = answer.lower()
        # Remove surrounding quotes
        if (answer.startswith('"') and answer.endswith('"')) or \
           (answer.startswith("'") and answer.endswith("'")):
            answer = answer[1:-1].strip()
        # If answer contains "is:" extract what comes after
        if " is:" in answer.lower():
            parts = answer.split("is:")
            if len(parts) > 1:
                answer = parts[-1].strip()
        # If answer contains "are:" extract what comes after
        if " are:" in answer.lower():
            parts = answer.split("are:")
            if len(parts) > 1:
                answer = parts[-1].strip()
        # Remove trailing periods (unless it's a decimal number).
        # The len() guard fixes an IndexError the original raised when the
        # whole answer was just ".".
        if answer.endswith('.') and len(answer) > 1 and not answer[-2].isdigit():
            answer = answer[:-1].strip()
        # If answer starts with "The " and is followed by a name/noun, remove "The "
        if answer.startswith("The ") and len(answer) > 4:
            # Check if next word is capitalized (likely a proper noun)
            next_word = answer.split()[1] if len(answer.split()) > 1 else ""
            if next_word and next_word[0].isupper():
                answer = answer[4:].strip()
        # Remove "a " or "an " from the beginning
        if answer.lower().startswith("a "):
            answer = answer[2:].strip()
        elif answer.lower().startswith("an "):
            answer = answer[3:].strip()
        print(f"[Cleaning] Original: '{original}' → Cleaned: '{answer}'")
        return answer
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """
    Fetches all questions, runs the Premium Agent, submits answers.

    Returns a ``(status_message, results_dataframe)`` tuple for the Gradio
    UI; the dataframe is ``None`` when the run aborts before any question
    is processed.
    """
    space_id = os.getenv("SPACE_ID")
    if profile is None:
        return "Please Login to Hugging Face with the button.", None
    try:
        username = profile.username
        print(f"User logged in: {username}")
    except AttributeError:
        return "Please Login to Hugging Face with the button.", None
    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"
    # 1. Instantiate Agent
    print("\n" + "="*70)
    print("INITIALIZING PREMIUM AGENT")
    print("="*70)
    try:
        agent = SmolGaiaAgent()
    except Exception as e:
        print(f"Error instantiating agent: {e}")
        import traceback
        traceback.print_exc()
        return f"Error initializing agent: {e}", None
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    # 2. Fetch Questions
    print(f"\nFetching questions from: {questions_url}")
    try:
        response = requests.get(questions_url, timeout=15)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            return "Fetched questions list is empty or invalid format.", None
        print(f"β Fetched {len(questions_data)} questions.")
    except Exception as e:
        return f"Error fetching questions: {e}", None
    # 3. Run Agent with detailed progress tracking
    results_log = []
    answers_payload = []
    total = len(questions_data)
    print("\n" + "="*70)
    print(f"PROCESSING {total} QUESTIONS")
    print("="*70 + "\n")
    for idx, item in enumerate(questions_data, 1):
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print(f"β Skipping item with missing task_id or question")
            continue
        print(f"\n{'='*70}")
        print(f"QUESTION {idx}/{total}")
        print(f"Task ID: {task_id}")
        print(f"Question: {question_text[:100]}...")
        print('='*70)
        try:
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({
                "Task ID": task_id,
                "Question": question_text,
                "Submitted Answer": submitted_answer
            })
            print(f"β Answer recorded: {submitted_answer}")
        except Exception as e:
            # One failing question must not abort the whole run; record the
            # error in the log table and keep going.
            print(f"β Error processing question: {e}")
            import traceback
            traceback.print_exc()
            results_log.append({
                "Task ID": task_id,
                "Question": question_text,
                "Submitted Answer": f"AGENT ERROR: {e}"
            })
    if not answers_payload:
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
    # 4. Submit
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload
    }
    print("\n" + "="*70)
    print(f"SUBMITTING {len(answers_payload)} ANSWERS")
    print("="*70)
    try:
        response = requests.post(submit_url, json=submission_data, timeout=60)
        response.raise_for_status()
        result_data = response.json()
        score = result_data.get('score', 'N/A')
        correct = result_data.get('correct_count', '?')
        total_attempted = result_data.get('total_attempted', '?')
        # Parse the score defensively: the original called float(score)
        # directly, so a missing score (default 'N/A') raised ValueError
        # inside this try block and misreported a *successful* submission
        # as "Submission Failed".
        try:
            score_value = float(score)
        except (TypeError, ValueError):
            score_value = 0.0
        final_status = (
            f"π Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {score}% ({correct}/{total_attempted} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}\n\n"
            f"{'π EXCELLENT!' if score_value >= 80 else 'π Good job!' if score_value >= 50 else 'πͺ Keep improving!'}"
        )
        print(f"\nβ Submission successful! Score: {score}%")
        results_df = pd.DataFrame(results_log)
        return final_status, results_df
    except Exception as e:
        print(f"β Submission error: {e}")
        results_df = pd.DataFrame(results_log)
        return f"Submission Failed: {e}", results_df
# --- Build Gradio Interface ---
with gr.Blocks() as demo:
    gr.Markdown("# π Premium Agent - Optimized for Maximum Accuracy")
    # NOTE: the description below is kept in sync with SmolGaiaAgent.__init__
    # (model gpt-4.1, max_steps=12); the original text still advertised the
    # older Qwen 32B configuration.
    gr.Markdown(
        """
**Current Configuration:**
- π§ Model: gpt-4.1 via OpenAI API (most capable)
- π Max Steps: 12 (thorough reasoning)
- π§Ή Enhanced answer cleaning
- π Detailed progress logging
**Target Performance:**
- β±οΈ Time: ~20-25 minutes for 20 questions
- π― Target Score: 60-80% (realistic for Level 1)
- π Stretch Goal: 80%+ with optimal configuration
**To Reach 100%:**
Getting 100% on GAIA Level 1 is extremely difficult. The benchmark shows:
- GPT-4 achieves ~70-80%
- Claude 3.5 achieves ~75-85%
- Human experts achieve ~90-95%
For the best possible score:
1. β Use this premium configuration (12 steps, gpt-4.1)
2. π Manually review failed questions and add custom logic
3. π οΈ Create specialized tools for specific question types
4. π§ͺ Test and iterate on difficult questions
"""
    )
    gr.LoginButton()
    run_button = gr.Button("π Run Premium Evaluation & Submit")
    status_output = gr.Textbox(label="Run Status / Submission Result", lines=7, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
    # No explicit inputs: Gradio injects the OAuth profile from LoginButton
    # into run_and_submit_all's typed `profile` parameter.
    run_button.click(
        fn=run_and_submit_all,
        outputs=[status_output, results_table]
    )
if __name__ == "__main__":
    # Script entry point: print startup diagnostics, then serve the UI.
    banner = "=" * 70
    print("\n" + banner)
    print("PREMIUM AGENT STARTING")
    print(banner)
    runtime_host = os.getenv("SPACE_HOST")
    repo_id = os.getenv("SPACE_ID")
    if runtime_host:
        print(f"β Runtime URL: https://{runtime_host}.hf.space")
    if repo_id:
        print(f"β Repo URL: https://huggingface.co/spaces/{repo_id}/tree/main")
    print(banner + "\n")
    demo.launch(debug=True, share=False)