Final_Assignment_Template

Sleeping

App Files Files Community

Snaseem2026 committed on Jan 6

Commit

b14f9b4

verified ·

1 Parent(s): 81917a3

Update app.py

Browse files

Files changed (1) hide show

app.py +807 -187

app.py CHANGED Viewed

@@ -1,196 +1,816 @@
-import os
 import gradio as gr
 import requests
-import inspect
-import pandas as pd
-# (Keep Constants as is)
-# --- Constants ---
-DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
-# --- Basic Agent Definition ---
-# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
-class BasicAgent:
-    def __init__(self):
-        print("BasicAgent initialized.")
-    def __call__(self, question: str) -> str:
-        print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = "This is a default answer."
-        print(f"Agent returning fixed answer: {fixed_answer}")
-        return fixed_answer
-def run_and_submit_all( profile: gr.OAuthProfile | None):
-    """
-    Fetches all questions, runs the BasicAgent on them, submits all answers,
-    and displays the results.
-    """
-    # --- Determine HF Space Runtime URL and Repo URL ---
-    space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
-    if profile:
-        username= f"{profile.username}"
-        print(f"User logged in: {username}")
-    else:
-        print("User not logged in.")
-        return "Please Login to Hugging Face with the button.", None
-    api_url = DEFAULT_API_URL
-    questions_url = f"{api_url}/questions"
-    submit_url = f"{api_url}/submit"
-    # 1. Instantiate Agent ( modify this part to create your agent)
     try:
-        agent = BasicAgent()
     except Exception as e:
-        print(f"Error instantiating agent: {e}")
-        return f"Error initializing agent: {e}", None
-    # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
-    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
-    print(agent_code)
-    # 2. Fetch Questions
-    print(f"Fetching questions from: {questions_url}")
     try:
-        response = requests.get(questions_url, timeout=15)
-        response.raise_for_status()
-        questions_data = response.json()
-        if not questions_data:
-             print("Fetched questions list is empty.")
-             return "Fetched questions list is empty or invalid format.", None
-        print(f"Fetched {len(questions_data)} questions.")
-    except requests.exceptions.RequestException as e:
-        print(f"Error fetching questions: {e}")
-        return f"Error fetching questions: {e}", None
-    except requests.exceptions.JSONDecodeError as e:
-         print(f"Error decoding JSON response from questions endpoint: {e}")
-         print(f"Response text: {response.text[:500]}")
-         return f"Error decoding server response for questions: {e}", None
     except Exception as e:
-        print(f"An unexpected error occurred fetching questions: {e}")
-        return f"An unexpected error occurred fetching questions: {e}", None
-    # 3. Run your Agent
-    results_log = []
-    answers_payload = []
-    print(f"Running agent on {len(questions_data)} questions...")
-    for item in questions_data:
-        task_id = item.get("task_id")
-        question_text = item.get("question")
-        if not task_id or question_text is None:
-            print(f"Skipping item with missing task_id or question: {item}")
-            continue
-        try:
-            submitted_answer = agent(question_text)
-            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
-        except Exception as e:
-             print(f"Error running agent on task {task_id}: {e}")
-             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
-    if not answers_payload:
-        print("Agent did not produce any answers to submit.")
-        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
-    # 4. Prepare Submission
-    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
-    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
-    print(status_update)
-    # 5. Submit
-    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
     try:
-        response = requests.post(submit_url, json=submission_data, timeout=60)
-        response.raise_for_status()
-        result_data = response.json()
-        final_status = (
-            f"Submission Successful!\n"
-            f"User: {result_data.get('username')}\n"
-            f"Overall Score: {result_data.get('score', 'N/A')}% "
-            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
-            f"Message: {result_data.get('message', 'No message received.')}"
         )
-        print("Submission successful.")
-        results_df = pd.DataFrame(results_log)
-        return final_status, results_df
-    except requests.exceptions.HTTPError as e:
-        error_detail = f"Server responded with status {e.response.status_code}."
-        try:
-            error_json = e.response.json()
-            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
-        except requests.exceptions.JSONDecodeError:
-            error_detail += f" Response: {e.response.text[:500]}"
-        status_message = f"Submission Failed: {error_detail}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.Timeout:
-        status_message = "Submission Failed: The request timed out."
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-    except requests.exceptions.RequestException as e:
-        status_message = f"Submission Failed: Network error - {e}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
     except Exception as e:
-        status_message = f"An unexpected error occurred during submission: {e}"
-        print(status_message)
-        results_df = pd.DataFrame(results_log)
-        return status_message, results_df
-# --- Build Gradio Interface using Blocks ---
-with gr.Blocks() as demo:
-    gr.Markdown("# Basic Agent Evaluation Runner")
-    gr.Markdown(
-        """
-        **Instructions:**
-        1.  Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
-        2.  Log in to your Hugging Face account using the button below. This uses your HF username for submission.
-        3.  Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
-        ---
-        **Disclaimers:**
-        Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
-        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
-        """
-    )
-    gr.LoginButton()
-    run_button = gr.Button("Run Evaluation & Submit All Answers")
-    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
-    # Removed max_rows=10 from DataFrame constructor
-    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
-    run_button.click(
-        fn=run_and_submit_all,
-        outputs=[status_output, results_table]
-    )
-if __name__ == "__main__":
-    print("\n" + "-"*30 + " App Starting " + "-"*30)
-    # Check for SPACE_HOST and SPACE_ID at startup for information
-    space_host_startup = os.getenv("SPACE_HOST")
-    space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
-    if space_host_startup:
-        print(f"✅ SPACE_HOST found: {space_host_startup}")
-        print(f"   Runtime URL should be: https://{space_host_startup}.hf.space")
-    else:
-        print("ℹ️  SPACE_HOST environment variable not found (running locally?).")
-    if space_id_startup: # Print repo URLs if SPACE_ID is found
-        print(f"✅ SPACE_ID found: {space_id_startup}")
-        print(f"   Repo URL: https://huggingface.co/spaces/{space_id_startup}")
-        print(f"   Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
-    else:
-        print("ℹ️  SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
-    print("-"*(60 + len(" App Starting ")) + "\n")
-    print("Launching Gradio Interface for Basic Agent Evaluation...")
-    demo.launch(debug=True, share=False)

+"""
+GAIA Benchmark Agent - Final Assignment
+This agent answers GAIA Level 1 questions using web search, calculation, and reasoning.
+"""
+from smolagents import CodeAgent, HfApiModel, tool, DuckDuckGoSearchTool
+import requests
 import gradio as gr
+import json
+# ============================================================================
+# TOOLS DEFINITION
+# ============================================================================
+# Tool 1: Web Search (built-in)
+search_tool = DuckDuckGoSearchTool()
+# Tool 2: Calculator
@tool
def calculator(expression: str) -> str:
    """Evaluates mathematical expressions safely.
    Args:
        expression: A mathematical expression like '2+2', '15*23', or '100/4'
    Returns:
        The calculated result as a string
    """
    # NOTE(security): this tool used to call eval() with empty builtins. That
    # is not a sandbox (attribute walking from a literal still reaches
    # arbitrary objects), and the expression text is produced by the LLM.
    # The expression is now parsed with `ast` and only plain arithmetic on
    # int/float literals is evaluated; anything else yields the error string.
    # Imports and helpers are function-local so the tool stays self-contained.
    import ast
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

    def evaluate(node):
        """Recursively evaluate an arithmetic-only AST node."""
        # bool is excluded on purpose: only real numeric literals are allowed.
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = evaluate(node.left)
            right = evaluate(node.right)
            # Cap exponents so a tiny expression cannot pin the CPU.
            if isinstance(node.op, ast.Pow) and abs(right) > 10_000:
                raise ValueError("exponent too large")
            return binary_ops[type(node.op)](left, right)
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](evaluate(node.operand))
        raise ValueError("unsupported expression")

    try:
        # strip() mirrors eval(), which tolerates surrounding whitespace.
        tree = ast.parse(expression.strip(), mode="eval")
        result = evaluate(tree.body)
        return f"{result}"
    except Exception as e:
        return f"Error calculating: {str(e)}"
+# Tool 3: Get Question File
@tool
def get_question_file(task_id: str) -> str:
    """Downloads and reads a file associated with a GAIA question.
    Args:
        task_id: The task ID from the question
    Returns:
        The file content or error message
    """
    try:
        reply = requests.get(
            f"https://agents-course-unit4-scoring.hf.space/files/{task_id}",
            timeout=30,
        )
        # Any status other than 200 is reported back to the agent as text.
        if reply.status_code != 200:
            return f"Could not fetch file. Status code: {reply.status_code}"
        # Only the leading 1000 characters of the decoded body are passed on.
        preview = reply.text[:1000]
        return f"File content (first 1000 chars):\n{preview}"
    except Exception as err:
        # Failures are turned into a message rather than raised to the agent.
        return f"Error fetching file: {str(err)}"
+# Tool 4: Final Answer (CRITICAL!)
@tool
def final_answer(answer: str) -> str:
    """Returns the final answer to the question.
    IMPORTANT: Use this ONLY ONCE when you have the exact answer.
    The answer should be precise, concise, and exactly formatted.
    Args:
        answer: The exact answer with no extra text or explanation
    Returns:
        The answer
    """
    # The tool is invoked from agent-written Python, which may pass a number
    # (e.g. final_answer(30)); coerce first so .strip() cannot raise.
    return str(answer).strip()
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
def clean_answer(raw_answer: str) -> str:
    """
    Cleans the agent's response to extract the exact answer.

    Strips one leading boilerplate prefix (case-insensitive), surrounding
    quotes, and a trailing period that does not follow a digit.

    Args:
        raw_answer: The agent's raw output; falsy values yield "".

    Returns:
        The cleaned answer string (possibly empty).
    """
    if not raw_answer:
        return ""
    answer = str(raw_answer).strip()
    # Remove common prefixes (case-insensitive); only the first match is cut.
    prefixes_to_remove = (
        "the answer is",
        "the result is",
        "final answer:",
        "answer:",
        "final_answer:",
        "result:",
        "output:",
    )
    answer_lower = answer.lower()
    for prefix in prefixes_to_remove:
        if answer_lower.startswith(prefix):
            answer = answer[len(prefix):].strip()
            # "The answer is: 42" would otherwise leave ": 42" behind.
            if not prefix.endswith(":") and answer.startswith(":"):
                answer = answer[1:].strip()
            break
    # Remove surrounding quotes
    answer = answer.strip('"\'')
    # Remove a trailing period unless it follows a digit (e.g. "3.").
    # The length guard avoids indexing answer[-2] on a lone ".".
    if answer.endswith('.') and (len(answer) < 2 or not answer[-2].isdigit()):
        answer = answer[:-1]
    return answer
+# ============================================================================
+# AGENT SETUP
+# ============================================================================
+# Set up the AI model
# Instruction text that process_single_question prepends to every question.
system_prompt = """You are a precise AI assistant solving GAIA benchmark questions.
CRITICAL RULES:
1. Give EXACT answers ONLY - no explanations, no preamble
2. Format matters: check if answer should be a number, name, date, etc.
3. For numbers: give just the number (e.g., "42" not "The answer is 42")
4. For names: use proper capitalization as commonly written
5. For lists: follow exact format requested (comma-separated, etc.)
6. Use tools efficiently - web_search for facts, calculator for math
7. When you have the final answer, use the final_answer tool ONCE
8. Double-check your answer before using final_answer tool
EXAMPLES OF CORRECT ANSWERS:
- Question: "What is 15% of 200?" → Answer: "30"
- Question: "Who founded Microsoft?" → Answer: "Bill Gates"
- Question: "What year was Python released?" → Answer: "1991"
Remember: EXACT MATCH scoring. Close doesn't count!"""
# Language model backing the agent; the low temperature favours consistency.
model = HfApiModel(
    temperature=0.1,
    max_tokens=4096,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
)
# Code-writing agent wired to the four tools defined in this file.
agent = CodeAgent(
    tools=[search_tool, calculator, get_question_file, final_answer],
    model=model,
    additional_authorized_imports=["requests", "json"],
    verbosity_level=2,  # print the reasoning process
    max_steps=12,  # step budget per question
)
+# ============================================================================
+# MAIN AGENT LOGIC
+# ============================================================================
def process_single_question(question_data, progress_callback=None):
    """
    Process a single GAIA question.

    Args:
        question_data: Mapping with 'task_id', the question text under
            'question' (or 'Question'), and optionally 'file_name'.
        progress_callback: Optional callable that receives a status string
            before the agent starts.

    Returns:
        A dict with 'task_id', 'submitted_answer' and 'question' (first 100
        characters), plus 'raw_answer' on success or 'error' on failure.

    Raises:
        KeyError: If 'task_id' or the question text is missing.
    """
    task_id = question_data['task_id']
    # The template this app replaced read the text from the lowercase
    # "question" key; accept both spellings so neither payload shape breaks.
    question_text = question_data.get('question')
    if question_text is None:
        question_text = question_data['Question']
    # Only mention the file tool when the question actually names a file.
    if question_data.get('file_name'):
        file_note = f"NOTE: This question has an attached file. Use get_question_file('{task_id}') to access it."
    else:
        file_note = ""
    # Construct the prompt
    prompt = f"""{system_prompt}
Question: {question_text}
{file_note}
Instructions:
1. Analyze the question carefully
2. Use tools as needed (web_search, calculator, get_question_file)
3. When you have the exact answer, use final_answer(your_answer)
4. Remember: ONLY the answer, nothing else!
Now solve this question."""
    if progress_callback:
        progress_callback(f"Processing: {question_text[:100]}...")
    try:
        # Run the agent, then normalise its output for submission.
        result = agent.run(prompt)
        cleaned = clean_answer(str(result))
        return {
            "task_id": task_id,
            "submitted_answer": cleaned,
            "raw_answer": str(result),
            "question": question_text[:100]
        }
    except Exception as e:
        # A failed question still produces a placeholder entry so the caller
        # can keep going with the remaining questions.
        print(f"Error on task {task_id}: {e}")
        return {
            "task_id": task_id,
            "submitted_answer": "Error",
            "error": str(e),
            "question": question_text[:100]
        }
def run_full_evaluation(username, progress=gr.Progress()):
    """
    Fetches all questions, runs agent on each, and submits to the API.

    Args:
        username: Hugging Face username sent with the submission.
        progress: Gradio progress tracker used for status updates.

    Returns:
        A dict describing the outcome: an 'error' entry when the username is
        blank or something fails, otherwise 'status', 'score',
        'total_questions', 'submission_details' and 'sample_answers'.
    """
    if not username or username.strip() == "":
        return {"error": "Please provide your Hugging Face username"}
    try:
        # Step 1: Fetch questions
        progress(0, desc="Fetching questions from API...")
        response = requests.get(
            "https://agents-course-unit4-scoring.hf.space/questions",
            timeout=30
        )
        # Fail fast on HTTP errors instead of parsing an error page.
        response.raise_for_status()
        questions = response.json()
        # Don't submit an empty answer sheet for an empty or malformed payload.
        if not isinstance(questions, list) or not questions:
            return {
                "status": "❌ Error",
                "error": "No questions were returned by the API"
            }
        total_questions = len(questions)
        progress(0.1, desc=f"Got {total_questions} questions. Starting evaluation...")
        # Step 2: Process each question
        all_answers = []
        results_log = []
        for idx, question in enumerate(questions):
            # Map the loop onto 0.1-0.9 so the bar never runs backwards
            # before the 0.95 submission step.
            progress(0.1 + 0.8 * idx / total_questions,
                     desc=f"Processing question {idx + 1}/{total_questions}")
            result = process_single_question(question)
            all_answers.append({
                "task_id": result["task_id"],
                "submitted_answer": result["submitted_answer"]
            })
            results_log.append(result)
            print(f"\n{'='*60}")
            print(f"Question {idx + 1}: {result['question']}")
            print(f"Answer: {result['submitted_answer']}")
            print(f"{'='*60}\n")
        # Step 3: Submit to API
        progress(0.95, desc="Submitting answers to scoring API...")
        submission_data = {
            "username": username.strip(),
            "agent_code": "https://huggingface.co/spaces/Snaseem2026/Final_Assignment_Template/tree/main",
            "answers": all_answers
        }
        submit_response = requests.post(
            "https://agents-course-unit4-scoring.hf.space/submit",
            json=submission_data,
            timeout=60
        )
        if submit_response.status_code == 200:
            result_data = submit_response.json()
            progress(1.0, desc="✅ Submission complete!")
            return {
                "status": "✅ Success!",
                "score": result_data.get("score", "N/A"),
                "total_questions": total_questions,
                "submission_details": result_data,
                "sample_answers": results_log[:5]  # Show first 5 for debugging
            }
        else:
            return {
                "status": "❌ Submission failed",
                "error": submit_response.text,
                "sample_answers": results_log[:5]
            }
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        return {
            "status": "❌ Error",
            "error": str(e)
        }
def test_single_question(progress=gr.Progress()):
    """
    Test the agent on one random question (for debugging).

    Args:
        progress: Gradio progress tracker used for status updates.

    Returns:
        A dict with 'question', 'task_id', 'agent_answer' and 'raw_output',
        or a dict with a single 'error' entry when anything fails.
    """
    try:
        progress(0.3, desc="Fetching random question...")
        response = requests.get(
            "https://agents-course-unit4-scoring.hf.space/random-question",
            timeout=30
        )
        # Surface HTTP failures instead of parsing an error body as a question.
        response.raise_for_status()
        question = response.json()
        progress(0.5, desc="Running agent...")
        result = process_single_question(question)
        progress(1.0, desc="✅ Complete!")
        # The template this app replaced read the text from the lowercase
        # "question" key; accept both spellings.
        question_text = question.get('question')
        if question_text is None:
            question_text = question['Question']
        return {
            "question": question_text,
            "task_id": result['task_id'],
            "agent_answer": result['submitted_answer'],
            "raw_output": result.get('raw_answer', 'N/A')
        }
    except Exception as e:
        # Top-level UI boundary: show the failure in the JSON panel.
        return {"error": str(e)}
+# ============================================================================
+# GRADIO INTERFACE
+# ============================================================================
# Four-tab UI: single-question test, full run + submission, and two
# static information tabs.
with gr.Blocks(title="GAIA Agent Evaluator") as demo:
    gr.Markdown("""
    # 🤖 GAIA Benchmark Agent - Final Assignment
    This agent solves GAIA Level 1 questions using reasoning, web search, and calculation tools.
    **Target Score:** 30% or higher (6/20 questions) to pass ✅
    ### How to use:
    1. **Test Mode**: Click "Test on Random Question" to see how your agent performs
    2. **Full Evaluation**: Enter your HF username and run full evaluation on all 20 questions
    3. **Submit**: Results automatically submitted to the leaderboard
    **Note:** Make sure this Space is PUBLIC for your submission to count!
    """)
    # Tab 1: run the agent on one random question and show the result as JSON.
    with gr.Tab("🧪 Test Mode"):
        gr.Markdown("### Test your agent on a single random question")
        test_button = gr.Button("🎲 Test on Random Question", variant="primary")
        test_output = gr.JSON(label="Test Results")
        test_button.click(
            fn=test_single_question,
            outputs=test_output,
        )
    # Tab 2: collect the username, then run every question and submit.
    with gr.Tab("🚀 Full Evaluation"):
        gr.Markdown("### Run complete evaluation and submit to leaderboard")
        with gr.Row():
            username_input = gr.Textbox(label="Your Hugging Face Username", placeholder="e.g., Snaseem2026", info="Required for leaderboard submission")
        submit_button = gr.Button("▶️ Run Full Evaluation & Submit", variant="primary", size="lg")
        gr.Markdown("""
        ⚠️ **This will take 10-20 minutes** to process all 20 questions.
        The agent will:
        - Fetch all 20 GAIA questions
        - Answer each using web search, calculation, and reasoning
        - Submit results to the scoring API
        - Update the leaderboard automatically
        """)
        results_output = gr.JSON(label="Evaluation Results")
        submit_button.click(fn=run_full_evaluation, inputs=username_input, outputs=results_output)
    # Tab 3: static pointer to the public leaderboard.
    with gr.Tab("📊 Leaderboard"):
        gr.Markdown("""
        ### Check Your Score
        After submission, view the leaderboard here:
        👉 [Students Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
        Your score = (Correct Answers / 20) × 100%
        **Passing Score:** 30% or higher (6/20 questions correct)
        """)
    # Tab 4: static description of the tools and model configuration.
    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ### Tools Available:
        - 🔍 **Web Search** (DuckDuckGo): For finding current information
        - 🧮 **Calculator**: For mathematical calculations
        - 📁 **File Reader**: For questions with attachments
        - ✅ **Final Answer**: Returns the exact answer
        ### Tips for Better Scores:
        1. Answers must be EXACT MATCH (case-sensitive)
        2. No extra text - just the answer
        3. Format matters (numbers vs words vs dates)
        4. Test on random questions first before full evaluation
        5. Check the leaderboard to see what scores are realistic
        ### Current Model:
        - **Qwen/Qwen2.5-Coder-32B-Instruct** (Good at reasoning and code)
        - Temperature: 0.1 (focused, deterministic)
        - Max steps: 12 (allows multi-step reasoning)
        """)
# Start the app without creating a public share link.
demo.launch(share=False)
+```
+"""
+GAIA Benchmark Agent - Final Assignment
+This agent answers GAIA Level 1 questions using web search, calculation, and reasoning.
+"""
+from smolagents import CodeAgent, HfApiModel, tool, DuckDuckGoSearchTool
 import requests
+import gradio as gr
+import json
+# ============================================================================
+# TOOLS DEFINITION
+# ============================================================================
+# Tool 1: Web Search (built-in)
+search_tool = DuckDuckGoSearchTool()
+# Tool 2: Calculator
@tool
def calculator(expression: str) -> str:
    """Evaluates mathematical expressions safely.
    Args:
        expression: A mathematical expression like '2+2', '15*23', or '100/4'
    Returns:
        The calculated result as a string
    """
    # NOTE(security): this tool used to call eval() with empty builtins. That
    # is not a sandbox (attribute walking from a literal still reaches
    # arbitrary objects), and the expression text is produced by the LLM.
    # The expression is now parsed with `ast` and only plain arithmetic on
    # int/float literals is evaluated; anything else yields the error string.
    # Imports and helpers are function-local so the tool stays self-contained.
    import ast
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

    def evaluate(node):
        """Recursively evaluate an arithmetic-only AST node."""
        # bool is excluded on purpose: only real numeric literals are allowed.
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = evaluate(node.left)
            right = evaluate(node.right)
            # Cap exponents so a tiny expression cannot pin the CPU.
            if isinstance(node.op, ast.Pow) and abs(right) > 10_000:
                raise ValueError("exponent too large")
            return binary_ops[type(node.op)](left, right)
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](evaluate(node.operand))
        raise ValueError("unsupported expression")

    try:
        # strip() mirrors eval(), which tolerates surrounding whitespace.
        tree = ast.parse(expression.strip(), mode="eval")
        result = evaluate(tree.body)
        return f"{result}"
    except Exception as e:
        return f"Error calculating: {str(e)}"
+# Tool 3: Get Question File
@tool
def get_question_file(task_id: str) -> str:
    """Downloads and reads a file associated with a GAIA question.
    Args:
        task_id: The task ID from the question
    Returns:
        The file content or error message
    """
    try:
        reply = requests.get(
            f"https://agents-course-unit4-scoring.hf.space/files/{task_id}",
            timeout=30,
        )
        # Any status other than 200 is reported back to the agent as text.
        if reply.status_code != 200:
            return f"Could not fetch file. Status code: {reply.status_code}"
        # Only the leading 1000 characters of the decoded body are passed on.
        preview = reply.text[:1000]
        return f"File content (first 1000 chars):\n{preview}"
    except Exception as err:
        # Failures are turned into a message rather than raised to the agent.
        return f"Error fetching file: {str(err)}"
+# Tool 4: Final Answer (CRITICAL!)
@tool
def final_answer(answer: str) -> str:
    """Returns the final answer to the question.
    IMPORTANT: Use this ONLY ONCE when you have the exact answer.
    The answer should be precise, concise, and exactly formatted.
    Args:
        answer: The exact answer with no extra text or explanation
    Returns:
        The answer
    """
    # The tool is invoked from agent-written Python, which may pass a number
    # (e.g. final_answer(30)); coerce first so .strip() cannot raise.
    return str(answer).strip()
+# ============================================================================
+# HELPER FUNCTIONS
+# ============================================================================
def clean_answer(raw_answer: str) -> str:
    """
    Cleans the agent's response to extract the exact answer.

    Strips one leading boilerplate prefix (case-insensitive), surrounding
    quotes, and a trailing period that does not follow a digit.

    Args:
        raw_answer: The agent's raw output; falsy values yield "".

    Returns:
        The cleaned answer string (possibly empty).
    """
    if not raw_answer:
        return ""
    answer = str(raw_answer).strip()
    # Remove common prefixes (case-insensitive); only the first match is cut.
    prefixes_to_remove = (
        "the answer is",
        "the result is",
        "final answer:",
        "answer:",
        "final_answer:",
        "result:",
        "output:",
    )
    answer_lower = answer.lower()
    for prefix in prefixes_to_remove:
        if answer_lower.startswith(prefix):
            answer = answer[len(prefix):].strip()
            # "The answer is: 42" would otherwise leave ": 42" behind.
            if not prefix.endswith(":") and answer.startswith(":"):
                answer = answer[1:].strip()
            break
    # Remove surrounding quotes
    answer = answer.strip('"\'')
    # Remove a trailing period unless it follows a digit (e.g. "3.").
    # The length guard avoids indexing answer[-2] on a lone ".".
    if answer.endswith('.') and (len(answer) < 2 or not answer[-2].isdigit()):
        answer = answer[:-1]
    return answer
+# ============================================================================
+# AGENT SETUP
+# ============================================================================
+# Set up the AI model
# Instruction text that process_single_question prepends to every question.
system_prompt = """You are a precise AI assistant solving GAIA benchmark questions.
CRITICAL RULES:
1. Give EXACT answers ONLY - no explanations, no preamble
2. Format matters: check if answer should be a number, name, date, etc.
3. For numbers: give just the number (e.g., "42" not "The answer is 42")
4. For names: use proper capitalization as commonly written
5. For lists: follow exact format requested (comma-separated, etc.)
6. Use tools efficiently - web_search for facts, calculator for math
7. When you have the final answer, use the final_answer tool ONCE
8. Double-check your answer before using final_answer tool
EXAMPLES OF CORRECT ANSWERS:
- Question: "What is 15% of 200?" → Answer: "30"
- Question: "Who founded Microsoft?" → Answer: "Bill Gates"
- Question: "What year was Python released?" → Answer: "1991"
Remember: EXACT MATCH scoring. Close doesn't count!"""
# Language model backing the agent; the low temperature favours consistency.
model = HfApiModel(
    temperature=0.1,
    max_tokens=4096,
    model_id='Qwen/Qwen2.5-Coder-32B-Instruct',
)
# Code-writing agent wired to the four tools defined in this file.
agent = CodeAgent(
    tools=[search_tool, calculator, get_question_file, final_answer],
    model=model,
    additional_authorized_imports=["requests", "json"],
    verbosity_level=2,  # print the reasoning process
    max_steps=12,  # step budget per question
)
+# ============================================================================
+# MAIN AGENT LOGIC
+# ============================================================================
def process_single_question(question_data, progress_callback=None):
    """
    Process a single GAIA question.

    Args:
        question_data: Mapping with 'task_id', the question text under
            'question' (or 'Question'), and optionally 'file_name'.
        progress_callback: Optional callable that receives a status string
            before the agent starts.

    Returns:
        A dict with 'task_id', 'submitted_answer' and 'question' (first 100
        characters), plus 'raw_answer' on success or 'error' on failure.

    Raises:
        KeyError: If 'task_id' or the question text is missing.
    """
    task_id = question_data['task_id']
    # The template this app replaced read the text from the lowercase
    # "question" key; accept both spellings so neither payload shape breaks.
    question_text = question_data.get('question')
    if question_text is None:
        question_text = question_data['Question']
    # Only mention the file tool when the question actually names a file.
    if question_data.get('file_name'):
        file_note = f"NOTE: This question has an attached file. Use get_question_file('{task_id}') to access it."
    else:
        file_note = ""
    # Construct the prompt
    prompt = f"""{system_prompt}
Question: {question_text}
{file_note}
Instructions:
1. Analyze the question carefully
2. Use tools as needed (web_search, calculator, get_question_file)
3. When you have the exact answer, use final_answer(your_answer)
4. Remember: ONLY the answer, nothing else!
Now solve this question."""
    if progress_callback:
        progress_callback(f"Processing: {question_text[:100]}...")
    try:
        # Run the agent, then normalise its output for submission.
        result = agent.run(prompt)
        cleaned = clean_answer(str(result))
        return {
            "task_id": task_id,
            "submitted_answer": cleaned,
            "raw_answer": str(result),
            "question": question_text[:100]
        }
    except Exception as e:
        # A failed question still produces a placeholder entry so the caller
        # can keep going with the remaining questions.
        print(f"Error on task {task_id}: {e}")
        return {
            "task_id": task_id,
            "submitted_answer": "Error",
            "error": str(e),
            "question": question_text[:100]
        }
def run_full_evaluation(username, progress=gr.Progress()):
    """
    Fetches all questions, runs agent on each, and submits to the API.

    Args:
        username: Hugging Face username sent with the submission.
        progress: Gradio progress tracker used for status updates.

    Returns:
        A dict describing the outcome: an 'error' entry when the username is
        blank or something fails, otherwise 'status', 'score',
        'total_questions', 'submission_details' and 'sample_answers'.
    """
    if not username or username.strip() == "":
        return {"error": "Please provide your Hugging Face username"}
    try:
        # Step 1: Fetch questions
        progress(0, desc="Fetching questions from API...")
        response = requests.get(
            "https://agents-course-unit4-scoring.hf.space/questions",
            timeout=30
        )
        # Fail fast on HTTP errors instead of parsing an error page.
        response.raise_for_status()
        questions = response.json()
        # Don't submit an empty answer sheet for an empty or malformed payload.
        if not isinstance(questions, list) or not questions:
            return {
                "status": "❌ Error",
                "error": "No questions were returned by the API"
            }
        total_questions = len(questions)
        progress(0.1, desc=f"Got {total_questions} questions. Starting evaluation...")
        # Step 2: Process each question
        all_answers = []
        results_log = []
        for idx, question in enumerate(questions):
            # Map the loop onto 0.1-0.9 so the bar never runs backwards
            # before the 0.95 submission step.
            progress(0.1 + 0.8 * idx / total_questions,
                     desc=f"Processing question {idx + 1}/{total_questions}")
            result = process_single_question(question)
            all_answers.append({
                "task_id": result["task_id"],
                "submitted_answer": result["submitted_answer"]
            })
            results_log.append(result)
            print(f"\n{'='*60}")
            print(f"Question {idx + 1}: {result['question']}")
            print(f"Answer: {result['submitted_answer']}")
            print(f"{'='*60}\n")
        # Step 3: Submit to API
        progress(0.95, desc="Submitting answers to scoring API...")
        submission_data = {
            "username": username.strip(),
            "agent_code": "https://huggingface.co/spaces/Snaseem2026/Final_Assignment_Template/tree/main",
            "answers": all_answers
        }
        submit_response = requests.post(
            "https://agents-course-unit4-scoring.hf.space/submit",
            json=submission_data,
            timeout=60
        )
        if submit_response.status_code == 200:
            result_data = submit_response.json()
            progress(1.0, desc="✅ Submission complete!")
            return {
                "status": "✅ Success!",
                "score": result_data.get("score", "N/A"),
                "total_questions": total_questions,
                "submission_details": result_data,
                "sample_answers": results_log[:5]  # Show first 5 for debugging
            }
        else:
            return {
                "status": "❌ Submission failed",
                "error": submit_response.text,
                "sample_answers": results_log[:5]
            }
    except Exception as e:
        # Top-level UI boundary: report the failure instead of crashing.
        return {
            "status": "❌ Error",
            "error": str(e)
        }
+def test_single_question(progress=gr.Progress()):
+    """
+    Test the agent on one random question (for debugging).
+    Fetches one question from the scoring API's /random-question endpoint,
+    runs it through process_single_question and returns a dict that the
+    "Test Results" JSON panel can display.
+    Args:
+        progress: Gradio progress tracker used to report the current stage.
+    Returns:
+        dict: on success, the keys "question", "task_id", "agent_answer" and
+        "raw_output" ("N/A" when the agent result has no 'raw_answer');
+        on any failure, {"error": <message>}. This function never raises.
+    """
     try:
+        progress(0.3, desc="Fetching random question...")
+        response = requests.get(
+            "https://agents-course-unit4-scoring.hf.space/random-question",
+            timeout=30
         )
+        # Surface HTTP failures (rate limit, server error) as a readable error
+        # instead of feeding an error payload to the agent as if it were a question.
+        response.raise_for_status()
+        question = response.json()
+        progress(0.5, desc="Running agent...")
+        result = process_single_question(question)
+        progress(1.0, desc="✅ Complete!")
+        # The question text may arrive under "Question" or "question"; accept
+        # both, then fall back to the text echoed in the agent result, so a key
+        # casing mismatch no longer ends in an opaque KeyError message.
+        question_text = question.get("Question")
+        if question_text is None:
+            question_text = question.get("question", result.get("question"))
+        return {
+            "question": question_text,
+            "task_id": result['task_id'],
+            "agent_answer": result['submitted_answer'],
+            "raw_output": result.get('raw_answer', 'N/A')
+        }
     except Exception as e:
+        # Top-level UI boundary: report the failure in the JSON panel.
+        return {"error": str(e)}
+# ============================================================================
+# GRADIO INTERFACE
+# ============================================================================
+# Build the UI declaratively: components appear in the order they are created,
+# and each interactive tab wires one button to one handler function.
+with gr.Blocks(title="GAIA Agent Evaluator") as demo:
+    # Page header with usage instructions, rendered as Markdown.
+    gr.Markdown("""
+    # 🤖 GAIA Benchmark Agent - Final Assignment
+    This agent solves GAIA Level 1 questions using reasoning, web search, and calculation tools.
+    **Target Score:** 30% or higher (6/20 questions) to pass ✅
+    ### How to use:
+    1. **Test Mode**: Click "Test on Random Question" to see how your agent performs
+    2. **Full Evaluation**: Enter your HF username and run full evaluation on all 20 questions
+    3. **Submit**: Results automatically submitted to the leaderboard
+    **Note:** Make sure this Space is PUBLIC for your submission to count!
+    """)
+    # Test tab: a click calls test_single_question with no inputs and shows
+    # the dict it returns in the JSON panel.
+    with gr.Tab("🧪 Test Mode"):
+        gr.Markdown("### Test your agent on a single random question")
+        test_button = gr.Button("🎲 Test on Random Question", variant="primary")
+        test_output = gr.JSON(label="Test Results")
+        test_button.click(fn=test_single_question, outputs=test_output)
+    # Full-evaluation tab: the textbox value is passed to run_full_evaluation
+    # as its input, and the dict it returns is shown in the JSON panel.
+    with gr.Tab("🚀 Full Evaluation"):
+        gr.Markdown("### Run complete evaluation and submit to leaderboard")
+        with gr.Row():
+            username_input = gr.Textbox(
+                label="Your Hugging Face Username",
+                placeholder="e.g., Snaseem2026",
+                info="Required for leaderboard submission"
+            )
+        submit_button = gr.Button("▶️ Run Full Evaluation & Submit", variant="primary", size="lg")
+        gr.Markdown("""
+        ⚠️ **This will take 10-20 minutes** to process all 20 questions.
+        The agent will:
+        - Fetch all 20 GAIA questions
+        - Answer each using web search, calculation, and reasoning
+        - Submit results to the scoring API
+        - Update the leaderboard automatically
+        """)
+        results_output = gr.JSON(label="Evaluation Results")
+        submit_button.click(
+            fn=run_full_evaluation,
+            inputs=username_input,
+            outputs=results_output
+        )
+    # Static tab: link to the course leaderboard and the pass threshold.
+    with gr.Tab("📊 Leaderboard"):
+        gr.Markdown("""
+        ### Check Your Score
+        After submission, view the leaderboard here:
+        👉 [Students Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)
+        Your score = (Correct Answers / 20) × 100%
+        **Passing Score:** 30% or higher (6/20 questions correct)
+        """)
+    # Static tab: descriptive text only.
+    # NOTE(review): the tool list, model name, temperature and max steps below
+    # are plain text; confirm they match the agent configuration defined
+    # elsewhere in this file.
+    with gr.Tab("ℹ️ About"):
+        gr.Markdown("""
+        ### Tools Available:
+        - 🔍 **Web Search** (DuckDuckGo): For finding current information
+        - 🧮 **Calculator**: For mathematical calculations
+        - 📁 **File Reader**: For questions with attachments
+        - ✅ **Final Answer**: Returns the exact answer
+        ### Tips for Better Scores:
+        1. Answers must be EXACT MATCH (case-sensitive)
+        2. No extra text - just the answer
+        3. Format matters (numbers vs words vs dates)
+        4. Test on random questions first before full evaluation
+        5. Check the leaderboard to see what scores are realistic
+        ### Current Model:
+        - **Qwen/Qwen2.5-Coder-32B-Instruct** (Good at reasoning and code)
+        - Temperature: 0.1 (focused, deterministic)
+        - Max steps: 12 (allows multi-step reasoning)
+        """)
+# Launch the interface at module level; share=False means no public
+# share link is requested.
+demo.launch(share=False)