Snaseem2026 commited on
Commit
b14f9b4
·
verified ·
1 Parent(s): 81917a3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +807 -187
app.py CHANGED
@@ -1,196 +1,816 @@
1
- import os
 
 
 
 
 
 
2
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  import requests
4
- import inspect
5
- import pandas as pd
6
-
7
- # (Keep Constants as is)
8
- # --- Constants ---
9
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
10
-
11
- # --- Basic Agent Definition ---
12
- # ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
13
- class BasicAgent:
14
- def __init__(self):
15
- print("BasicAgent initialized.")
16
- def __call__(self, question: str) -> str:
17
- print(f"Agent received question (first 50 chars): {question[:50]}...")
18
- fixed_answer = "This is a default answer."
19
- print(f"Agent returning fixed answer: {fixed_answer}")
20
- return fixed_answer
21
-
22
- def run_and_submit_all( profile: gr.OAuthProfile | None):
23
- """
24
- Fetches all questions, runs the BasicAgent on them, submits all answers,
25
- and displays the results.
26
- """
27
- # --- Determine HF Space Runtime URL and Repo URL ---
28
- space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
29
-
30
- if profile:
31
- username= f"{profile.username}"
32
- print(f"User logged in: {username}")
33
- else:
34
- print("User not logged in.")
35
- return "Please Login to Hugging Face with the button.", None
36
-
37
- api_url = DEFAULT_API_URL
38
- questions_url = f"{api_url}/questions"
39
- submit_url = f"{api_url}/submit"
40
-
41
- # 1. Instantiate Agent ( modify this part to create your agent)
42
  try:
43
- agent = BasicAgent()
 
 
44
  except Exception as e:
45
- print(f"Error instantiating agent: {e}")
46
- return f"Error initializing agent: {e}", None
47
- # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
48
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
49
- print(agent_code)
50
-
51
- # 2. Fetch Questions
52
- print(f"Fetching questions from: {questions_url}")
 
 
 
 
 
53
  try:
54
- response = requests.get(questions_url, timeout=15)
55
- response.raise_for_status()
56
- questions_data = response.json()
57
- if not questions_data:
58
- print("Fetched questions list is empty.")
59
- return "Fetched questions list is empty or invalid format.", None
60
- print(f"Fetched {len(questions_data)} questions.")
61
- except requests.exceptions.RequestException as e:
62
- print(f"Error fetching questions: {e}")
63
- return f"Error fetching questions: {e}", None
64
- except requests.exceptions.JSONDecodeError as e:
65
- print(f"Error decoding JSON response from questions endpoint: {e}")
66
- print(f"Response text: {response.text[:500]}")
67
- return f"Error decoding server response for questions: {e}", None
68
  except Exception as e:
69
- print(f"An unexpected error occurred fetching questions: {e}")
70
- return f"An unexpected error occurred fetching questions: {e}", None
71
-
72
- # 3. Run your Agent
73
- results_log = []
74
- answers_payload = []
75
- print(f"Running agent on {len(questions_data)} questions...")
76
- for item in questions_data:
77
- task_id = item.get("task_id")
78
- question_text = item.get("question")
79
- if not task_id or question_text is None:
80
- print(f"Skipping item with missing task_id or question: {item}")
81
- continue
82
- try:
83
- submitted_answer = agent(question_text)
84
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
85
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
86
- except Exception as e:
87
- print(f"Error running agent on task {task_id}: {e}")
88
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
89
-
90
- if not answers_payload:
91
- print("Agent did not produce any answers to submit.")
92
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
93
-
94
- # 4. Prepare Submission
95
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
96
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
97
- print(status_update)
98
-
99
- # 5. Submit
100
- print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  try:
102
- response = requests.post(submit_url, json=submission_data, timeout=60)
103
- response.raise_for_status()
104
- result_data = response.json()
105
- final_status = (
106
- f"Submission Successful!\n"
107
- f"User: {result_data.get('username')}\n"
108
- f"Overall Score: {result_data.get('score', 'N/A')}% "
109
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
110
- f"Message: {result_data.get('message', 'No message received.')}"
111
  )
112
- print("Submission successful.")
113
- results_df = pd.DataFrame(results_log)
114
- return final_status, results_df
115
- except requests.exceptions.HTTPError as e:
116
- error_detail = f"Server responded with status {e.response.status_code}."
117
- try:
118
- error_json = e.response.json()
119
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
120
- except requests.exceptions.JSONDecodeError:
121
- error_detail += f" Response: {e.response.text[:500]}"
122
- status_message = f"Submission Failed: {error_detail}"
123
- print(status_message)
124
- results_df = pd.DataFrame(results_log)
125
- return status_message, results_df
126
- except requests.exceptions.Timeout:
127
- status_message = "Submission Failed: The request timed out."
128
- print(status_message)
129
- results_df = pd.DataFrame(results_log)
130
- return status_message, results_df
131
- except requests.exceptions.RequestException as e:
132
- status_message = f"Submission Failed: Network error - {e}"
133
- print(status_message)
134
- results_df = pd.DataFrame(results_log)
135
- return status_message, results_df
136
  except Exception as e:
137
- status_message = f"An unexpected error occurred during submission: {e}"
138
- print(status_message)
139
- results_df = pd.DataFrame(results_log)
140
- return status_message, results_df
141
-
142
-
143
- # --- Build Gradio Interface using Blocks ---
144
- with gr.Blocks() as demo:
145
- gr.Markdown("# Basic Agent Evaluation Runner")
146
- gr.Markdown(
147
- """
148
- **Instructions:**
149
-
150
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
151
- 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
152
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
153
-
154
- ---
155
- **Disclaimers:**
156
- Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
157
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
158
- """
159
- )
160
-
161
- gr.LoginButton()
162
-
163
- run_button = gr.Button("Run Evaluation & Submit All Answers")
164
-
165
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
166
- # Removed max_rows=10 from DataFrame constructor
167
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
168
-
169
- run_button.click(
170
- fn=run_and_submit_all,
171
- outputs=[status_output, results_table]
172
- )
173
-
174
- if __name__ == "__main__":
175
- print("\n" + "-"*30 + " App Starting " + "-"*30)
176
- # Check for SPACE_HOST and SPACE_ID at startup for information
177
- space_host_startup = os.getenv("SPACE_HOST")
178
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
179
-
180
- if space_host_startup:
181
- print(f"โœ… SPACE_HOST found: {space_host_startup}")
182
- print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
183
- else:
184
- print("โ„น๏ธ SPACE_HOST environment variable not found (running locally?).")
185
-
186
- if space_id_startup: # Print repo URLs if SPACE_ID is found
187
- print(f"โœ… SPACE_ID found: {space_id_startup}")
188
- print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
189
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
190
- else:
191
- print("โ„น๏ธ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
192
-
193
- print("-"*(60 + len(" App Starting ")) + "\n")
194
-
195
- print("Launching Gradio Interface for Basic Agent Evaluation...")
196
- demo.launch(debug=True, share=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GAIA Benchmark Agent - Final Assignment
3
+ This agent answers GAIA Level 1 questions using web search, calculation, and reasoning.
4
+ """
5
+
6
+ from smolagents import CodeAgent, HfApiModel, tool, DuckDuckGoSearchTool
7
+ import requests
8
  import gradio as gr
9
+ import json
10
+
11
+ # ============================================================================
12
+ # TOOLS DEFINITION
13
+ # ============================================================================
14
+
15
+ # Tool 1: Web Search (built-in)
16
+ search_tool = DuckDuckGoSearchTool()
17
+
18
+ # Tool 2: Calculator
19
@tool
def calculator(expression: str) -> str:
    """Evaluates mathematical expressions safely.

    Args:
        expression: A mathematical expression like '2+2', '15*23', or '100/4'

    Returns:
        The calculated result as a string
    """
    import ast
    import operator

    # eval() with an empty __builtins__ is NOT a real sandbox (dunder escapes
    # exist), and the expression comes from model output. Evaluate a parsed
    # AST against a whitelist of arithmetic operators instead.
    ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Mod: operator.mod,
        ast.Pow: operator.pow,
        ast.USub: operator.neg,
        ast.UAdd: operator.pos,
    }

    def _eval(node):
        # Recursively evaluate only numeric literals and whitelisted operators.
        if isinstance(node, ast.Expression):
            return _eval(node.body)
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.BinOp) and type(node.op) in ops:
            return ops[type(node.op)](_eval(node.left), _eval(node.right))
        if isinstance(node, ast.UnaryOp) and type(node.op) in ops:
            return ops[type(node.op)](_eval(node.operand))
        raise ValueError("unsupported expression")

    try:
        result = _eval(ast.parse(expression, mode="eval"))
        return f"{result}"
    except Exception as e:
        return f"Error calculating: {str(e)}"
35
+
36
+ # Tool 3: Get Question File
37
@tool
def get_question_file(task_id: str) -> str:
    """Downloads and reads a file associated with a GAIA question.

    Args:
        task_id: The task ID from the question

    Returns:
        The file content or error message
    """
    file_url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
    try:
        resp = requests.get(file_url, timeout=30)
        if resp.status_code != 200:
            return f"Could not fetch file. Status code: {resp.status_code}"
        # Surface only a prefix of the file so the agent context stays small.
        content = resp.text[:1000]
        return f"File content (first 1000 chars):\n{content}"
    except Exception as e:
        return f"Error fetching file: {str(e)}"
59
+
60
+ # Tool 4: Final Answer (CRITICAL!)
61
@tool
def final_answer(answer: str) -> str:
    """Returns the final answer to the question.

    IMPORTANT: Use this ONLY ONCE when you have the exact answer.
    The answer should be precise, concise, and exactly formatted.

    Args:
        answer: The exact answer with no extra text or explanation

    Returns:
        The answer
    """
    # Pass the value through unchanged apart from trimming whitespace;
    # the agent framework treats this tool call as the terminal step.
    trimmed = answer.strip()
    return trimmed
75
+
76
+ # ============================================================================
77
+ # HELPER FUNCTIONS
78
+ # ============================================================================
79
+
80
def clean_answer(raw_answer: str) -> str:
    """
    Cleans the agent's response to extract the exact answer.

    Strips common answer prefixes ("the answer is", "final answer:", ...),
    surrounding quotes, and a trailing period that does not follow a digit.

    Args:
        raw_answer: Raw text produced by the agent (may be None/empty).

    Returns:
        The cleaned answer string ("" for empty/falsy input).
    """
    if not raw_answer:
        return ""

    answer = str(raw_answer).strip()

    # Remove common prefixes (case-insensitive); first match wins.
    prefixes_to_remove = [
        "the answer is",
        "the result is",
        "final answer:",
        "answer:",
        "final_answer:",
        "result:",
        "output:",
    ]

    answer_lower = answer.lower()
    for prefix in prefixes_to_remove:
        if answer_lower.startswith(prefix):
            answer = answer[len(prefix):].strip()
            break

    # Remove surrounding quotes
    answer = answer.strip('"\'')

    # Remove a trailing period unless it follows a digit (e.g. keep "42.").
    # The len() guard fixes an IndexError the previous version raised on a
    # 1-character answer such as "." (answer[-2] was out of range).
    if len(answer) > 1 and answer.endswith('.') and not answer[-2].isdigit():
        answer = answer[:-1]

    return answer
115
+
116
+ # ============================================================================
117
+ # AGENT SETUP
118
+ # ============================================================================
119
+
120
+ # Set up the AI model
121
+ model = HfApiModel(
122
+ model_id='Qwen/Qwen2.5-Coder-32B-Instruct', # Good reasoning model
123
+ max_tokens=4096,
124
+ temperature=0.1, # Low temperature for consistency
125
+ )
126
+
127
+ # System prompt for better performance
128
+ system_prompt = """You are a precise AI assistant solving GAIA benchmark questions.
129
+
130
+ CRITICAL RULES:
131
+ 1. Give EXACT answers ONLY - no explanations, no preamble
132
+ 2. Format matters: check if answer should be a number, name, date, etc.
133
+ 3. For numbers: give just the number (e.g., "42" not "The answer is 42")
134
+ 4. For names: use proper capitalization as commonly written
135
+ 5. For lists: follow exact format requested (comma-separated, etc.)
136
+ 6. Use tools efficiently - web_search for facts, calculator for math
137
+ 7. When you have the final answer, use the final_answer tool ONCE
138
+ 8. Double-check your answer before using final_answer tool
139
+
140
+ EXAMPLES OF CORRECT ANSWERS:
141
+ - Question: "What is 15% of 200?" โ†’ Answer: "30"
142
+ - Question: "Who founded Microsoft?" โ†’ Answer: "Bill Gates"
143
+ - Question: "What year was Python released?" โ†’ Answer: "1991"
144
+
145
+ Remember: EXACT MATCH scoring. Close doesn't count!"""
146
+
147
+ # Create the agent
148
+ agent = CodeAgent(
149
+ model=model,
150
+ tools=[search_tool, calculator, get_question_file, final_answer],
151
+ max_steps=12, # Allow enough steps for complex questions
152
+ verbosity_level=2, # Show reasoning process
153
+ additional_authorized_imports=["requests", "json"],
154
+ )
155
+
156
+ # ============================================================================
157
+ # MAIN AGENT LOGIC
158
+ # ============================================================================
159
+
160
def process_single_question(question_data, progress_callback=None):
    """
    Process a single GAIA question and return a result record.

    Args:
        question_data: dict from the scoring API; expected keys are
            "task_id" and "question" (some payloads capitalize "Question").
        progress_callback: optional callable taking a status string.

    Returns:
        dict with "task_id", "submitted_answer", "question" and either
        "raw_answer" (success) or "error" (failure).
    """
    task_id = question_data.get("task_id")
    # The scoring API serves the text under "question" (lowercase) — the
    # previous version indexed "Question" and raised KeyError. Accept both.
    question_text = question_data.get("question") or question_data.get("Question") or ""

    # Check if there's a file attached to the task
    has_file = bool(question_data.get("file_name"))

    # Construct the prompt
    prompt = f"""{system_prompt}

Question: {question_text}

{f"NOTE: This question has an attached file. Use get_question_file('{task_id}') to access it." if has_file else ""}

Instructions:
1. Analyze the question carefully
2. Use tools as needed (web_search, calculator, get_question_file)
3. When you have the exact answer, use final_answer(your_answer)
4. Remember: ONLY the answer, nothing else!

Now solve this question."""

    if progress_callback:
        progress_callback(f"Processing: {question_text[:100]}...")

    try:
        # Run the agent
        result = agent.run(prompt)

        # Clean the answer for exact-match scoring
        cleaned = clean_answer(str(result))

        return {
            "task_id": task_id,
            "submitted_answer": cleaned,
            "raw_answer": str(result),
            "question": question_text[:100]
        }

    except Exception as e:
        print(f"Error on task {task_id}: {e}")
        return {
            "task_id": task_id,
            "submitted_answer": "Error",
            "error": str(e),
            "question": question_text[:100]
        }
210
+
211
def run_full_evaluation(username, progress=gr.Progress()):
    """
    Fetches all questions, runs the agent on each, and submits to the API.

    Args:
        username: Hugging Face username used for the leaderboard entry.
        progress: Gradio progress tracker (injected by the UI).

    Returns:
        dict rendered in the results JSON panel: status + score on success,
        or an error description on failure.
    """
    if not username or username.strip() == "":
        return {"error": "Please provide your Hugging Face username"}

    try:
        # Step 1: Fetch questions
        progress(0, desc="Fetching questions from API...")
        response = requests.get(
            "https://agents-course-unit4-scoring.hf.space/questions",
            timeout=30
        )
        # Fail fast on HTTP errors instead of trying to .json() an error page.
        response.raise_for_status()
        questions = response.json()
        if not questions:
            return {"error": "The questions endpoint returned an empty list."}

        total_questions = len(questions)
        progress(0.1, desc=f"Got {total_questions} questions. Starting evaluation...")

        # Step 2: Process each question sequentially
        all_answers = []
        results_log = []

        for idx, question in enumerate(questions):
            progress((idx + 1) / total_questions,
                    desc=f"Processing question {idx + 1}/{total_questions}")

            result = process_single_question(question)
            all_answers.append({
                "task_id": result["task_id"],
                "submitted_answer": result["submitted_answer"]
            })

            results_log.append(result)
            print(f"\n{'='*60}")
            print(f"Question {idx + 1}: {result['question']}")
            print(f"Answer: {result['submitted_answer']}")
            print(f"{'='*60}\n")

        # Step 3: Submit to API
        progress(0.95, desc="Submitting answers to scoring API...")

        submission_data = {
            "username": username.strip(),
            "agent_code": "https://huggingface.co/spaces/Snaseem2026/Final_Assignment_Template/tree/main",
            "answers": all_answers
        }

        submit_response = requests.post(
            "https://agents-course-unit4-scoring.hf.space/submit",
            json=submission_data,
            timeout=60
        )

        if submit_response.status_code == 200:
            result_data = submit_response.json()
            progress(1.0, desc="✅ Submission complete!")

            # Format the response nicely for the JSON panel
            return {
                "status": "✅ Success!",
                "score": result_data.get("score", "N/A"),
                "total_questions": total_questions,
                "submission_details": result_data,
                "sample_answers": results_log[:5]  # Show first 5 for debugging
            }
        else:
            return {
                "status": "❌ Submission failed",
                "error": submit_response.text,
                "sample_answers": results_log[:5]
            }

    except Exception as e:
        return {
            "status": "❌ Error",
            "error": str(e)
        }
289
+
290
def test_single_question(progress=gr.Progress()):
    """
    Test the agent on one random question (for debugging).

    Returns:
        dict with the question text, task id, cleaned agent answer and raw
        output — or {"error": ...} if anything failed.
    """
    try:
        progress(0.3, desc="Fetching random question...")
        response = requests.get(
            "https://agents-course-unit4-scoring.hf.space/random-question",
            timeout=30
        )
        # Surface HTTP failures immediately rather than json()-ing an error body.
        response.raise_for_status()
        question = response.json()

        progress(0.5, desc="Running agent...")
        result = process_single_question(question)

        progress(1.0, desc="✅ Complete!")

        # The API serves the text under "question" (lowercase); accept the
        # capitalized spelling too for safety.
        return {
            "question": question.get("question") or question.get("Question"),
            "task_id": result["task_id"],
            "agent_answer": result["submitted_answer"],
            "raw_output": result.get("raw_answer", "N/A")
        }

    except Exception as e:
        return {"error": str(e)}
316
+
317
+ # ============================================================================
318
+ # GRADIO INTERFACE
319
+ # ============================================================================
320
+
321
# Gradio UI: test tab, full-evaluation tab, and informational tabs.
with gr.Blocks(title="GAIA Agent Evaluator") as demo:
    gr.Markdown("""
    # 🤖 GAIA Benchmark Agent - Final Assignment

    This agent solves GAIA Level 1 questions using reasoning, web search, and calculation tools.

    **Target Score:** 30% or higher (6/20 questions) to pass ✅

    ### How to use:
    1. **Test Mode**: Click "Test on Random Question" to see how your agent performs
    2. **Full Evaluation**: Enter your HF username and run full evaluation on all 20 questions
    3. **Submit**: Results automatically submitted to the leaderboard

    **Note:** Make sure this Space is PUBLIC for your submission to count!
    """)

    with gr.Tab("🧪 Test Mode"):
        gr.Markdown("### Test your agent on a single random question")
        test_button = gr.Button("🎲 Test on Random Question", variant="primary")
        test_output = gr.JSON(label="Test Results")
        test_button.click(fn=test_single_question, outputs=test_output)

    with gr.Tab("🚀 Full Evaluation"):
        gr.Markdown("### Run complete evaluation and submit to leaderboard")

        with gr.Row():
            username_input = gr.Textbox(
                label="Your Hugging Face Username",
                placeholder="e.g., Snaseem2026",
                info="Required for leaderboard submission"
            )

        submit_button = gr.Button("▶️ Run Full Evaluation & Submit", variant="primary", size="lg")

        gr.Markdown("""
        ⚠️ **This will take 10-20 minutes** to process all 20 questions.

        The agent will:
        - Fetch all 20 GAIA questions
        - Answer each using web search, calculation, and reasoning
        - Submit results to the scoring API
        - Update the leaderboard automatically
        """)

        results_output = gr.JSON(label="Evaluation Results")

        submit_button.click(
            fn=run_full_evaluation,
            inputs=username_input,
            outputs=results_output
        )

    with gr.Tab("📊 Leaderboard"):
        gr.Markdown("""
        ### Check Your Score

        After submission, view the leaderboard here:
        👉 [Students Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)

        Your score = (Correct Answers / 20) × 100%

        **Passing Score:** 30% or higher (6/20 questions correct)
        """)

    with gr.Tab("ℹ️ About"):
        gr.Markdown("""
        ### Tools Available:
        - 🔍 **Web Search** (DuckDuckGo): For finding current information
        - 🧮 **Calculator**: For mathematical calculations
        - 📁 **File Reader**: For questions with attachments
        - ✅ **Final Answer**: Returns the exact answer

        ### Tips for Better Scores:
        1. Answers must be EXACT MATCH (case-sensitive)
        2. No extra text - just the answer
        3. Format matters (numbers vs words vs dates)
        4. Test on random questions first before full evaluation
        5. Check the leaderboard to see what scores are realistic

        ### Current Model:
        - **Qwen/Qwen2.5-Coder-32B-Instruct** (Good at reasoning and code)
        - Temperature: 0.1 (focused, deterministic)
        - Max steps: 12 (allows multi-step reasoning)
        """)

# Launch only when executed as a script (HF Spaces runs app.py as __main__);
# importing this module elsewhere should not start a server. This matches
# the guard the previous version of this file used.
if __name__ == "__main__":
    demo.launch(share=False)
408
+ ```
409
+
410
+ """
411
+ GAIA Benchmark Agent - Final Assignment
412
+ This agent answers GAIA Level 1 questions using web search, calculation, and reasoning.
413
+ """
414
+
415
+ from smolagents import CodeAgent, HfApiModel, tool, DuckDuckGoSearchTool
416
  import requests
417
+ import gradio as gr
418
+ import json
419
+
420
+ # ============================================================================
421
+ # TOOLS DEFINITION
422
+ # ============================================================================
423
+
424
+ # Tool 1: Web Search (built-in)
425
+ search_tool = DuckDuckGoSearchTool()
426
+
427
+ # Tool 2: Calculator
428
+ @tool
429
+ def calculator(expression: str) -> str:
430
+ """Evaluates mathematical expressions safely.
431
+
432
+ Args:
433
+ expression: A mathematical expression like '2+2', '15*23', or '100/4'
434
+
435
+ Returns:
436
+ The calculated result as a string
437
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
438
  try:
439
+ # Use eval but only for math (be careful in production!)
440
+ result = eval(expression, {"__builtins__": {}}, {})
441
+ return f"{result}"
442
  except Exception as e:
443
+ return f"Error calculating: {str(e)}"
444
+
445
+ # Tool 3: Get Question File
446
+ @tool
447
+ def get_question_file(task_id: str) -> str:
448
+ """Downloads and reads a file associated with a GAIA question.
449
+
450
+ Args:
451
+ task_id: The task ID from the question
452
+
453
+ Returns:
454
+ The file content or error message
455
+ """
456
  try:
457
+ url = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
458
+ response = requests.get(url, timeout=30)
459
+
460
+ if response.status_code == 200:
461
+ # Return first 1000 characters of file content
462
+ content = response.text[:1000]
463
+ return f"File content (first 1000 chars):\n{content}"
464
+ else:
465
+ return f"Could not fetch file. Status code: {response.status_code}"
 
 
 
 
 
466
  except Exception as e:
467
+ return f"Error fetching file: {str(e)}"
468
+
469
+ # Tool 4: Final Answer (CRITICAL!)
470
+ @tool
471
+ def final_answer(answer: str) -> str:
472
+ """Returns the final answer to the question.
473
+
474
+ IMPORTANT: Use this ONLY ONCE when you have the exact answer.
475
+ The answer should be precise, concise, and exactly formatted.
476
+
477
+ Args:
478
+ answer: The exact answer with no extra text or explanation
479
+
480
+ Returns:
481
+ The answer
482
+ """
483
+ return answer.strip()
484
+
485
+ # ============================================================================
486
+ # HELPER FUNCTIONS
487
+ # ============================================================================
488
+
489
+ def clean_answer(raw_answer: str) -> str:
490
+ """
491
+ Cleans the agent's response to extract the exact answer.
492
+ Removes common prefixes and extra formatting.
493
+ """
494
+ if not raw_answer:
495
+ return ""
496
+
497
+ answer = str(raw_answer).strip()
498
+
499
+ # Remove common prefixes (case-insensitive)
500
+ prefixes_to_remove = [
501
+ "the answer is",
502
+ "the result is",
503
+ "final answer:",
504
+ "answer:",
505
+ "final_answer:",
506
+ "result:",
507
+ "output:",
508
+ ]
509
+
510
+ answer_lower = answer.lower()
511
+ for prefix in prefixes_to_remove:
512
+ if answer_lower.startswith(prefix):
513
+ answer = answer[len(prefix):].strip()
514
+ break
515
+
516
+ # Remove surrounding quotes
517
+ answer = answer.strip('"\'')
518
+
519
+ # Remove trailing periods (unless it's part of decimal)
520
+ if answer.endswith('.') and not answer[-2].isdigit():
521
+ answer = answer[:-1]
522
+
523
+ return answer
524
+
525
+ # ============================================================================
526
+ # AGENT SETUP
527
+ # ============================================================================
528
+
529
+ # Set up the AI model
530
+ model = HfApiModel(
531
+ model_id='Qwen/Qwen2.5-Coder-32B-Instruct', # Good reasoning model
532
+ max_tokens=4096,
533
+ temperature=0.1, # Low temperature for consistency
534
+ )
535
+
536
+ # System prompt for better performance
537
+ system_prompt = """You are a precise AI assistant solving GAIA benchmark questions.
538
+
539
+ CRITICAL RULES:
540
+ 1. Give EXACT answers ONLY - no explanations, no preamble
541
+ 2. Format matters: check if answer should be a number, name, date, etc.
542
+ 3. For numbers: give just the number (e.g., "42" not "The answer is 42")
543
+ 4. For names: use proper capitalization as commonly written
544
+ 5. For lists: follow exact format requested (comma-separated, etc.)
545
+ 6. Use tools efficiently - web_search for facts, calculator for math
546
+ 7. When you have the final answer, use the final_answer tool ONCE
547
+ 8. Double-check your answer before using final_answer tool
548
+
549
+ EXAMPLES OF CORRECT ANSWERS:
550
+ - Question: "What is 15% of 200?" โ†’ Answer: "30"
551
+ - Question: "Who founded Microsoft?" โ†’ Answer: "Bill Gates"
552
+ - Question: "What year was Python released?" โ†’ Answer: "1991"
553
+
554
+ Remember: EXACT MATCH scoring. Close doesn't count!"""
555
+
556
+ # Create the agent
557
+ agent = CodeAgent(
558
+ model=model,
559
+ tools=[search_tool, calculator, get_question_file, final_answer],
560
+ max_steps=12, # Allow enough steps for complex questions
561
+ verbosity_level=2, # Show reasoning process
562
+ additional_authorized_imports=["requests", "json"],
563
+ )
564
+
565
+ # ============================================================================
566
+ # MAIN AGENT LOGIC
567
+ # ============================================================================
568
+
569
+ def process_single_question(question_data, progress_callback=None):
570
+ """
571
+ Process a single GAIA question
572
+ """
573
+ task_id = question_data['task_id']
574
+ question_text = question_data['Question']
575
+
576
+ # Check if there's a file
577
+ has_file = 'file_name' in question_data and question_data['file_name']
578
+
579
+ # Construct the prompt
580
+ prompt = f"""{system_prompt}
581
+
582
+ Question: {question_text}
583
+
584
+ {f"NOTE: This question has an attached file. Use get_question_file('{task_id}') to access it." if has_file else ""}
585
+
586
+ Instructions:
587
+ 1. Analyze the question carefully
588
+ 2. Use tools as needed (web_search, calculator, get_question_file)
589
+ 3. When you have the exact answer, use final_answer(your_answer)
590
+ 4. Remember: ONLY the answer, nothing else!
591
+
592
+ Now solve this question."""
593
+
594
+ if progress_callback:
595
+ progress_callback(f"Processing: {question_text[:100]}...")
596
+
597
+ try:
598
+ # Run the agent
599
+ result = agent.run(prompt)
600
+
601
+ # Clean the answer
602
+ cleaned = clean_answer(str(result))
603
+
604
+ return {
605
+ "task_id": task_id,
606
+ "submitted_answer": cleaned,
607
+ "raw_answer": str(result),
608
+ "question": question_text[:100]
609
+ }
610
+
611
+ except Exception as e:
612
+ print(f"Error on task {task_id}: {e}")
613
+ return {
614
+ "task_id": task_id,
615
+ "submitted_answer": "Error",
616
+ "error": str(e),
617
+ "question": question_text[:100]
618
+ }
619
+
620
def run_full_evaluation(username, progress=gr.Progress()):
    """Fetch all GAIA questions, run the agent on each, and submit the answers.

    Args:
        username: Hugging Face username used for the leaderboard submission.
        progress: Gradio progress tracker (injected by Gradio per call; the
            ``gr.Progress()`` default is Gradio's documented pattern, not a
            shared mutable default).

    Returns:
        dict: On success, a payload with the score, question count, the raw
        submission response, and the first few per-question results for
        debugging. On failure, a payload with a ``status``/``error`` message.
    """
    # Single source of truth for the scoring service endpoints.
    base_url = "https://agents-course-unit4-scoring.hf.space"

    if not username or not username.strip():
        return {"error": "Please provide your Hugging Face username"}

    try:
        # Step 1: Fetch the full question set.
        progress(0, desc="Fetching questions from API...")
        response = requests.get(f"{base_url}/questions", timeout=30)
        # Fail fast on HTTP errors instead of trying to JSON-decode an error page.
        response.raise_for_status()
        questions = response.json()

        if not questions:
            # Guard the empty case explicitly so we never submit an empty answer set.
            return {"status": "โŒ Error", "error": "Scoring API returned no questions"}

        total_questions = len(questions)
        progress(0.1, desc=f"Got {total_questions} questions. Starting evaluation...")

        # Step 2: Run the agent on each question.
        all_answers = []
        results_log = []

        for idx, question in enumerate(questions):
            progress((idx + 1) / total_questions,
                     desc=f"Processing question {idx + 1}/{total_questions}")

            result = process_single_question(question)
            # The /submit endpoint only accepts task_id/submitted_answer pairs.
            all_answers.append({
                "task_id": result["task_id"],
                "submitted_answer": result["submitted_answer"]
            })

            # Keep the full result (including raw output/errors) for debugging.
            results_log.append(result)
            print(f"\n{'='*60}")
            print(f"Question {idx + 1}: {result['question']}")
            print(f"Answer: {result['submitted_answer']}")
            print(f"{'='*60}\n")

        # Step 3: Submit all answers for scoring.
        progress(0.95, desc="Submitting answers to scoring API...")

        submission_data = {
            "username": username.strip(),
            "agent_code": "https://huggingface.co/spaces/Snaseem2026/Final_Assignment_Template/tree/main",
            "answers": all_answers
        }

        submit_response = requests.post(
            f"{base_url}/submit",
            json=submission_data,
            timeout=60
        )

        if submit_response.status_code == 200:
            result_data = submit_response.json()
            progress(1.0, desc="โœ… Submission complete!")

            # Format the response nicely for the JSON output component.
            return {
                "status": "โœ… Success!",
                "score": result_data.get("score", "N/A"),
                "total_questions": total_questions,
                "submission_details": result_data,
                "sample_answers": results_log[:5]  # first 5 for debugging
            }

        # Non-200 submission: surface the server's error text and our answers.
        return {
            "status": "โŒ Submission failed",
            "error": submit_response.text,
            "sample_answers": results_log[:5]
        }

    except Exception as e:
        # Report any failure (network, JSON decode, agent crash) to the UI
        # instead of letting the Gradio handler raise.
        return {
            "status": "โŒ Error",
            "error": str(e)
        }
698
+
699
def test_single_question(progress=gr.Progress()):
    """Run the agent on one random GAIA question (debugging helper).

    Args:
        progress: Gradio progress tracker (injected by Gradio per call).

    Returns:
        dict: The question text, its task id, the cleaned agent answer and the
        raw agent output — or an ``{"error": ...}`` payload if anything fails.
    """
    try:
        progress(0.3, desc="Fetching random question...")
        response = requests.get(
            "https://agents-course-unit4-scoring.hf.space/random-question",
            timeout=30
        )
        # Surface HTTP errors explicitly rather than JSON-decoding an error page.
        response.raise_for_status()
        question = response.json()

        progress(0.5, desc="Running agent...")
        result = process_single_question(question)

        progress(1.0, desc="โœ… Complete!")

        return {
            # .get() keeps the output readable even if the API payload is
            # missing the 'Question' key, instead of an opaque KeyError string.
            "question": question.get('Question', 'N/A'),
            "task_id": result['task_id'],
            "agent_answer": result['submitted_answer'],
            "raw_output": result.get('raw_answer', 'N/A')
        }

    except Exception as e:
        # Return the failure to the UI instead of raising inside the handler.
        return {"error": str(e)}
725
+
726
# ============================================================================
# GRADIO INTERFACE
# ============================================================================
# Top-level UI definition: four tabs (test, full run, leaderboard link, about)
# wired to the handler functions defined above. Runs at import time, as is
# conventional for a Hugging Face Space entry point.

with gr.Blocks(title="GAIA Agent Evaluator") as demo:
    # Header / usage instructions shown above the tabs.
    gr.Markdown("""
    # ๐Ÿค– GAIA Benchmark Agent - Final Assignment

    This agent solves GAIA Level 1 questions using reasoning, web search, and calculation tools.

    **Target Score:** 30% or higher (6/20 questions) to pass โœ…

    ### How to use:
    1. **Test Mode**: Click "Test on Random Question" to see how your agent performs
    2. **Full Evaluation**: Enter your HF username and run full evaluation on all 20 questions
    3. **Submit**: Results automatically submitted to the leaderboard

    **Note:** Make sure this Space is PUBLIC for your submission to count!
    """)

    # Tab 1: run the agent on a single random question for quick debugging.
    with gr.Tab("๐Ÿงช Test Mode"):
        gr.Markdown("### Test your agent on a single random question")
        test_button = gr.Button("๐ŸŽฒ Test on Random Question", variant="primary")
        test_output = gr.JSON(label="Test Results")
        test_button.click(fn=test_single_question, outputs=test_output)

    # Tab 2: full 20-question evaluation plus submission to the scoring API.
    with gr.Tab("๐Ÿš€ Full Evaluation"):
        gr.Markdown("### Run complete evaluation and submit to leaderboard")

        with gr.Row():
            # Username is required by the /submit endpoint for the leaderboard.
            username_input = gr.Textbox(
                label="Your Hugging Face Username",
                placeholder="e.g., Snaseem2026",
                info="Required for leaderboard submission"
            )

        submit_button = gr.Button("โ–ถ๏ธ Run Full Evaluation & Submit", variant="primary", size="lg")

        gr.Markdown("""
        โš ๏ธ **This will take 10-20 minutes** to process all 20 questions.

        The agent will:
        - Fetch all 20 GAIA questions
        - Answer each using web search, calculation, and reasoning
        - Submit results to the scoring API
        - Update the leaderboard automatically
        """)

        results_output = gr.JSON(label="Evaluation Results")

        # Wire the button to the long-running evaluation handler.
        submit_button.click(
            fn=run_full_evaluation,
            inputs=username_input,
            outputs=results_output
        )

    # Tab 3: static pointer to the external leaderboard Space.
    with gr.Tab("๐Ÿ“Š Leaderboard"):
        gr.Markdown("""
        ### Check Your Score

        After submission, view the leaderboard here:
        ๐Ÿ‘‰ [Students Leaderboard](https://huggingface.co/spaces/agents-course/Students_leaderboard)

        Your score = (Correct Answers / 20) ร— 100%

        **Passing Score:** 30% or higher (6/20 questions correct)
        """)

    # Tab 4: static documentation about the agent's tools and model.
    with gr.Tab("โ„น๏ธ About"):
        gr.Markdown("""
        ### Tools Available:
        - ๐Ÿ” **Web Search** (DuckDuckGo): For finding current information
        - ๐Ÿงฎ **Calculator**: For mathematical calculations
        - ๐Ÿ“ **File Reader**: For questions with attachments
        - โœ… **Final Answer**: Returns the exact answer

        ### Tips for Better Scores:
        1. Answers must be EXACT MATCH (case-sensitive)
        2. No extra text - just the answer
        3. Format matters (numbers vs words vs dates)
        4. Test on random questions first before full evaluation
        5. Check the leaderboard to see what scores are realistic

        ### Current Model:
        - **Qwen/Qwen2.5-Coder-32B-Instruct** (Good at reasoning and code)
        - Temperature: 0.1 (focused, deterministic)
        - Max steps: 12 (allows multi-step reasoning)
        """)

# Launch the interface (share=False: served only by the hosting Space).
demo.launch(share=False)