SantoshKumar1310 committed on
Commit
982e82c
·
verified ·
1 Parent(s): b586840

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +102 -138
app.py CHANGED
@@ -7,7 +7,7 @@ from typing import Dict, List, Any, Optional
7
  import json
8
 
9
  # --- Constants ---
10
- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space/docs"
11
 
12
  # --- Enhanced GAIA Agent ---
13
  class GAIAAgent:
@@ -15,11 +15,11 @@ class GAIAAgent:
15
  Enhanced agent optimized for GAIA Level 1 questions.
16
  Targets 30%+ accuracy through multi-tool integration.
17
  """
18
-
19
  def __init__(self):
20
  print("βœ… GAIA Agent initialized with enhanced capabilities.")
21
  self.api_url = DEFAULT_API_URL
22
-
23
  def __call__(self, question: str, task_id: str = None) -> str:
24
  """
25
  Main entry point - processes a question and returns a precise answer.
@@ -28,67 +28,66 @@ class GAIAAgent:
28
  print(f"🧠 Processing Task: {task_id}")
29
  print(f"πŸ“ Question: {question[:100]}...")
30
  print(f"{'='*60}")
31
-
32
  try:
33
  # Step 1: Classify question type
34
  q_type = self._classify_question(question)
35
  print(f"πŸ“Š Question Type: {q_type}")
36
-
37
  # Step 2: Route to specialized handler
38
  answer = self._route_to_handler(question, q_type, task_id)
39
-
40
  # Step 3: Clean and format answer
41
  final_answer = self._clean_answer(answer, question)
42
-
43
  print(f"βœ… Final Answer: {final_answer}")
44
  return final_answer
45
-
46
  except Exception as e:
47
  print(f"❌ Error: {e}")
48
  # Return a safe fallback
49
  return "Unable to determine answer"
50
-
51
  def _classify_question(self, question: str) -> str:
52
  """Classify question to route to appropriate handler"""
53
  q_lower = question.lower()
54
-
55
  # Math/calculation questions
56
  if any(word in q_lower for word in ["calculate", "sum", "total", "multiply", "divide", "average", "mean"]):
57
  return "math"
58
-
59
  # Questions with numbers/operators
60
  if any(op in question for op in ["+", "-", "×", "÷", "*", "/"]) and any(c.isdigit() for c in question):
61
  return "math"
62
-
63
  # Counting questions
64
  if any(word in q_lower for word in ["how many", "count", "number of"]):
65
  return "counting"
66
-
67
  # Date/time questions
68
  if any(word in q_lower for word in ["year", "date", "when", "month", "day"]):
69
  return "date"
70
-
71
  # Location questions
72
  if any(word in q_lower for word in ["where", "location", "city", "country", "capital"]):
73
  return "location"
74
-
75
  # Definition/what is questions
76
  if q_lower.startswith("what is") or q_lower.startswith("what's"):
77
  return "definition"
78
-
79
  # Who questions
80
  if q_lower.startswith("who"):
81
  return "person"
82
-
83
  # File-based questions
84
  if any(word in q_lower for word in ["file", "document", "image", "picture", "photo"]):
85
  return "file"
86
-
87
  return "general"
88
-
89
  def _route_to_handler(self, question: str, q_type: str, task_id: str) -> str:
90
  """Route question to appropriate specialized handler"""
91
-
92
  if q_type == "math":
93
  return self._handle_math(question)
94
  elif q_type == "counting":
@@ -105,7 +104,7 @@ class GAIAAgent:
105
  return self._handle_file(question, task_id)
106
  else:
107
  return self._handle_general(question)
108
-
109
  def _handle_math(self, question: str) -> str:
110
  """Handle mathematical calculations"""
111
  try:
@@ -113,10 +112,10 @@ class GAIAAgent:
113
  numbers = re.findall(r'-?\d+\.?\d*', question)
114
  if not numbers:
115
  return "0"
116
-
117
  nums = [float(n) for n in numbers]
118
  q_lower = question.lower()
119
-
120
  # Detect operation
121
  if "sum" in q_lower or "total" in q_lower or "+" in question or "add" in q_lower:
122
  result = sum(nums)
@@ -134,41 +133,41 @@ class GAIAAgent:
134
  # Try to evaluate the expression safely
135
  expr = re.sub(r'[^0-9+\-*/().\s]', '', question)
136
  result = eval(expr, {"__builtins__": {}}, {})
137
-
138
  # Format result
139
  if result == int(result):
140
  return str(int(result))
141
  else:
142
  return f"{result:.2f}"
143
-
144
  except Exception as e:
145
  print(f"Math error: {e}")
146
  return "0"
147
-
148
  def _handle_counting(self, question: str) -> str:
149
  """Handle counting questions"""
150
  # Extract the first number found (often the answer)
151
  numbers = re.findall(r'\d+', question)
152
  return numbers[0] if numbers else "0"
153
-
154
  def _handle_date(self, question: str) -> str:
155
  """Handle date/year questions"""
156
  # Look for 4-digit years
157
  years = re.findall(r'\b(19|20)\d{2}\b', question)
158
  if years:
159
  return years[0]
160
-
161
  # Look for dates
162
  dates = re.findall(r'\b\d{1,2}/\d{1,2}/\d{4}\b', question)
163
  if dates:
164
  return dates[0]
165
-
166
  return "Unknown"
167
-
168
  def _handle_location(self, question: str) -> str:
169
  """Handle location questions using knowledge base"""
170
  q_lower = question.lower()
171
-
172
  # Common capitals and locations
173
  location_kb = {
174
  "france": "Paris",
@@ -186,13 +185,13 @@ class GAIAAgent:
186
  "spain": "Madrid",
187
  "madrid": "Spain",
188
  }
189
-
190
  for key, value in location_kb.items():
191
  if key in q_lower:
192
  return value
193
-
194
  return "Unknown"
195
-
196
  def _handle_definition(self, question: str) -> str:
197
  """Handle 'What is' questions"""
198
  # Extract the subject
@@ -201,11 +200,11 @@ class GAIAAgent:
201
  subject = match.group(1).strip()
202
  return f"{subject}"
203
  return "Unknown"
204
-
205
  def _handle_person(self, question: str) -> str:
206
  """Handle 'Who' questions using knowledge base"""
207
  q_lower = question.lower()
208
-
209
  # Famous people knowledge base
210
  people_kb = {
211
  "romeo and juliet": "William Shakespeare",
@@ -218,28 +217,28 @@ class GAIAAgent:
218
  "light bulb": "Thomas Edison",
219
  "first president": "George Washington",
220
  }
221
-
222
  for key, value in people_kb.items():
223
  if key in q_lower:
224
  return value
225
-
226
  return "Unknown"
227
-
228
  def _handle_file(self, question: str, task_id: str) -> str:
229
  """Handle questions that require file access"""
230
  if not task_id:
231
  return "No file available"
232
-
233
  try:
234
  # Download the file from API
235
  file_url = f"{self.api_url}/files/{task_id}"
236
  print(f"πŸ“₯ Downloading file from: {file_url}")
237
-
238
  response = requests.get(file_url, timeout=30)
239
  if response.status_code == 200:
240
  # Process file based on type
241
  content_type = response.headers.get('Content-Type', '')
242
-
243
  if 'text' in content_type or 'json' in content_type:
244
  # Text-based file
245
  content = response.text
@@ -252,20 +251,20 @@ class GAIAAgent:
252
  else:
253
  print(f"File download failed: {response.status_code}")
254
  return "File not found"
255
-
256
  except Exception as e:
257
  print(f"File handling error: {e}")
258
  return "File processing failed"
259
-
260
  def _analyze_text_file(self, content: str, question: str) -> str:
261
  """Analyze text file content to answer question"""
262
  q_lower = question.lower()
263
-
264
  # Counting items in file
265
  if "how many" in q_lower:
266
  lines = content.strip().split('\n')
267
  return str(len(lines))
268
-
269
  # Finding specific text
270
  if "find" in q_lower or "search" in q_lower:
271
  # Extract search term
@@ -276,24 +275,24 @@ class GAIAAgent:
276
  return "Found"
277
  else:
278
  return "Not found"
279
-
280
  # Return first line as fallback
281
  lines = content.strip().split('\n')
282
  return lines[0] if lines else "Empty file"
283
-
284
  def _handle_general(self, question: str) -> str:
285
  """Handle general questions with basic reasoning"""
286
  # Try to extract any numbers or dates
287
  numbers = re.findall(r'\d+', question)
288
  if numbers:
289
  return numbers[0]
290
-
291
  # Look for yes/no questions
292
  if question.strip().endswith('?') and any(word in question.lower() for word in ['is', 'are', 'was', 'were', 'can', 'could', 'will', 'would']):
293
  return "Yes"
294
-
295
  return "Unable to determine"
296
-
297
  def _clean_answer(self, answer: str, question: str) -> str:
298
  """
299
  Clean and format answer according to GAIA requirements.
@@ -301,18 +300,18 @@ class GAIAAgent:
301
  """
302
  # Remove extra whitespace
303
  answer = answer.strip()
304
-
305
  # Remove "The answer is" or similar phrases
306
  answer = re.sub(r'^(?:the answer is|it is|result is)[:\s]+', '', answer, flags=re.IGNORECASE)
307
-
308
  # Remove trailing punctuation (except for decimals)
309
  answer = re.sub(r'[.!?,;]+$', '', answer)
310
-
311
  # Handle comma-separated lists
312
  if "comma-separated" in question.lower() or "list" in question.lower():
313
  # Ensure proper comma-space formatting
314
  answer = re.sub(r'\s*,\s*', ', ', answer)
315
-
316
  # Handle number formatting
317
  if re.match(r'^-?\d+\.?\d*$', answer):
318
  # It's a number
@@ -323,7 +322,7 @@ class GAIAAgent:
323
  else:
324
  # Keep minimal decimal places
325
  answer = f"{num:.10g}"
326
-
327
  return answer
328
 
329
 
@@ -341,8 +340,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
341
  return "❌ Please login to Hugging Face first.", None
342
 
343
  api_url = DEFAULT_API_URL
344
- questions_url = f"{api_url}/questions"
345
- submit_url = f"{api_url}/submit"
346
 
347
  # Create Agent
348
  try:
@@ -359,43 +358,43 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
359
  response = requests.get(questions_url, timeout=30)
360
  response.raise_for_status()
361
  questions_data = response.json()
362
-
363
  if not questions_data:
364
  return "⚠️ No questions received from API.", None
365
-
366
  print(f"βœ… Retrieved {len(questions_data)} questions.")
367
-
368
  except requests.exceptions.RequestException as e:
369
  return f"❌ Error fetching questions: {e}\n\nPlease check if the API is available.", None
370
 
371
  # Run Agent on all questions
372
  results_log = []
373
  answers_payload = []
374
-
375
  print(f"\nπŸ€– Running agent on {len(questions_data)} questions...\n")
376
-
377
  for i, item in enumerate(questions_data, 1):
378
  task_id = item.get("task_id")
379
  question_text = item.get("question")
380
-
381
  if not task_id or not question_text:
382
  continue
383
-
384
  try:
385
  print(f"\n[{i}/{len(questions_data)}] Processing: {task_id}")
386
  submitted_answer = agent(question_text, task_id)
387
-
388
  answers_payload.append({
389
  "task_id": task_id,
390
  "submitted_answer": submitted_answer
391
  })
392
-
393
  results_log.append({
394
  "Task ID": task_id,
395
  "Question": question_text[:80] + "..." if len(question_text) > 80 else question_text,
396
  "Your Answer": submitted_answer
397
  })
398
-
399
  except Exception as e:
400
  error_msg = f"ERROR: {e}"
401
  print(f"❌ {error_msg}")
@@ -422,11 +421,11 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
422
  response = requests.post(submit_url, json=submission_data, timeout=120)
423
  response.raise_for_status()
424
  result_data = response.json()
425
-
426
  score = result_data.get('score', 0)
427
  correct = result_data.get('correct_count', 0)
428
  total = result_data.get('total_attempted', len(answers_payload))
429
-
430
  # Determine emoji based on score
431
  if score >= 30:
432
  emoji = "πŸŽ‰πŸ†"
@@ -436,7 +435,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
436
  emoji = "πŸ“ˆ"
437
  else:
438
  emoji = "πŸ’ͺ"
439
-
440
  final_status = (
441
  f"{emoji} Submission Complete!\n\n"
442
  f"πŸ‘€ Username: {result_data.get('username')}\n"
@@ -445,9 +444,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
445
  f"πŸ“ {result_data.get('message', '')}\n\n"
446
  f"πŸ”— Check the leaderboard: https://huggingface.co/spaces/agents-course/agents-course-unit4-leaderboard"
447
  )
448
-
449
  return final_status, results_df
450
-
451
  except requests.exceptions.RequestException as e:
452
  return f"❌ Submission failed: {e}\n\nβœ… Generated {len(answers_payload)} answers (see table)", results_df
453
 
@@ -455,90 +454,56 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
455
  # --- Gradio Interface ---
456
  with gr.Blocks(theme=gr.themes.Soft(), title="GAIA Agent Evaluation") as demo:
457
  gr.Markdown(
458
- """
459
- # 🤖 GAIA Agent Evaluation System
460
-
461
- ### 🎯 Goal: Achieve 30%+ accuracy on GAIA Level 1 questions
462
-
463
- This agent evaluates your AI assistant on 20 carefully selected questions from GAIA's validation set.
464
- The questions test reasoning, calculation, factual knowledge, and tool usage.
465
-
466
- ---
467
-
468
- ### 📋 How to Submit:
469
-
470
- 1. **Clone this Space** to your Hugging Face profile
471
- 2. **Keep your Space public** (required for leaderboard verification)
472
- 3. **Login** using the button below
473
- 4. **Click "Run Evaluation"** and wait for results
474
- 5. **Check your score** on the [leaderboard](https://huggingface.co/spaces/agents-course/agents-course-unit4-leaderboard)
475
-
476
- ---
477
-
478
- ### 💡 Tips for Improvement:
479
-
480
- - Study the question types and patterns
481
- - Add web search capabilities (DuckDuckGo, Wikipedia)
482
- - Implement better answer formatting
483
- - Test individual questions using `/random-question` endpoint
484
- - Focus on precise, exact-match answers
485
-
486
- ---
487
-
488
- ### ⚠️ Important Notes:
489
-
490
- - Processing takes 2-5 minutes (20 questions)
491
- - Answers must be **exact matches** (case-sensitive, format-sensitive)
492
- - Keep your Space public for leaderboard verification
493
- - The SPACE_ID environment variable is set automatically by HF Spaces
494
-
495
- """
496
  )
497
-
498
  with gr.Row():
499
  gr.LoginButton()
500
-
501
  gr.Markdown("---")
502
-
503
  run_button = gr.Button(
504
  "πŸš€ Run Evaluation & Submit All Answers",
505
  variant="primary",
506
  size="lg"
507
  )
508
-
509
  status_output = gr.Textbox(
510
  label="πŸ“Š Evaluation Results",
511
  lines=12,
512
  interactive=False,
513
  show_copy_button=True
514
  )
515
-
516
  results_table = gr.DataFrame(
517
  label="πŸ“ Questions and Your Answers",
518
  wrap=True,
519
  interactive=False
520
  )
521
-
522
  gr.Markdown(
523
- """
524
- ---
525
-
526
- ### 🔗 Resources:
527
-
528
- - [GAIA Benchmark Paper](https://arxiv.org/abs/2311.12983)
529
- - [Leaderboard](https://huggingface.co/spaces/agents-course/agents-course-unit4-leaderboard)
530
- - [Course Materials](https://huggingface.co/learn/cookbook/agents)
531
- - [API Documentation](https://agents-course-unit4-scoring.hf.space/docs)
532
-
533
- ### 🏆 Score Interpretation:
534
-
535
- - **30%+**: Excellent! You've achieved certification level ✅
536
- - **20-29%**: Good progress! Keep improving 📈
537
- - **10-19%**: On the right track! Add more tools 🔧
538
- - **0-9%**: Keep experimenting! Study the questions 💪
539
-
540
- Remember: Human performance is ~92%, GPT-4 with plugins is ~15%. You're competing with AI systems!
541
- """
542
  )
543
 
544
  run_button.click(
@@ -546,7 +511,6 @@ with gr.Blocks(theme=gr.themes.Soft(), title="GAIA Agent Evaluation") as demo:
546
  outputs=[status_output, results_table]
547
  )
548
 
549
-
550
  if __name__ == "__main__":
551
  print("πŸš€ Launching GAIA Agent Evaluation Interface...")
552
- demo.launch(debug=True, share=False)
 
7
  import json
8
 
9
  # --- Constants ---
10
+ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space" # (no /docs)
11
 
12
  # --- Enhanced GAIA Agent ---
13
  class GAIAAgent:
 
15
  Enhanced agent optimized for GAIA Level 1 questions.
16
  Targets 30%+ accuracy through multi-tool integration.
17
  """
18
+
19
  def __init__(self):
20
  print("βœ… GAIA Agent initialized with enhanced capabilities.")
21
  self.api_url = DEFAULT_API_URL
22
+
23
  def __call__(self, question: str, task_id: str = None) -> str:
24
  """
25
  Main entry point - processes a question and returns a precise answer.
 
28
  print(f"🧠 Processing Task: {task_id}")
29
  print(f"πŸ“ Question: {question[:100]}...")
30
  print(f"{'='*60}")
31
+
32
  try:
33
  # Step 1: Classify question type
34
  q_type = self._classify_question(question)
35
  print(f"πŸ“Š Question Type: {q_type}")
36
+
37
  # Step 2: Route to specialized handler
38
  answer = self._route_to_handler(question, q_type, task_id)
39
+
40
  # Step 3: Clean and format answer
41
  final_answer = self._clean_answer(answer, question)
42
+
43
  print(f"βœ… Final Answer: {final_answer}")
44
  return final_answer
45
+
46
  except Exception as e:
47
  print(f"❌ Error: {e}")
48
  # Return a safe fallback
49
  return "Unable to determine answer"
50
+
51
  def _classify_question(self, question: str) -> str:
52
  """Classify question to route to appropriate handler"""
53
  q_lower = question.lower()
54
+
55
  # Math/calculation questions
56
  if any(word in q_lower for word in ["calculate", "sum", "total", "multiply", "divide", "average", "mean"]):
57
  return "math"
58
+
59
  # Questions with numbers/operators
60
  if any(op in question for op in ["+", "-", "×", "÷", "*", "/"]) and any(c.isdigit() for c in question):
61
  return "math"
62
+
63
  # Counting questions
64
  if any(word in q_lower for word in ["how many", "count", "number of"]):
65
  return "counting"
66
+
67
  # Date/time questions
68
  if any(word in q_lower for word in ["year", "date", "when", "month", "day"]):
69
  return "date"
70
+
71
  # Location questions
72
  if any(word in q_lower for word in ["where", "location", "city", "country", "capital"]):
73
  return "location"
74
+
75
  # Definition/what is questions
76
  if q_lower.startswith("what is") or q_lower.startswith("what's"):
77
  return "definition"
78
+
79
  # Who questions
80
  if q_lower.startswith("who"):
81
  return "person"
82
+
83
  # File-based questions
84
  if any(word in q_lower for word in ["file", "document", "image", "picture", "photo"]):
85
  return "file"
86
+
87
  return "general"
88
+
89
  def _route_to_handler(self, question: str, q_type: str, task_id: str) -> str:
90
  """Route question to appropriate specialized handler"""
 
91
  if q_type == "math":
92
  return self._handle_math(question)
93
  elif q_type == "counting":
 
104
  return self._handle_file(question, task_id)
105
  else:
106
  return self._handle_general(question)
107
+
108
  def _handle_math(self, question: str) -> str:
109
  """Handle mathematical calculations"""
110
  try:
 
112
  numbers = re.findall(r'-?\d+\.?\d*', question)
113
  if not numbers:
114
  return "0"
115
+
116
  nums = [float(n) for n in numbers]
117
  q_lower = question.lower()
118
+
119
  # Detect operation
120
  if "sum" in q_lower or "total" in q_lower or "+" in question or "add" in q_lower:
121
  result = sum(nums)
 
133
  # Try to evaluate the expression safely
134
  expr = re.sub(r'[^0-9+\-*/().\s]', '', question)
135
  result = eval(expr, {"__builtins__": {}}, {})
136
+
137
  # Format result
138
  if result == int(result):
139
  return str(int(result))
140
  else:
141
  return f"{result:.2f}"
142
+
143
  except Exception as e:
144
  print(f"Math error: {e}")
145
  return "0"
146
+
147
  def _handle_counting(self, question: str) -> str:
148
  """Handle counting questions"""
149
  # Extract the first number found (often the answer)
150
  numbers = re.findall(r'\d+', question)
151
  return numbers[0] if numbers else "0"
152
+
153
  def _handle_date(self, question: str) -> str:
154
  """Handle date/year questions"""
155
  # Look for 4-digit years
156
  years = re.findall(r'\b(19|20)\d{2}\b', question)
157
  if years:
158
  return years[0]
159
+
160
  # Look for dates
161
  dates = re.findall(r'\b\d{1,2}/\d{1,2}/\d{4}\b', question)
162
  if dates:
163
  return dates[0]
164
+
165
  return "Unknown"
166
+
167
  def _handle_location(self, question: str) -> str:
168
  """Handle location questions using knowledge base"""
169
  q_lower = question.lower()
170
+
171
  # Common capitals and locations
172
  location_kb = {
173
  "france": "Paris",
 
185
  "spain": "Madrid",
186
  "madrid": "Spain",
187
  }
188
+
189
  for key, value in location_kb.items():
190
  if key in q_lower:
191
  return value
192
+
193
  return "Unknown"
194
+
195
  def _handle_definition(self, question: str) -> str:
196
  """Handle 'What is' questions"""
197
  # Extract the subject
 
200
  subject = match.group(1).strip()
201
  return f"{subject}"
202
  return "Unknown"
203
+
204
  def _handle_person(self, question: str) -> str:
205
  """Handle 'Who' questions using knowledge base"""
206
  q_lower = question.lower()
207
+
208
  # Famous people knowledge base
209
  people_kb = {
210
  "romeo and juliet": "William Shakespeare",
 
217
  "light bulb": "Thomas Edison",
218
  "first president": "George Washington",
219
  }
220
+
221
  for key, value in people_kb.items():
222
  if key in q_lower:
223
  return value
224
+
225
  return "Unknown"
226
+
227
  def _handle_file(self, question: str, task_id: str) -> str:
228
  """Handle questions that require file access"""
229
  if not task_id:
230
  return "No file available"
231
+
232
  try:
233
  # Download the file from API
234
  file_url = f"{self.api_url}/files/{task_id}"
235
  print(f"πŸ“₯ Downloading file from: {file_url}")
236
+
237
  response = requests.get(file_url, timeout=30)
238
  if response.status_code == 200:
239
  # Process file based on type
240
  content_type = response.headers.get('Content-Type', '')
241
+
242
  if 'text' in content_type or 'json' in content_type:
243
  # Text-based file
244
  content = response.text
 
251
  else:
252
  print(f"File download failed: {response.status_code}")
253
  return "File not found"
254
+
255
  except Exception as e:
256
  print(f"File handling error: {e}")
257
  return "File processing failed"
258
+
259
  def _analyze_text_file(self, content: str, question: str) -> str:
260
  """Analyze text file content to answer question"""
261
  q_lower = question.lower()
262
+
263
  # Counting items in file
264
  if "how many" in q_lower:
265
  lines = content.strip().split('\n')
266
  return str(len(lines))
267
+
268
  # Finding specific text
269
  if "find" in q_lower or "search" in q_lower:
270
  # Extract search term
 
275
  return "Found"
276
  else:
277
  return "Not found"
278
+
279
  # Return first line as fallback
280
  lines = content.strip().split('\n')
281
  return lines[0] if lines else "Empty file"
282
+
283
  def _handle_general(self, question: str) -> str:
284
  """Handle general questions with basic reasoning"""
285
  # Try to extract any numbers or dates
286
  numbers = re.findall(r'\d+', question)
287
  if numbers:
288
  return numbers[0]
289
+
290
  # Look for yes/no questions
291
  if question.strip().endswith('?') and any(word in question.lower() for word in ['is', 'are', 'was', 'were', 'can', 'could', 'will', 'would']):
292
  return "Yes"
293
+
294
  return "Unable to determine"
295
+
296
  def _clean_answer(self, answer: str, question: str) -> str:
297
  """
298
  Clean and format answer according to GAIA requirements.
 
300
  """
301
  # Remove extra whitespace
302
  answer = answer.strip()
303
+
304
  # Remove "The answer is" or similar phrases
305
  answer = re.sub(r'^(?:the answer is|it is|result is)[:\s]+', '', answer, flags=re.IGNORECASE)
306
+
307
  # Remove trailing punctuation (except for decimals)
308
  answer = re.sub(r'[.!?,;]+$', '', answer)
309
+
310
  # Handle comma-separated lists
311
  if "comma-separated" in question.lower() or "list" in question.lower():
312
  # Ensure proper comma-space formatting
313
  answer = re.sub(r'\s*,\s*', ', ', answer)
314
+
315
  # Handle number formatting
316
  if re.match(r'^-?\d+\.?\d*$', answer):
317
  # It's a number
 
322
  else:
323
  # Keep minimal decimal places
324
  answer = f"{num:.10g}"
325
+
326
  return answer
327
 
328
 
 
340
  return "❌ Please login to Hugging Face first.", None
341
 
342
  api_url = DEFAULT_API_URL
343
+ questions_url = f"{api_url}/questions" # Corrected endpoint
344
+ submit_url = f"{api_url}/submit" # Corrected endpoint
345
 
346
  # Create Agent
347
  try:
 
358
  response = requests.get(questions_url, timeout=30)
359
  response.raise_for_status()
360
  questions_data = response.json()
361
+
362
  if not questions_data:
363
  return "⚠️ No questions received from API.", None
364
+
365
  print(f"βœ… Retrieved {len(questions_data)} questions.")
366
+
367
  except requests.exceptions.RequestException as e:
368
  return f"❌ Error fetching questions: {e}\n\nPlease check if the API is available.", None
369
 
370
  # Run Agent on all questions
371
  results_log = []
372
  answers_payload = []
373
+
374
  print(f"\nπŸ€– Running agent on {len(questions_data)} questions...\n")
375
+
376
  for i, item in enumerate(questions_data, 1):
377
  task_id = item.get("task_id")
378
  question_text = item.get("question")
379
+
380
  if not task_id or not question_text:
381
  continue
382
+
383
  try:
384
  print(f"\n[{i}/{len(questions_data)}] Processing: {task_id}")
385
  submitted_answer = agent(question_text, task_id)
386
+
387
  answers_payload.append({
388
  "task_id": task_id,
389
  "submitted_answer": submitted_answer
390
  })
391
+
392
  results_log.append({
393
  "Task ID": task_id,
394
  "Question": question_text[:80] + "..." if len(question_text) > 80 else question_text,
395
  "Your Answer": submitted_answer
396
  })
397
+
398
  except Exception as e:
399
  error_msg = f"ERROR: {e}"
400
  print(f"❌ {error_msg}")
 
421
  response = requests.post(submit_url, json=submission_data, timeout=120)
422
  response.raise_for_status()
423
  result_data = response.json()
424
+
425
  score = result_data.get('score', 0)
426
  correct = result_data.get('correct_count', 0)
427
  total = result_data.get('total_attempted', len(answers_payload))
428
+
429
  # Determine emoji based on score
430
  if score >= 30:
431
  emoji = "πŸŽ‰πŸ†"
 
435
  emoji = "πŸ“ˆ"
436
  else:
437
  emoji = "πŸ’ͺ"
438
+
439
  final_status = (
440
  f"{emoji} Submission Complete!\n\n"
441
  f"πŸ‘€ Username: {result_data.get('username')}\n"
 
444
  f"πŸ“ {result_data.get('message', '')}\n\n"
445
  f"πŸ”— Check the leaderboard: https://huggingface.co/spaces/agents-course/agents-course-unit4-leaderboard"
446
  )
447
+
448
  return final_status, results_df
449
+
450
  except requests.exceptions.RequestException as e:
451
  return f"❌ Submission failed: {e}\n\nβœ… Generated {len(answers_payload)} answers (see table)", results_df
452
 
 
454
  # --- Gradio Interface ---
455
  with gr.Blocks(theme=gr.themes.Soft(), title="GAIA Agent Evaluation") as demo:
456
  gr.Markdown(
457
+ """
458
+ # 🤖 GAIA Agent Evaluation System
459
+
460
+ ### 🎯 Goal: Achieve 30%+ accuracy on GAIA Level 1 questions
461
+
462
+ This agent evaluates your AI assistant on 20 carefully selected questions from GAIA's validation set.
463
+ The questions test reasoning, calculation, factual knowledge, and tool usage.
464
+
465
+ ---
466
+ Please clone this space, log in, and click 'Run Evaluation' to see your score!
467
+ """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
468
  )
469
+
470
  with gr.Row():
471
  gr.LoginButton()
472
+
473
  gr.Markdown("---")
474
+
475
  run_button = gr.Button(
476
  "πŸš€ Run Evaluation & Submit All Answers",
477
  variant="primary",
478
  size="lg"
479
  )
480
+
481
  status_output = gr.Textbox(
482
  label="πŸ“Š Evaluation Results",
483
  lines=12,
484
  interactive=False,
485
  show_copy_button=True
486
  )
487
+
488
  results_table = gr.DataFrame(
489
  label="πŸ“ Questions and Your Answers",
490
  wrap=True,
491
  interactive=False
492
  )
493
+
494
  gr.Markdown(
495
+ """
496
+ ---
497
+
498
+ ### 🔗 Resources:
499
+
500
+ - [GAIA Benchmark Paper](https://arxiv.org/abs/2311.12983)
501
+ - [Leaderboard](https://huggingface.co/spaces/agents-course/agents-course-unit4-leaderboard)
502
+ - [Course Materials](https://huggingface.co/learn/cookbook/agents)
503
+ - [API Documentation](https://agents-course-unit4-scoring.hf.space/docs)
504
+
505
+ ---
506
+ """
 
 
 
 
 
 
 
507
  )
508
 
509
  run_button.click(
 
511
  outputs=[status_output, results_table]
512
  )
513
 
 
514
  if __name__ == "__main__":
515
  print("πŸš€ Launching GAIA Agent Evaluation Interface...")
516
+ demo.launch(debug=True, share=False)