Alexis-alexis committed on
Commit
62b42a1
·
verified ·
1 Parent(s): e92e982

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -292
app.py CHANGED
@@ -8,319 +8,237 @@ import base64
8
  from typing import Optional, Dict, List, Any
9
  import anthropic
10
 
11
- # API URL для GAIA
12
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
13
 
14
class GAIAAgent:
    """Claude-backed agent that answers GAIA benchmark questions.

    For each question the agent runs a DuckDuckGo Instant Answer lookup,
    downloads the file attached to the task (if any), sends everything to
    Claude in a single user message and post-processes the reply into a
    bare answer string.
    """

    # Sentinel strings produced by ``search_web`` that carry no information
    # and must not be forwarded to the model as "search results".
    _NO_RESULTS = "No results found"
    _SEARCH_FAILED_PREFIX = "Web search failed:"

    def __init__(self):
        """Create the Anthropic client, the caches and the system prompt.

        Raises:
            ValueError: If ``ANTHROPIC_API_KEY`` is missing from the environment.
        """
        print("Initializing GAIA Agent powered by Claude...")
        # Read the Claude API key from the environment.
        self.claude_key = os.environ.get("ANTHROPIC_API_KEY")
        if not self.claude_key:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")

        # Claude client and the GAIA scoring API base URL.
        self.client = anthropic.Anthropic(api_key=self.claude_key)
        self.api_url = DEFAULT_API_URL

        # Caches: search results keyed by query, fetched files keyed by task id.
        self.search_cache = {}
        self.file_cache = {}

        # System prompt sent with every Claude request.
        self.system_prompt = """
You are an AI assistant specially designed to answer questions from the GAIA benchmark with exceptional accuracy.
The GAIA benchmark evaluates AI's ability to perform real-world tasks that require reasoning, web browsing, and tool use.

Your goal is to provide the EXACT answer in the format requested by each question. GAIA uses exact matching for evaluation.

Guidelines for GAIA answers:
1. Provide ONLY the final answer, with NO explanations, reasoning, or additional text
2. Format is critical - follow the instructions in the question precisely
3. For comma-separated lists, provide "item1, item2, item3" with no quotes or extra punctuation
4. For numeric answers, provide just the number without units unless specifically requested
5. Maintain exact capitalization and spacing as requested in the question
6. If asked to order items, follow the requested ordering precisely

Examples of correct formatting:
- If asked for fruits in alphabetical order: "apples, bananas, oranges"
- If asked for a single word: "photosynthesis"
- If asked for a number: "42"
- If asked for a date in MM/DD/YY format: "05/04/25"

Remember, your score depends on exact matching against the reference answer.
"""

    def search_web(self, query: str) -> str:
        """Query the DuckDuckGo Instant Answer API, caching successful lookups.

        Args:
            query: Search string.

        Returns:
            The abstract and up to five related topics joined by blank lines,
            ``"No results found"`` when the API returned nothing usable, or
            ``"Web search failed: <error>"`` when the request raised.
        """
        if query in self.search_cache:
            print(f"Using cached search results for: {query}")
            return self.search_cache[query]

        print(f"Performing web search for: {query}")
        try:
            # DuckDuckGo Instant Answer API
            response = requests.get(
                "https://api.duckduckgo.com/",
                params={"q": query, "format": "json"},
                timeout=10
            )
            data = response.json()

            # Collect results from the different response fields.
            results = []
            if data.get("AbstractText"):
                results.append(f"Abstract: {data['AbstractText']}")
            # Only the first five related topics are considered.
            for i, topic in enumerate((data.get("RelatedTopics") or [])[:5]):
                if isinstance(topic, dict) and topic.get("Text"):
                    results.append(f"Related Topic {i+1}: {topic['Text']}")

            result_text = "\n\n".join(results) if results else self._NO_RESULTS

            # Cache and return the results.
            self.search_cache[query] = result_text
            return result_text
        except Exception as e:
            print(f"Web search error: {e}")
            return f"Web search failed: {str(e)}"

    def fetch_file(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Fetches and processes a file associated with a task.

        Args:
            task_id: GAIA task identifier used in the ``/files/{task_id}`` URL.

        Returns:
            A dict with ``content``, ``content_type``, ``size`` and ``type``
            (``"image"``, ``"pdf"``, ``"text"`` or ``"binary"``), plus
            ``base64`` for images and ``text`` for decodable text files;
            ``None`` when the download fails or the status is not 200.
        """
        if task_id in self.file_cache:
            print(f"Using cached file for task: {task_id}")
            return self.file_cache[task_id]

        print(f"Fetching file for task: {task_id}")
        try:
            response = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)

            if response.status_code != 200:
                print(f"Failed to fetch file, status code: {response.status_code}")
                print(f"Response: {response.text[:1000]}")
                return None

            file_content = response.content
            file_info = {
                "content": file_content,
                "content_type": response.headers.get("Content-Type", ""),
                "size": len(file_content)
            }

            # Classify the file by its declared content type.
            content_type = file_info["content_type"].lower()

            if "image" in content_type:
                # Images are base64-encoded so they can be sent to Claude.
                file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
                file_info["type"] = "image"
                print(f"Processed image file ({file_info['size']} bytes)")
            elif "pdf" in content_type:
                # PDFs are only tagged; their content is not processed here.
                file_info["type"] = "pdf"
                print(f"Detected PDF file ({file_info['size']} bytes)")
            elif "text" in content_type or "json" in content_type or "csv" in content_type:
                # Text-like files are decoded when they are valid UTF-8.
                try:
                    file_info["text"] = file_content.decode('utf-8')
                    file_info["type"] = "text"
                    print(f"Processed text file ({file_info['size']} bytes)")
                except UnicodeDecodeError:
                    file_info["type"] = "binary"
                    print(f"Could not decode text file ({file_info['size']} bytes)")
            else:
                file_info["type"] = "binary"
                print(f"Detected binary file ({file_info['size']} bytes, {content_type})")

            # Cache the processed file.
            self.file_cache[task_id] = file_info
            return file_info
        except Exception as e:
            print(f"Error fetching file: {e}")
            return None

    def extract_answer(self, response_text: str) -> str:
        """Extract just the final answer from Claude's response.

        Strips a leading "answer"-style label, drops everything from the first
        newline onward when the word "explain" follows it, prefers the first
        line of a multi-line reply when it looks like an answer, and removes
        one pair of surrounding double quotes.
        """
        # Remove obvious lead-in phrases.
        cleaned = re.sub(r'^(final answer|the answer is|answer|Here\'s the answer|response):?\s*', '', response_text, flags=re.IGNORECASE)

        # Remove trailing explanations.
        cleaned = re.sub(r'\n.*?explain.*?$', '', cleaned, flags=re.IGNORECASE | re.DOTALL)

        # For a multi-line reply, keep only the first line if it looks like an answer.
        lines = cleaned.strip().split('\n')
        if len(lines) > 1:
            first_line = lines[0].strip()
            if len(first_line) > 5 and not first_line.startswith(('I ', 'The ')):
                return first_line

        # Strip one pair of enclosing double quotes.
        cleaned = cleaned.strip()
        if cleaned.startswith('"') and cleaned.endswith('"'):
            cleaned = cleaned[1:-1]

        return cleaned.strip()

    def process_question(self, question: str, task_id: Optional[str] = None) -> Dict[str, Any]:
        """Processes a question to extract relevant information and prepare for Claude.

        Args:
            question: Question text.
            task_id: Task identifier; when given, the attached file is fetched.

        Returns:
            A dict with keys ``original``, ``task_id``, ``has_file``,
            ``file_info``, ``contains_math``, ``requires_list`` and
            ``format_requirements``.
        """
        question_info = {
            "original": question,
            "task_id": task_id,
            "has_file": False,
            "file_info": None,
            "contains_math": bool(re.search(r'calculate|compute|sum|average|mean|median|formula|equation', question, re.IGNORECASE)),
            "requires_list": bool(re.search(r'list|order|sequence|rank|items|elements|values', question, re.IGNORECASE)),
            "format_requirements": None
        }

        # Pull out the requested format, if the question states one.
        format_match = re.search(r'(format|in the format|formatted as|as a|in) ([^\.]+)', question, re.IGNORECASE)
        if format_match:
            question_info["format_requirements"] = format_match.group(2).strip()

        # Fetch the attachment once: failed fetches are not cached, so a second
        # call would repeat the HTTP request.
        if task_id:
            file_info = self.fetch_file(task_id)
            if file_info:
                question_info["has_file"] = True
                question_info["file_info"] = file_info

        return question_info

    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        """Main method to process a question and return an answer.

        Args:
            question: Question text.
            task_id: Task identifier; when omitted, a ``task id: <id>`` marker
                inside the question is used if present.

        Returns:
            The cleaned answer, or ``"Error processing question: <error>"``
            when building the request or calling Claude raises.
        """
        if task_id is None:
            # Try to recover the task id from the question text.
            match = re.search(r'task[\s_-]?id:?\s*(\w+)', question, re.IGNORECASE)
            if match:
                task_id = match.group(1)

        print(f"Processing question for task_id: {task_id}")
        print(f"Question: {question[:100]}...")

        question_info = self.process_question(question, task_id)

        try:
            user_content = [{
                "type": "text",
                "text": (
                    f"\nQuestion from GAIA benchmark: {question}\n\n"
                    "Remember:\n"
                    "1. Provide ONLY the final answer\n"
                    "2. Format exactly as requested\n"
                    "3. No explanations or reasoning\n"
                )
            }]

            # Attach search results only when the lookup produced real content;
            # the "no results" / "failed" sentinels would just add noise.
            web_results = self.search_web(question)
            if (
                web_results
                and web_results != self._NO_RESULTS
                and not web_results.startswith(self._SEARCH_FAILED_PREFIX)
            ):
                user_content.append({
                    "type": "text",
                    "text": f"\nWeb search results related to this question:\n\n{web_results}\n"
                })

            # Attach the task file, if there is one.
            if question_info["has_file"] and question_info["file_info"]:
                file_info = question_info["file_info"]

                if file_info["type"] == "image":
                    # Drop header parameters such as "; charset=..." so only
                    # the bare MIME type is sent as media_type.
                    media_type = file_info["content_type"].split(";")[0].strip().lower()
                    user_content.append({
                        "type": "image",
                        "source": {
                            "type": "base64",
                            "media_type": media_type,
                            "data": file_info["base64"]
                        }
                    })
                    user_content.append({
                        "type": "text",
                        "text": "The above image is part of the question. Please analyze it carefully."
                    })
                elif file_info["type"] == "text" and "text" in file_info:
                    # Cap the excerpt to stay within token limits. The cap is
                    # applied here, not inside the f-string, so no comment
                    # text leaks into the prompt.
                    excerpt = file_info["text"][:4000]
                    user_content.append({
                        "type": "text",
                        "text": f"\nThe question includes a text file with the following content:\n\n{excerpt}\n"
                    })

            # Repeat the format requirement explicitly, if one was detected.
            if question_info["format_requirements"]:
                user_content.append({
                    "type": "text",
                    "text": (
                        f"\nImportant format requirement: {question_info['format_requirements']}\n"
                        "Make sure your answer follows this format EXACTLY.\n"
                    )
                })

            messages = [{"role": "user", "content": user_content}]

            # Low temperature for precise answers.
            response = self.client.messages.create(
                model="claude-sonnet-4-6",
                system=self.system_prompt,
                messages=messages,
                temperature=0.1,
                max_tokens=4096
            )

            # Join every text block instead of assuming the first block is text.
            raw_answer = "".join(
                block.text for block in response.content
                if getattr(block, "type", None) == "text"
            ).strip()

            clean_answer = self.extract_answer(raw_answer)

            print(f"Raw answer: {raw_answer}")
            print(f"Clean answer: {clean_answer}")

            return clean_answer
        except Exception as e:
            print(f"Error in agent: {e}")
            import traceback
            traceback.print_exc()
            return f"Error processing question: {str(e)}"
307
 
308
 
309
- # Используем наш агент как BasicAgent для совместимости с остальным кодом
310
class BasicAgent(GAIAAgent):
    """GAIAAgent exposed under the `BasicAgent` name that the rest of the code instantiates."""
312
 
313
 
314
  def run_and_submit_all(profile: gr.OAuthProfile | None):
315
- """
316
- Fetches all questions, runs the BasicAgent on them, submits all answers,
317
- and displays the results.
318
- """
319
- # --- Determine HF Space Runtime URL and Repo URL ---
320
- space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
321
 
322
  if profile:
323
- username= f"{profile.username}"
324
  print(f"User logged in: {username}")
325
  else:
326
  print("User not logged in.")
@@ -330,41 +248,31 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
330
  questions_url = f"{api_url}/questions"
331
  submit_url = f"{api_url}/submit"
332
 
333
- # 1. Instantiate Agent ( modify this part to create your agent)
334
  try:
335
  agent = BasicAgent()
336
  except Exception as e:
337
  print(f"Error instantiating agent: {e}")
338
  return f"Error initializing agent: {e}", None
339
- # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
340
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
341
  print(agent_code)
342
 
343
- # 2. Fetch Questions
344
  print(f"Fetching questions from: {questions_url}")
345
  try:
346
  response = requests.get(questions_url, timeout=15)
347
  response.raise_for_status()
348
  questions_data = response.json()
349
  if not questions_data:
350
- print("Fetched questions list is empty.")
351
- return "Fetched questions list is empty or invalid format.", None
352
  print(f"Fetched {len(questions_data)} questions.")
353
- except requests.exceptions.RequestException as e:
354
  print(f"Error fetching questions: {e}")
355
  return f"Error fetching questions: {e}", None
356
- except requests.exceptions.JSONDecodeError as e:
357
- print(f"Error decoding JSON response from questions endpoint: {e}")
358
- print(f"Response text: {response.text[:500]}")
359
- return f"Error decoding server response for questions: {e}", None
360
- except Exception as e:
361
- print(f"An unexpected error occurred fetching questions: {e}")
362
- return f"An unexpected error occurred fetching questions: {e}", None
363
 
364
- # 3. Run your Agent
365
  results_log = []
366
  answers_payload = []
367
  print(f"Running agent on {len(questions_data)} questions...")
 
368
  for item in questions_data:
369
  task_id = item.get("task_id")
370
  question_text = item.get("question")
@@ -374,22 +282,29 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
374
  try:
375
  submitted_answer = agent(question_text, task_id)
376
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
377
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
 
 
378
  except Exception as e:
379
- print(f"Error running agent on task {task_id}: {e}")
380
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
 
 
381
 
382
  if not answers_payload:
383
- print("Agent did not produce any answers to submit.")
384
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
385
 
386
- # 4. Prepare Submission
387
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
388
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
389
- print(status_update)
 
 
390
 
391
- # 5. Submit
392
- print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
393
  try:
394
  response = requests.post(submit_url, json=submission_data, timeout=60)
395
  response.raise_for_status()
@@ -402,53 +317,33 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
402
  f"Message: {result_data.get('message', 'No message received.')}"
403
  )
404
  print("Submission successful.")
405
- results_df = pd.DataFrame(results_log)
406
- return final_status, results_df
407
  except requests.exceptions.HTTPError as e:
408
  error_detail = f"Server responded with status {e.response.status_code}."
409
  try:
410
  error_json = e.response.json()
411
  error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
412
- except requests.exceptions.JSONDecodeError:
413
  error_detail += f" Response: {e.response.text[:500]}"
414
- status_message = f"Submission Failed: {error_detail}"
415
- print(status_message)
416
- results_df = pd.DataFrame(results_log)
417
- return status_message, results_df
418
- except requests.exceptions.Timeout:
419
- status_message = "Submission Failed: The request timed out."
420
- print(status_message)
421
- results_df = pd.DataFrame(results_log)
422
- return status_message, results_df
423
- except requests.exceptions.RequestException as e:
424
- status_message = f"Submission Failed: Network error - {e}"
425
- print(status_message)
426
- results_df = pd.DataFrame(results_log)
427
- return status_message, results_df
428
  except Exception as e:
429
- status_message = f"An unexpected error occurred during submission: {e}"
430
- print(status_message)
431
- results_df = pd.DataFrame(results_log)
432
- return status_message, results_df
433
 
434
 
435
- # --- Build Gradio Interface using Blocks ---
436
  with gr.Blocks() as demo:
437
  gr.Markdown("# GAIA Benchmark Agent Evaluation")
438
  gr.Markdown(
439
  """
440
  **Instructions:**
441
- 1. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
442
  2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
443
 
444
- This agent uses Claude 3.5 Sonnet to solve GAIA benchmark tasks.
445
  """
446
  )
447
 
448
  gr.LoginButton()
449
-
450
  run_button = gr.Button("Run Evaluation & Submit All Answers")
451
-
452
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
453
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
454
 
@@ -459,24 +354,19 @@ with gr.Blocks() as demo:
459
 
460
if __name__ == "__main__":
    # Startup banner, then report the Space-related environment variables
    # before handing control to Gradio.
    print("\n" + "-"*30 + " App Starting " + "-"*30)

    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    if not space_host_startup:
        print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
    else:
        print(f"✅ SPACE_HOST found: {space_host_startup}")
        print(f" Runtime URL should be: https://{space_host_startup}.hf.space")

    # Repo URLs can only be printed when SPACE_ID is set.
    if not space_id_startup:
        print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
    else:
        print(f"✅ SPACE_ID found: {space_id_startup}")
        print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
        print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")

    # Closing rule spans the same width as the opening banner.
    print("-"*(60 + len(" App Starting ")) + "\n")

    print("Launching Gradio Interface for GAIA Agent Evaluation...")
    demo.launch(debug=True, share=False)
 
8
  from typing import Optional, Dict, List, Any
9
  import anthropic
10
 
 
11
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
 
13
class GAIAAgent:
    """Claude-backed agent that answers GAIA benchmark questions.

    For each question the agent downloads the file attached to the task (if
    any), sends question and file to Claude in a single user message, and
    extracts the final answer from the ``<answer>`` tags the system prompt
    asks the model to emit.
    """

    # Content-Type substrings mapped to the media type sent to Claude,
    # checked in order; anything unmatched is sent as PNG.
    _IMAGE_MEDIA_TYPES = (
        ("jpeg", "image/jpeg"),
        ("jpg", "image/jpeg"),
        ("png", "image/png"),
        ("gif", "image/gif"),
        ("webp", "image/webp"),
    )

    # Maximum number of characters of an attached text file put in the prompt.
    _MAX_FILE_CHARS = 8000

    def __init__(self):
        """Create the Anthropic client, the file cache and the system prompt.

        Raises:
            ValueError: If ``ANTHROPIC_API_KEY`` is missing from the environment.
        """
        print("Initializing GAIA Agent powered by Claude...")
        self.claude_key = os.environ.get("ANTHROPIC_API_KEY")
        if not self.claude_key:
            raise ValueError("ANTHROPIC_API_KEY not found in environment variables")

        self.client = anthropic.Anthropic(api_key=self.claude_key)
        self.api_url = DEFAULT_API_URL
        # Successfully fetched files, keyed by task id.
        self.file_cache = {}

        # System prompt instructs to wrap final answer in <answer> tags
        self.system_prompt = """You are a highly accurate assistant solving GAIA benchmark tasks.

GAIA uses EXACT STRING MATCHING to grade answers. Your response format is CRITICAL.

Instructions:
1. Think step by step to figure out the correct answer.
2. At the very end of your response, output your final answer wrapped in <answer> tags like this:
<answer>your exact answer here</answer>

Rules for the content inside <answer>:
- For numbers: just the number, e.g. <answer>42</answer>
- For lists: comma-separated, e.g. <answer>apples, bananas, oranges</answer>
- For single words: just the word, e.g. <answer>photosynthesis</answer>
- For dates: use the format specified in the question
- NO extra punctuation, quotes, or explanation inside the tags
- Follow the exact format requested by the question

Think carefully before giving the final answer."""

    def fetch_file(self, task_id: str) -> Optional[Dict[str, Any]]:
        """Fetches and processes a file associated with a task.

        Args:
            task_id: GAIA task identifier used in the ``/files/{task_id}`` URL.

        Returns:
            A dict with ``content``, ``content_type`` (lower-cased), ``size``
            and ``type`` (``"image"``, ``"pdf"``, ``"text"`` or ``"binary"``),
            plus ``base64`` for images/PDFs and ``text`` for decodable files;
            ``None`` when the request fails or the status is not 200.
        """
        if task_id in self.file_cache:
            return self.file_cache[task_id]

        print(f"Fetching file for task: {task_id}")
        try:
            response = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching file for {task_id}: {e}")
            return None

        if response.status_code != 200:
            print(f"No file found for task {task_id}, status: {response.status_code}")
            return None

        file_content = response.content
        content_type = response.headers.get("Content-Type", "").lower()

        file_info = {
            "content": file_content,
            "content_type": content_type,
            "size": len(file_content)
        }

        if "image" in content_type:
            file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
            file_info["type"] = "image"
            print(f"Image file: {file_info['size']} bytes, type: {content_type}")
        elif "pdf" in content_type:
            file_info["base64"] = base64.b64encode(file_content).decode('utf-8')
            file_info["type"] = "pdf"
            print(f"PDF file: {file_info['size']} bytes")
        elif any(t in content_type for t in ("text", "json", "csv")):
            try:
                file_info["text"] = file_content.decode('utf-8')
            except UnicodeDecodeError:
                # latin-1 maps every byte to a code point, so this fallback
                # cannot fail and a declared-text file is always "text".
                file_info["text"] = file_content.decode('latin-1')
            file_info["type"] = "text"
            print(f"Text file: {file_info['size']} bytes")
        else:
            # Unknown content type: treat it as text only if it is valid UTF-8.
            try:
                file_info["text"] = file_content.decode('utf-8')
                file_info["type"] = "text"
                print(f"Unknown type decoded as text: {content_type}")
            except UnicodeDecodeError:
                file_info["type"] = "binary"
                print(f"Binary file: {content_type}, {file_info['size']} bytes")

        self.file_cache[task_id] = file_info
        return file_info

    def extract_answer(self, response_text: str) -> str:
        """Extract the final answer from the model's reply.

        Tries, in order: the content of the last ``<answer>...</answer>``
        pair, the text following "final answer" / "the answer is", and the
        last non-empty line with surrounding quotes and ``.,`` stripped.

        Args:
            response_text: Full text returned by the model.

        Returns:
            The extracted answer; the stripped input when it has no
            non-empty line.
        """
        # Primary: <answer> tags. The prompt asks for the answer at the very
        # end, so when tags appear more than once the last pair wins.
        tagged = re.findall(r'<answer>(.*?)</answer>', response_text, re.DOTALL | re.IGNORECASE)
        if tagged:
            answer = tagged[-1].strip()
            print(f"Extracted from <answer> tags: {repr(answer)}")
            return answer

        # Fallback: look for "Final answer:" pattern
        match = re.search(r'(?:final answer|the answer is)[:\s]+(.+?)(?:\n|$)', response_text, re.IGNORECASE)
        if match:
            answer = match.group(1).strip().strip('"\'')
            print(f"Extracted from 'final answer:' pattern: {repr(answer)}")
            return answer

        # Last resort: take the last non-empty line
        lines = [line.strip() for line in response_text.strip().split('\n') if line.strip()]
        if lines:
            answer = lines[-1].strip('"\'.,')
            print(f"Fallback to last line: {repr(answer)}")
            return answer

        return response_text.strip()

    def _file_content_blocks(self, file_info: Dict[str, Any]) -> List[Dict[str, Any]]:
        """Turn a fetched file description into Claude message content blocks."""
        file_type = file_info.get("type", "unknown")

        if file_type == "image" and "base64" in file_info:
            # Determine media type for Claude
            ct = file_info["content_type"]
            media_type = next(
                (mime for marker, mime in self._IMAGE_MEDIA_TYPES if marker in ct),
                "image/png",  # default
            )
            print("Added image to message")
            return [
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": media_type,
                        "data": file_info["base64"]
                    }
                },
                {
                    "type": "text",
                    "text": "The image above is provided as part of this question. Analyze it carefully."
                },
            ]

        if file_type == "pdf" and "base64" in file_info:
            print("Added PDF to message")
            return [
                {
                    "type": "document",
                    "source": {
                        "type": "base64",
                        "media_type": "application/pdf",
                        "data": file_info["base64"]
                    }
                },
                {
                    "type": "text",
                    "text": "The PDF document above is provided as part of this question. Read it carefully."
                },
            ]

        if file_type == "text" and "text" in file_info:
            file_text = file_info["text"]
            # Limit to avoid token overflow but keep enough context
            if len(file_text) > self._MAX_FILE_CHARS:
                file_text = (
                    file_text[:self._MAX_FILE_CHARS]
                    + f"\n... [truncated, total {len(file_info['text'])} chars]"
                )
            print(f"Added text file content ({len(file_info['text'])} chars)")
            return [{
                "type": "text",
                "text": f"\nAttached file content:\n```\n{file_text}\n```"
            }]

        print(f"File type {file_type} not added to message")
        return []

    def __call__(self, question: str, task_id: Optional[str] = None) -> str:
        """Process a question and return an answer.

        Args:
            question: Question text.
            task_id: Task identifier; when given, the attached file is fetched
                and included in the request.

        Returns:
            The extracted answer, or ``"ERROR: <error>"`` when building the
            request or calling Claude raises.
        """
        print(f"\n{'='*60}")
        print(f"Task ID: {task_id}")
        print(f"Question: {question[:200]}...")

        try:
            # Build message content: the question first, then any attachment.
            user_content = [{
                "type": "text",
                "text": f"Question: {question}"
            }]

            # Try to fetch associated file
            file_info = self.fetch_file(task_id) if task_id else None
            if file_info:
                user_content.extend(self._file_content_blocks(file_info))

            # Call Claude
            response = self.client.messages.create(
                model="claude-sonnet-4-6",
                system=self.system_prompt,
                messages=[{"role": "user", "content": user_content}],
                temperature=0,  # deterministic
                max_tokens=4096
            )

            # Join every text block instead of assuming the first block is text.
            raw_answer = "".join(
                block.text for block in response.content
                if getattr(block, "type", None) == "text"
            ).strip()
            print(f"\nRaw response:\n{raw_answer[:500]}...")

            final_answer = self.extract_answer(raw_answer)
            print(f"Final answer: {repr(final_answer)}")

            return final_answer

        except Exception as e:
            print(f"Error processing task {task_id}: {e}")
            import traceback
            traceback.print_exc()
            return f"ERROR: {str(e)}"
231
 
232
 
 
233
class BasicAgent(GAIAAgent):
    """GAIAAgent exposed under the `BasicAgent` name that `run_and_submit_all` instantiates."""
235
 
236
 
237
  def run_and_submit_all(profile: gr.OAuthProfile | None):
238
+ space_id = os.getenv("SPACE_ID")
 
 
 
 
 
239
 
240
  if profile:
241
+ username = f"{profile.username}"
242
  print(f"User logged in: {username}")
243
  else:
244
  print("User not logged in.")
 
248
  questions_url = f"{api_url}/questions"
249
  submit_url = f"{api_url}/submit"
250
 
 
251
  try:
252
  agent = BasicAgent()
253
  except Exception as e:
254
  print(f"Error instantiating agent: {e}")
255
  return f"Error initializing agent: {e}", None
256
+
257
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
258
  print(agent_code)
259
 
 
260
  print(f"Fetching questions from: {questions_url}")
261
  try:
262
  response = requests.get(questions_url, timeout=15)
263
  response.raise_for_status()
264
  questions_data = response.json()
265
  if not questions_data:
266
+ return "Fetched questions list is empty or invalid format.", None
 
267
  print(f"Fetched {len(questions_data)} questions.")
268
+ except Exception as e:
269
  print(f"Error fetching questions: {e}")
270
  return f"Error fetching questions: {e}", None
 
 
 
 
 
 
 
271
 
 
272
  results_log = []
273
  answers_payload = []
274
  print(f"Running agent on {len(questions_data)} questions...")
275
+
276
  for item in questions_data:
277
  task_id = item.get("task_id")
278
  question_text = item.get("question")
 
282
  try:
283
  submitted_answer = agent(question_text, task_id)
284
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
285
+ results_log.append({
286
+ "Task ID": task_id,
287
+ "Question": question_text[:100],
288
+ "Submitted Answer": submitted_answer
289
+ })
290
  except Exception as e:
291
+ print(f"Error running agent on task {task_id}: {e}")
292
+ results_log.append({
293
+ "Task ID": task_id,
294
+ "Question": question_text[:100],
295
+ "Submitted Answer": f"AGENT ERROR: {e}"
296
+ })
297
 
298
  if not answers_payload:
 
299
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
300
 
301
+ submission_data = {
302
+ "username": username.strip(),
303
+ "agent_code": agent_code,
304
+ "answers": answers_payload
305
+ }
306
+ print(f"Submitting {len(answers_payload)} answers for user '{username}'...")
307
 
 
 
308
  try:
309
  response = requests.post(submit_url, json=submission_data, timeout=60)
310
  response.raise_for_status()
 
317
  f"Message: {result_data.get('message', 'No message received.')}"
318
  )
319
  print("Submission successful.")
320
+ return final_status, pd.DataFrame(results_log)
 
321
  except requests.exceptions.HTTPError as e:
322
  error_detail = f"Server responded with status {e.response.status_code}."
323
  try:
324
  error_json = e.response.json()
325
  error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
326
+ except:
327
  error_detail += f" Response: {e.response.text[:500]}"
328
+ return f"Submission Failed: {error_detail}", pd.DataFrame(results_log)
 
 
 
 
 
 
 
 
 
 
 
 
 
329
  except Exception as e:
330
+ return f"Submission Failed: {e}", pd.DataFrame(results_log)
 
 
 
331
 
332
 
 
333
  with gr.Blocks() as demo:
334
  gr.Markdown("# GAIA Benchmark Agent Evaluation")
335
  gr.Markdown(
336
  """
337
  **Instructions:**
338
+ 1. Log in to your Hugging Face account using the button below.
339
  2. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
340
 
341
+ This agent uses Claude Sonnet 4.6 to solve GAIA benchmark tasks.
342
  """
343
  )
344
 
345
  gr.LoginButton()
 
346
  run_button = gr.Button("Run Evaluation & Submit All Answers")
 
347
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
348
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
349
 
 
354
 
355
if __name__ == "__main__":
    # Startup banner, then report the Space-related environment variables
    # before handing control to Gradio.
    print("\n" + "-"*30 + " App Starting " + "-"*30)

    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")

    if not space_host_startup:
        print("ℹ️ SPACE_HOST not found (running locally?).")
    else:
        print(f"✅ SPACE_HOST found: {space_host_startup}")

    if not space_id_startup:
        print("ℹ️ SPACE_ID not found (running locally?).")
    else:
        print(f"✅ SPACE_ID found: {space_id_startup}")

    print("-"*60 + "\n")
    print("Launching Gradio Interface for GAIA Agent Evaluation...")
    demo.launch(debug=True, share=False)