GilbertoEwaldFilho committed on
Commit
c15943d
·
verified ·
1 Parent(s): e96252b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -313
app.py CHANGED
@@ -22,13 +22,6 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
22
  # ================================
23
 
24
  def clean_answer(text: str) -> str:
25
- """
26
- Limpa a resposta do modelo para bater em EXACT MATCH:
27
- - remove prefixos tipo 'Final answer', 'Answer:'
28
- - remove quebras de linha
29
- - remove aspas externas
30
- - normaliza espaços e ponto final solto
31
- """
32
  if not text:
33
  return ""
34
 
@@ -39,7 +32,6 @@ def clean_answer(text: str) -> str:
39
  r"(?i)^answer[:\- ]*",
40
  r"(?i)^the answer is[:\- ]*",
41
  r"(?i)^my answer is[:\- ]*",
42
- r"(?i)^resposta[:\- ]*",
43
  ]
44
  for p in patterns_to_remove:
45
  text = re.sub(p, "", text).strip()
@@ -48,134 +40,43 @@ def clean_answer(text: str) -> str:
48
  text = re.sub(r"\s+", " ", text).strip()
49
 
50
  if len(text) > 2 and text.startswith(("'", '"')) and text.endswith(("'", '"')):
51
- text = text[1:-1].strip()
52
 
53
  if text.endswith(".") and not re.search(r"[0-9A-Za-z][.!?]$", text[:-1]):
54
- text = text[:-1].strip()
55
 
56
- return text
57
 
58
 
59
  def enforce_numeric_format(question: str, answer: str) -> str:
60
- """
61
- Para questões que pedem número / duas casas / USD:
62
- tenta extrair só o número principal e formatar certo.
63
- """
64
  q = question.lower()
65
- a = answer
66
 
67
- # USD com duas casas decimais
68
- if ("usd" in q or "$" in q) and (
69
- "two decimal places" in q or "2 decimal places" in q
70
- ):
71
- match = re.search(r"[-+]?\d+(?:[.,]\d+)?", a)
72
- if match:
73
- try:
74
- value = float(match.group(0).replace(",", ""))
75
- return f"{value:.2f}"
76
- except Exception:
77
- pass
78
-
79
- # Duas casas decimais sem necessariamente USD
80
  if "two decimal places" in q or "2 decimal places" in q:
81
- match = re.search(r"[-+]?\d+(?:[.,]\d+)?", a)
82
  if match:
83
  try:
84
  value = float(match.group(0).replace(",", ""))
85
  return f"{value:.2f}"
86
- except Exception:
87
  pass
88
 
89
- # Contagens / anos etc.
90
- if any(
91
- kw in q
92
- for kw in [
93
- "how many",
94
- "number of",
95
- "at bats",
96
- "population",
97
- "what year",
98
- "in which year",
99
- ]
100
- ):
101
- match = re.search(r"-?\d+", a.replace(",", ""))
102
  if match:
103
  return match.group(0)
104
 
105
- return a
106
-
107
-
108
- def postprocess_vegetable_question(question: str, answer: str) -> str:
109
- """
110
- Ajuste especial para a questão dos VEGETAIS com definição BOTÂNICA.
111
-
112
- - Remove claramente fruits/herbs da lista
113
- - Ordena alfabeticamente
114
- - Garante formato 'item, item, item'
115
- """
116
- q = question.lower()
117
- if "vegetables" not in q:
118
- return answer
119
- if "botany" not in q and "botanical" not in q:
120
- return answer
121
-
122
- # Tenta quebrar a resposta em itens separados por vírgula
123
- items_raw = [x.strip() for x in answer.split(",") if x.strip()]
124
- if not items_raw:
125
- return answer
126
-
127
- # normalização pra comparar
128
- normalized_map = {item.lower(): item for item in items_raw}
129
-
130
- # lista de itens que NÃO devem entrar como vegetable: fruits, herbs, seeds etc.
131
- # baseado especificamente na lista dessa questão
132
- blacklist = {
133
- "plums",
134
- "green beans",
135
- "rice",
136
- "corn",
137
- "bell pepper",
138
- "whole bean coffee",
139
- "whole allspice",
140
- "acorns",
141
- "peanuts",
142
- "fresh basil",
143
- "basil",
144
- "oreos",
145
- "milk",
146
- "eggs",
147
- "flour",
148
- }
149
-
150
- filtered = []
151
- for low, original in normalized_map.items():
152
- if low in blacklist:
153
- continue
154
- filtered.append(original)
155
-
156
- if not filtered:
157
- # se por algum motivo removemos tudo, volta original
158
- filtered = list(normalized_map.values())
159
-
160
- # ordena alfabeticamente ignorando maiúsculas/minúsculas
161
- filtered_sorted = sorted(filtered, key=lambda x: x.lower())
162
-
163
- return ", ".join(filtered_sorted)
164
 
165
 
166
  def web_search(question: str, max_results: int = 5) -> str:
167
- """
168
- Usa DuckDuckGo (ddgs) pra buscar contexto web.
169
- """
170
- snippets: List[str] = []
171
  try:
172
  with DDGS() as ddgs:
173
  for r in ddgs.text(question, max_results=max_results, safesearch="moderate"):
174
- title = r.get("title") or ""
175
- body = r.get("body") or ""
176
- url = r.get("href") or ""
177
- snippet = f"{title}\n{body}\nURL: {url}"
178
- snippets.append(snippet)
179
  except Exception as e:
180
  print("[WEB SEARCH ERROR]", e)
181
  return ""
@@ -187,9 +88,6 @@ def web_search(question: str, max_results: int = 5) -> str:
187
 
188
 
189
  def get_file_context(api_url: str, task_id: str, item: dict) -> str:
190
- """
191
- Baixa arquivo em /files/{task_id} se existir e extrai texto/tab.
192
- """
193
  file_name = (
194
  item.get("file_name")
195
  or item.get("filename")
@@ -217,7 +115,7 @@ def get_file_context(api_url: str, task_id: str, item: dict) -> str:
217
  if any(name_lower.endswith(ext) for ext in [".txt", ".csv", ".tsv"]):
218
  try:
219
  text = data.decode("utf-8", errors="replace")
220
- except Exception:
221
  text = data.decode("latin-1", errors="replace")
222
  return f"[FILE TXT]\n{text[:8000]}"
223
 
@@ -231,8 +129,7 @@ def get_file_context(api_url: str, task_id: str, item: dict) -> str:
231
  print("[EXCEL PARSE ERROR]", e)
232
  return "[FILE] Spreadsheet exists but cannot parse."
233
 
234
- # Outros tipos
235
- return f"[FILE BINARY: {file_name}] {len(data)} bytes (type: {content_type})"
236
 
237
  except Exception as e:
238
  print("[FILE ERROR]", e)
@@ -245,17 +142,12 @@ def get_file_context(api_url: str, task_id: str, item: dict) -> str:
245
 
246
  SYSTEM_INSTRUCTIONS = """
247
  You are a highly accurate GAIA benchmark agent.
248
- Your answers are evaluated with EXACT MATCH.
249
-
250
- Core rules:
251
- - Think step by step INTERNALLY, but NEVER show your reasoning.
252
- - Output ONLY the final answer string, no explanations, no extra words.
253
- - Do NOT write prefixes like "Final answer:", "Answer is:", etc.
254
- - If the answer is a number, output only the number (no units) unless the format explicitly requires otherwise.
255
- - If the answer must have two decimal places (e.g. USD values), ensure exactly two decimal places.
256
- - If the answer is a list, output it exactly in the requested format (e.g. comma-separated, alphabetical order).
257
- - Carefully use both the provided file content (if any) and the web search snippets.
258
- - If external context is noisy or contradictory, prefer sources that match the question's constraints (dates, names, etc.).
259
  """
260
 
261
 
@@ -264,13 +156,6 @@ Core rules:
264
  # ================================
265
 
266
  class GaiaAgent:
267
- """
268
- Agente tunado:
269
- - Qwen3-Next-80B-A3B-Thinking via chat_completion
270
- - web search (ddgs)
271
- - file context (txt/csv/excel)
272
- - pós-processamento de número / USD / vegetais-botânica
273
- """
274
 
275
  def __init__(self):
276
  print("Initializing GAIA Agent with Qwen 80B...")
@@ -283,67 +168,22 @@ class GaiaAgent:
283
  token=token,
284
  )
285
 
286
- def build_prompt(self, question: str, search_ctx: str, file_ctx: str) -> str:
287
- q = question.lower()
288
-
289
- extra_guidance = []
290
-
291
- # DICAS ESPECÍFICAS POR TIPO DE QUESTÃO
292
-
293
- # Questões de USD / duas casas decimais
294
- if "usd" in q or "dollars" in q or "two decimal places" in q:
295
- extra_guidance.append(
296
- "- If the answer is a monetary value, output only the numeric value with exactly two decimal places "
297
- "(no currency symbol)."
298
- )
299
-
300
- # Questão de vegetais com definição botânica
301
- if "vegetables" in q and ("botany" in q or "botanical" in q):
302
- extra_guidance.append(
303
- "- Use strict botanical definitions: fruits are seed-bearing structures (e.g., plums, bell peppers, "
304
- "corn kernels, acorns, peanuts, beans, grains). Vegetables are other edible plant parts such as leaves, "
305
- "stems, flowers, or roots (e.g., lettuce, celery, broccoli, sweet potatoes)."
306
- )
307
- extra_guidance.append(
308
- "- Do NOT include any botanical fruits or seeds in the vegetable list, even if they are commonly "
309
- "treated as vegetables in cooking."
310
- )
311
-
312
- # Questões de contagem/quantidade
313
- if "how many" in q or "number of" in q or "at bats" in q:
314
- extra_guidance.append(
315
- "- Carefully count the exact quantity requested and output only that integer number."
316
- )
317
-
318
- # Questões de lista (ex: nomes separados por vírgula)
319
- if "comma separated" in q or "comma-delimited" in q or "comma separated list" in q:
320
- extra_guidance.append(
321
- "- Output a single line with items separated by a comma and a space (e.g., 'item1, item2, item3')."
322
- )
323
-
324
- guidance_block = ""
325
- if extra_guidance:
326
- guidance_block = "\nAdditional question-specific rules:\n" + "\n".join(extra_guidance)
327
-
328
- prompt = (
329
- f"{SYSTEM_INSTRUCTIONS.strip()}\n"
330
- f"{guidance_block}\n\n"
331
  f"QUESTION:\n{question}\n\n"
332
- f"FILE CONTEXT (may be partial or noisy):\n{file_ctx or 'No file content.'}\n\n"
333
- f"WEB SEARCH CONTEXT (may be partial or noisy):\n{search_ctx or 'No web search results.'}\n\n"
334
- "Using ALL the reliable information above, deduce the correct answer.\n"
335
- "Remember: DO NOT show your reasoning, only output the final answer string.\n"
336
- "Answer:"
337
  )
338
- return prompt
339
 
340
  def __call__(self, question: str, file_context: str = "") -> str:
341
- print("\n" + "=" * 60)
342
  print("NEW QUESTION:")
343
  print(question)
344
- print("=" * 60 + "\n")
345
 
346
- search_ctx = web_search(question, max_results=5)
347
  print(f"[SEARCH LEN] {len(search_ctx)} | [FILE LEN] {len(file_context)}")
348
 
349
  prompt = self.build_prompt(question, search_ctx, file_context)
@@ -354,9 +194,8 @@ class GaiaAgent:
354
  {"role": "system", "content": SYSTEM_INSTRUCTIONS},
355
  {"role": "user", "content": prompt},
356
  ],
357
- max_tokens=220,
358
  temperature=0.0,
359
- top_p=1.0,
360
  )
361
  raw = response.choices[0].message["content"]
362
  print("[RAW OUTPUT]", raw)
@@ -366,7 +205,6 @@ class GaiaAgent:
366
 
367
  answer = clean_answer(raw)
368
  answer = enforce_numeric_format(question, answer)
369
- answer = postprocess_vegetable_question(question, answer)
370
 
371
  print("[FINAL ANSWER]", answer)
372
  return answer
@@ -377,136 +215,70 @@ class GaiaAgent:
377
  # ================================
378
 
379
  def run_and_submit_all(profile: Optional[gr.OAuthProfile]):
380
- """
381
- Pipeline completo:
382
- - busca questões
383
- - tenta baixar arquivo (/files/{task_id})
384
- - faz web search
385
- - responde com GaiaAgent
386
- - submete respostas ao /submit
387
- """
388
  if not profile:
389
- return "Please Login to Hugging Face with the button.", None
390
 
391
  username = profile.username
392
- print(f"User logged in: {username}")
393
-
394
  api_url = DEFAULT_API_URL
395
  questions_url = f"{api_url}/questions"
396
  submit_url = f"{api_url}/submit"
397
  space_id = os.getenv("SPACE_ID")
398
- agent_code = (
399
- f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
400
- )
401
  print(f"Agent code URL: {agent_code}")
402
 
403
- # Instanciar agente
404
  try:
405
  agent = GaiaAgent()
406
  except Exception as e:
407
- print("Error instantiating agent:", e)
408
  return f"Error initializing agent: {e}", None
409
 
410
- # Buscar questões
411
- print(f"Fetching questions from: {questions_url}")
412
  try:
413
  resp = requests.get(questions_url, timeout=120)
414
  resp.raise_for_status()
415
- questions_data = resp.json()
416
- if not questions_data:
417
- return "Fetched questions list is empty or invalid format.", None
418
- print(f"Fetched {len(questions_data)} questions.")
419
  except Exception as e:
420
- print("Error fetching questions:", e)
421
  return f"Error fetching questions: {e}", None
422
 
423
- # Rodar agente em cada questão
424
- results_log = []
425
- answers_payload = []
426
-
427
- print(f"Running agent on {len(questions_data)} questions...")
428
- for item in questions_data:
429
- task_id = item.get("task_id")
430
- question_text = item.get("question")
431
-
432
- if not task_id or question_text is None:
433
- print("Skipping item with missing task_id or question:", item)
434
- continue
435
 
436
- file_context = get_file_context(api_url, task_id, item)
 
437
 
438
- try:
439
- submitted_answer = agent(question_text, file_context)
440
- except Exception as e:
441
- print(f"Error running agent on task {task_id}:", e)
442
- submitted_answer = ""
443
 
444
- answers_payload.append(
445
- {"task_id": task_id, "submitted_answer": submitted_answer}
446
- )
447
- results_log.append(
448
- {
449
- "Task ID": task_id,
450
- "Question": question_text,
451
- "Submitted Answer": submitted_answer,
452
- }
453
- )
454
 
455
- if not answers_payload:
456
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
457
 
458
- submission_data = {
459
- "username": username.strip(),
460
  "agent_code": agent_code,
461
  "answers": answers_payload,
462
  }
463
 
464
- print(
465
- f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
466
- )
467
- print(f"Submitting to: {submit_url}")
468
-
469
  try:
470
- resp = requests.post(submit_url, json=submission_data)
471
  resp.raise_for_status()
472
- result_data = resp.json()
473
 
474
- final_status = (
475
  f"Submission Successful!\n"
476
- f"User: {result_data.get('username')}\n"
477
- f"Overall Score: {result_data.get('score', 'N/A')}% "
478
- f"({result_data.get('correct_count', '?')}/"
479
- f"{result_data.get('total_attempted', '?')} correct)\n"
480
- f"Message: {result_data.get('message', 'No message received.')}"
481
  )
482
-
483
- print("Submission successful.")
484
- results_df = pd.DataFrame(results_log)
485
- return final_status, results_df
486
-
487
- except requests.exceptions.HTTPError as e:
488
- error_detail = f"Server responded with status {e.response.status_code}."
489
- try:
490
- error_json = e.response.json()
491
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
492
- except Exception:
493
- error_detail += f" Response: {e.response.text[:500]}"
494
- status_message = f"Submission Failed: {error_detail}"
495
- print(status_message)
496
- results_df = pd.DataFrame(results_log)
497
- return status_message, results_df
498
-
499
- except requests.exceptions.RequestException as e:
500
- status_message = f"Submission Failed: Network error - {e}"
501
- print(status_message)
502
- results_df = pd.DataFrame(results_log)
503
- return status_message, results_df
504
 
505
  except Exception as e:
506
- status_message = f"An unexpected error occurred during submission: {e}"
507
- print(status_message)
508
- results_df = pd.DataFrame(results_log)
509
- return status_message, results_df
510
 
511
 
512
  # ================================
@@ -514,37 +286,17 @@ def run_and_submit_all(profile: Optional[gr.OAuthProfile]):
514
  # ================================
515
 
516
  with gr.Blocks() as demo:
517
- gr.Markdown("# GAIA Agent Evaluation Runner (Qwen 80B Tuned Version)")
518
- gr.Markdown(
519
- """
520
- **How to use**
521
-
522
- 1. Log in with your Hugging Face account.
523
- 2. Make sure this Space is public and has a Secret `HF_TOKEN`
524
- with Inference permissions.
525
- 3. Click **"Run Evaluation & Submit All Answers"** and wait.
526
-
527
- The agent will:
528
- - fetch all questions,
529
- - optionally download attached files (if any),
530
- - perform web search,
531
- - answer each question with ONLY the final answer (EXACT MATCH friendly),
532
- - submit to the scoring API.
533
- """
534
- )
535
 
536
  gr.LoginButton()
537
 
538
  run_button = gr.Button("Run Evaluation & Submit All Answers")
539
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5)
540
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
541
 
542
- run_button.click(
543
- fn=run_and_submit_all,
544
- outputs=[status_output, results_table],
545
- )
546
 
547
 
548
  if __name__ == "__main__":
549
- print("\n" + "-" * 30 + " App Starting " + "-" * 30)
550
- demo.launch(debug=True, share=False)
 
22
  # ================================
23
 
24
  def clean_answer(text: str) -> str:
 
 
 
 
 
 
 
25
  if not text:
26
  return ""
27
 
 
32
  r"(?i)^answer[:\- ]*",
33
  r"(?i)^the answer is[:\- ]*",
34
  r"(?i)^my answer is[:\- ]*",
 
35
  ]
36
  for p in patterns_to_remove:
37
  text = re.sub(p, "", text).strip()
 
40
  text = re.sub(r"\s+", " ", text).strip()
41
 
42
  if len(text) > 2 and text.startswith(("'", '"')) and text.endswith(("'", '"')):
43
+ text = text[1:-1]
44
 
45
  if text.endswith(".") and not re.search(r"[0-9A-Za-z][.!?]$", text[:-1]):
46
+ text = text[:-1]
47
 
48
+ return text.strip()
49
 
50
 
51
  def enforce_numeric_format(question: str, answer: str) -> str:
 
 
 
 
52
  q = question.lower()
 
53
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54
  if "two decimal places" in q or "2 decimal places" in q:
55
+ match = re.search(r"[-+]?\d+(?:[.,]\d+)?", answer)
56
  if match:
57
  try:
58
  value = float(match.group(0).replace(",", ""))
59
  return f"{value:.2f}"
60
+ except:
61
  pass
62
 
63
+ if any(kw in q for kw in ["how many", "number of", "what year", "in which year"]):
64
+ match = re.search(r"-?\d+", answer.replace(",", ""))
 
 
 
 
 
 
 
 
 
 
 
65
  if match:
66
  return match.group(0)
67
 
68
+ return answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
 
71
  def web_search(question: str, max_results: int = 5) -> str:
72
+ snippets = []
 
 
 
73
  try:
74
  with DDGS() as ddgs:
75
  for r in ddgs.text(question, max_results=max_results, safesearch="moderate"):
76
+ title = r.get("title", "")
77
+ body = r.get("body", "")
78
+ url = r.get("href", "")
79
+ snippets.append(f"{title}\n{body}\nURL: {url}")
 
80
  except Exception as e:
81
  print("[WEB SEARCH ERROR]", e)
82
  return ""
 
88
 
89
 
90
  def get_file_context(api_url: str, task_id: str, item: dict) -> str:
 
 
 
91
  file_name = (
92
  item.get("file_name")
93
  or item.get("filename")
 
115
  if any(name_lower.endswith(ext) for ext in [".txt", ".csv", ".tsv"]):
116
  try:
117
  text = data.decode("utf-8", errors="replace")
118
+ except:
119
  text = data.decode("latin-1", errors="replace")
120
  return f"[FILE TXT]\n{text[:8000]}"
121
 
 
129
  print("[EXCEL PARSE ERROR]", e)
130
  return "[FILE] Spreadsheet exists but cannot parse."
131
 
132
+ return f"[FILE BINARY: {file_name}] {len(data)} bytes"
 
133
 
134
  except Exception as e:
135
  print("[FILE ERROR]", e)
 
142
 
143
  SYSTEM_INSTRUCTIONS = """
144
  You are a highly accurate GAIA benchmark agent.
145
+ Always output ONLY the final answer (EXACT MATCH).
146
+ No explanations. No reasoning. No extra words.
147
+ Rules:
148
+ - If the answer is a number only the number.
149
+ - If format requires 2 decimal places enforce it.
150
+ - If a list is required output in exact requested form.
 
 
 
 
 
151
  """
152
 
153
 
 
156
  # ================================
157
 
158
  class GaiaAgent:
 
 
 
 
 
 
 
159
 
160
  def __init__(self):
161
  print("Initializing GAIA Agent with Qwen 80B...")
 
168
  token=token,
169
  )
170
 
171
+ def build_prompt(self, question, search_ctx, file_ctx):
172
+ return (
173
+ f"{SYSTEM_INSTRUCTIONS}\n\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
  f"QUESTION:\n{question}\n\n"
175
+ f"FILE CONTEXT:\n{file_ctx or 'No file provided.'}\n\n"
176
+ f"WEB SEARCH CONTEXT:\n{search_ctx or 'No search results.'}\n\n"
177
+ "Now output ONLY the final answer:\n"
 
 
178
  )
 
179
 
180
  def __call__(self, question: str, file_context: str = "") -> str:
181
+ print("\n====================================================")
182
  print("NEW QUESTION:")
183
  print(question)
184
+ print("====================================================\n")
185
 
186
+ search_ctx = web_search(question)
187
  print(f"[SEARCH LEN] {len(search_ctx)} | [FILE LEN] {len(file_context)}")
188
 
189
  prompt = self.build_prompt(question, search_ctx, file_context)
 
194
  {"role": "system", "content": SYSTEM_INSTRUCTIONS},
195
  {"role": "user", "content": prompt},
196
  ],
197
+ max_tokens=200,
198
  temperature=0.0,
 
199
  )
200
  raw = response.choices[0].message["content"]
201
  print("[RAW OUTPUT]", raw)
 
205
 
206
  answer = clean_answer(raw)
207
  answer = enforce_numeric_format(question, answer)
 
208
 
209
  print("[FINAL ANSWER]", answer)
210
  return answer
 
215
  # ================================
216
 
217
  def run_and_submit_all(profile: Optional[gr.OAuthProfile]):
218
+
 
 
 
 
 
 
 
219
  if not profile:
220
+ return "Please log in first.", None
221
 
222
  username = profile.username
 
 
223
  api_url = DEFAULT_API_URL
224
  questions_url = f"{api_url}/questions"
225
  submit_url = f"{api_url}/submit"
226
  space_id = os.getenv("SPACE_ID")
227
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
228
+
229
+ print(f"User logged in: {username}")
230
  print(f"Agent code URL: {agent_code}")
231
 
 
232
  try:
233
  agent = GaiaAgent()
234
  except Exception as e:
 
235
  return f"Error initializing agent: {e}", None
236
 
237
+ print("Fetching questions...")
 
238
  try:
239
  resp = requests.get(questions_url, timeout=120)
240
  resp.raise_for_status()
241
+ questions = resp.json()
 
 
 
242
  except Exception as e:
 
243
  return f"Error fetching questions: {e}", None
244
 
245
+ print(f"Fetched {len(questions)} questions.")
 
 
 
 
 
 
 
 
 
 
 
246
 
247
+ answers_payload = []
248
+ results_log = []
249
 
250
+ for item in questions:
251
+ qid = item["task_id"]
252
+ qtext = item["question"]
 
 
253
 
254
+ file_context = get_file_context(api_url, qid, item)
255
+ answer = agent(qtext, file_context)
 
 
 
 
 
 
 
 
256
 
257
+ answers_payload.append({"task_id": qid, "submitted_answer": answer})
258
+ results_log.append({"Task ID": qid, "Question": qtext, "Submitted Answer": answer})
259
 
260
+ submission = {
261
+ "username": username,
262
  "agent_code": agent_code,
263
  "answers": answers_payload,
264
  }
265
 
266
+ print("Submitting answers...")
 
 
 
 
267
  try:
268
+ resp = requests.post(submit_url, json=submission)
269
  resp.raise_for_status()
270
+ result = resp.json()
271
 
272
+ status = (
273
  f"Submission Successful!\n"
274
+ f"Score: {result.get('score')}% "
275
+ f"({result.get('correct_count')}/{result.get('total_attempted')})\n"
276
+ f"{result.get('message')}"
 
 
277
  )
278
+ return status, pd.DataFrame(results_log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
 
280
  except Exception as e:
281
+ return f"Submission failed: {e}", pd.DataFrame(results_log)
 
 
 
282
 
283
 
284
  # ================================
 
286
  # ================================
287
 
288
  with gr.Blocks() as demo:
289
+ gr.Markdown("## GAIA Agent Runner Qwen 80B Enhanced Version")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
 
291
  gr.LoginButton()
292
 
293
  run_button = gr.Button("Run Evaluation & Submit All Answers")
 
 
294
 
295
+ out_status = gr.Textbox(label="Status", lines=4)
296
+ out_table = gr.DataFrame(label="Answers")
297
+
298
+ run_button.click(run_and_submit_all, outputs=[out_status, out_table])
299
 
300
 
301
  if __name__ == "__main__":
302
+ demo.launch(debug=True, share=False)