GilbertoEwaldFilho committed on
Commit
b6c0776
·
verified ·
1 Parent(s): 65d648f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +154 -302
app.py CHANGED
@@ -1,128 +1,82 @@
1
  import os
2
  import re
 
3
  import requests
4
  import pandas as pd
5
  import gradio as gr
6
 
7
  from typing import Optional, List
8
- from ddgs import DDGS # pip install ddgs
9
  from huggingface_hub import InferenceClient
10
 
11
 
12
- # ============================
13
  # CONSTANTES DA AVALIAÇÃO
14
- # ============================
15
 
16
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
17
 
18
 
19
- # ============================
20
  # FUNÇÕES AUXILIARES
21
- # ============================
22
 
23
  def clean_answer(text: str) -> str:
24
- """
25
- Limpa a resposta do modelo para bater em EXACT MATCH:
26
-
27
- - remove quebras de linha
28
- - remove 'final answer', 'answer:' etc
29
- - remove aspas externas
30
- - normaliza espaços
31
- - remove ponto final se sobrar só isso no fim
32
- """
33
- if text is None:
34
  return ""
35
 
36
  text = str(text).strip()
37
 
38
- # Remover prefixos tipo "Final answer:", "Answer is", etc.
39
  patterns_to_remove = [
40
  r"(?i)^final answer[:\- ]*",
41
  r"(?i)^answer[:\- ]*",
42
  r"(?i)^the answer is[:\- ]*",
43
  r"(?i)^my answer is[:\- ]*",
44
- r"(?i)^resposta[:\- ]*",
45
  ]
46
  for p in patterns_to_remove:
47
  text = re.sub(p, "", text).strip()
48
 
49
- # remover quebras de linha
50
  text = text.replace("\n", " ").replace("\r", " ").strip()
51
-
52
- # aspas externas
53
- if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
54
- text = text[1:-1].strip()
55
- if len(text) >= 2 and text.startswith("'") and text.endswith("'"):
56
- text = text[1:-1].strip()
57
-
58
- # múltiplos espaços
59
  text = re.sub(r"\s+", " ", text).strip()
60
 
61
- # ponto final isolado no fim
 
 
62
  if text.endswith(".") and not re.search(r"[0-9A-Za-z][.!?]$", text[:-1]):
63
- text = text[:-1].strip()
64
 
65
- return text
66
 
67
 
68
  def enforce_numeric_format(question: str, answer: str) -> str:
69
- """
70
- Para questões que pedem número, casas decimais, etc,
71
- tenta extrair só o número principal e formatar direito.
72
- """
73
-
74
  q = question.lower()
75
 
76
- # Se pedir duas casas decimais, ex: "two decimal places"
77
  if "two decimal places" in q or "2 decimal places" in q:
78
  match = re.search(r"[-+]?\d+(?:[.,]\d+)?", answer)
79
  if match:
80
- num = match.group(0).replace(",", "")
81
  try:
82
- value = float(num)
83
  return f"{value:.2f}"
84
- except ValueError:
85
  pass
86
 
87
- # Se parecer que é um número inteiro (at bats, year, count etc.)
88
- if any(
89
- kw in q
90
- for kw in [
91
- "how many",
92
- "at bats",
93
- "number of",
94
- "population",
95
- "what year",
96
- "in which year",
97
- ]
98
- ):
99
  match = re.search(r"-?\d+", answer.replace(",", ""))
100
  if match:
101
  return match.group(0)
102
 
103
- # senão, devolve como veio
104
  return answer
105
 
106
 
107
  def web_search(question: str, max_results: int = 5) -> str:
108
- """
109
- Usa DuckDuckGo (ddgs) pra buscar contexto web.
110
- Retorna um texto concatenando título + snippet.
111
- """
112
- snippets: List[str] = []
113
-
114
  try:
115
  with DDGS() as ddgs:
116
- for r in ddgs.text(
117
- question,
118
- max_results=max_results,
119
- safesearch="moderate",
120
- ):
121
- title = r.get("title") or ""
122
- body = r.get("body") or ""
123
- url = r.get("href") or ""
124
- snippet = f"{title}\n{body}\nURL: {url}"
125
- snippets.append(snippet)
126
  except Exception as e:
127
  print("[WEB SEARCH ERROR]", e)
128
  return ""
@@ -130,322 +84,220 @@ def web_search(question: str, max_results: int = 5) -> str:
130
  if not snippets:
131
  return ""
132
 
133
- joined = "\n\n---\n\n".join(snippets)
134
- # limitar pra não exagerar o contexto
135
- return joined[:8000]
136
 
137
 
138
- # ============================
139
- # AGENTE PRINCIPAL
140
- # ============================
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
 
142
  SYSTEM_INSTRUCTIONS = """
143
- You are a highly accurate AI assistant solving GAIA benchmark questions.
144
- You MUST provide answers suitable for EXACT MATCH evaluation.
145
-
146
- GENERAL RULES:
147
- - Think step by step, but DO NOT show your reasoning.
148
- - Output ONLY the final answer string.
149
- - Do NOT include explanations, reasoning, or extra words.
150
- - Do NOT write things like "Final answer:", "Answer is", etc.
151
- - If the answer is a number, output only the number (no units unless explicitly requested).
152
- - If the answer is a list, output it exactly as requested (e.g., comma-separated, alphabetical order, etc.).
153
- - Respect the requested formatting (e.g., two decimal places, upper/lowercase if clearly required).
154
  """
155
 
156
 
 
 
 
 
157
  class GaiaAgent:
158
- """
159
- Agente projetado para maximizar a taxa de acerto:
160
- - usa modelo open-source via InferenceClient (rota gratuita)
161
- - faz web search com ddgs em todas as questões
162
- - aplica pós-processamento para números / duas casas decimais etc.
163
- """
164
 
165
  def __init__(self):
166
- print("Initializing GAIA Agent...")
167
-
168
- hf_token = os.getenv("HF_TOKEN")
169
- if not hf_token:
170
- raise ValueError(
171
- "HF_TOKEN não encontrado! "
172
- "Crie um Secret chamado HF_TOKEN em Settings → Variables."
173
- )
174
 
175
- # Modelo forte open-source (pode trocar se quiser tentar outros)
176
  self.client = InferenceClient(
177
- model="mistralai/Mistral-7B-Instruct-v0.2",
178
- token=hf_token,
179
  )
180
 
181
- def build_prompt(self, question: str, search_context: str) -> str:
182
- """
183
- Constrói o prompt completo para o modelo.
184
- """
185
- base = SYSTEM_INSTRUCTIONS.strip()
186
-
187
- if search_context:
188
- ctx = (
189
- "Here are web search results that may be relevant. "
190
- "They can be noisy, so you must reason carefully and ignore incorrect info.\n\n"
191
- f"{search_context}"
192
- )
193
- else:
194
- ctx = "No external web search results are available for this question."
195
-
196
- prompt = (
197
- f"{base}\n\n"
198
  f"QUESTION:\n{question}\n\n"
199
- f"{ctx}\n\n"
200
- "Now, based on all the above, provide ONLY the final answer.\n"
201
- "Remember: no explanation, only the final answer string.\n"
202
- "Answer:"
203
  )
204
- return prompt
205
 
206
- def __call__(self, question: str) -> str:
207
- print("\n" + "=" * 60)
208
  print("NEW QUESTION:")
209
  print(question)
210
- print("=" * 60 + "\n")
211
 
212
- # 1. Web search
213
- search_ctx = web_search(question, max_results=5)
214
- print(f"[SEARCH CONTEXT LENGTH] {len(search_ctx)} chars")
215
 
216
- # 2. Montar prompt
217
- prompt = self.build_prompt(question, search_ctx)
218
 
219
- # 3. Chamar modelo
220
  try:
221
- raw = self.client.text_generation(
222
- prompt,
223
- max_new_tokens=160,
 
 
 
224
  temperature=0.0,
225
- top_p=0.9,
226
- repetition_penalty=1.05,
227
  )
228
- print("[RAW MODEL OUTPUT]", repr(raw))
 
229
  except Exception as e:
230
- print("ERROR calling InferenceClient.text_generation:", e)
231
  return ""
232
 
233
- # 4. Limpeza + pós-processamento
234
  answer = clean_answer(raw)
235
  answer = enforce_numeric_format(question, answer)
236
 
237
- print("[FINAL CLEANED ANSWER]", repr(answer))
238
  return answer
239
 
240
 
241
- # ============================
242
- # PIPELINE: RODAR E SUBMETER
243
- # ============================
244
 
245
  def run_and_submit_all(profile: Optional[gr.OAuthProfile]):
246
- """
247
- Busca todas as questões, roda o agente, submete e mostra resultado.
248
- """
249
-
250
- # --- usuário HF (pra leaderboard)
251
- if profile:
252
- username = profile.username
253
- print(f"User logged in: {username}")
254
- else:
255
- print("User not logged in.")
256
- return "Please Login to Hugging Face with the button.", None
257
-
258
- # --- URLs da API de scoring
259
- space_id = os.getenv("SPACE_ID")
260
  api_url = DEFAULT_API_URL
261
  questions_url = f"{api_url}/questions"
262
  submit_url = f"{api_url}/submit"
 
 
263
 
264
- # link do código na Space (precisa estar pública)
265
- if space_id:
266
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
267
- else:
268
- agent_code = ""
269
-
270
  print(f"Agent code URL: {agent_code}")
271
 
272
- # 1) Instanciar agente
273
  try:
274
  agent = GaiaAgent()
275
  except Exception as e:
276
- print("Error instantiating agent:", e)
277
  return f"Error initializing agent: {e}", None
278
 
279
- # 2) Buscar questões
280
- print(f"Fetching questions from: {questions_url}")
281
  try:
282
  resp = requests.get(questions_url, timeout=120)
283
  resp.raise_for_status()
284
- questions_data = resp.json()
285
- if not questions_data:
286
- print("Fetched questions list is empty or invalid.")
287
- return "Fetched questions list is empty or invalid format.", None
288
- print(f"Fetched {len(questions_data)} questions.")
289
  except Exception as e:
290
- print("Error fetching questions:", e)
291
  return f"Error fetching questions: {e}", None
292
 
293
- # 3) Rodar agente em cada questão
294
- results_log = []
295
  answers_payload = []
 
296
 
297
- print(f"Running agent on {len(questions_data)} questions...")
298
- for item in questions_data:
299
- task_id = item.get("task_id")
300
- question_text = item.get("question")
301
- if not task_id or question_text is None:
302
- print("Skipping item with missing task_id or question:", item)
303
- continue
304
 
305
- try:
306
- submitted_answer = agent(question_text)
307
- except Exception as e:
308
- print(f"Error running agent on task {task_id}:", e)
309
- submitted_answer = ""
310
-
311
- answers_payload.append(
312
- {"task_id": task_id, "submitted_answer": submitted_answer}
313
- )
314
- results_log.append(
315
- {
316
- "Task ID": task_id,
317
- "Question": question_text,
318
- "Submitted Answer": submitted_answer,
319
- }
320
- )
321
 
322
- if not answers_payload:
323
- print("Agent did not produce any answers to submit.")
324
- return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
325
 
326
- # 4) Preparar submissão
327
- submission_data = {
328
- "username": username.strip(),
329
  "agent_code": agent_code,
330
  "answers": answers_payload,
331
  }
332
 
333
- print(
334
- f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
335
- )
336
- print(f"Submitting to: {submit_url}")
337
-
338
- # 5) Submeter (sem timeout pra não cortar o servidor)
339
  try:
340
- resp = requests.post(submit_url, json=submission_data)
341
  resp.raise_for_status()
342
- result_data = resp.json()
343
 
344
- final_status = (
345
  f"Submission Successful!\n"
346
- f"User: {result_data.get('username')}\n"
347
- f"Overall Score: {result_data.get('score', 'N/A')}% "
348
- f"({result_data.get('correct_count', '?')}/"
349
- f"{result_data.get('total_attempted', '?')} correct)\n"
350
- f"Message: {result_data.get('message', 'No message received.')}"
351
  )
352
-
353
- print("Submission successful.")
354
- results_df = pd.DataFrame(results_log)
355
- return final_status, results_df
356
-
357
- except requests.exceptions.HTTPError as e:
358
- error_detail = f"Server responded with status {e.response.status_code}."
359
- try:
360
- error_json = e.response.json()
361
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
362
- except Exception:
363
- error_detail += f" Response: {e.response.text[:500]}"
364
- status_message = f"Submission Failed: {error_detail}"
365
- print(status_message)
366
- results_df = pd.DataFrame(results_log)
367
- return status_message, results_df
368
-
369
- except requests.exceptions.RequestException as e:
370
- status_message = f"Submission Failed: Network error - {e}"
371
- print(status_message)
372
- results_df = pd.DataFrame(results_log)
373
- return status_message, results_df
374
 
375
  except Exception as e:
376
- status_message = f"An unexpected error occurred during submission: {e}"
377
- print(status_message)
378
- results_df = pd.DataFrame(results_log)
379
- return status_message, results_df
380
 
381
 
382
- # ============================
383
  # INTERFACE GRADIO
384
- # ============================
385
 
386
  with gr.Blocks() as demo:
387
- gr.Markdown("# GAIA Agent Evaluation Runner (improved)")
388
- gr.Markdown(
389
- """
390
- **Como usar**
391
-
392
- 1. Faça login com sua conta Hugging Face no botão abaixo.
393
- 2. Certifique-se de que este Space está público e tem um Secret `HF_TOKEN`
394
- com permissão de Inference.
395
- 3. Clique em **"Run Evaluation & Submit All Answers"**.
396
- 4. Aguarde o agente responder às 20 questões e enviar ao servidor de scoring.
397
-
398
- **Notas**
399
-
400
- - O agente usa web search (DuckDuckGo) e um modelo open-source forte
401
- via InferenceClient.
402
- - A saída é cuidadosamente pós-processada para tentar maximizar o
403
- acerto em EXACT MATCH (números, duas casas decimais, etc.).
404
- """
405
- )
406
 
407
  gr.LoginButton()
408
 
409
  run_button = gr.Button("Run Evaluation & Submit All Answers")
410
 
411
- status_output = gr.Textbox(
412
- label="Run Status / Submission Result",
413
- lines=5,
414
- interactive=False,
415
- )
416
-
417
- results_table = gr.DataFrame(
418
- label="Questions and Agent Answers",
419
- wrap=True,
420
- )
421
 
422
- run_button.click(
423
- fn=run_and_submit_all,
424
- outputs=[status_output, results_table],
425
- )
426
 
427
 
428
  if __name__ == "__main__":
429
- print("\n" + "-" * 30 + " App Starting " + "-" * 30)
430
- space_host_startup = os.getenv("SPACE_HOST")
431
- space_id_startup = os.getenv("SPACE_ID")
432
-
433
- if space_host_startup:
434
- print(f"✅ SPACE_HOST found: {space_host_startup}")
435
- print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
436
- else:
437
- print("ℹ️ SPACE_HOST not found (talvez rodando localmente).")
438
-
439
- if space_id_startup:
440
- print(f"✅ SPACE_ID found: {space_id_startup}")
441
- print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
442
- print(
443
- f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main"
444
- )
445
- else:
446
- print("ℹ️ SPACE_ID not found. Repo URL cannot be determined.")
447
-
448
- print("-" * (60 + len(" App Starting ")) + "\n")
449
-
450
- print("Launching Gradio Interface for GAIA Agent Evaluation...")
451
  demo.launch(debug=True, share=False)
 
1
  import os
2
  import re
3
+ import io
4
  import requests
5
  import pandas as pd
6
  import gradio as gr
7
 
8
  from typing import Optional, List
9
+ from ddgs import DDGS
10
  from huggingface_hub import InferenceClient
11
 
12
 
13
+ # ================================
14
  # CONSTANTES DA AVALIAÇÃO
15
+ # ================================
16
 
17
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
18
 
19
 
20
+ # ================================
21
  # FUNÇÕES AUXILIARES
22
+ # ================================
23
 
24
def clean_answer(text: str) -> str:
    """Normalize a model reply for exact-match scoring.

    Strips common "final answer" prefixes, flattens and collapses
    whitespace, removes one pair of *matching* outer quotes, and drops a
    dangling trailing period.

    Args:
        text: Raw model output (may be None or a non-str value).

    Returns:
        The cleaned answer string ("" for empty/None input).
    """
    if not text:
        return ""

    text = str(text).strip()

    # Drop boilerplate prefixes such as "Final answer:" / "Answer -".
    patterns_to_remove = [
        r"(?i)^final answer[:\- ]*",
        r"(?i)^answer[:\- ]*",
        r"(?i)^the answer is[:\- ]*",
        r"(?i)^my answer is[:\- ]*",
    ]
    for p in patterns_to_remove:
        text = re.sub(p, "", text).strip()

    # Flatten newlines, then collapse runs of whitespace.
    text = text.replace("\n", " ").replace("\r", " ").strip()
    text = re.sub(r"\s+", " ", text).strip()

    # Remove one pair of matching outer quotes. Bug fix: previously the
    # start and end quotes were not required to match, so a mismatched
    # pair like 'abc" was also stripped.
    if len(text) > 2 and text[0] == text[-1] and text[0] in ("'", '"'):
        text = text[1:-1]

    # Drop a lone trailing period unless the remaining text itself ends
    # in <alnum><.!?> (e.g. "etc.." keeps its inner period).
    if text.endswith(".") and not re.search(r"[0-9A-Za-z][.!?]$", text[:-1]):
        text = text[:-1]

    return text.strip()
49
 
50
 
51
def enforce_numeric_format(question: str, answer: str) -> str:
    """Post-process *answer* for questions that expect a number.

    - Questions asking for "two decimal places" get the first number in
      the answer reformatted as ``{:.2f}``.
    - Count/year style questions ("how many", "what year", ...) are
      reduced to the first integer found.
    - Anything else is returned unchanged.

    Args:
        question: The original question text (used to detect intent).
        answer: The cleaned model answer.

    Returns:
        The possibly-reformatted answer string.
    """
    q = question.lower()

    # "two decimal places": extract the first number and reformat it.
    if "two decimal places" in q or "2 decimal places" in q:
        match = re.search(r"[-+]?\d+(?:[.,]\d+)?", answer)
        if match:
            try:
                value = float(match.group(0).replace(",", ""))
                return f"{value:.2f}"
            except ValueError:  # bug fix: was a bare `except:`
                pass

    # Count/year style questions: keep only the first integer.
    if any(kw in q for kw in ["how many", "number of", "what year", "in which year"]):
        match = re.search(r"-?\d+", answer.replace(",", ""))
        if match:
            return match.group(0)

    return answer
69
 
70
 
71
def web_search(question: str, max_results: int = 5) -> str:
    """Search DuckDuckGo (via ddgs) for context on *question*.

    Each hit is rendered as "title\\nbody\\nURL: href"; hits are joined
    with "---" separators and capped at 8000 characters.

    Returns "" on any search error or when there are no results.
    """
    collected: List[str] = []

    try:
        with DDGS() as client:
            hits = client.text(question, max_results=max_results, safesearch="moderate")
            for hit in hits:
                rendered = "{}\n{}\nURL: {}".format(
                    hit.get("title", ""),
                    hit.get("body", ""),
                    hit.get("href", ""),
                )
                collected.append(rendered)
    except Exception as e:
        print("[WEB SEARCH ERROR]", e)
        return ""

    if not collected:
        return ""

    joined = "\n\n---\n\n".join(collected)
    return joined[:8000]
 
 
88
 
89
 
90
def get_file_context(api_url: str, task_id: str, item: dict) -> str:
    """Download and render the attachment for a task, if any.

    Checks several possible file-name keys plus a ``has_file`` flag on
    *item*. Text-like files are decoded and inlined (truncated to 8000
    chars), spreadsheets are flattened to CSV, anything else is
    summarized by name and size.

    Args:
        api_url: Base scoring-API URL (files live at ``/files/<task_id>``).
        task_id: Task identifier used to build the download URL.
        item: Question payload dict — assumed schema; keys checked are
            file_name/filename/file/has_file (TODO confirm against API).

    Returns:
        A context string for the prompt, or "" when there is no file or
        the download fails.
    """
    file_name = (
        item.get("file_name")
        or item.get("filename")
        or item.get("file")
        or ""
    )
    if not (file_name or item.get("has_file")):
        return ""

    file_url = f"{api_url}/files/{task_id}"
    print(f"[FILE DOWNLOAD] {file_url}")

    try:
        resp = requests.get(file_url, timeout=60)
        resp.raise_for_status()
        data = resp.content
        name_lower = file_name.lower()

        # Plain-text formats: decode and inline.
        if name_lower.endswith((".txt", ".csv", ".tsv")):
            # Bug fix: decode(errors="replace") never raises, so the old
            # bare-except latin-1 fallback was dead code and is removed.
            text = data.decode("utf-8", errors="replace")
            return f"[FILE TXT]\n{text[:8000]}"

        # Spreadsheets: flatten (first sheet) to CSV via pandas.
        if name_lower.endswith((".xlsx", ".xls", ".xlsm")):
            try:
                df = pd.read_excel(io.BytesIO(data))
                csv_text = df.to_csv(index=False)
                return f"[FILE TABLE CSV]\n{csv_text[:8000]}"
            except Exception as e:
                print("[EXCEL PARSE ERROR]", e)
                return "[FILE] Spreadsheet exists but cannot parse."

        # Unknown binary: report name and size only.
        return f"[FILE BINARY: {file_name}] {len(data)} bytes"

    except Exception as e:
        print("[FILE ERROR]", e)
        return ""
137
+
138
+
139
+ # ================================
140
+ # SISTEMA DE INSTRUÇÕES
141
+ # ================================
142
 
143
# Chat system prompt: demands a bare, exact-match-formatted final answer.
# Note: also embedded at the top of the user prompt by GaiaAgent.build_prompt,
# so the model sees it twice per request.
SYSTEM_INSTRUCTIONS = """
You are a highly accurate GAIA benchmark agent.
Always output ONLY the final answer (EXACT MATCH).
No explanations. No reasoning. No extra words.

Rules:
- If the answer is a number → only the number.
- If format requires 2 decimal places enforce it.
- If a list is required output in exact requested form.
"""
153
 
154
 
155
+ # ================================
156
+ # AGENTE PRINCIPAL
157
+ # ================================
158
+
159
class GaiaAgent:
    """GAIA benchmark agent.

    Wraps a hosted chat model behind ``InferenceClient`` and funnels
    every question through ``web_search`` plus the answer-cleaning
    helpers before returning an exact-match-ready string.
    """

    def __init__(self):
        """Create the inference client; requires an HF_TOKEN secret."""
        print("Initializing GAIA Agent with Qwen 80B...")
        token = os.getenv("HF_TOKEN")
        if not token:
            raise ValueError("Missing HF_TOKEN in Space secrets.")

        self.client = InferenceClient(
            model="Qwen/Qwen3-Next-80B-A3B-Thinking",
            token=token,
        )

    def build_prompt(self, question, search_ctx, file_ctx):
        """Assemble the user prompt from question, file and web context."""
        file_part = file_ctx or 'No file provided.'
        search_part = search_ctx or 'No search results.'
        sections = [
            f"{SYSTEM_INSTRUCTIONS}\n\n",
            f"QUESTION:\n{question}\n\n",
            f"FILE CONTEXT:\n{file_part}\n\n",
            f"WEB SEARCH CONTEXT:\n{search_part}\n\n",
            "Now output ONLY the final answer:\n",
        ]
        return "".join(sections)

    def __call__(self, question: str, file_context: str = "") -> str:
        """Answer one question; returns "" when the model call fails."""
        print("\n====================================================")
        print("NEW QUESTION:")
        print(question)
        print("====================================================\n")

        search_ctx = web_search(question)
        print(f"[SEARCH LEN] {len(search_ctx)} | [FILE LEN] {len(file_context)}")

        prompt = self.build_prompt(question, search_ctx, file_context)

        try:
            reply = self.client.chat_completion(
                messages=[
                    {"role": "system", "content": SYSTEM_INSTRUCTIONS},
                    {"role": "user", "content": prompt},
                ],
                max_tokens=200,
                temperature=0.0,
            )
            raw = reply.choices[0].message["content"]
            print("[RAW OUTPUT]", raw)
        except Exception as e:
            print("ERROR calling chat_completion:", e)
            return ""

        cleaned = clean_answer(raw)
        cleaned = enforce_numeric_format(question, cleaned)

        print("[FINAL ANSWER]", cleaned)
        return cleaned
212
 
213
 
214
+ # ================================
215
+ # PIPELINE DE EXECUÇÃO
216
+ # ================================
217
 
218
def run_and_submit_all(profile: Optional[gr.OAuthProfile]):
    """Fetch all questions, run the agent on each, submit, and report.

    Args:
        profile: Gradio OAuth profile of the logged-in user (None when
            not logged in).

    Returns:
        A ``(status_message, results_dataframe)`` tuple for the Gradio
        outputs; the dataframe is None on early failures.
    """
    if not profile:
        return "Please log in first.", None

    username = profile.username
    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"
    space_id = os.getenv("SPACE_ID")
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    print(f"User logged in: {username}")
    print(f"Agent code URL: {agent_code}")

    try:
        agent = GaiaAgent()
    except Exception as e:
        return f"Error initializing agent: {e}", None

    print("Fetching questions...")
    try:
        resp = requests.get(questions_url, timeout=120)
        resp.raise_for_status()
        questions = resp.json()
    except Exception as e:
        return f"Error fetching questions: {e}", None

    print(f"Fetched {len(questions)} questions.")

    answers_payload = []
    results_log = []

    for item in questions:
        # Bug fix: item["task_id"] / item["question"] raised KeyError on
        # malformed items and aborted the whole run; skip them instead.
        qid = item.get("task_id")
        qtext = item.get("question")
        if not qid or qtext is None:
            print("Skipping item with missing task_id or question:", item)
            continue

        # Bug fix: one crashing question no longer kills the whole batch —
        # submit "" for it and keep going.
        try:
            file_context = get_file_context(api_url, qid, item)
            answer = agent(qtext, file_context)
        except Exception as e:
            print(f"Error running agent on task {qid}:", e)
            answer = ""

        answers_payload.append({"task_id": qid, "submitted_answer": answer})
        results_log.append({"Task ID": qid, "Question": qtext, "Submitted Answer": answer})

    submission = {
        "username": username,
        "agent_code": agent_code,
        "answers": answers_payload,
    }

    print("Submitting answers...")
    try:
        # Bug fix: no timeout could hang the UI forever; 300s is generous
        # for a scoring pass while still bounding the wait.
        resp = requests.post(submit_url, json=submission, timeout=300)
        resp.raise_for_status()
        result = resp.json()

        status = (
            f"Submission Successful!\n"
            f"Score: {result.get('score')}% "
            f"({result.get('correct_count')}/{result.get('total_attempted')})\n"
            f"{result.get('message')}"
        )
        return status, pd.DataFrame(results_log)

    except Exception as e:
        return f"Submission failed: {e}", pd.DataFrame(results_log)
 
 
 
283
 
284
 
285
+ # ================================
286
  # INTERFACE GRADIO
287
+ # ================================
288
 
289
# Minimal Gradio front-end: HF login, one run button, a status textbox
# and a table of the submitted answers.
with gr.Blocks() as demo:
    gr.Markdown("## GAIA Agent Runner – Qwen 80B Enhanced Version")

    gr.LoginButton()

    run_button = gr.Button("Run Evaluation & Submit All Answers")

    out_status = gr.Textbox(label="Status", lines=4)
    out_table = gr.DataFrame(label="Answers")

    # run_and_submit_all takes the OAuth profile implicitly (no inputs=).
    run_button.click(run_and_submit_all, outputs=[out_status, out_table])
 
 
 
300
 
301
 
302
# Entry point: launch the Gradio app (blocking) when run as a script.
if __name__ == "__main__":
    demo.launch(debug=True, share=False)