GilbertoEwaldFilho commited on
Commit
f7efd53
·
verified ·
1 Parent(s): ee8b123

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +247 -228
app.py CHANGED
@@ -1,366 +1,377 @@
1
  import os
2
  import re
3
- import io
4
  import requests
5
  import pandas as pd
6
  import gradio as gr
7
 
 
 
8
  from huggingface_hub import InferenceClient
9
- from duckduckgo_search import DDGS
10
 
11
- # --- Constants ---
 
 
 
 
12
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
13
 
14
 
15
- # =========================================================
16
- # Helper: limpeza de resposta para EXACT MATCH
17
- # =========================================================
 
18
  def clean_answer(text: str) -> str:
19
  """
20
- Limpa a resposta do modelo para bater com EXACT MATCH:
 
21
  - remove quebras de linha
22
- - remove 'final answer', 'answer:', etc
23
  - remove aspas externas
24
  - normaliza espaços
 
25
  """
26
- if not text:
27
  return ""
28
 
29
  text = str(text).strip()
30
 
 
31
  patterns_to_remove = [
32
- r"(?i)final answer[:\- ]*",
33
- r"(?i)answer[:\- ]*",
34
- r"(?i)the answer is[:\- ]*",
35
- r"(?i)my answer is[:\- ]*",
 
36
  ]
37
  for p in patterns_to_remove:
38
  text = re.sub(p, "", text).strip()
39
 
40
- text = text.replace("\n", " ").strip()
 
41
 
 
42
  if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
43
  text = text[1:-1].strip()
44
  if len(text) >= 2 and text.startswith("'") and text.endswith("'"):
45
  text = text[1:-1].strip()
46
 
47
- text = re.sub(r"\s+", " ", text)
48
- return text.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49
 
50
 
51
- # =========================================================
52
- # Tools auxiliares (search + arquivo)
53
- # =========================================================
54
- def web_search(query: str, max_results: int = 6) -> str:
55
  """
56
- Busca no DuckDuckGo e retorna um texto com snippets.
57
- Se der erro, retorna string vazia.
58
  """
 
 
59
  try:
60
- snippets = []
61
  with DDGS() as ddgs:
62
- for r in ddgs.text(query, max_results=max_results):
 
 
 
 
63
  title = r.get("title") or ""
64
  body = r.get("body") or ""
65
  url = r.get("href") or ""
66
- snippets.append(f"Title: {title}\nSnippet: {body}\nURL: {url}")
67
- return "\n\n".join(snippets)[:4000] # corta para não estourar contexto
68
  except Exception as e:
69
- print(f"[SEARCH ERROR] {e}")
70
  return ""
71
 
 
 
72
 
73
- def get_file_context(item: dict) -> str | None:
74
- """
75
- Tenta baixar e ler um arquivo associado à questão.
76
- Supõe que o JSON possa ter um campo 'file_url'.
77
- Se não tiver ou der erro, retorna None.
78
- """
79
- url = (
80
- item.get("file_url")
81
- or item.get("file")
82
- or item.get("attachment_url")
83
- or item.get("attachment")
84
- )
85
-
86
- if not url:
87
- return None
88
-
89
- print(f"Trying to download attachment for task {item.get('task_id')} from: {url}")
90
 
91
- try:
92
- resp = requests.get(url, timeout=20)
93
- resp.raise_for_status()
94
 
95
- content_type = resp.headers.get("content-type", "")
96
- data = resp.content
 
97
 
98
- # XLSX
99
- if url.endswith(".xlsx") or (
100
- "spreadsheetml.sheet" in content_type
101
- ):
102
- try:
103
- df = pd.read_excel(io.BytesIO(data))
104
- csv_preview = df.to_csv(index=False)
105
- return csv_preview[:4000]
106
- except Exception as e:
107
- print(f"[FILE XLSX PARSE ERROR] {e}")
108
- return None
109
-
110
- # CSV / texto
111
- try:
112
- text = resp.text
113
- return text[:4000]
114
- except Exception as e:
115
- print(f"[FILE TEXT PARSE ERROR] {e}")
116
- return None
117
 
118
- except Exception as e:
119
- print(f"[FILE DOWNLOAD ERROR] {e}")
120
- return None
 
 
 
 
 
 
121
 
122
 
123
- # =========================================================
124
- # Basic Agent Definition – sem smolagents, usando só InferenceClient
125
- # =========================================================
126
- class BasicAgent:
127
  """
128
- Agente que:
129
- - usa DuckDuckGo para buscar contexto
130
- - tenta ler arquivo anexo (se o JSON tiver file_url)
131
- - chama Qwen via chat_completion
132
- - devolve apenas a resposta final (EXACT MATCH friendly)
133
  """
134
 
135
  def __init__(self):
136
- print("Initializing GAIA agent with InferenceClient + DuckDuckGo...")
137
 
138
  hf_token = os.getenv("HF_TOKEN")
139
  if not hf_token:
140
  raise ValueError(
141
- "HF_TOKEN not found! Configure um Secret chamado HF_TOKEN em Settings → Variables."
 
142
  )
143
 
144
- # Modelo conversacional (suporta chat_completion)
145
  self.client = InferenceClient(
146
- model="Qwen/Qwen2.5-72B-Instruct",
147
  token=hf_token,
148
  )
149
 
150
- self.system_instructions = (
151
- "You are solving GAIA benchmark questions.\n"
152
- "You may receive web search snippets and/or file contents.\n"
153
- "Use them to answer accurately.\n"
154
- "RULES:\n"
155
- "- Answer ONLY with the final answer.\n"
156
- "- No explanations, no reasoning steps, no justification.\n"
157
- "- Do NOT write 'Final answer', 'Answer:', etc.\n"
158
- "- If the answer is a number, output just the number.\n"
159
- "- Your output will be compared using EXACT MATCH.\n"
160
- )
161
-
162
- def __call__(self, question: str, file_context: str | None = None) -> str:
163
- print(f"\n=== NEW QUESTION ===\n{question}\n")
164
-
165
- # 1) Busca na web
166
- search_context = web_search(question)
167
- print(f"[SEARCH LENGTH] {len(search_context)} chars")
168
 
169
- # 2) Constrói contexto adicional
170
- extra_parts = []
171
  if search_context:
172
- extra_parts.append("Web search results:\n" + search_context)
173
- if file_context:
174
- extra_parts.append("Relevant file content:\n" + file_context)
175
-
176
- extra_context = "\n\n".join(extra_parts)
177
- if len(extra_context) > 6000:
178
- extra_context = extra_context[:6000]
179
-
180
- user_content = question
181
- if extra_context:
182
- user_content += (
183
- "\n\nHere is some external context (web and/or file):\n"
184
- + extra_context
185
- + "\n\nUsing ONLY the necessary information above, "
186
- "answer the question. Remember: reply ONLY with the final answer."
187
  )
188
  else:
189
- user_content += (
190
- "\n\nAnswer the question using your knowledge. "
191
- "Remember: reply ONLY with the final answer."
192
- )
 
 
 
 
 
 
 
193
 
194
- messages = [
195
- {"role": "system", "content": self.system_instructions},
196
- {"role": "user", "content": user_content},
197
- ]
 
 
 
 
 
 
 
 
198
 
 
199
  try:
200
- completion = self.client.chat_completion(
201
- messages=messages,
202
- max_tokens=96,
203
- temperature=0.1,
204
  top_p=0.9,
 
205
  )
 
 
 
 
206
 
207
- choice = completion.choices[0]
208
- msg = choice.message
209
- if isinstance(msg, dict):
210
- raw = msg.get("content", "")
211
- else:
212
- raw = getattr(msg, "content", "")
213
 
214
- print("RAW MODEL OUTPUT:", repr(raw))
215
- final = clean_answer(raw)
216
- print("CLEANED ANSWER:", repr(final))
217
- return final
218
 
219
- except Exception as e:
220
- print("ERROR calling InferenceClient.chat_completion:", e)
221
- return ""
222
 
 
 
 
223
 
224
- # =========================================================
225
- # Runner + submit (quase igual ao template original)
226
- # =========================================================
227
- def run_and_submit_all(profile: gr.OAuthProfile | None):
228
  """
229
- Busca todas as questões, roda o agente em cada uma,
230
- submete as respostas e mostra o resultado.
231
  """
232
- space_id = os.getenv("SPACE_ID")
233
 
 
234
  if profile:
235
- username = f"{profile.username}"
236
  print(f"User logged in: {username}")
237
  else:
238
  print("User not logged in.")
239
  return "Please Login to Hugging Face with the button.", None
240
 
 
 
241
  api_url = DEFAULT_API_URL
242
  questions_url = f"{api_url}/questions"
243
  submit_url = f"{api_url}/submit"
244
 
245
- # 1. Instancia o agente
 
 
 
 
 
 
 
 
246
  try:
247
- agent = BasicAgent()
248
  except Exception as e:
249
- print(f"Error instantiating agent: {e}")
250
  return f"Error initializing agent: {e}", None
251
 
252
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
253
- print(f"Agent code URL: {agent_code}")
254
-
255
- # 2. Busca perguntas
256
  print(f"Fetching questions from: {questions_url}")
257
  try:
258
- response = requests.get(questions_url, timeout=120)
259
- response.raise_for_status()
260
- questions_data = response.json()
261
  if not questions_data:
262
- print("Fetched questions list is empty.")
263
  return "Fetched questions list is empty or invalid format.", None
264
  print(f"Fetched {len(questions_data)} questions.")
265
- except requests.exceptions.RequestException as e:
266
- print(f"Error fetching questions: {e}")
267
- return f"Error fetching questions: {e}", None
268
- except requests.exceptions.JSONDecodeError as e:
269
- print(f"Error decoding JSON response from questions endpoint: {e}")
270
- print(f"Response text: {response.text[:500]}")
271
- return f"Error decoding server response for questions: {e}", None
272
  except Exception as e:
273
- print(f"An unexpected error occurred fetching questions: {e}")
274
- return f"An unexpected error occurred fetching questions: {e}", None
275
 
276
- # 3. Roda o agente
277
  results_log = []
278
  answers_payload = []
279
- print(f"Running agent on {len(questions_data)} questions...")
280
 
 
281
  for item in questions_data:
282
  task_id = item.get("task_id")
283
  question_text = item.get("question")
284
  if not task_id or question_text is None:
285
- print(f"Skipping item with missing task_id or question: {item}")
286
  continue
287
 
288
  try:
289
- file_context = get_file_context(item)
290
- submitted_answer = agent(question_text, file_context=file_context)
291
-
292
- answers_payload.append(
293
- {"task_id": task_id, "submitted_answer": submitted_answer}
294
- )
295
- results_log.append(
296
- {
297
- "Task ID": task_id,
298
- "Question": question_text,
299
- "Submitted Answer": submitted_answer,
300
- }
301
- )
302
  except Exception as e:
303
- print(f"Error running agent on task {task_id}: {e}")
304
- results_log.append(
305
- {
306
- "Task ID": task_id,
307
- "Question": question_text,
308
- "Submitted Answer": f"AGENT ERROR: {e}",
309
- }
310
- )
 
 
 
 
 
311
 
312
  if not answers_payload:
313
  print("Agent did not produce any answers to submit.")
314
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
315
 
316
- # 4. Monta submissão
317
  submission_data = {
318
  "username": username.strip(),
319
  "agent_code": agent_code,
320
  "answers": answers_payload,
321
  }
322
- status_update = (
 
323
  f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
324
  )
325
- print(status_update)
326
 
327
- # 5. Submete
328
- print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
329
  try:
330
- response = requests.post(submit_url, json=submission_data, timeout=120)
331
- response.raise_for_status()
332
- result_data = response.json()
 
333
  final_status = (
334
  f"Submission Successful!\n"
335
  f"User: {result_data.get('username')}\n"
336
  f"Overall Score: {result_data.get('score', 'N/A')}% "
337
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
 
338
  f"Message: {result_data.get('message', 'No message received.')}"
339
  )
 
340
  print("Submission successful.")
341
  results_df = pd.DataFrame(results_log)
342
  return final_status, results_df
 
343
  except requests.exceptions.HTTPError as e:
344
  error_detail = f"Server responded with status {e.response.status_code}."
345
  try:
346
  error_json = e.response.json()
347
  error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
348
- except requests.exceptions.JSONDecodeError:
349
  error_detail += f" Response: {e.response.text[:500]}"
350
  status_message = f"Submission Failed: {error_detail}"
351
  print(status_message)
352
  results_df = pd.DataFrame(results_log)
353
  return status_message, results_df
354
- except requests.exceptions.Timeout:
355
- status_message = "Submission Failed: The request timed out."
356
- print(status_message)
357
- results_df = pd.DataFrame(results_log)
358
- return status_message, results_df
359
  except requests.exceptions.RequestException as e:
360
  status_message = f"Submission Failed: Network error - {e}"
361
  print(status_message)
362
  results_df = pd.DataFrame(results_log)
363
  return status_message, results_df
 
364
  except Exception as e:
365
  status_message = f"An unexpected error occurred during submission: {e}"
366
  print(status_message)
@@ -368,23 +379,28 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
368
  return status_message, results_df
369
 
370
 
371
- # =========================================================
372
- # Interface Gradio (igual ao template, com texto atualizado)
373
- # =========================================================
374
- with gr.Blocks() as demo:
375
- gr.Markdown("# GAIA Agent Evaluation Runner (Custom Qwen + DuckDuckGo)")
376
 
 
 
377
  gr.Markdown(
378
  """
379
- **How to use:**
380
- 1. Log in to your Hugging Face account using the button below.
381
- 2. Click **'Run Evaluation & Submit All Answers'**.
382
- 3. The agent will:
383
- - fetch all questions,
384
- - optionally download attached files (if any),
385
- - perform web search,
386
- - answer each question with ONLY the final answer (EXACT MATCH friendly),
387
- - submit the answers to the scoring API.
 
 
 
 
 
388
  """
389
  )
390
 
@@ -393,8 +409,11 @@ with gr.Blocks() as demo:
393
  run_button = gr.Button("Run Evaluation & Submit All Answers")
394
 
395
  status_output = gr.Textbox(
396
- label="Run Status / Submission Result", lines=5, interactive=False
 
 
397
  )
 
398
  results_table = gr.DataFrame(
399
  label="Questions and Agent Answers",
400
  wrap=True,
@@ -415,7 +434,7 @@ if __name__ == "__main__":
415
  print(f"✅ SPACE_HOST found: {space_host_startup}")
416
  print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
417
  else:
418
- print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
419
 
420
  if space_id_startup:
421
  print(f"✅ SPACE_ID found: {space_id_startup}")
@@ -424,7 +443,7 @@ if __name__ == "__main__":
424
  f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main"
425
  )
426
  else:
427
- print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
428
 
429
  print("-" * (60 + len(" App Starting ")) + "\n")
430
 
 
1
  import os
2
  import re
 
3
  import requests
4
  import pandas as pd
5
  import gradio as gr
6
 
7
+ from typing import Optional, List
8
+ from ddgs import DDGS # pip install ddgs
9
  from huggingface_hub import InferenceClient
 
10
 
11
+
12
+ # ============================
13
+ # CONSTANTES DA AVALIAÇÃO
14
+ # ============================
15
+
16
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
17
 
18
 
19
+ # ============================
20
+ # FUNÇÕES AUXILIARES
21
+ # ============================
22
+
23
def clean_answer(text: str) -> str:
    """
    Normalize a model response for EXACT MATCH scoring.

    Steps:
    - strip leading prefixes such as 'Final answer:', 'Answer:', etc.
    - remove line breaks
    - strip one pair of outer quotes
    - collapse repeated whitespace
    - drop a lone trailing period

    Args:
        text: raw model output (may be None or any non-string value).

    Returns:
        The cleaned answer string ("" when input is None).
    """
    if text is None:
        return ""

    text = str(text).strip()

    # Strip prefixes like "Final answer:", "Answer is", etc.
    # BUG FIX: the \b word boundary prevents mangling answers that merely
    # START with these words (e.g. "Answering machine" must NOT become
    # "ing machine", which the old zero-width [:\- ]* patterns allowed).
    patterns_to_remove = [
        r"(?i)^final answer\b[:\- ]*",
        r"(?i)^answer\b[:\- ]*",
        r"(?i)^the answer is\b[:\- ]*",
        r"(?i)^my answer is\b[:\- ]*",
        r"(?i)^resposta\b[:\- ]*",
    ]
    for p in patterns_to_remove:
        text = re.sub(p, "", text).strip()

    # Remove line breaks (answers are compared as a single line).
    text = text.replace("\n", " ").replace("\r", " ").strip()

    # Strip one pair of matching outer quotes.
    if len(text) >= 2 and text.startswith('"') and text.endswith('"'):
        text = text[1:-1].strip()
    if len(text) >= 2 and text.startswith("'") and text.endswith("'"):
        text = text[1:-1].strip()

    # Collapse runs of whitespace.
    text = re.sub(r"\s+", " ", text).strip()

    # Drop a lone trailing period (e.g. "42." -> "42"); keep it when the
    # text already ends in sentence punctuation before the final dot.
    if text.endswith(".") and not re.search(r"[0-9A-Za-z][.!?]$", text[:-1]):
        text = text[:-1].strip()

    return text
66
+
67
+
68
def enforce_numeric_format(question: str, answer: str) -> str:
    """
    Post-process an answer for questions that ask for a specific numeric
    format (two decimal places, or a bare integer count/year).

    Args:
        question: the original question text (used to detect format hints).
        answer: the cleaned model answer.

    Returns:
        The reformatted number as a string, or ``answer`` unchanged when no
        numeric formatting rule applies.
    """
    q = question.lower()

    # "two decimal places": extract the first number and format it.
    # BUG FIX: the first alternative accepts thousands separators so
    # "1,234.56" keeps its fractional part (the old pattern
    # [-+]?\d+(?:[.,]\d+)? stopped at the comma and yielded "1234").
    # Plain comma-decimal inputs like "3,14" fall through to the second
    # alternative and behave exactly as before.
    if "two decimal places" in q or "2 decimal places" in q:
        match = re.search(
            r"[-+]?\d{1,3}(?:,\d{3})+(?:\.\d+)?|[-+]?\d+(?:[.,]\d+)?",
            answer,
        )
        if match:
            num = match.group(0).replace(",", "")
            try:
                value = float(num)
                return f"{value:.2f}"
            except ValueError:
                pass

    # Questions that clearly expect a bare integer (counts, years, ...).
    if any(
        kw in q
        for kw in [
            "how many",
            "at bats",
            "number of",
            "population",
            "what year",
            "in which year",
        ]
    ):
        match = re.search(r"-?\d+", answer.replace(",", ""))
        if match:
            return match.group(0)

    # Otherwise return the answer as it came.
    return answer
105
 
106
 
107
def web_search(question: str, max_results: int = 5) -> str:
    """
    Search DuckDuckGo (via the ``ddgs`` package) for web context.

    Args:
        question: the query string.
        max_results: maximum number of results to request.

    Returns:
        Title + snippet + URL blocks separated by "---", truncated to 8000
        characters, or "" when nothing could be retrieved.
    """
    snippets: List[str] = []

    try:
        with DDGS() as ddgs:
            for r in ddgs.text(
                question,
                max_results=max_results,
                safesearch="moderate",
            ):
                title = r.get("title") or ""
                body = r.get("body") or ""
                url = r.get("href") or ""
                snippet = f"{title}\n{body}\nURL: {url}"
                snippets.append(snippet)
    except Exception as e:
        # BUG FIX: best-effort search — log the failure but keep whatever
        # snippets were collected before it, instead of discarding them
        # with an unconditional `return ""`.
        print("[WEB SEARCH ERROR]", e)

    if not snippets:
        return ""

    joined = "\n\n---\n\n".join(snippets)
    # Truncate so the prompt context stays bounded.
    return joined[:8000]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
 
 
 
137
 
138
# ============================
# MAIN AGENT
# ============================

# Prompt prefix sent with every question. It is a runtime string consumed by
# the model, so its exact wording matters — do not reformat casually.
SYSTEM_INSTRUCTIONS = """
You are a highly accurate AI assistant solving GAIA benchmark questions.
You MUST provide answers suitable for EXACT MATCH evaluation.

GENERAL RULES:
- Think step by step, but DO NOT show your reasoning.
- Output ONLY the final answer string.
- Do NOT include explanations, reasoning, or extra words.
- Do NOT write things like "Final answer:", "Answer is", etc.
- If the answer is a number, output only the number (no units unless explicitly requested).
- If the answer is a list, output it exactly as requested (e.g., comma-separated, alphabetical order, etc.).
- Respect the requested formatting (e.g., two decimal places, upper/lowercase if clearly required).
"""
155
 
156
 
157
class GaiaAgent:
    """
    Agent designed to maximize the exact-match hit rate:
    - uses an open-source model via InferenceClient (free route)
    - runs a ddgs web search for every question
    - post-processes the output (numbers, two decimal places, etc.)
    """

    def __init__(self):
        """Create the InferenceClient.

        Raises:
            ValueError: when the HF_TOKEN environment variable is missing.
        """
        print("Initializing GAIA Agent...")

        hf_token = os.getenv("HF_TOKEN")
        if not hf_token:
            raise ValueError(
                "HF_TOKEN não encontrado! "
                "Crie um Secret chamado HF_TOKEN em Settings → Variables."
            )

        # Strong open-source model (swap here to experiment with others).
        self.client = InferenceClient(
            model="mistralai/Mistral-7B-Instruct-v0.2",
            token=hf_token,
        )

    def build_prompt(self, question: str, search_context: str) -> str:
        """
        Build the full prompt for the model.

        Args:
            question: the benchmark question.
            search_context: concatenated web snippets ("" when unavailable).

        Returns:
            The complete prompt string ending in "Answer:".
        """
        base = SYSTEM_INSTRUCTIONS.strip()

        if search_context:
            ctx = (
                "Here are web search results that may be relevant. "
                "They can be noisy, so you must reason carefully and ignore incorrect info.\n\n"
                f"{search_context}"
            )
        else:
            ctx = "No external web search results are available for this question."

        prompt = (
            f"{base}\n\n"
            f"QUESTION:\n{question}\n\n"
            f"{ctx}\n\n"
            "Now, based on all the above, provide ONLY the final answer.\n"
            "Remember: no explanation, only the final answer string.\n"
            "Answer:"
        )
        return prompt

    def __call__(self, question: str) -> str:
        """Answer one question: web search -> prompt -> model -> clean/format."""
        print("\n" + "=" * 60)
        print("NEW QUESTION:")
        print(question)
        print("=" * 60 + "\n")

        # 1. Web search for external context.
        search_ctx = web_search(question, max_results=5)
        print(f"[SEARCH CONTEXT LENGTH] {len(search_ctx)} chars")

        # 2. Build the prompt.
        prompt = self.build_prompt(question, search_ctx)

        # 3. Call the model.
        try:
            # BUG FIX: temperature=0.0 is rejected by TGI backends
            # ("temperature must be strictly positive"), which made every
            # call fail and return "". do_sample=False is the supported way
            # to request the intended deterministic (greedy) decoding.
            raw = self.client.text_generation(
                prompt,
                max_new_tokens=160,
                do_sample=False,
                repetition_penalty=1.05,
            )
            print("[RAW MODEL OUTPUT]", repr(raw))
        except Exception as e:
            print("ERROR calling InferenceClient.text_generation:", e)
            return ""

        # 4. Cleaning + numeric post-processing.
        answer = clean_answer(raw)
        answer = enforce_numeric_format(question, answer)

        print("[FINAL CLEANED ANSWER]", repr(answer))
        return answer
 
 
239
 
 
 
 
240
 
241
+ # ============================
242
+ # PIPELINE: RODAR E SUBMETER
243
+ # ============================
244
 
245
def run_and_submit_all(profile: Optional[gr.OAuthProfile]):
    """
    Fetch all questions, run the agent on each, submit the answers and
    show the result.

    Args:
        profile: the logged-in Hugging Face OAuth profile, or None.

    Returns:
        A (status message, DataFrame of per-question results) pair; the
        DataFrame is None when the run aborts before processing questions.
    """
    # --- HF user (for the leaderboard)
    if profile:
        username = profile.username
        print(f"User logged in: {username}")
    else:
        print("User not logged in.")
        return "Please Login to Hugging Face with the button.", None

    # --- scoring API URLs
    space_id = os.getenv("SPACE_ID")
    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # Link to this Space's code (the Space must be public).
    if space_id:
        agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
    else:
        agent_code = ""

    print(f"Agent code URL: {agent_code}")

    # 1) Instantiate the agent.
    try:
        agent = GaiaAgent()
    except Exception as e:
        print("Error instantiating agent:", e)
        return f"Error initializing agent: {e}", None

    # 2) Fetch the questions.
    print(f"Fetching questions from: {questions_url}")
    try:
        resp = requests.get(questions_url, timeout=120)
        resp.raise_for_status()
        questions_data = resp.json()
        if not questions_data:
            print("Fetched questions list is empty or invalid.")
            return "Fetched questions list is empty or invalid format.", None
        print(f"Fetched {len(questions_data)} questions.")
    except Exception as e:
        print("Error fetching questions:", e)
        return f"Error fetching questions: {e}", None

    # 3) Run the agent on every question.
    results_log = []
    answers_payload = []

    print(f"Running agent on {len(questions_data)} questions...")
    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            print("Skipping item with missing task_id or question:", item)
            continue

        try:
            submitted_answer = agent(question_text)
        except Exception as e:
            # A per-question failure must not abort the whole run.
            print(f"Error running agent on task {task_id}:", e)
            submitted_answer = ""

        answers_payload.append(
            {"task_id": task_id, "submitted_answer": submitted_answer}
        )
        results_log.append(
            {
                "Task ID": task_id,
                "Question": question_text,
                "Submitted Answer": submitted_answer,
            }
        )

    if not answers_payload:
        print("Agent did not produce any answers to submit.")
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    # 4) Prepare the submission payload.
    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload,
    }

    print(
        f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
    )
    print(f"Submitting to: {submit_url}")

    # 5) Submit. BUG FIX: the previous version sent this POST with no
    # timeout at all, so a stalled server would hang the app forever. A
    # generous 10-minute timeout keeps slow scoring runs alive while still
    # bounding the wait.
    try:
        resp = requests.post(submit_url, json=submission_data, timeout=600)
        resp.raise_for_status()
        result_data = resp.json()

        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/"
            f"{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )

        print("Submission successful.")
        results_df = pd.DataFrame(results_log)
        return final_status, results_df

    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_json = e.response.json()
            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
        except Exception:
            error_detail += f" Response: {e.response.text[:500]}"
        status_message = f"Submission Failed: {error_detail}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df

    except requests.exceptions.RequestException as e:
        status_message = f"Submission Failed: Network error - {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df

    except Exception as e:
        status_message = f"An unexpected error occurred during submission: {e}"
        print(status_message)
        results_df = pd.DataFrame(results_log)
        return status_message, results_df
380
 
381
 
382
+ # ============================
383
+ # INTERFACE GRADIO
384
+ # ============================
 
 
385
 
386
+ with gr.Blocks() as demo:
387
+ gr.Markdown("# GAIA Agent Evaluation Runner (improved)")
388
  gr.Markdown(
389
  """
390
+ **Como usar**
391
+
392
+ 1. Faça login com sua conta Hugging Face no botão abaixo.
393
+ 2. Certifique-se de que este Space está público e tem um Secret `HF_TOKEN`
394
+ com permissão de Inference.
395
+ 3. Clique em **"Run Evaluation & Submit All Answers"**.
396
+ 4. Aguarde o agente responder às 20 questões e enviar ao servidor de scoring.
397
+
398
+ **Notas**
399
+
400
+ - O agente usa web search (DuckDuckGo) e um modelo open-source forte
401
+ via InferenceClient.
402
+ - A saída é cuidadosamente pós-processada para tentar maximizar o
403
+ acerto em EXACT MATCH (números, duas casas decimais, etc.).
404
  """
405
  )
406
 
 
409
  run_button = gr.Button("Run Evaluation & Submit All Answers")
410
 
411
  status_output = gr.Textbox(
412
+ label="Run Status / Submission Result",
413
+ lines=5,
414
+ interactive=False,
415
  )
416
+
417
  results_table = gr.DataFrame(
418
  label="Questions and Agent Answers",
419
  wrap=True,
 
434
  print(f"✅ SPACE_HOST found: {space_host_startup}")
435
  print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
436
  else:
437
+ print("ℹ️ SPACE_HOST not found (talvez rodando localmente).")
438
 
439
  if space_id_startup:
440
  print(f"✅ SPACE_ID found: {space_id_startup}")
 
443
  f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main"
444
  )
445
  else:
446
+ print("ℹ️ SPACE_ID not found. Repo URL cannot be determined.")
447
 
448
  print("-" * (60 + len(" App Starting ")) + "\n")
449