johnnychiang committed on
Commit
6051f37
·
verified ·
1 Parent(s): dd67f62

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +439 -125
app.py CHANGED
@@ -1,178 +1,492 @@
1
  import os
2
- import gradio as gr
 
 
3
  import requests
4
  import pandas as pd
5
- import re
6
- from huggingface_hub import InferenceClient
 
 
 
 
 
 
 
 
7
 
8
- # ===============================
9
- # Constants
10
- # ===============================
11
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
12
 
13
- # ===============================
14
- # Basic Agent (PASS VERSION)
15
- # ===============================
16
class BasicAgent:
    """
    Minimal GAIA Level-1 agent.
    Target: >=30% exact match

    Sends each question to a hosted model through ``InferenceClient`` and
    post-processes the reply into a bare answer string.
    """

    def __init__(self):
        """Read credentials/model id from the environment and build the client.

        Raises:
            RuntimeError: if neither HF_TOKEN nor HUGGINGFACEHUB_API_TOKEN is set.
        """
        print("BasicAgent initialized (PASS MODE).")

        # Must be configured under Space -> Settings -> Secrets.
        self.hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
        if not self.hf_token:
            raise RuntimeError("HF_TOKEN missing. Set it in Space Settings → Secrets.")

        # Model id (can be overridden through the MODEL_ID variable).
        self.model_id = os.getenv("MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")

        # Correct usage: do not pass base_url here.
        self.client = InferenceClient(
            model=self.model_id,
            token=self.hf_token,
            timeout=120,
        )

        self.system = (
            "You answer questions with EXACT MATCH.\n"
            "Return ONLY the final answer.\n"
            "No explanation.\n"
            "No extra words.\n"
            "No punctuation unless required.\n"
            "No quotes.\n"
        )

    def _sanitize(self, text: str) -> str:
        """Reduce a raw model reply to a bare answer.

        Keeps the last non-empty line, removes a leading "Answer:" or
        "Final answer -" label, strips surrounding quotes and one trailing
        punctuation mark. Returns "" for empty input.
        """
        if not text:
            return ""

        lines = [ln.strip() for ln in str(text).splitlines() if ln.strip()]
        if not lines:
            return ""
        t = lines[-1]

        # Only strip a *label* at the start of the line. Removing the word
        # "answer" anywhere would corrupt replies such as "answering machine".
        t = re.sub(r"(?i)^\s*(?:final\s+)?answer\s*[:\-]+\s*", "", t)

        t = t.strip().strip('"').strip("'")
        t = re.sub(r"[.,;:!?]$", "", t)
        return t

    def __call__(self, question: str) -> str:
        """Answer one question; falls back to chat completion if text generation fails."""
        print(f"Q: {question[:60]}")

        prompt = f"{self.system}\nQuestion: {question}\nAnswer:"

        try:
            out = self.client.text_generation(
                prompt,
                max_new_tokens=64,
                temperature=0.0,
                do_sample=False,
                return_full_text=False,
            )
        except Exception as e:
            # Log why the first attempt failed, then retry via the chat endpoint.
            print(f"text_generation failed, falling back to chat_completion: {e}")
            out = self.client.chat_completion(
                messages=[
                    {"role": "system", "content": self.system},
                    {"role": "user", "content": question},
                ],
                max_tokens=64,
                temperature=0.0,
            ).choices[0].message.content

        ans = self._sanitize(out)
        print(f"A: {ans}")
        return ans
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
 
94
 
95
- # ===============================
96
- # Run & Submit
97
- # ===============================
98
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Fetch all questions, answer them with BasicAgent, and submit the answers.

    Args:
        profile: the logged-in Hugging Face profile, or None when not logged in.

    Returns:
        A ``(status_message, DataFrame | None)`` pair for the two Gradio outputs.
        Failures are reported through the status message instead of raising.
    """
    space_id = os.getenv("SPACE_ID")

    if not profile:
        return "Please login with Hugging Face.", None

    username = profile.username
    print(f"User: {username}")

    questions_url = f"{DEFAULT_API_URL}/questions"
    submit_url = f"{DEFAULT_API_URL}/submit"

    try:
        agent = BasicAgent()
    except Exception as e:
        return f"Agent init error: {e}", None

    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    # A network or decoding failure must surface as a status message, not as
    # an unhandled exception inside the Gradio callback.
    try:
        resp = requests.get(questions_url, timeout=20)
        resp.raise_for_status()
        questions = resp.json()
    except Exception as e:
        return f"Error fetching questions: {e}", None

    answers_payload = []
    log_rows = []

    for q in questions or []:
        if not isinstance(q, dict):
            continue
        task_id = q.get("task_id")
        question = q.get("question")
        if not task_id or question is None:
            continue

        try:
            ans = agent(question)
        except Exception as e:
            # Best effort: submit an empty answer for this task and keep going.
            print(f"Agent failed on task {task_id}: {e}")
            ans = ""

        answers_payload.append({
            "task_id": task_id,
            "submitted_answer": ans,
        })
        log_rows.append({
            "Task ID": task_id,
            "Question": question,
            "Submitted Answer": ans,
        })

    if not answers_payload:
        return "No answers were produced; nothing submitted.", pd.DataFrame(log_rows)

    submission = {
        "username": username,
        "agent_code": agent_code,
        "answers": answers_payload,
    }

    try:
        resp = requests.post(submit_url, json=submission, timeout=60)
        resp.raise_for_status()
        result = resp.json()
    except Exception as e:
        return f"Submission Failed: {e}", pd.DataFrame(log_rows)

    status = (
        f"Submission Successful!\n"
        f"User: {result.get('username')}\n"
        f"Score: {result.get('score')}% "
        f"({result.get('correct_count')}/{result.get('total_attempted')})\n"
        f"{result.get('message')}"
    )

    return status, pd.DataFrame(log_rows)
 
163
 
164
 
165
- # ===============================
166
- # Gradio UI
167
- # ===============================
168
# Gradio UI: a login button, a run button, and two outputs (status + table).
with gr.Blocks() as demo:
    gr.Markdown("# Basic Agent Evaluation Runner (PASS MODE)")

    gr.LoginButton()
    run_btn = gr.Button("Run Evaluation & Submit All Answers")
    status = gr.Textbox(label="Result", lines=6)
    table = gr.DataFrame(label="Answers", wrap=True)

    # The callback returns (status text, answers DataFrame) in this order.
    run_btn.click(fn=run_and_submit_all, outputs=[status, table])


if __name__ == "__main__":
    demo.launch()
 
1
  import os
2
+ import re
3
+ import json
4
+ import math
5
  import requests
6
  import pandas as pd
7
+ import gradio as gr
8
+
9
+ from bs4 import BeautifulSoup
10
+ from sympy import sympify
11
+ from pint import UnitRegistry
12
+
13
+ try:
14
+ from huggingface_hub import InferenceClient
15
+ except Exception:
16
+ InferenceClient = None
17
 
 
 
 
18
# Scoring service: serves the questions and task files, receives submissions.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# Public endpoints used by the tool helpers below.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"
HF_API_BASE = "https://huggingface.co/api"
OPEN_METEO = "https://api.open-meteo.com/v1/forecast"

# Shared pint registry; Q is the quantity constructor used for unit conversion.
ureg = UnitRegistry()
Q = ureg.Quantity
26
 
 
 
 
 
27
 
28
def http_get(url, timeout=20, headers=None, params=None):
    """Perform a GET request and return the response object.

    Args:
        url: target URL.
        timeout: request timeout in seconds.
        headers: optional extra headers. They are merged over the default
            User-Agent, so a caller that only sets ``Accept`` still sends a
            User-Agent; a caller-supplied ``User-Agent`` wins.
        params: optional query-string parameters.

    Raises:
        requests.HTTPError: for 4xx/5xx responses (via ``raise_for_status``).
    """
    merged_headers = {
        "User-Agent": "Mozilla/5.0 (compatible; GAIA-Agent/1.0; +https://huggingface.co)"
    }
    if headers:
        merged_headers.update(headers)
    r = requests.get(url, timeout=timeout, headers=merged_headers, params=params)
    r.raise_for_status()
    return r
35
 
 
 
 
 
 
 
36
 
37
def wikidata_query(sparql: str):
    """Send a SPARQL query to the Wikidata endpoint and return the decoded JSON."""
    query_params = {"format": "json", "query": sparql}
    accept_header = {"Accept": "application/sparql-results+json"}
    response = http_get(WIKIDATA_SPARQL, params=query_params, headers=accept_header)
    return response.json()
 
44
 
 
 
 
45
 
46
def clean_answer(s: str) -> str:
    """Normalize raw tool/model output into a bare answer string.

    Steps, in order: drop "FINAL ANSWER" labels, unwrap code fences, keep
    the last non-empty line, strip surrounding quotes, collapse whitespace.

    Args:
        s: raw text; ``None`` is accepted and yields "".

    Returns:
        The cleaned single-line answer (possibly empty).
    """
    if s is None:
        return ""
    s = str(s).strip()

    # remove FINAL ANSWER patterns
    s = re.sub(r"(?i)\bFINAL\s*ANSWER\b\s*[:\-]*\s*", "", s).strip()

    # Unwrap markdown code fences but keep what is inside them: an answer
    # wrapped in ``` must not be thrown away. A language tag is only
    # recognized when it is followed by a newline, so inline ```42``` works.
    s = re.sub(r"```(?:[A-Za-z0-9_+-]*\n)?(.*?)```", r"\1", s, flags=re.S)
    # Drop any unmatched fence marker left over.
    s = s.replace("```", "").strip()

    # keep last non-empty line (common for model outputs)
    lines = [ln.strip() for ln in s.splitlines() if ln.strip()]
    if lines:
        s = lines[-1]

    # strip quotes
    s = s.strip().strip('"').strip("'").strip()

    # collapse spaces
    s = re.sub(r"\s+", " ", s).strip()
    return s
68
+
69
+
70
def looks_like_math(q: str) -> bool:
    """Crude check: True when *q* has at least one digit and one operator-like character."""
    has_digit = re.search(r"\d", q) is not None
    has_operator = re.search(r"[+\-*/^=()]", q) is not None
    return has_digit and has_operator
73
+
74
+
75
def try_solve_math(q: str):
    """Extract an arithmetic expression from *q* and evaluate it.

    Every maximal run of digits, operators, parentheses, dots and spaces is
    a candidate; the longest one that has an operator between two digits is
    evaluated with ``sympify``.

    Returns:
        The result as a string (an integer string when the value is within
        1e-10 of an integer), or None when nothing evaluable is found.
    """
    candidates = []
    for run in re.findall(r"[-+*/^().\d\s]+", q):
        # A sentence-final period is not part of the expression.
        expr = run.strip().rstrip(".").strip()
        if len(expr) < 3:
            continue
        # Require digit .. operator .. digit; a lone number or a stray space
        # (which the first regex also matches) is not an expression.
        if re.search(r"\d[^\d]*[-+*/^][^\d]*\d", expr):
            candidates.append(expr)
    if not candidates:
        return None

    expr = max(candidates, key=len).replace("^", "**")
    try:
        # NOTE(review): sympify evaluates its input; it is only reached with
        # the restricted character set matched above (no names or calls).
        # Converting to float avoids SymPy's zero-padded Float printing.
        value = float(sympify(expr).evalf())
    except Exception:
        return None
    if not math.isfinite(value):
        return None

    # if near int, output int (round, so 2.9999999999 is recognized too)
    nearest = round(value)
    if abs(value - nearest) < 1e-10:
        return str(nearest)
    return str(value)
95
+
96
+
97
def try_unit_convert(q: str):
    """Handle questions of the form "Convert <number> <unit> to <unit>".

    e.g., "Convert 5 miles to km"

    Returns:
        The converted magnitude as a string without unit text (an integer
        string when within 1e-10 of an integer), or None when the question
        does not match or the conversion fails.
    """
    # match "convert <num> <unit> to <unit>"
    m = re.search(r"(?i)\bconvert\s+([-+]?\d+(?:\.\d+)?)\s*([a-zA-Z°]+)\s+to\s+([a-zA-Z°]+)\b", q)
    if not m:
        return None
    amount = float(m.group(1))
    source_unit = m.group(2)
    target_unit = m.group(3)
    try:
        mag = float(Q(amount, source_unit).to(target_unit).magnitude)
    except Exception:
        # Unknown unit, incompatible dimensions, etc.
        return None
    if not math.isfinite(mag):
        return None

    # Compare against the nearest integer; truncating with int() would miss
    # values that land just below an integer (e.g. 2.9999999999999996).
    nearest = round(mag)
    if abs(mag - nearest) < 1e-10:
        return str(nearest)
    # Round away binary float noise such as 8.046720000000001.
    return str(round(mag, 10))
119
+
120
+
121
def ddg_search_snippet(query: str, max_results=5):
    """Scrape the DuckDuckGo HTML results page (no paid key).

    Looks at the first *max_results* ``.result`` elements and returns a list
    of ``(title, url, snippet)`` tuples for those that contain a result link;
    the snippet is "" when the element has none.
    """
    response = http_get("https://duckduckgo.com/html/", params={"q": query}, timeout=20)
    page = BeautifulSoup(response.text, "lxml")

    hits = []
    for block in page.select(".result")[:max_results]:
        anchor = block.select_one(".result__a")
        if not anchor:
            continue
        snippet_node = block.select_one(".result__snippet")
        snippet_text = snippet_node.get_text(" ", strip=True) if snippet_node else ""
        hits.append((anchor.get_text(" ", strip=True), anchor.get("href"), snippet_text))
    return hits
139
+
140
+
141
def hf_model_info(model_id: str):
    """Fetch the Hugging Face Hub API record for *model_id* and return the decoded JSON."""
    endpoint = f"{HF_API_BASE}/models/{model_id}"
    response = http_get(endpoint, timeout=20)
    return response.json()
144
+
145
+
146
def hf_search_models(query: str, limit=5):
    """Search models on the Hugging Face Hub API and return the decoded JSON."""
    search_params = {"search": query, "limit": limit}
    response = http_get(f"{HF_API_BASE}/models", params=search_params, timeout=20)
    return response.json()
149
+
150
+
151
def open_meteo_weather(city: str):
    """Return the current 2 m temperature for *city* as a string, or None.

    The city is geocoded with the Open-Meteo geocoding API (first hit only),
    then the forecast API is queried for current conditions. Only the
    temperature is returned; whole numbers are printed without a decimal.

    Like the other lookup helpers this is best effort: any network, HTTP or
    decoding failure is logged and reported as None instead of raising.
    """
    try:
        # naive: use geocoding via Open-Meteo geocoding
        geo = http_get(
            "https://geocoding-api.open-meteo.com/v1/search",
            params={"name": city, "count": 1, "language": "en", "format": "json"},
            timeout=20,
        ).json()
        places = geo.get("results")
        if not places:
            return None
        lat = places[0]["latitude"]
        lon = places[0]["longitude"]

        data = http_get(
            OPEN_METEO,
            params={
                "latitude": lat,
                "longitude": lon,
                "current": "temperature_2m,weather_code,wind_speed_10m",
            },
            timeout=20,
        ).json()
        t = data.get("current", {}).get("temperature_2m")
    except Exception as e:
        print("Open-Meteo lookup failed:", e)
        return None

    # The field can be missing or null; only format real numbers.
    if not isinstance(t, (int, float)):
        return None
    if abs(t - int(t)) < 1e-10:
        return str(int(t))
    return str(t)
180
+
181
+
182
def wikidata_simple_lookup(entity: str, prop: str):
    """Fetch one property value for an entity identified by its English label.

    Args:
        entity: exact English label of the item (e.g. "France").
        prop: one of 'capital', 'population', 'area', 'birth', 'death',
            'country', 'founder', 'headquarters'.

    Returns:
        The cleaned value label, or None when the property name is unknown,
        the entity is empty, nothing matches, or the query fails.
    """
    prop_map = {
        "capital": "P36",
        "population": "P1082",
        "area": "P2046",
        "birth": "P569",
        "death": "P570",
        "country": "P17",
        "founder": "P112",
        "headquarters": "P159",
    }
    pid = prop_map.get(prop)
    if not pid or not entity or not entity.strip():
        return None

    # Escape the label so a quote or backslash cannot break out of the
    # SPARQL string literal.
    label = " ".join(entity.split()).replace("\\", "\\\\").replace('"', '\\"')

    # The property triple is required (not OPTIONAL): with LIMIT 1 an
    # optional match could select an unrelated item that merely shares the
    # label and has no such property, which produced an empty answer.
    sparql = f"""
    SELECT ?valueLabel WHERE {{
      ?item rdfs:label "{label}"@en ;
            wdt:{pid} ?value .
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """
    try:
        data = wikidata_query(sparql)
        bindings = data.get("results", {}).get("bindings", [])
        if not bindings:
            return None
        v = bindings[0].get("valueLabel", {}).get("value")
        return clean_answer(v) or None
    except Exception:
        return None
220
+
221
+
222
def download_task_file(task_id: str, save_dir="/tmp"):
    """Download the file attached to a task and return its local path.

    The file name is taken from the Content-Disposition header when present,
    otherwise ``<task_id>.bin`` is used.

    Returns:
        The path written under *save_dir*, or None on any failure (the
        request raising for an HTTP error status is one such failure).
    """
    url = f"{DEFAULT_API_URL}/files/{task_id}"
    try:
        r = http_get(url, timeout=30)
        # try detect filename from headers
        fname = f"{task_id}.bin"
        cd = r.headers.get("content-disposition", "")
        # Stop at ';' so trailing header parameters are not captured.
        m = re.search(r'filename="?([^";]+)"?', cd)
        if m:
            fname = m.group(1).strip()
        # The name comes from the server: keep only its final path component
        # so it cannot escape save_dir ("../x", absolute paths, backslashes).
        fname = os.path.basename(fname.replace("\\", "/"))
        if not fname or fname in (".", ".."):
            fname = "task_file.bin"
        path = os.path.join(save_dir, fname)
        with open(path, "wb") as f:
            f.write(r.content)
        return path
    except Exception:
        # Best effort by design: callers treat None as "no file available".
        return None
238
+
239
+
240
+ class ToolFirstAgent:
241
+ """
242
+ Tool-first agent for GAIA Level-1 exact-match scoring.
243
+ Designed to work WITHOUT paid models.
244
+ Optional fallback to a free small model if HF_TOKEN is set.
245
+ """
246
 
247
+ def __init__(self):
248
+ self.hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
249
+ self.model_id = os.getenv("MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")
250
 
251
+ self.llm = None
252
+ if self.hf_token and InferenceClient is not None:
253
+ # IMPORTANT: do NOT pass both model and base_url in constructor.
254
+ # We'll use router and pass model at call-time (supported by huggingface_hub client).
255
+ try:
256
+ self.llm = InferenceClient(token=self.hf_token, base_url="https://router.huggingface.co", timeout=120)
257
+ print("✅ LLM fallback enabled via HF router.")
258
+ except Exception as e:
259
+ print("⚠️ LLM fallback init failed, continue tool-only:", e)
260
+ self.llm = None
261
+ else:
262
+ print("ℹ️ Running in tool-only mode (no HF_TOKEN or huggingface_hub missing).")
263
+
264
+ def llm_answer(self, question: str) -> str:
265
+ if not self.llm:
266
+ return ""
267
+ system = (
268
+ "Return ONLY the final answer for this question.\n"
269
+ "No explanation. No extra words.\n"
270
+ "If it is a name/number/date, output it exactly.\n"
271
+ )
272
+ prompt = f"{system}\nQuestion: {question}\nAnswer:"
273
  try:
274
+ out = self.llm.text_generation(
275
  prompt,
276
+ model=self.model_id,
277
+ max_new_tokens=96,
278
  temperature=0.0,
279
  do_sample=False,
280
  return_full_text=False,
281
  )
282
+ return clean_answer(out)
283
+ except Exception as e:
284
+ print("LLM text_generation failed:", e)
285
+ return ""
 
 
 
 
 
286
 
287
+ def answer(self, question: str, task_id: str = None) -> str:
288
+ q = question.strip()
289
+
290
+ # 0) if task has a file, try download (some GAIA Qs rely on it)
291
+ if task_id:
292
+ fpath = download_task_file(task_id)
293
+ # For now, just note: without knowing file types, we won't parse deeply.
294
+ # But downloading sometimes is required; you can extend later.
295
+ if fpath:
296
+ print(f"Downloaded file for task {task_id}: {fpath}")
297
+
298
+ # 1) math
299
+ if looks_like_math(q):
300
+ m = try_solve_math(q)
301
+ if m:
302
+ return clean_answer(m)
303
+
304
+ # 2) unit conversion
305
+ u = try_unit_convert(q)
306
+ if u:
307
+ return clean_answer(u)
308
+
309
+ # 3) weather questions: "weather in <city>"
310
+ m = re.search(r"(?i)\bweather in ([A-Za-z \-]+)\b", q)
311
+ if m:
312
+ city = m.group(1).strip()
313
+ w = open_meteo_weather(city)
314
+ if w:
315
+ return clean_answer(w)
316
+
317
+ # 4) Hugging Face / model popularity questions
318
+ # e.g. "most downloaded model", "downloads of Qwen/..."
319
+ if "hugging face" in q.lower() or "download" in q.lower() or "downloads" in q.lower():
320
+ mm = re.search(r"([A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+)", q)
321
+ if mm:
322
+ mid = mm.group(1)
323
+ try:
324
+ info = hf_model_info(mid)
325
+ # common: downloads field
326
+ if "downloads" in info:
327
+ return clean_answer(str(info["downloads"]))
328
+ except Exception:
329
+ pass
330
+
331
+ # 5) Wikidata lookups (capitals, birth, etc.)
332
+ # Capital of X
333
+ m = re.search(r"(?i)\bcapital of ([A-Za-z \-]+)\b", q)
334
+ if m:
335
+ ent = m.group(1).strip()
336
+ v = wikidata_simple_lookup(ent, "capital")
337
+ if v:
338
+ return clean_answer(v)
339
+
340
+ # Birth date of X
341
+ m = re.search(r"(?i)\bwhen was ([A-Za-z .\-]+) born\b", q)
342
+ if m:
343
+ ent = m.group(1).strip()
344
+ v = wikidata_simple_lookup(ent, "birth")
345
+ if v:
346
+ # often wikidata returns ISO datetime; keep only date part
347
+ v = v.split("T")[0]
348
+ return clean_answer(v)
349
+
350
+ # Population of X
351
+ m = re.search(r"(?i)\bpopulation of ([A-Za-z \-]+)\b", q)
352
+ if m:
353
+ ent = m.group(1).strip()
354
+ v = wikidata_simple_lookup(ent, "population")
355
+ if v:
356
+ # sometimes returns "1,234,567" vs "1234567"; exact match varies.
357
+ # keep as-is; but remove commas if question likely expects plain digits
358
+ if re.search(r"(?i)\bhow many\b|\bpopulation\b", q):
359
+ v2 = v.replace(",", "")
360
+ return clean_answer(v2)
361
+ return clean_answer(v)
362
+
363
+ # 6) lightweight web search fallback (snippets)
364
+ # Works for factoid questions with clear short answers
365
+ try:
366
+ results = ddg_search_snippet(q, max_results=3)
367
+ if results:
368
+ # Heuristic: if question asks for a year, grab 4-digit year from snippet
369
+ if re.search(r"\b(19|20)\d{2}\b", q):
370
+ for _, __, sn in results:
371
+ yy = re.search(r"\b(19|20)\d{2}\b", sn)
372
+ if yy:
373
+ return clean_answer(yy.group(0))
374
+
375
+ # If asks "Who is ..." try first snippet capitalized name chunk
376
+ if q.lower().startswith("who is") or "who was" in q.lower():
377
+ # naive: take first result title before "-" or "|"
378
+ title = results[0][0]
379
+ title = re.split(r"[-|–]", title)[0].strip()
380
+ if title:
381
+ return clean_answer(title)
382
+ except Exception as e:
383
+ print("DDG fallback failed:", e)
384
+
385
+ # 7) optional LLM fallback (free small model) — last resort
386
+ llm = self.llm_answer(q)
387
+ if llm:
388
+ # If too long, ask again implicitly by trimming to last line already done.
389
+ # Also strip trailing punctuation
390
+ llm = re.sub(r"[.。!!]+$", "", llm).strip()
391
+ return clean_answer(llm)
392
+
393
+ # 8) final fallback
394
+ return "I don't know"
395
 
396
 
 
 
 
397
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Fetch all questions, answer them with ToolFirstAgent, and submit.

    Args:
        profile: the logged-in Hugging Face profile, or None when not logged in.

    Returns:
        A ``(status_message, DataFrame | None)`` pair for the two Gradio
        outputs. Failures are reported through the status message.
    """
    space_id = os.getenv("SPACE_ID")

    if not profile:
        return "Please Login to Hugging Face with the button.", None
    username = f"{profile.username}"
    print(f"User logged in: {username}")

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    try:
        agent = ToolFirstAgent()
    except Exception as e:
        return f"Error initializing agent: {e}", None

    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    # Fetch Questions
    try:
        response = requests.get(questions_url, timeout=20)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            return "Fetched questions list is empty.", None
    except Exception as e:
        return f"Error fetching questions: {e}", None

    results_log = []
    answers_payload = []

    for item in questions_data:
        # Skip malformed entries instead of crashing on .get().
        if not isinstance(item, dict):
            continue
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            continue

        try:
            submitted_answer = agent.answer(question_text, task_id=task_id)
            submitted_answer = clean_answer(submitted_answer)
        except Exception as e:
            submitted_answer = f"AGENT ERROR: {e}"

        answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
        results_log.append(
            {"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer}
        )

    # Nothing usable was answered: do not post an empty submission.
    if not answers_payload:
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload,
    }

    try:
        response = requests.post(submit_url, json=submission_data, timeout=90)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        return final_status, pd.DataFrame(results_log)
    except Exception as e:
        return f"Submission Failed: {e}", pd.DataFrame(results_log)
467
 
468
 
 
 
 
469
# Gradio UI: header, usage notes, login button, run button and two outputs.
with gr.Blocks() as demo:
    gr.Markdown("# Basic Agent Evaluation Runner (Tool-first, no paid model)")
    gr.Markdown(
        """
        **Instructions**
        1. Login with the button.
        2. Click Run to fetch questions, answer them, submit, and get score.

        **Notes**
        - Works without paid models.
        - Optional HF_TOKEN enables small-model fallback (free tier permitting).
        """
    )

    gr.LoginButton()
    run_button = gr.Button("Run Evaluation & Submit All Answers")

    status_output = gr.Textbox(label="Run Status / Submission Result", lines=6, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    # The callback returns (status text, results DataFrame) in this order.
    callback_outputs = [status_output, results_table]
    run_button.click(fn=run_and_submit_all, outputs=callback_outputs)


if __name__ == "__main__":
    demo.launch(debug=True, share=False)