johnnychiang committed on
Commit
11b435a
·
verified ·
1 Parent(s): 6051f37

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -468
app.py CHANGED
@@ -1,492 +1,100 @@
1
  import os
2
- import re
3
- import json
4
- import math
5
  import requests
6
  import pandas as pd
7
- import gradio as gr
8
-
9
- from bs4 import BeautifulSoup
10
- from sympy import sympify
11
- from pint import UnitRegistry
12
-
13
- try:
14
- from huggingface_hub import InferenceClient
15
- except Exception:
16
- InferenceClient = None
17
 
18
# Scoring service for the HF Agents-course GAIA evaluation.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"

# External data endpoints used by the lookup tools below.
WIKIDATA_SPARQL = "https://query.wikidata.org/sparql"   # SPARQL query endpoint
HF_API_BASE = "https://huggingface.co/api"              # Hub REST API
OPEN_METEO = "https://api.open-meteo.com/v1/forecast"   # weather forecast API

# Shared pint unit registry; Q("5", "mile").to("km") style conversions.
ureg = UnitRegistry()
Q = ureg.Quantity
26
-
27
-
28
def http_get(url, timeout=20, headers=None, params=None):
    """GET *url* with a browser-like User-Agent and raise on HTTP errors.

    Returns the ``requests.Response`` so callers can pick ``.json()``,
    ``.text`` or ``.content`` as needed.
    """
    if not headers:
        # default UA: some sites reject requests without a browser-ish agent
        headers = {
            "User-Agent": "Mozilla/5.0 (compatible; GAIA-Agent/1.0; +https://huggingface.co)"
        }
    response = requests.get(url, timeout=timeout, headers=headers, params=params)
    response.raise_for_status()
    return response
35
-
36
-
37
def wikidata_query(sparql: str):
    """Run a SPARQL query against Wikidata and return the parsed JSON result."""
    query_params = {"format": "json", "query": sparql}
    accept_json = {"Accept": "application/sparql-results+json"}
    response = http_get(WIKIDATA_SPARQL, params=query_params, headers=accept_json)
    return response.json()
44
-
45
-
46
def clean_answer(s: str) -> str:
    """Normalize a raw model/tool reply into a short exact-match answer string.

    Strips "FINAL ANSWER:" prefixes, fenced code blocks, surrounding quotes,
    keeps only the last non-empty line, and collapses runs of whitespace.
    Returns "" for None input.
    """
    if s is None:
        return ""
    text = str(s).strip()

    # drop any "FINAL ANSWER:" marker the model may emit
    text = re.sub(r"(?i)\bFINAL\s*ANSWER\b\s*[:\-]*\s*", "", text).strip()

    # remove fenced code blocks entirely
    text = re.sub(r"```.*?```", "", text, flags=re.S).strip()

    # models commonly put the answer on the final non-empty line
    non_empty = [line.strip() for line in text.splitlines() if line.strip()]
    if non_empty:
        text = non_empty[-1]

    # shed surrounding quotes, then collapse internal whitespace
    text = text.strip().strip('"').strip("'").strip()
    return re.sub(r"\s+", " ", text).strip()
68
-
69
-
70
def looks_like_math(q: str) -> bool:
    """Crude arithmetic detector: true when *q* mixes digits with operator characters."""
    has_digit = re.search(r"\d", q) is not None
    has_operator = re.search(r"[+\-*/^=()]", q) is not None
    return has_digit and has_operator
73
-
74
-
75
def try_solve_math(q: str):
    """Extract an arithmetic expression from *q* and evaluate it safely.

    Returns the result as a string ("4" for integral values, "2.5" otherwise),
    or None when no evaluable expression is found.

    Fixes over the previous version:
    - ``re.search`` returned only the FIRST run of expression characters, which
      in a natural-language question like "What is 2 + 2?" is a lone space, so
      the real expression later in the sentence was never seen. We now collect
      ALL candidate runs and try the longest first.
    - Evaluation uses a restricted stdlib AST walker instead of sympy, so no
      third-party dependency and no arbitrary symbol handling.
    """
    import ast  # local import: keeps module-level dependencies unchanged

    def _eval(node):
        # numbers, unary +/- and the binary operators + - * / ** only
        if isinstance(node, ast.Constant) and isinstance(node.value, (int, float)):
            return node.value
        if isinstance(node, ast.UnaryOp) and isinstance(node.op, (ast.UAdd, ast.USub)):
            value = _eval(node.operand)
            return value if isinstance(node.op, ast.UAdd) else -value
        if isinstance(node, ast.BinOp):
            left, right = _eval(node.left), _eval(node.right)
            if isinstance(node.op, ast.Add):
                return left + right
            if isinstance(node.op, ast.Sub):
                return left - right
            if isinstance(node.op, ast.Mult):
                return left * right
            if isinstance(node.op, ast.Div):
                return left / right
            if isinstance(node.op, ast.Pow):
                return left ** right
        raise ValueError("unsupported expression")

    # every run of expression-ish characters; must contain a digit and be
    # non-trivial (mirrors the original len >= 3 guard)
    candidates = [c.strip() for c in re.findall(r"[-+*/^().\d\s]+", q)]
    candidates = [c for c in candidates if len(c) >= 3 and re.search(r"\d", c)]

    for expr in sorted(candidates, key=len, reverse=True):
        try:
            val = _eval(ast.parse(expr.replace("^", "**"), mode="eval").body)
            # render near-integers without a fractional part
            if abs(val - int(val)) < 1e-10:
                return str(int(val))
            return str(val)
        except Exception:
            continue  # malformed candidate (e.g. trailing dot) — try the next one
    return None
95
-
96
-
97
def try_unit_convert(q: str):
    """
    Very basic unit conversion:
    e.g., "Convert 5 miles to km"

    Returns the converted magnitude as a string (integral values without a
    decimal part), or None if the question doesn't match the pattern or pint
    cannot perform the conversion.
    """
    # match "convert <num> <unit> to <unit>"; the ° char admits temperatures
    m = re.search(r"(?i)\bconvert\s+([-+]?\d+(?:\.\d+)?)\s*([a-zA-Z°]+)\s+to\s+([a-zA-Z°]+)\b", q)
    if not m:
        return None
    num = float(m.group(1))
    u1 = m.group(2)
    u2 = m.group(3)
    try:
        # Q is the module-level pint Quantity factory; .to() raises on
        # unknown or dimensionally-incompatible units (caught below)
        out = (Q(num, u1)).to(u2)
        # output without unit text unless question requires it; GAIA exact match often wants number only
        # we'll return just magnitude, trimmed
        mag = out.magnitude
        if abs(mag - int(mag)) < 1e-10:
            return str(int(mag))
        return str(mag)
    except Exception:
        return None
119
-
120
-
121
def ddg_search_snippet(query: str, max_results=5):
    """
    DuckDuckGo HTML scraping (no paid key).
    Returns list of (title, url, snippet)
    """
    url = "https://duckduckgo.com/html/"
    r = http_get(url, params={"q": query}, timeout=20)
    # NOTE(review): relies on DDG's current HTML class names (.result,
    # .result__a, .result__snippet) — brittle if the markup changes.
    # Also assumes the lxml parser is installed for BeautifulSoup.
    soup = BeautifulSoup(r.text, "lxml")
    results = []
    for res in soup.select(".result")[:max_results]:
        a = res.select_one(".result__a")
        sn = res.select_one(".result__snippet")
        if a:
            title = a.get_text(" ", strip=True)
            link = a.get("href")  # may be a DDG redirect URL rather than the final target
            snippet = sn.get_text(" ", strip=True) if sn else ""
            results.append((title, link, snippet))
    return results
139
-
140
-
141
def hf_model_info(model_id: str):
    """Fetch the Hub metadata record for *model_id* (e.g. downloads, tags) as a dict."""
    response = http_get(f"{HF_API_BASE}/models/{model_id}", timeout=20)
    return response.json()
144
-
145
-
146
def hf_search_models(query: str, limit=5):
    """Search the Hub model index; returns the JSON list of matching models."""
    search_params = {"search": query, "limit": limit}
    response = http_get(f"{HF_API_BASE}/models", params=search_params, timeout=20)
    return response.json()
149
-
150
-
151
def open_meteo_weather(city: str):
    """Return the current temperature for *city* as a trimmed string, or None.

    Resolves the city via Open-Meteo's geocoding API, then queries the
    forecast endpoint for the current conditions. Only temperature_2m is
    returned (GAIA typically wants a single value).
    """
    geo_params = {"name": city, "count": 1, "language": "en", "format": "json"}
    geo = http_get(
        "https://geocoding-api.open-meteo.com/v1/search",
        params=geo_params,
        timeout=20,
    ).json()

    hits = geo.get("results")
    if not hits:
        return None
    place = hits[0]

    forecast_params = {
        "latitude": place["latitude"],
        "longitude": place["longitude"],
        "current": "temperature_2m,weather_code,wind_speed_10m",
    }
    data = http_get(OPEN_METEO, params=forecast_params, timeout=20).json()

    current = data.get("current", {})
    if "temperature_2m" not in current:
        return None
    temp = current["temperature_2m"]
    # render integral readings without a decimal part
    return str(int(temp)) if abs(temp - int(temp)) < 1e-10 else str(temp)
180
-
181
-
182
def wikidata_simple_lookup(entity: str, prop: str):
    """
    Use Wikidata to fetch a single property for a named entity.
    prop: one of 'capital', 'population', 'area', 'birth', 'death', 'country', 'founder', etc.
    We'll map prop -> Wikidata property IDs and return a clean string.

    Returns None for unknown props, no matching entity, or query failure.
    """
    # friendly name -> Wikidata property ID
    prop_map = {
        "capital": "P36",
        "population": "P1082",
        "area": "P2046",
        "birth": "P569",
        "death": "P570",
        "country": "P17",
        "founder": "P112",
        "headquarters": "P159",
    }
    pid = prop_map.get(prop)
    if not pid:
        return None

    # Try entity as label search then property
    # NOTE(review): matches the EXACT English label — ambiguous labels pick an
    # arbitrary item (LIMIT 1), and the OPTIONAL clause means a row can come
    # back with no value bound.
    sparql = f"""
    SELECT ?valueLabel WHERE {{
      ?item rdfs:label "{entity}"@en .
      OPTIONAL {{ ?item wdt:{pid} ?value . }}
      SERVICE wikibase:label {{ bd:serviceParam wikibase:language "en". }}
    }}
    LIMIT 1
    """
    try:
        data = wikidata_query(sparql)
        bindings = data.get("results", {}).get("bindings", [])
        if not bindings:
            return None
        v = bindings[0].get("valueLabel", {}).get("value")
        return clean_answer(v)
    except Exception:
        # network/parse errors degrade to "no answer" so the caller can fall through
        return None
220
-
221
-
222
def download_task_file(task_id: str, save_dir="/tmp"):
    """Download the attachment for *task_id* from the scoring API.

    Saves it under *save_dir*, preferring the server-supplied filename from
    the Content-Disposition header. Returns the local path, or None on any
    failure (missing file, network error, write error).
    """
    try:
        response = http_get(f"{DEFAULT_API_URL}/files/{task_id}", timeout=30)

        # prefer the server's filename; fall back to "<task_id>.bin"
        filename = f"{task_id}.bin"
        disposition = response.headers.get("content-disposition", "")
        match = re.search(r'filename="?([^"]+)"?', disposition)
        if match:
            filename = match.group(1)

        destination = os.path.join(save_dir, filename)
        with open(destination, "wb") as fh:
            fh.write(response.content)
        return destination
    except Exception:
        return None
238
-
239
-
240
class ToolFirstAgent:
    """
    Tool-first agent for GAIA Level-1 exact-match scoring.
    Designed to work WITHOUT paid models.
    Optional fallback to a free small model if HF_TOKEN is set.
    """

    def __init__(self):
        # token/model come from the environment; absent token => tool-only mode
        self.hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
        self.model_id = os.getenv("MODEL_ID", "Qwen/Qwen2.5-7B-Instruct")

        self.llm = None
        if self.hf_token and InferenceClient is not None:
            # IMPORTANT: do NOT pass both model and base_url in constructor.
            # We'll use router and pass model at call-time (supported by huggingface_hub client).
            try:
                self.llm = InferenceClient(token=self.hf_token, base_url="https://router.huggingface.co", timeout=120)
                print("✅ LLM fallback enabled via HF router.")
            except Exception as e:
                print("⚠️ LLM fallback init failed, continue tool-only:", e)
                self.llm = None
        else:
            print("ℹ️ Running in tool-only mode (no HF_TOKEN or huggingface_hub missing).")

    def llm_answer(self, question: str) -> str:
        """Ask the fallback LLM for a bare answer; returns "" when unavailable or on error."""
        if not self.llm:
            return ""
        system = (
            "Return ONLY the final answer for this question.\n"
            "No explanation. No extra words.\n"
            "If it is a name/number/date, output it exactly.\n"
        )
        prompt = f"{system}\nQuestion: {question}\nAnswer:"
        try:
            out = self.llm.text_generation(
                prompt,
                model=self.model_id,
                max_new_tokens=96,
                temperature=0.0,
                do_sample=False,
                return_full_text=False,
            )
            return clean_answer(out)
        except Exception as e:
            print("LLM text_generation failed:", e)
            return ""

    def answer(self, question: str, task_id: str = None) -> str:
        """Route *question* through the tool pipeline, cheapest/most-precise first.

        Order matters: math -> unit conversion -> weather -> HF downloads ->
        Wikidata lookups -> web-search heuristics -> optional LLM -> give up.
        """
        q = question.strip()

        # 0) if task has a file, try download (some GAIA Qs rely on it)
        if task_id:
            fpath = download_task_file(task_id)
            # For now, just note: without knowing file types, we won't parse deeply.
            # But downloading sometimes is required; you can extend later.
            if fpath:
                print(f"Downloaded file for task {task_id}: {fpath}")

        # 1) math
        if looks_like_math(q):
            m = try_solve_math(q)
            if m:
                return clean_answer(m)

        # 2) unit conversion
        u = try_unit_convert(q)
        if u:
            return clean_answer(u)

        # 3) weather questions: "weather in <city>"
        m = re.search(r"(?i)\bweather in ([A-Za-z \-]+)\b", q)
        if m:
            city = m.group(1).strip()
            w = open_meteo_weather(city)
            if w:
                return clean_answer(w)

        # 4) Hugging Face / model popularity questions
        # e.g. "most downloaded model", "downloads of Qwen/..."
        if "hugging face" in q.lower() or "download" in q.lower() or "downloads" in q.lower():
            mm = re.search(r"([A-Za-z0-9_.-]+\/[A-Za-z0-9_.-]+)", q)
            if mm:
                mid = mm.group(1)
                try:
                    info = hf_model_info(mid)
                    # common: downloads field
                    if "downloads" in info:
                        return clean_answer(str(info["downloads"]))
                except Exception:
                    pass

        # 5) Wikidata lookups (capitals, birth, etc.)
        # Capital of X
        m = re.search(r"(?i)\bcapital of ([A-Za-z \-]+)\b", q)
        if m:
            ent = m.group(1).strip()
            v = wikidata_simple_lookup(ent, "capital")
            if v:
                return clean_answer(v)

        # Birth date of X
        m = re.search(r"(?i)\bwhen was ([A-Za-z .\-]+) born\b", q)
        if m:
            ent = m.group(1).strip()
            v = wikidata_simple_lookup(ent, "birth")
            if v:
                # often wikidata returns ISO datetime; keep only date part
                v = v.split("T")[0]
                return clean_answer(v)

        # Population of X
        m = re.search(r"(?i)\bpopulation of ([A-Za-z \-]+)\b", q)
        if m:
            ent = m.group(1).strip()
            v = wikidata_simple_lookup(ent, "population")
            if v:
                # sometimes returns "1,234,567" vs "1234567"; exact match varies.
                # keep as-is; but remove commas if question likely expects plain digits
                if re.search(r"(?i)\bhow many\b|\bpopulation\b", q):
                    v2 = v.replace(",", "")
                    return clean_answer(v2)
                return clean_answer(v)

        # 6) lightweight web search fallback (snippets)
        # Works for factoid questions with clear short answers
        try:
            results = ddg_search_snippet(q, max_results=3)
            if results:
                # Heuristic: if question asks for a year, grab 4-digit year from snippet
                if re.search(r"\b(19|20)\d{2}\b", q):
                    for _, __, sn in results:
                        yy = re.search(r"\b(19|20)\d{2}\b", sn)
                        if yy:
                            return clean_answer(yy.group(0))

                # If asks "Who is ..." try first snippet capitalized name chunk
                if q.lower().startswith("who is") or "who was" in q.lower():
                    # naive: take first result title before "-" or "|"
                    title = results[0][0]
                    title = re.split(r"[-|–]", title)[0].strip()
                    if title:
                        return clean_answer(title)
        except Exception as e:
            print("DDG fallback failed:", e)

        # 7) optional LLM fallback (free small model) — last resort
        llm = self.llm_answer(q)
        if llm:
            # If too long, ask again implicitly by trimming to last line already done.
            # Also strip trailing punctuation
            llm = re.sub(r"[.。!!]+$", "", llm).strip()
            return clean_answer(llm)

        # 8) final fallback
        return "I don't know"
395
-
396
 
397
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Gradio callback: fetch all questions, answer with ToolFirstAgent, submit.

    Returns a (status message, results DataFrame) pair for the UI outputs;
    the DataFrame is None when the run aborts before any question is answered.
    """
    space_id = os.getenv("SPACE_ID")

    if profile:
        username = f"{profile.username}"
        print(f"User logged in: {username}")
    else:
        # OAuth profile is injected by gr.LoginButton; absent => not logged in
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    try:
        agent = ToolFirstAgent()
    except Exception as e:
        return f"Error initializing agent: {e}", None

    # link to this Space's code, required by the scoring API payload
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    # Fetch Questions
    try:
        response = requests.get(questions_url, timeout=20)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            return "Fetched questions list is empty.", None
    except Exception as e:
        return f"Error fetching questions: {e}", None

    results_log = []
    answers_payload = []

    for item in questions_data:
        task_id = item.get("task_id")
        question_text = item.get("question")
        if not task_id or question_text is None:
            continue

        # a failing question is logged as an error answer, not a crash
        try:
            submitted_answer = agent.answer(question_text, task_id=task_id)
            submitted_answer = clean_answer(submitted_answer)
        except Exception as e:
            submitted_answer = f"AGENT ERROR: {e}"

        answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
        results_log.append(
            {"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer}
        )

    submission_data = {
        "username": username.strip(),
        "agent_code": agent_code,
        "answers": answers_payload,
    }

    try:
        response = requests.post(submit_url, json=submission_data, timeout=90)
        response.raise_for_status()
        result_data = response.json()
        final_status = (
            f"Submission Successful!\n"
            f"User: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        return final_status, pd.DataFrame(results_log)
    except Exception as e:
        # submission failed, but still show what the agent answered
        return f"Submission Failed: {e}", pd.DataFrame(results_log)
467
-
468
 
469
# --- Gradio UI: login button, run button, status textbox, results table ---
with gr.Blocks() as demo:
    gr.Markdown("# Basic Agent Evaluation Runner (Tool-first, no paid model)")
    gr.Markdown(
        """
        **Instructions**
        1. Login with the button.
        2. Click Run to fetch questions, answer them, submit, and get score.

        **Notes**
        - Works without paid models.
        - Optional HF_TOKEN enables small-model fallback (free tier permitting).
        """
    )

    gr.LoginButton()  # supplies the gr.OAuthProfile argument to the callback
    run_button = gr.Button("Run Evaluation & Submit All Answers")

    status_output = gr.Textbox(label="Run Status / Submission Result", lines=6, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)

    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])


if __name__ == "__main__":
    demo.launch(debug=True, share=False)
 
1
  import os
2
+ import gradio as gr
 
 
3
  import requests
4
  import pandas as pd
5
+ import re
6
+ from huggingface_hub import InferenceClient
 
 
 
 
 
 
 
 
7
 
8
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
9
 
10
class BasicAgent:
    """Minimal LLM-only agent for the GAIA runner.

    Forwards each question to a hosted chat model and post-processes the
    reply into a short exact-match answer string.
    """

    def __init__(self):
        print("Agent init")

        # token is mandatory here (unlike the old tool-first agent, there is
        # no tool-only fallback mode)
        token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
        if not token:
            raise RuntimeError("HF_TOKEN not set")

        # Free to use and stable (translated: 免費可用,穩定)
        self.client = InferenceClient(
            "Qwen/Qwen2.5-7B-Instruct",
            token=token,
        )

    def clean(self, text: str) -> str:
        """Strip "final answer" markers and keep only the last non-empty line."""
        text = text.strip()
        # NOTE(review): this removes "final answer" ANYWHERE in the text, not
        # just as a prefix — confirm that is intended.
        text = re.sub(r"(?i)final answer[:\-]*", "", text)
        lines = [l.strip() for l in text.splitlines() if l.strip()]
        return lines[-1] if lines else text

    def __call__(self, question: str) -> str:
        """Ask the model for a bare answer; returns "" on any API error."""
        system = (
            "You are a precise QA agent.\n"
            "Return ONLY the final answer.\n"
            "No explanation.\n"
            "No extra words.\n"
        )
        try:
            out = self.client.chat_completion(
                messages=[
                    {"role": "system", "content": system},
                    {"role": "user", "content": question},
                ],
                temperature=0,
                max_tokens=256,
            ).choices[0].message.content
            return self.clean(out)
        except Exception as e:
            # degrade to an empty answer so the submission loop keeps going
            print("LLM error:", e)
            return ""
 
 
 
 
 
 
 
 
 
 
 
51
 
52
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Gradio callback: fetch all questions, answer with BasicAgent, submit.

    Returns a (status message, results DataFrame) pair for the UI outputs.

    Fixes over the previous version:
    - network calls now carry timeouts and raise on HTTP error status (before,
      a stalled server could hang the UI callback forever and HTML error pages
      would blow up in ``.json()``);
    - a per-question try/except means one failing question no longer aborts
      the entire run;
    - fetch/submit failures are reported as a status message instead of
      crashing the callback.
    """
    if not profile:
        return "Please login", None

    username = profile.username

    try:
        agent = BasicAgent()  # raises RuntimeError when HF_TOKEN is missing
    except Exception as e:
        return f"Error initializing agent: {e}", None

    # fetch the question set
    try:
        resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=20)
        resp.raise_for_status()
        questions = resp.json()
    except Exception as e:
        return f"Error fetching questions: {e}", None

    answers = []
    log = []

    for q in questions:
        # one bad question shouldn't kill the whole evaluation run
        try:
            ans = agent(q["question"])
        except Exception as e:
            ans = f"AGENT ERROR: {e}"
        answers.append({
            "task_id": q["task_id"],
            "submitted_answer": ans
        })
        log.append({
            "task_id": q["task_id"],
            "question": q["question"],
            "answer": ans
        })

    payload = {
        "username": username,
        "agent_code": f"https://huggingface.co/spaces/{os.getenv('SPACE_ID')}/tree/main",
        "answers": answers
    }

    try:
        resp = requests.post(f"{DEFAULT_API_URL}/submit", json=payload, timeout=90)
        resp.raise_for_status()
        r = resp.json()
    except Exception as e:
        # submission failed — still surface what the agent answered
        return f"Submission failed: {e}", pd.DataFrame(log)

    status = (
        f"User: {r.get('username')}\n"
        f"Score: {r.get('score')}%\n"
        f"{r.get('correct_count')}/{r.get('total_attempted')} correct"
    )

    return status, pd.DataFrame(log)
 
 
 
 
91
 
92
# --- Gradio UI: login, run button, status box, and results table ---
with gr.Blocks() as demo:
    gr.Markdown("# GAIA Agent Runner")
    gr.LoginButton()  # OAuth login; populates the profile passed to the callback
    btn = gr.Button("Run Evaluation & Submit All Answers")
    out = gr.Textbox(lines=4)
    table = gr.DataFrame()
    btn.click(run_and_submit_all, outputs=[out, table])

demo.launch()