bhotta committed on
Commit
cb4182d
·
verified ·
1 Parent(s): 0c10a78

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +469 -355
app.py CHANGED
@@ -1,420 +1,534 @@
1
  import os
2
- import gradio as gr
 
 
3
  import requests
4
  import pandas as pd
5
- from smolagents import CodeAgent, OpenAIServerModel, tool
6
  from openai import OpenAI
7
- import base64
8
 
9
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
10
 
11
- # --- Custom Tools ---
12
- @tool
13
- def get_youtube_transcript(video_url: str) -> str:
14
- """Fetch the transcript/captions of a YouTube video.
15
-
16
- Args:
17
- video_url: The full YouTube video URL e.g. https://www.youtube.com/watch?v=XXXXX
18
- """
19
- try:
20
- from youtube_transcript_api import YouTubeTranscriptApi
21
-
22
- if "v=" in video_url:
23
- video_id = video_url.split("v=")[-1].split("&")[0]
24
- elif "youtu.be/" in video_url:
25
- video_id = video_url.split("youtu.be/")[-1].split("?")[0]
26
- else:
27
- return "Could not extract video ID."
28
-
29
- # New API style (v0.6.0+)
30
- ytt_api = YouTubeTranscriptApi()
31
- fetched = ytt_api.fetch(video_id)
32
- transcript_text = " ".join([t.text for t in fetched])
33
- return transcript_text[:8000]
34
-
35
- except Exception as e:
36
- # Fallback: try fetching via youtubetotranscript API
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  try:
38
- resp = requests.get(
39
- f"https://youtubetotranscript.com/transcript?v={video_id}",
40
- headers={"User-Agent": "Mozilla/5.0"},
41
- timeout=15
42
- )
43
- if resp.status_code == 200:
44
- import re
45
- text = re.sub(r'<[^>]+>', ' ', resp.text)
46
- text = re.sub(r'\s+', ' ', text).strip()
47
- return text[:6000]
48
- except:
49
  pass
50
- return f"Transcript fetch failed: {e}"
51
-
52
- @tool
53
- def wikipedia_fetch_page(page_title: str) -> str:
54
- """Fetch the full content of a specific Wikipedia page by its exact title.
55
-
56
- Args:
57
- page_title: The exact Wikipedia page title, e.g. 'Mercedes Sosa' or 'Mercedes Sosa discography'.
58
- """
59
- import time
60
- time.sleep(1) # avoid rate limiting
61
-
62
- headers = {
63
- "User-Agent": "GaiaResearchBot/1.0 (huggingface educational project)"
64
- }
65
-
66
- # Try action API with full extract
67
- try:
68
- params = {
69
- "action": "query",
70
- "titles": page_title,
71
- "prop": "extracts",
72
- "explaintext": True,
73
- "exsectionformat": "plain",
74
- "format": "json",
75
- "redirects": 1,
76
- }
77
- resp = requests.get(
78
- "https://en.wikipedia.org/w/api.php",
79
- params=params,
80
- headers=headers,
81
- timeout=20
82
- )
83
- resp.raise_for_status()
84
- data = resp.json()
85
- pages = data.get("query", {}).get("pages", {})
86
- for pid, page in pages.items():
87
- if pid == "-1":
88
- return f"Page '{page_title}' not found on Wikipedia."
89
- return page.get("extract", "No content.")[:10000]
90
- except Exception as e:
91
- pass
92
-
93
- # Fallback: try wikimedia API
94
- try:
95
- url = f"https://en.wikipedia.org/api/rest_v1/page/summary/{page_title.replace(' ', '_')}"
96
- resp = requests.get(url, headers=headers, timeout=15)
97
- data = resp.json()
98
- return data.get("extract", "No summary found.")
99
- except Exception as e:
100
- return f"Failed to fetch Wikipedia page: {e}"
101
-
102
-
103
- @tool
104
- def search_web(query: str) -> str:
105
- """Search the web using a query string. Returns search results as text.
106
-
107
- Args:
108
- query: The search query string. Be very specific, include full names to avoid ambiguity.
109
- """
110
- import time
111
-
112
- # Try DuckDuckGo with retries
113
- for attempt in range(3):
114
  try:
115
- from duckduckgo_search import DDGS
116
- with DDGS() as ddgs:
117
- results = list(ddgs.text(query, max_results=6))
118
- if results:
119
- # Filter out obviously irrelevant results
120
- filtered = [r for r in results if query.split()[0].lower() in
121
- (r['title'] + r['body']).lower()]
122
- use_results = filtered if filtered else results
123
- return "\n\n".join(
124
- f"Title: {r['title']}\nURL: {r['href']}\nSnippet: {r['body']}"
125
- for r in use_results[:5]
126
  )
 
 
 
 
127
  except Exception as e:
128
- time.sleep(2)
129
- continue
130
-
131
- return "Search unavailable. Try wikipedia_fetch_page or visit_url instead."
132
-
133
-
134
- @tool
135
- def visit_url(url: str) -> str:
136
- """Fetch the text content of a webpage. Tries direct fetch then Wayback Machine.
137
-
138
- Args:
139
- url: The full URL of the webpage to fetch.
140
- """
141
- import re, time
142
-
143
- # Don't even try sites known to block bots
144
- blocked = ["genius.com", "rateyourmusic.com", "discogs.com",
145
- "allmusic.com", "albumoftheyear.org", "famousfix.com"]
146
- if any(b in url for b in blocked):
147
- # Go straight to Wayback Machine for these
148
  try:
149
- wb_api = f"https://archive.org/wayback/available?url={url}&timestamp=20221201"
150
- wb_resp = requests.get(wb_api, timeout=10)
151
- snapshot = wb_resp.json().get("archived_snapshots", {}).get("closest", {})
152
- snapshot_url = snapshot.get("url")
153
- if snapshot_url:
154
- time.sleep(1)
155
- headers = {"User-Agent": "Mozilla/5.0"}
156
- snap_resp = requests.get(snapshot_url, headers=headers, timeout=20)
157
- text = re.sub(r'<[^>]+>', ' ', snap_resp.text)
158
- text = re.sub(r'\s+', ' ', text).strip()
159
- return f"[Via Wayback Machine]\n{text[:6000]}"
 
 
 
 
 
 
 
 
 
 
 
 
 
160
  except Exception as e:
161
- return f"Wayback Machine fetch failed for {url}: {e}"
162
-
163
- # For Wikipedia, use the API instead
164
- if "wikipedia.org/wiki/" in url:
165
- page_title = url.split("/wiki/")[-1].replace("_", " ")
166
- return wikipedia_fetch_page(page_title)
167
-
168
- headers = {
169
- "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
170
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
171
- }
172
-
173
- try:
174
- resp = requests.get(url, headers=headers, timeout=15)
175
- if resp.status_code == 200:
176
- text = re.sub(r'<[^>]+>', ' ', resp.text)
177
- text = re.sub(r'\s+', ' ', text).strip()
178
  return text[:6000]
179
- except Exception:
180
- pass
181
-
182
- # Fallback: Wayback Machine
183
- try:
184
- wb_api = f"https://archive.org/wayback/available?url={url}&timestamp=20221201"
185
- wb_resp = requests.get(wb_api, timeout=10)
186
- snapshot_url = wb_resp.json().get("archived_snapshots", {}).get("closest", {}).get("url")
187
- if snapshot_url:
188
- snap_resp = requests.get(snapshot_url, headers=headers, timeout=20)
189
- text = re.sub(r'<[^>]+>', ' ', snap_resp.text)
190
- text = re.sub(r'\s+', ' ', text).strip()
191
- return f"[Via Wayback Machine]\n{text[:6000]}"
192
- except Exception as e:
193
- pass
194
-
195
- return f"Could not fetch {url}"
196
-
197
- @tool
198
- def analyze_image_from_url(image_url: str, question: str) -> str:
199
- """Analyze an image from a URL using GPT-4o vision and answer a question about it.
200
-
201
- Args:
202
- image_url: The direct URL to the image file to analyze.
203
- question: The question to answer about the image content.
204
- """
205
- client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
206
- try:
207
- response = client.chat.completions.create(
208
  model="gpt-4o",
209
  messages=[{
210
  "role": "user",
211
  "content": [
212
- {"type": "image_url", "image_url": {"url": image_url}},
213
- {"type": "text", "text": question}
214
- ]
 
 
 
 
 
 
215
  }],
216
- max_tokens=500
 
217
  )
218
- return response.choices[0].message.content
219
- except Exception as e:
220
- return f"Image analysis failed: {e}"
221
-
222
- @tool
223
- def analyze_task_file(task_id: str, question: str) -> str:
224
- """Download and analyze a file attached to a GAIA task.
225
-
226
- Args:
227
- task_id: The GAIA task ID used to fetch the associated file.
228
- question: The question to answer based on the file content.
229
- """
230
- api_url = DEFAULT_API_URL
231
- file_url = f"{api_url}/files/{task_id}"
232
- client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
233
-
234
- try:
235
- resp = requests.get(file_url, timeout=30)
236
- if resp.status_code == 404:
237
- return "NO_FILE_ATTACHED" # Signal to agent to use other tools
238
- resp.raise_for_status()
239
-
240
- content_type = resp.headers.get("content-type", "")
241
- file_bytes = resp.content
242
-
243
- if any(x in content_type for x in ["image", "png", "jpeg", "jpg", "gif", "webp"]):
244
- b64 = base64.b64encode(file_bytes).decode()
245
- data_url = f"data:{content_type};base64,{b64}"
246
- response = client.chat.completions.create(
247
- model="gpt-4o",
248
- messages=[{"role": "user", "content": [
249
- {"type": "image_url", "image_url": {"url": data_url}},
250
- {"type": "text", "text": question}
251
- ]}],
252
- max_tokens=500
253
- )
254
- return response.choices[0].message.content
255
-
256
- elif any(x in content_type for x in ["text", "csv", "json", "html"]):
257
- text_content = file_bytes.decode("utf-8", errors="ignore")[:8000]
258
- response = client.chat.completions.create(
259
- model="gpt-4o",
260
- messages=[{"role": "user", "content": f"File content:\n{text_content}\n\nQuestion: {question}"}],
261
- max_tokens=500
262
- )
263
- return response.choices[0].message.content
264
-
265
- elif any(x in content_type for x in ["audio", "mp3", "wav", "m4a", "ogg", "mpeg"]):
266
- import tempfile
267
- suffix = "." + content_type.split("/")[-1].split(";")[0]
268
- with tempfile.NamedTemporaryFile(suffix=suffix, delete=False) as f:
269
- f.write(file_bytes)
270
- fname = f.name
271
- with open(fname, "rb") as audio_file:
272
- transcript = client.audio.transcriptions.create(
273
- model="whisper-1", file=audio_file
274
- )
275
- return f"Audio transcript: {transcript.text}"
276
-
277
- elif any(x in content_type for x in ["excel", "spreadsheet", "xlsx", "xls"]):
278
- import tempfile, subprocess
279
- with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f:
280
- f.write(file_bytes)
281
- fname = f.name
282
- response = client.chat.completions.create(
283
- model="gpt-4o",
284
- messages=[{"role": "user", "content": f"I have an Excel file. {question}. The file is at {fname}. Please note I cannot execute code - give me your best analysis based on context."}],
285
- max_tokens=500
286
- )
287
- return response.choices[0].message.content
288
-
289
- else:
290
- return f"File downloaded ({len(file_bytes)} bytes, type: {content_type}) but format not yet supported."
291
-
292
- except Exception as e:
293
- return f"NO_FILE_ATTACHED"
294
-
295
-
296
- # --- Agent ---
297
-
298
- class BasicAgent:
299
- def __init__(self):
300
- api_key = os.getenv("OPENAI_API_KEY")
301
- if not api_key:
302
- raise ValueError("OPENAI_API_KEY is missing!")
303
-
304
- self.model = OpenAIServerModel(
305
- model_id="gpt-4o",
306
- api_key=api_key
307
  )
308
 
309
- self.agent = CodeAgent(
310
- tools=[
311
- search_web,
312
- visit_url,
313
- wikipedia_fetch_page,
314
- analyze_image_from_url,
315
- analyze_task_file,
316
- get_youtube_transcript
317
- ],
318
- model=self.model,
319
- add_base_tools=True,
320
- max_steps=12,
321
- )
322
- print("βœ… OpenAI-powered Agent initialized.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
323
 
324
  def __call__(self, question: str, task_id: str = "") -> str:
325
- print(f"DEBUG: Agent received question: {question[:100]}...")
326
-
327
- prompt = (
328
- f"You are a precise research agent solving GAIA benchmark tasks.\n"
329
- f"Task ID: {task_id}\n"
330
- f"Task: {question}\n\n"
331
- "Instructions:\n"
332
- "- ALWAYS call analyze_task_file first for every task. If it returns 'NO_FILE_ATTACHED', proceed with web search.\n"
333
- "- For Wikipedia lookups, use wikipedia_fetch_page with the EXACT article title.\n"
334
- " Example: wikipedia_fetch_page('Mercedes Sosa') NOT web_search('Mercedes Sosa')\n"
335
- "- When searching, use FULL specific names to avoid ambiguity.\n"
336
- " BAD: 'Mercedes Sosa albums 2000 to 2009' (confuses with Mercedes-Benz)\n"
337
- " GOOD: wikipedia_fetch_page('Mercedes Sosa discography')\n"
338
- "- If a Wikipedia page 404s, try the parent page e.g. 'Mercedes Sosa' instead.\n"
339
- "- For YouTube links, ALWAYS call get_youtube_transcript(video_url) FIRST before any web search.\n"
340
- "- Provide ONLY the final direct answer. No explanations, just the value.\n"
341
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
  try:
343
- result = self.agent.run(prompt)
344
- return str(result).strip()
345
- except Exception as e:
346
- print(f"❌ Error: {e}")
 
 
 
 
 
 
 
 
347
  return "Error finding answer."
348
 
349
 
350
- # --- Gradio + Submission ---
351
 
352
  def run_and_submit_all(profile: gr.OAuthProfile | None):
353
- if profile:
354
- username = f"{profile.username}"
355
- print(f"Logged in as: {username}")
356
- else:
357
- return "Please Login to Hugging Face first.", None
358
 
359
- space_id = os.getenv("SPACE_ID")
 
360
  api_url = DEFAULT_API_URL
361
- questions_url = f"{api_url}/questions"
362
- submit_url = f"{api_url}/submit"
363
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
364
 
365
  try:
366
  agent = BasicAgent()
367
  except Exception as e:
368
- return f"Initialization Failed: {e}", None
369
 
370
  try:
371
- response = requests.get(questions_url, timeout=15)
372
- response.raise_for_status()
373
- questions_data = response.json()
374
  except Exception as e:
375
  return f"Error fetching questions: {e}", None
376
 
377
- results_log = []
378
- answers_payload = []
379
 
380
  for item in questions_data:
381
  task_id = item.get("task_id", "")
382
  question_text = item.get("question", "")
383
  try:
384
- submitted_answer = agent(question_text, task_id=task_id)
385
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
386
- results_log.append({"Task ID": task_id, "Question": question_text[:80], "Answer": submitted_answer})
387
  except Exception as e:
388
- results_log.append({"Task ID": task_id, "Question": question_text[:80], "Answer": f"Error: {e}"})
389
 
390
- submission_data = {
391
- "username": username.strip(),
392
- "agent_code": agent_code,
393
- "answers": answers_payload
394
- }
 
395
 
396
  try:
397
- response = requests.post(submit_url, json=submission_data, timeout=60)
398
- response.raise_for_status()
399
- res = response.json()
 
 
 
 
 
 
 
 
400
  status = (
401
- f"Submission Successful!\n"
402
- f"Score: {res.get('score')}% ({res.get('correct_count')}/{res.get('total_attempted')})\n"
 
403
  f"Message: {res.get('message')}"
404
  )
405
- return status, pd.DataFrame(results_log)
406
  except Exception as e:
407
- return f"Submission Failed: {e}", pd.DataFrame(results_log)
 
 
408
 
409
 
410
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
411
  gr.Markdown("# πŸ€– GAIA Agent Evaluation")
412
- gr.Markdown("Click Login, then Run to evaluate your agent on the GAIA dataset.")
 
 
 
 
413
  gr.LoginButton()
414
  run_button = gr.Button("πŸš€ Run Evaluation & Submit", variant="primary")
415
- status_output = gr.Textbox(label="Status", lines=4)
416
- results_table = gr.DataFrame(label="Agent Performance Log")
417
  run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
418
 
419
  if __name__ == "__main__":
420
- demo.launch(ssr_mode=False)
 
1
  import os
2
+ import re
3
+ import json
4
+ import base64
5
  import requests
6
  import pandas as pd
7
+ import gradio as gr
8
  from openai import OpenAI
 
9
 
10
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
 
12
+ # ── helpers ──────────────────────────────────────────────────────────────────
13
+
14
+ def _strip_html(html: str) -> str:
15
+ from html.parser import HTMLParser
16
+
17
+ class _P(HTMLParser):
18
+ def __init__(self):
19
+ super().__init__()
20
+ self.parts = []
21
+ self._skip = False
22
+ self._skip_tags = {"script", "style", "nav", "footer", "head"}
23
+
24
+ def handle_starttag(self, tag, attrs):
25
+ if tag in self._skip_tags:
26
+ self._skip = True
27
+
28
+ def handle_endtag(self, tag):
29
+ if tag in self._skip_tags:
30
+ self._skip = False
31
+
32
+ def handle_data(self, data):
33
+ if not self._skip and data.strip():
34
+ self.parts.append(data.strip())
35
+
36
+ p = _P()
37
+ p.feed(html)
38
+ return " ".join(p.parts)
39
+
40
+
41
+ # ── agent ────────────────────────────────────────────────────────────────────
42
+
43
class BasicAgent:
    """GPT-4o function-calling agent that answers GAIA benchmark tasks.

    The agent runs a bounded tool loop: the model may request any of the
    tools declared in ``TOOLS``; each request is executed locally and the
    result is fed back until the model produces a plain-text answer or the
    round limit is reached, at which point a final answer is forced.

    Calling the instance (``agent(question, task_id=...)``) returns the
    answer string, or ``"Error finding answer."`` when the OpenAI API fails.
    """

    MODEL = "gpt-4o"
    MAX_TOOL_ROUNDS = 8

    # Browser-like headers for generic sites and the DuckDuckGo HTML endpoint.
    _HTTP_HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0 Safari/537.36"
        )
    }
    # Wikimedia asks API clients to send a descriptive User-Agent.
    _WIKI_HEADERS = {
        "User-Agent": "GaiaResearchBot/1.0 (huggingface educational project)"
    }
    # watch?v=ID, youtu.be/ID, /shorts/ID, /embed/ID and /live/ID URL shapes.
    _VIDEO_ID_RE = re.compile(
        r"(?:[?&]v=|youtu\.be/|/shorts/|/embed/|/live/)([A-Za-z0-9_-]+)"
    )
    # Whole words only: a bare "ip" substring also matches "transcript".
    _IP_BLOCK_RE = re.compile(r"\b(?:ip|blocked|blocking|cloud)\b", re.IGNORECASE)
    _TEXTUAL_TYPES = frozenset({
        "application/json",
        "application/xml",
        "application/csv",
        "application/x-yaml",
    })

    def __init__(self):
        """Create the OpenAI client.

        Raises:
            ValueError: if the ``OPENAI_API_KEY`` environment variable is unset.
        """
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise ValueError("OPENAI_API_KEY missing – add it to Space Secrets.")
        self.client = OpenAI(api_key=api_key)
        self.api_url = DEFAULT_API_URL
        print("✅ Agent initialised.")

    # ── tool implementations ──────────────────────────────────────────────────

    def fetch_task_file(self, task_id: str):
        """Download the file attached to a task.

        Returns:
            ``(content_bytes, content_type)`` when the scoring API answers
            200 with a non-empty body, otherwise ``(None, "")``. Never raises.
        """
        if not task_id:
            return None, ""
        try:
            r = requests.get(f"{self.api_url}/files/{task_id}", timeout=15)
            if r.status_code == 200 and r.content:
                return r.content, r.headers.get("Content-Type", "")
        except Exception as e:
            # Best effort: a missing or unreachable file is "no file".
            print(f"  file fetch failed for {task_id}: {e}")
        return None, ""

    @staticmethod
    def _parse_ddg_snippets(html: str) -> list:
        """Extract result snippets from a DuckDuckGo HTML results page.

        A snippet is the full text of an ``<a>``/``<span>`` element whose
        class contains ``result__snippet``, including text inside nested
        inline markup such as ``<b>``.
        """
        from html.parser import HTMLParser

        class _SnippetParser(HTMLParser):
            def __init__(self):
                super().__init__()
                self.results = []
                self._tag = None   # tag name of the snippet being collected
                self._depth = 0    # nesting depth of that same tag name
                self._buf = []

            def handle_starttag(self, tag, attrs):
                if self._tag is None:
                    css = dict(attrs).get("class") or ""
                    if tag in ("a", "span") and "result__snippet" in css:
                        self._tag, self._depth, self._buf = tag, 1, []
                elif tag == self._tag:
                    self._depth += 1

            def handle_data(self, data):
                if self._tag is not None:
                    self._buf.append(data)

            def handle_endtag(self, tag):
                # Only the matching end tag closes the snippet; inline tags
                # like </b> inside it must not truncate the text.
                if tag != self._tag:
                    return
                self._depth -= 1
                if self._depth == 0:
                    text = " ".join("".join(self._buf).split())
                    if text:
                        self.results.append(text)
                    self._tag = None

        parser = _SnippetParser()
        parser.feed(html or "")
        parser.close()
        return parser.results

    def search_web(self, query: str) -> str:
        """Search DuckDuckGo (HTML endpoint) and return up to six snippets.

        Returns the snippets separated by blank lines, ``"No results."`` when
        nothing was found, or ``"Search error: ..."`` on a request failure.
        """
        if not query or not query.strip():
            return "No results."
        try:
            r = requests.get(
                "https://html.duckduckgo.com/html/",
                params={"q": query},
                headers=self._HTTP_HEADERS,
                timeout=12,
            )
            r.raise_for_status()
            snippets = self._parse_ddg_snippets(r.text)[:6]
        except Exception as e:
            return f"Search error: {e}"
        return "\n\n".join(snippets) if snippets else "No results."

    def fetch_webpage(self, url: str) -> str:
        """Fetch ``url`` and return its visible text, capped at 8000 chars."""
        try:
            r = requests.get(url, headers=self._HTTP_HEADERS, timeout=18)
            r.raise_for_status()
            return _strip_html(r.text)[:8000]
        except Exception as e:
            return f"Error fetching {url}: {e}"

    def fetch_wikipedia(self, title: str) -> str:
        """Fetch the summary of an English Wikipedia article by title.

        Tries the REST summary endpoint first and falls back to the intro
        extract from the action API (capped at 6000 chars).
        """
        title = (title or "").strip()
        if not title:
            return "Not found."
        try:
            # safe="" so that "/" in titles such as "AC/DC" is percent-encoded
            # and stays a single path segment.
            slug = requests.utils.quote(title.replace(" ", "_"), safe="")
            r = requests.get(
                f"https://en.wikipedia.org/api/rest_v1/page/summary/{slug}",
                headers=self._WIKI_HEADERS,
                timeout=12,
            )
            if r.status_code == 200:
                return r.json().get("extract") or "Not found."
            # fallback: intro extract via w/api.php
            r2 = requests.get(
                "https://en.wikipedia.org/w/api.php",
                params={
                    "action": "query",
                    "prop": "extracts",
                    "exintro": True,
                    "titles": title,
                    "format": "json",
                    "redirects": 1,
                },
                headers=self._WIKI_HEADERS,
                timeout=12,
            )
            pages = r2.json().get("query", {}).get("pages", {})
            for page in pages.values():
                extract = _strip_html(page.get("extract", ""))
                if extract:
                    return extract[:6000]
        except Exception as e:
            return f"Wikipedia error: {e}"
        return "Not found."

    @classmethod
    def _extract_video_id(cls, video_url: str):
        """Return the YouTube video ID found in ``video_url``, or ``None``."""
        match = cls._VIDEO_ID_RE.search(video_url or "")
        return match.group(1) if match else None

    @classmethod
    def _is_ip_block_error(cls, exc: Exception) -> bool:
        """Tell whether ``exc`` looks like YouTube blocking this host's IP."""
        if type(exc).__name__ in ("RequestBlocked", "IpBlocked"):
            return True
        return bool(cls._IP_BLOCK_RE.search(str(exc)))

    def fetch_youtube_transcript(self, video_url: str) -> str:
        """Return the transcript text of a YouTube video, capped at 6000 chars.

        Returns a ``TRANSCRIPT_UNAVAILABLE`` hint when YouTube blocks the
        request, or ``"Transcript error: ..."`` for any other failure.
        """
        try:
            from youtube_transcript_api import YouTubeTranscriptApi

            video_id = self._extract_video_id(video_url)
            if not video_id:
                return "Could not parse video ID."
            if hasattr(YouTubeTranscriptApi, "get_transcript"):
                entries = YouTubeTranscriptApi.get_transcript(video_id)
                text = " ".join(e["text"] for e in entries)
            else:
                # Newer releases dropped the static helper in favour of
                # an instance-level fetch() returning snippet objects.
                fetched = YouTubeTranscriptApi().fetch(video_id)
                text = " ".join(snippet.text for snippet in fetched)
            return text[:6000]
        except Exception as e:
            if self._is_ip_block_error(e):
                return (
                    "TRANSCRIPT_UNAVAILABLE: cloud IP blocked by YouTube. "
                    "Use search_web to find the video title, description, or "
                    "third-party pages that describe its content."
                )
            return f"Transcript error: {e}"

    # ── image analysis ────────────────────────────────────────────────────────

    def _analyse_image(self, task_id: str, question: str) -> str:
        """Ask GPT-4o vision ``question`` about the image attached to a task.

        May raise if the OpenAI request fails; the tool dispatcher converts
        that into a tool-error message for the model.
        """
        file_bytes, content_type = self.fetch_task_file(task_id)
        if not file_bytes or "image" not in (content_type or ""):
            return "No image found for this task."
        ct = content_type.split(";")[0].strip()
        b64 = base64.b64encode(file_bytes).decode()
        resp = self.client.chat.completions.create(
            model=self.MODEL,
            messages=[{
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:{ct};base64,{b64}",
                            "detail": "high",
                        },
                    },
                    {"type": "text", "text": question},
                ],
            }],
            max_tokens=800,
            temperature=0,
        )
        return resp.choices[0].message.content or "No description."

    # ── messages ──────────────────────────────────────────────────────────────

    @classmethod
    def _attachment_text(cls, file_bytes: bytes, content_type: str) -> str:
        """Render a non-image attachment as prompt text (max 6000 chars).

        Declared-text types are decoded leniently. Anything else must be
        valid UTF-8; otherwise a short placeholder is returned so the model
        is not fed decoded binary garbage.
        """
        textual = content_type.startswith("text/") or content_type in cls._TEXTUAL_TYPES
        try:
            text = file_bytes.decode("utf-8", errors="replace" if textual else "strict")
        except UnicodeDecodeError:
            return (
                f"(binary file, content type {content_type or 'unknown'}, "
                f"{len(file_bytes)} bytes – not shown inline)"
            )
        return text[:6000]

    def _build_messages(self, question: str, task_id: str) -> list:
        """Build the initial system and user messages for one task.

        The user message carries the task ID (so the model can pass it to
        the file tools), the question, and the attachment if any: images are
        inlined as a data URL, other files as text.
        """
        system = (
            "You are an expert research agent solving GAIA benchmark tasks.\n\n"
            "STRICT RULES:\n"
            "1. Call get_task_file(task_id) FIRST for every task.\n"
            " - If it returns 'NO_FILE', proceed with other tools.\n"
            " - If it says a file is attached AND the task involves an image "
            "(chess board, diagram, photo), call analyse_image_file(task_id, question) "
            "to get a vision description. Then reason from that description.\n"
            " - NEVER return 'NO_FILE' or any tool result directly as your final answer.\n\n"
            "2. YouTube tasks: call get_youtube_transcript(url) first.\n"
            " If blocked, use search_web to find what the video says "
            "(search for the exact video title + key phrase from the question).\n\n"
            "3. Wikipedia tasks: use fetch_wikipedia(exact_title).\n"
            " For discography tasks, fetch the artist's Wikipedia page and look "
            "at the Studio albums table. Count ONLY the artist's SOLO studio albums. "
            "Do NOT count collaborative albums, live albums, or compilations.\n\n"
            "4. LibreTexts 1.E Exercises: fetch this EXACT URL for the Introductory "
            "Chemistry bookshelf version (not campus remixes):\n"
            "https://chem.libretexts.org/Bookshelves/Introductory_Chemistry/"
            "Introductory_Chemistry_(LibreTexts)/02%3A_Measurement_and_Problem_Solving/"
            "2.E%3A_Measurement_and_Problem_Solving_(Exercises)\n\n"
            "5. Final answer: ONLY the value – no explanation, no 'The answer is'."
        )

        # The model can only call the file tools if it knows the real task ID.
        task_text = f"Task: {question}"
        if task_id:
            task_text = f"Task ID: {task_id}\n{task_text}"

        file_bytes, content_type = self.fetch_task_file(task_id)
        ct = (content_type or "").split(";")[0].strip().lower()

        if not file_bytes:
            user_parts = [{"type": "text", "text": task_text}]
        elif "image" in ct:
            b64 = base64.b64encode(file_bytes).decode()
            user_parts = [
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:{ct};base64,{b64}", "detail": "high"},
                },
                {
                    "type": "text",
                    "text": f"The image above is attached to this task.\n\n{task_text}",
                },
            ]
        else:
            attachment = self._attachment_text(file_bytes, ct)
            user_parts = [{
                "type": "text",
                "text": f"Attached file:\n{attachment}\n\n{task_text}",
            }]

        return [
            {"role": "system", "content": system},
            {"role": "user", "content": user_parts},
        ]

    # ── tool specs ────────────────────────────────────────────────────────────

    TOOLS = [
        {
            "type": "function",
            "function": {
                "name": "get_task_file",
                "description": (
                    "Check if a file is attached to this GAIA task. "
                    "Returns 'NO_FILE' or a description of the file. "
                    "ALWAYS call this first."
                ),
                "parameters": {
                    "type": "object",
                    "properties": {"task_id": {"type": "string"}},
                    "required": ["task_id"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "analyse_image_file",
                "description": (
                    "Use GPT-4o vision to analyse the image attached to a task. "
                    "Call this after get_task_file confirms an image file exists "
                    "and the task requires visual reasoning (chess, diagrams, photos)."
                ),
                "parameters": {
                    "type": "object",
                    "properties": {
                        "task_id": {"type": "string"},
                        "question": {
                            "type": "string",
                            "description": "What to look for / answer from the image.",
                        },
                    },
                    "required": ["task_id", "question"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "get_youtube_transcript",
                "description": (
                    "Fetch the transcript of a YouTube video. "
                    "Returns 'TRANSCRIPT_UNAVAILABLE' if cloud-blocked – "
                    "in that case use search_web to find info about the video."
                ),
                "parameters": {
                    "type": "object",
                    "properties": {"video_url": {"type": "string"}},
                    "required": ["video_url"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "search_web",
                "description": "Search the web using DuckDuckGo. Returns top snippets.",
                "parameters": {
                    "type": "object",
                    "properties": {"query": {"type": "string"}},
                    "required": ["query"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "fetch_webpage",
                "description": "Fetch and read the full text of any URL.",
                "parameters": {
                    "type": "object",
                    "properties": {"url": {"type": "string"}},
                    "required": ["url"],
                },
            },
        },
        {
            "type": "function",
            "function": {
                "name": "fetch_wikipedia",
                "description": (
                    "Fetch a Wikipedia article by exact title via the REST API "
                    "(avoids 403 errors). Use for all Wikipedia lookups."
                ),
                "parameters": {
                    "type": "object",
                    "properties": {"title": {"type": "string"}},
                    "required": ["title"],
                },
            },
        },
    ]

    # ── main ──────────────────────────────────────────────────────────────────

    @staticmethod
    def _parse_tool_args(raw) -> dict:
        """Decode a tool call's JSON arguments; anything but an object is ``{}``."""
        try:
            args = json.loads(raw or "{}")
        except (TypeError, ValueError):
            return {}
        return args if isinstance(args, dict) else {}

    def _run_tool(self, name: str, args: dict, question: str, task_id: str) -> str:
        """Execute one requested tool and return its result as text.

        Exceptions are reported back as a ``Tool error`` string so that a
        single failing tool does not abort the whole task.
        """
        # Trust the real task ID over whatever the model passed along.
        tid = task_id or args.get("task_id", "")
        try:
            if name == "get_task_file":
                fb, ct = self.fetch_task_file(tid)
                if not fb:
                    return "NO_FILE"
                return f"File attached – content_type: {ct}, size: {len(fb)} bytes."
            if name == "analyse_image_file":
                return self._analyse_image(tid, args.get("question") or question)
            if name == "get_youtube_transcript":
                return self.fetch_youtube_transcript(args.get("video_url", ""))
            if name == "search_web":
                return self.search_web(args.get("query", ""))
            if name == "fetch_webpage":
                return self.fetch_webpage(args.get("url", ""))
            if name == "fetch_wikipedia":
                return self.fetch_wikipedia(args.get("title", ""))
            return "Unknown tool."
        except Exception as e:
            return f"Tool error ({name}): {e}"

    def __call__(self, question: str, task_id: str = "") -> str:
        """Answer ``question`` for GAIA task ``task_id``.

        Returns:
            The model's final answer, stripped of surrounding whitespace, or
            ``"Error finding answer."`` if an OpenAI request fails.
        """
        print(f"▶ Task {task_id[:8]}: {question[:80]}")
        messages = self._build_messages(question, task_id)

        for _ in range(self.MAX_TOOL_ROUNDS):
            try:
                resp = self.client.chat.completions.create(
                    model=self.MODEL,
                    messages=messages,
                    tools=self.TOOLS,
                    tool_choice="auto",
                    temperature=0,
                    max_tokens=1200,
                )
            except Exception as e:
                print(f"  OpenAI error: {e}")
                return "Error finding answer."

            msg = resp.choices[0].message

            if not msg.tool_calls:
                return (msg.content or "").strip()

            # Echo the assistant turn so each tool result has a matching call.
            messages.append({
                "role": "assistant",
                "content": msg.content,
                "tool_calls": [
                    {
                        "id": tc.id,
                        "type": "function",
                        "function": {
                            "name": tc.function.name,
                            "arguments": tc.function.arguments,
                        },
                    }
                    for tc in msg.tool_calls
                ],
            })

            for tc in msg.tool_calls:
                result = self._run_tool(
                    tc.function.name,
                    self._parse_tool_args(tc.function.arguments),
                    question,
                    task_id,
                )
                messages.append({
                    "role": "tool",
                    "tool_call_id": tc.id,
                    "content": str(result) if result else "Empty result.",
                })

        # Force final answer after max rounds
        try:
            messages.append({
                "role": "user",
                "content": "Give your best final answer now – value only, no explanation.",
            })
            resp = self.client.chat.completions.create(
                model=self.MODEL,
                messages=messages,
                temperature=0,
                max_tokens=200,
            )
            return (resp.choices[0].message.content or "").strip()
        except Exception:
            return "Error finding answer."
454
 
455
 
456
+ # ── Gradio UI ────────────────────────────────────────────────────────────────
457
 
458
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Run the agent on every GAIA question and submit the answers.

    Args:
        profile: The logged-in Hugging Face profile, or ``None`` when the
            user has not logged in.

    Returns:
        ``(status_message, results)`` where ``results`` is a
        ``pandas.DataFrame`` with one row per processed task (columns
        ``Task ID``, ``Question``, ``Answer``), or ``None`` when the run
        stopped before any task was processed.
    """
    if not profile:
        return "Please login to Hugging Face first.", None

    username = (profile.username or "").strip()
    space_id = os.getenv("SPACE_ID", "")
    api_url = DEFAULT_API_URL
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"

    try:
        agent = BasicAgent()
    except Exception as e:
        return f"Initialisation failed: {e}", None

    try:
        qs = requests.get(f"{api_url}/questions", timeout=15)
        qs.raise_for_status()
        questions_data = qs.json()
    except Exception as e:
        return f"Error fetching questions: {e}", None

    if not isinstance(questions_data, list) or not questions_data:
        return "Error fetching questions: empty or malformed question list.", None

    results_log, answers_payload = [], []

    for item in questions_data:
        task_id = item.get("task_id", "") if isinstance(item, dict) else ""
        question_text = item.get("question", "") if isinstance(item, dict) else ""
        if not task_id or not question_text:
            # An answer without a task ID cannot be scored; skip the item.
            print(f"Skipping malformed question item: {item!r}")
            continue
        try:
            answer = agent(question_text, task_id=task_id)
        except Exception as e:
            answer = f"Error: {e}"

        answers_payload.append({"task_id": task_id, "submitted_answer": answer})
        results_log.append({
            "Task ID": task_id,
            "Question": question_text[:120],
            "Answer": answer,
        })

    if not answers_payload:
        return "Submission failed: no answers were produced.", pd.DataFrame(results_log)

    try:
        r = requests.post(
            f"{api_url}/submit",
            json={
                "username": username,
                "agent_code": agent_code,
                "answers": answers_payload,
            },
            timeout=60,
        )
        r.raise_for_status()
        res = r.json()
        status = (
            f"✅ Submitted!\n"
            f"Score: {res.get('score')}% "
            f"({res.get('correct_count')}/{res.get('total_attempted')})\n"
            f"Message: {res.get('message')}"
        )
    except Exception as e:
        status = f"Submission failed: {e}"

    return status, pd.DataFrame(results_log)
518
 
519
 
520
# Gradio front end: a login button, a run button, and two outputs fed by
# run_and_submit_all (status text and the per-task results table).
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🤖 GAIA Agent Evaluation")
    gr.Markdown(
        "Fixes applied: chess image via GPT-4o vision · YouTube IP-block fallback · "
        "correct LibreTexts canonical URL · solo-only discography counting · "
        "stable DDG HTML scrape."
    )
    gr.LoginButton()
    run_button = gr.Button("🚀 Run Evaluation & Submit", variant="primary")
    status_output = gr.Textbox(label="Status", lines=5)
    results_table = gr.DataFrame(label="Results")
    # No explicit inputs: the handler's only parameter is the OAuth profile.
    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])

if __name__ == "__main__":
    demo.launch()