maytemuma committed on
Commit
634f467
·
verified ·
1 Parent(s): 01674ea

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +74 -70
app.py CHANGED
@@ -1,4 +1,5 @@
1
  import os
 
2
  import gradio as gr
3
  import requests
4
  import pandas as pd
@@ -66,7 +67,7 @@ def download_file_from_api(task_id: str) -> str:
66
  text = ""
67
  for page in reader.pages:
68
  text += page.extract_text() or ""
69
- return text[:15000] if text.strip() else "PDF found but could not extract text (may be scanned/image-based)."
70
  except Exception as e:
71
  return f"PDF file detected but error reading: {str(e)}"
72
 
@@ -102,7 +103,7 @@ def download_file_from_api(task_id: str) -> str:
102
  # --- FALLBACK ---
103
  with tempfile.NamedTemporaryFile(delete=False, suffix=".bin") as f:
104
  f.write(response.content)
105
- return f"File downloaded to {f.name} (type: {content_type}). Size: {len(response.content)} bytes. Could not auto-parse."
106
 
107
  except Exception as e:
108
  return f"Error downloading file for task {task_id}: {str(e)}"
@@ -111,8 +112,8 @@ def download_file_from_api(task_id: str) -> str:
111
  @tool
112
  def describe_image(image_path: str) -> str:
113
  """Describes the content of an image file using an AI vision model.
114
- Use this when you have an image file path (e.g. from IMAGE_FILE_SAVED)
115
- and need to understand what the image shows, including any text in it.
116
 
117
  Args:
118
  image_path: The local file path to the image to describe.
@@ -120,26 +121,22 @@ def describe_image(image_path: str) -> str:
120
  try:
121
  from huggingface_hub import InferenceClient
122
 
123
- token = os.getenv("HF_TOKEN")
124
- client = InferenceClient(token=token)
125
 
126
  with open(image_path, "rb") as f:
127
  image_bytes = f.read()
128
 
129
- # Use BLIP2 for image captioning
130
  result = client.image_to_text(
131
  image=image_bytes,
132
  model="Salesforce/blip2-opt-2.7b",
133
  )
134
 
135
  if isinstance(result, str):
136
- description = result
137
  elif hasattr(result, "generated_text"):
138
- description = result.generated_text
139
  else:
140
- description = str(result)
141
-
142
- return f"Image description: {description}"
143
 
144
  except Exception as e:
145
  return f"Could not describe image at {image_path}. Error: {str(e)}"
@@ -148,8 +145,7 @@ def describe_image(image_path: str) -> str:
148
  @tool
149
  def transcribe_audio(audio_path: str) -> str:
150
  """Transcribes an audio file to text using Whisper speech recognition.
151
- Use this when you have an audio file path (e.g. from AUDIO_FILE_SAVED)
152
- and need to know what is spoken in the recording.
153
 
154
  Args:
155
  audio_path: The local file path to the audio file to transcribe.
@@ -157,8 +153,7 @@ def transcribe_audio(audio_path: str) -> str:
157
  try:
158
  from huggingface_hub import InferenceClient
159
 
160
- token = os.getenv("HF_TOKEN")
161
- client = InferenceClient(token=token)
162
 
163
  with open(audio_path, "rb") as f:
164
  audio_bytes = f.read()
@@ -198,7 +193,6 @@ def read_local_file(file_path: str) -> str:
198
  @tool
199
  def execute_python_file(file_path: str) -> str:
200
  """Executes a Python script file and returns its stdout output.
201
- Use this when you receive a .py file that needs to be run to get the answer.
202
 
203
  Args:
204
  file_path: The path to the Python file to execute.
@@ -216,8 +210,6 @@ def execute_python_file(file_path: str) -> str:
216
  output += result.stdout
217
  if result.stderr:
218
  output += f"\nSTDERR: {result.stderr}"
219
- if result.returncode != 0:
220
- output += f"\nReturn code: {result.returncode}"
221
  return output.strip() if output.strip() else "Script executed but produced no output."
222
  except subprocess.TimeoutExpired:
223
  return "Script execution timed out after 30 seconds."
@@ -229,22 +221,7 @@ def execute_python_file(file_path: str) -> str:
229
  # AGENT CLASS
230
  # =============================================
231
 
232
- class BasicAgent:
233
- """An agent using smolagents CodeAgent with web search, file handling,
234
- image description, and audio transcription tools.
235
- Uses HF Inference API — no GPU needed."""
236
-
237
- def __init__(self):
238
- print("Initializing SmolAgent for GAIA benchmark...")
239
-
240
- model = InferenceClientModel(
241
- model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
242
- token=os.getenv("HF_TOKEN"),
243
- max_tokens=2096,
244
- temperature=0.1,
245
- )
246
-
247
- custom_instructions = """You are a precise AI assistant solving GAIA benchmark questions.
248
 
249
  CRITICAL RULES FOR ANSWERING:
250
  1. Your final answer must be ONLY the answer itself — no explanations, no "The answer is", no extra words.
@@ -255,8 +232,8 @@ CRITICAL RULES FOR ANSWERING:
255
 
256
  TOOL USAGE RULES:
257
  6. If a question mentions an attached file, image, audio, spreadsheet, or document, FIRST use download_file_from_api with the task_id.
258
- 7. If download returns "IMAGE_FILE_SAVED:/some/path", then call describe_image("/some/path") to see what the image contains.
259
- 8. If download returns "AUDIO_FILE_SAVED:/some/path", then call transcribe_audio("/some/path") to hear what is said.
260
  9. If the file is a Python script (.py), you can use read_local_file to view it or execute_python_file to run it.
261
  10. Use DuckDuckGoSearchTool when you need factual information from the internet.
262
  11. Use visit_webpage to read the full content of a specific URL.
@@ -266,6 +243,23 @@ REASONING:
266
  13. Double-check your answer before giving it.
267
  """
268
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  self.agent = CodeAgent(
270
  model=model,
271
  tools=[
@@ -278,14 +272,15 @@ REASONING:
278
  execute_python_file,
279
  ],
280
  max_steps=10,
281
- verbosity_level=1,
282
- instructions=custom_instructions,
283
  additional_authorized_imports=[
284
  "json", "re", "math", "datetime", "collections",
285
  "csv", "io", "os", "tempfile", "subprocess",
286
  "base64", "hashlib", "unicodedata", "string",
287
  ],
288
  )
 
289
  print("SmolAgent initialized successfully!")
290
 
291
  def __call__(self, question: str, task_id: str = None) -> str:
@@ -304,37 +299,43 @@ Question: {question}
304
 
305
  Remember: respond with ONLY the final answer, nothing else."""
306
 
307
- try:
308
- result = self.agent.run(prompt)
309
- answer = str(result).strip()
310
-
311
- # Clean up common LLM prefixes
312
- prefixes_to_remove = [
313
- "The answer is ", "The answer is: ",
314
- "Answer: ", "FINAL ANSWER: ",
315
- "Final answer: ", "The final answer is ",
316
- "The final answer is: ", "Result: ",
317
- ]
318
- for prefix in prefixes_to_remove:
319
- if answer.lower().startswith(prefix.lower()):
320
- answer = answer[len(prefix):].strip()
321
-
322
- # Remove wrapping quotes
323
- if len(answer) > 2 and \
324
- ((answer.startswith('"') and answer.endswith('"')) or
325
- (answer.startswith("'") and answer.endswith("'"))):
326
- answer = answer[1:-1].strip()
327
-
328
- # Remove trailing period for short answers
329
- if answer.endswith(".") and len(answer.split()) <= 5:
330
- answer = answer[:-1].strip()
331
-
332
- print(f"Final answer: {answer}")
333
- return answer
 
 
334
 
335
- except Exception as e:
336
- print(f"Agent error: {e}")
337
- return "Unable to determine the answer."
 
 
 
 
338
 
339
 
340
  # =============================================
@@ -405,6 +406,9 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
405
  "Submitted Answer": f"AGENT ERROR: {e}"
406
  })
407
 
 
 
 
408
  if not answers_payload:
409
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
410
 
@@ -448,7 +452,7 @@ with gr.Blocks() as demo:
448
  gr.Markdown("# 🤖 GAIA Agent — Final Assignment")
449
  gr.Markdown(
450
  """
451
- **Agent**: SmolAgent (CodeAgent) with Qwen2.5-Coder-32B via HF Inference API
452
 
453
  **Tools**: Web Search · Webpage Visitor · File Downloader · Image Describer · Audio Transcriber · Python Executor
454
 
 
1
  import os
2
+ import time
3
  import gradio as gr
4
  import requests
5
  import pandas as pd
 
67
  text = ""
68
  for page in reader.pages:
69
  text += page.extract_text() or ""
70
+ return text[:15000] if text.strip() else "PDF found but could not extract text."
71
  except Exception as e:
72
  return f"PDF file detected but error reading: {str(e)}"
73
 
 
103
  # --- FALLBACK ---
104
  with tempfile.NamedTemporaryFile(delete=False, suffix=".bin") as f:
105
  f.write(response.content)
106
+ return f"File downloaded to {f.name} (type: {content_type}). Size: {len(response.content)} bytes."
107
 
108
  except Exception as e:
109
  return f"Error downloading file for task {task_id}: {str(e)}"
 
112
  @tool
113
  def describe_image(image_path: str) -> str:
114
  """Describes the content of an image file using an AI vision model.
115
+ Use this when you have an image file path (from IMAGE_FILE_SAVED)
116
+ and need to understand what the image shows.
117
 
118
  Args:
119
  image_path: The local file path to the image to describe.
 
121
  try:
122
  from huggingface_hub import InferenceClient
123
 
124
+ client = InferenceClient(token=os.getenv("HF_TOKEN"))
 
125
 
126
  with open(image_path, "rb") as f:
127
  image_bytes = f.read()
128
 
 
129
  result = client.image_to_text(
130
  image=image_bytes,
131
  model="Salesforce/blip2-opt-2.7b",
132
  )
133
 
134
  if isinstance(result, str):
135
+ return f"Image description: {result}"
136
  elif hasattr(result, "generated_text"):
137
+ return f"Image description: {result.generated_text}"
138
  else:
139
+ return f"Image description: {str(result)}"
 
 
140
 
141
  except Exception as e:
142
  return f"Could not describe image at {image_path}. Error: {str(e)}"
 
145
  @tool
146
  def transcribe_audio(audio_path: str) -> str:
147
  """Transcribes an audio file to text using Whisper speech recognition.
148
+ Use this when you have an audio file path (from AUDIO_FILE_SAVED).
 
149
 
150
  Args:
151
  audio_path: The local file path to the audio file to transcribe.
 
153
  try:
154
  from huggingface_hub import InferenceClient
155
 
156
+ client = InferenceClient(token=os.getenv("HF_TOKEN"))
 
157
 
158
  with open(audio_path, "rb") as f:
159
  audio_bytes = f.read()
 
193
  @tool
194
  def execute_python_file(file_path: str) -> str:
195
  """Executes a Python script file and returns its stdout output.
 
196
 
197
  Args:
198
  file_path: The path to the Python file to execute.
 
210
  output += result.stdout
211
  if result.stderr:
212
  output += f"\nSTDERR: {result.stderr}"
 
 
213
  return output.strip() if output.strip() else "Script executed but produced no output."
214
  except subprocess.TimeoutExpired:
215
  return "Script execution timed out after 30 seconds."
 
221
  # AGENT CLASS
222
  # =============================================
223
 
224
+ CUSTOM_INSTRUCTIONS = """You are a precise AI assistant solving GAIA benchmark questions.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
  CRITICAL RULES FOR ANSWERING:
227
  1. Your final answer must be ONLY the answer itself — no explanations, no "The answer is", no extra words.
 
232
 
233
  TOOL USAGE RULES:
234
  6. If a question mentions an attached file, image, audio, spreadsheet, or document, FIRST use download_file_from_api with the task_id.
235
+ 7. If download returns "IMAGE_FILE_SAVED:/some/path", then call describe_image with that path.
236
+ 8. If download returns "AUDIO_FILE_SAVED:/some/path", then call transcribe_audio with that path.
237
  9. If the file is a Python script (.py), you can use read_local_file to view it or execute_python_file to run it.
238
  10. Use DuckDuckGoSearchTool when you need factual information from the internet.
239
  11. Use visit_webpage to read the full content of a specific URL.
 
243
  13. Double-check your answer before giving it.
244
  """
245
 
246
+
247
+ class BasicAgent:
248
+ """Agent using smolagents CodeAgent with HF Inference API."""
249
+
250
+ def __init__(self):
251
+ print("Initializing SmolAgent for GAIA benchmark...")
252
+
253
+ # Use the default model with Nebius provider for better reliability
254
+ model = InferenceClientModel(
255
+ model_id="Qwen/Qwen2.5-Coder-32B-Instruct",
256
+ provider="nebius",
257
+ token=os.getenv("HF_TOKEN"),
258
+ timeout=180,
259
+ max_tokens=2096,
260
+ temperature=0.1,
261
+ )
262
+
263
  self.agent = CodeAgent(
264
  model=model,
265
  tools=[
 
272
  execute_python_file,
273
  ],
274
  max_steps=10,
275
+ verbosity_level=2,
276
+ instructions=CUSTOM_INSTRUCTIONS,
277
  additional_authorized_imports=[
278
  "json", "re", "math", "datetime", "collections",
279
  "csv", "io", "os", "tempfile", "subprocess",
280
  "base64", "hashlib", "unicodedata", "string",
281
  ],
282
  )
283
+
284
  print("SmolAgent initialized successfully!")
285
 
286
  def __call__(self, question: str, task_id: str = None) -> str:
 
299
 
300
  Remember: respond with ONLY the final answer, nothing else."""
301
 
302
+ # Retry logic: try up to 2 times
303
+ for attempt in range(2):
304
+ try:
305
+ result = self.agent.run(prompt)
306
+ answer = str(result).strip()
307
+
308
+ # Clean up common LLM prefixes
309
+ prefixes_to_remove = [
310
+ "The answer is ", "The answer is: ",
311
+ "Answer: ", "FINAL ANSWER: ",
312
+ "Final answer: ", "The final answer is ",
313
+ "The final answer is: ", "Result: ",
314
+ ]
315
+ for prefix in prefixes_to_remove:
316
+ if answer.lower().startswith(prefix.lower()):
317
+ answer = answer[len(prefix):].strip()
318
+
319
+ # Remove wrapping quotes
320
+ if len(answer) > 2 and \
321
+ ((answer.startswith('"') and answer.endswith('"')) or
322
+ (answer.startswith("'") and answer.endswith("'"))):
323
+ answer = answer[1:-1].strip()
324
+
325
+ # Remove trailing period for short answers
326
+ if answer.endswith(".") and len(answer.split()) <= 5:
327
+ answer = answer[:-1].strip()
328
+
329
+ print(f"Final answer: {answer}")
330
+ return answer
331
 
332
+ except Exception as e:
333
+ print(f"Agent error (attempt {attempt + 1}): {e}")
334
+ if attempt == 0:
335
+ print("Retrying in 5 seconds...")
336
+ time.sleep(5)
337
+
338
+ return "Unable to determine the answer."
339
 
340
 
341
  # =============================================
 
406
  "Submitted Answer": f"AGENT ERROR: {e}"
407
  })
408
 
409
+ # Small delay between questions to avoid rate limiting
410
+ time.sleep(2)
411
+
412
  if not answers_payload:
413
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
414
 
 
452
  gr.Markdown("# 🤖 GAIA Agent — Final Assignment")
453
  gr.Markdown(
454
  """
455
+ **Agent**: SmolAgent (CodeAgent) with Qwen2.5-Coder-32B via Nebius (HF Inference)
456
 
457
  **Tools**: Web Search · Webpage Visitor · File Downloader · Image Describer · Audio Transcriber · Python Executor
458