Vinsmart06 committed on
Commit
cd66b27
Β·
verified Β·
1 Parent(s): 8afb125

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +182 -118
app.py CHANGED
@@ -64,164 +64,228 @@ def youtube_captions(self, url):
64
  from openai import OpenAI
65
 
66
class BasicAgent:
    """GAIA benchmark agent: routes each question through an LLM tool loop.

    Heavy resources (the OpenAI client and the Whisper model) are created
    once here and reused for every question.
    """

    def __init__(self):
        print("🚀 Super GAIA Agent initialized")
        self.client = OpenAI()
        # Load Whisper a single time; reloading per question would dominate runtime.
        self.audio_model = whisper.load_model("base")

    def __call__(self, question, file_url=None):
        """Callable facade: delegate straight to the reasoning loop."""
        return self.agent_loop(question, file_url)
74
- def read_audio(self, file_url):
75
- try:
76
- r = requests.get(file_url, timeout=20)
77
- with open("temp_audio.mp3", "wb") as f: f.write(r.content)
78
- result = self.audio_model.transcribe("temp_audio.mp3")
79
- return result
80
- except Exception as e:
81
- return f"Audio error: {str(e)}"
82
- def download_file(self, url):
83
- if not url or not url.startswith("http"):
84
- return None
85
- try:
86
- r = requests.get(url, timeout=20)
87
- file_name = url.split("/")[-1] or "temp_file"
88
- with open(file_name, "wb") as f:
89
- f.write(r.content)
90
- return file_name
91
- except Exception as e:
92
- print(f"Download error: {e}")
93
- return None
94
 
95
- # --- Robust Wikipedia Tool ---
 
 
 
96
  def wiki_search(self, query):
97
  try:
98
  query = query.strip(' ".,')
99
- # Step 1: find the best matching title
100
- search = requests.get(
101
  "https://en.wikipedia.org/w/api.php",
102
  params={"action": "query", "list": "search", "srsearch": query,
103
- "format": "json", "srlimit": 1},
104
  timeout=10
105
  ).json()
106
- if not search.get("query", {}).get("search"):
107
- return f"No Wikipedia results for '{query}'"
108
- title = search["query"]["search"][0]["title"]
109
-
110
- # Step 2: fetch the full summary via REST API
111
  summary = requests.get(
112
- f"https://en.wikipedia.org/api/rest_v1/page/summary/{title.replace(' ', '_')}",
113
  timeout=10
114
  ).json()
115
- return f"WIKI: {title}\n{summary.get('extract', 'No extract found.')}"
 
 
 
116
  except Exception as e:
117
- return f"Wiki error: {str(e)}"
118
-
119
-
120
-
121
- def execute_tool(self, tool, input_data, file_url):
122
- input_data = input_data.strip(' ".,')
123
- # If agent provides no URL or says 'none', use the system-provided file_url
124
- target = file_url if (not input_data or "http" not in input_data) else input_data
125
-
126
  try:
127
- if tool == "wiki_search": return self.wiki_search(input_data)
128
-
129
- if tool in ["read_image", "read_excel", "read_audio"]:
130
- if not target: return "Error: No file URL available for this task."
131
- r = requests.get(target, timeout=20)
132
- ext = target.split('.')[-1].lower() if '.' in target else 'tmp'
133
- with open(f"temp.{ext}", "wb") as f: f.write(r.content)
134
-
135
- if tool == "read_image": return f"IMAGE_CONTENT: {pytesseract.image_to_string(Image.open(f'temp.{ext}'))}"
136
- if tool == "read_excel": return f"EXCEL_DATA: {pd.read_excel(f'temp.{ext}').to_string()[:3000]}"
137
- if tool == "read_audio": return f"TRANSCRIPT: {self.audio_model.transcribe(f'temp.{ext}')}"
138
-
139
- if tool == "scrape_page":
140
- soup = BeautifulSoup(requests.get(input_data, timeout=10).text, "html.parser")
141
- return f"PAGE_TEXT: {soup.get_text()[:4000]}"
142
  except Exception as e:
143
- return f"Tool error: {str(e)}"
144
- return f"Unknown tool: {tool}"
145
 
146
-
 
 
 
 
 
 
 
 
 
 
147
 
148
- # 2. Handle web/search tools
 
 
 
 
 
 
 
 
 
 
149
 
150
- if tool == "youtube_captions":
151
- return self.youtube_captions(input_data)
 
 
 
 
 
 
 
 
 
 
 
 
152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
153
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
154
  def agent_loop(self, question, file_url):
155
- # Pre-load file content if available
156
  pre_context = ""
157
  if file_url:
158
- ext = file_url.split('.')[-1].lower()
159
- if ext in ['mp3', 'wav', 'ogg', 'm4a']:
160
- result = self.execute_tool("read_audio", file_url, file_url)
161
- pre_context = f"\nFILE TRANSCRIPTION: {result}"
162
- elif ext in ['xlsx', 'xls']:
163
- result = self.execute_tool("read_excel", file_url, file_url)
164
- pre_context = f"\nEXCEL DATA: {result}"
165
- elif ext in ['png', 'jpg', 'jpeg']:
166
- result = self.execute_tool("read_image", file_url, file_url)
167
- pre_context = f"\nIMAGE TEXT: {result}"
168
  elif ext == 'py':
169
- r = requests.get(file_url, timeout=10)
170
- pre_context = f"\nPYTHON CODE:\n{r.text[:3000]}"
171
-
 
 
172
  memory = pre_context # seed memory with file content
173
- for step in range(10):
174
- prompt = f"""You are a precise GAIA solver.
175
- FILE_URL (use this for file tools if available): {file_url if file_url else 'None'}
176
-
177
- AVAILABLE TOOLS: wiki_search, read_image, read_audio, read_excel, scrape_page
178
-
179
- RULES:
180
- - If FILE_URL is not None, use it as INPUT when calling read_image, read_audio, or read_excel
181
- - NEVER use INPUT: none β€” always use the FILE_URL as INPUT for file tools
182
- - For web questions, use wiki_search or scrape_page
183
- - Decode reversed text before answering
184
- - Botanical vegetables only: exclude all items containing seeds (tomatoes, peppers, zucchini, corn, beans, peas)
185
- - Botanical herbs like basil are NOT vegetables
186
- - For YouTube video questions, use scrape_page with the YouTube URL
187
- - For questions about papers or articles, use scrape_page on the URL
188
- - For any factual lookup not in a file, use wiki_search first, then scrape_page
189
- Question: {question}
190
- History: {memory}
191
-
192
- Respond EXACTLY:
193
- TOOL: tool_name
194
- INPUT: tool_input
195
- OR
196
- FINAL: your_precise_answer"""
197
 
198
  response = self.client.chat.completions.create(
199
- model="gpt-4o",
200
- #model ="gpt-5.2-chat-latest",
201
  temperature=0,
202
- messages=[{"role": "system", "content": "You are a scientific agent. Always use tools to verify facts before answering."},
203
- {"role": "user", "content": prompt}]
 
 
204
  )
205
-
206
  resp = response.choices[0].message.content.strip()
207
- print(f"Step {step}: {resp}")
208
 
 
209
  if "FINAL:" in resp:
210
  return resp.split("FINAL:")[-1].strip()
211
 
212
- # Improved regex to handle tool names regardless of trailing punctuation
213
  t_match = re.search(r"TOOL:\s*(\w+)", resp, re.I)
214
- i_match = re.search(r"INPUT:\s*(.*)", resp, re.I)
215
-
216
  if t_match and i_match:
217
- t_name = t_match.group(1).lower().strip()
218
- t_input = i_match.group(1).strip()
219
- result = self.execute_tool(t_name, t_input, file_url)
220
- memory += f"\nStep {step} - {t_name} output: {result[:1200]}"
 
 
 
 
 
 
 
 
221
  else:
222
- memory += f"\nStep {step} - Thought: {resp}"
223
-
224
- return "No answer found."
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
  def run_and_submit_all( profile: gr.OAuthProfile | None):
227
  """
 
64
  from openai import OpenAI
65
 
66
class BasicAgent:
    """Super GAIA agent: an LLM-driven tool loop with pre-loaded file context."""

    def __init__(self):
        print("🚀 Super GAIA Agent initialized")
        self.client = OpenAI()
        self.audio_model = whisper.load_model("base")  # loaded once, reused per call

    def __call__(self, question, file_url=None):
        # Callable facade over the main reasoning loop.
        return self.agent_loop(question, file_url)
75
+ # ── TOOL: Wikipedia ──────────────────────────────────────────────
76
  def wiki_search(self, query):
77
  try:
78
  query = query.strip(' ".,')
79
+ # 1. Find best matching title
80
+ r = requests.get(
81
  "https://en.wikipedia.org/w/api.php",
82
  params={"action": "query", "list": "search", "srsearch": query,
83
+ "format": "json", "srlimit": 3},
84
  timeout=10
85
  ).json()
86
+ results = r.get("query", {}).get("search", [])
87
+ if not results:
88
+ return f"No Wikipedia results for: {query}"
89
+ title = results[0]["title"]
90
+ # 2. Get full extract via REST
91
  summary = requests.get(
92
+ f"https://en.wikipedia.org/api/rest_v1/page/summary/{requests.utils.quote(title)}",
93
  timeout=10
94
  ).json()
95
+ extract = summary.get("extract", "")
96
+ if not extract:
97
+ return f"No extract for: {title}"
98
+ return f"WIKI [{title}]: {extract[:3000]}"
99
  except Exception as e:
100
+ return f"Wiki error: {e}"
101
+
102
+ # ── TOOL: Scrape web page ─────────────────────────────────────────
103
+ def scrape_page(self, url):
104
+ url = url.strip(' "')
105
+ # Block YouTube β€” it never returns useful content via scraping
106
+ if "youtube.com" in url or "youtu.be" in url:
107
+ return "YouTube pages cannot be scraped. Use yt-dlp captions instead or search for video transcript online."
 
108
  try:
109
+ headers = {"User-Agent": "Mozilla/5.0"}
110
+ resp = requests.get(url, timeout=15, headers=headers)
111
+ soup = BeautifulSoup(resp.text, "html.parser")
112
+ # Remove nav/footer/script noise
113
+ for tag in soup(["script", "style", "nav", "footer", "header"]):
114
+ tag.decompose()
115
+ text = soup.get_text(separator=" ", strip=True)
116
+ return f"PAGE [{url[:60]}]: {text[:4000]}"
 
 
 
 
 
 
 
117
  except Exception as e:
118
+ return f"Scrape error: {e}"
 
119
 
120
+ # ── TOOL: Read audio via Whisper ──────────────────────────────────
121
+ def read_audio(self, url):
122
+ try:
123
+ url = url.strip(' "')
124
+ r = requests.get(url, timeout=30)
125
+ with open("temp_audio_file.mp3", "wb") as f:
126
+ f.write(r.content)
127
+ result = self.audio_model.transcribe("temp_audio_file.mp3")
128
+ return f"TRANSCRIPT: {result['text']}"
129
+ except Exception as e:
130
+ return f"Audio error: {e}"
131
 
132
+ # ── TOOL: Read Excel ──────────────────────────────────────────────
133
+ def read_excel(self, url):
134
+ try:
135
+ url = url.strip(' "')
136
+ r = requests.get(url, timeout=20)
137
+ with open("temp_file.xlsx", "wb") as f:
138
+ f.write(r.content)
139
+ df = pd.read_excel("temp_file.xlsx")
140
+ return f"EXCEL_DATA:\n{df.to_string()[:4000]}"
141
+ except Exception as e:
142
+ return f"Excel error: {e}"
143
 
144
+ # ── TOOL: Read image via OCR ──────────────────────────────────────
145
+ def read_image(self, url):
146
+ try:
147
+ url = url.strip(' "')
148
+ r = requests.get(url, timeout=20)
149
+ ext = url.split('.')[-1].lower() or 'png'
150
+ fname = f"temp_img.{ext}"
151
+ with open(fname, "wb") as f:
152
+ f.write(r.content)
153
+ img = Image.open(fname)
154
+ text = pytesseract.image_to_string(img)
155
+ return f"IMAGE_TEXT: {text[:3000]}" if text.strip() else "IMAGE_TEXT: (no text detected by OCR)"
156
+ except Exception as e:
157
+ return f"Image error: {e}"
158
 
159
+ # ── TOOL: Execute Python code ─────────────────────────────────────
160
+ def run_python(self, url):
161
+ try:
162
+ url = url.strip(' "')
163
+ r = requests.get(url, timeout=15)
164
+ code = r.text
165
+ # Safe exec with captured stdout
166
+ import io, contextlib
167
+ stdout = io.StringIO()
168
+ with contextlib.redirect_stdout(stdout):
169
+ exec(code, {})
170
+ output = stdout.getvalue()
171
+ return f"PYTHON_OUTPUT: {output[:2000]}" if output else "PYTHON_OUTPUT: (no print output)"
172
+ except Exception as e:
173
+ return f"Python exec error: {e}"
174
 
175
+ # ── Route tool calls ──────────────────────────────────────────────
176
+ def execute_tool(self, tool, input_data, file_url):
177
+ # Use file_url as fallback when input_data has no URL
178
+ target = input_data.strip(' "')
179
+ if not target.startswith("http") and file_url:
180
+ target = file_url
181
+
182
+ if tool == "wiki_search":
183
+ return self.wiki_search(input_data)
184
+ elif tool == "scrape_page":
185
+ return self.scrape_page(target)
186
+ elif tool == "read_audio":
187
+ return self.read_audio(target)
188
+ elif tool == "read_excel":
189
+ return self.read_excel(target)
190
+ elif tool == "read_image":
191
+ return self.read_image(target)
192
+ elif tool == "run_python":
193
+ return self.run_python(target)
194
+ else:
195
+ return f"Unknown tool: {tool}"
196
+
197
+ # ── Main agent loop ───────────────────────────────────────────────
198
  def agent_loop(self, question, file_url):
199
+ # ── PRE-LOAD: handle file-based questions before the loop ──
200
  pre_context = ""
201
  if file_url:
202
+ ext = file_url.split('.')[-1].lower().split('?')[0]
203
+ print(f" [Pre-load] detected file ext={ext}, url={file_url}")
204
+ if ext in ['mp3', 'wav', 'ogg', 'm4a', 'flac']:
205
+ pre_context = self.read_audio(file_url)
206
+ elif ext in ['xlsx', 'xls', 'csv']:
207
+ pre_context = self.read_excel(file_url)
208
+ elif ext in ['png', 'jpg', 'jpeg', 'gif', 'webp']:
209
+ pre_context = self.read_image(file_url)
 
 
210
  elif ext == 'py':
211
+ try:
212
+ pre_context = "PYTHON_CODE:\n" + requests.get(file_url, timeout=10).text[:3000]
213
+ except:
214
+ pass
215
+
216
  memory = pre_context # seed memory with file content
217
+
218
+ system_prompt = """You are a precise GAIA benchmark solver.
219
+ Rules:
220
+ - Always output exactly: TOOL: tool_name\\nINPUT: tool_input OR FINAL: answer
221
+ - Never repeat a failed tool call with the same input β€” change the query or try a different tool
222
+ - For math/logic questions: reason step by step then output FINAL
223
+ - Botanical rule: vegetables are plant parts that are NOT fruits. Seeds inside = botanical fruit (tomato, pepper, corn, zucchini, green beans, peas, cucumber). Roots/stems/leaves = vegetable (carrot, celery, lettuce, broccoli, sweet potato). Basil = herb, not vegetable.
224
+ - For reversed text: decode it completely before answering"""
225
+
226
+ for step in range(10):
227
+ # Build prompt with all context
228
+ prompt = f"""FILE_URL: {file_url if file_url else 'None'}
229
+
230
+ QUESTION: {question}
231
+
232
+ ACCUMULATED KNOWLEDGE:
233
+ {memory if memory else '(none yet)'}
234
+
235
+ AVAILABLE TOOLS: wiki_search, scrape_page, read_audio, read_excel, read_image, run_python
236
+
237
+ What is your next action? Output TOOL+INPUT or FINAL:"""
 
 
 
238
 
239
  response = self.client.chat.completions.create(
240
+ model="gpt-4o", # upgraded from gpt-4o-mini
 
241
  temperature=0,
242
+ messages=[
243
+ {"role": "system", "content": system_prompt},
244
+ {"role": "user", "content": prompt}
245
+ ]
246
  )
247
+
248
  resp = response.choices[0].message.content.strip()
249
+ print(f" Step {step}: {resp[:120]}")
250
 
251
+ # ── Check for final answer ──
252
  if "FINAL:" in resp:
253
  return resp.split("FINAL:")[-1].strip()
254
 
255
+ # ── Parse tool call ──
256
  t_match = re.search(r"TOOL:\s*(\w+)", resp, re.I)
257
+ i_match = re.search(r"INPUT:\s*(.+)", resp, re.I | re.DOTALL)
258
+
259
  if t_match and i_match:
260
+ tool_name = t_match.group(1).lower().strip()
261
+ tool_input = i_match.group(1).strip().split('\n')[0] # first line only
262
+
263
+ result = self.execute_tool(tool_name, tool_input, file_url)
264
+ print(f" [{tool_name}] β†’ {result[:100]}")
265
+
266
+ # Only add useful results to memory (skip empty/error loops)
267
+ if len(result) > 30 and "error" not in result.lower()[:20]:
268
+ memory += f"\n\n[Step {step} - {tool_name}({tool_input[:50]})]\n{result[:1500]}"
269
+ else:
270
+ # Tool failed β€” tell the model so it tries something different
271
+ memory += f"\n\n[Step {step} - {tool_name} FAILED: {result[:200]}. Try a different approach.]"
272
  else:
273
+ # Model gave a thought without a tool call β€” add to memory as reasoning
274
+ memory += f"\n\n[Step {step} - Reasoning]: {resp[:300]}"
275
+
276
+ # Fallback: ask the model to give best answer from what it has
277
+ fallback = self.client.chat.completions.create(
278
+ model="gpt-4o",
279
+ temperature=0,
280
+ messages=[
281
+ {"role": "system", "content": system_prompt},
282
+ {"role": "user", "content": f"Based on everything gathered, give your best FINAL answer.\nQUESTION: {question}\nKNOWLEDGE:\n{memory}"}
283
+ ]
284
+ )
285
+ resp = fallback.choices[0].message.content.strip()
286
+ if "FINAL:" in resp:
287
+ return resp.split("FINAL:")[-1].strip()
288
+ return resp
289
 
290
  def run_and_submit_all( profile: gr.OAuthProfile | None):
291
  """