Raj989898 committed on
Commit
833c9ef
·
verified ·
1 Parent(s): f04e43e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +120 -102
app.py CHANGED
@@ -11,12 +11,11 @@ DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
 
12
  # --- File helpers ---
13
  def download_task_file(task_id: str):
14
- """Returns (local_path, filename) or (None, None)."""
15
  url = f"{DEFAULT_API_URL}/files/{task_id}"
16
  try:
17
  resp = requests.get(url, timeout=30)
18
  if resp.status_code != 200:
19
- print(f"No file for task {task_id}: HTTP {resp.status_code}")
20
  return None, None
21
  cd = resp.headers.get("content-disposition", "")
22
  fname = "task_file"
@@ -26,7 +25,7 @@ def download_task_file(task_id: str):
26
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
27
  tmp.write(resp.content)
28
  tmp.close()
29
- print(f"Downloaded file: {fname} -> {tmp.name} ({len(resp.content)} bytes)")
30
  return tmp.name, fname
31
  except Exception as e:
32
  print(f"File download error: {e}")
@@ -44,17 +43,14 @@ def read_file_contents(local_path: str, fname: str) -> str:
44
  elif ext in (".py", ".txt", ".md", ".json"):
45
  with open(local_path) as f:
46
  return f.read()
47
- elif ext in (".png", ".jpg", ".jpeg", ".gif", ".webp"):
48
- return f"[IMAGE FILE: {fname}] - This is an image that needs visual analysis."
49
  else:
50
- # Try reading as text anyway
51
  try:
52
  with open(local_path) as f:
53
  return f.read()
54
  except:
55
- return f"Binary file: {fname} ({ext})"
56
  except Exception as e:
57
- return f"Error reading file: {e}"
58
 
59
  def run_python_file(local_path: str) -> str:
60
  try:
@@ -62,13 +58,13 @@ def run_python_file(local_path: str) -> str:
62
  [sys.executable, local_path],
63
  capture_output=True, text=True, timeout=15
64
  )
65
- output = result.stdout + result.stderr
66
- print(f"Python output: {output[:300]}")
67
- return output.strip() if output.strip() else "No output produced."
68
  except subprocess.TimeoutExpired:
69
- return "Code execution timed out."
70
  except Exception as e:
71
- return f"Execution error: {e}"
72
 
73
  def clean_answer(text: str) -> str:
74
  text = text.strip()
@@ -90,10 +86,8 @@ def call_groq(api_key: str, prompt: str, system: str = "", max_tokens: int = 512
90
  body = {"model": "llama-3.3-70b-versatile", "messages": messages,
91
  "temperature": 0.0, "max_tokens": max_tokens}
92
  resp = requests.post(url, headers=headers, json=body, timeout=60)
93
- print(f"Groq status: {resp.status_code}")
94
  if resp.status_code != 200:
95
- print(f"Groq error: {resp.text[:400]}")
96
- raise Exception(f"Groq API error {resp.status_code}: {resp.text[:200]}")
97
  return resp.json()["choices"][0]["message"]["content"].strip()
98
 
99
  # --- Web search ---
@@ -103,7 +97,7 @@ def search_web(query: str, max_results: int = 6) -> str:
103
  with DDGS() as ddgs:
104
  results = list(ddgs.text(query, max_results=max_results))
105
  if not results:
106
- return "No results found."
107
  return "\n\n".join(
108
  f"Title: {r.get('title','')}\nSnippet: {r.get('body','')}\nURL: {r.get('href','')}"
109
  for r in results
@@ -112,42 +106,50 @@ def search_web(query: str, max_results: int = 6) -> str:
112
  return f"Search error: {e}"
113
 
114
  def test_api():
115
- api_key = os.getenv("GROQ_API_KEY", "")
116
- if not api_key:
117
- return "❌ GROQ_API_KEY is NOT set in Space Secrets!"
118
  try:
119
- answer = call_groq(api_key, "What is 2+2? Reply with just the number.", "Reply with only the bare answer.")
120
- return f"✅ Groq API working! Test answer: '{answer}'"
121
  except Exception as e:
122
- return f"❌ Groq failed: {e}"
123
-
124
- # --- System prompt ---
125
- SYSTEM_PROMPT = """You are an expert AI agent solving GAIA benchmark questions. Exact match grading is used.
126
-
127
- CRITICAL RULES:
128
- 1. Reply with ONLY the final answer — no explanation, no preamble, no prefix like "The answer is"
129
- 2. Be as concise as possible: just the name, number, word, or short phrase
130
- 3. For numbers: use digits (e.g. "42") unless words are specifically requested
131
- 4. For currency: strip $ signs and commas unless format is specifically asked for (e.g. "1234.56" not "$1,234.56")
132
- 5. For lists: use comma-separated values with no extra words
133
- 6. For names: give full name in the exact format requested (first name only if asked for first name)
134
- 7. Think carefully — precision matters for exact matching
135
  """
136
 
137
  class BasicAgent:
138
  def __init__(self):
139
  self.api_key = os.getenv("GROQ_API_KEY", "")
140
  if not self.api_key:
141
- raise RuntimeError(
142
- "GROQ_API_KEY not set!\n"
143
- "1. Go to https://console.groq.com → free account → API Keys → Create key\n"
144
- "2. Space Settings → Variables and Secrets → New Secret\n"
145
- " Name: GROQ_API_KEY Value: your key"
 
 
 
 
 
 
 
 
 
146
  )
147
- print(f"BasicAgent ready. Key: {self.api_key[:8]}...")
 
 
 
148
 
149
  def __call__(self, question: str) -> str:
150
- # Extract injected task_id
151
  task_id = ""
152
  if question.startswith("[TASK_ID:"):
153
  end = question.index("]")
@@ -157,85 +159,108 @@ class BasicAgent:
157
  print(f"\n{'='*50}\nTask: {task_id}\nQ: {question[:200]}")
158
 
159
  file_context = ""
160
- code_output = ""
161
- local_path = None
162
- fname = None
163
 
164
- # 1. Always try to download file for every task
165
  if task_id:
166
  local_path, fname = download_task_file(task_id)
167
  if local_path and fname:
168
  ext = os.path.splitext(fname)[-1].lower()
169
-
170
  if ext == ".py":
171
- # Run Python code and capture output
172
- code_output_text = run_python_file(local_path)
173
- file_contents = read_file_contents(local_path, fname)
174
  file_context = (
175
  f"\n\n[Python file: {fname}]\n"
176
- f"CODE:\n{file_contents}\n\n"
177
- f"EXECUTION OUTPUT:\n{code_output_text}\n"
178
- f"[End of file]\n"
179
  )
180
  elif ext in (".xlsx", ".xls", ".csv"):
181
  contents = read_file_contents(local_path, fname)
182
- file_context = f"\n\n[Data file: {fname}]\n{contents[:5000]}\n[End of file]\n"
183
- elif ext in (".png", ".jpg", ".jpeg"):
184
- file_context = f"\n\n[Note: An image file '{fname}' is attached but cannot be displayed in text. Use your knowledge to answer based on the question context.]\n"
 
185
  else:
186
  contents = read_file_contents(local_path, fname)
187
- file_context = f"\n\n[Attached file: {fname}]\n{contents[:4000]}\n[End of file]\n"
188
 
189
- # 2. Web search — always search unless we have a code execution result
190
- search_context = ""
191
- has_code_answer = local_path and fname and os.path.splitext(fname)[-1].lower() == ".py"
192
-
193
- if not has_code_answer:
194
- # Build a focused search query
195
- search_query = question[:200]
196
- print(f"Searching: {search_query[:80]}...")
197
- results = search_web(search_query)
198
- if results and "error" not in results.lower() and "No results" not in results:
199
- search_context = f"\n\n[Web search results]\n{results[:3000]}\n[End search]\n"
200
-
201
- # 3. Special handling for reversed text question
202
  if "rewsna" in question or "dnatsrednu" in question:
203
- # This is a reversed text question — reverse it first
204
  reversed_q = question[::-1]
205
- print(f"Reversed question: {reversed_q}")
206
- question = reversed_q
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- # 4. Build prompt
209
  prompt = (
210
- f"Question: {question}"
211
  f"{file_context}"
212
  f"{search_context}"
213
- "\n\nProvide ONLY the final answer. No explanation. No prefix."
 
214
  )
215
 
216
  try:
217
- answer = call_groq(self.api_key, prompt, SYSTEM_PROMPT, max_tokens=256)
218
- print(f"Raw answer: '{answer}'")
219
 
220
- # If too verbose, extract key part
221
- if len(answer.split()) > 25:
222
  answer = call_groq(
223
  self.api_key,
224
- f"From this response, extract ONLY the shortest final answer "
225
- f"(name, number, or brief phrase). Nothing else:\n\n{answer}",
226
- "Reply with only the bare answer. No explanation.",
227
  max_tokens=64
228
  )
229
- print(f"Extracted: '{answer}'")
230
 
231
  answer = clean_answer(answer)
232
  print(f"Final: '{answer}'")
233
  return answer
234
-
235
  except Exception as e:
236
- print(f"Agent error: {e}\n{traceback.format_exc()}")
237
  return ""
238
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
  # --- Submit ---
240
  def run_and_submit_all(profile: gr.OAuthProfile | None):
241
  space_id = os.getenv("SPACE_ID")
@@ -243,8 +268,6 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
243
  return "Please Login to Hugging Face with the button.", None
244
 
245
  username = f"{profile.username}"
246
- print(f"User: {username}")
247
-
248
  try:
249
  agent = BasicAgent()
250
  except RuntimeError as e:
@@ -283,7 +306,7 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
283
  })
284
 
285
  if not answers_payload:
286
- return "Agent did not produce any answers.", pd.DataFrame(results_log)
287
 
288
  try:
289
  response = requests.post(
@@ -292,32 +315,27 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
292
  timeout=60
293
  )
294
  response.raise_for_status()
295
- result_data = response.json()
296
- final_status = (
297
  f"Submission Successful!\n"
298
- f"User: {result_data.get('username')}\n"
299
- f"Overall Score: {result_data.get('score', 'N/A')}% "
300
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
301
- f"Message: {result_data.get('message', '')}"
302
  )
303
- return final_status, pd.DataFrame(results_log)
304
  except Exception as e:
305
  return f"Submission Failed: {e}", pd.DataFrame(results_log)
306
 
307
  # --- UI ---
308
  with gr.Blocks() as demo:
309
  gr.Markdown("# Basic Agent Evaluation Runner")
310
- gr.Markdown("""
311
- **Setup:** Add `GROQ_API_KEY` in Space Settings → Variables and Secrets → New Secret.
312
- Free key at [console.groq.com](https://console.groq.com)
313
- """)
314
  gr.LoginButton()
315
-
316
  with gr.Row():
317
  test_btn = gr.Button("🔬 Test Groq API", variant="secondary")
318
  test_out = gr.Textbox(label="Test Result", lines=2, interactive=False)
319
  test_btn.click(fn=test_api, outputs=test_out)
320
-
321
  gr.Markdown("---")
322
  run_button = gr.Button("🚀 Run Evaluation & Submit All Answers", variant="primary")
323
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
 
11
 
12
  # --- File helpers ---
13
  def download_task_file(task_id: str):
 
14
  url = f"{DEFAULT_API_URL}/files/{task_id}"
15
  try:
16
  resp = requests.get(url, timeout=30)
17
  if resp.status_code != 200:
18
+ print(f"No file for {task_id}: HTTP {resp.status_code}")
19
  return None, None
20
  cd = resp.headers.get("content-disposition", "")
21
  fname = "task_file"
 
25
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
26
  tmp.write(resp.content)
27
  tmp.close()
28
+ print(f"Downloaded: {fname} ({len(resp.content)} bytes) -> {tmp.name}")
29
  return tmp.name, fname
30
  except Exception as e:
31
  print(f"File download error: {e}")
 
43
  elif ext in (".py", ".txt", ".md", ".json"):
44
  with open(local_path) as f:
45
  return f.read()
 
 
46
  else:
 
47
  try:
48
  with open(local_path) as f:
49
  return f.read()
50
  except:
51
+ return f"Binary file: {fname}"
52
  except Exception as e:
53
+ return f"Error reading: {e}"
54
 
55
  def run_python_file(local_path: str) -> str:
56
  try:
 
58
  [sys.executable, local_path],
59
  capture_output=True, text=True, timeout=15
60
  )
61
+ output = (result.stdout + result.stderr).strip()
62
+ print(f"Python output: '{output[:200]}'")
63
+ return output if output else "No output."
64
  except subprocess.TimeoutExpired:
65
+ return "Timed out."
66
  except Exception as e:
67
+ return f"Error: {e}"
68
 
69
  def clean_answer(text: str) -> str:
70
  text = text.strip()
 
86
  body = {"model": "llama-3.3-70b-versatile", "messages": messages,
87
  "temperature": 0.0, "max_tokens": max_tokens}
88
  resp = requests.post(url, headers=headers, json=body, timeout=60)
 
89
  if resp.status_code != 200:
90
+ raise Exception(f"Groq error {resp.status_code}: {resp.text[:200]}")
 
91
  return resp.json()["choices"][0]["message"]["content"].strip()
92
 
93
  # --- Web search ---
 
97
  with DDGS() as ddgs:
98
  results = list(ddgs.text(query, max_results=max_results))
99
  if not results:
100
+ return "No results."
101
  return "\n\n".join(
102
  f"Title: {r.get('title','')}\nSnippet: {r.get('body','')}\nURL: {r.get('href','')}"
103
  for r in results
 
106
  return f"Search error: {e}"
107
 
108
  def test_api():
109
+ key = os.getenv("GROQ_API_KEY", "")
110
+ if not key:
111
+ return "❌ GROQ_API_KEY not set!"
112
  try:
113
+ ans = call_groq(key, "What is 2+2?", "Reply with only the number.")
114
+ return f"✅ Groq working! Test: '{ans}'"
115
  except Exception as e:
116
+ return f"❌ {e}"
117
+
118
+ SYSTEM_PROMPT = """You are a GAIA benchmark agent. Exact match grading is used — precision is everything.
119
+
120
+ RULES:
121
+ 1. Reply with ONLY the final answer. No explanation, no prefix, no "The answer is".
122
+ 2. Numbers: use digits unless words are asked. No $ or , in numbers unless format is asked.
123
+ 3. Names: exact format as requested (first name only if asked for first name).
124
+ 4. Lists: comma-separated, alphabetical if asked.
125
+ 5. Think carefully — wrong format = wrong answer even if content is right.
 
 
 
126
  """
127
 
128
  class BasicAgent:
129
  def __init__(self):
130
  self.api_key = os.getenv("GROQ_API_KEY", "")
131
  if not self.api_key:
132
+ raise RuntimeError("GROQ_API_KEY not set! Add it in Space Settings → Secrets.")
133
+ print(f"Agent ready. Key: {self.api_key[:8]}...")
134
+
135
+ def _multi_search(self, question: str) -> str:
136
+ """Do up to 2 targeted searches for better results."""
137
+ # First search: full question
138
+ r1 = search_web(question[:200])
139
+ # Second search: extract key entities for a more focused query
140
+ try:
141
+ focused = call_groq(
142
+ self.api_key,
143
+ f"Write a short 5-8 word web search query to find the answer to:\n{question}",
144
+ "Reply with only the search query. No quotes.",
145
+ max_tokens=30
146
  )
147
+ r2 = search_web(focused)
148
+ return r1 + "\n\n---\n\n" + r2
149
+ except:
150
+ return r1
151
 
152
  def __call__(self, question: str) -> str:
 
153
  task_id = ""
154
  if question.startswith("[TASK_ID:"):
155
  end = question.index("]")
 
159
  print(f"\n{'='*50}\nTask: {task_id}\nQ: {question[:200]}")
160
 
161
  file_context = ""
162
+ is_python = False
163
+ is_image = False
 
164
 
165
+ # 1. Download file
166
  if task_id:
167
  local_path, fname = download_task_file(task_id)
168
  if local_path and fname:
169
  ext = os.path.splitext(fname)[-1].lower()
 
170
  if ext == ".py":
171
+ is_python = True
172
+ code = read_file_contents(local_path, fname)
173
+ output = run_python_file(local_path)
174
  file_context = (
175
  f"\n\n[Python file: {fname}]\n"
176
+ f"CODE:\n{code}\n\n"
177
+ f"EXECUTION OUTPUT: {output}\n"
178
+ f"[End]\n"
179
  )
180
  elif ext in (".xlsx", ".xls", ".csv"):
181
  contents = read_file_contents(local_path, fname)
182
+ file_context = f"\n\n[Data file: {fname}]\n{contents[:6000]}\n[End]\n"
183
+ elif ext in (".png", ".jpg", ".jpeg", ".gif"):
184
+ is_image = True
185
+ file_context = f"\n\n[Image file '{fname}' attached — use question context and your knowledge.]\n"
186
  else:
187
  contents = read_file_contents(local_path, fname)
188
+ file_context = f"\n\n[File: {fname}]\n{contents[:4000]}\n[End]\n"
189
 
190
+ # 2. Handle reversed text question
191
+ q_for_search = question
 
 
 
 
 
 
 
 
 
 
 
192
  if "rewsna" in question or "dnatsrednu" in question:
 
193
  reversed_q = question[::-1]
194
+ print(f"Reversed: {reversed_q}")
195
+ q_for_search = reversed_q
196
+ file_context += f"\n\n[Note: The question above is written in reverse. Reversed it reads: {reversed_q}]\n"
197
+
198
+ # 3. Web search (skip if python file — we have the output)
199
+ search_context = ""
200
+ if not is_python:
201
+ print("Searching...")
202
+ results = self._multi_search(q_for_search)
203
+ if results and "error" not in results.lower():
204
+ search_context = f"\n\n[Web search results]\n{results[:4000]}\n[End search]\n"
205
+
206
+ # 4. Build prompt with strong format guidance
207
+ format_hint = self._get_format_hint(question)
208
 
 
209
  prompt = (
210
+ f"Question: {q_for_search}"
211
  f"{file_context}"
212
  f"{search_context}"
213
+ f"\n\n{format_hint}"
214
+ "\nProvide ONLY the final answer. No explanation."
215
  )
216
 
217
  try:
218
+ answer = call_groq(self.api_key, prompt, SYSTEM_PROMPT, max_tokens=128)
219
+ print(f"Raw: '{answer}'")
220
 
221
+ if len(answer.split()) > 30:
 
222
  answer = call_groq(
223
  self.api_key,
224
+ f"Extract only the shortest final answer from:\n\n{answer}",
225
+ "Reply with only the bare answer.",
 
226
  max_tokens=64
227
  )
 
228
 
229
  answer = clean_answer(answer)
230
  print(f"Final: '{answer}'")
231
  return answer
 
232
  except Exception as e:
233
+ print(f"Error: {e}")
234
  return ""
235
 
236
+ def _get_format_hint(self, question: str) -> str:
237
+ q = question.lower()
238
+ if "first name" in q:
239
+ return "Format: Reply with first name only."
240
+ if "surname" in q or "last name" in q:
241
+ return "Format: Reply with surname/last name only."
242
+ if "how many" in q:
243
+ return "Format: Reply with a number only (digits, no words)."
244
+ if "studio album" in q:
245
+ return "Format: Reply with a number only. Count only STUDIO albums (not live, compilation, or collaborative)."
246
+ if "country" in q and "olympic" in q:
247
+ return "Format: Reply with country name only."
248
+ if "excel" in q or "sales" in q or "total" in q:
249
+ return "Format: Plain number only, no $ or commas (e.g. 12345.67 not $12,345.67)."
250
+ if "chess" in q:
251
+ return "Format: Chess move in standard notation (e.g. Qd8, e5, Nf3)."
252
+ if "at bat" in q or "at-bat" in q:
253
+ return "Format: Reply with a number only."
254
+ if "video" in q and "youtube" in q:
255
+ return "Format: Reply with the exact quote or short phrase only."
256
+ if "wikipedia" in q and "nominat" in q:
257
+ return "Format: Reply with the username only."
258
+ if "pitcher" in q:
259
+ return "Format: Two last names separated by comma (e.g. Smith, Jones), in jersey number order."
260
+ if "grocery" in q or "shopping" in q or "ingredients" in q:
261
+ return "Format: Comma-separated list, alphabetical order, all lowercase."
262
+ return "Format: Reply with the shortest possible correct answer."
263
+
264
  # --- Submit ---
265
  def run_and_submit_all(profile: gr.OAuthProfile | None):
266
  space_id = os.getenv("SPACE_ID")
 
268
  return "Please Login to Hugging Face with the button.", None
269
 
270
  username = f"{profile.username}"
 
 
271
  try:
272
  agent = BasicAgent()
273
  except RuntimeError as e:
 
306
  })
307
 
308
  if not answers_payload:
309
+ return "No answers produced.", pd.DataFrame(results_log)
310
 
311
  try:
312
  response = requests.post(
 
315
  timeout=60
316
  )
317
  response.raise_for_status()
318
+ r = response.json()
319
+ status = (
320
  f"Submission Successful!\n"
321
+ f"User: {r.get('username')}\n"
322
+ f"Overall Score: {r.get('score', 'N/A')}% "
323
+ f"({r.get('correct_count', '?')}/{r.get('total_attempted', '?')} correct)\n"
324
+ f"Message: {r.get('message', '')}"
325
  )
326
+ return status, pd.DataFrame(results_log)
327
  except Exception as e:
328
  return f"Submission Failed: {e}", pd.DataFrame(results_log)
329
 
330
  # --- UI ---
331
  with gr.Blocks() as demo:
332
  gr.Markdown("# Basic Agent Evaluation Runner")
333
+ gr.Markdown("**Setup:** Add `GROQ_API_KEY` in Space Settings → Secrets. Free key at [console.groq.com](https://console.groq.com)")
 
 
 
334
  gr.LoginButton()
 
335
  with gr.Row():
336
  test_btn = gr.Button("🔬 Test Groq API", variant="secondary")
337
  test_out = gr.Textbox(label="Test Result", lines=2, interactive=False)
338
  test_btn.click(fn=test_api, outputs=test_out)
 
339
  gr.Markdown("---")
340
  run_button = gr.Button("🚀 Run Evaluation & Submit All Answers", variant="primary")
341
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)