Raj989898 committed on
Commit
8497d3d
·
verified ·
1 Parent(s): 833c9ef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +107 -185
app.py CHANGED
@@ -9,13 +9,11 @@ import sys
9
 
10
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
 
12
- # --- File helpers ---
13
  def download_task_file(task_id: str):
14
  url = f"{DEFAULT_API_URL}/files/{task_id}"
15
  try:
16
  resp = requests.get(url, timeout=30)
17
  if resp.status_code != 200:
18
- print(f"No file for {task_id}: HTTP {resp.status_code}")
19
  return None, None
20
  cd = resp.headers.get("content-disposition", "")
21
  fname = "task_file"
@@ -25,13 +23,13 @@ def download_task_file(task_id: str):
25
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
26
  tmp.write(resp.content)
27
  tmp.close()
28
- print(f"Downloaded: {fname} ({len(resp.content)} bytes) -> {tmp.name}")
29
  return tmp.name, fname
30
  except Exception as e:
31
  print(f"File download error: {e}")
32
  return None, None
33
 
34
- def read_file_contents(local_path: str, fname: str) -> str:
35
  ext = os.path.splitext(fname)[-1].lower()
36
  try:
37
  if ext in (".xlsx", ".xls"):
@@ -48,50 +46,39 @@ def read_file_contents(local_path: str, fname: str) -> str:
48
  with open(local_path) as f:
49
  return f.read()
50
  except:
51
- return f"Binary file: {fname}"
52
  except Exception as e:
53
- return f"Error reading: {e}"
54
 
55
- def run_python_file(local_path: str) -> str:
56
  try:
57
- result = subprocess.run(
58
- [sys.executable, local_path],
59
- capture_output=True, text=True, timeout=15
60
- )
61
- output = (result.stdout + result.stderr).strip()
62
- print(f"Python output: '{output[:200]}'")
63
- return output if output else "No output."
64
- except subprocess.TimeoutExpired:
65
- return "Timed out."
66
  except Exception as e:
67
  return f"Error: {e}"
68
 
69
- def clean_answer(text: str) -> str:
70
  text = text.strip()
71
- for prefix in ["FINAL ANSWER:", "Final Answer:", "Answer:",
72
- "The answer is:", "The answer is",
73
- "**Answer:**", "**Final Answer:**"]:
74
- if text.lower().startswith(prefix.lower()):
75
- text = text[len(prefix):].strip()
76
  return text.split("\n")[0].strip().strip('"').strip("'").strip("*").strip()
77
 
78
- # --- Groq API ---
79
- def call_groq(api_key: str, prompt: str, system: str = "", max_tokens: int = 512) -> str:
80
  url = "https://api.groq.com/openai/v1/chat/completions"
81
  headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
82
- messages = []
83
- if system:
84
- messages.append({"role": "system", "content": system})
85
- messages.append({"role": "user", "content": prompt})
86
  body = {"model": "llama-3.3-70b-versatile", "messages": messages,
87
  "temperature": 0.0, "max_tokens": max_tokens}
88
  resp = requests.post(url, headers=headers, json=body, timeout=60)
89
  if resp.status_code != 200:
90
- raise Exception(f"Groq error {resp.status_code}: {resp.text[:200]}")
91
  return resp.json()["choices"][0]["message"]["content"].strip()
92
 
93
- # --- Web search ---
94
- def search_web(query: str, max_results: int = 6) -> str:
95
  try:
96
  from duckduckgo_search import DDGS
97
  with DDGS() as ddgs:
@@ -100,8 +87,7 @@ def search_web(query: str, max_results: int = 6) -> str:
100
  return "No results."
101
  return "\n\n".join(
102
  f"Title: {r.get('title','')}\nSnippet: {r.get('body','')}\nURL: {r.get('href','')}"
103
- for r in results
104
- )
105
  except Exception as e:
106
  return f"Search error: {e}"
107
 
@@ -110,44 +96,45 @@ def test_api():
110
  if not key:
111
  return "❌ GROQ_API_KEY not set!"
112
  try:
113
- ans = call_groq(key, "What is 2+2?", "Reply with only the number.")
114
  return f"✅ Groq working! Test: '{ans}'"
115
  except Exception as e:
116
  return f"❌ {e}"
117
 
118
- SYSTEM_PROMPT = """You are a GAIA benchmark agent. Exact match grading is used — precision is everything.
119
-
120
- RULES:
121
- 1. Reply with ONLY the final answer. No explanation, no prefix, no "The answer is".
122
- 2. Numbers: use digits unless words are asked. No $ or , in numbers unless format is asked.
123
- 3. Names: exact format as requested (first name only if asked for first name).
124
- 4. Lists: comma-separated, alphabetical if asked.
125
- 5. Think carefully — wrong format = wrong answer even if content is right.
126
- """
127
 
128
  class BasicAgent:
129
  def __init__(self):
130
- self.api_key = os.getenv("GROQ_API_KEY", "")
131
- if not self.api_key:
132
- raise RuntimeError("GROQ_API_KEY not set! Add it in Space Settings → Secrets.")
133
- print(f"Agent ready. Key: {self.api_key[:8]}...")
134
-
135
- def _multi_search(self, question: str) -> str:
136
- """Do up to 2 targeted searches for better results."""
137
- # First search: full question
138
- r1 = search_web(question[:200])
139
- # Second search: extract key entities for a more focused query
140
- try:
141
- focused = call_groq(
142
- self.api_key,
143
- f"Write a short 5-8 word web search query to find the answer to:\n{question}",
144
- "Reply with only the search query. No quotes.",
145
- max_tokens=30
146
- )
147
- r2 = search_web(focused)
148
- return r1 + "\n\n---\n\n" + r2
149
- except:
150
- return r1
 
 
 
 
 
 
 
151
 
152
  def __call__(self, question: str) -> str:
153
  task_id = ""
@@ -158,116 +145,61 @@ class BasicAgent:
158
 
159
  print(f"\n{'='*50}\nTask: {task_id}\nQ: {question[:200]}")
160
 
161
- file_context = ""
162
- is_python = False
163
- is_image = False
 
 
 
 
164
 
165
- # 1. Download file
166
  if task_id:
167
- local_path, fname = download_task_file(task_id)
168
- if local_path and fname:
169
- ext = os.path.splitext(fname)[-1].lower()
170
  if ext == ".py":
171
- is_python = True
172
- code = read_file_contents(local_path, fname)
173
- output = run_python_file(local_path)
174
- file_context = (
175
- f"\n\n[Python file: {fname}]\n"
176
- f"CODE:\n{code}\n\n"
177
- f"EXECUTION OUTPUT: {output}\n"
178
- f"[End]\n"
179
- )
180
  elif ext in (".xlsx", ".xls", ".csv"):
181
- contents = read_file_contents(local_path, fname)
182
- file_context = f"\n\n[Data file: {fname}]\n{contents[:6000]}\n[End]\n"
183
- elif ext in (".png", ".jpg", ".jpeg", ".gif"):
184
- is_image = True
185
- file_context = f"\n\n[Image file '{fname}' attached — use question context and your knowledge.]\n"
186
  else:
187
- contents = read_file_contents(local_path, fname)
188
- file_context = f"\n\n[File: {fname}]\n{contents[:4000]}\n[End]\n"
189
-
190
- # 2. Handle reversed text question
191
- q_for_search = question
192
- if "rewsna" in question or "dnatsrednu" in question:
193
- reversed_q = question[::-1]
194
- print(f"Reversed: {reversed_q}")
195
- q_for_search = reversed_q
196
- file_context += f"\n\n[Note: The question above is written in reverse. Reversed it reads: {reversed_q}]\n"
197
-
198
- # 3. Web search (skip if python file — we have the output)
199
- search_context = ""
200
- if not is_python:
201
- print("Searching...")
202
- results = self._multi_search(q_for_search)
203
- if results and "error" not in results.lower():
204
- search_context = f"\n\n[Web search results]\n{results[:4000]}\n[End search]\n"
205
-
206
- # 4. Build prompt with strong format guidance
207
- format_hint = self._get_format_hint(question)
208
-
209
- prompt = (
210
- f"Question: {q_for_search}"
211
- f"{file_context}"
212
- f"{search_context}"
213
- f"\n\n{format_hint}"
214
- "\nProvide ONLY the final answer. No explanation."
215
- )
216
 
217
  try:
218
- answer = call_groq(self.api_key, prompt, SYSTEM_PROMPT, max_tokens=128)
219
- print(f"Raw: '{answer}'")
220
-
221
- if len(answer.split()) > 30:
222
- answer = call_groq(
223
- self.api_key,
224
- f"Extract only the shortest final answer from:\n\n{answer}",
225
- "Reply with only the bare answer.",
226
- max_tokens=64
227
- )
228
-
229
- answer = clean_answer(answer)
230
  print(f"Final: '{answer}'")
231
  return answer
232
  except Exception as e:
233
  print(f"Error: {e}")
234
  return ""
235
 
236
- def _get_format_hint(self, question: str) -> str:
237
- q = question.lower()
238
- if "first name" in q:
239
- return "Format: Reply with first name only."
240
- if "surname" in q or "last name" in q:
241
- return "Format: Reply with surname/last name only."
242
- if "how many" in q:
243
- return "Format: Reply with a number only (digits, no words)."
244
- if "studio album" in q:
245
- return "Format: Reply with a number only. Count only STUDIO albums (not live, compilation, or collaborative)."
246
- if "country" in q and "olympic" in q:
247
- return "Format: Reply with country name only."
248
- if "excel" in q or "sales" in q or "total" in q:
249
- return "Format: Plain number only, no $ or commas (e.g. 12345.67 not $12,345.67)."
250
- if "chess" in q:
251
- return "Format: Chess move in standard notation (e.g. Qd8, e5, Nf3)."
252
- if "at bat" in q or "at-bat" in q:
253
- return "Format: Reply with a number only."
254
- if "video" in q and "youtube" in q:
255
- return "Format: Reply with the exact quote or short phrase only."
256
- if "wikipedia" in q and "nominat" in q:
257
- return "Format: Reply with the username only."
258
- if "pitcher" in q:
259
- return "Format: Two last names separated by comma (e.g. Smith, Jones), in jersey number order."
260
- if "grocery" in q or "shopping" in q or "ingredients" in q:
261
- return "Format: Comma-separated list, alphabetical order, all lowercase."
262
- return "Format: Reply with the shortest possible correct answer."
263
-
264
- # --- Submit ---
265
  def run_and_submit_all(profile: gr.OAuthProfile | None):
266
  space_id = os.getenv("SPACE_ID")
267
  if not profile:
268
- return "Please Login to Hugging Face with the button.", None
269
-
270
- username = f"{profile.username}"
271
  try:
272
  agent = BasicAgent()
273
  except RuntimeError as e:
@@ -276,15 +208,14 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
276
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
277
 
278
  try:
279
- response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
280
- response.raise_for_status()
281
- questions_data = response.json()
282
  print(f"Fetched {len(questions_data)} questions.")
283
  except Exception as e:
284
  return f"Error fetching questions: {e}", None
285
 
286
- results_log = []
287
- answers_payload = []
288
 
289
  for i, item in enumerate(questions_data):
290
  task_id = item.get("task_id")
@@ -293,44 +224,35 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
293
  continue
294
  print(f"\n[{i+1}/{len(questions_data)}]")
295
  try:
296
- submitted_answer = agent(f"[TASK_ID:{task_id}] {question_text}")
297
  except Exception as e:
298
- submitted_answer = ""
299
  print(f"Error: {e}")
300
-
301
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
302
  results_log.append({
303
  "Task ID": task_id,
304
  "Question": question_text[:100] + ("..." if len(question_text) > 100 else ""),
305
- "Submitted Answer": submitted_answer
306
  })
307
 
308
  if not answers_payload:
309
- return "No answers produced.", pd.DataFrame(results_log)
310
 
311
  try:
312
- response = requests.post(
313
- f"{DEFAULT_API_URL}/submit",
314
  json={"username": username.strip(), "agent_code": agent_code, "answers": answers_payload},
315
- timeout=60
316
- )
317
- response.raise_for_status()
318
- r = response.json()
319
- status = (
320
- f"Submission Successful!\n"
321
- f"User: {r.get('username')}\n"
322
- f"Overall Score: {r.get('score', 'N/A')}% "
323
- f"({r.get('correct_count', '?')}/{r.get('total_attempted', '?')} correct)\n"
324
- f"Message: {r.get('message', '')}"
325
- )
326
- return status, pd.DataFrame(results_log)
327
  except Exception as e:
328
  return f"Submission Failed: {e}", pd.DataFrame(results_log)
329
 
330
- # --- UI ---
331
  with gr.Blocks() as demo:
332
  gr.Markdown("# Basic Agent Evaluation Runner")
333
- gr.Markdown("**Setup:** Add `GROQ_API_KEY` in Space Settings → Secrets. Free key at [console.groq.com](https://console.groq.com)")
334
  gr.LoginButton()
335
  with gr.Row():
336
  test_btn = gr.Button("🔬 Test Groq API", variant="secondary")
 
9
 
10
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
 
 
12
  def download_task_file(task_id: str):
13
  url = f"{DEFAULT_API_URL}/files/{task_id}"
14
  try:
15
  resp = requests.get(url, timeout=30)
16
  if resp.status_code != 200:
 
17
  return None, None
18
  cd = resp.headers.get("content-disposition", "")
19
  fname = "task_file"
 
23
  tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext)
24
  tmp.write(resp.content)
25
  tmp.close()
26
+ print(f"Downloaded: {fname} ({len(resp.content)} bytes)")
27
  return tmp.name, fname
28
  except Exception as e:
29
  print(f"File download error: {e}")
30
  return None, None
31
 
32
+ def read_file_contents(local_path, fname):
33
  ext = os.path.splitext(fname)[-1].lower()
34
  try:
35
  if ext in (".xlsx", ".xls"):
 
46
  with open(local_path) as f:
47
  return f.read()
48
  except:
49
+ return f"Binary: {fname}"
50
  except Exception as e:
51
+ return f"Error: {e}"
52
 
53
+ def run_python_file(local_path):
54
  try:
55
+ result = subprocess.run([sys.executable, local_path],
56
+ capture_output=True, text=True, timeout=15)
57
+ out = (result.stdout + result.stderr).strip()
58
+ print(f"Python output: '{out[:200]}'")
59
+ return out or "No output."
 
 
 
 
60
  except Exception as e:
61
  return f"Error: {e}"
62
 
63
+ def clean_answer(text):
64
  text = text.strip()
65
+ for p in ["FINAL ANSWER:", "Final Answer:", "Answer:", "The answer is:", "The answer is",
66
+ "**Answer:**", "**Final Answer:**"]:
67
+ if text.lower().startswith(p.lower()):
68
+ text = text[len(p):].strip()
 
69
  return text.split("\n")[0].strip().strip('"').strip("'").strip("*").strip()
70
 
71
+ def call_groq(api_key, messages, max_tokens=512):
 
72
  url = "https://api.groq.com/openai/v1/chat/completions"
73
  headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
 
 
 
 
74
  body = {"model": "llama-3.3-70b-versatile", "messages": messages,
75
  "temperature": 0.0, "max_tokens": max_tokens}
76
  resp = requests.post(url, headers=headers, json=body, timeout=60)
77
  if resp.status_code != 200:
78
+ raise Exception(f"Groq {resp.status_code}: {resp.text[:200]}")
79
  return resp.json()["choices"][0]["message"]["content"].strip()
80
 
81
+ def search_web(query, max_results=6):
 
82
  try:
83
  from duckduckgo_search import DDGS
84
  with DDGS() as ddgs:
 
87
  return "No results."
88
  return "\n\n".join(
89
  f"Title: {r.get('title','')}\nSnippet: {r.get('body','')}\nURL: {r.get('href','')}"
90
+ for r in results)
 
91
  except Exception as e:
92
  return f"Search error: {e}"
93
 
 
96
  if not key:
97
  return "❌ GROQ_API_KEY not set!"
98
  try:
99
+ ans = call_groq(key, [{"role":"user","content":"What is 2+2?"}], max_tokens=10)
100
  return f"✅ Groq working! Test: '{ans}'"
101
  except Exception as e:
102
  return f"❌ {e}"
103
 
104
+ SYSTEM = """You are a GAIA benchmark agent. Exact match grading — precision is critical.
105
+ Reply with ONLY the final answer. No explanation. No prefix. No "The answer is".
106
+ Just the bare answer: a name, number, word, or short phrase."""
 
 
 
 
 
 
107
 
108
  class BasicAgent:
109
  def __init__(self):
110
+ self.key = os.getenv("GROQ_API_KEY", "")
111
+ if not self.key:
112
+ raise RuntimeError("GROQ_API_KEY not set! Add in Space Settings → Secrets.")
113
+ print(f"Agent ready. Key: {self.key[:8]}...")
114
+
115
+ def ask(self, user_msg, max_tokens=256):
116
+ return call_groq(self.key, [
117
+ {"role": "system", "content": SYSTEM},
118
+ {"role": "user", "content": user_msg}
119
+ ], max_tokens)
120
+
121
+ def think_then_answer(self, question, context=""):
122
+ """Two-step: reason first, then extract bare answer."""
123
+ # Step 1: reason
124
+ reasoning = call_groq(self.key, [
125
+ {"role": "system", "content": "You are a careful researcher. Think step by step to find the correct answer. Show your reasoning."},
126
+ {"role": "user", "content": f"Question: {question}\n\n{context}\n\nThink carefully and find the answer."}
127
+ ], max_tokens=1024)
128
+ print(f"Reasoning: {reasoning[:300]}...")
129
+
130
+ # Step 2: extract
131
+ answer = call_groq(self.key, [
132
+ {"role": "system", "content": SYSTEM},
133
+ {"role": "user", "content":
134
+ f"Question: {question}\n\nReasoning and research:\n{reasoning}\n\n"
135
+ f"Based on the above reasoning, give ONLY the final bare answer. Nothing else."}
136
+ ], max_tokens=64)
137
+ return clean_answer(answer)
138
 
139
  def __call__(self, question: str) -> str:
140
  task_id = ""
 
145
 
146
  print(f"\n{'='*50}\nTask: {task_id}\nQ: {question[:200]}")
147
 
148
+ # Handle reversed text
149
+ if "rewsna" in question or "dnatsrednu" in question:
150
+ question = question[::-1]
151
+ print(f"Reversed: {question}")
152
+
153
+ file_ctx = ""
154
+ is_py = False
155
 
156
+ # Download file
157
  if task_id:
158
+ lp, fn = download_task_file(task_id)
159
+ if lp and fn:
160
+ ext = os.path.splitext(fn)[-1].lower()
161
  if ext == ".py":
162
+ is_py = True
163
+ code = read_file_contents(lp, fn)
164
+ out = run_python_file(lp)
165
+ file_ctx = f"\n[Python file: {fn}]\nCODE:\n{code}\n\nEXECUTION OUTPUT:\n{out}\n"
 
 
 
 
 
166
  elif ext in (".xlsx", ".xls", ".csv"):
167
+ contents = read_file_contents(lp, fn)
168
+ file_ctx = f"\n[Data file: {fn}]\n{contents[:6000]}\n"
169
+ elif ext in (".png", ".jpg", ".jpeg"):
170
+ file_ctx = f"\n[Image attached: {fn} — use your knowledge based on the question.]\n"
 
171
  else:
172
+ contents = read_file_contents(lp, fn)
173
+ file_ctx = f"\n[File: {fn}]\n{contents[:4000]}\n"
174
+
175
+ # Web search (multiple targeted queries)
176
+ search_ctx = ""
177
+ if not is_py:
178
+ # Ask the model for a good search query
179
+ sq = call_groq(self.key, [
180
+ {"role": "user", "content":
181
+ f"Write a precise 5-8 word web search query to find: {question}\nReply with ONLY the search query."}
182
+ ], max_tokens=30).strip().strip('"')
183
+ print(f"Search query: {sq}")
184
+ r1 = search_web(question[:200])
185
+ r2 = search_web(sq)
186
+ search_ctx = f"\n[Search 1]\n{r1[:2500]}\n\n[Search 2]\n{r2[:2500]}\n"
187
+
188
+ context = file_ctx + search_ctx
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  try:
191
+ answer = self.think_then_answer(question, context)
 
 
 
 
 
 
 
 
 
 
 
192
  print(f"Final: '{answer}'")
193
  return answer
194
  except Exception as e:
195
  print(f"Error: {e}")
196
  return ""
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  def run_and_submit_all(profile: gr.OAuthProfile | None):
199
  space_id = os.getenv("SPACE_ID")
200
  if not profile:
201
+ return "Please Login to Hugging Face.", None
202
+ username = profile.username
 
203
  try:
204
  agent = BasicAgent()
205
  except RuntimeError as e:
 
208
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
209
 
210
  try:
211
+ resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
212
+ resp.raise_for_status()
213
+ questions_data = resp.json()
214
  print(f"Fetched {len(questions_data)} questions.")
215
  except Exception as e:
216
  return f"Error fetching questions: {e}", None
217
 
218
+ results_log, answers_payload = [], []
 
219
 
220
  for i, item in enumerate(questions_data):
221
  task_id = item.get("task_id")
 
224
  continue
225
  print(f"\n[{i+1}/{len(questions_data)}]")
226
  try:
227
+ ans = agent(f"[TASK_ID:{task_id}] {question_text}")
228
  except Exception as e:
229
+ ans = ""
230
  print(f"Error: {e}")
231
+ answers_payload.append({"task_id": task_id, "submitted_answer": ans})
 
232
  results_log.append({
233
  "Task ID": task_id,
234
  "Question": question_text[:100] + ("..." if len(question_text) > 100 else ""),
235
+ "Submitted Answer": ans
236
  })
237
 
238
  if not answers_payload:
239
+ return "No answers.", pd.DataFrame(results_log)
240
 
241
  try:
242
+ resp = requests.post(f"{DEFAULT_API_URL}/submit",
 
243
  json={"username": username.strip(), "agent_code": agent_code, "answers": answers_payload},
244
+ timeout=60)
245
+ resp.raise_for_status()
246
+ r = resp.json()
247
+ return (f"Submission Successful!\nUser: {r.get('username')}\n"
248
+ f"Score: {r.get('score')}% ({r.get('correct_count')}/{r.get('total_attempted')} correct)\n"
249
+ f"Message: {r.get('message')}"), pd.DataFrame(results_log)
 
 
 
 
 
 
250
  except Exception as e:
251
  return f"Submission Failed: {e}", pd.DataFrame(results_log)
252
 
 
253
  with gr.Blocks() as demo:
254
  gr.Markdown("# Basic Agent Evaluation Runner")
255
+ gr.Markdown("**Setup:** `GROQ_API_KEY` in Space Settings → Secrets. Free at [console.groq.com](https://console.groq.com)")
256
  gr.LoginButton()
257
  with gr.Row():
258
  test_btn = gr.Button("🔬 Test Groq API", variant="secondary")