Raj989898 committed on
Commit
1758136
·
verified ·
1 Parent(s): 4790a7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +200 -280
app.py CHANGED
@@ -1,309 +1,229 @@
1
  import os
2
  import time
3
- import gradio as gr
4
  import requests
5
  import pandas as pd
6
- import tempfile
7
- import subprocess
8
- import sys
9
 
10
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
 
12
- # Track API calls for rate limiting
13
- _last_call_time = 0
 
 
 
 
 
 
 
 
14
 
15
- def rate_limited_groq(api_key, prompt, system="", max_tokens=128):
16
- """Call Groq with rate limiting — max 25 req/min to stay safe."""
17
- global _last_call_time
18
- # Ensure at least 2.5 seconds between calls (= 24/min, safely under 30 limit)
19
- elapsed = time.time() - _last_call_time
20
- if elapsed < 2.5:
21
- time.sleep(2.5 - elapsed)
22
- _last_call_time = time.time()
23
 
24
  url = "https://api.groq.com/openai/v1/chat/completions"
25
- headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
26
- msgs = []
27
- if system:
28
- msgs.append({"role": "system", "content": system})
29
- msgs.append({"role": "user", "content": prompt})
30
- body = {"model": "llama-3.3-70b-versatile", "messages": msgs,
31
- "temperature": 0.0, "max_tokens": max_tokens}
32
- resp = requests.post(url, headers=headers, json=body, timeout=60)
33
- if resp.status_code == 429:
34
- print("Rate limited! Waiting 60s...")
35
- time.sleep(60)
36
- resp = requests.post(url, headers=headers, json=body, timeout=60)
37
- if resp.status_code != 200:
38
- raise Exception(f"Groq {resp.status_code}: {resp.text[:200]}")
39
- return resp.json()["choices"][0]["message"]["content"].strip()
40
-
41
- def download_task_file(task_id):
42
- url = f"{DEFAULT_API_URL}/files/{task_id}"
43
- try:
44
- resp = requests.get(url, timeout=30)
45
- print(f" File request: HTTP {resp.status_code}, size={len(resp.content)}, "
46
- f"content-type={resp.headers.get('content-type','?')}")
47
- if resp.status_code != 200 or len(resp.content) == 0:
48
- return None, None
49
- cd = resp.headers.get("content-disposition", "")
50
- ct = resp.headers.get("content-type", "")
51
- fname = "task_file"
52
- if "filename=" in cd:
53
- fname = cd.split("filename=")[-1].strip().strip('"').strip("'")
54
- ext = os.path.splitext(fname)[-1]
55
- if not ext:
56
- if "python" in ct: ext = ".py"
57
- elif "excel" in ct or "spreadsheet" in ct: ext = ".xlsx"
58
- elif "csv" in ct: ext = ".csv"
59
- elif "image" in ct: ext = ".png"
60
- else: ext = ".bin"
61
- fname += ext
62
- tmp = tempfile.NamedTemporaryFile(delete=False, suffix=ext, prefix="gaia_")
63
- tmp.write(resp.content)
64
- tmp.close()
65
- print(f" Saved: {fname} -> {tmp.name}")
66
- return tmp.name, fname
67
- except Exception as e:
68
- print(f" Download error: {e}")
69
- return None, None
70
-
71
- def read_file_contents(local_path, fname):
72
- ext = os.path.splitext(fname)[-1].lower()
73
- try:
74
- if ext in (".xlsx", ".xls"):
75
- df = pd.read_excel(local_path)
76
- return f"Excel shape={df.shape}\nColumns={list(df.columns)}\n\n{df.to_string()}"
77
- elif ext == ".csv":
78
- df = pd.read_csv(local_path)
79
- return f"CSV shape={df.shape}\nColumns={list(df.columns)}\n\n{df.to_string()}"
80
- elif ext in (".py", ".txt", ".md", ".json"):
81
- with open(local_path, "r", errors="replace") as f:
82
- return f.read()
83
- else:
84
- try:
85
- with open(local_path, "r", errors="replace") as f:
86
- c = f.read()
87
- if c.strip(): return c
88
- except: pass
89
- return f"Binary: {fname}"
90
- except Exception as e:
91
- return f"Error: {e}"
92
-
93
- def run_python_file(local_path):
94
- try:
95
- r = subprocess.run([sys.executable, local_path],
96
- capture_output=True, text=True, timeout=15)
97
- out = (r.stdout + r.stderr).strip()
98
- print(f" Python output: '{out[:200]}'")
99
- return out if out else "No output."
100
- except Exception as e:
101
- return f"Error: {e}"
102
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  def clean_answer(text):
 
104
  text = text.strip()
105
- for p in ["FINAL ANSWER:", "Final Answer:", "Answer:", "The answer is:", "The answer is",
106
- "**Answer:**", "**Final Answer:**"]:
 
 
 
 
 
 
 
107
  if text.lower().startswith(p.lower()):
108
  text = text[len(p):].strip()
109
- return text.split("\n")[0].strip().strip('"').strip("'").strip("*").strip()
110
-
111
- def search_web(query, max_results=6):
112
- try:
113
- from duckduckgo_search import DDGS
114
- with DDGS() as ddgs:
115
- results = list(ddgs.text(query, max_results=max_results))
116
- if not results:
117
- return "No results."
118
- return "\n\n".join(
119
- f"Title: {r.get('title','')}\nSnippet: {r.get('body','')}\nURL: {r.get('href','')}"
120
- for r in results)
121
- except Exception as e:
122
- return f"Search error: {e}"
123
-
124
- def test_api():
125
- key = os.getenv("GROQ_API_KEY", "")
126
- if not key:
127
- return "❌ GROQ_API_KEY not set!"
128
- try:
129
- ans = rate_limited_groq(key, "What is 2+2?", "Reply with only the number.")
130
- return f"✅ Groq working! Test: '{ans}'"
131
- except Exception as e:
132
- return f"❌ {e}"
133
-
134
- SYSTEM = """You are a GAIA benchmark agent. Exact match grading — your answer must match exactly.
135
- Reply with ONLY the final answer. No explanation. No prefix. No "The answer is".
136
- Give the bare answer: a name, number, word, or short phrase only."""
 
 
 
137
 
138
  class BasicAgent:
 
139
  def __init__(self):
140
- self.key = os.getenv("GROQ_API_KEY", "")
 
 
141
  if not self.key:
142
- raise RuntimeError("GROQ_API_KEY not set!")
143
- print(f"Agent ready. Key: {self.key[:8]}...")
144
-
145
- def ask(self, prompt, max_tokens=128):
146
- return clean_answer(rate_limited_groq(self.key, prompt, SYSTEM, max_tokens))
147
-
148
- def __call__(self, question: str, task_id: str = "") -> str:
149
- print(f"\n{'='*50}\nTask: {task_id}\nQ: {question[:200]}")
150
-
151
- # Handle reversed text
152
- if "rewsna" in question or "dnatsrednu" in question:
153
- question = question[::-1]
154
- print(f" Reversed: {question}")
155
-
156
- file_ctx = ""
157
- is_py = False
158
-
159
- # Download file
160
- if task_id:
161
- print(f" Attempting file download for task_id={task_id}")
162
- lp, fn = download_task_file(task_id)
163
- if lp and fn:
164
- ext = os.path.splitext(fn)[-1].lower()
165
- if ext == ".py":
166
- is_py = True
167
- code = read_file_contents(lp, fn)
168
- out = run_python_file(lp)
169
- file_ctx = f"\n[Python: {fn}]\nCODE:\n{code}\nOUTPUT:\n{out}\n"
170
- elif ext in (".xlsx", ".xls", ".csv"):
171
- contents = read_file_contents(lp, fn)
172
- file_ctx = f"\n[File: {fn}]\n{contents[:6000]}\n"
173
- elif ext in (".png", ".jpg", ".jpeg"):
174
- file_ctx = f"\n[Image: {fn} attached.]\n"
175
- else:
176
- contents = read_file_contents(lp, fn)
177
- file_ctx = f"\n[File: {fn}]\n{contents[:4000]}\n"
178
- else:
179
- print(f" No file found for this task.")
180
-
181
- # Web search
182
- search_ctx = ""
183
- if not is_py:
184
- results = search_web(question[:200])
185
- if results and "error" not in results.lower():
186
- search_ctx = f"\n[Search]\n{results[:3500]}\n"
187
-
188
- # Format hints
189
- q = question.lower()
190
- fmt = ""
191
- if "studio album" in q:
192
- fmt = "\nCount only SOLO studio albums (exclude collaborative albums). Single integer answer."
193
- elif "first name" in q:
194
- fmt = "\nFirst name only."
195
- elif "surname" in q or "last name" in q:
196
- fmt = "\nSurname only."
197
- elif "at bat" in q or "at-bat" in q:
198
- fmt = "\nSingle integer only."
199
- elif "how many" in q:
200
- fmt = "\nSingle integer only."
201
- elif "ioc" in q or ("country" in q and "olympic" in q):
202
- fmt = "\nIOC country code only (3 letters, e.g. USA, GBR). If tied, alphabetically first."
203
- elif "excel" in q or ("sale" in q and "food" in q):
204
- fmt = "\nUSD with two decimal places (e.g. 89.50). No $ sign."
205
- elif "chess" in q:
206
- fmt = "\nChess move in algebraic notation only."
207
- elif "pitcher" in q and "number" in q:
208
- fmt = "\nTwo last names, comma-separated, pitcher with lower jersey number first."
209
- elif "wikipedia" in q and "nominat" in q:
210
- fmt = "\nWikipedia username only."
211
- elif "grocery" in q or ("shopping" in q and "list" in q):
212
- fmt = "\nComma-separated list, alphabetical order."
213
- elif "youtube" in q or "video" in q:
214
- fmt = "\nExact short answer only — quote, number, or name."
215
- elif "grant" in q or "award number" in q:
216
- fmt = "\nExact identifier only."
217
-
218
- prompt = (
219
- f"Question: {question}"
220
- f"{file_ctx}"
221
- f"{search_ctx}"
222
- f"{fmt}"
223
- "\n\nGive ONLY the final answer."
224
- )
225
-
226
- try:
227
- answer = self.ask(prompt, max_tokens=64)
228
- if len(answer.split()) > 20:
229
- answer = clean_answer(rate_limited_groq(
230
  self.key,
231
- f"Extract only the shortest final answer from:\n{answer}",
232
- "Reply with only the bare answer.", max_tokens=32))
233
- print(f" Final: '{answer}'")
234
- return answer
235
- except Exception as e:
236
- print(f" Error: {e}")
237
- return ""
238
-
239
- def run_and_submit_all(profile: gr.OAuthProfile | None):
240
- space_id = os.getenv("SPACE_ID")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  if not profile:
242
- return "Please Login to Hugging Face.", None
 
243
  username = profile.username
244
- try:
245
- agent = BasicAgent()
246
- except RuntimeError as e:
247
- return f"❌ {e}", None
248
-
249
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
250
- try:
251
- resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
252
- resp.raise_for_status()
253
- questions_data = resp.json()
254
- print(f"Fetched {len(questions_data)} questions.")
255
- except Exception as e:
256
- return f"Error: {e}", None
257
-
258
- results_log, answers_payload = [], []
259
- for i, item in enumerate(questions_data):
260
- task_id = item.get("task_id", "")
261
- question_text = item.get("question")
262
- if not task_id or question_text is None:
263
- continue
264
- print(f"\n[{i+1}/{len(questions_data)}]")
265
- try:
266
- # Pass task_id directly — no injection needed
267
- ans = agent(question_text, task_id=task_id)
268
- except Exception as e:
269
- ans = ""
270
- answers_payload.append({"task_id": task_id, "submitted_answer": ans})
271
- results_log.append({
272
- "Task ID": task_id,
273
- "Question": question_text[:100] + ("..." if len(question_text) > 100 else ""),
274
- "Submitted Answer": ans
275
  })
276
 
277
- if not answers_payload:
278
- return "No answers.", pd.DataFrame(results_log)
279
-
280
- try:
281
- resp = requests.post(f"{DEFAULT_API_URL}/submit",
282
- json={"username": username.strip(), "agent_code": agent_code, "answers": answers_payload},
283
- timeout=60)
284
- resp.raise_for_status()
285
- r = resp.json()
286
- return (f"Submission Successful!\nUser: {r.get('username')}\n"
287
- f"Score: {r.get('score')}% ({r.get('correct_count')}/{r.get('total_attempted')} correct)\n"
288
- f"Message: {r.get('message')}"), pd.DataFrame(results_log)
289
- except Exception as e:
290
- return f"Submission Failed: {e}", pd.DataFrame(results_log)
291
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
292
  with gr.Blocks() as demo:
293
- gr.Markdown("# Basic Agent Evaluation Runner")
294
- gr.Markdown("**Setup:** `GROQ_API_KEY` in Space Settings → Secrets. Free at [console.groq.com](https://console.groq.com)")
 
295
  gr.LoginButton()
296
- with gr.Row():
297
- test_btn = gr.Button("🔬 Test Groq API", variant="secondary")
298
- test_out = gr.Textbox(label="Test Result", lines=2, interactive=False)
299
- test_btn.click(fn=test_api, outputs=test_out)
300
- gr.Markdown("---")
301
- run_button = gr.Button("🚀 Run Evaluation & Submit All Answers", variant="primary")
302
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
303
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
304
- run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 
 
305
 
306
  if __name__ == "__main__":
307
- key = os.getenv("GROQ_API_KEY", "")
308
- print(f"GROQ_API_KEY: {'SET ✅ ' + key[:8] + '...' if key else 'NOT SET ❌'}")
309
- demo.launch(debug=True, share=False)
 
1
  import os
2
  import time
 
3
  import requests
4
  import pandas as pd
5
+ import gradio as gr
6
+ from ddgs import DDGS
 
7
 
8
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
9
 
10
+ # -------------------------
11
+ # GROQ API CALL
12
+ # -------------------------
13
# Monotonic timestamp of the most recent Groq request (0.0 = no request yet).
_last_call = 0.0


def call_llm(api_key, prompt, system="", max_tokens=128):
    """Send one chat-completion request to Groq and return the reply text.

    Requests are spaced at least 2.5 seconds apart (process-wide) to stay
    under the provider's per-minute request limit.

    Args:
        api_key: Groq API key used as the bearer token.
        prompt: Content of the user message.
        system: Optional system message; omitted from the request when empty.
        max_tokens: Upper bound on the number of generated tokens.

    Returns:
        The assistant message content with surrounding whitespace removed
        (an empty string if the API returned no content).

    Raises:
        RuntimeError: If the API answers with a non-200 status.
        requests.RequestException: On connection or timeout failures.
    """
    global _last_call

    min_interval = 2.5

    # Sleep only for the remainder of the interval, not a full extra interval.
    remaining = min_interval - (time.monotonic() - _last_call)
    if remaining > 0:
        time.sleep(remaining)

    url = "https://api.groq.com/openai/v1/chat/completions"

    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    # Do not send an empty system message; it only adds noise to the request.
    messages = []
    if system:
        messages.append({"role": "system", "content": system})
    messages.append({"role": "user", "content": prompt})

    data = {
        "model": "llama-3.3-70b-versatile",
        "messages": messages,
        "temperature": 0,
        "max_tokens": max_tokens,
    }

    _last_call = time.monotonic()
    r = requests.post(url, headers=headers, json=data, timeout=60)

    if r.status_code == 429:
        # Rate limited: wait as long as the server asks (capped at 60 s) and
        # retry once before giving up.
        try:
            delay = float(r.headers.get("retry-after", 60))
        except (TypeError, ValueError):
            delay = 60.0
        delay = max(0.0, min(delay, 60.0))
        print(f"Rate limited, waiting {delay:.0f}s before retrying")
        time.sleep(delay)
        _last_call = time.monotonic()
        r = requests.post(url, headers=headers, json=data, timeout=60)

    if r.status_code != 200:
        raise RuntimeError(f"Groq API error {r.status_code}: {r.text[:200]}")

    content = r.json()["choices"][0]["message"]["content"]
    return (content or "").strip()
46
+
47
+ # -------------------------
48
+ # CLEAN ANSWER
49
+ # -------------------------
50
def clean_answer(text):
    """Reduce a raw model reply to the bare answer used for exact-match grading.

    Leading answer markers ("FINAL ANSWER:", "Answer:", "The answer is", ...),
    optionally wrapped in markdown emphasis, are removed repeatedly. Only the
    first remaining line is kept, and surrounding quotes, asterisks and
    whitespace are trimmed.

    Args:
        text: Raw reply text; ``None`` is treated as an empty reply.

    Returns:
        The cleaned single-line answer, or ``""`` when nothing is left.
    """
    if text is None:
        return ""

    # Lower-case markers; the longer "the answer is:" must precede the bare form.
    prefixes = ("final answer:", "the answer is:", "the answer is", "answer:")

    cleaned = str(text).strip()

    stripped_prefix = True
    while stripped_prefix:
        stripped_prefix = False
        # Ignore markdown emphasis so "**Answer:** 42" is recognised.
        candidate = cleaned.lstrip("*").lstrip()
        for prefix in prefixes:
            if candidate[:len(prefix)].lower() != prefix:
                continue
            rest = candidate[len(prefix):]
            # A marker without a colon must end at a word boundary, so that
            # "The answer isn't ..." is not mangled.
            if not prefix.endswith(":") and rest[:1].isalnum():
                continue
            cleaned = rest.lstrip("*").strip()
            stripped_prefix = True
            break

    first_line = cleaned.split("\n", 1)[0]

    # Trim wrappers and whitespace together so nested forms like ' "42" ' work.
    return first_line.strip("\"'* \t\r")
68
+
69
+ # -------------------------
70
+ # WEB SEARCH
71
+ # -------------------------
72
def web_search(query, max_results=6):
    """Run a DuckDuckGo text search and return the hits as plain text.

    The search is best-effort: any failure (network error, rate limiting,
    malformed result) is logged and yields whatever was collected so far,
    so a flaky search never aborts the question being answered.

    Args:
        query: Search query string.
        max_results: Maximum number of hits to request.

    Returns:
        One "title — snippet" line per hit joined by newlines, or ``""``
        when the query is empty or nothing could be retrieved.
    """
    if not query:
        return ""

    results = []

    try:
        with DDGS() as ddgs:
            for r in ddgs.text(query, max_results=max_results):
                # Hits are not guaranteed to carry both fields.
                title = r.get("title", "")
                body = r.get("body", "")
                if title or body:
                    results.append(f"{title} — {body}")
    except Exception as e:
        # Boundary with a third-party service: log and fall back.
        print("Search error:", e)

    return "\n".join(results)
83
+
84
+ # -------------------------
85
+ # AGENT
86
+ # -------------------------
87
# System prompt sent with every model request. The leading and trailing empty
# entries reproduce the newline that opens and closes the prompt text.
SYSTEM = "\n".join([
    "",
    "You are solving GAIA benchmark questions.",
    "",
    "Rules:",
    "Return ONLY the final answer.",
    "No explanation.",
    "Exact match grading.",
    "",
])
95
 
96
class BasicAgent:
    """Question-answering agent: web search for context, then one LLM call.

    Calling an instance with a question returns the cleaned answer string,
    or ``""`` when no answer could be produced.
    """

    # Number of LLM attempts per question and the pause between attempts.
    MAX_ATTEMPTS = 3
    RETRY_DELAY = 2

    def __init__(self):
        """Read the Groq API key from the environment.

        Raises:
            RuntimeError: If ``GROQ_API_KEY`` is unset or empty.
        """
        self.key = os.getenv("GROQ_API_KEY")

        if not self.key:
            raise RuntimeError("GROQ_API_KEY missing")

        print("Agent ready")

    def solve(self, prompt):
        """Ask the model for an answer, retrying on errors and empty replies.

        Args:
            prompt: Full user prompt (question plus any context).

        Returns:
            The first non-empty cleaned answer, or ``""`` if every attempt
            failed.
        """
        for attempt in range(self.MAX_ATTEMPTS):
            try:
                answer = clean_answer(
                    call_llm(self.key, prompt, SYSTEM, max_tokens=128)
                )
                if answer:
                    return answer
            except Exception as e:
                # Best-effort boundary: log the failure and try again.
                print("Retry:", e)

            # Pause only when another attempt follows.
            if attempt < self.MAX_ATTEMPTS - 1:
                time.sleep(self.RETRY_DELAY)

        return ""

    def __call__(self, question, task_id=""):
        """Answer one benchmark question.

        Args:
            question: Question text.
            task_id: Task identifier; accepted for caller compatibility and
                currently unused.

        Returns:
            The answer string (possibly empty).
        """
        print("Question:", question)

        # A failed search must not abort the question; answer without context.
        try:
            search = web_search(question)
        except Exception as e:
            print("Search failed:", e)
            search = ""

        prompt = (
            "\nQuestion:\n"
            f"{question}\n"
            "\nWeb information:\n"
            f"{search}\n"
            "\nReturn ONLY the final answer.\n"
        )

        answer = self.solve(prompt)

        print("Answer:", answer)

        return answer
154
+
155
+ # -------------------------
156
+ # EVALUATION
157
+ # -------------------------
158
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Answer every benchmark question and submit the answers for scoring.

    The ``gr.OAuthProfile | None`` annotation is required: Gradio uses it to
    inject the logged-in user's profile (or ``None``) into the click handler.

    Args:
        profile: Profile of the logged-in Hugging Face user, or ``None``
            when nobody is logged in.

    Returns:
        A ``(status_message, results)`` tuple, where ``results`` is a
        DataFrame of task ids and submitted answers, or ``None`` when the
        run stopped before any question was processed.
    """
    if not profile:
        return "Please login", None

    username = profile.username

    try:
        agent = BasicAgent()
    except RuntimeError as e:
        # Typically a missing API key; report it instead of crashing the UI.
        return f"Agent setup failed: {e}", None

    try:
        response = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
        response.raise_for_status()
        questions = response.json()
    except (requests.RequestException, ValueError) as e:
        return f"Could not fetch questions: {e}", None

    answers = []
    logs = []

    for q in questions:
        task_id = q.get("task_id")
        question = q.get("question")
        if not task_id or question is None:
            continue

        # One failing question must not discard all other answers.
        try:
            ans = agent(question, task_id)
        except Exception as e:
            print(f"Task {task_id} failed: {e}")
            ans = ""

        answers.append({
            "task_id": task_id,
            "submitted_answer": ans,
        })

        logs.append({
            "Task": task_id,
            "Answer": ans,
        })

    if not answers:
        return "No answers to submit", pd.DataFrame(logs)

    # Link to this Space's code when running on Hugging Face Spaces.
    space_id = os.getenv("SPACE_ID")
    agent_code = (
        f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else ""
    )

    try:
        response = requests.post(
            f"{DEFAULT_API_URL}/submit",
            json={
                "username": username,
                "agent_code": agent_code,
                "answers": answers,
            },
            timeout=60,
        )
        response.raise_for_status()
        result = response.json()
    except (requests.RequestException, ValueError) as e:
        return f"Submission failed: {e}", pd.DataFrame(logs)

    msg = (
        f"\nUser: {result.get('username')}"
        f"\nScore: {result.get('score')}%"
        f"\nCorrect: {result.get('correct_count')}\n"
    )

    return msg, pd.DataFrame(logs)
207
+
208
+ # -------------------------
209
+ # UI
210
+ # -------------------------
211
# Build the page: title, Hugging Face login, run button and the two outputs.
with gr.Blocks() as demo:
    gr.Markdown("# GAIA Agent")
    gr.LoginButton()

    run_btn = gr.Button("Run Evaluation")
    status = gr.Textbox()
    table = gr.DataFrame()

    # The handler takes no input components; it fills the status box and table.
    run_btn.click(fn=run_and_submit_all, outputs=[status, table])


# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()