Karim0111 committed on
Commit
78b82b6
·
verified ·
1 Parent(s): ffc93d9

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +155 -393
app.py CHANGED
@@ -4,434 +4,196 @@ import requests
4
  import pandas as pd
5
  import re
6
  from huggingface_hub import InferenceClient
7
- import time
8
 
9
  # --- Constants ---
10
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
11
 
12
- # --- Enhanced Agent Definition ---
13
- class EnhancedGAIAAgent:
 
 
 
14
  """
15
- Enhanced agent for GAIA benchmark using HuggingFace Inference API.
16
- Focuses on accurate reasoning and proper answer extraction.
17
  """
18
-
19
  def __init__(self):
20
- print("EnhancedGAIAAgent initialized with HuggingFace Inference API.")
21
-
22
- # Get HF token - try multiple environment variable names
23
- hf_token = os.getenv("HF_TOKEN") or os.getenv("HUGGING_FACE_HUB_TOKEN") or os.getenv("HF_API_TOKEN")
24
-
 
 
 
25
  if not hf_token:
26
- print("⚠️ ERROR: No HF token found!")
27
- print(" Add HF_TOKEN to Space secrets: Settings → Variables and secrets")
28
- print(" Get token at: https://huggingface.co/settings/tokens")
29
- self.client = None
30
- self.model = None
31
- return
32
-
33
- try:
34
- self.client = InferenceClient(token=hf_token)
35
-
36
- # Use a reliable, fast model
37
- # Llama 3.1 8B is fast and good for general tasks
38
- self.model = "Qwen/Qwen2.5-Coder-32B-Instruct"
39
-
40
- print(f"✅ Model initialized: {self.model}")
41
- print(f"✅ HF Token configured")
42
-
43
- except Exception as e:
44
- print(f"❌ Error initializing Inference Client: {e}")
45
- self.client = None
46
- self.model = None
47
-
48
  def __call__(self, question: str) -> str:
49
- """
50
- Answer a question using the LLM.
51
- """
52
- print(f"\n{'='*60}")
53
- print(f"Q: {question[:150]}...")
54
-
55
- if self.client is None or self.model is None:
56
- error = "ERROR: HF_TOKEN not configured in Space secrets"
57
- print(f"A: {error}")
58
- return error
59
-
60
  try:
61
- answer = self._generate_answer(question)
62
- print(f"A: {answer[:150]}...")
63
- print(f"{'='*60}\n")
64
  return answer
65
-
66
  except Exception as e:
67
- print(f"❌ Error: {e}")
68
- fallback = self._smart_fallback(question)
69
- print(f"A (fallback): {fallback}")
70
- print(f"{'='*60}\n")
71
- return fallback
72
-
73
- def _generate_answer(self, question: str, max_retries: int = 2) -> str:
74
- """
75
- Generate answer using HF Inference API with retries.
76
- """
77
- # Create a focused prompt that encourages concise answers
78
- prompt = """You are an expert at the GAIA benchmark.
 
 
 
 
 
 
 
79
 
80
- For each question:
81
- 1. Identify the question type (factual, math, reasoning)
82
- 2. Plan your approach
83
- 3. Solve step-by-step
84
- 4. Format answer clearly with "FINAL ANSWER: X"
 
 
 
 
 
 
 
85
 
86
- Be precise and concise!"""
87
-
88
- for attempt in range(max_retries):
89
- try:
90
- # Try text_generation first (more reliable for simple API)
91
- response = self.client.text_generation(
92
- prompt,
93
- model=self.model,
94
- max_new_tokens=512,
95
- temperature=0.1,
96
- do_sample=False,
97
- return_full_text=False,
98
- )
99
-
100
- if response:
101
- answer = self._clean_answer(response)
102
- if len(answer) > 0:
103
- return answer
104
-
105
- except Exception as e:
106
- print(f"Attempt {attempt + 1} failed: {e}")
107
- if attempt < max_retries - 1:
108
- time.sleep(1)
109
- continue
110
- else:
111
- # Last resort: try chat completion
112
- try:
113
- messages = [
114
- {"role": "system", "content": "You are a helpful assistant. Answer concisely."},
115
- {"role": "user", "content": question}
116
- ]
117
-
118
- chat_response = self.client.chat_completion(
119
- messages=messages,
120
- model=self.model,
121
- max_tokens=512,
122
- temperature=0.1,
123
- )
124
-
125
- if chat_response and chat_response.choices:
126
- answer = chat_response.choices[0].message.content
127
- return self._clean_answer(answer)
128
-
129
- except Exception as e2:
130
- print(f"Chat completion also failed: {e2}")
131
-
132
- # If all else fails
133
- return self._smart_fallback(question)
134
-
135
  def _clean_answer(self, text: str) -> str:
136
  """
137
- Extract the cleanest possible answer from model output.
138
  """
139
- if not text:
140
- return ""
141
-
142
  text = text.strip()
143
-
144
- # Remove common prefixes
145
- prefixes_to_remove = [
146
- "Answer:",
147
- "The answer is:",
148
- "The answer is",
149
- "A:",
150
- "Final answer:",
151
- "Result:",
152
  ]
153
-
154
- for prefix in prefixes_to_remove:
155
- if text.lower().startswith(prefix.lower()):
156
- text = text[len(prefix):].strip()
157
-
158
- # Try to extract final answer if text is long
159
- if len(text) > 200:
160
- # Look for concluding patterns
161
- patterns = [
162
- r"(?:therefore|thus|so|hence)[,:]?\s+(.+?)(?:\.|$)",
163
- r"(?:the answer is|final answer)[:]?\s+(.+?)(?:\.|$)",
164
- r"(?:result)[:]?\s+(.+?)(?:\.|$)",
165
- ]
166
-
167
- for pattern in patterns:
168
- match = re.search(pattern, text, re.IGNORECASE)
169
- if match:
170
- extracted = match.group(1).strip()
171
- if 2 < len(extracted) < 100:
172
- return extracted
173
-
174
- # If no pattern matched, take last sentence
175
- sentences = text.split('.')
176
- if len(sentences) > 1:
177
- last_sentence = sentences[-2].strip()
178
- if 2 < len(last_sentence) < 100:
179
- return last_sentence
180
-
181
  return text
182
-
183
- def _smart_fallback(self, question: str) -> str:
184
- """
185
- Provide intelligent fallback answers based on question analysis.
186
- """
187
- q_lower = question.lower()
188
-
189
- # Math/calculation questions
190
- if any(word in q_lower for word in ["calculate", "compute", "how many", "what is"]):
191
- # Try to extract numbers and operators
192
- numbers = re.findall(r'-?\d+\.?\d*', question)
193
-
194
- if len(numbers) >= 2:
195
- try:
196
- # Simple arithmetic detection
197
- if '+' in question or 'plus' in q_lower or 'sum' in q_lower:
198
- result = float(numbers[0]) + float(numbers[1])
199
- return str(int(result) if result.is_integer() else result)
200
- elif '-' in question or 'minus' in q_lower or 'difference' in q_lower:
201
- result = float(numbers[0]) - float(numbers[1])
202
- return str(int(result) if result.is_integer() else result)
203
- elif '*' in question or 'x' in question or 'times' in q_lower or 'multiply' in q_lower:
204
- result = float(numbers[0]) * float(numbers[1])
205
- return str(int(result) if result.is_integer() else result)
206
- elif '/' in question or 'divide' in q_lower:
207
- result = float(numbers[0]) / float(numbers[1])
208
- return str(int(result) if result.is_integer() else result)
209
- elif '%' in question or 'percent' in q_lower:
210
- # X% of Y
211
- result = (float(numbers[0]) / 100) * float(numbers[1])
212
- return str(int(result) if result.is_integer() else result)
213
- except:
214
- pass
215
-
216
- # Year/date questions
217
- if any(word in q_lower for word in ["when", "what year", "date"]):
218
- # Look for years in the question
219
- years = re.findall(r'\b(19\d{2}|20\d{2})\b', question)
220
- if years:
221
- return years[-1] # Return most recent year mentioned
222
- return "2024"
223
-
224
- # Counting questions
225
- if "how many" in q_lower or "count" in q_lower:
226
- numbers = re.findall(r'\b\d+\b', question)
227
- if numbers:
228
- return numbers[0]
229
-
230
- # Default
231
- return "Unable to determine answer"
232
 
233
 
 
 
 
234
  def run_and_submit_all(profile: gr.OAuthProfile | None):
235
- """
236
- Run agent on all questions and submit results.
237
- """
238
- space_id = os.getenv("SPACE_ID")
239
-
240
- if profile:
241
- username = f"{profile.username}"
242
- print(f"User logged in: {username}")
243
- else:
244
- print("User not logged in.")
245
- return "Please Login to Hugging Face with the button.", None
246
-
247
- api_url = DEFAULT_API_URL
248
- questions_url = f"{api_url}/questions"
249
- submit_url = f"{api_url}/submit"
250
-
251
- # 1. Instantiate Agent
252
- print("\n" + "="*60)
253
- print("INITIALIZING AGENT")
254
- print("="*60)
255
-
256
- try:
257
- agent = EnhancedGAIAAgent()
258
- if agent.client is None or agent.model is None:
259
- return """⚠️ SETUP REQUIRED: HF_TOKEN not found!
260
-
261
- Steps to fix:
262
- 1. Go to https://huggingface.co/settings/tokens
263
- 2. Create a new token (Read access)
264
- 3. Copy your token
265
- 4. In your Space: Settings → Variables and secrets → New secret
266
- 5. Name: HF_TOKEN
267
- 6. Value: Paste your token
268
- 7. Save and restart Space
269
-
270
- The agent cannot run without this token.""", None
271
- except Exception as e:
272
- print(f"Error instantiating agent: {e}")
273
- return f"Error initializing agent: {e}", None
274
-
275
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
276
- print(f"Agent code: {agent_code}")
277
-
278
- # 2. Fetch Questions
279
- print("\n" + "="*60)
280
- print("FETCHING QUESTIONS")
281
- print("="*60)
282
-
283
- try:
284
- response = requests.get(questions_url, timeout=15)
285
- response.raise_for_status()
286
- questions_data = response.json()
287
- if not questions_data:
288
- return "No questions received from server.", None
289
- print(f"✅ Fetched {len(questions_data)} questions")
290
- except Exception as e:
291
- print(f"❌ Error fetching questions: {e}")
292
- return f"Error fetching questions: {e}", None
293
-
294
- # 3. Run Agent on All Questions
295
- print("\n" + "="*60)
296
- print("RUNNING AGENT ON QUESTIONS")
297
- print("="*60)
298
-
299
- results_log = []
300
  answers_payload = []
301
-
302
- for idx, item in enumerate(questions_data):
303
- task_id = item.get("task_id")
304
- question_text = item.get("question")
305
-
306
- if not task_id or question_text is None:
307
- print(f"⚠️ Skipping invalid item: {item}")
308
- continue
309
-
310
- print(f"\n[{idx + 1}/{len(questions_data)}] Task ID: {task_id}")
311
-
312
- try:
313
- submitted_answer = agent(question_text)
314
-
315
- answers_payload.append({
316
- "task_id": task_id,
317
- "submitted_answer": submitted_answer
318
- })
319
-
320
- results_log.append({
321
- "Task ID": task_id,
322
- "Question": question_text[:80] + "..." if len(question_text) > 80 else question_text,
323
- "Answer": submitted_answer[:80] + "..." if len(submitted_answer) > 80 else submitted_answer
324
- })
325
-
326
- except Exception as e:
327
- print(f"❌ Error on task {task_id}: {e}")
328
- error_answer = "Error processing question"
329
- answers_payload.append({
330
- "task_id": task_id,
331
- "submitted_answer": error_answer
332
- })
333
- results_log.append({
334
- "Task ID": task_id,
335
- "Question": question_text[:80] + "...",
336
- "Answer": error_answer
337
- })
338
-
339
- if not answers_payload:
340
- return "No answers generated.", pd.DataFrame(results_log)
341
-
342
- # 4. Submit Results
343
- print("\n" + "="*60)
344
- print("SUBMITTING RESULTS")
345
- print("="*60)
346
-
347
- submission_data = {
348
- "username": username.strip(),
349
- "agent_code": agent_code,
350
  "answers": answers_payload
351
  }
352
-
353
- print(f"Submitting {len(answers_payload)} answers for {username}...")
354
 
355
- try:
356
- response = requests.post(submit_url, json=submission_data, timeout=60)
357
- response.raise_for_status()
358
- result_data = response.json()
359
-
360
- final_status = (
361
- f"🎉 Submission Successful!\n\n"
362
- f"User: {result_data.get('username')}\n"
363
- f"Score: {result_data.get('score', 'N/A')}% "
364
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n\n"
365
- f"{result_data.get('message', '')}"
366
- )
367
-
368
- print(f"\n✅ {final_status}")
369
- return final_status, pd.DataFrame(results_log)
370
-
371
- except Exception as e:
372
- error_msg = f"Submission failed: {e}"
373
- print(f"❌ {error_msg}")
374
- return error_msg, pd.DataFrame(results_log)
375
 
 
 
 
 
 
376
 
377
- # --- Gradio Interface ---
378
- with gr.Blocks(title="GAIA Agent Evaluation") as demo:
379
- gr.Markdown("# 🤗 GAIA Benchmark Agent")
380
  gr.Markdown(
381
  """
382
- **Setup Required:**
383
- 1. ⚠️ Add HF_TOKEN to Space secrets (Settings → Variables and secrets)
384
- 2. Get free token at: https://huggingface.co/settings/tokens
385
- 3. Token type: "Read" access is enough
386
-
387
- **Then:**
388
- - Login with HuggingFace
389
- - Click Run button
390
- - Wait 5-10 minutes
391
- - Get your score!
392
-
393
- **Target:** 30%+ to pass ✅
394
  """
395
  )
396
 
397
  gr.LoginButton()
398
-
399
- run_button = gr.Button("🚀 Run Evaluation", variant="primary", size="lg")
400
-
401
- status_output = gr.Textbox(
402
- label="Status",
403
- lines=8,
404
- interactive=False
405
- )
406
-
407
- results_table = gr.DataFrame(
408
- label="Results",
409
- wrap=True
410
- )
411
 
412
- run_button.click(
413
- fn=run_and_submit_all,
414
- outputs=[status_output, results_table]
415
- )
416
 
417
  if __name__ == "__main__":
418
- print("\n" + "="*70)
419
- print(" "*20 + "GAIA AGENT STARTING")
420
- print("="*70)
421
-
422
- space_host = os.getenv("SPACE_HOST")
423
- space_id = os.getenv("SPACE_ID")
424
- hf_token = os.getenv("HF_TOKEN")
425
-
426
- if space_host:
427
- print(f"✅ Space Host: {space_host}")
428
- if space_id:
429
- print(f"✅ Space ID: {space_id}")
430
- if hf_token:
431
- print(f"✅ HF_TOKEN: Found")
432
- else:
433
- print(f"⚠️ HF_TOKEN: NOT FOUND - Please add to Space secrets!")
434
-
435
- print("="*70 + "\n")
436
-
437
- demo.launch(debug=True, share=False)
 
4
  import pandas as pd
5
  import re
6
  from huggingface_hub import InferenceClient
 
7
 
8
  # --- Constants ---
9
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
10
 
11
+
12
+ # =========================
13
+ # GAIA OPTIMIZED AGENT
14
+ # =========================
15
+ class GAIAAgent:
16
  """
17
+ GAIA benchmark agent chat-only, nscale-safe, exact answers.
 
18
  """
19
+
20
  def __init__(self):
21
+ print("🚀 GAIAAgent initializing...")
22
+
23
+ hf_token = (
24
+ os.getenv("HF_TOKEN")
25
+ or os.getenv("HUGGING_FACE_HUB_TOKEN")
26
+ or os.getenv("HF_API_TOKEN")
27
+ )
28
+
29
  if not hf_token:
30
+ raise RuntimeError("HF_TOKEN not found in Space secrets")
31
+
32
+ self.client = InferenceClient(token=hf_token)
33
+
34
+ # SAFE MODELS (chat-only)
35
+ self.model = "meta-llama/Meta-Llama-3.1-8B-Instruct"
36
+ # Alternative:
37
+ # self.model = "Qwen/Qwen2.5-7B-Instruct"
38
+
39
+ print(f"✅ Model loaded: {self.model}")
40
+
 
 
 
 
 
 
 
 
 
 
 
41
  def __call__(self, question: str) -> str:
42
+ print(f"\nQ: {question[:120]}")
43
+
 
 
 
 
 
 
 
 
 
44
  try:
45
+ answer = self._chat_answer(question)
46
+ print(f"A: {answer}")
 
47
  return answer
 
48
  except Exception as e:
49
+ print(f"❌ Agent error: {e}")
50
+ return "Unable to determine answer"
51
+
52
+ def _chat_answer(self, question: str) -> str:
53
+ messages = [
54
+ {
55
+ "role": "system",
56
+ "content": (
57
+ "You are an expert GAIA benchmark solver.\n"
58
+ "Answer EXACTLY what is asked.\n"
59
+ "Return ONLY the final answer.\n"
60
+ "No explanations, no prefixes, no formatting."
61
+ )
62
+ },
63
+ {
64
+ "role": "user",
65
+ "content": question
66
+ }
67
+ ]
68
 
69
+ response = self.client.chat_completion(
70
+ model=self.model,
71
+ messages=messages,
72
+ max_tokens=256,
73
+ temperature=0.0,
74
+ )
75
+
76
+ if not response or not response.choices:
77
+ return "Unable to determine answer"
78
+
79
+ raw = response.choices[0].message.content.strip()
80
+ return self._clean_answer(raw)
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  def _clean_answer(self, text: str) -> str:
83
  """
84
+ GAIA-safe cleaning: minimal, no hallucinated trimming.
85
  """
 
 
 
86
  text = text.strip()
87
+
88
+ # Remove common junk if model disobeys
89
+ bad_prefixes = [
90
+ "answer:",
91
+ "final answer:",
92
+ "the answer is",
93
+ "result:"
 
 
94
  ]
95
+
96
+ for p in bad_prefixes:
97
+ if text.lower().startswith(p):
98
+ text = text[len(p):].strip()
99
+
100
+ # If multi-line, keep first meaningful line
101
+ if "\n" in text:
102
+ text = text.split("\n")[0].strip()
103
+
104
+ # GAIA prefers concise
105
+ if len(text.split()) > 12:
106
+ # keep last sentence
107
+ parts = re.split(r"[.!?]", text)
108
+ text = parts[-2].strip() if len(parts) > 1 else parts[0].strip()
109
+
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
 
113
+ # =========================
114
+ # RUN + SUBMIT
115
+ # =========================
116
  def run_and_submit_all(profile: gr.OAuthProfile | None):
117
+
118
+ if not profile:
119
+ return "Please login with Hugging Face.", None
120
+
121
+ username = profile.username
122
+ print(f"👤 User: {username}")
123
+
124
+ questions_url = f"{DEFAULT_API_URL}/questions"
125
+ submit_url = f"{DEFAULT_API_URL}/submit"
126
+
127
+ agent = GAIAAgent()
128
+
129
+ # Fetch questions
130
+ questions = requests.get(questions_url, timeout=15).json()
131
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
  answers_payload = []
133
+ results_log = []
134
+
135
+ for idx, item in enumerate(questions):
136
+ task_id = item["task_id"]
137
+ question = item["question"]
138
+
139
+ print(f"\n[{idx+1}/{len(questions)}] {task_id}")
140
+ answer = agent(question)
141
+
142
+ answers_payload.append({
143
+ "task_id": task_id,
144
+ "submitted_answer": answer
145
+ })
146
+
147
+ results_log.append({
148
+ "Task ID": task_id,
149
+ "Answer": answer
150
+ })
151
+
152
+ submission = {
153
+ "username": username,
154
+ "agent_code": f"https://huggingface.co/spaces/{os.getenv('SPACE_ID')}",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  "answers": answers_payload
156
  }
 
 
157
 
158
+ response = requests.post(submit_url, json=submission, timeout=60)
159
+ result = response.json()
160
+
161
+ status = (
162
+ f"🎉 Submission Successful\n\n"
163
+ f"Score: {result.get('score')}%\n"
164
+ f"Correct: {result.get('correct_count')}/{result.get('total_attempted')}"
165
+ )
166
+
167
+ return status, pd.DataFrame(results_log)
168
+
 
 
 
 
 
 
 
 
 
169
 
170
+ # =========================
171
+ # GRADIO UI
172
+ # =========================
173
+ with gr.Blocks(title="GAIA Agent") as demo:
174
+ gr.Markdown("# 🤗 GAIA Benchmark Agent (Fixed)")
175
 
 
 
 
176
  gr.Markdown(
177
  """
178
+ Chat-only
179
+ nscale-safe
180
+ GAIA-optimized
181
+
182
+ **Steps**
183
+ 1. Add `HF_TOKEN` to Space secrets
184
+ 2. Login with Hugging Face
185
+ 3. Click Run
 
 
 
 
186
  """
187
  )
188
 
189
  gr.LoginButton()
190
+ run_btn = gr.Button("🚀 Run Evaluation", variant="primary")
191
+
192
+ status = gr.Textbox(label="Status", lines=6)
193
+ table = gr.DataFrame(label="Results")
194
+
195
+ run_btn.click(run_and_submit_all, outputs=[status, table])
 
 
 
 
 
 
 
196
 
 
 
 
 
197
 
198
  if __name__ == "__main__":
199
+ demo.launch(debug=True)