shsad commited on
Commit
cdfb1cc
·
verified ·
1 Parent(s): 01e47af

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +220 -90
app.py CHANGED
@@ -40,18 +40,135 @@ Question: "What is the capital?" -> Answer: "Paris"
40
  Question: "List the winners" → Answer: "John, Mary, Bob"
41
  """
42
 
 
 
 
 
43
 
 
 
44
 
 
 
 
 
 
 
45
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
-
48
-
49
-
50
- # Submission logic Unchanged
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def run_and_submit_all( profile: gr.OAuthProfile | None):
52
  """
53
- Fetches all questions, runs the BasicAgent on them, submits all answers,
54
- and displays the results.
55
  """
56
  # --- Determine HF Space Runtime URL and Repo URL ---
57
  space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
@@ -67,15 +184,19 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
67
  questions_url = f"{api_url}/questions"
68
  submit_url = f"{api_url}/submit"
69
 
70
- # 1. Instantiate Agent ( modify this part to create your agent)
 
 
 
71
  try:
72
- agent = BasicAgent()
73
  except Exception as e:
74
  print(f"Error instantiating agent: {e}")
 
 
75
  return f"Error initializing agent: {e}", None
76
- # In the case of an app running as a Hugging Face Space, this link points toward your codebase (useful for others, so please keep it public)
77
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
78
- print(agent_code)
79
 
80
  # 2. Fetch Questions
81
  print(f"Fetching questions from: {questions_url}")
@@ -98,101 +219,117 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
98
  print(f"An unexpected error occurred fetching questions: {e}")
99
  return f"An unexpected error occurred fetching questions: {e}", None
100
 
101
- # 3. Run your Agent
102
  results_log = []
103
  answers_payload = []
104
- print(f"Running agent on {len(questions_data)} questions...")
105
- for item in questions_data:
 
 
 
 
 
106
  task_id = item.get("task_id")
107
  question_text = item.get("question")
 
108
  if not task_id or question_text is None:
109
- print(f"Skipping item with missing task_id or question: {item}")
110
  continue
 
 
 
 
 
 
 
111
  try:
112
  submitted_answer = agent(question_text)
113
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
114
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
 
 
 
115
  except Exception as e:
116
- print(f"Error running agent on task {task_id}: {e}")
117
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
 
 
 
 
118
 
119
  if not answers_payload:
120
- print("Agent did not produce any answers to submit.")
121
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
122
 
123
- # 4. Prepare Submission
124
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
125
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
126
- print(status_update)
127
-
128
- # 5. Submit
129
- print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
 
 
 
 
 
 
 
130
  try:
131
  response = requests.post(submit_url, json=submission_data, timeout=60)
132
  response.raise_for_status()
133
  result_data = response.json()
 
 
 
 
 
134
  final_status = (
135
  f"Submission Successful!\n"
136
  f"User: {result_data.get('username')}\n"
137
- f"Overall Score: {result_data.get('score', 'N/A')}% "
138
- f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
139
- f"Message: {result_data.get('message', 'No message received.')}"
140
  )
141
- print("Submission successful.")
142
  results_df = pd.DataFrame(results_log)
143
  return final_status, results_df
144
- except requests.exceptions.HTTPError as e:
145
- error_detail = f"Server responded with status {e.response.status_code}."
146
- try:
147
- error_json = e.response.json()
148
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
149
- except requests.exceptions.JSONDecodeError:
150
- error_detail += f" Response: {e.response.text[:500]}"
151
- status_message = f"Submission Failed: {error_detail}"
152
- print(status_message)
153
- results_df = pd.DataFrame(results_log)
154
- return status_message, results_df
155
- except requests.exceptions.Timeout:
156
- status_message = "Submission Failed: The request timed out."
157
- print(status_message)
158
- results_df = pd.DataFrame(results_log)
159
- return status_message, results_df
160
- except requests.exceptions.RequestException as e:
161
- status_message = f"Submission Failed: Network error - {e}"
162
- print(status_message)
163
- results_df = pd.DataFrame(results_log)
164
- return status_message, results_df
165
  except Exception as e:
166
- status_message = f"An unexpected error occurred during submission: {e}"
167
- print(status_message)
168
  results_df = pd.DataFrame(results_log)
169
- return status_message, results_df
170
 
171
 
172
  # --- Build Gradio Interface using Blocks ---
173
  with gr.Blocks() as demo:
174
- gr.Markdown("# Basic Agent Evaluation Runner")
175
  gr.Markdown(
176
  """
177
- **Instructions:**
178
-
179
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
180
- 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
181
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
182
-
183
- ---
184
- **Disclaimers:**
185
- Once you click the "Submit" button, it can take quite some time (this is the time for the agent to go through all the questions).
186
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance, for the slow submit button, a solution could be to cache the answers and submit them in a separate action, or even to answer the questions asynchronously.
 
 
 
 
 
 
187
  """
188
  )
189
 
190
  gr.LoginButton()
191
-
192
- run_button = gr.Button("Run Evaluation & Submit All Answers")
193
-
194
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
195
- # Removed max_rows=10 from DataFrame constructor
196
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
197
 
198
  run_button.click(
@@ -200,26 +337,19 @@ with gr.Blocks() as demo:
200
  outputs=[status_output, results_table]
201
  )
202
 
203
- if __name__ == "__main__":
204
- print("\n" + "-"*30 + " App Starting " + "-"*30)
205
- # Check for SPACE_HOST and SPACE_ID at startup for information
206
- space_host_startup = os.getenv("SPACE_HOST")
207
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
208
-
209
- if space_host_startup:
210
- print(f"✅ SPACE_HOST found: {space_host_startup}")
211
- print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
212
- else:
213
- print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
214
 
215
- if space_id_startup: # Print repo URLs if SPACE_ID is found
216
- print(f" SPACE_ID found: {space_id_startup}")
217
- print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
218
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
219
- else:
220
- print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
221
-
222
- print("-"*(60 + len(" App Starting ")) + "\n")
223
-
224
- print("Launching Gradio Interface for Basic Agent Evaluation...")
 
 
 
 
225
  demo.launch(debug=True, share=False)
 
40
  Question: "List the winners" → Answer: "John, Mary, Bob"
41
  """
42
 
43
class SmolGaiaAgent:
    """
    Premium agent optimized for maximum accuracy on GAIA Level 1.

    Wraps a smolagents ``CodeAgent`` around an OpenAI chat model and
    post-processes the raw model output so that only the bare answer
    string remains (GAIA scoring is an exact string match).
    """

    def __init__(self):
        print("Initializing Premium SmolGaiaAgent...")

        # Backing LLM. NOTE(review): OPENAI_API_KEY must be set in the
        # environment or the first model call will fail.
        self.model = OpenAIModel(
            model_id="gpt-4.1",
            api_key=os.getenv("OPENAI_API_KEY"),
        )

        # More steps = better accuracy (but slower). Older smolagents
        # releases do not accept a `system_prompt` kwarg, so fall back to
        # prefixing the prompt onto every task instead.
        try:
            self.agent = CodeAgent(
                tools=[],
                add_base_tools=True,
                model=self.model,
                max_steps=12,  # increased from 6 for thorough reasoning
                system_prompt=GAIA_SYSTEM_PROMPT,
            )
            print("Agent initialized with system_prompt parameter")
            self.use_task_prefix = False
        except TypeError as e:
            print(f"system_prompt not supported, using task prefix: {e}")
            self.agent = CodeAgent(
                tools=[],
                add_base_tools=True,
                model=self.model,
                max_steps=12,
            )
            self.use_task_prefix = True

    def __call__(self, question: str) -> str:
        """
        Run the CodeAgent on one question with enhanced answer extraction.

        Returns the cleaned answer string, or the literal
        "Error processing question" if the agent raises.
        """
        print(f"[Premium Agent] Question: {question[:80]}...")

        if self.use_task_prefix:
            task = f"{GAIA_SYSTEM_PROMPT}\n\nTask: {question}"
        else:
            task = question

        try:
            answer = str(self.agent.run(task)).strip()

            # Enhanced answer cleaning.
            answer = self.aggressive_clean_answer(answer)

            print(f"[Premium Agent] Final Answer: {answer}")
            return answer
        except Exception as e:
            print(f"[Premium Agent] Error: {e}")
            import traceback
            traceback.print_exc()
            return "Error processing question"

    def aggressive_clean_answer(self, answer: str) -> str:
        """
        Aggressively strip boilerplate so only the bare answer remains.

        Removes "answer is"-style prefixes, one pair of surrounding
        quotes, a trailing period (kept when it follows a digit, to
        preserve decimals) and leading articles ("The"/"a"/"an").
        """
        original = answer

        # Remove common prefixes (case insensitive, checked in order).
        prefixes_to_remove = [
            "final answer:",
            "the final answer is:",
            "answer:",
            "the answer is:",
            "the answer is",
            "result:",
            "solution:",
            "output:",
        ]

        answer_lower = answer.lower()
        for prefix in prefixes_to_remove:
            if answer_lower.startswith(prefix):
                answer = answer[len(prefix):].strip()
                answer_lower = answer.lower()

        # Remove surrounding quotes.
        if (answer.startswith('"') and answer.endswith('"')) or \
           (answer.startswith("'") and answer.endswith("'")):
            answer = answer[1:-1].strip()

        # If answer contains "is:" keep only what comes after.
        if " is:" in answer.lower():
            parts = answer.split("is:")
            if len(parts) > 1:
                answer = parts[-1].strip()

        # If answer contains "are:" keep only what comes after.
        if " are:" in answer.lower():
            parts = answer.split("are:")
            if len(parts) > 1:
                answer = parts[-1].strip()

        # Remove a trailing period unless it follows a digit (decimal).
        # BUGFIX: length guard so a one-character answer like "." cannot
        # raise IndexError on answer[-2].
        if answer.endswith('.') and (len(answer) < 2 or not answer[-2].isdigit()):
            answer = answer[:-1].strip()

        # Drop a leading "The " when the next word is capitalized
        # (likely a proper noun).
        if answer.startswith("The ") and len(answer) > 4:
            words = answer.split()
            next_word = words[1] if len(words) > 1 else ""
            if next_word and next_word[0].isupper():
                answer = answer[4:].strip()

        # Remove "a " or "an " from the beginning.
        if answer.lower().startswith("a "):
            answer = answer[2:].strip()
        elif answer.lower().startswith("an "):
            answer = answer[3:].strip()

        print(f"[Cleaning] Original: '{original}' → Cleaned: '{answer}'")
        return answer
165
+
166
+
167
+
168
+ # Submission logic slightly changed
169
  def run_and_submit_all( profile: gr.OAuthProfile | None):
170
  """
171
+ Fetches all questions, runs the Premium Agent, and submits answers.
 
172
  """
173
  # --- Determine HF Space Runtime URL and Repo URL ---
174
  space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
 
184
  questions_url = f"{api_url}/questions"
185
  submit_url = f"{api_url}/submit"
186
 
187
+ # 1. Instantiate Agent (modified)
188
+ print("\n" + "="*30)
189
+ print("INITIALIZING PREMIUM AGENT")
190
+ print("="*30)
191
  try:
192
+ agent = SmolGaiaAgent()
193
  except Exception as e:
194
  print(f"Error instantiating agent: {e}")
195
+ import traceback
196
+ traceback.print_exc()
197
  return f"Error initializing agent: {e}", None
198
+
199
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
 
200
 
201
  # 2. Fetch Questions
202
  print(f"Fetching questions from: {questions_url}")
 
219
  print(f"An unexpected error occurred fetching questions: {e}")
220
  return f"An unexpected error occurred fetching questions: {e}", None
221
 
222
+ # 3. Run Agent with detailed progress tracking
223
  results_log = []
224
  answers_payload = []
225
+ total = len(questions_data)
226
+
227
+ print("\n" + "="*30)
228
+ print(f"PROCESSING {total} QUESTIONS")
229
+ print("="*30 + "\n")
230
+
231
+ for idx, item in enumerate(questions_data, 1):
232
  task_id = item.get("task_id")
233
  question_text = item.get("question")
234
+
235
  if not task_id or question_text is None:
236
+ print(f"Skipping item with missing task_id or question")
237
  continue
238
+
239
+ print(f"\n{'='*30}")
240
+ print(f"QUESTION {idx}/{total}")
241
+ print(f"Task ID: {task_id}")
242
+ print(f"Question: {question_text[:100]}...")
243
+ print('='*30)
244
+
245
  try:
246
  submitted_answer = agent(question_text)
247
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
248
+ results_log.append({
249
+ "Task ID": task_id,
250
+ "Question": question_text,
251
+ "Submitted Answer": submitted_answer
252
+ })
253
+ print(f"✓ Answer recorded: {submitted_answer}")
254
  except Exception as e:
255
+ print(f"Error processing question: {e}")
256
+ import traceback
257
+ traceback.print_exc()
258
+ results_log.append({
259
+ "Task ID": task_id,
260
+ "Question": question_text,
261
+ "Submitted Answer": f"AGENT ERROR: {e}"
262
+ })
263
 
264
  if not answers_payload:
 
265
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
266
 
267
+
268
+
269
+
270
+ # 5. Submission
271
+ submission_data = {
272
+ "username": username.strip(),
273
+ "agent_code": agent_code,
274
+ "answers": answers_payload
275
+ }
276
+
277
+ print("\n" + "="*30)
278
+ print(f"SUBMITTING {len(answers_payload)} ANSWERS")
279
+ print("="*30)
280
+
281
  try:
282
  response = requests.post(submit_url, json=submission_data, timeout=60)
283
  response.raise_for_status()
284
  result_data = response.json()
285
+
286
+ score = result_data.get('score', 'N/A')
287
+ correct = result_data.get('correct_count', '?')
288
+ total_attempted = result_data.get('total_attempted', '?')
289
+
290
  final_status = (
291
  f"Submission Successful!\n"
292
  f"User: {result_data.get('username')}\n"
293
+ f"Overall Score: {score}% ({correct}/{total_attempted} correct)\n"
294
+ f"Message: {result_data.get('message', 'No message received.')}\n\n"
295
+ f"{'EXCELLENT!' if float(score) >= 80 else 'Good job!' if float(score) >= 50 else 'Keep improving!'}"
296
  )
297
+ print(f"\n✓ Submission successful! Score: {score}%")
298
  results_df = pd.DataFrame(results_log)
299
  return final_status, results_df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
300
  except Exception as e:
301
+ print(f" Submission error: {e}")
 
302
  results_df = pd.DataFrame(results_log)
303
+ return f"Submission Failed: {e}", results_df
304
 
305
 
306
  # --- Build Gradio Interface using Blocks ---
307
  with gr.Blocks() as demo:
308
+ gr.Markdown("# Premium Agent - Optimized for Maximum Accuracy")
309
  gr.Markdown(
310
  """
311
+ **Current Configuration:**
312
+ - Model: Qwen/Qwen2.5-Coder-32B-Instruct (most capable)
313
+ - Max Steps: 12 (thorough reasoning)
314
+ - Enhanced answer cleaning
315
+ - Detailed progress logging
316
+
317
+ **Target Performance:**
318
+ - Time: ~20-25 minutes for 20 questions
319
+ - Target Score: 60-80% (realistic for Level 1)
320
+ - Stretch Goal: 80%+ with optimal configuration
321
+
322
+ **To Reach 100%:**
323
+ Getting 100% on GAIA Level 1 is extremely difficult. The benchmark shows:
324
+ - GPT-4 achieves ~70-80%
325
+ - Claude 3.5 achieves ~75-85%
326
+ - Human experts achieve ~90-95%
327
  """
328
  )
329
 
330
  gr.LoginButton()
331
+ run_button = gr.Button("Run Premium Evaluation & Submit")
332
+ status_output = gr.Textbox(label="Run Status / Submission Result", lines=7, interactive=False)
 
 
 
333
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
334
 
335
  run_button.click(
 
337
  outputs=[status_output, results_table]
338
  )
339
 
 
 
 
 
 
 
 
 
 
 
 
340
 
341
+ if __name__ == "__main__":
342
+ print("\n" + "="*30)
343
+ print("PREMIUM AGENT STARTING")
344
+ print("="*30)
345
+
346
+ space_host = os.getenv("SPACE_HOST")
347
+ space_id = os.getenv("SPACE_ID")
348
+
349
+ if space_host:
350
+ print(f" Runtime URL: https://{space_host}.hf.space")
351
+ if space_id:
352
+ print(f"✓ Repo URL: https://huggingface.co/spaces/{space_id}/tree/main")
353
+
354
+ print("="*30 + "\n")
355
  demo.launch(debug=True, share=False)