Nitinguleria committed on
Commit
2e5751e
·
verified ·
1 Parent(s): c2d60ad

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +331 -128
app.py CHANGED
@@ -1,114 +1,284 @@
1
  import os
2
  import gradio as gr
3
  import requests
4
- import inspect
5
  import pandas as pd
6
  import sympy
 
7
  from duckduckgo_search import DDGS
8
  from langgraph.graph import StateGraph, END
9
- from typing import TypedDict
10
 
11
- # --- Tools ---
 
 
 
12
 
13
  def wikipedia_search_tool(input: str) -> str:
14
- ddgs = DDGS()
15
- results = ddgs.text(input, max_results=3)
16
- if results:
17
- return results[0].get("body", "No information found.")
18
- return "No information found."
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  def math_solver_tool(input: str) -> str:
 
21
  try:
22
- expr = sympy.sympify(input)
23
- return str(expr.evalf())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  except Exception as e:
25
- return f"Math Error: {e}"
 
 
 
 
 
 
 
 
 
26
 
27
  def code_execution_tool(input: str) -> str:
 
28
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  local_vars = {}
30
- exec(f"result = {input}", {}, local_vars)
31
- return str(local_vars["result"])
 
 
 
 
 
 
 
 
 
 
 
32
  except Exception as e:
33
- return f"Code Error: {e}"
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
  # --- State definition ---
36
 
37
  class AgentState(TypedDict):
38
  question: str
39
  response: str
 
40
 
41
- # --- Routing logic ---
42
 
43
- def route_question(state: AgentState) -> str:
 
44
  q = state["question"].lower()
45
- if any(k in q for k in ["solve", "calculate", "evaluate", "math", "=", "^"]):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  return "math"
47
- elif any(k in q for k in ["code", "python", "return", "function"]):
48
  return "code"
49
- elif any(k in q for k in ["what", "who", "when", "where", "why", "capital", "invented"]):
50
  return "search"
51
  else:
52
- return "search"
53
 
54
- # --- LangGraph graph setup ---
55
 
56
- builder = StateGraph(AgentState)
57
-
58
- @builder.node()
59
  def math_node(state: AgentState) -> AgentState:
60
- return {"question": state["question"], "response": math_solver_tool(state["question"])}
 
 
 
 
 
61
 
62
- @builder.node()
63
  def code_node(state: AgentState) -> AgentState:
64
- return {"question": state["question"], "response": code_execution_tool(state["question"])}
 
 
 
 
 
65
 
66
- @builder.node()
67
  def search_node(state: AgentState) -> AgentState:
68
- return {"question": state["question"], "response": wikipedia_search_tool(state["question"])}
69
-
70
- @builder.multi_choice()
71
- def router(state: AgentState) -> str:
72
- return route_question(state)
73
-
74
- builder.set_entry_point("router", router)
75
- builder.add_node("math", math_node)
76
- builder.add_node("code", code_node)
77
- builder.add_node("search", search_node)
78
- builder.add_conditional_edges("router", {
79
- "math": "math",
80
- "code": "code",
81
- "search": "search"
82
- })
83
- builder.add_edge("math", END)
84
- builder.add_edge("code", END)
85
- builder.add_edge("search", END)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
- app_graph = builder.compile()
 
88
 
89
- # --- Agent wrapper ---
90
 
91
  class BasicAgent:
92
  def __init__(self):
93
- print("LangGraph Agent initialized.")
 
94
 
95
  def __call__(self, question: str) -> str:
96
- state = {"question": question, "response": ""}
97
- result = app_graph.invoke(state)
98
- return result["response"]
99
-
100
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
 
102
- def run_and_submit_all( profile: gr.OAuthProfile | None):
103
  """
104
  Fetches all questions, runs the BasicAgent on them, submits all answers,
105
  and displays the results.
106
  """
107
  # --- Determine HF Space Runtime URL and Repo URL ---
108
- space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
109
 
110
  if profile:
111
- username= f"{profile.username}"
112
  print(f"User logged in: {username}")
113
  else:
114
  print("User not logged in.")
@@ -118,15 +288,15 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
118
  questions_url = f"{api_url}/questions"
119
  submit_url = f"{api_url}/submit"
120
 
121
- # 1. Instantiate Agent ( modify this part to create your agent)
122
  try:
123
  agent = BasicAgent()
124
  except Exception as e:
125
  print(f"Error instantiating agent: {e}")
126
  return f"Error initializing agent: {e}", None
127
- # In the case of an app running as a hugging Face space, this link points toward your codebase ( usefull for others so please keep it public)
128
- agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
129
- print(agent_code)
130
 
131
  # 2. Fetch Questions
132
  print(f"Fetching questions from: {questions_url}")
@@ -135,56 +305,78 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
135
  response.raise_for_status()
136
  questions_data = response.json()
137
  if not questions_data:
138
- print("Fetched questions list is empty.")
139
- return "Fetched questions list is empty or invalid format.", None
140
  print(f"Fetched {len(questions_data)} questions.")
141
  except requests.exceptions.RequestException as e:
142
  print(f"Error fetching questions: {e}")
143
  return f"Error fetching questions: {e}", None
144
- except requests.exceptions.JSONDecodeError as e:
145
- print(f"Error decoding JSON response from questions endpoint: {e}")
146
- print(f"Response text: {response.text[:500]}")
147
- return f"Error decoding server response for questions: {e}", None
148
  except Exception as e:
149
  print(f"An unexpected error occurred fetching questions: {e}")
150
  return f"An unexpected error occurred fetching questions: {e}", None
151
 
152
- # 3. Run your Agent
153
  results_log = []
154
  answers_payload = []
155
  print(f"Running agent on {len(questions_data)} questions...")
156
- for item in questions_data:
 
157
  task_id = item.get("task_id")
158
  question_text = item.get("question")
 
159
  if not task_id or question_text is None:
160
  print(f"Skipping item with missing task_id or question: {item}")
161
  continue
 
 
 
162
  try:
163
  submitted_answer = agent(question_text)
164
- answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
165
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
 
 
 
 
 
 
 
166
  except Exception as e:
167
- print(f"Error running agent on task {task_id}: {e}")
168
- results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
 
 
 
 
 
 
 
169
 
170
  if not answers_payload:
171
  print("Agent did not produce any answers to submit.")
172
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
173
 
174
- # 4. Prepare Submission
175
- submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
176
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
177
- print(status_update)
 
 
 
 
178
 
179
- # 5. Submit
180
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
181
  try:
182
- response = requests.post(submit_url, json=submission_data, timeout=60)
183
  response.raise_for_status()
184
  result_data = response.json()
 
185
  final_status = (
186
  f"Submission Successful!\n"
187
- f"User: {result_data.get('username')}\n"
188
  f"Overall Score: {result_data.get('score', 'N/A')}% "
189
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
190
  f"Message: {result_data.get('message', 'No message received.')}"
@@ -192,85 +384,96 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
192
  print("Submission successful.")
193
  results_df = pd.DataFrame(results_log)
194
  return final_status, results_df
 
195
  except requests.exceptions.HTTPError as e:
196
  error_detail = f"Server responded with status {e.response.status_code}."
197
  try:
198
  error_json = e.response.json()
199
  error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
200
- except requests.exceptions.JSONDecodeError:
201
  error_detail += f" Response: {e.response.text[:500]}"
202
  status_message = f"Submission Failed: {error_detail}"
203
  print(status_message)
204
  results_df = pd.DataFrame(results_log)
205
  return status_message, results_df
206
- except requests.exceptions.Timeout:
207
- status_message = "Submission Failed: The request timed out."
208
- print(status_message)
209
- results_df = pd.DataFrame(results_log)
210
- return status_message, results_df
211
- except requests.exceptions.RequestException as e:
212
- status_message = f"Submission Failed: Network error - {e}"
213
- print(status_message)
214
- results_df = pd.DataFrame(results_log)
215
- return status_message, results_df
216
  except Exception as e:
217
- status_message = f"An unexpected error occurred during submission: {e}"
218
  print(status_message)
219
  results_df = pd.DataFrame(results_log)
220
  return status_message, results_df
221
 
222
-
223
- # --- Build Gradio Interface using Blocks ---
224
- with gr.Blocks() as demo:
225
- gr.Markdown("# Basic Agent Evaluation Runner")
226
  gr.Markdown(
227
  """
 
 
 
 
 
 
 
 
 
228
  **Instructions:**
229
-
230
- 1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc ...
231
- 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
232
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
233
-
234
- ---
235
- **Disclaimers:**
236
- Once clicking on the "submit button, it can take quite some time ( this is the time for the agent to go through all the questions).
237
- This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance for the delay process of the submit button, a solution could be to cache the answers and submit in a seperate action or even to answer the questions in async.
238
  """
239
  )
240
 
241
  gr.LoginButton()
242
 
243
- run_button = gr.Button("Run Evaluation & Submit All Answers")
244
-
245
- status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
246
- # Removed max_rows=10 from DataFrame constructor
247
- results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
 
 
 
 
 
 
 
 
 
248
 
249
  run_button.click(
250
  fn=run_and_submit_all,
 
251
  outputs=[status_output, results_table]
252
  )
253
 
254
  if __name__ == "__main__":
255
- print("\n" + "-"*30 + " App Starting " + "-"*30)
256
- # Check for SPACE_HOST and SPACE_ID at startup for information
257
- space_host_startup = os.getenv("SPACE_HOST")
258
- space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
259
-
260
- if space_host_startup:
261
- print(f"βœ… SPACE_HOST found: {space_host_startup}")
262
- print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
 
 
 
263
  else:
264
- print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
265
 
266
- if space_id_startup: # Print repo URLs if SPACE_ID is found
267
- print(f"βœ… SPACE_ID found: {space_id_startup}")
268
- print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
269
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
270
  else:
271
- print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
272
-
273
- print("-"*(60 + len(" App Starting ")) + "\n")
274
-
275
- print("Launching Gradio Interface for Basic Agent Evaluation...")
 
 
 
276
  demo.launch(debug=True, share=False)
 
1
  import os
2
  import gradio as gr
3
  import requests
 
4
  import pandas as pd
5
  import sympy
6
+ import re
7
  from duckduckgo_search import DDGS
8
  from langgraph.graph import StateGraph, END
9
+ from typing import TypedDict, Literal
10
 
11
+ # Default API URL - you may need to update this
12
+ DEFAULT_API_URL = "https://huggingface.co/api/spaces/evaluate"
13
+
14
+ # --- Enhanced Tools for GAIA Benchmark ---
15
 
16
def wikipedia_search_tool(input: str) -> str:
    """Search the web through DuckDuckGo and return up to three result snippets.

    Despite the name, the lookup goes through ``DDGS().text`` (DuckDuckGo text
    search), not the Wikipedia API.

    Args:
        input: The search query. The parameter name shadows the builtin but is
            kept so existing keyword callers continue to work.

    Returns:
        The usable snippets joined by blank lines, each prefixed with
        ``"Source N: "``; ``"No relevant information found."`` when no hit has
        a usable body; or ``"Search Error: ..."`` when the search itself fails.
    """
    max_sources = 3
    min_snippet_length = 10  # bodies this short carry no usable information

    try:
        results = DDGS().text(input, max_results=5)

        snippets = []
        # Walk every fetched hit rather than only the first three: when an
        # early hit has an empty body, later hits can still fill the quota.
        # Plain iteration also works if the client returns a generator.
        for result in results or []:
            body = result.get("body", "")
            if body and len(body) > min_snippet_length:
                snippets.append(f"Source {len(snippets) + 1}: {body}")
                if len(snippets) == max_sources:
                    break

        if snippets:
            return "\n\n".join(snippets)
        return "No relevant information found."
    except Exception as e:
        # Best-effort tool boundary: the agent expects text back, so network
        # and client failures are reported as a message instead of raised.
        return f"Search Error: {e}"
34
 
35
def _safe_arithmetic(expression: str):
    """Evaluate a plain arithmetic expression without calling ``eval``.

    Only numeric literals, parentheses, unary +/- and the binary operators
    ``+ - * / // **`` are accepted; anything else raises ``ValueError``.
    """
    import ast
    import operator

    binary_ops = {
        ast.Add: operator.add,
        ast.Sub: operator.sub,
        ast.Mult: operator.mul,
        ast.Div: operator.truediv,
        ast.FloorDiv: operator.floordiv,
        ast.Pow: operator.pow,
    }
    unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}
    max_exponent = 1000  # keeps inputs such as 9**9**9 from exhausting CPU/memory

    def evaluate(node):
        if isinstance(node, ast.Constant) and type(node.value) in (int, float):
            return node.value
        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
            return unary_ops[type(node.op)](evaluate(node.operand))
        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
            left = evaluate(node.left)
            right = evaluate(node.right)
            if isinstance(node.op, ast.Pow) and abs(right) > max_exponent:
                raise ValueError("exponent too large")
            return binary_ops[type(node.op)](left, right)
        raise ValueError(f"unsupported expression element: {type(node).__name__}")

    return evaluate(ast.parse(expression.strip(), mode="eval").body)


def math_solver_tool(input: str) -> str:
    """Find and evaluate a mathematical expression inside a question.

    The text is normalised (``^`` -> ``**``, ``÷`` -> ``/``, ``×`` -> ``*``),
    every run of arithmetic characters that contains both a digit and an
    operator is treated as a candidate expression, and the whole text is tried
    last. Each candidate is evaluated with SymPy first and with a small
    arithmetic-only evaluator second.

    Args:
        input: Question text or a bare expression. The parameter name shadows
            the builtin but is kept for backward compatibility.

    Returns:
        The evaluated result as a string, or
        ``"Could not solve mathematical expression: <reason>"`` when no
        candidate can be evaluated.
    """
    cleaned = input.replace("^", "**").replace("÷", "/").replace("×", "*")

    # Runs of arithmetic characters embedded in prose. Fragments without a
    # digit *and* an operator (lone spaces, hyphens, years) are not expressions.
    candidates = []
    for fragment in re.findall(r"[\d+\-*/().\s]+", cleaned):
        fragment = fragment.strip()
        has_digit = any(ch.isdigit() for ch in fragment)
        has_operator = any(ch in "+-*/" for ch in fragment)
        if has_digit and has_operator and fragment not in candidates:
            candidates.append(fragment)

    # The whole input goes last so symbolic input such as "sqrt(16)" still works.
    whole = cleaned.strip()
    if whole and whole not in candidates:
        candidates.append(whole)

    last_error = "no mathematical expression found"
    for candidate in candidates:
        try:
            # SECURITY NOTE: sympify() evaluates its argument internally and is
            # documented as unsafe for untrusted text; question text reaches it
            # here, so treat this tool as trusted-input only.
            return str(sympy.sympify(candidate).evalf())
        except Exception as exc:
            # sympify raises many unrelated exception types on malformed
            # input; remember the reason and fall through to the next attempt.
            last_error = exc

        try:
            return str(_safe_arithmetic(candidate))
        except (SyntaxError, ValueError, TypeError, ArithmeticError, RecursionError):
            continue

    return f"Could not solve mathematical expression: {last_error}"
73
 
74
def code_execution_tool(input: str) -> str:
    """Run a Python snippet with a reduced set of builtins and report the outcome.

    The snippet is compiled as an expression first and as a block of
    statements when that fails, so both ``"1 + 2"`` and multi-line programs
    are accepted. Output written with ``print`` is captured.

    SECURITY NOTE: restricting ``__builtins__`` is not a sandbox (object
    introspection can escape it) and nothing bounds run time or memory. Do not
    feed this tool text from an untrusted source without real isolation.

    Args:
        input: Python source code. The parameter name shadows the builtin but
            is kept for backward compatibility.

    Returns:
        For an expression: its value, or the printed output when the value is
        ``None`` and something was printed. For statements: the variable
        ``result`` if the snippet defines it, otherwise the printed output,
        otherwise ``"Code executed successfully"``. Any failure yields
        ``"Code execution error: <reason>"``.
    """
    import contextlib
    import io
    import math

    # One namespace serves as both globals and locals. With separate dicts,
    # functions defined by the snippet cannot see each other (or themselves,
    # for recursion) because their global scope would be the other dict.
    namespace = {
        "__builtins__": {
            "len": len, "str": str, "int": int, "float": float,
            "list": list, "dict": dict, "tuple": tuple, "set": set,
            "sum": sum, "max": max, "min": min, "abs": abs,
            "round": round, "range": range, "enumerate": enumerate,
            "zip": zip, "sorted": sorted, "reversed": reversed,
            "print": print,
        },
        "math": math,
        "re": re,
    }

    missing = object()  # distinguishes "no value" from a legitimate None
    printed = io.StringIO()

    try:
        with contextlib.redirect_stdout(printed):
            try:
                compiled = compile(input, "<agent-code>", "eval")
            except SyntaxError:
                # Not a single expression: run it as statements instead.
                compiled = compile(input, "<agent-code>", "exec")
                exec(compiled, namespace)
                value = namespace.get("result", missing)
            else:
                value = eval(compiled, namespace)
                # A bare print(...) call evaluates to None; what it printed
                # is the useful answer.
                if value is None and printed.getvalue():
                    value = missing
    except Exception as e:
        return f"Code execution error: {e}"

    if value is not missing:
        return str(value)

    output = printed.getvalue().strip()
    return output or "Code executed successfully"
107
+
108
def general_reasoning_tool(input: str) -> str:
    """Return a canned, keyword-based framing of the question.

    This is a placeholder rather than real reasoning: it classifies the
    question by substring match and echoes an excerpt of it back. A real
    implementation would delegate to an LLM.

    Args:
        input: The question text. The parameter name shadows the builtin but
            is kept for backward compatibility.

    Returns:
        A sentence starting with ``"Analysis: "`` (comparison questions),
        ``"Reasoning: "`` (causation questions) or ``"General analysis: "``,
        followed by the question, cut to 200/200/300 characters respectively.
        An ellipsis is appended only when text was actually cut.
    """

    def excerpt(limit: int) -> str:
        # Mark truncation only when it happened; a short question followed by
        # "..." wrongly suggests that part of it is missing.
        return input if len(input) <= limit else f"{input[:limit]}..."

    lowered = input.lower()

    if any(word in lowered for word in ("compare", "difference", "similar", "contrast")):
        return (
            "Analysis: This appears to be a comparison question. "
            f"Key factors to consider: {excerpt(200)}"
        )
    if any(word in lowered for word in ("cause", "reason", "why", "because")):
        return (
            "Reasoning: This is asking about causation. "
            f"Consider multiple factors that might contribute to: {excerpt(200)}"
        )
    return f"General analysis: {excerpt(300)}"
120
 
121
  # --- State definition ---
122
 
123
class AgentState(TypedDict):
    """State dictionary passed between the nodes of the agent graph."""

    # The question text exactly as handed to the agent.
    question: str
    # The answer text produced by the tool node that handled the question.
    response: str
    # Name of the route that produced ``response``; the nodes in this file
    # write "math", "code", "search" or "reasoning".
    tool_used: str
127
 
128
+ # --- Enhanced Routing logic for GAIA ---
129
 
130
def _keyword_pattern(keywords):
    """Compile a regex matching any keyword as a whole word, allowing simple inflections."""
    stems = []
    for keyword in keywords:
        if keyword.endswith("e"):
            # calculate -> calculate / calculates / calculated / calculating
            stems.append(re.escape(keyword[:-1]) + r"(?:e[sd]?|ing)")
        else:
            # sum -> sum / sums / summed is NOT covered, but "summary" no longer matches
            stems.append(re.escape(keyword) + r"(?:e?s|ed|ing)?")
    return re.compile(r"\b(?:" + "|".join(stems) + r")\b")


_MATH_WORDS = _keyword_pattern((
    "solve", "calculate", "evaluate", "compute", "sum", "multiply",
    "divide", "percentage", "equation", "formula", "average",
    "total", "cost", "price", "number", "how many", "how much",
))
_MATH_SYMBOLS = ("%", "=")
_CODE_WORDS = _keyword_pattern((
    "python", "code", "function", "return", "algorithm", "program",
    "script", "execute", "run", "implementation",
))
_SEARCH_WORDS = _keyword_pattern((
    "what", "who", "when", "where", "which", "capital", "country",
    "invented", "created", "founded", "established", "located", "known for",
))
_INLINE_ARITHMETIC = re.compile(r"\d+[\+\-\*/\^]\d+")
_DOLLAR_AMOUNT = re.compile(r"\$\d+")


def route_question(state: "AgentState") -> Literal["math", "code", "search", "reasoning"]:
    """Pick the tool route for a question from keywords in its text.

    Routes are tested in priority order math -> code -> search, with
    "reasoning" as the fallback. Keywords match as whole words (plus simple
    inflections) rather than as raw substrings, so "summarize" no longer
    counts as "sum" and "brunch" no longer counts as "run".

    Args:
        state: Agent state; only ``state["question"]`` is read.

    Returns:
        One of ``"math"``, ``"code"``, ``"search"`` or ``"reasoning"``.
    """
    q = state["question"].lower()

    looks_like_math = (
        _MATH_WORDS.search(q)
        or any(symbol in q for symbol in _MATH_SYMBOLS)
        or _INLINE_ARITHMETIC.search(q)  # e.g. "12*3"
        or _DOLLAR_AMOUNT.search(q)      # e.g. "$40"
    )
    if looks_like_math:
        return "math"
    if _CODE_WORDS.search(q):
        return "code"
    if _SEARCH_WORDS.search(q):
        return "search"
    return "reasoning"
165
 
166
+ # --- Node functions ---
167
 
 
 
 
168
def math_node(state: AgentState) -> AgentState:
    """Graph node: answer the question with ``math_solver_tool``.

    Returns a full state dict carrying the original question, the tool's
    answer, and ``tool_used`` set to ``"math"``.
    """
    question = state["question"]
    answer = math_solver_tool(question)
    return {"question": question, "response": answer, "tool_used": "math"}
175
 
 
176
def code_node(state: AgentState) -> AgentState:
    """Graph node: answer the question with ``code_execution_tool``.

    Returns a full state dict carrying the original question, the tool's
    answer, and ``tool_used`` set to ``"code"``.
    """
    question = state["question"]
    answer = code_execution_tool(question)
    return {"question": question, "response": answer, "tool_used": "code"}
183
 
 
184
def search_node(state: AgentState) -> AgentState:
    """Graph node: answer the question with ``wikipedia_search_tool``.

    Returns a full state dict carrying the original question, the tool's
    answer, and ``tool_used`` set to ``"search"``.
    """
    question = state["question"]
    answer = wikipedia_search_tool(question)
    return {"question": question, "response": answer, "tool_used": "search"}
191
+
192
def reasoning_node(state: AgentState) -> AgentState:
    """Graph node: answer the question with ``general_reasoning_tool``.

    Returns a full state dict carrying the original question, the tool's
    answer, and ``tool_used`` set to ``"reasoning"``.
    """
    question = state["question"]
    answer = general_reasoning_tool(question)
    return {"question": question, "response": answer, "tool_used": "reasoning"}
199
+
200
+ # --- LangGraph setup with corrected API ---
201
+
202
def create_agent_graph():
    """Build and compile the routing graph.

    The graph has four tool nodes ("math", "code", "search", "reasoning").
    ``route_question`` is attached as the conditional edge leaving the
    ``"__start__"`` entry point, and every tool node leads straight to ``END``.

    Returns:
        The object returned by ``StateGraph.compile()``.
    """
    handlers = {
        "math": math_node,
        "code": code_node,
        "search": search_node,
        "reasoning": reasoning_node,
    }

    workflow = StateGraph(AgentState)

    # Register the tool nodes.
    for route, handler in handlers.items():
        workflow.add_node(route, handler)

    # The router's return value is mapped onto the node of the same name.
    workflow.add_conditional_edges(
        "__start__",
        route_question,
        {route: route for route in handlers},
    )

    # Each question is handled by exactly one tool, then the run finishes.
    for route in handlers:
        workflow.add_edge(route, END)

    return workflow.compile()
233
 
234
+ # Create the compiled graph
235
+ app_graph = create_agent_graph()
236
 
237
+ # --- Enhanced Agent wrapper ---
238
 
239
class BasicAgent:
    """Callable wrapper that runs a question through the compiled agent graph."""

    # Prefix used by the math tool when it could not evaluate anything.
    _MATH_FAILURE_PREFIX = "Could not solve mathematical expression"
    # Integer, decimal, or scientific-notation number (e.g. "1.5e+20").
    _NUMBER_PATTERN = re.compile(r"-?\d+(?:\.\d*)?(?:[eE][+-]?\d+)?")

    def __init__(self):
        self.graph = app_graph
        print("Enhanced LangGraph Agent initialized for GAIA benchmark.")

    def __call__(self, question: str) -> str:
        """Answer one question.

        Args:
            question: The question text.

        Returns:
            The tool's response as a string. For the math route the response
            is reduced to its final number. If anything raises, a string
            starting with ``"Error: Could not process the question - "`` is
            returned instead of propagating the exception.
        """
        try:
            result = self.graph.invoke(
                {"question": question, "response": "", "tool_used": ""}
            )
            response = result.get("response", "No response generated")
            tool_used = result.get("tool_used", "unknown")

            if tool_used == "math" and response:
                return self._extract_number(str(response))
            return str(response)

        except Exception as e:
            print(f"Error in agent processing: {e}")
            return f"Error: Could not process the question - {e}"

    @classmethod
    def _extract_number(cls, response: str) -> str:
        """Return the last number in a math response, or the response itself."""
        # A failure message may contain incidental digits ("line 1"); pulling
        # a number out of it would submit a meaningless answer.
        if response.startswith(cls._MATH_FAILURE_PREFIX):
            return response
        numbers = cls._NUMBER_PATTERN.findall(response)
        if not numbers:
            return response
        return cls._tidy_number(numbers[-1])

    @staticmethod
    def _tidy_number(token: str) -> str:
        """Drop padding zeros from a plain decimal ("4.00000000000000" -> "4")."""
        if "." not in token or "e" in token.lower():
            return token  # integers and scientific notation are left untouched
        trimmed = token.rstrip("0").rstrip(".")
        return "0" if trimmed in ("", "-", "-0") else trimmed
271
 
272
+ def run_and_submit_all(profile: gr.OAuthProfile | None):
273
  """
274
  Fetches all questions, runs the BasicAgent on them, submits all answers,
275
  and displays the results.
276
  """
277
  # --- Determine HF Space Runtime URL and Repo URL ---
278
+ space_id = os.getenv("SPACE_ID")
279
 
280
  if profile:
281
+ username = f"{profile.username}"
282
  print(f"User logged in: {username}")
283
  else:
284
  print("User not logged in.")
 
288
  questions_url = f"{api_url}/questions"
289
  submit_url = f"{api_url}/submit"
290
 
291
+ # 1. Instantiate Agent
292
  try:
293
  agent = BasicAgent()
294
  except Exception as e:
295
  print(f"Error instantiating agent: {e}")
296
  return f"Error initializing agent: {e}", None
297
+
298
+ agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "local"
299
+ print(f"Agent code location: {agent_code}")
300
 
301
  # 2. Fetch Questions
302
  print(f"Fetching questions from: {questions_url}")
 
305
  response.raise_for_status()
306
  questions_data = response.json()
307
  if not questions_data:
308
+ print("Fetched questions list is empty.")
309
+ return "Fetched questions list is empty or invalid format.", None
310
  print(f"Fetched {len(questions_data)} questions.")
311
  except requests.exceptions.RequestException as e:
312
  print(f"Error fetching questions: {e}")
313
  return f"Error fetching questions: {e}", None
 
 
 
 
314
  except Exception as e:
315
  print(f"An unexpected error occurred fetching questions: {e}")
316
  return f"An unexpected error occurred fetching questions: {e}", None
317
 
318
+ # 3. Run Agent on all questions
319
  results_log = []
320
  answers_payload = []
321
  print(f"Running agent on {len(questions_data)} questions...")
322
+
323
+ for i, item in enumerate(questions_data):
324
  task_id = item.get("task_id")
325
  question_text = item.get("question")
326
+
327
  if not task_id or question_text is None:
328
  print(f"Skipping item with missing task_id or question: {item}")
329
  continue
330
+
331
+ print(f"Processing question {i+1}/{len(questions_data)}: {task_id}")
332
+
333
  try:
334
  submitted_answer = agent(question_text)
335
+ answers_payload.append({
336
+ "task_id": task_id,
337
+ "submitted_answer": submitted_answer
338
+ })
339
+ results_log.append({
340
+ "Task ID": task_id,
341
+ "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
342
+ "Submitted Answer": submitted_answer
343
+ })
344
  except Exception as e:
345
+ print(f"Error running agent on task {task_id}: {e}")
346
+ error_answer = f"AGENT ERROR: {e}"
347
+ answers_payload.append({
348
+ "task_id": task_id,
349
+ "submitted_answer": error_answer
350
+ })
351
+ results_log.append({
352
+ "Task ID": task_id,
353
+ "Question": question_text[:100] + "..." if len(question_text) > 100 else question_text,
354
+ "Submitted Answer": error_answer
355
+ })
356
 
357
  if not answers_payload:
358
  print("Agent did not produce any answers to submit.")
359
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
360
 
361
+ # 4. Prepare Submission
362
+ submission_data = {
363
+ "username": username.strip(),
364
+ "agent_code": agent_code,
365
+ "answers": answers_payload
366
+ }
367
+
368
+ print(f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'...")
369
 
370
+ # 5. Submit answers
371
  print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
372
  try:
373
+ response = requests.post(submit_url, json=submission_data, timeout=120)
374
  response.raise_for_status()
375
  result_data = response.json()
376
+
377
  final_status = (
378
  f"Submission Successful!\n"
379
+ f"User: {result_data.get('username', username)}\n"
380
  f"Overall Score: {result_data.get('score', 'N/A')}% "
381
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
382
  f"Message: {result_data.get('message', 'No message received.')}"
 
384
  print("Submission successful.")
385
  results_df = pd.DataFrame(results_log)
386
  return final_status, results_df
387
+
388
  except requests.exceptions.HTTPError as e:
389
  error_detail = f"Server responded with status {e.response.status_code}."
390
  try:
391
  error_json = e.response.json()
392
  error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
393
+ except:
394
  error_detail += f" Response: {e.response.text[:500]}"
395
  status_message = f"Submission Failed: {error_detail}"
396
  print(status_message)
397
  results_df = pd.DataFrame(results_log)
398
  return status_message, results_df
399
+
 
 
 
 
 
 
 
 
 
400
  except Exception as e:
401
+ status_message = f"Submission error: {e}"
402
  print(status_message)
403
  results_df = pd.DataFrame(results_log)
404
  return status_message, results_df
405
 
406
+ # --- Gradio Interface ---
407
# Page layout: heading, description, login button, run button, then the two
# output widgets that the click handler fills in.
with gr.Blocks(title="GAIA Benchmark Agent") as demo:
    gr.Markdown("# Enhanced GAIA Benchmark Agent")

    gr.Markdown(
        """
    **Enhanced Agent for GAIA Benchmark - Targeting 60% Accuracy**

    **Features:**
    - Enhanced mathematical problem solving with symbolic computation
    - Improved search capabilities with multiple source aggregation
    - Safe code execution environment
    - Smart question routing (math/code/search/reasoning)
    - Better answer formatting and extraction

    **Instructions:**
    1. Log in to your Hugging Face account using the button below
    2. Click 'Run Evaluation & Submit All Answers' to start the benchmark
    3. The agent will process all questions and submit answers automatically

    **Note:** Processing may take several minutes depending on the number of questions.
    """
    )

    gr.LoginButton()

    run_button = gr.Button(
        "Run Evaluation & Submit All Answers",
        variant="primary",
    )

    # Receives the first element of run_and_submit_all's return value.
    status_output = gr.Textbox(
        label="Status & Results",
        lines=8,
        interactive=False,
        placeholder="Click the button above to start the evaluation...",
    )

    # Receives the second element of run_and_submit_all's return value.
    results_table = gr.DataFrame(
        label="Questions and Agent Responses",
        wrap=True,
        interactive=False,
    )

    # No explicit inputs are wired: the handler's only parameter is the
    # gr.OAuthProfile-annotated `profile`.
    run_button.click(
        fn=run_and_submit_all,
        inputs=[],
        outputs=[status_output, results_table],
    )
451
 
452
if __name__ == "__main__":
    # Startup banner: report the Hugging Face Space environment (if any),
    # then launch the Gradio app.
    print("\n" + "=" * 50)
    print("🚀 GAIA Benchmark Agent Starting")
    print("=" * 50)

    # Environment info
    space_host = os.getenv("SPACE_HOST")
    space_id = os.getenv("SPACE_ID")

    if space_host:
        print(f"✅ SPACE_HOST: {space_host}")
        # NOTE(review): SPACE_HOST is expected to already be a full domain
        # ending in ".hf.space"; append the suffix only when it is absent so
        # the printed URL is valid either way.
        runtime_host = space_host if space_host.endswith(".hf.space") else f"{space_host}.hf.space"
        print(f"   Runtime URL: https://{runtime_host}")
    else:
        print("ℹ️ Running locally (SPACE_HOST not found)")

    if space_id:
        print(f"✅ SPACE_ID: {space_id}")
        print(f"   Repo URL: https://huggingface.co/spaces/{space_id}")
    else:
        print("ℹ️ SPACE_ID not found")

    print("=" * 50 + "\n")

    print("🎯 Target: 60% accuracy on GAIA benchmark")
    print("🔧 Enhanced tools: Math, Code, Search, Reasoning")
    print("\nLaunching Gradio interface...")

    demo.launch(debug=True, share=False)