mujtabarizvi committed on
Commit
cc67291
·
verified ·
1 Parent(s): d75d9a2

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +66 -167
app.py CHANGED
@@ -8,8 +8,7 @@ import re # For parsing LLM output
8
  # --- HF Inference API for LLM ---
9
  from huggingface_hub import InferenceClient
10
 
11
- # You can choose a different model, but make sure it's good at instruction following and ReAct-style prompting.
12
- LLM_MODEL = "HuggingFaceH4/zephyr-7b-beta" # or "mistralai/Mistral-7B-Instruct-v0.2"
13
 
14
  try:
15
  hf_token = os.getenv("HF_TOKEN")
@@ -19,21 +18,13 @@ except Exception as e:
19
  llm_client = None
20
 
21
  # --- Tools ---
22
- # 1. Search Tool (using DuckDuckGo)
23
  from duckduckgo_search import DDGS
24
 
25
  def search_tool(query: str) -> str:
26
- """
27
- Searches the web using DuckDuckGo for a given query and returns the top results.
28
- Args:
29
- query (str): The search query.
30
- Returns:
31
- str: A string containing the search results, or an error/status message.
32
- """
33
  print(f"Tool: search_tool, Query: {query}")
34
  try:
35
  with DDGS() as ddgs:
36
- results = ddgs.text(query, max_results=3) # Get top 3 results
37
  if results:
38
  return "\n".join([f"Title: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}" for r in results])
39
  else:
@@ -42,16 +33,7 @@ def search_tool(query: str) -> str:
42
  print(f"Error in search_tool: {e}")
43
  return f"Error performing search: {str(e)}. This could be due to a network issue, an invalid query, or a rate limit."
44
 
45
- # 2. Calculator Tool
46
  def calculator_tool(expression: str) -> str:
47
- """
48
- Calculates the result of a mathematical expression.
49
- Args:
50
- expression (str): The mathematical expression to evaluate (e.g., "2+2", "100*3.14/4").
51
- It should be a valid Python-evaluable expression.
52
- Returns:
53
- str: The result of the calculation or an error message.
54
- """
55
  print(f"Tool: calculator_tool, Expression: {expression}")
56
  try:
57
  result = eval(expression, {"__builtins__": {}}, {"sqrt": lambda x: x**0.5, "pi": 3.1415926535})
@@ -62,14 +44,13 @@ def calculator_tool(expression: str) -> str:
62
 
63
  # --- Agent Definition ---
64
  class ReActAgent:
65
- def __init__(self, llm_client, tools: dict, max_iterations=7): # max_iterations can be tuned
66
  print("ReActAgent initialized.")
67
  if llm_client is None:
68
- raise ValueError("LLM client not initialized. Check HF_TOKEN and model availability.")
69
  self.llm = llm_client
70
  self.tools = tools
71
  self.max_iterations = max_iterations
72
- self.stop_pattern = "Final Answer:"
73
 
74
  self.tool_descriptions = "\n".join([
75
  f"- {name}: {inspect.getdoc(func)}"
@@ -77,25 +58,25 @@ class ReActAgent:
77
  ])
78
  self.tool_names = ", ".join(tools.keys())
79
 
80
- # Refined ReAct prompt template for exact match answers
81
  self.react_prompt_template = inspect.cleandoc(f"""
82
- You are a helpful and observant AI assistant. Your goal is to answer the following question accurately.
83
- You must use a step-by-step thinking process (Thought, Action, Observation).
 
84
  The final answer submitted must be an EXACT match to the correct response, without any extra explanations or prefixes being part of the answer itself.
85
 
86
  Available tools:
87
  {self.tool_descriptions}
88
 
89
- Use the following format:
90
  Question: the input question you must answer
91
- Thought: You should always think about what to do.
92
- Action: The action to take, should be one of [{self.tool_names}]. The input to the tool is between the brackets. For example: search_tool[query] or calculator_tool[expression].
93
- Observation: The result of the action.
94
- ... (this Thought/Action/Observation sequence can repeat up to {self.max_iterations} times)
95
- Thought: I now know the final answer.
96
- Final Answer: [Provide ONLY the precise answer here. For example, if the question is "What is 2+2?", the Final Answer should be just "4". Do not include any other text or explanations in the answer part itself.]
97
-
98
- Begin!
99
  """) + "\nQuestion: {question}\n{scratchpad}"
100
 
101
 
@@ -103,11 +84,9 @@ class ReActAgent:
103
  try:
104
  response = self.llm.text_generation(
105
  prompt,
106
- max_new_tokens=512, # Adjust if LLM needs more space for thought process
107
- temperature=0.1, # Lower temperature for more deterministic and precise answers
108
- do_sample=True, # Often needed if temperature is not 1.0
109
- # Using temperature < 1.0 makes it do_sample=True by default in many HuggingFace implementations
110
- # stop_sequences=["Observation:"] # Can help, but might prematurely stop LLM. Parsing is more robust.
111
  )
112
  return response.strip()
113
  except Exception as e:
@@ -116,77 +95,75 @@ class ReActAgent:
116
 
117
  def __call__(self, question: str) -> str:
118
  print(f"ReActAgent received question (first 100 chars): {question[:100]}...")
119
-
120
  scratchpad = ""
121
- # Initial prompt construction for the first turn
122
  current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
123
 
124
  for i in range(self.max_iterations):
125
  print(f"\nIteration {i+1}")
126
- # Note: The scratchpad builds up. Ensure the LLM prompt correctly handles cumulative context.
127
- # The current template appends the new LLM output and observation to the scratchpad.
128
- # current_prompt is reconstructed each time using the *updated* scratchpad.
129
-
130
  llm_output = self.run_llm(current_prompt)
131
 
132
  if not llm_output:
133
  print("LLM returned empty or error, stopping.")
134
  return "Agent Error: LLM failed to respond."
135
 
136
- # Append the LLM's full response (thought and potentially action or final answer) to scratchpad
137
  scratchpad += llm_output + "\n"
138
 
139
- # Check for "Final Answer:" in the LLM's *current* output
140
- final_answer_match = re.search(r"Final Answer:\s*(.*)", llm_output, re.DOTALL | re.IGNORECASE)
141
- if final_answer_match:
142
- answer = final_answer_match.group(1).strip()
143
- print(f"Found Final Answer in LLM output: '{answer}'")
144
- return answer # This is the clean answer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
 
146
- # Parse Action from the LLM's *current* output
147
  action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", llm_output, re.DOTALL)
148
  if action_match:
149
  tool_name = action_match.group(1).strip()
150
  tool_input = action_match.group(2).strip()
151
-
152
  if tool_name in self.tools:
153
  print(f"Executing Tool: {tool_name}, Input: {tool_input}")
154
  try:
155
  observation = self.tools[tool_name](tool_input)
156
  except Exception as e:
157
  observation = f"Error executing tool {tool_name}: {e}"
158
- print(f"Observation: {observation[:200]}...") # Print truncated observation
159
- scratchpad += f"Observation: {observation}\n" # Add observation to scratchpad
160
  else:
161
  print(f"Unknown tool: {tool_name}")
162
  scratchpad += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
163
  else:
164
- # If no action and no Final Answer, it implies the LLM might be just thinking,
165
- # or the output is malformed. The loop will continue, using the updated scratchpad.
166
- print("No valid action found in LLM output for this iteration. LLM might be thinking or output is malformed.")
167
 
168
- # Reconstruct the prompt for the next iteration with the updated scratchpad
169
  current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
170
 
171
-
172
- # Fallback if max_iterations is reached without a "Final Answer:"
173
  print(f"Max iterations reached for question (first 50 chars): {question[:50]}...")
174
  standard_failure_message = "Agent could not determine an answer within the allowed steps."
175
  print(f"Returning standard failure message: {standard_failure_message}")
176
  return standard_failure_message
177
 
178
-
179
- # --- Constants (from template) ---
180
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
181
 
182
- # --- Main Execution Logic (from template, modified to use ReActAgent) ---
183
  def run_and_submit_all(profile: gr.OAuthProfile | None):
184
  space_id = os.getenv("SPACE_ID")
185
  if profile:
186
  username = f"{profile.username}"
187
- print(f"User logged in: {username}")
188
  else:
189
- print("User not logged in.")
190
  return "Please Login to Hugging Face with the button.", None
191
 
192
  api_url = DEFAULT_API_URL
@@ -194,157 +171,79 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
194
  submit_url = f"{api_url}/submit"
195
 
196
  try:
197
- available_tools = {
198
- "search_tool": search_tool,
199
- "calculator_tool": calculator_tool,
200
- }
201
  if llm_client is None:
202
  return "LLM Client could not be initialized. Check logs and HF_TOKEN.", None
203
  agent = ReActAgent(llm_client=llm_client, tools=available_tools)
204
  except Exception as e:
205
- print(f"Error instantiating agent: {e}")
206
  return f"Error initializing agent: {e}", None
207
 
208
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Code not available (SPACE_ID not set)"
209
- print(f"Agent code link: {agent_code}")
210
 
211
- print(f"Fetching questions from: {questions_url}")
212
  try:
213
  response = requests.get(questions_url, timeout=20)
214
  response.raise_for_status()
215
  questions_data = response.json()
216
  if not questions_data:
217
- print("Fetched questions list is empty.")
218
  return "Fetched questions list is empty or invalid format.", None
219
- print(f"Fetched {len(questions_data)} questions.")
220
- except requests.exceptions.RequestException as e:
221
- print(f"Error fetching questions: {e}")
222
  return f"Error fetching questions: {e}", None
223
- except requests.exceptions.JSONDecodeError as e:
224
- print(f"Error decoding JSON response from questions endpoint: {e}")
225
- print(f"Response text: {response.text[:500]}")
226
- return f"Error decoding server response for questions: {e}", None
227
-
228
- results_log = []
229
- answers_payload = []
230
- print(f"Running agent on {len(questions_data)} questions...")
231
  for item in questions_data:
232
- task_id = item.get("task_id")
233
- question_text = item.get("question")
234
- if not task_id or question_text is None:
235
- print(f"Skipping item with missing task_id or question: {item}")
236
- continue
237
  try:
238
  print(f"\n--- Processing Task ID: {task_id}, Question: {question_text[:100]}... ---")
239
  submitted_answer = agent(question_text)
240
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
241
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
242
- print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'") # Added quotes for clarity
243
  except Exception as e:
244
- print(f"Error running agent on task {task_id}: {e}")
245
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
246
 
247
  if not answers_payload:
248
- print("Agent did not produce any answers to submit.")
249
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
250
 
251
  submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
252
- status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
253
- print(status_update)
254
 
255
- print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
256
  try:
257
  response = requests.post(submit_url, json=submission_data, timeout=120)
258
  response.raise_for_status()
259
  result_data = response.json()
260
  final_status = (
261
- f"Submission Successful!\n"
262
- f"User: {result_data.get('username')}\n"
263
  f"Overall Score: {result_data.get('score', 'N/A')}% "
264
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
265
  f"Message: {result_data.get('message', 'No message received.')}"
266
  )
267
- print("Submission successful.")
268
- results_df = pd.DataFrame(results_log)
269
- return final_status, results_df
270
  except requests.exceptions.HTTPError as e:
271
  error_detail = f"Server responded with status {e.response.status_code}."
272
- try:
273
- error_json = e.response.json()
274
- error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
275
- except requests.exceptions.JSONDecodeError:
276
- error_detail += f" Response: {e.response.text[:500]}"
277
- status_message = f"Submission Failed: {error_detail}"
278
- print(status_message)
279
- results_df = pd.DataFrame(results_log)
280
- return status_message, results_df
281
- except requests.exceptions.Timeout:
282
- status_message = "Submission Failed: The request timed out."
283
- print(status_message)
284
- results_df = pd.DataFrame(results_log)
285
- return status_message, results_df
286
- except requests.exceptions.RequestException as e:
287
- status_message = f"Submission Failed: Network error - {e}"
288
- print(status_message)
289
- results_df = pd.DataFrame(results_log)
290
- return status_message, results_df
291
  except Exception as e:
292
- status_message = f"An unexpected error occurred during submission: {e}"
293
- print(status_message)
294
- results_df = pd.DataFrame(results_log)
295
- return status_message, results_df
296
 
297
- # --- Build Gradio Interface using Blocks (from template) ---
298
  with gr.Blocks() as demo:
299
  gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
300
- gr.Markdown(
301
- """
302
- **Instructions:**
303
- 1. This Space implements a ReAct (Reasoning-Action) agent using an LLM from the Hugging Face Inference API.
304
- 2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
305
- 3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
306
- 4. The agent uses a search tool (DuckDuckGo) and a calculator tool. The prompt has been refined to encourage EXACT MATCH answers.
307
- ---
308
- **Disclaimers:**
309
- * LLM responses can be slow, and running through all questions will take time.
310
- * The agent's performance depends heavily on the chosen LLM and the quality of its ReAct prompting.
311
- * You may need to set an `HF_TOKEN` in your Space secrets if you use a gated model or encounter rate limits.
312
- * The calculator tool uses `eval()` which has security implications if not carefully managed. For this specific benchmark it is a common simplification.
313
- """
314
- )
315
  gr.LoginButton()
316
  run_button = gr.Button("Run Evaluation & Submit All Answers")
317
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
318
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
319
-
320
- run_button.click(
321
- fn=run_and_submit_all,
322
- outputs=[status_output, results_table]
323
- )
324
 
325
  if __name__ == "__main__":
326
  print("\n" + "-"*30 + " App Starting " + "-"*30)
 
327
  space_host_startup = os.getenv("SPACE_HOST")
328
  space_id_startup = os.getenv("SPACE_ID")
329
- if space_host_startup:
330
- print(f"✅ SPACE_HOST found: {space_host_startup}")
331
- print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
332
- else:
333
- print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
334
- if space_id_startup:
335
- print(f"✅ SPACE_ID found: {space_id_startup}")
336
- print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
337
- print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
338
- else:
339
- print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
340
-
341
- if llm_client is None:
342
- print("⚠️ LLM Client (InferenceClient) was not initialized. The agent will not work.")
343
- print(" Please check if you need to set the HF_TOKEN secret in your Space settings,")
344
- print(f" and ensure the model '{LLM_MODEL}' is accessible via the Inference API.")
345
- else:
346
- print(f"✅ LLM Client initialized with model: {LLM_MODEL}")
347
-
348
  print("-"*(60 + len(" App Starting ")) + "\n")
349
- print("Launching Gradio Interface for ReAct Agent Evaluation...")
350
  demo.launch(debug=True, share=False)
 
8
  # --- HF Inference API for LLM ---
9
  from huggingface_hub import InferenceClient
10
 
11
+ LLM_MODEL = "HuggingFaceH4/zephyr-7b-beta"
 
12
 
13
  try:
14
  hf_token = os.getenv("HF_TOKEN")
 
18
  llm_client = None
19
 
20
  # --- Tools ---
 
21
  from duckduckgo_search import DDGS
22
 
23
  def search_tool(query: str) -> str:
 
 
 
 
 
 
 
24
  print(f"Tool: search_tool, Query: {query}")
25
  try:
26
  with DDGS() as ddgs:
27
+ results = ddgs.text(query, max_results=3)
28
  if results:
29
  return "\n".join([f"Title: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}" for r in results])
30
  else:
 
33
  print(f"Error in search_tool: {e}")
34
  return f"Error performing search: {str(e)}. This could be due to a network issue, an invalid query, or a rate limit."
35
 
 
36
  def calculator_tool(expression: str) -> str:
 
 
 
 
 
 
 
 
37
  print(f"Tool: calculator_tool, Expression: {expression}")
38
  try:
39
  result = eval(expression, {"__builtins__": {}}, {"sqrt": lambda x: x**0.5, "pi": 3.1415926535})
 
44
 
45
  # --- Agent Definition ---
46
  class ReActAgent:
47
+ def __init__(self, llm_client, tools: dict, max_iterations=7):
48
  print("ReActAgent initialized.")
49
  if llm_client is None:
50
+ raise ValueError("LLM client not initialized.")
51
  self.llm = llm_client
52
  self.tools = tools
53
  self.max_iterations = max_iterations
 
54
 
55
  self.tool_descriptions = "\n".join([
56
  f"- {name}: {inspect.getdoc(func)}"
 
58
  ])
59
  self.tool_names = ", ".join(tools.keys())
60
 
 
61
  self.react_prompt_template = inspect.cleandoc(f"""
62
+ You are a helpful AI assistant. Your goal is to answer the CURRENT question accurately.
63
+ Focus ONLY on the provided "Question:". Do not generate new questions or continue a dialogue beyond answering the current question.
64
+ You must use a step-by-step thinking process (Thought, Action, Observation) for the current question.
65
  The final answer submitted must be an EXACT match to the correct response, without any extra explanations or prefixes being part of the answer itself.
66
 
67
  Available tools:
68
  {self.tool_descriptions}
69
 
70
+ Use the following format FOR THE CURRENT QUESTION ONLY:
71
  Question: the input question you must answer
72
+ Thought: Your reasoning and plan for the current question.
73
+ Action: The action to take for the current question, should be one of [{self.tool_names}]. Input to the tool is between brackets. E.g., search_tool[query] or calculator_tool[expression].
74
+ Observation: The result of the action for the current question.
75
+ ... (this Thought/Action/Observation sequence can repeat for the current question)
76
+ Thought: I now have enough information to answer the current question.
77
+ Final Answer: [Provide ONLY the precise answer to the CURRENT question here. For example, if the question is "What is 2+2?", the Final Answer should be just "4". Do not include any other text, reasoning, or new questions after this line.]
78
+
79
+ Let's begin with the current question.
80
  """) + "\nQuestion: {question}\n{scratchpad}"
81
 
82
 
 
84
  try:
85
  response = self.llm.text_generation(
86
  prompt,
87
+ max_new_tokens=512,
88
+ temperature=0.1,
89
+ do_sample=True,
 
 
90
  )
91
  return response.strip()
92
  except Exception as e:
 
95
 
96
  def __call__(self, question: str) -> str:
97
  print(f"ReActAgent received question (first 100 chars): {question[:100]}...")
 
98
  scratchpad = ""
 
99
  current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
100
 
101
  for i in range(self.max_iterations):
102
  print(f"\nIteration {i+1}")
 
 
 
 
103
  llm_output = self.run_llm(current_prompt)
104
 
105
  if not llm_output:
106
  print("LLM returned empty or error, stopping.")
107
  return "Agent Error: LLM failed to respond."
108
 
 
109
  scratchpad += llm_output + "\n"
110
 
111
+ all_final_answers = re.findall(r"Final Answer:\s*(.*)", llm_output, re.DOTALL | re.IGNORECASE)
112
+ if all_final_answers:
113
+ answer = all_final_answers[-1].strip() # Get the last "Final Answer:"
114
+
115
+ # Further clean up common patterns of LLM over-generation within the answer
116
+ if "Thought:" in answer:
117
+ answer = answer.split("Thought:")[0].strip()
118
+ if "Action:" in answer:
119
+ answer = answer.split("Action:")[0].strip()
120
+ if "Observation:" in answer:
121
+ answer = answer.split("Observation:")[0].strip()
122
+ if "Question:" in answer: # If it starts generating a new question within the answer
123
+ answer = answer.split("Question:")[0].strip()
124
+
125
+ # Handle nested "Final Answer:" in the extracted part
126
+ inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
127
+ if inner_final_answers:
128
+ answer = inner_final_answers[-1].strip()
129
+
130
+ print(f"Found and extracted Final Answer: '{answer}'")
131
+ return answer
132
 
 
133
  action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", llm_output, re.DOTALL)
134
  if action_match:
135
  tool_name = action_match.group(1).strip()
136
  tool_input = action_match.group(2).strip()
 
137
  if tool_name in self.tools:
138
  print(f"Executing Tool: {tool_name}, Input: {tool_input}")
139
  try:
140
  observation = self.tools[tool_name](tool_input)
141
  except Exception as e:
142
  observation = f"Error executing tool {tool_name}: {e}"
143
+ print(f"Observation: {observation[:200]}...")
144
+ scratchpad += f"Observation: {observation}\n"
145
  else:
146
  print(f"Unknown tool: {tool_name}")
147
  scratchpad += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
148
  else:
149
+ print("No valid action found in LLM output for this iteration.")
 
 
150
 
 
151
  current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
152
 
 
 
153
  print(f"Max iterations reached for question (first 50 chars): {question[:50]}...")
154
  standard_failure_message = "Agent could not determine an answer within the allowed steps."
155
  print(f"Returning standard failure message: {standard_failure_message}")
156
  return standard_failure_message
157
 
158
+ # --- Constants ---
 
159
  DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
160
 
161
+ # --- Main Execution Logic ---
162
  def run_and_submit_all(profile: gr.OAuthProfile | None):
163
  space_id = os.getenv("SPACE_ID")
164
  if profile:
165
  username = f"{profile.username}"
 
166
  else:
 
167
  return "Please Login to Hugging Face with the button.", None
168
 
169
  api_url = DEFAULT_API_URL
 
171
  submit_url = f"{api_url}/submit"
172
 
173
  try:
174
+ available_tools = {"search_tool": search_tool, "calculator_tool": calculator_tool}
 
 
 
175
  if llm_client is None:
176
  return "LLM Client could not be initialized. Check logs and HF_TOKEN.", None
177
  agent = ReActAgent(llm_client=llm_client, tools=available_tools)
178
  except Exception as e:
 
179
  return f"Error initializing agent: {e}", None
180
 
181
  agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Code not available (SPACE_ID not set)"
 
182
 
 
183
  try:
184
  response = requests.get(questions_url, timeout=20)
185
  response.raise_for_status()
186
  questions_data = response.json()
187
  if not questions_data:
 
188
  return "Fetched questions list is empty or invalid format.", None
189
+ except Exception as e:
 
 
190
  return f"Error fetching questions: {e}", None
191
+
192
+ results_log, answers_payload = [], []
 
 
 
 
 
 
193
  for item in questions_data:
194
+ task_id, question_text = item.get("task_id"), item.get("question")
195
+ if not task_id or question_text is None: continue
 
 
 
196
  try:
197
  print(f"\n--- Processing Task ID: {task_id}, Question: {question_text[:100]}... ---")
198
  submitted_answer = agent(question_text)
199
  answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
200
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
201
+ print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
202
  except Exception as e:
 
203
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
204
 
205
  if not answers_payload:
 
206
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
207
 
208
  submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
 
 
209
 
 
210
  try:
211
  response = requests.post(submit_url, json=submission_data, timeout=120)
212
  response.raise_for_status()
213
  result_data = response.json()
214
  final_status = (
215
+ f"Submission Successful!\nUser: {result_data.get('username')}\n"
 
216
  f"Overall Score: {result_data.get('score', 'N/A')}% "
217
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
218
  f"Message: {result_data.get('message', 'No message received.')}"
219
  )
220
+ return final_status, pd.DataFrame(results_log)
 
 
221
  except requests.exceptions.HTTPError as e:
222
  error_detail = f"Server responded with status {e.response.status_code}."
223
+ try: error_detail += f" Detail: {e.response.json().get('detail', e.response.text)}"
224
+ except: error_detail += f" Response: {e.response.text[:500]}"
225
+ return f"Submission Failed: {error_detail}", pd.DataFrame(results_log)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
226
  except Exception as e:
227
+ return f"An unexpected error occurred during submission: {e}", pd.DataFrame(results_log)
 
 
 
228
 
229
+ # --- Gradio Interface ---
230
  with gr.Blocks() as demo:
231
  gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
232
+ gr.Markdown("Instructions and disclaimers...") # Keep your existing markdown or customize
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
  gr.LoginButton()
234
  run_button = gr.Button("Run Evaluation & Submit All Answers")
235
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
236
  results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
237
+ run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 
 
 
 
238
 
239
  if __name__ == "__main__":
240
  print("\n" + "-"*30 + " App Starting " + "-"*30)
241
+ # Startup messages (space_host, space_id, llm_client status)
242
  space_host_startup = os.getenv("SPACE_HOST")
243
  space_id_startup = os.getenv("SPACE_ID")
244
+ if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
245
+ if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")
246
+ if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
247
+ else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  print("-"*(60 + len(" App Starting ")) + "\n")
 
249
  demo.launch(debug=True, share=False)