mujtabarizvi committed on
Commit
cdedb37
·
verified ·
1 Parent(s): d4303f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -43
app.py CHANGED
@@ -44,7 +44,7 @@ def calculator_tool(expression: str) -> str:
44
 
45
  # --- Agent Definition ---
46
  class ReActAgent:
47
- def __init__(self, llm_client, tools: dict, max_iterations=7):
48
  print("ReActAgent initialized.")
49
  if llm_client is None:
50
  raise ValueError("LLM client not initialized.")
@@ -58,14 +58,15 @@ class ReActAgent:
58
  ])
59
  self.tool_names = ", ".join(tools.keys())
60
 
61
- # Further strengthened ReAct prompt
62
  self.react_prompt_template = inspect.cleandoc(f"""
63
  You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process.
64
  Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones.
65
- You MUST begin your response with a "Thought:" section, outlining your plan.
66
- Following the "Thought:", you MUST specify an "Action:". This action should be one of the available tools (e.g., search_tool[query]) or "Action: None" if no tool is immediately necessary based on your thought.
67
- Only AFTER an "Action:" and its corresponding "Observation:" (or if "Action: None" was used, after further "Thought:") can you consider providing a "Final Answer:".
68
- DO NOT output "Final Answer:" as your very first step or before going through at least one Thought/Action/Observation cycle if tools or reasoning are required.
 
 
69
 
70
  The final answer itself (the text after "Final Answer:") must be an EXACT match to the correct response, without any extra explanations, apologies, or prefixes.
71
 
@@ -74,23 +75,42 @@ class ReActAgent:
74
 
75
  Use the following format FOR THE CURRENT QUESTION ONLY:
76
  Question: the input question you must answer
77
- Thought: Your reasoning and plan for the current question. This MUST be your first step.
78
- Action: The action to take. Choose from [{self.tool_names}] with input in brackets (e.g., search_tool[query]), or use "Action: None" if no tool is needed for this immediate step. This MUST follow your Thought.
79
- Observation: The result of the action. If Action was None, state "Observation: No action taken, proceeding with reasoning." or similar. This MUST follow your Action.
80
- Thought: Further reasoning based on the observation or your initial thought process. You may loop through Thought/Action/Observation multiple times.
81
- Final Answer: [Provide ONLY the precise answer to the CURRENT question here. For example, if the question is "What is 2+2?", the Final Answer should be just "4". Use this ONLY when all reasoning is complete and you are certain of the answer.]
82
 
83
- Let's begin by thinking about the current question.
 
 
 
 
 
 
 
 
 
 
84
  """) + "\nQuestion: {question}\n{scratchpad}"
85
 
86
 
87
  def run_llm(self, prompt: str) -> str:
88
  try:
 
 
 
 
 
 
 
 
 
 
 
 
89
  response = self.llm.text_generation(
90
  prompt,
91
- max_new_tokens=512, # Consider increasing if logs show truncation of ReAct steps
92
  temperature=0.1,
93
  do_sample=True,
 
 
94
  )
95
  return response.strip()
96
  except Exception as e:
@@ -100,63 +120,89 @@ class ReActAgent:
100
  def __call__(self, question: str) -> str:
101
  print(f"ReActAgent received question (first 100 chars): {question[:100]}...")
102
  scratchpad = ""
103
- current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
104
-
105
  for i in range(self.max_iterations):
106
  print(f"\nIteration {i+1}")
 
 
107
  llm_output = self.run_llm(current_prompt)
108
 
109
- # ---- START: Added for debugging ----
110
  print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---")
111
  print(llm_output)
112
  print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---")
113
- # ---- END: Added for debugging ----
114
 
115
  if not llm_output:
116
  print("LLM returned empty or error, stopping.")
117
- return "Agent could not determine an answer within the allowed steps." # Consistent failure message
118
-
119
- scratchpad += llm_output + "\n"
120
-
 
 
 
 
 
 
 
 
121
  all_final_answers = re.findall(r"Final Answer:\s*(.*)", llm_output, re.DOTALL | re.IGNORECASE)
122
  if all_final_answers:
123
  answer = all_final_answers[-1].strip()
124
-
125
  if "Thought:" in answer: answer = answer.split("Thought:")[0].strip()
126
  if "Action:" in answer: answer = answer.split("Action:")[0].strip()
127
  if "Observation:" in answer: answer = answer.split("Observation:")[0].strip()
128
  if "Question:" in answer: answer = answer.split("Question:")[0].strip()
129
-
130
  inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
131
- if inner_final_answers:
132
- answer = inner_final_answers[-1].strip()
133
 
134
- print(f"Found and extracted Final Answer: '{answer}'")
 
135
  return answer
136
 
 
 
 
 
 
 
 
137
  action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", llm_output, re.DOTALL)
 
 
138
  if action_match:
139
  tool_name = action_match.group(1).strip()
140
  tool_input = action_match.group(2).strip()
141
  if tool_name in self.tools:
142
  print(f"Executing Tool: {tool_name}, Input: {tool_input}")
143
  try:
144
- observation = self.tools[tool_name](tool_input)
145
  except Exception as e:
146
- observation = f"Error executing tool {tool_name}: {e}"
147
- print(f"Observation: {observation[:200]}...")
148
- scratchpad += f"Observation: {observation}\n"
149
  else:
150
  print(f"Unknown tool: {tool_name}")
151
  scratchpad += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
 
 
 
152
  else:
153
- # If the LLM output does not contain "Final Answer:" and also does not contain a valid "Action:",
154
- # it means the LLM is likely just "thinking" or its output is malformed for ReAct.
155
- # We add its output to scratchpad and let it try again in the next iteration.
156
- print("No valid Action or Final Answer found in LLM output for this iteration. LLM might be thinking or output is malformed.")
157
-
158
-
159
- current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
 
 
 
 
 
 
 
 
 
160
 
161
  print(f"Max iterations reached for question (first 50 chars): {question[:50]}...")
162
  standard_failure_message = "Agent could not determine an answer within the allowed steps."
@@ -208,9 +254,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
208
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
209
  print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
210
  except Exception as e:
211
- print(f"Error running agent on task {task_id}: {e}") # Log specific agent error
212
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
213
- # Still add a payload so the task is marked as attempted, with an error message.
214
  answers_payload.append({"task_id": task_id, "submitted_answer": "Agent execution error."})
215
 
216
 
@@ -241,10 +286,10 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
241
  # --- Gradio Interface ---
242
  with gr.Blocks() as demo:
243
  gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
244
- gr.Markdown( # Shortened for brevity, keep your detailed markdown
245
  """
246
  **Instructions & Disclaimers:**
247
- Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt.
248
  Check logs for RAW LLM OUTPUT for debugging.
249
  """
250
  )
@@ -259,9 +304,7 @@ if __name__ == "__main__":
259
  space_host_startup = os.getenv("SPACE_HOST")
260
  space_id_startup = os.getenv("SPACE_ID")
261
  if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
262
- # else: print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
263
  if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")
264
- # else: print("ℹ️ SPACE_ID environment variable not found (running locally?).")
265
 
266
  if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
267
  else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")
 
44
 
45
  # --- Agent Definition ---
46
  class ReActAgent:
47
+ def __init__(self, llm_client, tools: dict, max_iterations=7): # Iteration 1 for T/A, Iteration 2 for T/FA minimum
48
  print("ReActAgent initialized.")
49
  if llm_client is None:
50
  raise ValueError("LLM client not initialized.")
 
58
  ])
59
  self.tool_names = ", ".join(tools.keys())
60
 
 
61
  self.react_prompt_template = inspect.cleandoc(f"""
62
  You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process.
63
  Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones.
64
+
65
+ You will proceed in a Thought, Action, Observation loop.
66
+ 1. First, provide a "Thought:" explaining your reasoning for the current question.
67
+ 2. Next, provide an "Action:". This can be using a tool (e.g., search_tool[query]) or "Action: None" if no tool is needed for this step.
68
+ 3. AFTER YOU PROVIDE THE ACTION, YOU MUST STOP. The system will then provide an "Observation:".
69
+ 4. Based on the "Observation:", you will continue with another "Thought:", followed by another "Action:" (and then STOP), or if you have enough information, a "Final Answer:".
70
 
71
  The final answer itself (the text after "Final Answer:") must be an EXACT match to the correct response, without any extra explanations, apologies, or prefixes.
72
 
 
75
 
76
  Use the following format FOR THE CURRENT QUESTION ONLY:
77
  Question: the input question you must answer
 
 
 
 
 
78
 
79
+ Thought: Your reasoning and plan for the current question.
80
+ Action: The action to take (e.g., search_tool[query] or calculator_tool[expression] or Action: None). AFTER THIS, STOP.
81
+ Observation: [The system will provide this. Do NOT generate this part.]
82
+ Thought: Your reasoning based on the previous observation.
83
+ Action: (Another action, or Action: None). AFTER THIS, STOP.
84
+ Observation: [The system will provide this. Do NOT generate this part.]
85
+ ... (Repeat Thought/Action/STOP/Observation as needed)
86
+ Thought: I have sufficient information to answer the current question.
87
+ Final Answer: [Provide ONLY the precise answer. For example, if the question is "What is 2+2?", your Final Answer should be just "4".]
88
+
89
+ Let's begin with the current question.
90
  """) + "\nQuestion: {question}\n{scratchpad}"
91
 
92
 
93
  def run_llm(self, prompt: str) -> str:
94
  try:
95
+ # Define stop sequences to make the LLM pause after an Action
96
+ # or when it's about to give a Final Answer.
97
+ stop_sequences = [
98
+ "\nObservation:", "Observation:",
99
+ # "\nThought:", # Removing this as a primary stop, LLM should produce Thought then Action.
100
+ # If it stops at Thought, it means it didn't reach Action.
101
+ "\nFinal Answer:", "Final Answer:"
102
+ ]
103
+ # Adding "\nThought:" as a stop might be too aggressive if the LLM wants to write a thought
104
+ # *before* an action in its first turn. The prompt guides it to do T then A.
105
+ # The main goal is to stop it *before* it hallucinates an Observation.
106
+
107
  response = self.llm.text_generation(
108
  prompt,
109
+ max_new_tokens=350, # Reduced slightly as each turn should be shorter. Was 512.
110
  temperature=0.1,
111
  do_sample=True,
112
+ stop_sequences=stop_sequences, # Key addition
113
+ # return_full_text=False # Ensure this is False or default if supported, to not include prompt in response
114
  )
115
  return response.strip()
116
  except Exception as e:
 
120
  def __call__(self, question: str) -> str:
121
  print(f"ReActAgent received question (first 100 chars): {question[:100]}...")
122
  scratchpad = ""
123
+
 
124
  for i in range(self.max_iterations):
125
  print(f"\nIteration {i+1}")
126
+ current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
127
+
128
  llm_output = self.run_llm(current_prompt)
129
 
 
130
  print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---")
131
  print(llm_output)
132
  print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---")
 
133
 
134
  if not llm_output:
135
  print("LLM returned empty or error, stopping.")
136
+ return "Agent could not determine an answer within the allowed steps."
137
+
138
+ # Append only the LLM's actual generation for this turn to scratchpad
139
+ # If llm_output includes a stop sequence like "Observation:", we might not want to add that part yet.
140
+ # However, the prompt structure expects the scratchpad to be a coherent dialogue.
141
+ # Let's add the raw llm_output, then the observation will be added explicitly.
142
+ # Check if llm_output ends with a stop sequence and trim if necessary before adding to scratchpad,
143
+ # or ensure the next parts of the logic handle it.
144
+ # For now, add the raw output. The next prompt will contain it.
145
+ # The key is that the *next* part of the scratchpad will be a *real* observation.
146
+
147
+ # If LLM output already contains "Final Answer:", extract and return
148
  all_final_answers = re.findall(r"Final Answer:\s*(.*)", llm_output, re.DOTALL | re.IGNORECASE)
149
  if all_final_answers:
150
  answer = all_final_answers[-1].strip()
 
151
  if "Thought:" in answer: answer = answer.split("Thought:")[0].strip()
152
  if "Action:" in answer: answer = answer.split("Action:")[0].strip()
153
  if "Observation:" in answer: answer = answer.split("Observation:")[0].strip()
154
  if "Question:" in answer: answer = answer.split("Question:")[0].strip()
 
155
  inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
156
+ if inner_final_answers: answer = inner_final_answers[-1].strip()
 
157
 
158
+ print(f"Found and extracted Final Answer from LLM output: '{answer}'")
159
+ scratchpad += llm_output + "\n" # Add the final thought/answer block
160
  return answer
161
 
162
+ # If not Final Answer, add the current llm_output (Thought & Action) to scratchpad
163
+ scratchpad += llm_output # LLM output should be Thought \n Action
164
+ if not llm_output.endswith("\n"):
165
+ scratchpad += "\n"
166
+
167
+
168
+ # Parse Action from the LLM's *current* output
169
  action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", llm_output, re.DOTALL)
170
+ action_none_match = re.search(r"Action:\s*None", llm_output, re.IGNORECASE)
171
+
172
  if action_match:
173
  tool_name = action_match.group(1).strip()
174
  tool_input = action_match.group(2).strip()
175
  if tool_name in self.tools:
176
  print(f"Executing Tool: {tool_name}, Input: {tool_input}")
177
  try:
178
+ observation_content = self.tools[tool_name](tool_input)
179
  except Exception as e:
180
+ observation_content = f"Error executing tool {tool_name}: {e}"
181
+ print(f"Observation content: {observation_content[:200]}...")
182
+ scratchpad += f"Observation: {observation_content}\n"
183
  else:
184
  print(f"Unknown tool: {tool_name}")
185
  scratchpad += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
186
+ elif action_none_match:
187
+ print("Action: None detected.")
188
+ scratchpad += f"Observation: No action taken, proceeding with reasoning.\n"
189
  else:
190
+ # LLM didn't output a valid Action or "Final Answer:". It might be just a "Thought:".
191
+ # Or it might be a malformed output. Let the loop continue, it will use this partial output in the next prompt.
192
+ print("No valid Action (tool use or None) or Final Answer found in LLM output for this iteration. LLM might be thinking or output is malformed.")
193
+ # If it's just a thought, the scratchpad has it. Next iteration will prompt with it.
194
+ # If no action and no final answer, we might want to consider it a failed step if it persists.
195
+ # For now, we assume the LLM might be in a multi-step thought process not requiring immediate action.
196
+ # However, the prompt now *requires* an Action (even "Action: None").
197
+ # So, if we reach here, the LLM is not perfectly following the format.
198
+ # We might add a generic "Observation: LLM did not provide a valid action." to prompt for recovery.
199
+ # This is less critical if the stop sequences work well.
200
+ # If the LLM stops generating *before* an action, this branch will also be hit.
201
+ # The raw LLM output log will be key here.
202
+ if not llm_output.strip().startswith("Thought:"): # If it's not even a thought, it's very off.
203
+ scratchpad += "Observation: LLM output was not a valid Thought/Action or Final Answer. Please try again adhering to the format.\n"
204
+
205
+ # current_prompt for next iteration is reconstructed outside the loop start
206
 
207
  print(f"Max iterations reached for question (first 50 chars): {question[:50]}...")
208
  standard_failure_message = "Agent could not determine an answer within the allowed steps."
 
254
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
255
  print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
256
  except Exception as e:
257
+ print(f"Error running agent on task {task_id}: {e}")
258
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
259
  answers_payload.append({"task_id": task_id, "submitted_answer": "Agent execution error."})
260
 
261
 
 
286
  # --- Gradio Interface ---
287
  with gr.Blocks() as demo:
288
  gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
289
+ gr.Markdown(
290
  """
291
  **Instructions & Disclaimers:**
292
+ Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt with stop sequences.
293
  Check logs for RAW LLM OUTPUT for debugging.
294
  """
295
  )
 
304
  space_host_startup = os.getenv("SPACE_HOST")
305
  space_id_startup = os.getenv("SPACE_ID")
306
  if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
 
307
  if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")
 
308
 
309
  if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
310
  else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")