mujtabarizvi commited on
Commit
c1f3f5c
·
verified ·
1 Parent(s): cdedb37

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -77
app.py CHANGED
@@ -8,7 +8,7 @@ import re # For parsing LLM output
8
  # --- HF Inference API for LLM ---
9
  from huggingface_hub import InferenceClient
10
 
11
- LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1" # Using Mixtral
12
 
13
  try:
14
  hf_token = os.getenv("HF_TOKEN")
@@ -24,7 +24,7 @@ def search_tool(query: str) -> str:
24
  print(f"Tool: search_tool, Query: {query}")
25
  try:
26
  with DDGS() as ddgs:
27
- results = ddgs.text(query, max_results=3)
28
  if results:
29
  return "\n".join([f"Title: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}" for r in results])
30
  else:
@@ -36,15 +36,30 @@ def search_tool(query: str) -> str:
36
  def calculator_tool(expression: str) -> str:
37
  print(f"Tool: calculator_tool, Expression: {expression}")
38
  try:
39
- result = eval(expression, {"__builtins__": {}}, {"sqrt": lambda x: x**0.5, "pi": 3.1415926535})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
  return str(result)
 
41
  except Exception as e:
42
  print(f"Error in calculator_tool: {e}")
43
- return f"Error calculating: {str(e)}. Ensure the expression is valid math."
44
 
45
  # --- Agent Definition ---
46
  class ReActAgent:
47
- def __init__(self, llm_client, tools: dict, max_iterations=7): # Iteration 1 for T/A, Iteration 2 for T/FA minimum
48
  print("ReActAgent initialized.")
49
  if llm_client is None:
50
  raise ValueError("LLM client not initialized.")
@@ -58,17 +73,21 @@ class ReActAgent:
58
  ])
59
  self.tool_names = ", ".join(tools.keys())
60
 
 
61
  self.react_prompt_template = inspect.cleandoc(f"""
62
  You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process.
63
  Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones.
64
 
65
  You will proceed in a Thought, Action, Observation loop.
66
  1. First, provide a "Thought:" explaining your reasoning for the current question.
67
- 2. Next, provide an "Action:". This can be using a tool (e.g., search_tool[query]) or "Action: None" if no tool is needed for this step.
 
 
 
68
  3. AFTER YOU PROVIDE THE ACTION, YOU MUST STOP. The system will then provide an "Observation:".
69
  4. Based on the "Observation:", you will continue with another "Thought:", followed by another "Action:" (and then STOP), or if you have enough information, a "Final Answer:".
70
 
71
- The final answer itself (the text after "Final Answer:") must be an EXACT match to the correct response, without any extra explanations, apologies, or prefixes.
72
 
73
  Available tools:
74
  {self.tool_descriptions}
@@ -76,41 +95,35 @@ class ReActAgent:
76
  Use the following format FOR THE CURRENT QUESTION ONLY:
77
  Question: the input question you must answer
78
 
79
- Thought: Your reasoning and plan for the current question.
80
- Action: The action to take (e.g., search_tool[query] or calculator_tool[expression] or Action: None). AFTER THIS, STOP.
81
- Observation: [The system will provide this. Do NOT generate this part.]
82
- Thought: Your reasoning based on the previous observation.
83
- Action: (Another action, or Action: None). AFTER THIS, STOP.
84
  Observation: [The system will provide this. Do NOT generate this part.]
 
 
 
85
  ... (Repeat Thought/Action/STOP/Observation as needed)
86
  Thought: I have sufficient information to answer the current question.
87
- Final Answer: [Provide ONLY the precise answer. For example, if the question is "What is 2+2?", your Final Answer should be just "4".]
88
 
89
- Let's begin with the current question.
90
- """) + "\nQuestion: {question}\n{scratchpad}"
91
 
92
 
93
  def run_llm(self, prompt: str) -> str:
94
  try:
95
- # Define stop sequences to make the LLM pause after an Action
96
- # or when it's about to give a Final Answer.
97
- stop_sequences = [
98
  "\nObservation:", "Observation:",
99
- # "\nThought:", # Removing this as a primary stop, LLM should produce Thought then Action.
100
- # If it stops at Thought, it means it didn't reach Action.
101
  "\nFinal Answer:", "Final Answer:"
102
  ]
103
- # Adding "\nThought:" as a stop might be too aggressive if the LLM wants to write a thought
104
- # *before* an action in its first turn. The prompt guides it to do T then A.
105
- # The main goal is to stop it *before* it hallucinates an Observation.
106
 
107
  response = self.llm.text_generation(
108
  prompt,
109
- max_new_tokens=350, # Reduced slightly as each turn should be shorter. Was 512.
110
- temperature=0.1,
111
- do_sample=True,
112
- stop_sequences=stop_sequences, # Key addition
113
- # return_full_text=False # Ensure this is False or default if supported, to not include prompt in response
114
  )
115
  return response.strip()
116
  except Exception as e:
@@ -119,35 +132,54 @@ class ReActAgent:
119
 
120
  def __call__(self, question: str) -> str:
121
  print(f"ReActAgent received question (first 100 chars): {question[:100]}...")
122
- scratchpad = ""
123
 
124
  for i in range(self.max_iterations):
125
  print(f"\nIteration {i+1}")
126
- current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
 
 
 
 
 
 
 
 
 
127
 
128
- llm_output = self.run_llm(current_prompt)
 
 
 
 
 
 
129
 
130
  print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---")
131
- print(llm_output)
132
  print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---")
133
 
134
- if not llm_output:
135
  print("LLM returned empty or error, stopping.")
136
  return "Agent could not determine an answer within the allowed steps."
137
 
138
- # Append only the LLM's actual generation for this turn to scratchpad
139
- # If llm_output includes a stop sequence like "Observation:", we might not want to add that part yet.
140
- # However, the prompt structure expects the scratchpad to be a coherent dialogue.
141
- # Let's add the raw llm_output, then the observation will be added explicitly.
142
- # Check if llm_output ends with a stop sequence and trim if necessary before adding to scratchpad,
143
- # or ensure the next parts of the logic handle it.
144
- # For now, add the raw output. The next prompt will contain it.
145
- # The key is that the *next* part of the scratchpad will be a *real* observation.
 
 
 
 
 
146
 
147
- # If LLM output already contains "Final Answer:", extract and return
148
- all_final_answers = re.findall(r"Final Answer:\s*(.*)", llm_output, re.DOTALL | re.IGNORECASE)
149
  if all_final_answers:
150
  answer = all_final_answers[-1].strip()
 
151
  if "Thought:" in answer: answer = answer.split("Thought:")[0].strip()
152
  if "Action:" in answer: answer = answer.split("Action:")[0].strip()
153
  if "Observation:" in answer: answer = answer.split("Observation:")[0].strip()
@@ -155,19 +187,17 @@ class ReActAgent:
155
  inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
156
  if inner_final_answers: answer = inner_final_answers[-1].strip()
157
 
158
- print(f"Found and extracted Final Answer from LLM output: '{answer}'")
159
- scratchpad += llm_output + "\n" # Add the final thought/answer block
160
- return answer
161
-
162
- # If not Final Answer, add the current llm_output (Thought & Action) to scratchpad
163
- scratchpad += llm_output # LLM output should be Thought \n Action
164
- if not llm_output.endswith("\n"):
165
- scratchpad += "\n"
166
-
167
 
168
- # Parse Action from the LLM's *current* output
169
- action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", llm_output, re.DOTALL)
170
- action_none_match = re.search(r"Action:\s*None", llm_output, re.IGNORECASE)
 
171
 
172
  if action_match:
173
  tool_name = action_match.group(1).strip()
@@ -179,30 +209,20 @@ class ReActAgent:
179
  except Exception as e:
180
  observation_content = f"Error executing tool {tool_name}: {e}"
181
  print(f"Observation content: {observation_content[:200]}...")
182
- scratchpad += f"Observation: {observation_content}\n"
183
  else:
184
  print(f"Unknown tool: {tool_name}")
185
- scratchpad += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
186
  elif action_none_match:
187
  print("Action: None detected.")
188
- scratchpad += f"Observation: No action taken, proceeding with reasoning.\n"
189
  else:
190
- # LLM didn't output a valid Action or "Final Answer:". It might be just a "Thought:".
191
- # Or it might be a malformed output. Let the loop continue, it will use this partial output in the next prompt.
192
- print("No valid Action (tool use or None) or Final Answer found in LLM output for this iteration. LLM might be thinking or output is malformed.")
193
- # If it's just a thought, the scratchpad has it. Next iteration will prompt with it.
194
- # If no action and no final answer, we might want to consider it a failed step if it persists.
195
- # For now, we assume the LLM might be in a multi-step thought process not requiring immediate action.
196
- # However, the prompt now *requires* an Action (even "Action: None").
197
- # So, if we reach here, the LLM is not perfectly following the format.
198
- # We might add a generic "Observation: LLM did not provide a valid action." to prompt for recovery.
199
- # This is less critical if the stop sequences work well.
200
- # If the LLM stops generating *before* an action, this branch will also be hit.
201
- # The raw LLM output log will be key here.
202
- if not llm_output.strip().startswith("Thought:"): # If it's not even a thought, it's very off.
203
- scratchpad += "Observation: LLM output was not a valid Thought/Action or Final Answer. Please try again adhering to the format.\n"
204
-
205
- # current_prompt for next iteration is reconstructed outside the loop start
206
 
207
  print(f"Max iterations reached for question (first 50 chars): {question[:50]}...")
208
  standard_failure_message = "Agent could not determine an answer within the allowed steps."
@@ -268,13 +288,13 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
268
  response = requests.post(submit_url, json=submission_data, timeout=120)
269
  response.raise_for_status()
270
  result_data = response.json()
271
- final_status = (
272
  f"Submission Successful!\nUser: {result_data.get('username')}\n"
273
  f"Overall Score: {result_data.get('score', 'N/A')}% "
274
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
275
  f"Message: {result_data.get('message', 'No message received.')}"
276
  )
277
- return final_status, pd.DataFrame(results_log)
278
  except requests.exceptions.HTTPError as e:
279
  error_detail = f"Server responded with status {e.response.status_code}."
280
  try: error_detail += f" Detail: {e.response.json().get('detail', e.response.text)}"
@@ -290,7 +310,7 @@ with gr.Blocks() as demo:
290
  """
291
  **Instructions & Disclaimers:**
292
  Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt with stop sequences.
293
- Check logs for RAW LLM OUTPUT for debugging.
294
  """
295
  )
296
  gr.LoginButton()
 
8
  # --- HF Inference API for LLM ---
9
  from huggingface_hub import InferenceClient
10
 
11
+ LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
12
 
13
  try:
14
  hf_token = os.getenv("HF_TOKEN")
 
24
  print(f"Tool: search_tool, Query: {query}")
25
  try:
26
  with DDGS() as ddgs:
27
+ results = ddgs.text(query, max_results=3) # Fewer results to be less verbose
28
  if results:
29
  return "\n".join([f"Title: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}" for r in results])
30
  else:
 
36
  def calculator_tool(expression: str) -> str:
37
  print(f"Tool: calculator_tool, Expression: {expression}")
38
  try:
39
+ # Basic check for safety, though a proper parser is better for production
40
+ if not re.match(r"^[0-9\s\+\-\*\/\(\)\.\%sqrtpijabsindcostanlog]+$", expression):
41
+ # Add more functions as needed, e.g. math.sqrt, math.pi etc.
42
+ # For simplicity, we are keeping a limited set here.
43
+ if expression not in ["pi", "sqrt"] and not any(op in expression for op in ['+', '-', '*', '/']):
44
+ return f"Error: Invalid characters in expression. Only numbers, basic operators, sqrt, pi allowed. Expression: {expression}"
45
+
46
+ # Using a more controlled eval
47
+ allowed_names = {"sqrt": lambda x: x**0.5, "pi": 3.1415926535} # Add more safe functions
48
+ code = compile(expression, "<string>", "eval")
49
+ for name in code.co_names:
50
+ if name not in allowed_names:  # membership test against __builtins__ is unreliable (it may be a module, raising TypeError); eval below gets empty builtins anyway
51
+ raise NameError(f"Use of {name} is not allowed")
52
+
53
+ result = eval(code, {"__builtins__": {}}, allowed_names)
54
  return str(result)
55
+
56
  except Exception as e:
57
  print(f"Error in calculator_tool: {e}")
58
+ return f"Error calculating: {str(e)}. Ensure the expression is valid and uses allowed functions/operators."
59
 
60
  # --- Agent Definition ---
61
  class ReActAgent:
62
+ def __init__(self, llm_client, tools: dict, max_iterations=7):
63
  print("ReActAgent initialized.")
64
  if llm_client is None:
65
  raise ValueError("LLM client not initialized.")
 
73
  ])
74
  self.tool_names = ", ".join(tools.keys())
75
 
76
+ # Refined prompt for better tool usage and stopping
77
  self.react_prompt_template = inspect.cleandoc(f"""
78
  You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process.
79
  Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones.
80
 
81
  You will proceed in a Thought, Action, Observation loop.
82
  1. First, provide a "Thought:" explaining your reasoning for the current question.
83
+ 2. Next, provide an "Action:".
84
+ - If you need to search the web, use search_tool[query].
85
+ - If you need to perform a calculation (e.g., arithmetic like 5*5, or math expressions), use calculator_tool[expression].
86
+ - If no tool is needed for this immediate step based on your current thought and the information available, use "Action: None". Only use Action: None if you are certain no tool can help or is required for the current step.
87
  3. AFTER YOU PROVIDE THE ACTION, YOU MUST STOP. The system will then provide an "Observation:".
88
  4. Based on the "Observation:", you will continue with another "Thought:", followed by another "Action:" (and then STOP), or if you have enough information, a "Final Answer:".
89
 
90
+ The final answer itself (the text after "Final Answer:") must be an EXACT, non-empty match to the correct response, without any extra explanations, apologies, or prefixes.
91
 
92
  Available tools:
93
  {self.tool_descriptions}
 
95
  Use the following format FOR THE CURRENT QUESTION ONLY:
96
  Question: the input question you must answer
97
 
98
+ {'{scratchpad}'}
99
+
100
+ Thought: [Your reasoning and plan for the current question. If continuing from an observation, reason about that observation.]
101
+ Action: [search_tool[query_for_search] OR calculator_tool[math_expression_to_calculate] OR Action: None]. AFTER THIS, STOP.
 
102
  Observation: [The system will provide this. Do NOT generate this part.]
103
+ Thought: [Your reasoning based on the previous observation.]
104
+ Action: [Another action or Action: None]. AFTER THIS, STOP.
105
+ Observation: [The system will provide this.]
106
  ... (Repeat Thought/Action/STOP/Observation as needed)
107
  Thought: I have sufficient information to answer the current question.
108
+ Final Answer: [Provide ONLY the precise, non-empty answer. For example, if the question is "What is 2+2?", your Final Answer should be just "4".]
109
 
110
+ Start your response for the current turn with "Thought:".
111
+ """) # Removed initial "Question: {question}" here, it's now part of the formatted prompt
112
 
113
 
114
  def run_llm(self, prompt: str) -> str:
115
  try:
116
+ stop_tokens = [
 
 
117
  "\nObservation:", "Observation:",
 
 
118
  "\nFinal Answer:", "Final Answer:"
119
  ]
 
 
 
120
 
121
  response = self.llm.text_generation(
122
  prompt,
123
+ max_new_tokens=350,
124
+ temperature=0.05, # Lowered further for more determinism
125
+ do_sample=True, # Sampling enabled; near-deterministic because temperature is very low
126
+ stop=stop_tokens, # Using `stop` as per FutureWarning
 
127
  )
128
  return response.strip()
129
  except Exception as e:
 
132
 
133
  def __call__(self, question: str) -> str:
134
  print(f"ReActAgent received question (first 100 chars): {question[:100]}...")
135
+ scratchpad_history = ""
136
 
137
  for i in range(self.max_iterations):
138
  print(f"\nIteration {i+1}")
139
+
140
+ # Construct the prompt for the LLM for the current turn
141
+ # The template now has {scratchpad} in the middle, then format instructions, then prompts for Thought/Action.
142
+ # We ensure the LLM starts its generation with a Thought.
143
+ # The initial prompt will be the template + Question + "Thought:"
144
+ # Subsequent prompts will be template + Question + scratchpad_history + "Thought:"
145
+
146
+ # The main instruction block, question, and current scratchpad history
147
+ current_prompt_base = self.react_prompt_template.format(scratchpad=scratchpad_history).split("Thought:")[0]
148
+ current_prompt_text = f"Question: {question}\n" + current_prompt_base
149
 
150
+ if not scratchpad_history: # First turn
151
+ current_prompt_text += "Thought:" # Prime for the first thought
152
+ else: # Subsequent turns, scratchpad_history has previous T/A/O
153
+ current_prompt_text += scratchpad_history + "\nThought:" # Prime for next thought after observation
154
+
155
+ print(f"--- PROMPT FOR LLM (Iteration {i+1}, last 300 chars) ---\n...{current_prompt_text[-300:]}\n--- END PROMPT ---")
156
+ llm_output_this_turn = self.run_llm(current_prompt_text)
157
 
158
  print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---")
159
+ print(llm_output_this_turn)
160
  print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---")
161
 
162
+ if not llm_output_this_turn:
163
  print("LLM returned empty or error, stopping.")
164
  return "Agent could not determine an answer within the allowed steps."
165
 
166
+ # Prepend "Thought:" if LLM didn't include it (due to priming)
167
+ # This ensures scratchpad consistency if the LLM directly starts with the thought content.
168
+ actual_llm_generation = llm_output_this_turn
169
+ if not llm_output_this_turn.strip().startswith("Thought:") and \
170
+ (scratchpad_history.strip().endswith("Observation:") or not scratchpad_history):
171
+ actual_llm_generation = "Thought: " + llm_output_this_turn
172
+
173
+ scratchpad_history += actual_llm_generation + "\n"
174
+
175
+ # Check for Final Answer in the llm_output_this_turn
176
+ # The llm_output_this_turn could be "Thought: ... Final Answer: ..." if no tool was needed.
177
+ final_answer_segment = actual_llm_generation # Check the full segment for Final Answer
178
+ all_final_answers = re.findall(r"Final Answer:\s*(.*)", final_answer_segment, re.DOTALL | re.IGNORECASE)
179
 
 
 
180
  if all_final_answers:
181
  answer = all_final_answers[-1].strip()
182
+ # Clean common contamination
183
  if "Thought:" in answer: answer = answer.split("Thought:")[0].strip()
184
  if "Action:" in answer: answer = answer.split("Action:")[0].strip()
185
  if "Observation:" in answer: answer = answer.split("Observation:")[0].strip()
 
187
  inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
188
  if inner_final_answers: answer = inner_final_answers[-1].strip()
189
 
190
+ if answer: # Only if the answer is not empty after cleaning
191
+ print(f"Found and extracted Final Answer: '{answer}'")
192
+ return answer
193
+ else:
194
+ print("LLM produced 'Final Answer:' but the content was empty or invalid after cleaning. Continuing.")
195
+ # Scratchpad already has this turn's problematic output. Loop continues.
 
 
 
196
 
197
+ # Parse Action from llm_output_this_turn (or actual_llm_generation)
198
+ action_segment = actual_llm_generation # Check the full segment for Action
199
+ action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", action_segment, re.DOTALL)
200
+ action_none_match = re.search(r"Action:\s*None", action_segment, re.IGNORECASE)
201
 
202
  if action_match:
203
  tool_name = action_match.group(1).strip()
 
209
  except Exception as e:
210
  observation_content = f"Error executing tool {tool_name}: {e}"
211
  print(f"Observation content: {observation_content[:200]}...")
212
+ scratchpad_history += f"Observation: {observation_content}\n"
213
  else:
214
  print(f"Unknown tool: {tool_name}")
215
+ scratchpad_history += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
216
  elif action_none_match:
217
  print("Action: None detected.")
218
+ scratchpad_history += f"Observation: No action taken, proceeding with reasoning.\n"
219
  else:
220
+ print("No valid Action (tool use or None) found in LLM output for this turn. LLM might be thinking or its format is off.")
221
+ # If the LLM is supposed to always output an Action (even None) but doesn't,
222
+ # it's a deviation. We add a generic observation to try and get it back on track.
223
+ # This can happen if it only outputs a Thought.
224
+ scratchpad_history += "Observation: LLM did not provide an Action in the expected format. Please provide a Thought and then an Action (or Action: None).\n"
225
+
 
 
 
 
 
 
 
 
 
 
226
 
227
  print(f"Max iterations reached for question (first 50 chars): {question[:50]}...")
228
  standard_failure_message = "Agent could not determine an answer within the allowed steps."
 
288
  response = requests.post(submit_url, json=submission_data, timeout=120)
289
  response.raise_for_status()
290
  result_data = response.json()
291
+ final_status = (
292
  f"Submission Successful!\nUser: {result_data.get('username')}\n"
293
  f"Overall Score: {result_data.get('score', 'N/A')}% "
294
  f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
295
  f"Message: {result_data.get('message', 'No message received.')}"
296
  )
297
+ return final_status, pd.DataFrame(results_log) # Corrected variable name
298
  except requests.exceptions.HTTPError as e:
299
  error_detail = f"Server responded with status {e.response.status_code}."
300
  try: error_detail += f" Detail: {e.response.json().get('detail', e.response.text)}"
 
310
  """
311
  **Instructions & Disclaimers:**
312
  Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt with stop sequences.
313
+ Check logs for RAW LLM OUTPUT and PROMPT FOR LLM for debugging.
314
  """
315
  )
316
  gr.LoginButton()