mujtabarizvi committed on
Commit
d4303f4
·
verified ·
1 Parent(s): cfce637

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -30
app.py CHANGED
@@ -8,7 +8,7 @@ import re # For parsing LLM output
8
  # --- HF Inference API for LLM ---
9
  from huggingface_hub import InferenceClient
10
 
11
- LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
12
 
13
  try:
14
  hf_token = os.getenv("HF_TOKEN")
@@ -58,25 +58,29 @@ class ReActAgent:
58
  ])
59
  self.tool_names = ", ".join(tools.keys())
60
 
 
61
  self.react_prompt_template = inspect.cleandoc(f"""
62
- You are a helpful AI assistant. Your goal is to answer the CURRENT question accurately.
63
- Focus ONLY on the provided "Question:". Do not generate new questions or continue a dialogue beyond answering the current question.
64
- You must use a step-by-step thinking process (Thought, Action, Observation) for the current question.
65
- The final answer submitted must be an EXACT match to the correct response, without any extra explanations or prefixes being part of the answer itself.
 
 
 
 
66
 
67
  Available tools:
68
  {self.tool_descriptions}
69
 
70
  Use the following format FOR THE CURRENT QUESTION ONLY:
71
  Question: the input question you must answer
72
- Thought: Your reasoning and plan for the current question.
73
- Action: The action to take for the current question, should be one of [{self.tool_names}]. Input to the tool is between brackets. E.g., search_tool[query] or calculator_tool[expression].
74
- Observation: The result of the action for the current question.
75
- ... (this Thought/Action/Observation sequence can repeat for the current question)
76
- Thought: I now have enough information to answer the current question.
77
- Final Answer: [Provide ONLY the precise answer to the CURRENT question here. For example, if the question is "What is 2+2?", the Final Answer should be just "4". Do not include any other text, reasoning, or new questions after this line.]
78
-
79
- Let's begin with the current question.
80
  """) + "\nQuestion: {question}\n{scratchpad}"
81
 
82
 
@@ -84,7 +88,7 @@ class ReActAgent:
84
  try:
85
  response = self.llm.text_generation(
86
  prompt,
87
- max_new_tokens=512,
88
  temperature=0.1,
89
  do_sample=True,
90
  )
@@ -102,27 +106,27 @@ class ReActAgent:
102
  print(f"\nIteration {i+1}")
103
  llm_output = self.run_llm(current_prompt)
104
 
 
 
 
 
 
 
105
  if not llm_output:
106
  print("LLM returned empty or error, stopping.")
107
- return "Agent Error: LLM failed to respond."
108
 
109
  scratchpad += llm_output + "\n"
110
 
111
  all_final_answers = re.findall(r"Final Answer:\s*(.*)", llm_output, re.DOTALL | re.IGNORECASE)
112
  if all_final_answers:
113
- answer = all_final_answers[-1].strip() # Get the last "Final Answer:"
114
 
115
- # Further clean up common patterns of LLM over-generation within the answer
116
- if "Thought:" in answer:
117
- answer = answer.split("Thought:")[0].strip()
118
- if "Action:" in answer:
119
- answer = answer.split("Action:")[0].strip()
120
- if "Observation:" in answer:
121
- answer = answer.split("Observation:")[0].strip()
122
- if "Question:" in answer: # If it starts generating a new question within the answer
123
- answer = answer.split("Question:")[0].strip()
124
-
125
- # Handle nested "Final Answer:" in the extracted part
126
  inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
127
  if inner_final_answers:
128
  answer = inner_final_answers[-1].strip()
@@ -146,7 +150,11 @@ class ReActAgent:
146
  print(f"Unknown tool: {tool_name}")
147
  scratchpad += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
148
  else:
149
- print("No valid action found in LLM output for this iteration.")
 
 
 
 
150
 
151
  current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
152
 
@@ -200,7 +208,11 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
200
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
201
  print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
202
  except Exception as e:
 
203
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
 
 
 
204
 
205
  if not answers_payload:
206
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
@@ -229,7 +241,13 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
229
  # --- Gradio Interface ---
230
  with gr.Blocks() as demo:
231
  gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
232
- gr.Markdown("Instructions and disclaimers...") # Keep your existing markdown or customize
 
 
 
 
 
 
233
  gr.LoginButton()
234
  run_button = gr.Button("Run Evaluation & Submit All Answers")
235
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
@@ -238,11 +256,13 @@ with gr.Blocks() as demo:
238
 
239
  if __name__ == "__main__":
240
  print("\n" + "-"*30 + " App Starting " + "-"*30)
241
- # Startup messages (space_host, space_id, llm_client status)
242
  space_host_startup = os.getenv("SPACE_HOST")
243
  space_id_startup = os.getenv("SPACE_ID")
244
  if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
 
245
  if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")
 
 
246
  if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
247
  else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")
248
  print("-"*(60 + len(" App Starting ")) + "\n")
 
8
  # --- HF Inference API for LLM ---
9
  from huggingface_hub import InferenceClient
10
 
11
+ LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1" # Using Mixtral
12
 
13
  try:
14
  hf_token = os.getenv("HF_TOKEN")
 
58
  ])
59
  self.tool_names = ", ".join(tools.keys())
60
 
61
+ # Further strengthened ReAct prompt
62
  self.react_prompt_template = inspect.cleandoc(f"""
63
+ You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process.
64
+ Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones.
65
+ You MUST begin your response with a "Thought:" section, outlining your plan.
66
+ Following the "Thought:", you MUST specify an "Action:". This action should be one of the available tools (e.g., search_tool[query]) or "Action: None" if no tool is immediately necessary based on your thought.
67
+ Only AFTER an "Action:" and its corresponding "Observation:" (or if "Action: None" was used, after further "Thought:") can you consider providing a "Final Answer:".
68
+ DO NOT output "Final Answer:" as your very first step or before going through at least one Thought/Action/Observation cycle if tools or reasoning are required.
69
+
70
+ The final answer itself (the text after "Final Answer:") must be an EXACT match to the correct response, without any extra explanations, apologies, or prefixes.
71
 
72
  Available tools:
73
  {self.tool_descriptions}
74
 
75
  Use the following format FOR THE CURRENT QUESTION ONLY:
76
  Question: the input question you must answer
77
+ Thought: Your reasoning and plan for the current question. This MUST be your first step.
78
+ Action: The action to take. Choose from [{self.tool_names}] with input in brackets (e.g., search_tool[query]), or use "Action: None" if no tool is needed for this immediate step. This MUST follow your Thought.
79
+ Observation: The result of the action. If Action was None, state "Observation: No action taken, proceeding with reasoning." or similar. This MUST follow your Action.
80
+ Thought: Further reasoning based on the observation or your initial thought process. You may loop through Thought/Action/Observation multiple times.
81
+ Final Answer: [Provide ONLY the precise answer to the CURRENT question here. For example, if the question is "What is 2+2?", the Final Answer should be just "4". Use this ONLY when all reasoning is complete and you are certain of the answer.]
82
+
83
+ Let's begin by thinking about the current question.
 
84
  """) + "\nQuestion: {question}\n{scratchpad}"
85
 
86
 
 
88
  try:
89
  response = self.llm.text_generation(
90
  prompt,
91
+ max_new_tokens=512, # Consider increasing if logs show truncation of ReAct steps
92
  temperature=0.1,
93
  do_sample=True,
94
  )
 
106
  print(f"\nIteration {i+1}")
107
  llm_output = self.run_llm(current_prompt)
108
 
109
+ # ---- START: Added for debugging ----
110
+ print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---")
111
+ print(llm_output)
112
+ print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---")
113
+ # ---- END: Added for debugging ----
114
+
115
  if not llm_output:
116
  print("LLM returned empty or error, stopping.")
117
+ return "Agent could not determine an answer within the allowed steps." # Consistent failure message
118
 
119
  scratchpad += llm_output + "\n"
120
 
121
  all_final_answers = re.findall(r"Final Answer:\s*(.*)", llm_output, re.DOTALL | re.IGNORECASE)
122
  if all_final_answers:
123
+ answer = all_final_answers[-1].strip()
124
 
125
+ if "Thought:" in answer: answer = answer.split("Thought:")[0].strip()
126
+ if "Action:" in answer: answer = answer.split("Action:")[0].strip()
127
+ if "Observation:" in answer: answer = answer.split("Observation:")[0].strip()
128
+ if "Question:" in answer: answer = answer.split("Question:")[0].strip()
129
+
 
 
 
 
 
 
130
  inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
131
  if inner_final_answers:
132
  answer = inner_final_answers[-1].strip()
 
150
  print(f"Unknown tool: {tool_name}")
151
  scratchpad += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
152
  else:
153
+ # If the LLM output does not contain "Final Answer:" and also does not contain a valid "Action:",
154
+ # it means the LLM is likely just "thinking" or its output is malformed for ReAct.
155
+ # We add its output to scratchpad and let it try again in the next iteration.
156
+ print("No valid Action or Final Answer found in LLM output for this iteration. LLM might be thinking or output is malformed.")
157
+
158
 
159
  current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
160
 
 
208
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
209
  print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
210
  except Exception as e:
211
+ print(f"Error running agent on task {task_id}: {e}") # Log specific agent error
212
  results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
213
+ # Still add a payload so the task is marked as attempted, with an error message.
214
+ answers_payload.append({"task_id": task_id, "submitted_answer": "Agent execution error."})
215
+
216
 
217
  if not answers_payload:
218
  return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
 
241
  # --- Gradio Interface ---
242
  with gr.Blocks() as demo:
243
  gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
244
+ gr.Markdown( # Shortened for brevity, keep your detailed markdown
245
+ """
246
+ **Instructions & Disclaimers:**
247
+ Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt.
248
+ Check logs for RAW LLM OUTPUT for debugging.
249
+ """
250
+ )
251
  gr.LoginButton()
252
  run_button = gr.Button("Run Evaluation & Submit All Answers")
253
  status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
 
256
 
257
  if __name__ == "__main__":
258
  print("\n" + "-"*30 + " App Starting " + "-"*30)
 
259
  space_host_startup = os.getenv("SPACE_HOST")
260
  space_id_startup = os.getenv("SPACE_ID")
261
  if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
262
+ # else: print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
263
  if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")
264
+ # else: print("ℹ️ SPACE_ID environment variable not found (running locally?).")
265
+
266
  if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
267
  else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")
268
  print("-"*(60 + len(" App Starting ")) + "\n")