Update app.py
app.py CHANGED
@@ -44,7 +44,7 @@ def calculator_tool(expression: str) -> str:
 
 # --- Agent Definition ---
 class ReActAgent:
-    def __init__(self, llm_client, tools: dict, max_iterations=7):
+    def __init__(self, llm_client, tools: dict, max_iterations=7):  # Iteration 1 for T/A, iteration 2 for T/FA minimum
         print("ReActAgent initialized.")
         if llm_client is None:
             raise ValueError("LLM client not initialized.")
@@ -58,14 +58,15 @@ class ReActAgent:
         ])
         self.tool_names = ", ".join(tools.keys())
 
-        # Further strengthened ReAct prompt
         self.react_prompt_template = inspect.cleandoc(f"""
         You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process.
         Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones.
-
-
-
-
+
+        You will proceed in a Thought, Action, Observation loop.
+        1. First, provide a "Thought:" explaining your reasoning for the current question.
+        2. Next, provide an "Action:". This can be using a tool (e.g., search_tool[query]) or "Action: None" if no tool is needed for this step.
+        3. AFTER YOU PROVIDE THE ACTION, YOU MUST STOP. The system will then provide an "Observation:".
+        4. Based on the "Observation:", you will continue with another "Thought:", followed by another "Action:" (and then STOP), or, if you have enough information, a "Final Answer:".
 
         The final answer itself (the text after "Final Answer:") must be an EXACT match to the correct response, without any extra explanations, apologies, or prefixes.
 
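For reference, a hypothetical exchange under the loop described in this hunk (tool names taken from the diff; the Observation line is supplied by the system per instruction 3) might look like:

Question: What is 2+2?
Thought: This is simple arithmetic, but I should verify it with the calculator tool.
Action: calculator_tool[2+2]
Observation: 4
Thought: I have sufficient information to answer the current question.
Final Answer: 4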
@@ -74,23 +75,42 @@ class ReActAgent:
 
         Use the following format FOR THE CURRENT QUESTION ONLY:
         Question: the input question you must answer
-        Thought: Your reasoning and plan for the current question. This MUST be your first step.
-        Action: The action to take. Choose from [{self.tool_names}] with input in brackets (e.g., search_tool[query]), or use "Action: None" if no tool is needed for this immediate step. This MUST follow your Thought.
-        Observation: The result of the action. If Action was None, state "Observation: No action taken, proceeding with reasoning." or similar. This MUST follow your Action.
-        Thought: Further reasoning based on the observation or your initial thought process. You may loop through Thought/Action/Observation multiple times.
-        Final Answer: [Provide ONLY the precise answer to the CURRENT question here. For example, if the question is "What is 2+2?", the Final Answer should be just "4". Use this ONLY when all reasoning is complete and you are certain of the answer.]
 
-
+        Thought: Your reasoning and plan for the current question.
+        Action: The action to take (e.g., search_tool[query] or calculator_tool[expression] or Action: None). AFTER THIS, STOP.
+        Observation: [The system will provide this. Do NOT generate this part.]
+        Thought: Your reasoning based on the previous observation.
+        Action: (Another action, or Action: None). AFTER THIS, STOP.
+        Observation: [The system will provide this. Do NOT generate this part.]
+        ... (Repeat Thought/Action/STOP/Observation as needed)
+        Thought: I have sufficient information to answer the current question.
+        Final Answer: [Provide ONLY the precise answer. For example, if the question is "What is 2+2?", your Final Answer should be just "4".]
+
+        Let's begin with the current question.
         """) + "\nQuestion: {question}\n{scratchpad}"
 
 
     def run_llm(self, prompt: str) -> str:
         try:
+            # Define stop sequences to make the LLM pause after an Action
+            # or when it's about to give a Final Answer.
+            stop_sequences = [
+                "\nObservation:", "Observation:",
+                # "\nThought:",  # Removed as a primary stop; the LLM should produce Thought then Action.
+                #                # If it stops at Thought, it means it didn't reach an Action.
+                "\nFinal Answer:", "Final Answer:"
+            ]
+            # Adding "\nThought:" as a stop might be too aggressive if the LLM wants to write a thought
+            # *before* an action in its first turn. The prompt guides it to do T then A.
+            # The main goal is to stop it *before* it hallucinates an Observation.
+
             response = self.llm.text_generation(
                 prompt,
-                max_new_tokens=512,
+                max_new_tokens=350,  # Reduced slightly, as each turn should be shorter. Was 512.
                 temperature=0.1,
                 do_sample=True,
+                stop_sequences=stop_sequences,  # Key addition
+                # return_full_text=False  # Ensure this is False (or the default) if supported, so the prompt is not echoed back.
             )
             return response.strip()
         except Exception as e:
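A minimal standalone sketch of the run_llm change above, assuming the huggingface_hub InferenceClient that the surrounding code appears to use (the model name is an assumption based on the "uses Mixtral" note later in the diff):

from huggingface_hub import InferenceClient

# Assumed setup; app.py builds its llm_client elsewhere.
client = InferenceClient(model="mistralai/Mixtral-8x7B-Instruct-v0.1")

def run_llm_sketch(prompt: str) -> str:
    # Generation halts as soon as a stop sequence is produced, so the model
    # cannot run past its Action and hallucinate its own "Observation:" line.
    return client.text_generation(
        prompt,
        max_new_tokens=350,
        temperature=0.1,
        do_sample=True,
        stop_sequences=["\nObservation:", "Observation:", "\nFinal Answer:", "Final Answer:"],
    ).strip()

One design caveat: depending on whether the serving backend trims the stop sequence from the returned text, stopping on "Final Answer:" can cut generation off before the answer text itself is emitted, leaving the Final Answer regex below with nothing to capture.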
@@ -100,63 +120,89 @@ class ReActAgent:
     def __call__(self, question: str) -> str:
         print(f"ReActAgent received question (first 100 chars): {question[:100]}...")
         scratchpad = ""
-
-
+
         for i in range(self.max_iterations):
             print(f"\nIteration {i+1}")
+            current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
+
             llm_output = self.run_llm(current_prompt)
 
-            # ---- START: Added for debugging ----
             print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---")
             print(llm_output)
             print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---")
-            # ---- END: Added for debugging ----
 
             if not llm_output:
                 print("LLM returned empty or error, stopping.")
-                return "Agent could not determine an answer within the allowed steps."
-
-
-
+                return "Agent could not determine an answer within the allowed steps."
+
+            # Append only the LLM's actual generation for this turn to the scratchpad.
+            # If llm_output includes a stop sequence like "Observation:", we might not want to add that part yet.
+            # However, the prompt structure expects the scratchpad to be a coherent dialogue.
+            # Let's add the raw llm_output; the observation will then be added explicitly.
+            # Check if llm_output ends with a stop sequence and trim if necessary before adding to the scratchpad,
+            # or ensure the next parts of the logic handle it.
+            # For now, add the raw output. The next prompt will contain it.
+            # The key is that the *next* part of the scratchpad will be a *real* observation.
+
+            # If the LLM output already contains "Final Answer:", extract and return it.
             all_final_answers = re.findall(r"Final Answer:\s*(.*)", llm_output, re.DOTALL | re.IGNORECASE)
             if all_final_answers:
                 answer = all_final_answers[-1].strip()
-
                 if "Thought:" in answer: answer = answer.split("Thought:")[0].strip()
                 if "Action:" in answer: answer = answer.split("Action:")[0].strip()
                 if "Observation:" in answer: answer = answer.split("Observation:")[0].strip()
                 if "Question:" in answer: answer = answer.split("Question:")[0].strip()
-
                 inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
-                if inner_final_answers:
-                    answer = inner_final_answers[-1].strip()
+                if inner_final_answers: answer = inner_final_answers[-1].strip()
 
-                print(f"Found and extracted Final Answer: '{answer}'")
+                print(f"Found and extracted Final Answer from LLM output: '{answer}'")
+                scratchpad += llm_output + "\n"  # Add the final thought/answer block
                 return answer
 
+            # If there is no Final Answer, add the current llm_output (Thought & Action) to the scratchpad.
+            scratchpad += llm_output  # LLM output should be Thought \n Action
+            if not llm_output.endswith("\n"):
+                scratchpad += "\n"
+
+
+            # Parse the Action from the LLM's *current* output.
             action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", llm_output, re.DOTALL)
+            action_none_match = re.search(r"Action:\s*None", llm_output, re.IGNORECASE)
+
             if action_match:
                 tool_name = action_match.group(1).strip()
                 tool_input = action_match.group(2).strip()
                 if tool_name in self.tools:
                     print(f"Executing Tool: {tool_name}, Input: {tool_input}")
                     try:
-
+                        observation_content = self.tools[tool_name](tool_input)
                     except Exception as e:
-
+                        observation_content = f"Error executing tool {tool_name}: {e}"
-                    print(f"Observation: {
-                    scratchpad += f"Observation: {
+                    print(f"Observation content: {observation_content[:200]}...")
+                    scratchpad += f"Observation: {observation_content}\n"
                 else:
                     print(f"Unknown tool: {tool_name}")
                     scratchpad += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
+            elif action_none_match:
+                print("Action: None detected.")
+                scratchpad += f"Observation: No action taken, proceeding with reasoning.\n"
             else:
-                #
-                # it
-
-
-
-
-
+                # The LLM didn't output a valid Action or "Final Answer:". It might be just a "Thought:",
+                # or it might be malformed output. Let the loop continue; it will use this partial output in the next prompt.
+                print("No valid Action (tool use or None) or Final Answer found in LLM output for this iteration. LLM might be thinking or output is malformed.")
+                # If it's just a thought, the scratchpad has it; the next iteration will prompt with it.
+                # If no action and no final answer, we might want to consider it a failed step if it persists.
+                # For now, we assume the LLM might be in a multi-step thought process not requiring immediate action.
+                # However, the prompt now *requires* an Action (even "Action: None").
+                # So, if we reach here, the LLM is not perfectly following the format.
+                # We might add a generic "Observation: LLM did not provide a valid action." to prompt for recovery.
+                # This is less critical if the stop sequences work well.
+                # If the LLM stops generating *before* an action, this branch will also be hit.
+                # The raw LLM output log will be key here.
+                if not llm_output.strip().startswith("Thought:"):  # If it's not even a thought, it's very off.
+                    scratchpad += "Observation: LLM output was not a valid Thought/Action or Final Answer. Please try again adhering to the format.\n"
+
+            # current_prompt for the next iteration is rebuilt at the top of the loop.
 
         print(f"Max iterations reached for question (first 50 chars): {question[:50]}...")
         standard_failure_message = "Agent could not determine an answer within the allowed steps."
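The two regexes in this hunk can be sanity-checked in isolation. A small runnable sketch (the sample strings are invented for illustration):

import re

sample = "Thought: I need the population of Paris.\nAction: search_tool[population of Paris]"
m = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", sample, re.DOTALL)
print(m.group(1), "|", m.group(2))  # -> search_tool | population of Paris

final = "Thought: Done.\nFinal Answer: 2.1 million"
answers = re.findall(r"Final Answer:\s*(.*)", final, re.DOTALL | re.IGNORECASE)
print(answers[-1].strip())  # -> 2.1 million (re.DOTALL lets (.*) capture across newlines)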
@@ -208,9 +254,8 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
             print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
         except Exception as e:
-            print(f"Error running agent on task {task_id}: {e}")
+            print(f"Error running agent on task {task_id}: {e}")
             results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
-            # Still add a payload so the task is marked as attempted, with an error message.
             answers_payload.append({"task_id": task_id, "submitted_answer": "Agent execution error."})
 
 
@@ -241,10 +286,10 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
 # --- Gradio Interface ---
 with gr.Blocks() as demo:
     gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
-    gr.Markdown(
+    gr.Markdown(
         """
         **Instructions & Disclaimers:**
-        Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt.
+        Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt with stop sequences.
         Check logs for RAW LLM OUTPUT for debugging.
         """
     )
@@ -259,9 +304,7 @@ if __name__ == "__main__":
     space_host_startup = os.getenv("SPACE_HOST")
     space_id_startup = os.getenv("SPACE_ID")
     if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
-    # else: print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
     if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")
-    # else: print("ℹ️ SPACE_ID environment variable not found (running locally?).")
 
     if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
     else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")