Update app.py
Browse files
app.py
CHANGED
|
@@ -8,7 +8,7 @@ import re # For parsing LLM output
|
|
| 8 |
# --- HF Inference API for LLM ---
|
| 9 |
from huggingface_hub import InferenceClient
|
| 10 |
|
| 11 |
-
LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"
|
| 12 |
|
| 13 |
try:
|
| 14 |
hf_token = os.getenv("HF_TOKEN")
|
|
@@ -58,25 +58,29 @@ class ReActAgent:
|
|
| 58 |
])
|
| 59 |
self.tool_names = ", ".join(tools.keys())
|
| 60 |
|
|
|
|
| 61 |
self.react_prompt_template = inspect.cleandoc(f"""
|
| 62 |
-
You are a helpful AI assistant. Your goal is to answer the CURRENT question accurately.
|
| 63 |
-
Focus ONLY on the provided "Question:". Do not generate new questions or
|
| 64 |
-
You
|
| 65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
|
| 67 |
Available tools:
|
| 68 |
{self.tool_descriptions}
|
| 69 |
|
| 70 |
Use the following format FOR THE CURRENT QUESTION ONLY:
|
| 71 |
Question: the input question you must answer
|
| 72 |
-
Thought: Your reasoning and plan for the current question.
|
| 73 |
-
Action: The action to take
|
| 74 |
-
Observation: The result of the action
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
Let's begin with the current question.
|
| 80 |
""") + "\nQuestion: {question}\n{scratchpad}"
|
| 81 |
|
| 82 |
|
|
@@ -84,7 +88,7 @@ class ReActAgent:
|
|
| 84 |
try:
|
| 85 |
response = self.llm.text_generation(
|
| 86 |
prompt,
|
| 87 |
-
max_new_tokens=512,
|
| 88 |
temperature=0.1,
|
| 89 |
do_sample=True,
|
| 90 |
)
|
|
@@ -102,27 +106,27 @@ class ReActAgent:
|
|
| 102 |
print(f"\nIteration {i+1}")
|
| 103 |
llm_output = self.run_llm(current_prompt)
|
| 104 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
if not llm_output:
|
| 106 |
print("LLM returned empty or error, stopping.")
|
| 107 |
-
return "Agent
|
| 108 |
|
| 109 |
scratchpad += llm_output + "\n"
|
| 110 |
|
| 111 |
all_final_answers = re.findall(r"Final Answer:\s*(.*)", llm_output, re.DOTALL | re.IGNORECASE)
|
| 112 |
if all_final_answers:
|
| 113 |
-
answer = all_final_answers[-1].strip()
|
| 114 |
|
| 115 |
-
|
| 116 |
-
if "
|
| 117 |
-
|
| 118 |
-
if "
|
| 119 |
-
|
| 120 |
-
if "Observation:" in answer:
|
| 121 |
-
answer = answer.split("Observation:")[0].strip()
|
| 122 |
-
if "Question:" in answer: # If it starts generating a new question within the answer
|
| 123 |
-
answer = answer.split("Question:")[0].strip()
|
| 124 |
-
|
| 125 |
-
# Handle nested "Final Answer:" in the extracted part
|
| 126 |
inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
|
| 127 |
if inner_final_answers:
|
| 128 |
answer = inner_final_answers[-1].strip()
|
|
@@ -146,7 +150,11 @@ class ReActAgent:
|
|
| 146 |
print(f"Unknown tool: {tool_name}")
|
| 147 |
scratchpad += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
|
| 148 |
else:
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 150 |
|
| 151 |
current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
|
| 152 |
|
|
@@ -200,7 +208,11 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 200 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 201 |
print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
|
| 202 |
except Exception as e:
|
|
|
|
| 203 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
if not answers_payload:
|
| 206 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
|
@@ -229,7 +241,13 @@ def run_and_submit_all(profile: gr.OAuthProfile | None):
|
|
| 229 |
# --- Gradio Interface ---
|
| 230 |
with gr.Blocks() as demo:
|
| 231 |
gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
|
| 232 |
-
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
gr.LoginButton()
|
| 234 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 235 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
|
@@ -238,11 +256,13 @@ with gr.Blocks() as demo:
|
|
| 238 |
|
| 239 |
if __name__ == "__main__":
|
| 240 |
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
| 241 |
-
# Startup messages (space_host, space_id, llm_client status)
|
| 242 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 243 |
space_id_startup = os.getenv("SPACE_ID")
|
| 244 |
if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
|
|
|
|
| 245 |
if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")
|
|
|
|
|
|
|
| 246 |
if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
|
| 247 |
else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")
|
| 248 |
print("-"*(60 + len(" App Starting ")) + "\n")
|
|
|
|
| 8 |
# --- HF Inference API for LLM ---
|
| 9 |
from huggingface_hub import InferenceClient
|
| 10 |
|
| 11 |
+
LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1" # Using Mixtral
|
| 12 |
|
| 13 |
try:
|
| 14 |
hf_token = os.getenv("HF_TOKEN")
|
|
|
|
| 58 |
])
|
| 59 |
self.tool_names = ", ".join(tools.keys())
|
| 60 |
|
| 61 |
+
# Further strengthened ReAct prompt
|
| 62 |
self.react_prompt_template = inspect.cleandoc(f"""
|
| 63 |
+
You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process.
|
| 64 |
+
Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones.
|
| 65 |
+
You MUST begin your response with a "Thought:" section, outlining your plan.
|
| 66 |
+
Following the "Thought:", you MUST specify an "Action:". This action should be one of the available tools (e.g., search_tool[query]) or "Action: None" if no tool is immediately necessary based on your thought.
|
| 67 |
+
Only AFTER an "Action:" and its corresponding "Observation:" (or if "Action: None" was used, after further "Thought:") can you consider providing a "Final Answer:".
|
| 68 |
+
DO NOT output "Final Answer:" as your very first step or before going through at least one Thought/Action/Observation cycle if tools or reasoning are required.
|
| 69 |
+
|
| 70 |
+
The final answer itself (the text after "Final Answer:") must be an EXACT match to the correct response, without any extra explanations, apologies, or prefixes.
|
| 71 |
|
| 72 |
Available tools:
|
| 73 |
{self.tool_descriptions}
|
| 74 |
|
| 75 |
Use the following format FOR THE CURRENT QUESTION ONLY:
|
| 76 |
Question: the input question you must answer
|
| 77 |
+
Thought: Your reasoning and plan for the current question. This MUST be your first step.
|
| 78 |
+
Action: The action to take. Choose from [{self.tool_names}] with input in brackets (e.g., search_tool[query]), or use "Action: None" if no tool is needed for this immediate step. This MUST follow your Thought.
|
| 79 |
+
Observation: The result of the action. If Action was None, state "Observation: No action taken, proceeding with reasoning." or similar. This MUST follow your Action.
|
| 80 |
+
Thought: Further reasoning based on the observation or your initial thought process. You may loop through Thought/Action/Observation multiple times.
|
| 81 |
+
Final Answer: [Provide ONLY the precise answer to the CURRENT question here. For example, if the question is "What is 2+2?", the Final Answer should be just "4". Use this ONLY when all reasoning is complete and you are certain of the answer.]
|
| 82 |
+
|
| 83 |
+
Let's begin by thinking about the current question.
|
|
|
|
| 84 |
""") + "\nQuestion: {question}\n{scratchpad}"
|
| 85 |
|
| 86 |
|
|
|
|
| 88 |
try:
|
| 89 |
response = self.llm.text_generation(
|
| 90 |
prompt,
|
| 91 |
+
max_new_tokens=512, # Consider increasing if logs show truncation of ReAct steps
|
| 92 |
temperature=0.1,
|
| 93 |
do_sample=True,
|
| 94 |
)
|
|
|
|
| 106 |
print(f"\nIteration {i+1}")
|
| 107 |
llm_output = self.run_llm(current_prompt)
|
| 108 |
|
| 109 |
+
# ---- START: Added for debugging ----
|
| 110 |
+
print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---")
|
| 111 |
+
print(llm_output)
|
| 112 |
+
print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---")
|
| 113 |
+
# ---- END: Added for debugging ----
|
| 114 |
+
|
| 115 |
if not llm_output:
|
| 116 |
print("LLM returned empty or error, stopping.")
|
| 117 |
+
return "Agent could not determine an answer within the allowed steps." # Consistent failure message
|
| 118 |
|
| 119 |
scratchpad += llm_output + "\n"
|
| 120 |
|
| 121 |
all_final_answers = re.findall(r"Final Answer:\s*(.*)", llm_output, re.DOTALL | re.IGNORECASE)
|
| 122 |
if all_final_answers:
|
| 123 |
+
answer = all_final_answers[-1].strip()
|
| 124 |
|
| 125 |
+
if "Thought:" in answer: answer = answer.split("Thought:")[0].strip()
|
| 126 |
+
if "Action:" in answer: answer = answer.split("Action:")[0].strip()
|
| 127 |
+
if "Observation:" in answer: answer = answer.split("Observation:")[0].strip()
|
| 128 |
+
if "Question:" in answer: answer = answer.split("Question:")[0].strip()
|
| 129 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
|
| 131 |
if inner_final_answers:
|
| 132 |
answer = inner_final_answers[-1].strip()
|
|
|
|
| 150 |
print(f"Unknown tool: {tool_name}")
|
| 151 |
scratchpad += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
|
| 152 |
else:
|
| 153 |
+
# If the LLM output does not contain "Final Answer:" and also does not contain a valid "Action:",
|
| 154 |
+
# it means the LLM is likely just "thinking" or its output is malformed for ReAct.
|
| 155 |
+
# We add its output to scratchpad and let it try again in the next iteration.
|
| 156 |
+
print("No valid Action or Final Answer found in LLM output for this iteration. LLM might be thinking or output is malformed.")
|
| 157 |
+
|
| 158 |
|
| 159 |
current_prompt = self.react_prompt_template.format(question=question, scratchpad=scratchpad)
|
| 160 |
|
|
|
|
| 208 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
|
| 209 |
print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
|
| 210 |
except Exception as e:
|
| 211 |
+
print(f"Error running agent on task {task_id}: {e}") # Log specific agent error
|
| 212 |
results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
|
| 213 |
+
# Still add a payload so the task is marked as attempted, with an error message.
|
| 214 |
+
answers_payload.append({"task_id": task_id, "submitted_answer": "Agent execution error."})
|
| 215 |
+
|
| 216 |
|
| 217 |
if not answers_payload:
|
| 218 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
|
|
|
| 241 |
# --- Gradio Interface ---
|
| 242 |
with gr.Blocks() as demo:
|
| 243 |
gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
|
| 244 |
+
gr.Markdown( # Shortened for brevity, keep your detailed markdown
|
| 245 |
+
"""
|
| 246 |
+
**Instructions & Disclaimers:**
|
| 247 |
+
Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt.
|
| 248 |
+
Check logs for RAW LLM OUTPUT for debugging.
|
| 249 |
+
"""
|
| 250 |
+
)
|
| 251 |
gr.LoginButton()
|
| 252 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 253 |
status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
|
|
|
|
| 256 |
|
| 257 |
if __name__ == "__main__":
|
| 258 |
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
|
|
|
| 259 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 260 |
space_id_startup = os.getenv("SPACE_ID")
|
| 261 |
if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
|
| 262 |
+
# else: print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
|
| 263 |
if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")
|
| 264 |
+
# else: print("ℹ️ SPACE_ID environment variable not found (running locally?).")
|
| 265 |
+
|
| 266 |
if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
|
| 267 |
else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")
|
| 268 |
print("-"*(60 + len(" App Starting ")) + "\n")
|