# Source: Hugging Face Space by mujtabarizvi — "Update app.py" (commit c1f3f5c, verified)
import os
import gradio as gr
import requests
import inspect
import pandas as pd
import re  # For parsing LLM output (Action/Final Answer extraction)

# --- HF Inference API for LLM ---
from huggingface_hub import InferenceClient

# Model served through the HF Inference API; used by ReActAgent.run_llm.
LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Initialize the shared LLM client at import time.  HF_TOKEN may be unset, in
# which case os.getenv returns None and the client runs unauthenticated.
# On failure llm_client is left as None; downstream code checks for this.
try:
    hf_token = os.getenv("HF_TOKEN")
    llm_client = InferenceClient(model=LLM_MODEL, token=hf_token)
except Exception as e:
    print(f"Error initializing InferenceClient: {e}")
    llm_client = None
# --- Tools ---
from duckduckgo_search import DDGS
def search_tool(query: str) -> str:
    """Searches the web with DuckDuckGo. Input: a plain-text search query. Returns up to 3 results formatted as Title/Snippet/URL lines, or a no-results/error message."""
    # NOTE: this docstring is user-visible — ReActAgent builds its tool list
    # for the LLM prompt via inspect.getdoc(), which previously returned None
    # because this function had no docstring.
    print(f"Tool: search_tool, Query: {query}")
    try:
        with DDGS() as ddgs:
            results = ddgs.text(query, max_results=3)  # Fewer results to be less verbose
            if results:
                return "\n".join([f"Title: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}" for r in results])
            else:
                return "No results found for your query. This might mean the query returned no relevant documents, or there could be a temporary issue (e.g., rate limit)."
    except Exception as e:
        # Errors are returned as strings so the agent loop can surface them
        # to the LLM as an Observation instead of crashing.
        print(f"Error in search_tool: {e}")
        return f"Error performing search: {str(e)}. This could be due to a network issue, an invalid query, or a rate limit."
def calculator_tool(expression: str) -> str:
    """Evaluates a basic math expression and returns the result as a string. Supports numbers, + - * / % . parentheses, sqrt(x) and pi. Input example: '5*5' or 'sqrt(16)'."""
    # NOTE: this docstring is user-visible — ReActAgent builds its tool list
    # for the LLM prompt via inspect.getdoc().
    print(f"Tool: calculator_tool, Expression: {expression}")
    try:
        # Character whitelist: digits, whitespace, arithmetic operators and
        # lowercase letters (for sqrt/pi).  Anything else (quotes, dunders,
        # brackets, ...) is rejected up front.  The original check could fall
        # through to eval() even when the regex failed; now it always rejects.
        if not re.match(r"^[0-9\s\+\-\*\/\(\)\.\%a-z]+$", expression):
            return f"Error: Invalid characters in expression. Only numbers, basic operators, sqrt, pi allowed. Expression: {expression}"
        # Controlled eval: empty __builtins__ plus an explicit name whitelist.
        allowed_names = {"sqrt": lambda x: x**0.5, "pi": 3.1415926535}  # Add more safe functions as needed
        code = compile(expression, "<string>", "eval")
        for name in code.co_names:
            # Reject ANY identifier outside the whitelist.  (The original also
            # consulted __builtins__, which both widened the allowed surface
            # and could raise TypeError when __builtins__ is a module.)
            if name not in allowed_names:
                raise NameError(f"Use of {name} is not allowed")
        result = eval(code, {"__builtins__": {}}, allowed_names)
        return str(result)
    except Exception as e:
        # Errors are returned as strings so the agent loop can surface them
        # to the LLM as an Observation instead of crashing.
        print(f"Error in calculator_tool: {e}")
        return f"Error calculating: {str(e)}. Ensure the expression is valid and uses allowed functions/operators."
# --- Agent Definition ---
class ReActAgent:
    """A minimal ReAct (Reason + Act) agent.

    Drives a Thought -> Action -> Observation loop: each iteration asks the
    LLM for a Thought and an Action, executes the requested tool locally,
    appends the tool output as an "Observation:" to a scratchpad, and
    re-prompts until the LLM emits "Final Answer:" or max_iterations runs out.
    """

    def __init__(self, llm_client, tools: dict, max_iterations: int = 7):
        # llm_client: a huggingface_hub.InferenceClient (must not be None).
        # tools: mapping of tool name -> callable taking and returning str.
        # max_iterations: upper bound on Thought/Action/Observation turns.
        print("ReActAgent initialized.")
        if llm_client is None:
            raise ValueError("LLM client not initialized.")
        self.llm = llm_client
        self.tools = tools
        self.max_iterations = max_iterations
        # Tool descriptions come from each tool's docstring via inspect.getdoc;
        # a tool without a docstring renders as "None" in the prompt.
        self.tool_descriptions = "\n".join([
            f"- {name}: {inspect.getdoc(func)}"
            for name, func in tools.items()
        ])
        self.tool_names = ", ".join(tools.keys())
        # Refined prompt for better tool usage and stopping.
        # This is an f-string: {self.tool_descriptions} is expanded now, while
        # {'{scratchpad}'} survives as a literal "{scratchpad}" placeholder
        # for str.format() in __call__.
        self.react_prompt_template = inspect.cleandoc(f"""
            You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process.
            Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones.
            You will proceed in a Thought, Action, Observation loop.
            1. First, provide a "Thought:" explaining your reasoning for the current question.
            2. Next, provide an "Action:".
            - If you need to search the web, use search_tool[query].
            - If you need to perform a calculation (e.g., arithmetic like 5*5, or math expressions), use calculator_tool[expression].
            - If no tool is needed for this immediate step based on your current thought and the information available, use "Action: None". Only use Action: None if you are certain no tool can help or is required for the current step.
            3. AFTER YOU PROVIDE THE ACTION, YOU MUST STOP. The system will then provide an "Observation:".
            4. Based on the "Observation:", you will continue with another "Thought:", followed by another "Action:" (and then STOP), or if you have enough information, a "Final Answer:".
            The final answer itself (the text after "Final Answer:") must be an EXACT, non-empty match to the correct response, without any extra explanations, apologies, or prefixes.
            Available tools:
            {self.tool_descriptions}
            Use the following format FOR THE CURRENT QUESTION ONLY:
            Question: the input question you must answer
            {'{scratchpad}'}
            Thought: [Your reasoning and plan for the current question. If continuing from an observation, reason about that observation.]
            Action: [search_tool[query_for_search] OR calculator_tool[math_expression_to_calculate] OR Action: None]. AFTER THIS, STOP.
            Observation: [The system will provide this. Do NOT generate this part.]
            Thought: [Your reasoning based on the previous observation.]
            Action: [Another action or Action: None]. AFTER THIS, STOP.
            Observation: [The system will provide this.]
            ... (Repeat Thought/Action/STOP/Observation as needed)
            Thought: I have sufficient information to answer the current question.
            Final Answer: [Provide ONLY the precise, non-empty answer. For example, if the question is "What is 2+2?", your Final Answer should be just "4".]
            Start your response for the current turn with "Thought:".
            """)  # Removed initial "Question: {question}" here, it's now part of the formatted prompt

    def run_llm(self, prompt: str) -> str:
        """Calls the LLM once (non-streaming); returns stripped text, or an error message string on failure."""
        try:
            # Stop sequences keep the model from fabricating its own
            # "Observation:" and from running past a "Final Answer:" header.
            # NOTE(review): whether the matched stop string is included in the
            # returned text is backend-dependent; the Final Answer parsing in
            # __call__ relies on "Final Answer:" surviving in the output —
            # confirm against the deployed inference backend.
            stop_tokens = [
                "\nObservation:", "Observation:",
                "\nFinal Answer:", "Final Answer:"
            ]
            response = self.llm.text_generation(
                prompt,
                max_new_tokens=350,
                temperature=0.05,  # Lowered further for more determinism
                do_sample=True,  # Important if temperature < 1.0
                stop=stop_tokens,  # Using `stop` as per FutureWarning
            )
            return response.strip()
        except Exception as e:
            print(f"Error during LLM call: {e}")
            return f"Error generating response: {str(e)}"

    def __call__(self, question: str) -> str:
        """Runs the ReAct loop for one question; returns the extracted Final Answer or a standard failure message."""
        print(f"ReActAgent received question (first 100 chars): {question[:100]}...")
        scratchpad_history = ""  # Accumulated Thought/Action/Observation text across turns
        for i in range(self.max_iterations):
            print(f"\nIteration {i+1}")
            # Construct the prompt for the LLM for the current turn.
            # The template has {scratchpad} in the middle; the LLM is primed
            # to start its generation with "Thought:".
            # NOTE(review): .split("Thought:")[0] cuts at the FIRST occurrence
            # of "Thought:", which appears early in the template's instruction
            # text (step 1), so most of the template (tool list, format
            # examples) is dropped from the prompt — confirm this is intended.
            current_prompt_base = self.react_prompt_template.format(scratchpad=scratchpad_history).split("Thought:")[0]
            current_prompt_text = f"Question: {question}\n" + current_prompt_base
            if not scratchpad_history:  # First turn
                current_prompt_text += "Thought:"  # Prime for the first thought
            else:  # Subsequent turns, scratchpad_history has previous T/A/O
                current_prompt_text += scratchpad_history + "\nThought:"  # Prime for next thought after observation
            print(f"--- PROMPT FOR LLM (Iteration {i+1}, last 300 chars) ---\n...{current_prompt_text[-300:]}\n--- END PROMPT ---")
            llm_output_this_turn = self.run_llm(current_prompt_text)
            print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---")
            print(llm_output_this_turn)
            print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---")
            if not llm_output_this_turn:
                print("LLM returned empty or error, stopping.")
                return "Agent could not determine an answer within the allowed steps."
            # Prepend "Thought:" if the LLM didn't include it (since the prompt
            # already ends with "Thought:", the model may start mid-sentence).
            # This keeps the scratchpad's T/A/O structure consistent.
            actual_llm_generation = llm_output_this_turn
            if not llm_output_this_turn.strip().startswith("Thought:") and \
            (scratchpad_history.strip().endswith("Observation:") or not scratchpad_history):
                actual_llm_generation = "Thought: " + llm_output_this_turn
            scratchpad_history += actual_llm_generation + "\n"
            # Check for "Final Answer:" in this turn's generation — the output
            # could be "Thought: ... Final Answer: ..." if no tool was needed.
            final_answer_segment = actual_llm_generation  # Check the full segment for Final Answer
            all_final_answers = re.findall(r"Final Answer:\s*(.*)", final_answer_segment, re.DOTALL | re.IGNORECASE)
            if all_final_answers:
                answer = all_final_answers[-1].strip()
                # Clean common contamination: strip any trailing sections the model tacked on
                if "Thought:" in answer: answer = answer.split("Thought:")[0].strip()
                if "Action:" in answer: answer = answer.split("Action:")[0].strip()
                if "Observation:" in answer: answer = answer.split("Observation:")[0].strip()
                if "Question:" in answer: answer = answer.split("Question:")[0].strip()
                # Handle a nested/repeated "Final Answer:" inside the captured text
                inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
                if inner_final_answers: answer = inner_final_answers[-1].strip()
                if answer:  # Only if the answer is not empty after cleaning
                    print(f"Found and extracted Final Answer: '{answer}'")
                    return answer
                else:
                    print("LLM produced 'Final Answer:' but the content was empty or invalid after cleaning. Continuing.")
                    # Scratchpad already has this turn's problematic output. Loop continues.
            # Parse Action from this turn's generation: either tool_name[input]
            # or a literal "Action: None".
            action_segment = actual_llm_generation  # Check the full segment for Action
            action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", action_segment, re.DOTALL)
            action_none_match = re.search(r"Action:\s*None", action_segment, re.IGNORECASE)
            if action_match:
                tool_name = action_match.group(1).strip()
                tool_input = action_match.group(2).strip()
                if tool_name in self.tools:
                    print(f"Executing Tool: {tool_name}, Input: {tool_input}")
                    try:
                        observation_content = self.tools[tool_name](tool_input)
                    except Exception as e:
                        # Tool failures become Observations so the LLM can react.
                        observation_content = f"Error executing tool {tool_name}: {e}"
                    print(f"Observation content: {observation_content[:200]}...")
                    scratchpad_history += f"Observation: {observation_content}\n"
                else:
                    print(f"Unknown tool: {tool_name}")
                    scratchpad_history += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
            elif action_none_match:
                print("Action: None detected.")
                scratchpad_history += f"Observation: No action taken, proceeding with reasoning.\n"
            else:
                print("No valid Action (tool use or None) found in LLM output for this turn. LLM might be thinking or its format is off.")
                # If the LLM is supposed to always output an Action (even None)
                # but doesn't, it's a deviation.  We add a corrective
                # observation to try and get it back on track (this can happen
                # if it only outputs a Thought).
                scratchpad_history += "Observation: LLM did not provide an Action in the expected format. Please provide a Thought and then an Action (or Action: None).\n"
        # Loop exhausted without producing a non-empty Final Answer.
        print(f"Max iterations reached for question (first 50 chars): {question[:50]}...")
        standard_failure_message = "Agent could not determine an answer within the allowed steps."
        print(f"Returning standard failure message: {standard_failure_message}")
        return standard_failure_message
# --- Constants ---
# Base URL of the scoring service (questions are fetched from /questions,
# answers submitted to /submit).
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# --- Main Execution Logic ---
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Fetches all questions from the scoring API, runs the ReActAgent on each, and submits the answers.

    Args:
        profile: Gradio OAuth profile of the logged-in user (None when not logged in).

    Returns:
        A (status_message, results_dataframe_or_None) tuple bound to the two
        Gradio output components.
    """
    space_id = os.getenv("SPACE_ID")
    if profile:
        username = f"{profile.username}"
    else:
        return "Please Login to Hugging Face with the button.", None
    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"
    # 1. Instantiate the agent (fail fast if the LLM client never initialized).
    try:
        available_tools = {"search_tool": search_tool, "calculator_tool": calculator_tool}
        if llm_client is None:
            return "LLM Client could not be initialized. Check logs and HF_TOKEN.", None
        agent = ReActAgent(llm_client=llm_client, tools=available_tools)
    except Exception as e:
        return f"Error initializing agent: {e}", None
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Code not available (SPACE_ID not set)"
    # 2. Fetch the question set.
    try:
        response = requests.get(questions_url, timeout=20)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            return "Fetched questions list is empty or invalid format.", None
    except Exception as e:
        return f"Error fetching questions: {e}", None
    # 3. Run the agent on every question, collecting both the submission
    #    payload and a human-readable results log.
    results_log, answers_payload = [], []
    for item in questions_data:
        task_id, question_text = item.get("task_id"), item.get("question")
        if not task_id or question_text is None:
            continue
        try:
            print(f"\n--- Processing Task ID: {task_id}, Question: {question_text[:100]}... ---")
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
            print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
        except Exception as e:
            # One failing task must not abort the whole run.
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
            answers_payload.append({"task_id": task_id, "submitted_answer": "Agent execution error."})
    if not answers_payload:
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
    # 4. Submit the collected answers.
    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
    try:
        response = requests.post(submit_url, json=submission_data, timeout=120)
        response.raise_for_status()
        result_data = response.json()
        # BUG FIX: this was previously assigned to `final__status` while the
        # return statement referenced `final_status`, raising NameError on
        # every successful submission (masked by the broad except below).
        final_status = (
            f"Submission Successful!\nUser: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        return final_status, pd.DataFrame(results_log)
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_detail += f" Detail: {e.response.json().get('detail', e.response.text)}"
        except Exception:  # response body may not be JSON
            error_detail += f" Response: {e.response.text[:500]}"
        return f"Submission Failed: {error_detail}", pd.DataFrame(results_log)
    except Exception as e:
        return f"An unexpected error occurred during submission: {e}", pd.DataFrame(results_log)
# --- Gradio Interface ---
# Gradio UI: login button plus a single "run everything" button whose handler
# (run_and_submit_all) receives the OAuth profile implicitly and writes to the
# status textbox and results table.
with gr.Blocks() as demo:
    gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
    gr.Markdown(
        """
**Instructions & Disclaimers:**
Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt with stop sequences.
Check logs for RAW LLM OUTPUT and PROMPT FOR LLM for debugging.
"""
    )
    gr.LoginButton()
    run_button = gr.Button("Run Evaluation & Submit All Answers")
    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
    # No explicit inputs: Gradio injects the gr.OAuthProfile argument for the
    # handler based on its type annotation.
    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
if __name__ == "__main__":
    # Startup diagnostics: report which Space env vars are present and whether
    # the LLM client initialized, then launch the Gradio app.
    print("\n" + "-"*30 + " App Starting " + "-"*30)
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")
    if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
    if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")
    if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
    else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")
    print("-"*(60 + len(" App Starting ")) + "\n")
    demo.launch(debug=True, share=False)