import os import gradio as gr import requests import inspect import pandas as pd import re # For parsing LLM output # --- HF Inference API for LLM --- from huggingface_hub import InferenceClient LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1" try: hf_token = os.getenv("HF_TOKEN") llm_client = InferenceClient(model=LLM_MODEL, token=hf_token) except Exception as e: print(f"Error initializing InferenceClient: {e}") llm_client = None # --- Tools --- from duckduckgo_search import DDGS def search_tool(query: str) -> str: print(f"Tool: search_tool, Query: {query}") try: with DDGS() as ddgs: results = ddgs.text(query, max_results=3) # Fewer results to be less verbose if results: return "\n".join([f"Title: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}" for r in results]) else: return "No results found for your query. This might mean the query returned no relevant documents, or there could be a temporary issue (e.g., rate limit)." except Exception as e: print(f"Error in search_tool: {e}") return f"Error performing search: {str(e)}. This could be due to a network issue, an invalid query, or a rate limit." def calculator_tool(expression: str) -> str: print(f"Tool: calculator_tool, Expression: {expression}") try: # Basic check for safety, though a proper parser is better for production if not re.match(r"^[0-9\s\+\-\*\/\(\)\.\%sqrtpijabsindcostanlog]+$", expression): # Add more functions as needed, e.g. math.sqrt, math.pi etc. # For simplicity, we are keeping a limited set here. if expression not in ["pi", "sqrt"] and not any(op in expression for op in ['+', '-', '*', '/']): return f"Error: Invalid characters in expression. Only numbers, basic operators, sqrt, pi allowed. Expression: {expression}" # Using a more controlled eval allowed_names = {"sqrt": lambda x: x**0.5, "pi": 3.1415926535} # Add more safe functions code = compile(expression, "", "eval") for name in code.co_names: if name not in allowed_names and name not in __builtins__: raise NameError(f"Use of {name} is not allowed") result = eval(code, {"__builtins__": {}}, allowed_names) return str(result) except Exception as e: print(f"Error in calculator_tool: {e}") return f"Error calculating: {str(e)}. Ensure the expression is valid and uses allowed functions/operators." # --- Agent Definition --- class ReActAgent: def __init__(self, llm_client, tools: dict, max_iterations=7): print("ReActAgent initialized.") if llm_client is None: raise ValueError("LLM client not initialized.") self.llm = llm_client self.tools = tools self.max_iterations = max_iterations self.tool_descriptions = "\n".join([ f"- {name}: {inspect.getdoc(func)}" for name, func in tools.items() ]) self.tool_names = ", ".join(tools.keys()) # Refined prompt for better tool usage and stopping self.react_prompt_template = inspect.cleandoc(f""" You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process. Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones. You will proceed in a Thought, Action, Observation loop. 1. First, provide a "Thought:" explaining your reasoning for the current question. 2. Next, provide an "Action:". - If you need to search the web, use search_tool[query]. - If you need to perform a calculation (e.g., arithmetic like 5*5, or math expressions), use calculator_tool[expression]. - If no tool is needed for this immediate step based on your current thought and the information available, use "Action: None". Only use Action: None if you are certain no tool can help or is required for the current step. 3. AFTER YOU PROVIDE THE ACTION, YOU MUST STOP. The system will then provide an "Observation:". 4. Based on the "Observation:", you will continue with another "Thought:", followed by another "Action:" (and then STOP), or if you have enough information, a "Final Answer:". The final answer itself (the text after "Final Answer:") must be an EXACT, non-empty match to the correct response, without any extra explanations, apologies, or prefixes. Available tools: {self.tool_descriptions} Use the following format FOR THE CURRENT QUESTION ONLY: Question: the input question you must answer {'{scratchpad}'} Thought: [Your reasoning and plan for the current question. If continuing from an observation, reason about that observation.] Action: [search_tool[query_for_search] OR calculator_tool[math_expression_to_calculate] OR Action: None]. AFTER THIS, STOP. Observation: [The system will provide this. Do NOT generate this part.] Thought: [Your reasoning based on the previous observation.] Action: [Another action or Action: None]. AFTER THIS, STOP. Observation: [The system will provide this.] ... (Repeat Thought/Action/STOP/Observation as needed) Thought: I have sufficient information to answer the current question. Final Answer: [Provide ONLY the precise, non-empty answer. For example, if the question is "What is 2+2?", your Final Answer should be just "4".] Start your response for the current turn with "Thought:". """) # Removed initial "Question: {question}" here, it's now part of the formatted prompt def run_llm(self, prompt: str) -> str: try: stop_tokens = [ "\nObservation:", "Observation:", "\nFinal Answer:", "Final Answer:" ] response = self.llm.text_generation( prompt, max_new_tokens=350, temperature=0.05, # Lowered further for more determinism do_sample=True, # Important if temperature < 1.0 stop=stop_tokens, # Using `stop` as per FutureWarning ) return response.strip() except Exception as e: print(f"Error during LLM call: {e}") return f"Error generating response: {str(e)}" def __call__(self, question: str) -> str: print(f"ReActAgent received question (first 100 chars): {question[:100]}...") scratchpad_history = "" for i in range(self.max_iterations): print(f"\nIteration {i+1}") # Construct the prompt for the LLM for the current turn # The template now has {scratchpad} in the middle, then format instructions, then prompts for Thought/Action. # We ensure the LLM starts its generation with a Thought. # The initial prompt will be the template + Question + "Thought:" # Subsequent prompts will be template + Question + scratchpad_history + "Thought:" # The main instruction block, question, and current scratchpad history current_prompt_base = self.react_prompt_template.format(scratchpad=scratchpad_history).split("Thought:")[0] current_prompt_text = f"Question: {question}\n" + current_prompt_base if not scratchpad_history: # First turn current_prompt_text += "Thought:" # Prime for the first thought else: # Subsequent turns, scratchpad_history has previous T/A/O current_prompt_text += scratchpad_history + "\nThought:" # Prime for next thought after observation print(f"--- PROMPT FOR LLM (Iteration {i+1}, last 300 chars) ---\n...{current_prompt_text[-300:]}\n--- END PROMPT ---") llm_output_this_turn = self.run_llm(current_prompt_text) print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---") print(llm_output_this_turn) print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---") if not llm_output_this_turn: print("LLM returned empty or error, stopping.") return "Agent could not determine an answer within the allowed steps." # Prepend "Thought:" if LLM didn't include it (due to priming) # This ensures scratchpad consistency if the LLM directly starts with the thought content. actual_llm_generation = llm_output_this_turn if not llm_output_this_turn.strip().startswith("Thought:") and \ (scratchpad_history.strip().endswith("Observation:") or not scratchpad_history): actual_llm_generation = "Thought: " + llm_output_this_turn scratchpad_history += actual_llm_generation + "\n" # Check for Final Answer in the llm_output_this_turn # The llm_output_this_turn could be "Thought: ... Final Answer: ..." if no tool was needed. final_answer_segment = actual_llm_generation # Check the full segment for Final Answer all_final_answers = re.findall(r"Final Answer:\s*(.*)", final_answer_segment, re.DOTALL | re.IGNORECASE) if all_final_answers: answer = all_final_answers[-1].strip() # Clean common contamination if "Thought:" in answer: answer = answer.split("Thought:")[0].strip() if "Action:" in answer: answer = answer.split("Action:")[0].strip() if "Observation:" in answer: answer = answer.split("Observation:")[0].strip() if "Question:" in answer: answer = answer.split("Question:")[0].strip() inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE) if inner_final_answers: answer = inner_final_answers[-1].strip() if answer: # Only if the answer is not empty after cleaning print(f"Found and extracted Final Answer: '{answer}'") return answer else: print("LLM produced 'Final Answer:' but the content was empty or invalid after cleaning. Continuing.") # Scratchpad already has this turn's problematic output. Loop continues. # Parse Action from llm_output_this_turn (or actual_llm_generation) action_segment = actual_llm_generation # Check the full segment for Action action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", action_segment, re.DOTALL) action_none_match = re.search(r"Action:\s*None", action_segment, re.IGNORECASE) if action_match: tool_name = action_match.group(1).strip() tool_input = action_match.group(2).strip() if tool_name in self.tools: print(f"Executing Tool: {tool_name}, Input: {tool_input}") try: observation_content = self.tools[tool_name](tool_input) except Exception as e: observation_content = f"Error executing tool {tool_name}: {e}" print(f"Observation content: {observation_content[:200]}...") scratchpad_history += f"Observation: {observation_content}\n" else: print(f"Unknown tool: {tool_name}") scratchpad_history += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n" elif action_none_match: print("Action: None detected.") scratchpad_history += f"Observation: No action taken, proceeding with reasoning.\n" else: print("No valid Action (tool use or None) found in LLM output for this turn. LLM might be thinking or its format is off.") # If the LLM is supposed to always output an Action (even None) but doesn't, # it's a deviation. We add a generic observation to try and get it back on track. # This can happen if it only outputs a Thought. scratchpad_history += "Observation: LLM did not provide an Action in the expected format. Please provide a Thought and then an Action (or Action: None).\n" print(f"Max iterations reached for question (first 50 chars): {question[:50]}...") standard_failure_message = "Agent could not determine an answer within the allowed steps." print(f"Returning standard failure message: {standard_failure_message}") return standard_failure_message # --- Constants --- DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space" # --- Main Execution Logic --- def run_and_submit_all(profile: gr.OAuthProfile | None): space_id = os.getenv("SPACE_ID") if profile: username = f"{profile.username}" else: return "Please Login to Hugging Face with the button.", None api_url = DEFAULT_API_URL questions_url = f"{api_url}/questions" submit_url = f"{api_url}/submit" try: available_tools = {"search_tool": search_tool, "calculator_tool": calculator_tool} if llm_client is None: return "LLM Client could not be initialized. Check logs and HF_TOKEN.", None agent = ReActAgent(llm_client=llm_client, tools=available_tools) except Exception as e: return f"Error initializing agent: {e}", None agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Code not available (SPACE_ID not set)" try: response = requests.get(questions_url, timeout=20) response.raise_for_status() questions_data = response.json() if not questions_data: return "Fetched questions list is empty or invalid format.", None except Exception as e: return f"Error fetching questions: {e}", None results_log, answers_payload = [], [] for item in questions_data: task_id, question_text = item.get("task_id"), item.get("question") if not task_id or question_text is None: continue try: print(f"\n--- Processing Task ID: {task_id}, Question: {question_text[:100]}... ---") submitted_answer = agent(question_text) answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer}) results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer}) print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'") except Exception as e: print(f"Error running agent on task {task_id}: {e}") results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"}) answers_payload.append({"task_id": task_id, "submitted_answer": "Agent execution error."}) if not answers_payload: return "Agent did not produce any answers to submit.", pd.DataFrame(results_log) submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload} try: response = requests.post(submit_url, json=submission_data, timeout=120) response.raise_for_status() result_data = response.json() final__status = ( # Renamed to avoid conflict f"Submission Successful!\nUser: {result_data.get('username')}\n" f"Overall Score: {result_data.get('score', 'N/A')}% " f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n" f"Message: {result_data.get('message', 'No message received.')}" ) return final_status, pd.DataFrame(results_log) # Corrected variable name except requests.exceptions.HTTPError as e: error_detail = f"Server responded with status {e.response.status_code}." try: error_detail += f" Detail: {e.response.json().get('detail', e.response.text)}" except: error_detail += f" Response: {e.response.text[:500]}" return f"Submission Failed: {error_detail}", pd.DataFrame(results_log) except Exception as e: return f"An unexpected error occurred during submission: {e}", pd.DataFrame(results_log) # --- Gradio Interface --- with gr.Blocks() as demo: gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)") gr.Markdown( """ **Instructions & Disclaimers:** Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt with stop sequences. Check logs for RAW LLM OUTPUT and PROMPT FOR LLM for debugging. """ ) gr.LoginButton() run_button = gr.Button("Run Evaluation & Submit All Answers") status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False) results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True) run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table]) if __name__ == "__main__": print("\n" + "-"*30 + " App Starting " + "-"*30) space_host_startup = os.getenv("SPACE_HOST") space_id_startup = os.getenv("SPACE_ID") if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}") if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}") if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.") else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}") print("-"*(60 + len(" App Starting ")) + "\n") demo.launch(debug=True, share=False)