| | import os |
| | import gradio as gr |
| | import requests |
| | import inspect |
| | import pandas as pd |
| | import re |
| |
|
| | |
| | from huggingface_hub import InferenceClient |
| |
|
# Text-generation model that drives the agent's reasoning loop.
LLM_MODEL = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Initialize the Hugging Face Inference client at import time.
# HF_TOKEN is read from the environment (may be None for anonymous access).
# On failure, llm_client is left as None so the UI can report the problem
# instead of crashing on startup.
try:
    hf_token = os.getenv("HF_TOKEN")
    llm_client = InferenceClient(model=LLM_MODEL, token=hf_token)
except Exception as e:
    print(f"Error initializing InferenceClient: {e}")
    llm_client = None
| |
|
| | |
| | from duckduckgo_search import DDGS |
| |
|
def search_tool(query: str) -> str:
    """Search the web with DuckDuckGo and return up to 3 results (title, snippet, URL).

    Args:
        query: Free-text web search query.

    Returns:
        A newline-joined block of results, or a human-readable error/empty message.
        (This docstring is surfaced to the LLM via inspect.getdoc in ReActAgent,
        so it doubles as the tool's prompt description.)
    """
    print(f"Tool: search_tool, Query: {query}")
    try:
        # DDGS is used as a context manager so the underlying session is closed.
        with DDGS() as ddgs:
            results = ddgs.text(query, max_results=3)
        if results:
            return "\n".join([f"Title: {r['title']}\nSnippet: {r['body']}\nURL: {r['href']}" for r in results])
        else:
            return "No results found for your query. This might mean the query returned no relevant documents, or there could be a temporary issue (e.g., rate limit)."
    except Exception as e:
        # Best-effort tool: report the failure as an Observation string rather than raising.
        print(f"Error in search_tool: {e}")
        return f"Error performing search: {str(e)}. This could be due to a network issue, an invalid query, or a rate limit."
| |
|
def calculator_tool(expression: str) -> str:
    """Safely evaluate a basic arithmetic expression (numbers, + - * / % ( ) ., sqrt, pi).

    Args:
        expression: A math expression string, e.g. "2+2" or "sqrt(16)".

    Returns:
        The result as a string, or an "Error ..." message string on failure.
        (This docstring is surfaced to the LLM via inspect.getdoc in ReActAgent.)
    """
    print(f"Tool: calculator_tool, Expression: {expression}")
    try:
        # Coarse character allow-list: digits, whitespace, operators, and the
        # letters needed to spell whitelisted names (sqrt, pi, abs, sin, cos, ...).
        if not re.match(r"^[0-9\s\+\-\*\/\(\)\.\%sqrtpijabsindcostanlog]+$", expression):
            # Reject outright only when it is clearly not a math expression;
            # bare names like "pi" or anything containing an operator fall
            # through to the stricter compile-time name check below.
            if expression not in ["pi", "sqrt"] and not any(op in expression for op in ['+', '-', '*', '/']):
                return f"Error: Invalid characters in expression. Only numbers, basic operators, sqrt, pi allowed. Expression: {expression}"

        allowed_names = {"sqrt": lambda x: x**0.5, "pi": 3.1415926535}
        code = compile(expression, "<string>", "eval")
        for name in code.co_names:
            # BUGFIX: the original also tested `name not in __builtins__`, which
            # raises TypeError when __builtins__ is a module (as in __main__),
            # and would have widened the eval surface to all builtins anyway.
            # Only explicitly whitelisted names are allowed.
            if name not in allowed_names:
                raise NameError(f"Use of {name} is not allowed")

        # eval with empty builtins so only allowed_names are reachable.
        result = eval(code, {"__builtins__": {}}, allowed_names)
        return str(result)

    except Exception as e:
        # Best-effort tool: report the failure as an Observation string rather than raising.
        print(f"Error in calculator_tool: {e}")
        return f"Error calculating: {str(e)}. Ensure the expression is valid and uses allowed functions/operators."
| |
|
| | |
class ReActAgent:
    """A minimal ReAct (Reason + Act) agent over a Hugging Face text-generation LLM.

    The agent alternates Thought / Action / Observation turns: the LLM emits a
    Thought and an Action, this class executes the named tool and appends an
    Observation to a scratchpad, repeating until the LLM emits "Final Answer:"
    or max_iterations is exhausted.
    """

    def __init__(self, llm_client, tools: dict, max_iterations=7):
        """Build the agent.

        Args:
            llm_client: huggingface_hub.InferenceClient used for text_generation.
            tools: Mapping of tool name -> callable(str) -> str.
            max_iterations: Maximum Thought/Action/Observation turns per question.

        Raises:
            ValueError: If llm_client is None.
        """
        print("ReActAgent initialized.")
        if llm_client is None:
            raise ValueError("LLM client not initialized.")
        self.llm = llm_client
        self.tools = tools
        self.max_iterations = max_iterations

        # Tool docstrings become the "Available tools" section of the prompt,
        # so each tool should carry an informative docstring.
        self.tool_descriptions = "\n".join([
            f"- {name}: {inspect.getdoc(func)}"
            for name, func in tools.items()
        ])
        self.tool_names = ", ".join(tools.keys())

        # NOTE: this is an f-string, so {self.tool_descriptions} is interpolated
        # now, while {'{scratchpad}'} renders as the literal "{scratchpad}"
        # placeholder, filled per-iteration via .format() in __call__.
        self.react_prompt_template = inspect.cleandoc(f"""
        You are a helpful AI assistant. Your primary goal is to answer the CURRENT question accurately by strictly following a step-by-step reasoning process.
        Focus ONLY on the provided "Question:". Do not generate new questions or answer unrelated ones.

        You will proceed in a Thought, Action, Observation loop.
        1. First, provide a "Thought:" explaining your reasoning for the current question.
        2. Next, provide an "Action:".
        - If you need to search the web, use search_tool[query].
        - If you need to perform a calculation (e.g., arithmetic like 5*5, or math expressions), use calculator_tool[expression].
        - If no tool is needed for this immediate step based on your current thought and the information available, use "Action: None". Only use Action: None if you are certain no tool can help or is required for the current step.
        3. AFTER YOU PROVIDE THE ACTION, YOU MUST STOP. The system will then provide an "Observation:".
        4. Based on the "Observation:", you will continue with another "Thought:", followed by another "Action:" (and then STOP), or if you have enough information, a "Final Answer:".

        The final answer itself (the text after "Final Answer:") must be an EXACT, non-empty match to the correct response, without any extra explanations, apologies, or prefixes.

        Available tools:
        {self.tool_descriptions}

        Use the following format FOR THE CURRENT QUESTION ONLY:
        Question: the input question you must answer

        {'{scratchpad}'}

        Thought: [Your reasoning and plan for the current question. If continuing from an observation, reason about that observation.]
        Action: [search_tool[query_for_search] OR calculator_tool[math_expression_to_calculate] OR Action: None]. AFTER THIS, STOP.
        Observation: [The system will provide this. Do NOT generate this part.]
        Thought: [Your reasoning based on the previous observation.]
        Action: [Another action or Action: None]. AFTER THIS, STOP.
        Observation: [The system will provide this.]
        ... (Repeat Thought/Action/STOP/Observation as needed)
        Thought: I have sufficient information to answer the current question.
        Final Answer: [Provide ONLY the precise, non-empty answer. For example, if the question is "What is 2+2?", your Final Answer should be just "4".]

        Start your response for the current turn with "Thought:".
        """)

    def run_llm(self, prompt: str) -> str:
        """Call the LLM once and return the stripped generation (or an error string)."""
        try:
            # Stop generation before the model hallucinates an Observation or
            # runs past the Final Answer marker; the loop supplies Observations.
            stop_tokens = [
                "\nObservation:", "Observation:",
                "\nFinal Answer:", "Final Answer:"
            ]

            response = self.llm.text_generation(
                prompt,
                max_new_tokens=350,
                temperature=0.05,  # near-greedy decoding; do_sample=True is required for temperature to apply
                do_sample=True,
                stop=stop_tokens,
            )
            return response.strip()
        except Exception as e:
            # Return the error as text so the caller's empty-output check handles it.
            print(f"Error during LLM call: {e}")
            return f"Error generating response: {str(e)}"

    def __call__(self, question: str) -> str:
        """Run the ReAct loop for one question and return the final answer string."""
        print(f"ReActAgent received question (first 100 chars): {question[:100]}...")
        scratchpad_history = ""  # accumulated Thought/Action/Observation transcript

        for i in range(self.max_iterations):
            print(f"\nIteration {i+1}")

            # Rebuild the prompt each turn: template (truncated at its first
            # "Thought:" occurrence), then the question, then the scratchpad.
            # NOTE(review): the first "Thought:" in the template appears inside
            # the numbered instructions ('1. First, provide a "Thought:" ...'),
            # so this split likely drops most of the instruction text — confirm
            # the intended truncation point.
            current_prompt_base = self.react_prompt_template.format(scratchpad=scratchpad_history).split("Thought:")[0]
            current_prompt_text = f"Question: {question}\n" + current_prompt_base

            # End the prompt with "Thought:" so the model continues from there.
            if not scratchpad_history:
                current_prompt_text += "Thought:"
            else:
                current_prompt_text += scratchpad_history + "\nThought:"

            print(f"--- PROMPT FOR LLM (Iteration {i+1}, last 300 chars) ---\n...{current_prompt_text[-300:]}\n--- END PROMPT ---")
            llm_output_this_turn = self.run_llm(current_prompt_text)

            print(f"--- RAW LLM OUTPUT (Iteration {i+1}) ---")
            print(llm_output_this_turn)
            print(f"--- END RAW LLM OUTPUT (Iteration {i+1}) ---")

            if not llm_output_this_turn:
                print("LLM returned empty or error, stopping.")
                return "Agent could not determine an answer within the allowed steps."

            # The prompt ends with "Thought:", so the model typically omits the
            # label; re-attach it when this is a fresh reasoning turn.
            actual_llm_generation = llm_output_this_turn
            if not llm_output_this_turn.strip().startswith("Thought:") and \
               (scratchpad_history.strip().endswith("Observation:") or not scratchpad_history):
                actual_llm_generation = "Thought: " + llm_output_this_turn

            scratchpad_history += actual_llm_generation + "\n"

            # --- Final Answer extraction: take the LAST occurrence, then strip
            # any trailing ReAct markers the model may have appended. ---
            final_answer_segment = actual_llm_generation
            all_final_answers = re.findall(r"Final Answer:\s*(.*)", final_answer_segment, re.DOTALL | re.IGNORECASE)

            if all_final_answers:
                answer = all_final_answers[-1].strip()

                if "Thought:" in answer: answer = answer.split("Thought:")[0].strip()
                if "Action:" in answer: answer = answer.split("Action:")[0].strip()
                if "Observation:" in answer: answer = answer.split("Observation:")[0].strip()
                if "Question:" in answer: answer = answer.split("Question:")[0].strip()
                inner_final_answers = re.findall(r"Final Answer:\s*(.*)", answer, re.DOTALL | re.IGNORECASE)
                if inner_final_answers: answer = inner_final_answers[-1].strip()

                if answer:
                    print(f"Found and extracted Final Answer: '{answer}'")
                    return answer
                else:
                    # Empty after cleaning: keep looping rather than returning "".
                    print("LLM produced 'Final Answer:' but the content was empty or invalid after cleaning. Continuing.")

            # --- Action parsing and tool dispatch: Action: tool[input] or Action: None ---
            action_segment = actual_llm_generation
            action_match = re.search(r"Action:\s*([a-zA-Z_0-9]+)\[(.*?)\]", action_segment, re.DOTALL)
            action_none_match = re.search(r"Action:\s*None", action_segment, re.IGNORECASE)

            if action_match:
                tool_name = action_match.group(1).strip()
                tool_input = action_match.group(2).strip()
                if tool_name in self.tools:
                    print(f"Executing Tool: {tool_name}, Input: {tool_input}")
                    try:
                        observation_content = self.tools[tool_name](tool_input)
                    except Exception as e:
                        # Tool crashes become Observations so the loop can recover.
                        observation_content = f"Error executing tool {tool_name}: {e}"
                    print(f"Observation content: {observation_content[:200]}...")
                    scratchpad_history += f"Observation: {observation_content}\n"
                else:
                    print(f"Unknown tool: {tool_name}")
                    scratchpad_history += f"Observation: Error - Unknown tool '{tool_name}'. Available tools: {self.tool_names}\n"
            elif action_none_match:
                print("Action: None detected.")
                scratchpad_history += f"Observation: No action taken, proceeding with reasoning.\n"
            else:
                print("No valid Action (tool use or None) found in LLM output for this turn. LLM might be thinking or its format is off.")
                # Nudge the model back into the expected format via the scratchpad.
                scratchpad_history += "Observation: LLM did not provide an Action in the expected format. Please provide a Thought and then an Action (or Action: None).\n"

        # Loop exhausted without a Final Answer.
        print(f"Max iterations reached for question (first 50 chars): {question[:50]}...")
        standard_failure_message = "Agent could not determine an answer within the allowed steps."
        print(f"Returning standard failure message: {standard_failure_message}")
        return standard_failure_message
| |
|
| | |
# Scoring API endpoint (HF Agents course Unit 4 evaluation server).
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
| |
|
| | |
def run_and_submit_all(profile: gr.OAuthProfile | None):
    """Fetch all evaluation questions, run the ReAct agent on each, and submit answers.

    Args:
        profile: Gradio OAuth profile auto-injected by gr.LoginButton; None if logged out.

    Returns:
        A (status_message, results_dataframe_or_None) tuple for the Gradio outputs.
    """
    space_id = os.getenv("SPACE_ID")
    if profile:
        username = f"{profile.username}"
    else:
        return "Please Login to Hugging Face with the button.", None

    api_url = DEFAULT_API_URL
    questions_url = f"{api_url}/questions"
    submit_url = f"{api_url}/submit"

    # Build the agent up front so a broken LLM client fails fast.
    try:
        available_tools = {"search_tool": search_tool, "calculator_tool": calculator_tool}
        if llm_client is None:
            return "LLM Client could not be initialized. Check logs and HF_TOKEN.", None
        agent = ReActAgent(llm_client=llm_client, tools=available_tools)
    except Exception as e:
        return f"Error initializing agent: {e}", None

    # Link to this Space's source, sent alongside the answers.
    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main" if space_id else "Code not available (SPACE_ID not set)"

    try:
        response = requests.get(questions_url, timeout=20)
        response.raise_for_status()
        questions_data = response.json()
        if not questions_data:
            return "Fetched questions list is empty or invalid format.", None
    except Exception as e:
        return f"Error fetching questions: {e}", None

    results_log, answers_payload = [], []
    for item in questions_data:
        task_id, question_text = item.get("task_id"), item.get("question")
        if not task_id or question_text is None:
            continue
        try:
            print(f"\n--- Processing Task ID: {task_id}, Question: {question_text[:100]}... ---")
            submitted_answer = agent(question_text)
            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
            print(f"Agent answer for task {task_id}: '{submitted_answer[:100]}...'")
        except Exception as e:
            # Record the failure but keep going; one bad task must not abort the run.
            print(f"Error running agent on task {task_id}: {e}")
            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
            answers_payload.append({"task_id": task_id, "submitted_answer": "Agent execution error."})

    if not answers_payload:
        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)

    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}

    try:
        response = requests.post(submit_url, json=submission_data, timeout=120)
        response.raise_for_status()
        result_data = response.json()
        # BUGFIX: the original assigned `final__status` (double underscore) but
        # returned `final_status`, raising NameError on every successful submission.
        final_status = (
            f"Submission Successful!\nUser: {result_data.get('username')}\n"
            f"Overall Score: {result_data.get('score', 'N/A')}% "
            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
            f"Message: {result_data.get('message', 'No message received.')}"
        )
        return final_status, pd.DataFrame(results_log)
    except requests.exceptions.HTTPError as e:
        error_detail = f"Server responded with status {e.response.status_code}."
        try:
            error_detail += f" Detail: {e.response.json().get('detail', e.response.text)}"
        except Exception:  # narrowed from bare except: response body may not be JSON
            error_detail += f" Response: {e.response.text[:500]}"
        return f"Submission Failed: {error_detail}", pd.DataFrame(results_log)
    except Exception as e:
        return f"An unexpected error occurred during submission: {e}", pd.DataFrame(results_log)
| |
|
| | |
# --- Gradio UI: login button, run trigger, status textbox, and results table. ---
with gr.Blocks() as demo:
    gr.Markdown("# ReAct Agent Evaluation Runner (GAIA Modified)")
    gr.Markdown(
        """
        **Instructions & Disclaimers:**
        Login, then click 'Run Evaluation'. This uses Mixtral and a refined ReAct prompt with stop sequences.
        Check logs for RAW LLM OUTPUT and PROMPT FOR LLM for debugging.
        """
    )
    # LoginButton supplies the gr.OAuthProfile argument that Gradio
    # auto-injects into run_and_submit_all (hence no explicit inputs below).
    gr.LoginButton()
    run_button = gr.Button("Run Evaluation & Submit All Answers")
    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
| |
|
if __name__ == "__main__":
    # Startup diagnostics: report Space environment variables and LLM client health
    # before launching the Gradio app.
    print("\n" + "-"*30 + " App Starting " + "-"*30)
    space_host_startup = os.getenv("SPACE_HOST")
    space_id_startup = os.getenv("SPACE_ID")
    if space_host_startup: print(f"✅ SPACE_HOST found: {space_host_startup}")
    if space_id_startup: print(f"✅ SPACE_ID found: {space_id_startup}")

    if llm_client is None: print("⚠️ LLM Client (InferenceClient) was not initialized.")
    else: print(f"✅ LLM Client initialized with model: {LLM_MODEL}")
    print("-"*(60 + len(" App Starting ")) + "\n")
    demo.launch(debug=True, share=False)