# Author: Willy Vo — "Implement my agent" (commit 0b17df7)
"""
Example: MCP ReAct Agent
A complete ReAct agent that uses MCP tools to play text adventure games.
This is a working example students can learn from.
"""
import json
import os
import re
from dataclasses import dataclass, field
from typing import Optional
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
import hashlib
from collections import defaultdict
load_dotenv()
# =============================================================================
# LLM Configuration - DO NOT MODIFY
# =============================================================================
# Model pinned by the assignment; every reasoning and judging call goes through it.
LLM_MODEL = "Qwen/Qwen2.5-72B-Instruct"
# API token is expected in .env, loaded by load_dotenv() above.
_hf_token = os.getenv("HF_TOKEN")
if not _hf_token:
    raise ValueError("HF_TOKEN not found. Set it in your .env file.")
# Shared client reused by call_llm() for all completion requests.
LLM_CLIENT = InferenceClient(token=_hf_token)
def call_llm(prompt: str, system_prompt: str, seed: int, max_tokens: int = 300) -> str:
    """Send one system+user exchange to the configured LLM and return the reply text.

    Args:
        prompt: User-turn content.
        system_prompt: System-turn content.
        seed: Sampling seed (temperature is fixed at 0.0 for determinism).
        max_tokens: Completion length cap.

    Returns:
        The assistant message content as a plain string.
    """
    chat_messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    completion = LLM_CLIENT.chat.completions.create(
        model=LLM_MODEL,
        messages=chat_messages,
        temperature=0.0,
        max_tokens=max_tokens,
        seed=seed,
    )
    return completion.choices[0].message.content
@dataclass
class RunResult:
    """Result of running the agent. Do not modify this class."""
    final_score: int                    # score when the run ended
    max_score: int                      # maximum achievable score for the game
    moves: int                          # number of game actions executed
    locations_visited: set[str]         # distinct first-line location names seen
    game_completed: bool                # True if the final observation signalled game over
    error: Optional[str] = None         # populated by callers on fatal failure
    history: list[tuple[str, str, str]] = field(default_factory=list)  # (thought, tool_call, result) per step
# =============================================================================
# System Prompt
# =============================================================================
# Static system prompt sent with every reasoning call.  Documents the MCP
# tools, the valid/forbidden game verbs, and the exact THOUGHT/TOOL/ARGS
# reply format that StudentAgent._parse_response() expects.
SYSTEM_PROMPT = """You are an expert text adventure game player. Your goal is to explore, collect treasures, and maximize your score.
AVAILABLE TOOLS (use these via MCP):
1. play_action - Execute game commands (north, take lamp, open mailbox, etc.)
2. memory - Get current game state, score, and recent history
3. get_map - See explored locations and connections
4. inventory - Check what you're carrying
VALID GAME COMMANDS for play_action:
- Movement: north, south, east, west, up, down, enter, exit
- Objects: take <item>, drop <item>, open <thing>, close <thing>, examine <thing>
- Light: turn on lamp, turn off lamp
- Combat: attack <enemy> with <weapon>
- Other: inventory, look, read <thing>, wait
FORBIDDEN (will NOT work): check, inspect, search, grab, use, help
RESPOND IN THIS EXACT FORMAT (no markdown):
THOUGHT: <brief reasoning about what to do next>
TOOL: <tool_name>
ARGS: <JSON arguments>
Examples:
THOUGHT: I need to see what's around me.
TOOL: play_action
ARGS: {"action": "look"}
THOUGHT: Let me check my current state and score.
TOOL: memory
ARGS: {}
THOUGHT: The mailbox might contain something useful.
TOOL: play_action
ARGS: {"action": "open mailbox"}
STRATEGY:
1. Start by looking around and checking memory
2. Explore systematically - try all directions
3. Pick up useful items (lamp, sword, etc.)
4. Open containers (mailbox, window, etc.)
5. Use get_map to avoid getting lost
6. Turn on lamp before dark areas!
DO NOT repeat the same action multiple times in a row."""
# =============================================================================
# Student Agent Implementation
# =============================================================================
class StudentAgent:
    """
    MCP ReAct Agent - A complete working example.
    This agent demonstrates:
    - ReAct loop (Thought -> Tool -> Observation)
    - Loop detection
    - Action validation
    - Score tracking via memory tool
    """
    def __init__(self):
        """Initialize the agent state."""
        self.history: list[dict] = []        # last 10 executed steps (step/thought/tool/args/result)
        self.recent_actions: list[str] = []  # sliding window used for loop detection
        self.score: int = 0                  # best score parsed from game text so far
        # --- Context management memory ---
        # Keyed by (state_id, inv_sig)
        self.failed_strong = defaultdict(set)   # actions that are nonsense here
        self.failed_soft = defaultdict(dict)    # action -> last_step tried (cooldown)
        self.state_last_obs = {}                # (state_id, inv_sig) -> normalized obs
        self.inv_sig: str = ""       # current inventory signature
        self.prev_inv_sig: str = ""  # previous signature to detect changes
        self.step: int = 0           # current step counter
        self.debug_context: bool = True  # Whether to include context management info in the prompt (for transparency)
        self.soft_cooldown_steps = 30    # steps before a soft-failed action may be retried
        # -- LLM judge
        self.judge_cache: dict[tuple[str, str, str], str] = {}  # (prev, action, new) -> verdict
        self.use_llm_judge: bool = True   # disable to rely only on the identical-observation rule
        self._last_llm_raw = ""           # raw text of the most recent judge reply
        self._last_llm_label = ""         # parsed label of the most recent judge reply
        self._last_llm_cached = False     # True when the last verdict came from judge_cache
async def run(
    self,
    client,
    game: str,
    max_steps: int,
    seed: int,
    verbose: bool = False,
) -> RunResult:
    """Run the agent for a game session.

    Drives the ReAct loop: build prompt -> LLM -> parse -> validate/gate ->
    execute MCP tool -> update failure memory, score and history.

    Args:
        client: Connected MCP client (play_action/memory/get_map/inventory).
        game: Game identifier (kept for interface compatibility; unused here).
        max_steps: Maximum number of loop iterations.
        seed: Base LLM seed; each step uses seed + step for variety.
        verbose: Print per-step thought/tool/result when True.

    Returns:
        RunResult with final score, move count, visited locations and history.
    """
    locations_visited = set()
    history = []
    moves = 0
    # Get list of available tools
    tools = await client.list_tools()
    tool_names = [t.name for t in tools]
    # Get initial observation
    result = await client.call_tool("play_action", {"action": "look"})
    observation = self._extract_result(result)
    # Initialize inventory signature
    inv_res = await client.call_tool("inventory", {})
    inv_text = self._extract_result(inv_res)
    self.inv_sig = self._inventory_signature(inv_text)
    self.prev_inv_sig = self.inv_sig
    # Track initial location (first line of an observation is the room name)
    location = observation.split("\n")[0] if observation else "Unknown"
    locations_visited.add(location)
    if verbose:
        print(f"\n{observation}")
    # Main ReAct loop
    for step in range(1, max_steps + 1):
        self.step = step
        # Refresh inventory periodically (cheap and very useful for gating)
        if step == 1 or step % 7 == 0:
            inv_res = await client.call_tool("inventory", {})
            inv_text = self._extract_result(inv_res)
            self.inv_sig = self._inventory_signature(inv_text)
            # If inventory changed, we want to allow retry of gated actions everywhere
            if self.inv_sig != self.prev_inv_sig:
                # Clear ALL soft failures (gated actions may now be valid)
                self.failed_soft.clear()
                self.prev_inv_sig = self.inv_sig
        # Build prompt with context
        prompt = self._build_prompt(observation)
        # Call LLM for reasoning (use step-based seed for variety)
        response = call_llm(prompt, SYSTEM_PROMPT, seed + step)
        # Parse the response
        thought, tool_name, tool_args = self._parse_response(response, tool_names)
        if verbose:
            print(f"\n--- Step {step} ---")
            print(f"[THOUGHT] {thought}")
            print(f"[TOOL] {tool_name}({tool_args})")
        # Validate and fix common issues
        tool_name, tool_args = self._validate_tool_call(tool_name, tool_args, tool_names)
        tool_name, tool_args = self._apply_context_management(tool_name, tool_args, observation)
        # Loop detection
        if tool_name == "play_action":
            action = tool_args.get("action", "look")
            self.recent_actions.append(action)
            if len(self.recent_actions) > 5:
                self.recent_actions = self.recent_actions[-5:]
            # Detect loops - if same action 3 times, force "look"
            if len(self.recent_actions) >= 3 and len(set(self.recent_actions[-3:])) == 1:
                if verbose:
                    print(f"[WARNING] Loop detected - forcing 'look'")
                tool_args = {"action": "look"}
                self.recent_actions.append("look")
            # NOTE(review): indentation reconstructed — here moves counts only
            # game actions, not memory/map/inventory calls; confirm intent.
            moves += 1
        # Execute the tool
        try:
            prev_observation = observation  # keep previous for failure detection
            result = await client.call_tool(tool_name, tool_args)
            observation = self._extract_result(result)
            # Update failure memory only for play_action
            if tool_name == "play_action":
                action = tool_args.get("action", "look")
                self._update_failure_memory(prev_observation, action, observation)
                self._log_context_state(prev_observation, action, new_observation=observation)
            if verbose:
                print(f"[RESULT] {observation[:200]}...")
        except Exception as e:
            observation = f"Error: {e}"
            if verbose:
                print(f"[ERROR] {e}")
        # Track location
        location = observation.split("\n")[0] if observation else "Unknown"
        locations_visited.add(location)
        # Update history
        self.history.append({
            "step": step,
            "thought": thought,
            "tool": tool_name,
            "args": tool_args,
            "result": observation[:200]
        })
        if len(self.history) > 10:
            self.history = self.history[-10:]
        # Track score from observation
        self._update_score(observation)
        # Record in result history
        history.append((thought, f"{tool_name}({tool_args})", observation[:100]))
        # Check for game over
        if self._is_game_over(observation):
            if verbose:
                print("\n*** GAME OVER ***")
            break
    return RunResult(
        final_score=self.score,
        max_score=350,
        moves=moves,
        locations_visited=locations_visited,
        game_completed=self._is_game_over(observation),
        history=history,
    )
def _build_prompt(self, observation: str) -> str:
"""Build the prompt for the LLM with context."""
parts = []
parts.append(f"Current Score: {self.score}")
# Recent history
if self.history:
parts.append("\nRecent actions:")
for entry in self.history[-8:]:
action = entry.get("args", {}).get("action", entry["tool"])
result_short = entry["result"][:80] + "..." if len(entry["result"]) > 80 else entry["result"]
parts.append(f" > {action} -> {result_short}")
# Warn about repeated actions
if self.recent_actions and len(set(self.recent_actions[-3:])) == 1:
parts.append(f"\n[WARNING: You've been doing '{self.recent_actions[-1]}' repeatedly. TRY SOMETHING DIFFERENT!]")
# Add context constraints to reduce repetition
sid = self._state_id_from_observation(observation)
key = (sid, self.inv_sig)
forbidden = list(self.failed_strong[key])[:8]
soft_forbidden = list(self.failed_soft[key].keys())[:8]
if forbidden or soft_forbidden:
parts.append("\nContext restrictions (DO NOT choose these actions here):")
if forbidden:
parts.append(" Strong banned: " + ", ".join(forbidden))
if soft_forbidden:
parts.append(" Recently failed (wait before retry): " + ", ".join(soft_forbidden))
parts.append(f"\nCurrent situation:\n{observation}")
parts.append("\nWhat do you do next?")
return "\n".join(parts)
def _parse_response(self, response: str, valid_tools: list[str]) -> tuple[str, str, dict]:
"""Parse the LLM response to extract thought, tool, and arguments."""
thought = "No reasoning provided"
tool_name = "play_action"
tool_args = {"action": "look"}
lines = response.strip().split("\n")
for line in lines:
line_clean = line.strip()
line_upper = line_clean.upper()
if line_upper.startswith("THOUGHT:"):
thought = line_clean.split(":", 1)[1].strip()
elif line_upper.startswith("TOOL:"):
raw_tool = line_clean.split(":", 1)[1].strip().lower()
raw_tool = raw_tool.replace("**", "").replace("*", "").replace("`", "")
raw_tool = raw_tool.split()[0] if raw_tool else "play_action"
tool_name = raw_tool
elif line_upper.startswith("ARGS:"):
args_part = line_clean.split(":", 1)[1].strip()
try:
args_part = args_part.replace("'", '"')
tool_args = json.loads(args_part)
except json.JSONDecodeError:
match = re.search(r'"action"\s*:\s*"([^"]+)"', args_part)
if match:
tool_args = {"action": match.group(1)}
else:
tool_args = {"action": "look"}
return thought, tool_name, tool_args
def _validate_tool_call(self, tool_name: str, tool_args: dict, valid_tools: list[str]) -> tuple[str, dict]:
"""Validate and fix common tool call issues."""
# Fix tool name
if tool_name not in valid_tools:
if tool_name in ["action", "do", "command"]:
tool_name = "play_action"
elif tool_name in ["map", "location"]:
tool_name = "get_map"
elif tool_name in ["mem", "state", "status"]:
tool_name = "memory"
elif tool_name in ["inv", "items"]:
tool_name = "inventory"
else:
tool_name = "play_action"
# Fix action verbs
if tool_name == "play_action":
action = tool_args.get("action", "look")
invalid_verb_map = {
"check": "examine",
"inspect": "examine",
"search": "look",
"grab": "take",
"pick": "take",
"use": "examine",
"investigate": "examine",
}
words = action.lower().split()
if words and words[0] in invalid_verb_map:
words[0] = invalid_verb_map[words[0]]
action = " ".join(words)
action = action.lower().strip()
action = action.replace("**", "").replace("*", "").replace("`", "")
action = " ".join(action.split())
tool_args["action"] = action
return tool_name, tool_args
def _extract_result(self, result) -> str:
"""Extract text from MCP tool result."""
if hasattr(result, 'content') and result.content:
return result.content[0].text
if isinstance(result, list) and result:
return result[0].text if hasattr(result[0], 'text') else str(result[0])
return str(result)
def _update_score(self, text: str) -> None:
"""Update score from game text."""
patterns = [
r'Score:\s*(\d+)',
r'score[:\s]+(\d+)',
r'\[Score:\s*(\d+)',
]
for pattern in patterns:
match = re.search(pattern, text, re.IGNORECASE)
if match:
self.score = max(self.score, int(match.group(1)))
def _is_game_over(self, text: str) -> bool:
"""Check if the game is over."""
game_over_phrases = [
"game over",
"you have died",
"you are dead",
"*** you have died ***",
]
text_lower = text.lower()
return any(phrase in text_lower for phrase in game_over_phrases)
def _normalize_text(self, text: str) -> str:
"""Normalize text for comparison (remove numbers/punct, collapse spaces)."""
s = (text or "").lower()
s = re.sub(r"\d+", "0", s)
s = re.sub(r"[^a-z0\s]", " ", s)
s = re.sub(r"\s+", " ", s).strip()
return s[:500]
def _state_id_from_observation(self, observation: str) -> str:
"""Compute a stable-ish state id for the current room/context."""
lines = [l.strip() for l in (observation or "").splitlines() if l.strip()]
head = lines[0].lower() if lines else "unknown"
# hash helps reduce tiny variations
norm = self._normalize_text(head)
return hashlib.md5(norm.encode()).hexdigest()
def _inventory_signature(self, inv_text: str) -> str:
"""Signature of inventory text; robust enough for gating retries."""
norm = self._normalize_text(inv_text)
return hashlib.md5(norm.encode()).hexdigest()
def _classify_failure(self, prev_obs: str, action: str, new_obs: str) -> str:
"""
Return: "none" | "strong" | "soft"
strong = will never become valid (parser/no object)
soft = gated (locked/dark/tool) -> retry when inventory/state changes
"""
# Optional fast rule: identical observation => likely failed (soft)
if self._normalize_text(new_obs) == self._normalize_text(prev_obs):
return "soft"
if not getattr(self, "use_llm_judge", True):
return "none"
return self._llm_judge_failure(prev_obs, action, new_obs)
def _llm_judge_failure(self, prev_obs: str, action: str, new_obs: str) -> str:
    """Ask the LLM to label an action outcome as "none" | "soft" | "strong".

    Verdicts are cached on normalized (prev, action, new) triples so an
    identical transition never triggers a second LLM call.  The raw reply,
    parsed label and cache-hit flag are stashed on the instance so
    _log_context_state() can report them.
    """
    # Normalize for caching (prevents repeated LLM calls)
    prev_n = self._normalize_text(prev_obs)
    new_n = self._normalize_text(new_obs)
    act_n = " ".join((action or "").lower().split())
    key = (prev_n, act_n, new_n)
    if key in self.judge_cache:
        self._last_llm_raw = "(cached)"
        self._last_llm_label = self.judge_cache[key]
        self._last_llm_cached = True
        return self.judge_cache[key]
    system = "You are a strict classifier for text-adventure command outcomes."
    # NOTE: the prompt body is deliberately flush-left; its content is runtime data.
    prompt = f"""
Classify whether the player's action FAILED, based on the before/after observations.
Return EXACTLY one label: none | soft | strong
Definitions:
- strong: The command is invalid/unknown OR refers to something not present/visible OR impossible in principle.
It will NOT become valid later just by having a different item.
- soft: The command was understood but is blocked by a condition (locked, closed, too dark, need an item, must do something first, not possible yet).
It COULD become valid later.
- none: The action had an effect OR gave new useful information (state changed, moved, item changed, new description).
BE STRICT: If the new observation is basically identical to the previous one AND no progress happened, prefer "soft".
PREVIOUS_OBSERVATION:
{prev_obs}
ACTION:
{action}
NEW_OBSERVATION:
{new_obs}
""".strip()
    # max_tokens=8: only a single-word label is expected back.
    out = call_llm(prompt, system, seed=100000 + self.step, max_tokens=8)
    raw_out = out.strip().lower()
    label = "none"
    # Substring checks tolerate decorated replies such as "label: soft".
    if "strong" in raw_out:
        label = "strong"
    elif "soft" in raw_out:
        label = "soft"
    elif raw_out in {"none", "soft", "strong"}:
        label = raw_out
    # Store for logging
    self._last_llm_raw = raw_out
    self._last_llm_label = label
    self._last_llm_cached = False
    self.judge_cache[key] = label
    return label
def _apply_context_management(self,tool_name: str,tool_args: dict,observation: str,) -> tuple[str, dict]:
"""Prevent repeating failed actions in the same context (state + inv)."""
if tool_name != "play_action":
return tool_name, tool_args
action = (tool_args.get("action") or "look").strip().lower()
sid = self._state_id_from_observation(observation)
key = (sid, self.inv_sig)
# Strong blacklist
if action in self.failed_strong[key]:
return "play_action", {"action": self._fallback_action(observation)}
# Soft blacklist with cooldown
if action in self.failed_soft[key]:
last = self.failed_soft[key][action]
# cooldown: avoid retrying too soon
if self.step - last < self.soft_cooldown_steps:
return "play_action", {"action": self._fallback_action(observation)}
# Prevent immediate repetition in same context
if self.recent_actions and action == self.recent_actions[-1]:
return "play_action", {"action": self._fallback_action(observation)}
return tool_name, {"action": action}
def _fallback_action(self, observation: str) -> str:
"""
Deterministic fallback when the chosen action is banned.
Prefer exploration moves; otherwise look/inventory.
"""
# Prefer moves that haven't been tried recently
move_candidates = ["north","south","east","west","up","down","n","s","e","w","u","d"]
recent_set = set(self.recent_actions[-8:]) if self.recent_actions else set()
for m in move_candidates:
if m not in recent_set:
return m
# If stuck, refresh
if "dark" in (observation or "").lower():
# lamp heuristic: often useful in Zork
return "turn on lamp"
return "look"
def _update_failure_memory(self, prev_obs: str, action: str, new_obs: str) -> None:
"""Update strong/soft failed actions for this (state, inv) context."""
sid = self._state_id_from_observation(prev_obs)
key = (sid, self.inv_sig)
verdict = self._classify_failure(prev_obs, action, new_obs)
if verdict == "strong":
self.failed_strong[key].add(action)
# also remove from soft if present
if action in self.failed_soft[key]:
del self.failed_soft[key][action]
elif verdict == "soft":
self.failed_soft[key][action] = self.step
def _log_context_state(self, prev_observation: str, chosen_action: str = "", new_observation: str = ""):
"""Print debug info for context management (before-state bucket)."""
if not self.debug_context:
return
sid_before = self._state_id_from_observation(prev_observation)
sid_after = self._state_id_from_observation(new_observation) if new_observation else ""
key_before = (sid_before, self.inv_sig)
print("\n" + "=" * 60)
print(f"[STEP] {self.step}")
print(f"[STATE_ID_BEFORE] {sid_before}")
if sid_after:
print(f"[STATE_ID_AFTER] {sid_after}")
print(f"[INV_SIG] {self.inv_sig[:8]}...")
if chosen_action:
print(f"[CHOSEN ACTION] {chosen_action}")
# ---- LLM Judge Info ----
if getattr(self, "_last_llm_label", ""):
print(f"[LLM_VERDICT] {self._last_llm_label}")
print(f"[LLM_RAW_OUTPUT] {getattr(self, '_last_llm_raw', '')}")
print(f"[LLM_FROM_CACHE] {getattr(self, '_last_llm_cached', False)}")
# Strong failures (BEFORE bucket)
strong = list(self.failed_strong[key_before])
print(f"[FAILED_STRONG_BEFORE] ({len(strong)})")
for a in strong[:10]:
print(f" - {a}")
# Soft failures (BEFORE bucket)
soft = self.failed_soft[key_before]
print(f"[FAILED_SOFT_BEFORE] ({len(soft)})")
for a, last_step in list(soft.items())[:10]:
cooldown_left = max(0, self.soft_cooldown_steps - (self.step - last_step))
print(f" - {a} (retry in {cooldown_left} steps)")
print(f"[RECENT_ACTIONS] {self.recent_actions[-10:]}")
print("=" * 60 + "\n")
# =============================================================================
# Local Testing
# =============================================================================
async def test_agent():
    """Smoke-test the agent locally against the MCP game server.

    Requires the `fastmcp` package and a runnable `mcp_server.py` next to
    this file; prints a short summary of the run.
    """
    from fastmcp import Client
    agent = StudentAgent()
    async with Client("mcp_server.py") as client:
        result = await agent.run(
            client=client,
            game="zork1",
            max_steps=20,
            seed=42,
            verbose=True,
        )
        # NOTE(review): original indentation was lost; summary printed while
        # the client session is still open — confirm against the original.
        print(f"\n{'=' * 50}")
        print(f"Final Score: {result.final_score}")
        print(f"Moves: {result.moves}")
        print(f"Locations: {len(result.locations_visited)}")
if __name__ == "__main__":
    # Allow `python <this file>` to run the local smoke test.
    import asyncio
    asyncio.run(test_agent())