Spaces:
Running
Running
Commit ·
450ea3f
1
Parent(s): 8562e41
Major refactoring
Browse files- .gitignore +4 -8
- README.md +54 -109
- agents/__init__.py +0 -9
- agents/base_agent.py +0 -78
- agents/react_agent.py +0 -243
- app.py +12 -6
- evaluation/__init__.py +14 -0
- evaluation/evaluate.py +559 -0
- evaluation/metrics.py +151 -0
- evaluation/runner.py +188 -0
- example_submission/README.md +28 -0
- agents/mcp_react_agent.py → example_submission/agent.py +190 -263
- mcp_server/zork_server.py → example_submission/mcp_server.py +16 -240
- function_calling/controller.py +0 -291
- function_calling/simple_controller.py +0 -268
- function_calling/tools.py +0 -127
- mcp_server/README.md +0 -83
- mcp_server/__init__.py +0 -1
- mcp_server/mcp_config.json +0 -9
- requirements.txt +4 -1
- run_agent.py +125 -251
- submission_template/README.md +31 -0
- submission_template/agent.py +279 -0
- submission_template/app.py +71 -0
- templates/mcp_server_template.py → submission_template/mcp_server.py +117 -55
- submission_template/requirements.txt +8 -0
- templates/README.md +0 -129
- templates/react_agent_template.py +0 -303
.gitignore
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
-
master.zip
|
| 2 |
.github/
|
|
|
|
| 3 |
|
| 4 |
# Byte-compiled / optimized / DLL files
|
| 5 |
__pycache__/
|
|
@@ -140,12 +140,8 @@ dmypy.json
|
|
| 140 |
*.swo
|
| 141 |
*~
|
| 142 |
|
| 143 |
-
# Game files
|
| 144 |
-
z-machine-games-master/
|
| 145 |
-
*.z3
|
| 146 |
-
*.z4
|
| 147 |
-
*.z5
|
| 148 |
-
*.z8
|
| 149 |
-
|
| 150 |
# Temp files
|
| 151 |
.mcp_config_temp.json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
.github/
|
| 2 |
+
hidden_submission/
|
| 3 |
|
| 4 |
# Byte-compiled / optimized / DLL files
|
| 5 |
__pycache__/
|
|
|
|
| 140 |
*.swo
|
| 141 |
*~
|
| 142 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 143 |
# Temp files
|
| 144 |
.mcp_config_temp.json
|
| 145 |
+
|
| 146 |
+
# Z-machine game files
|
| 147 |
+
z-machine-games-master/
|
README.md
CHANGED
|
@@ -20,8 +20,9 @@ This project provides:
|
|
| 20 |
|
| 21 |
1. **MCP Server** - Exposes text adventure games as MCP tools using FastMCP
|
| 22 |
2. **ReAct Agent** - An agent that uses MCP tools to play games with reasoning
|
| 23 |
-
3. **
|
| 24 |
-
4. **
|
|
|
|
| 25 |
|
| 26 |
## Architecture
|
| 27 |
|
|
@@ -63,109 +64,78 @@ Get your HuggingFace token at: https://huggingface.co/settings/tokens
|
|
| 63 |
### 2. Run an Agent
|
| 64 |
|
| 65 |
```bash
|
| 66 |
-
#
|
| 67 |
-
python run_agent.py
|
| 68 |
|
| 69 |
-
#
|
| 70 |
-
python run_agent.py --
|
| 71 |
|
| 72 |
-
#
|
| 73 |
-
python run_agent.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
```
|
| 75 |
|
| 76 |
## Project Structure
|
| 77 |
|
| 78 |
```
|
| 79 |
.
|
| 80 |
-
+-- run_agent.py #
|
| 81 |
-
+--
|
| 82 |
-
|
| 83 |
-
+--
|
| 84 |
-
| +--
|
| 85 |
-
| +--
|
| 86 |
-
|
| 87 |
-
+--
|
|
|
|
|
|
|
| 88 |
| +-- README.md # Assignment instructions
|
| 89 |
-
| +--
|
| 90 |
-
| +--
|
| 91 |
-
+--
|
| 92 |
-
| +-- controller.py
|
| 93 |
-
| +-- simple_controller.py
|
| 94 |
-
| +-- tools.py
|
| 95 |
+-- games/
|
| 96 |
| +-- zork_env.py # Jericho wrapper
|
| 97 |
+-- z-machine-games-master/ # Game files
|
| 98 |
```
|
| 99 |
|
| 100 |
-
##
|
| 101 |
-
|
| 102 |
-
| Mode | Description | Command |
|
| 103 |
-
|------|-------------|---------|
|
| 104 |
-
| `mcp` | MCP ReAct agent (FastMCP Client) | `--mode mcp` |
|
| 105 |
-
| `react` | Basic ReAct (direct game) | `--mode react` |
|
| 106 |
-
| `function` | Function calling (API) | `--mode function` |
|
| 107 |
-
| `function --simple` | Function calling (text) | `--mode function --simple` |
|
| 108 |
-
|
| 109 |
-
### Examples
|
| 110 |
-
|
| 111 |
-
```bash
|
| 112 |
-
# Run MCP agent with verbose output
|
| 113 |
-
python run_agent.py --mode mcp -v
|
| 114 |
-
|
| 115 |
-
# Run with different model
|
| 116 |
-
python run_agent.py --mode mcp --model google/gemma-2-2b-it
|
| 117 |
|
| 118 |
-
|
| 119 |
-
python run_agent.py --mode mcp -n 50
|
| 120 |
-
|
| 121 |
-
# Play different games
|
| 122 |
-
python run_agent.py --mode mcp --game zork2
|
| 123 |
-
python run_agent.py --mode mcp --game advent # Colossal Cave Adventure
|
| 124 |
-
python run_agent.py --mode mcp --game enchanter # Infocom classic
|
| 125 |
-
python run_agent.py --mode mcp --game hhgg # Hitchhiker's Guide
|
| 126 |
-
|
| 127 |
-
# List all 57 available games
|
| 128 |
-
python run_agent.py --list-games
|
| 129 |
-
```
|
| 130 |
|
| 131 |
-
|
|
|
|
|
|
|
| 132 |
|
| 133 |
-
|
| 134 |
|
| 135 |
-
|
| 136 |
-
|------|-------------|
|
| 137 |
-
| `play_action(action)` | Execute a game command (north, take lamp, etc.) |
|
| 138 |
-
| `memory()` | Get current state (location, score, history) |
|
| 139 |
-
| `get_map()` | View explored locations and connections |
|
| 140 |
-
| `inventory()` | Check items you're carrying |
|
| 141 |
-
| `valid_actions()` | Get command hints |
|
| 142 |
-
| `reset_game(game)` | Start over or switch games |
|
| 143 |
-
| `list_games()` | See all 57 available games |
|
| 144 |
-
| `hint()` | Get contextual hints |
|
| 145 |
|
| 146 |
-
|
| 147 |
|
| 148 |
```bash
|
| 149 |
-
#
|
| 150 |
-
python
|
| 151 |
|
| 152 |
-
#
|
| 153 |
-
|
| 154 |
|
| 155 |
-
#
|
| 156 |
-
|
| 157 |
|
| 158 |
-
#
|
| 159 |
-
|
| 160 |
```
|
| 161 |
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
1. **MCP Server** (`mcp_server_template.py`) - Expose game functionality as MCP tools
|
| 168 |
-
2. **ReAct Agent** (`react_agent_template.py`) - Play text adventures using MCP
|
| 169 |
|
| 170 |
## Configuration
|
| 171 |
|
|
@@ -176,39 +146,14 @@ Create `.env` from `.env.example`:
|
|
| 176 |
```bash
|
| 177 |
# Required: HuggingFace token
|
| 178 |
HF_TOKEN=hf_your_token_here
|
| 179 |
-
|
| 180 |
-
# Optional: Model override (default: meta-llama/Llama-3.2-3B-Instruct)
|
| 181 |
-
HF_MODEL=meta-llama/Llama-3.2-3B-Instruct
|
| 182 |
```
|
| 183 |
|
| 184 |
-
###
|
| 185 |
-
|
| 186 |
-
| Model | Notes |
|
| 187 |
-
|-------|-------|
|
| 188 |
-
| `meta-llama/Llama-3.2-3B-Instruct` | Default, good balance |
|
| 189 |
-
| `google/gemma-2-2b-it` | Smaller, faster |
|
| 190 |
-
| `Qwen/Qwen2.5-7B-Instruct` | Good instruction following |
|
| 191 |
-
|
| 192 |
-
## Evaluation
|
| 193 |
-
|
| 194 |
-
Run the evaluator to test agent performance:
|
| 195 |
-
|
| 196 |
-
```bash
|
| 197 |
-
python evaluate.py --mode mcp --games zork1 --runs 3
|
| 198 |
-
```
|
| 199 |
-
|
| 200 |
-
Metrics:
|
| 201 |
-
- **Score**: Points earned in-game
|
| 202 |
-
- **Score %**: Score / Max possible score
|
| 203 |
-
- **Steps**: Number of actions taken
|
| 204 |
-
- **Time**: Elapsed time
|
| 205 |
-
|
| 206 |
-
## Resources
|
| 207 |
|
| 208 |
-
|
| 209 |
-
-
|
| 210 |
-
-
|
| 211 |
-
-
|
| 212 |
|
| 213 |
## License
|
| 214 |
|
|
|
|
| 20 |
|
| 21 |
1. **MCP Server** - Exposes text adventure games as MCP tools using FastMCP
|
| 22 |
2. **ReAct Agent** - An agent that uses MCP tools to play games with reasoning
|
| 23 |
+
3. **Submission Template** - Starter code for students to implement their own solutions
|
| 24 |
+
4. **Evaluation System** - Deterministic evaluation with seeded runs
|
| 25 |
+
5. **57 Games** - Zork trilogy, Infocom classics, and many more Z-machine games
|
| 26 |
|
| 27 |
## Architecture
|
| 28 |
|
|
|
|
| 64 |
### 2. Run an Agent
|
| 65 |
|
| 66 |
```bash
|
| 67 |
+
# Run the example MCP agent
|
| 68 |
+
python run_agent.py
|
| 69 |
|
| 70 |
+
# Play a different game
|
| 71 |
+
python run_agent.py --game advent
|
| 72 |
|
| 73 |
+
# Verbose output
|
| 74 |
+
python run_agent.py -v
|
| 75 |
+
|
| 76 |
+
# Limit steps
|
| 77 |
+
python run_agent.py -n 50
|
| 78 |
+
|
| 79 |
+
# List all 57 games
|
| 80 |
+
python run_agent.py --list-games
|
| 81 |
```
|
| 82 |
|
| 83 |
## Project Structure
|
| 84 |
|
| 85 |
```
|
| 86 |
.
|
| 87 |
+
+-- run_agent.py # Agent runner
|
| 88 |
+
+-- app.py # Gradio interface
|
| 89 |
+
+-- evaluation/ # Evaluation system
|
| 90 |
+
| +-- evaluate.py # Main CLI script
|
| 91 |
+
| +-- runner.py # Agent execution
|
| 92 |
+
| +-- metrics.py # Result tracking
|
| 93 |
+
+-- example_submission/ # Working example submission
|
| 94 |
+
| +-- agent.py # Full ReAct agent implementation
|
| 95 |
+
| +-- mcp_server.py # Full MCP server implementation
|
| 96 |
+
+-- submission_template/ # Student templates
|
| 97 |
| +-- README.md # Assignment instructions
|
| 98 |
+
| +-- agent.py # Agent starter code
|
| 99 |
+
| +-- mcp_server.py # MCP server starter code
|
| 100 |
+
| +-- app.py # HF Spaces app
|
|
|
|
|
|
|
|
|
|
| 101 |
+-- games/
|
| 102 |
| +-- zork_env.py # Jericho wrapper
|
| 103 |
+-- z-machine-games-master/ # Game files
|
| 104 |
```
|
| 105 |
|
| 106 |
+
## Assignment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
+
See [submission_template/README.md](submission_template/README.md) for the assignment instructions.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 109 |
|
| 110 |
+
You need to implement:
|
| 111 |
+
1. **MCP Server** (`mcp_server.py`) - Expose game functionality as MCP tools
|
| 112 |
+
2. **ReAct Agent** (`agent.py`) - Play text adventures using MCP tools
|
| 113 |
|
| 114 |
+
A working example is provided in `example_submission/`.
|
| 115 |
|
| 116 |
+
## Evaluation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
+
Run the evaluator to test submissions:
|
| 119 |
|
| 120 |
```bash
|
| 121 |
+
# Evaluate a submission
|
| 122 |
+
python evaluation/evaluate.py -s ./submission_template -g zork1 -t 5
|
| 123 |
|
| 124 |
+
# Evaluate the example
|
| 125 |
+
python evaluation/evaluate.py -s ./example_submission -g zork1 -t 3
|
| 126 |
|
| 127 |
+
# Evaluate multiple games
|
| 128 |
+
python evaluation/evaluate.py -s ./example_submission -g zork1 advent enchanter -t 3
|
| 129 |
|
| 130 |
+
# Save results to JSON
|
| 131 |
+
python evaluation/evaluate.py -s ./example_submission -g zork1 -t 3 -o results.json
|
| 132 |
```
|
| 133 |
|
| 134 |
+
Metrics:
|
| 135 |
+
- **Score**: Points earned in-game (averaged over trials)
|
| 136 |
+
- **Score %**: Score / Max possible score
|
| 137 |
+
- **Steps**: Number of actions taken
|
| 138 |
+
- **Time**: Elapsed time
|
|
|
|
|
|
|
| 139 |
|
| 140 |
## Configuration
|
| 141 |
|
|
|
|
| 146 |
```bash
|
| 147 |
# Required: HuggingFace token
|
| 148 |
HF_TOKEN=hf_your_token_here
|
|
|
|
|
|
|
|
|
|
| 149 |
```
|
| 150 |
|
| 151 |
+
### Fixed Model
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
|
| 153 |
+
All submissions use the same model for fairness:
|
| 154 |
+
- **Model**: `Qwen/Qwen2.5-72B-Instruct`
|
| 155 |
+
- **Temperature**: `0.0` (deterministic)
|
| 156 |
+
- **Seed**: Each trial receives a deterministic seed derived from a base seed, for reproducibility
|
| 157 |
|
| 158 |
## License
|
| 159 |
|
agents/__init__.py
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
from .base_agent import BaseAgent, AgentConfig
|
| 2 |
-
from .react_agent import ReActAgent, ReActConfig
|
| 3 |
-
from .mcp_react_agent import MCPReActAgent, MCPAgentConfig
|
| 4 |
-
|
| 5 |
-
__all__ = [
|
| 6 |
-
"BaseAgent", "AgentConfig",
|
| 7 |
-
"ReActAgent", "ReActConfig",
|
| 8 |
-
"MCPReActAgent", "MCPAgentConfig",
|
| 9 |
-
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
agents/base_agent.py
DELETED
|
@@ -1,78 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Base Agent Abstract Class
|
| 3 |
-
|
| 4 |
-
Defines the interface that all text adventure agents must implement.
|
| 5 |
-
"""
|
| 6 |
-
|
| 7 |
-
from abc import ABC, abstractmethod
|
| 8 |
-
from dataclasses import dataclass
|
| 9 |
-
from games.zork_env import GameState
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
@dataclass
|
| 13 |
-
class AgentConfig:
|
| 14 |
-
"""Configuration for an agent."""
|
| 15 |
-
name: str = "BaseAgent"
|
| 16 |
-
max_history: int = 20 # Maximum number of past interactions to remember
|
| 17 |
-
verbose: bool = False
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
class BaseAgent(ABC):
|
| 21 |
-
"""
|
| 22 |
-
Abstract base class for text adventure agents.
|
| 23 |
-
|
| 24 |
-
Students should extend this class and implement the `choose_action` method.
|
| 25 |
-
"""
|
| 26 |
-
|
| 27 |
-
def __init__(self, config: AgentConfig = None):
|
| 28 |
-
self.config = config or AgentConfig()
|
| 29 |
-
self.history: list[tuple[str, str, GameState]] = [] # (action, observation, state)
|
| 30 |
-
|
| 31 |
-
@abstractmethod
|
| 32 |
-
def choose_action(self, observation: str, game_state: GameState) -> str:
|
| 33 |
-
"""
|
| 34 |
-
Choose the next action based on the current observation and game state.
|
| 35 |
-
|
| 36 |
-
Args:
|
| 37 |
-
observation: The text observation from the game
|
| 38 |
-
game_state: The current GameState object with score, inventory, etc.
|
| 39 |
-
|
| 40 |
-
Returns:
|
| 41 |
-
A string action to take in the game (e.g., "go north", "take lamp")
|
| 42 |
-
"""
|
| 43 |
-
pass
|
| 44 |
-
|
| 45 |
-
def update_history(self, action: str, observation: str, game_state: GameState):
|
| 46 |
-
"""
|
| 47 |
-
Update the agent's history after taking an action.
|
| 48 |
-
|
| 49 |
-
Args:
|
| 50 |
-
action: The action that was taken
|
| 51 |
-
observation: The resulting observation
|
| 52 |
-
game_state: The resulting game state
|
| 53 |
-
"""
|
| 54 |
-
self.history.append((action, observation, game_state))
|
| 55 |
-
|
| 56 |
-
# Keep history bounded
|
| 57 |
-
if len(self.history) > self.config.max_history:
|
| 58 |
-
self.history = self.history[-self.config.max_history:]
|
| 59 |
-
|
| 60 |
-
def reset(self):
|
| 61 |
-
"""Reset the agent's internal state for a new game."""
|
| 62 |
-
self.history = []
|
| 63 |
-
|
| 64 |
-
def get_history_text(self) -> str:
|
| 65 |
-
"""Get a text summary of recent history for context."""
|
| 66 |
-
if not self.history:
|
| 67 |
-
return "No previous actions taken."
|
| 68 |
-
|
| 69 |
-
lines = []
|
| 70 |
-
for action, observation, state in self.history[-10:]: # Last 10 actions
|
| 71 |
-
lines.append(f"> {action}")
|
| 72 |
-
# Truncate long observations
|
| 73 |
-
obs_preview = observation[:200] + "..." if len(observation) > 200 else observation
|
| 74 |
-
lines.append(obs_preview)
|
| 75 |
-
lines.append(f"[Score: {state.score}, Moves: {state.moves}]")
|
| 76 |
-
lines.append("")
|
| 77 |
-
|
| 78 |
-
return "\n".join(lines)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
agents/react_agent.py
DELETED
|
@@ -1,243 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
ReAct Agent for Text Adventure Games
|
| 3 |
-
|
| 4 |
-
Implements a ReAct (Reasoning + Acting) loop using an LLM to play text adventures.
|
| 5 |
-
The agent thinks about its situation, decides on an action, and learns from the result.
|
| 6 |
-
"""
|
| 7 |
-
|
| 8 |
-
import os
|
| 9 |
-
from dataclasses import dataclass
|
| 10 |
-
from huggingface_hub import InferenceClient
|
| 11 |
-
from dotenv import load_dotenv
|
| 12 |
-
|
| 13 |
-
from agents.base_agent import BaseAgent, AgentConfig
|
| 14 |
-
from games.zork_env import GameState
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
@dataclass
|
| 18 |
-
class ReActConfig(AgentConfig):
|
| 19 |
-
"""Configuration for the ReAct agent."""
|
| 20 |
-
name: str = "ReActAgent"
|
| 21 |
-
model: str = "meta-llama/Llama-3.2-3B-Instruct"
|
| 22 |
-
temperature: float = 0.7
|
| 23 |
-
max_tokens: int = 300
|
| 24 |
-
max_history: int = 15
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
SYSTEM_PROMPT = """You are playing a classic text adventure game.
|
| 28 |
-
|
| 29 |
-
GOAL: Explore the world, solve puzzles, collect treasures, and maximize your score.
|
| 30 |
-
|
| 31 |
-
VALID COMMANDS:
|
| 32 |
-
- Movement: north, south, east, west, up, down, enter, exit
|
| 33 |
-
- Looking: look, examine <thing>, read <thing>
|
| 34 |
-
- Objects: take <item>, drop <item>, open <thing>, close <thing>
|
| 35 |
-
- Light: turn on lamp, light match
|
| 36 |
-
- Combat: attack <enemy> with <weapon>
|
| 37 |
-
- Other: inventory, wait, push <thing>, move <thing>
|
| 38 |
-
|
| 39 |
-
INVALID COMMANDS (do NOT use): check, inspect, search, grab, use, help
|
| 40 |
-
|
| 41 |
-
TIPS:
|
| 42 |
-
- Explore systematically - try all directions
|
| 43 |
-
- Examine interesting objects and read documents
|
| 44 |
-
- Pick up useful items (lamp, keys, weapons)
|
| 45 |
-
- Open containers to find hidden items
|
| 46 |
-
|
| 47 |
-
You MUST respond in EXACTLY this format (no markdown, no extra text):
|
| 48 |
-
THOUGHT: <your reasoning in one sentence>
|
| 49 |
-
ACTION: <one valid command>
|
| 50 |
-
|
| 51 |
-
Example response:
|
| 52 |
-
THOUGHT: I see a container here, I should check what is inside.
|
| 53 |
-
ACTION: open container"""
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
class ReActAgent(BaseAgent):
|
| 57 |
-
"""
|
| 58 |
-
A ReAct (Reasoning + Acting) agent that uses an LLM to play text adventures.
|
| 59 |
-
|
| 60 |
-
Uses Hugging Face Hub's Inference API.
|
| 61 |
-
"""
|
| 62 |
-
|
| 63 |
-
def __init__(self, config: ReActConfig = None, token: str = None):
|
| 64 |
-
super().__init__(config or ReActConfig())
|
| 65 |
-
self.config: ReActConfig = self.config
|
| 66 |
-
|
| 67 |
-
# Load token from environment if not provided
|
| 68 |
-
load_dotenv()
|
| 69 |
-
token = token or os.getenv("HF_TOKEN")
|
| 70 |
-
if not token:
|
| 71 |
-
raise ValueError("HF_TOKEN not found. Set HF_TOKEN environment variable or pass token parameter.")
|
| 72 |
-
|
| 73 |
-
# Override model from environment if set
|
| 74 |
-
env_model = os.getenv("HF_MODEL")
|
| 75 |
-
if env_model:
|
| 76 |
-
self.config.model = env_model
|
| 77 |
-
|
| 78 |
-
self.client = InferenceClient(token=token)
|
| 79 |
-
self.thoughts: list[str] = [] # Store reasoning history
|
| 80 |
-
|
| 81 |
-
def choose_action(self, observation: str, game_state: GameState) -> str:
|
| 82 |
-
"""
|
| 83 |
-
Use the LLM to reason about the situation and choose an action.
|
| 84 |
-
"""
|
| 85 |
-
# Build the prompt with context
|
| 86 |
-
prompt = self._build_prompt(observation, game_state)
|
| 87 |
-
|
| 88 |
-
# Call the LLM
|
| 89 |
-
response = self._call_llm(prompt)
|
| 90 |
-
|
| 91 |
-
# Parse the response
|
| 92 |
-
thought, action = self._parse_response(response)
|
| 93 |
-
|
| 94 |
-
# Store the thought for history
|
| 95 |
-
self.thoughts.append(thought)
|
| 96 |
-
|
| 97 |
-
if self.config.verbose:
|
| 98 |
-
print(f"\n[Thought] {thought}")
|
| 99 |
-
print(f"[Action] {action}")
|
| 100 |
-
|
| 101 |
-
return action
|
| 102 |
-
|
| 103 |
-
def _build_prompt(self, observation: str, game_state: GameState) -> str:
|
| 104 |
-
"""Build the prompt for the LLM with current context."""
|
| 105 |
-
parts = []
|
| 106 |
-
|
| 107 |
-
# Current status (compact for small models)
|
| 108 |
-
parts.append(f"Score: {game_state.score}/{game_state.max_score} | Moves: {game_state.moves}")
|
| 109 |
-
|
| 110 |
-
if game_state.inventory:
|
| 111 |
-
parts.append(f"Inventory: {', '.join(game_state.inventory)}")
|
| 112 |
-
|
| 113 |
-
# Recent history (only last 3 for small models)
|
| 114 |
-
if self.history:
|
| 115 |
-
parts.append("\nRecent:")
|
| 116 |
-
recent_actions = []
|
| 117 |
-
for action, obs, state in self.history[-3:]:
|
| 118 |
-
obs_short = obs[:150] + "..." if len(obs) > 150 else obs
|
| 119 |
-
parts.append(f"> {action}\n{obs_short}")
|
| 120 |
-
recent_actions.append(action)
|
| 121 |
-
|
| 122 |
-
# Warn about repeated actions
|
| 123 |
-
if len(recent_actions) >= 2 and len(set(recent_actions)) == 1:
|
| 124 |
-
parts.append(f"\n[WARNING: You've done '{recent_actions[0]}' multiple times. Try something different!]")
|
| 125 |
-
|
| 126 |
-
# Current observation
|
| 127 |
-
parts.append(f"\nNow:\n{observation}")
|
| 128 |
-
parts.append("\nWhat do you do next? (Try a NEW action)")
|
| 129 |
-
|
| 130 |
-
return "\n".join(parts)
|
| 131 |
-
|
| 132 |
-
def _call_llm(self, prompt: str) -> str:
|
| 133 |
-
"""Call the Hugging Face Inference API."""
|
| 134 |
-
try:
|
| 135 |
-
messages = [
|
| 136 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 137 |
-
{"role": "user", "content": prompt}
|
| 138 |
-
]
|
| 139 |
-
|
| 140 |
-
response = self.client.chat.completions.create(
|
| 141 |
-
model=self.config.model,
|
| 142 |
-
messages=messages,
|
| 143 |
-
temperature=self.config.temperature,
|
| 144 |
-
max_tokens=self.config.max_tokens,
|
| 145 |
-
)
|
| 146 |
-
return response.choices[0].message.content
|
| 147 |
-
except Exception as e:
|
| 148 |
-
print(f"Error calling LLM: {e}")
|
| 149 |
-
return "THOUGHT: Error occurred, trying a safe action.\nACTION: look"
|
| 150 |
-
|
| 151 |
-
def _parse_response(self, response: str) -> tuple[str, str]:
|
| 152 |
-
"""Parse the LLM response to extract thought and action."""
|
| 153 |
-
thought = ""
|
| 154 |
-
action = "look" # Default fallback action
|
| 155 |
-
|
| 156 |
-
lines = response.strip().split("\n")
|
| 157 |
-
|
| 158 |
-
for i, line in enumerate(lines):
|
| 159 |
-
line_upper = line.upper().strip()
|
| 160 |
-
|
| 161 |
-
if line_upper.startswith("THOUGHT:"):
|
| 162 |
-
# Extract thought (may span multiple lines until ACTION)
|
| 163 |
-
thought_parts = [line.split(":", 1)[1].strip()]
|
| 164 |
-
for j in range(i + 1, len(lines)):
|
| 165 |
-
if lines[j].upper().strip().startswith("ACTION:"):
|
| 166 |
-
break
|
| 167 |
-
thought_parts.append(lines[j].strip())
|
| 168 |
-
thought = " ".join(thought_parts).strip()
|
| 169 |
-
|
| 170 |
-
elif line_upper.startswith("ACTION:"):
|
| 171 |
-
action = line.split(":", 1)[1].strip().lower()
|
| 172 |
-
# Clean up the action - remove quotes, markdown, and extra whitespace
|
| 173 |
-
action = action.strip('"\'')
|
| 174 |
-
# Remove markdown bold/italic markers
|
| 175 |
-
action = action.replace("**", "").replace("*", "").replace("__", "").replace("_", " ")
|
| 176 |
-
# Remove backticks
|
| 177 |
-
action = action.replace("`", "")
|
| 178 |
-
# Clean up whitespace
|
| 179 |
-
action = " ".join(action.split())
|
| 180 |
-
break
|
| 181 |
-
|
| 182 |
-
# Validate action isn't empty
|
| 183 |
-
if not action or action.isspace():
|
| 184 |
-
action = "look"
|
| 185 |
-
|
| 186 |
-
return thought, action
|
| 187 |
-
|
| 188 |
-
def reset(self):
|
| 189 |
-
"""Reset the agent for a new game."""
|
| 190 |
-
super().reset()
|
| 191 |
-
self.thoughts = []
|
| 192 |
-
|
| 193 |
-
def get_summary(self) -> str:
|
| 194 |
-
"""Get a summary of the agent's reasoning."""
|
| 195 |
-
if not self.thoughts:
|
| 196 |
-
return "No thoughts recorded yet."
|
| 197 |
-
|
| 198 |
-
return "\n---\n".join(self.thoughts[-5:])
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
# Example usage and testing
|
| 202 |
-
if __name__ == "__main__":
|
| 203 |
-
import sys
|
| 204 |
-
from games.zork_env import TextAdventureEnv
|
| 205 |
-
|
| 206 |
-
# Use command line arg or default to zork1
|
| 207 |
-
game = sys.argv[1] if len(sys.argv) > 1 else "zork1"
|
| 208 |
-
|
| 209 |
-
# Quick test
|
| 210 |
-
config = ReActConfig(verbose=True)
|
| 211 |
-
|
| 212 |
-
try:
|
| 213 |
-
agent = ReActAgent(config)
|
| 214 |
-
env = TextAdventureEnv(game)
|
| 215 |
-
|
| 216 |
-
state = env.reset()
|
| 217 |
-
print("=" * 50)
|
| 218 |
-
print(f"{game.upper()} (using {agent.config.model})")
|
| 219 |
-
print("=" * 50)
|
| 220 |
-
print(state.observation)
|
| 221 |
-
|
| 222 |
-
# Run a few steps
|
| 223 |
-
for step in range(5):
|
| 224 |
-
print(f"\n{'=' * 50}")
|
| 225 |
-
print(f"Step {step + 1}")
|
| 226 |
-
print("=" * 50)
|
| 227 |
-
|
| 228 |
-
action = agent.choose_action(state.observation, state)
|
| 229 |
-
print(f"\n> {action}")
|
| 230 |
-
|
| 231 |
-
state = env.step(action)
|
| 232 |
-
print(f"\n{state.observation}")
|
| 233 |
-
print(f"\nScore: {state.score}/{state.max_score}")
|
| 234 |
-
|
| 235 |
-
agent.update_history(action, state.observation, state)
|
| 236 |
-
|
| 237 |
-
if state.done:
|
| 238 |
-
print("\nGAME OVER!")
|
| 239 |
-
break
|
| 240 |
-
|
| 241 |
-
except ValueError as e:
|
| 242 |
-
print(f"Setup error: {e}")
|
| 243 |
-
print("Make sure to set your HF_TOKEN in .env file")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -49,16 +49,22 @@ Get your HuggingFace token at: https://huggingface.co/settings/tokens
|
|
| 49 |
|
| 50 |
### 4. Explore the Templates
|
| 51 |
|
| 52 |
-
The
|
| 53 |
|
| 54 |
-
- `
|
| 55 |
-
- `
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
### 5. Test Your Implementation
|
| 58 |
|
| 59 |
```bash
|
| 60 |
-
# Run
|
| 61 |
-
python run_agent.py
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
# List available games (57 total!)
|
| 64 |
python run_agent.py --list-games
|
|
@@ -66,7 +72,7 @@ python run_agent.py --list-games
|
|
| 66 |
|
| 67 |
## Resources
|
| 68 |
|
| 69 |
-
- [
|
| 70 |
- [FastMCP Documentation](https://gofastmcp.com/)
|
| 71 |
- [MCP Protocol](https://modelcontextprotocol.io/)
|
| 72 |
"""
|
|
|
|
| 49 |
|
| 50 |
### 4. Explore the Templates
|
| 51 |
|
| 52 |
+
The submission template is in the `submission_template/` folder:
|
| 53 |
|
| 54 |
+
- `agent.py` - Your agent implementation (implement the StudentAgent class)
|
| 55 |
+
- `mcp_server.py` - Your MCP server implementation (add tools)
|
| 56 |
+
- `README.md` - Detailed instructions
|
| 57 |
+
|
| 58 |
+
A working example is in `example_submission/`.
|
| 59 |
|
| 60 |
### 5. Test Your Implementation
|
| 61 |
|
| 62 |
```bash
|
| 63 |
+
# Run the example agent
|
| 64 |
+
python run_agent.py
|
| 65 |
+
|
| 66 |
+
# Run with a different game
|
| 67 |
+
python run_agent.py --game advent
|
| 68 |
|
| 69 |
# List available games (57 total!)
|
| 70 |
python run_agent.py --list-games
|
|
|
|
| 72 |
|
| 73 |
## Resources
|
| 74 |
|
| 75 |
+
- [Submission Instructions](submission_template/README.md)
|
| 76 |
- [FastMCP Documentation](https://gofastmcp.com/)
|
| 77 |
- [MCP Protocol](https://modelcontextprotocol.io/)
|
| 78 |
"""
|
evaluation/__init__.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluation package for Text Adventure Agents.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from evaluation.metrics import EvaluationResult, TrialResult
|
| 6 |
+
from evaluation.runner import RunConfig, RunResult, run_agent_with_server
|
| 7 |
+
|
| 8 |
+
__all__ = [
|
| 9 |
+
"EvaluationResult",
|
| 10 |
+
"TrialResult",
|
| 11 |
+
"RunConfig",
|
| 12 |
+
"RunResult",
|
| 13 |
+
"run_agent_with_server",
|
| 14 |
+
]
|
evaluation/evaluate.py
ADDED
|
@@ -0,0 +1,559 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Evaluation Script for Text Adventure Agents
|
| 4 |
+
|
| 5 |
+
Evaluates student submissions by running their agent + MCP server
|
| 6 |
+
on a text adventure game for multiple trials and averaging scores.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
# Evaluate a student submission
|
| 10 |
+
python evaluation/evaluate.py \\
|
| 11 |
+
--submission path/to/student/submission \\
|
| 12 |
+
--game zork1 \\
|
| 13 |
+
--trials 5 \\
|
| 14 |
+
--max-steps 100
|
| 15 |
+
|
| 16 |
+
# Evaluate with reference agent comparison
|
| 17 |
+
python evaluation/evaluate.py \\
|
| 18 |
+
--submission path/to/student/submission \\
|
| 19 |
+
--game zork1 \\
|
| 20 |
+
--reference
|
| 21 |
+
|
| 22 |
+
# Evaluate from a Hugging Face Space
|
| 23 |
+
python evaluation/evaluate.py \\
|
| 24 |
+
--hf-space username/space-name \\
|
| 25 |
+
--game zork1
|
| 26 |
+
|
| 27 |
+
# Batch evaluate multiple submissions
|
| 28 |
+
python evaluation/evaluate.py \\
|
| 29 |
+
--submissions-dir path/to/all/submissions \\
|
| 30 |
+
--game zork1 \\
|
| 31 |
+
--output results.json
|
| 32 |
+
|
| 33 |
+
Examples:
|
| 34 |
+
# Quick test with 3 trials
|
| 35 |
+
python evaluation/evaluate.py -s ./submission_template -g zork1 -t 3
|
| 36 |
+
|
| 37 |
+
# Full evaluation for grading
|
| 38 |
+
python evaluation/evaluate.py -s ./submission_template -g advent -t 5 --max-steps 150
|
| 39 |
+
"""
|
| 40 |
+
|
| 41 |
+
import argparse
|
| 42 |
+
import asyncio
|
| 43 |
+
import json
|
| 44 |
+
import os
|
| 45 |
+
import random
|
| 46 |
+
import sys
|
| 47 |
+
import tempfile
|
| 48 |
+
import warnings
|
| 49 |
+
from datetime import datetime
|
| 50 |
+
from pathlib import Path
|
| 51 |
+
|
| 52 |
+
# Suppress asyncio subprocess cleanup warnings
|
| 53 |
+
warnings.filterwarnings("ignore", message=".*Event loop is closed.*")
|
| 54 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="multiprocessing.resource_tracker")
|
| 55 |
+
|
| 56 |
+
# Add parent directory to path
|
| 57 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 58 |
+
|
| 59 |
+
from evaluation.metrics import EvaluationResult, TrialResult
|
| 60 |
+
from evaluation.runner import RunConfig, run_agent_with_server, run_reference_agent
|
| 61 |
+
from games.zork_env import list_available_games
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def generate_seeds(base_seed: int, num_trials: int) -> list[int]:
    """Generate a deterministic list of per-trial seeds.

    Args:
        base_seed: Seed for the private generator the trial seeds are drawn from.
        num_trials: Number of seeds to produce.

    Returns:
        ``num_trials`` integers in ``[0, 2**32 - 1]``. The same arguments
        always yield the same list.
    """
    # Draw from a private generator instead of calling random.seed():
    # reseeding the module-level RNG would reset the random state of any
    # other code running in this process.
    rng = random.Random(base_seed)
    return [rng.randint(0, 2**32 - 1) for _ in range(num_trials)]
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
async def evaluate_submission(
    submission_path: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    verbose: bool = False,
) -> EvaluationResult:
    """
    Evaluate a student submission across multiple trials.

    Args:
        submission_path: Path to student's submission directory
        game: Name of the game to evaluate on
        num_trials: Number of trials to run (default: 5)
        max_steps: Maximum steps per trial (default: 100)
        base_seed: Base seed for reproducibility (default: 42)
        verbose: Print detailed output

    Returns:
        EvaluationResult with aggregated metrics. A trial that fails (either
        reported through the run result's ``error`` or by raising) is recorded
        with zeroed metrics and its error text rather than aborting the run.
    """
    # Locate agent and server files
    agent_path = submission_path / "agent.py"
    server_path = submission_path / "mcp_server.py"

    # Prefer a name declared in the README; fall back to the directory name.
    student_id = _student_id_from_readme(
        submission_path / "README.md",
        fallback=submission_path.name,
    )

    result = EvaluationResult(
        student_id=student_id,
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
    )

    # Generate deterministic seeds
    seeds = generate_seeds(base_seed, num_trials)

    print(f"\nEvaluating: {student_id}")
    print(f"Game: {game}")
    print(f"Trials: {num_trials}")
    print(f"Max steps: {max_steps}")
    print(f"Seeds: {seeds}")
    print("-" * 50)

    for trial_num, seed in enumerate(seeds, start=1):
        print(f"\nTrial {trial_num}/{num_trials} (seed={seed})...")

        config = RunConfig(
            agent_path=agent_path,
            server_path=server_path,
            game=game,
            max_steps=max_steps,
            seed=seed,
            verbose=verbose,
        )

        try:
            run_result = await run_agent_with_server(config)

            trial = TrialResult(
                trial_number=trial_num,
                final_score=run_result.final_score,
                max_score=run_result.max_score,
                moves=run_result.moves,
                locations_visited=len(run_result.locations_visited),
                game_completed=run_result.game_completed,
                error=run_result.error,
            )

            if run_result.error:
                print(f" Error: {run_result.error[:100]}...")
            else:
                print(f" Score: {run_result.final_score}")
                print(f" Moves: {run_result.moves}")
                print(f" Locations: {len(run_result.locations_visited)}")

        except Exception as e:
            # Top-level boundary for one trial: record the failure and move on
            # so a single bad trial cannot abort the whole evaluation.
            trial = TrialResult(
                trial_number=trial_num,
                final_score=0,
                max_score=0,
                moves=0,
                locations_visited=0,
                game_completed=False,
                error=str(e),
            )
            print(f" Exception: {e}")

        result.add_trial(trial)

    return result


def _student_id_from_readme(readme_path: Path, fallback: str) -> str:
    """Return the student name declared in a README, or *fallback*.

    The first line that is a top-level heading (``# ...``) or contains
    ``name:`` (any letter case) and yields non-empty text wins. The result
    is truncated to 50 characters. A missing or unreadable README, or one
    without a usable line, yields *fallback*.
    """
    import re

    try:
        content = readme_path.read_text(encoding="utf-8", errors="replace")
    except OSError:
        # Missing or unreadable README: keep the directory-derived id.
        return fallback

    for line in content.split("\n"):
        is_heading = line.startswith("# ")
        name_match = re.search(r"name:", line, flags=re.IGNORECASE)
        if not is_heading and name_match is None:
            continue

        # Text after "name:" when present, otherwise the heading text.
        raw = line[name_match.end():] if name_match else line[2:]
        # Trim whitespace and surrounding markdown emphasis markers.
        candidate = raw.strip(" \t\r*_`")[:50].strip()
        if candidate:
            return candidate

    return fallback
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
async def evaluate_with_reference(
    submission_path: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    verbose: bool = False,
) -> tuple[EvaluationResult, EvaluationResult]:
    """
    Run the student's submission, then the reference agent, on the same seeds.

    Both evaluations derive their per-trial seeds from ``base_seed``, so
    trial *k* of the student and trial *k* of the reference share a seed.

    Returns:
        Tuple of (student_result, reference_result)
    """
    # Student first.
    student_result = await evaluate_submission(
        submission_path=submission_path,
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
        base_seed=base_seed,
        verbose=verbose,
    )

    # Then the reference agent (run_reference_agent loads it from
    # example_submission/).
    banner = "=" * 50
    print("\n" + banner)
    print("Running reference agent for comparison...")
    print(banner)

    trial_seeds = generate_seeds(base_seed, num_trials)

    reference_result = EvaluationResult(
        student_id="reference_agent",
        game=game,
        num_trials=num_trials,
        max_steps=max_steps,
    )

    for trial_num, seed in enumerate(trial_seeds, start=1):
        print(f"\nReference Trial {trial_num}/{num_trials} (seed={seed})...")

        try:
            outcome = await run_reference_agent(
                game=game,
                max_steps=max_steps,
                seed=seed,
                verbose=verbose,
            )

            trial = TrialResult(
                trial_number=trial_num,
                final_score=outcome.final_score,
                max_score=outcome.max_score,
                moves=outcome.moves,
                locations_visited=len(outcome.locations_visited),
                game_completed=outcome.game_completed,
                error=outcome.error,
            )

            if outcome.error:
                print(f" Error: {outcome.error[:100]}...")
            else:
                print(f" Score: {outcome.final_score}")

        except Exception as exc:
            # A failed reference trial is recorded with zeroed metrics.
            trial = TrialResult(
                trial_number=trial_num,
                final_score=0,
                max_score=0,
                moves=0,
                locations_visited=0,
                game_completed=False,
                error=str(exc),
            )
            print(f" Exception: {exc}")

        reference_result.add_trial(trial)

    return student_result, reference_result
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
def clone_hf_space(space_id: str, target_dir: Path) -> Path:
    """Shallow-clone a Hugging Face Space into a local directory.

    Args:
        space_id: Space identifier, e.g. ``username/space-name``.
        target_dir: Directory to clone into.

    Returns:
        ``target_dir``.

    Raises:
        subprocess.CalledProcessError: If ``git clone`` exits non-zero. Git's
            captured stderr is printed first so the failure reason is visible.
    """
    import subprocess

    # HF Spaces are git repos at huggingface.co/spaces/
    repo_url = f"https://huggingface.co/spaces/{space_id}"

    print(f"Cloning {repo_url}...")
    try:
        subprocess.run(
            ["git", "clone", "--depth", "1", repo_url, str(target_dir)],
            check=True,
            capture_output=True,
        )
    except subprocess.CalledProcessError as exc:
        # The output is captured, so without this the user would only see
        # "returned non-zero exit status" and never git's own explanation.
        details = (exc.stderr or b"").decode("utf-8", errors="replace").strip()
        if details:
            print(f"git clone failed: {details}", file=sys.stderr)
        raise

    return target_dir
|
| 272 |
+
|
| 273 |
+
|
| 274 |
+
async def batch_evaluate(
    submissions_dir: Path,
    game: str,
    num_trials: int = 5,
    max_steps: int = 100,
    base_seed: int = 42,
    output_path: Path = None,
    verbose: bool = False,
) -> list[EvaluationResult]:
    """Evaluate every submission found directly under ``submissions_dir``.

    A submission is any immediate sub-directory containing ``agent.py``.
    Submissions are evaluated in sorted path order; one that raises is
    reported and left out of the results.

    Returns:
        The results ordered by mean score, best first. When ``output_path``
        is given, the results and a leaderboard are also written there as JSON.
    """
    # Immediate sub-directories that contain an agent.py, in path order.
    candidates = sorted(
        entry
        for entry in submissions_dir.iterdir()
        if entry.is_dir() and (entry / "agent.py").exists()
    )

    print(f"Found {len(candidates)} submissions")

    collected = []
    for submission_path in candidates:
        try:
            evaluation = await evaluate_submission(
                submission_path=submission_path,
                game=game,
                num_trials=num_trials,
                max_steps=max_steps,
                base_seed=base_seed,
                verbose=verbose,
            )
        except Exception as exc:
            print(f"Failed to evaluate {submission_path}: {exc}")
        else:
            collected.append(evaluation)

    # Highest mean score first.
    results = sorted(collected, key=lambda r: r.mean_score, reverse=True)

    if not output_path:
        return results

    leaderboard = []
    for rank, entry in enumerate(results, start=1):
        leaderboard.append(
            {
                "rank": rank,
                "student_id": entry.student_id,
                "mean_score": round(entry.mean_score, 2),
                "std_score": round(entry.std_score, 2),
            }
        )

    output_data = {
        "evaluation_date": datetime.now().isoformat(),
        "game": game,
        "num_trials": num_trials,
        "max_steps": max_steps,
        "base_seed": base_seed,
        "results": [entry.to_dict() for entry in results],
        "leaderboard": leaderboard,
    }

    with open(output_path, "w") as f:
        json.dump(output_data, f, indent=2)

    print(f"\nResults saved to {output_path}")

    return results
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
def print_comparison(student: EvaluationResult, reference: EvaluationResult):
    """Print a side-by-side metric table for the student and the reference.

    After the table, the student's mean score is shown as a percentage of
    the reference's; that line is skipped when the reference mean is not
    positive.
    """
    print("\n" + "=" * 60)
    print("EVALUATION COMPARISON")
    print("=" * 60)

    print(f"\n{'Metric':<25} {'Student':<15} {'Reference':<15}")
    print("-" * 55)

    # (row label, attribute to read, numeric part of the format spec)
    rows = (
        ("Mean Score", "mean_score", ".2f"),
        ("Std Score", "std_score", ".2f"),
        ("Min Score", "min_score", ""),
        ("Max Score", "max_score_achieved", ""),
        ("Mean Moves", "mean_moves", ".1f"),
        ("Mean Locations", "mean_locations", ".1f"),
        ("Successful Trials", "successful_trials", ""),
    )
    for label, attr, precision in rows:
        cell_spec = f"<15{precision}"
        student_cell = format(getattr(student, attr), cell_spec)
        reference_cell = format(getattr(reference, attr), cell_spec)
        print(f"{label:<25} {student_cell} {reference_cell}")

    # Performance ratio
    if reference.mean_score > 0:
        ratio = student.mean_score / reference.mean_score * 100
        print(f"\nStudent performance: {ratio:.1f}% of reference")
|
| 359 |
+
|
| 360 |
+
|
| 361 |
+
def main():
    """Command-line entry point: parse arguments and run the chosen evaluation.

    Modes:
        * ``--list-games``: print the available games and return. No input
          option is needed for this.
        * ``--hf-space``: clone the Space into a temporary directory and
          evaluate it as a single submission.
        * ``--submissions-dir``: batch-evaluate and print a leaderboard.
        * ``--submission``: evaluate one local submission directory.

    Exits with status 1 for an unknown game or a missing path, and with
    argparse's usage error when no input option is supplied.
    """
    parser = _build_arg_parser()
    args = parser.parse_args()

    # List games if requested. This runs before the input check so that
    # `--list-games` works on its own.
    if args.list_games:
        games = list_available_games()
        print(f"Available games ({len(games)}):")
        for game in games:
            print(f" - {game}")
        return

    # The input group is not marked required (that would block --list-games),
    # so enforce the requirement here with the usual argparse usage error.
    if (
        args.submission is None
        and args.hf_space is None
        and args.submissions_dir is None
    ):
        parser.error(
            "one of the arguments -s/--submission --hf-space --submissions-dir "
            "is required"
        )

    # Validate game
    available_games = list_available_games()
    if args.game not in available_games:
        print(f"Error: Unknown game '{args.game}'")
        print(f"Available: {', '.join(available_games[:10])}...")
        sys.exit(1)

    # Handle HF Space input
    if args.hf_space is not None:
        with tempfile.TemporaryDirectory() as tmpdir:
            submission_path = clone_hf_space(
                args.hf_space, Path(tmpdir) / "submission"
            )
            # Evaluate inside the context: the clone is deleted on exit.
            _evaluate_single_submission(args, submission_path)

    # Handle batch evaluation
    elif args.submissions_dir is not None:
        if not args.submissions_dir.is_dir():
            print(f"Error: Submissions directory not found: {args.submissions_dir}")
            sys.exit(1)

        results = asyncio.run(
            batch_evaluate(
                submissions_dir=args.submissions_dir,
                game=args.game,
                num_trials=args.trials,
                max_steps=args.max_steps,
                base_seed=args.seed,
                output_path=args.output,
                verbose=args.verbose,
            )
        )

        # Print leaderboard
        print("\n" + "=" * 60)
        print("LEADERBOARD")
        print("=" * 60)
        print(f"\n{'Rank':<6} {'Student':<30} {'Mean Score':<12} {'Std':<10}")
        print("-" * 58)
        for i, r in enumerate(results):
            print(f"{i+1:<6} {r.student_id:<30} {r.mean_score:<12.2f} {r.std_score:<10.2f}")

    # Handle single submission
    else:
        submission_path = args.submission

        if not submission_path.exists():
            print(f"Error: Submission path not found: {submission_path}")
            sys.exit(1)

        _evaluate_single_submission(args, submission_path)


def _build_arg_parser() -> argparse.ArgumentParser:
    """Build the command-line parser for the evaluation script."""
    parser = argparse.ArgumentParser(
        description="Evaluate text adventure agent submissions",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )

    # Input options (mutually exclusive). Presence is checked in main() so
    # that --list-games can be used without an input option.
    input_group = parser.add_mutually_exclusive_group()
    input_group.add_argument(
        "-s", "--submission",
        type=Path,
        help="Path to student submission directory",
    )
    input_group.add_argument(
        "--hf-space",
        type=str,
        help="Hugging Face Space ID (e.g., username/space-name)",
    )
    input_group.add_argument(
        "--submissions-dir",
        type=Path,
        help="Directory containing multiple submissions (for batch evaluation)",
    )

    # Evaluation parameters
    parser.add_argument(
        "-g", "--game",
        type=str,
        default="lostpig",
        help="Game to evaluate on (default: lostpig)",
    )
    parser.add_argument(
        "-t", "--trials",
        type=int,
        default=5,
        help="Number of trials to run (default: 5)",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=100,
        help="Maximum steps per trial (default: 100)",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=42,
        help="Base random seed for reproducibility (default: 42)",
    )

    # Reference comparison
    parser.add_argument(
        "-r", "--reference",
        action="store_true",
        help="Also run reference agent (from example_submission) for comparison",
    )

    # Output options
    parser.add_argument(
        "-o", "--output",
        type=Path,
        help="Output file for results (JSON)",
    )
    parser.add_argument(
        "-v", "--verbose",
        action="store_true",
        help="Print detailed output",
    )
    parser.add_argument(
        "--list-games",
        action="store_true",
        help="List available games and exit",
    )

    return parser


def _evaluate_single_submission(args: argparse.Namespace, submission_path: Path) -> None:
    """Evaluate one submission, print its report, and honour ``--output``.

    Shared by the local-path and Hugging Face Space modes so both print the
    same report and both save results when ``--output`` is given.
    """
    if args.reference:
        student_result, reference_result = asyncio.run(
            evaluate_with_reference(
                submission_path=submission_path,
                game=args.game,
                num_trials=args.trials,
                max_steps=args.max_steps,
                base_seed=args.seed,
                verbose=args.verbose,
            )
        )
        print_comparison(student_result, reference_result)

        output_data = {
            "evaluation_date": datetime.now().isoformat(),
            "student": student_result.to_dict(),
            "reference": reference_result.to_dict(),
        }
    else:
        result = asyncio.run(
            evaluate_submission(
                submission_path=submission_path,
                game=args.game,
                num_trials=args.trials,
                max_steps=args.max_steps,
                base_seed=args.seed,
                verbose=args.verbose,
            )
        )

        print("\n" + result.summary_str())
        output_data = result.to_dict()

    # Save results if output specified
    if args.output:
        with open(args.output, "w") as f:
            json.dump(output_data, f, indent=2)
        print(f"\nResults saved to {args.output}")
|
| 556 |
+
|
| 557 |
+
|
| 558 |
+
if __name__ == "__main__":
|
| 559 |
+
main()
|
evaluation/metrics.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Evaluation Metrics for Text Adventure Agents
|
| 3 |
+
|
| 4 |
+
Tracks scores across multiple trials and computes statistics.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import statistics
|
| 8 |
+
from dataclasses import dataclass, field
|
| 9 |
+
from typing import Optional
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
@dataclass
class TrialResult:
    """Outcome of one evaluation trial."""

    trial_number: int
    final_score: int
    max_score: int
    moves: int
    locations_visited: int
    game_completed: bool
    # None means the trial finished without an error.
    error: Optional[str] = None

    @property
    def score_percentage(self) -> float:
        """Final score as a percentage of ``max_score`` (0.0 when it is 0)."""
        return (
            0.0
            if self.max_score == 0
            else (self.final_score / self.max_score) * 100
        )

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict of the fields plus the percentage."""
        return dict(
            trial_number=self.trial_number,
            final_score=self.final_score,
            max_score=self.max_score,
            score_percentage=round(self.score_percentage, 2),
            moves=self.moves,
            locations_visited=self.locations_visited,
            game_completed=self.game_completed,
            error=self.error,
        )
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class EvaluationResult:
    """Results of all trials for one submission, with summary statistics.

    Every statistic is computed over the trials whose ``error`` is None;
    trials that recorded an error are excluded from them.
    """

    student_id: str
    game: str
    num_trials: int
    max_steps: int
    trials: list[TrialResult] = field(default_factory=list)

    def _error_free_trials(self) -> list:
        """Return the trials that finished without an error."""
        return [trial for trial in self.trials if trial.error is None]

    @property
    def scores(self) -> list[int]:
        """Final scores of the error-free trials, in trial order."""
        return [trial.final_score for trial in self._error_free_trials()]

    @property
    def mean_score(self) -> float:
        """Mean final score, or 0.0 when no trial succeeded."""
        values = self.scores
        return statistics.mean(values) if values else 0.0

    @property
    def std_score(self) -> float:
        """Sample standard deviation of the scores (0.0 below two scores)."""
        values = self.scores
        return statistics.stdev(values) if len(values) >= 2 else 0.0

    @property
    def min_score(self) -> int:
        """Lowest final score, or 0 when no trial succeeded."""
        values = self.scores
        return min(values) if values else 0

    @property
    def max_score_achieved(self) -> int:
        """Highest final score, or 0 when no trial succeeded."""
        values = self.scores
        return max(values) if values else 0

    @property
    def successful_trials(self) -> int:
        """Count of trials that finished without an error."""
        return len(self._error_free_trials())

    @property
    def mean_moves(self) -> float:
        """Mean move count over error-free trials (0.0 when there are none)."""
        move_counts = [trial.moves for trial in self._error_free_trials()]
        return statistics.mean(move_counts) if move_counts else 0.0

    @property
    def mean_locations(self) -> float:
        """Mean locations visited over error-free trials (0.0 when none)."""
        location_counts = [
            trial.locations_visited for trial in self._error_free_trials()
        ]
        return statistics.mean(location_counts) if location_counts else 0.0

    def add_trial(self, trial: TrialResult) -> None:
        """Append one trial result."""
        self.trials.append(trial)

    def to_dict(self) -> dict:
        """Return a JSON-serializable dict: metadata, summary, and each trial."""
        summary = {
            "mean_score": round(self.mean_score, 2),
            "std_score": round(self.std_score, 2),
            "min_score": self.min_score,
            "max_score": self.max_score_achieved,
            "mean_moves": round(self.mean_moves, 2),
            "mean_locations": round(self.mean_locations, 2),
        }
        return {
            "student_id": self.student_id,
            "game": self.game,
            "num_trials": self.num_trials,
            "max_steps": self.max_steps,
            "successful_trials": self.successful_trials,
            "summary": summary,
            "trials": [trial.to_dict() for trial in self.trials],
        }

    def summary_str(self) -> str:
        """Return a multi-line, human-readable report of the evaluation."""
        header = [
            f"Evaluation Results: {self.student_id}",
            "=" * 50,
            f"Game: {self.game}",
            f"Trials: {self.successful_trials}/{self.num_trials} successful",
            f"Max steps per trial: {self.max_steps}",
        ]
        score_section = [
            "Score Statistics:",
            f" Mean: {self.mean_score:.2f}",
            f" Std: {self.std_score:.2f}",
            f" Min: {self.min_score}",
            f" Max: {self.max_score_achieved}",
        ]
        exploration_section = [
            "Exploration:",
            f" Mean moves: {self.mean_moves:.1f}",
            f" Mean locations: {self.mean_locations:.1f}",
        ]
        footer = [f"Per-Trial Scores: {self.scores}"]

        # Sections are separated by a single blank line.
        report = header + [""] + score_section + [""] + exploration_section + [""] + footer
        return "\n".join(report)
|
evaluation/runner.py
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Agent Runner for Evaluation
|
| 3 |
+
|
| 4 |
+
Handles spawning the MCP server subprocess and running the agent.
|
| 5 |
+
Provides isolation between trials and proper cleanup.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import asyncio
|
| 9 |
+
import importlib.util
|
| 10 |
+
import os
|
| 11 |
+
import subprocess
|
| 12 |
+
import sys
|
| 13 |
+
import time
|
| 14 |
+
from dataclasses import dataclass
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Optional
|
| 17 |
+
|
| 18 |
+
from fastmcp import Client
|
| 19 |
+
from fastmcp.client.transports import StdioTransport
|
| 20 |
+
|
| 21 |
+
# Add parent directory to path
|
| 22 |
+
sys.path.insert(0, str(Path(__file__).parent.parent))
|
| 23 |
+
from games.zork_env import list_available_games
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
@dataclass
class RunConfig:
    """Everything needed to run one agent trial against one MCP server."""

    # Python file that must define the agent class to load.
    agent_path: Path
    # Python file started as the MCP server subprocess.
    server_path: Path
    # Game name; also exported to the server as the GAME environment variable.
    game: str
    # Step limit handed to the agent's run().
    max_steps: int
    # Seed handed to the agent's run().
    seed: int
    # Forwarded to the agent's run() as its verbose flag.
    verbose: bool = False
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
@dataclass
class RunResult:
    """Result of a single agent run."""

    final_score: int
    max_score: int
    moves: int
    locations_visited: set[str]
    game_completed: bool
    # None means the run finished without an error.
    error: Optional[str] = None
    # (thought, action, result) entries. None is accepted and replaced by a
    # fresh list in __post_init__, so instances never share one history list.
    history: Optional[list[tuple[str, str, str]]] = None

    def __post_init__(self) -> None:
        """Normalize a missing history to a new empty list."""
        if self.history is None:
            self.history = []
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def load_agent_class(agent_path: Path):
    """
    Dynamically load the agent class from student's agent.py.

    Expects the student file to define a class called 'StudentAgent'
    with an async method 'run(client, game, max_steps, seed)'.

    Args:
        agent_path: Path to the Python source file to load.

    Returns:
        The ``StudentAgent`` class object defined by that file.

    Raises:
        ValueError: If no import spec can be built for the path (for
            example, it is not a ``.py`` file) or the module does not
            define ``StudentAgent``.
        Exception: Anything raised while executing the student's module.
    """
    spec = importlib.util.spec_from_file_location("student_agent", agent_path)
    # spec_from_file_location returns None when no loader handles the path.
    if spec is None or spec.loader is None:
        raise ValueError(f"Cannot load agent module from {agent_path}")
    module = importlib.util.module_from_spec(spec)

    # Add the submission directory to path so the agent's sibling modules
    # can be imported.
    submission_dir = str(agent_path.parent)
    if submission_dir not in sys.path:
        sys.path.insert(0, submission_dir)

    # Register the module before executing it, as the importlib recipe for
    # importing a source file directly does; code that looks its own module
    # up through sys.modules[__name__] fails otherwise.
    sys.modules[spec.name] = module
    try:
        spec.loader.exec_module(module)
    except BaseException:
        # Do not leave a half-initialized module registered.
        sys.modules.pop(spec.name, None)
        raise

    if not hasattr(module, "StudentAgent"):
        raise ValueError(
            f"Agent file {agent_path} must define a 'StudentAgent' class"
        )

    return module.StudentAgent
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
async def run_agent_with_server(config: RunConfig) -> RunResult:
    """
    Run an agent against its MCP server for one trial.

    Steps:
        1. Check that the agent file, the server file and the game exist.
        2. Load the agent class and instantiate it.
        3. Start the server with the current Python interpreter over a
           stdio transport, with GAME set in its environment.
        4. Call the agent's ``run`` with the connected client and return
           whatever it returns.

    This function does not raise for ordinary failures: a failed check or
    any exception from steps 2-4 comes back as a zeroed RunResult whose
    ``error`` holds the message (and the traceback, for exceptions).
    """

    def failure(message: str) -> RunResult:
        # Zeroed result carrying only the error text.
        return RunResult(
            final_score=0,
            max_score=0,
            moves=0,
            locations_visited=set(),
            game_completed=False,
            error=message,
        )

    # Validate paths
    if not config.agent_path.exists():
        return failure(f"Agent file not found: {config.agent_path}")

    if not config.server_path.exists():
        return failure(f"Server file not found: {config.server_path}")

    # Validate game
    known_games = list_available_games()
    if config.game not in known_games:
        return failure(
            f"Unknown game: {config.game}. Available: {known_games[:10]}..."
        )

    try:
        # Load and instantiate the agent.
        agent_cls = load_agent_class(config.agent_path)
        agent = agent_cls()

        # The server learns which game to load from the GAME variable.
        server_env = os.environ.copy()
        server_env["GAME"] = config.game

        transport = StdioTransport(
            command=sys.executable,
            args=[str(config.server_path)],
            env=server_env,
        )

        # Connect to the server and run the agent
        async with Client(transport) as client:
            outcome = await agent.run(
                client=client,
                game=config.game,
                max_steps=config.max_steps,
                seed=config.seed,
                verbose=config.verbose,
            )

        return outcome

    except Exception as exc:
        import traceback

        return failure(
            f"{type(exc).__name__}: {str(exc)}\n{traceback.format_exc()}"
        )
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
async def run_reference_agent(
    game: str,
    max_steps: int,
    seed: int,
    verbose: bool = False,
) -> RunResult:
    """
    Run the baseline agent shipped in ``example_submission`` for one trial.

    The reference agent and server are the ``agent.py`` and
    ``mcp_server.py`` in the ``example_submission`` directory that sits
    next to this package's parent directory.
    """
    reference_dir = Path(__file__).parent.parent / "example_submission"

    return await run_agent_with_server(
        RunConfig(
            agent_path=reference_dir / "agent.py",
            server_path=reference_dir / "mcp_server.py",
            game=game,
            max_steps=max_steps,
            seed=seed,
            verbose=verbose,
        )
    )
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
def run_single_trial(config: RunConfig) -> RunResult:
    """Blocking entry point: run one trial to completion and return its result."""
    # asyncio.run creates a fresh event loop for this single trial.
    trial = run_agent_with_server(config)
    return asyncio.run(trial)
|
example_submission/README.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Example: MCP ReAct Agent
|
| 2 |
+
|
| 3 |
+
This is a complete, working example submission that demonstrates a ReAct agent using MCP.
|
| 4 |
+
|
| 5 |
+
## Approach
|
| 6 |
+
|
| 7 |
+
This agent uses the full ReAct pattern:
|
| 8 |
+
1. **Thought**: Reason about the current situation
|
| 9 |
+
2. **Tool**: Choose and call an MCP tool
|
| 10 |
+
3. **Observation**: Process the result
|
| 11 |
+
|
| 12 |
+
Features:
|
| 13 |
+
- Loop detection (avoids repeating the same action)
|
| 14 |
+
- Action validation (fixes common invalid verbs)
|
| 15 |
+
- Score tracking
|
| 16 |
+
- History management
|
| 17 |
+
|
| 18 |
+
## Files
|
| 19 |
+
|
| 20 |
+
- `agent.py` - ReAct agent with full implementation
|
| 21 |
+
- `mcp_server.py` - MCP server with memory, map, and inventory tools
|
| 22 |
+
|
| 23 |
+
## Testing
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
# Test locally (run from this directory; requires HF_TOKEN in your .env file)
|
| 27 |
+
python agent.py
|
| 28 |
+
```
|
agents/mcp_react_agent.py → example_submission/agent.py
RENAMED
|
@@ -1,40 +1,68 @@
|
|
| 1 |
"""
|
| 2 |
-
MCP ReAct Agent
|
| 3 |
|
| 4 |
-
A
|
| 5 |
-
This
|
| 6 |
-
|
| 7 |
-
Features:
|
| 8 |
-
- FastMCP Client integration for MCP server communication
|
| 9 |
-
- ReAct loop (Thought -> Tool -> Observation)
|
| 10 |
-
- Loop detection and action validation
|
| 11 |
-
- History tracking and memory management
|
| 12 |
-
- Score tracking and game over detection
|
| 13 |
"""
|
| 14 |
|
| 15 |
-
import asyncio
|
| 16 |
import json
|
| 17 |
import os
|
| 18 |
import re
|
| 19 |
-
import sys
|
| 20 |
from dataclasses import dataclass, field
|
| 21 |
-
from
|
|
|
|
| 22 |
from dotenv import load_dotenv
|
| 23 |
-
from
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
@dataclass
|
| 28 |
-
class
|
| 29 |
-
"""
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
SYSTEM_PROMPT = """You are an expert text adventure game player. Your goal is to explore, collect treasures, and maximize your score.
|
| 39 |
|
| 40 |
AVAILABLE TOOLS (use these via MCP):
|
|
@@ -42,9 +70,6 @@ AVAILABLE TOOLS (use these via MCP):
|
|
| 42 |
2. memory - Get current game state, score, and recent history
|
| 43 |
3. get_map - See explored locations and connections
|
| 44 |
4. inventory - Check what you're carrying
|
| 45 |
-
5. hint - Get a hint if stuck
|
| 46 |
-
6. list_games - See available games
|
| 47 |
-
7. reset_game - Switch to a different game
|
| 48 |
|
| 49 |
VALID GAME COMMANDS for play_action:
|
| 50 |
- Movement: north, south, east, west, up, down, enter, exit
|
|
@@ -84,176 +109,145 @@ STRATEGY:
|
|
| 84 |
DO NOT repeat the same action multiple times in a row."""
|
| 85 |
|
| 86 |
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
"""
|
| 89 |
-
|
| 90 |
|
| 91 |
-
This
|
| 92 |
-
-
|
| 93 |
- Loop detection
|
| 94 |
- Action validation
|
| 95 |
-
- Score tracking
|
| 96 |
"""
|
| 97 |
|
| 98 |
-
def __init__(self
|
| 99 |
-
"""
|
| 100 |
-
Initialize the MCP ReAct agent.
|
| 101 |
-
|
| 102 |
-
Args:
|
| 103 |
-
mcp_server_path: Path to the MCP server script
|
| 104 |
-
config: Agent configuration
|
| 105 |
-
"""
|
| 106 |
-
load_dotenv()
|
| 107 |
-
|
| 108 |
-
self.mcp_server_path = mcp_server_path
|
| 109 |
-
self.config = config or MCPAgentConfig()
|
| 110 |
-
|
| 111 |
-
# Override model from environment if set
|
| 112 |
-
env_model = os.getenv("HF_MODEL")
|
| 113 |
-
if env_model:
|
| 114 |
-
self.config.model = env_model
|
| 115 |
-
|
| 116 |
-
# Initialize LLM client
|
| 117 |
-
token = os.getenv("HF_TOKEN")
|
| 118 |
-
if not token:
|
| 119 |
-
raise ValueError("HF_TOKEN not found. Set it in your .env file.")
|
| 120 |
-
self.llm = InferenceClient(token=token)
|
| 121 |
-
|
| 122 |
-
# Agent state
|
| 123 |
self.history: list[dict] = []
|
| 124 |
-
self.
|
| 125 |
self.score: int = 0
|
| 126 |
-
self.max_score: int = 350
|
| 127 |
-
self.recent_actions: list[str] = [] # For loop detection
|
| 128 |
|
| 129 |
-
async def run(
|
| 130 |
-
|
| 131 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
Returns:
|
| 137 |
-
Dictionary with game results
|
| 138 |
-
"""
|
| 139 |
-
import time
|
| 140 |
-
start_time = time.time()
|
| 141 |
-
step = 0
|
| 142 |
-
game_over = False
|
| 143 |
-
game_name = self.config.game
|
| 144 |
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
print("=" * 60)
|
| 149 |
|
| 150 |
-
#
|
| 151 |
-
|
| 152 |
-
|
| 153 |
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
command=sys.executable,
|
| 157 |
-
args=[self.mcp_server_path],
|
| 158 |
-
env=env,
|
| 159 |
-
)
|
| 160 |
|
| 161 |
-
#
|
| 162 |
-
|
| 163 |
-
#
|
| 164 |
-
|
| 165 |
-
tool_names = [t.name for t in tools]
|
| 166 |
-
print(f"\nConnected to MCP server. Tools: {tool_names}")
|
| 167 |
|
| 168 |
-
#
|
| 169 |
-
|
| 170 |
-
observation = self._extract_result(result)
|
| 171 |
-
print(f"\n{observation}\n")
|
| 172 |
|
| 173 |
-
# Parse
|
| 174 |
-
self.
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
print(f"
|
| 179 |
-
print(f"
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
self.thoughts.append(thought)
|
| 192 |
-
|
| 193 |
-
if self.config.verbose:
|
| 194 |
-
print(f"\n[THOUGHT] {thought}")
|
| 195 |
-
print(f"[TOOL] {tool_name}({tool_args})")
|
| 196 |
-
|
| 197 |
-
# Validate and fix common issues
|
| 198 |
-
tool_name, tool_args = self._validate_tool_call(tool_name, tool_args, tool_names)
|
| 199 |
-
|
| 200 |
-
# Check for loops
|
| 201 |
-
if tool_name == "play_action":
|
| 202 |
-
action = tool_args.get("action", "look")
|
| 203 |
-
self.recent_actions.append(action)
|
| 204 |
-
if len(self.recent_actions) > 5:
|
| 205 |
-
self.recent_actions = self.recent_actions[-5:]
|
| 206 |
-
|
| 207 |
-
# Detect loops
|
| 208 |
-
if len(self.recent_actions) >= 3 and len(set(self.recent_actions[-3:])) == 1:
|
| 209 |
-
print(f"\n[WARNING] Loop detected - repeating '{action}'")
|
| 210 |
-
# Force a different action
|
| 211 |
-
tool_args = {"action": "look"}
|
| 212 |
-
self.recent_actions.append("look")
|
| 213 |
-
|
| 214 |
-
# Execute tool via MCP
|
| 215 |
-
try:
|
| 216 |
-
result = await client.call_tool(tool_name, tool_args)
|
| 217 |
-
observation = self._extract_result(result)
|
| 218 |
-
print(f"\n{observation}")
|
| 219 |
-
except Exception as e:
|
| 220 |
-
observation = f"Error executing tool: {e}"
|
| 221 |
-
print(f"\n[ERROR] {e}")
|
| 222 |
|
| 223 |
-
#
|
| 224 |
-
self.
|
| 225 |
-
|
| 226 |
-
|
| 227 |
-
"
|
| 228 |
-
"
|
| 229 |
-
"result": observation[:200]
|
| 230 |
-
})
|
| 231 |
-
if len(self.history) > self.config.max_history:
|
| 232 |
-
self.history = self.history[-self.config.max_history:]
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
-
|
| 247 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
|
| 249 |
def _build_prompt(self, observation: str) -> str:
|
| 250 |
"""Build the prompt for the LLM with context."""
|
| 251 |
parts = []
|
| 252 |
|
| 253 |
-
|
| 254 |
-
parts.append(f"Current Score: {self.score}/{self.max_score}")
|
| 255 |
|
| 256 |
-
# Recent history
|
| 257 |
if self.history:
|
| 258 |
parts.append("\nRecent actions:")
|
| 259 |
for entry in self.history[-3:]:
|
|
@@ -265,31 +259,11 @@ class MCPReActAgent:
|
|
| 265 |
if self.recent_actions and len(set(self.recent_actions[-3:])) == 1:
|
| 266 |
parts.append(f"\n[WARNING: You've been doing '{self.recent_actions[-1]}' repeatedly. TRY SOMETHING DIFFERENT!]")
|
| 267 |
|
| 268 |
-
# Current observation
|
| 269 |
parts.append(f"\nCurrent situation:\n{observation}")
|
| 270 |
parts.append("\nWhat do you do next?")
|
| 271 |
|
| 272 |
return "\n".join(parts)
|
| 273 |
|
| 274 |
-
def _call_llm(self, prompt: str) -> str:
|
| 275 |
-
"""Call the LLM for reasoning."""
|
| 276 |
-
try:
|
| 277 |
-
messages = [
|
| 278 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 279 |
-
{"role": "user", "content": prompt}
|
| 280 |
-
]
|
| 281 |
-
|
| 282 |
-
response = self.llm.chat.completions.create(
|
| 283 |
-
model=self.config.model,
|
| 284 |
-
messages=messages,
|
| 285 |
-
temperature=self.config.temperature,
|
| 286 |
-
max_tokens=self.config.max_tokens,
|
| 287 |
-
)
|
| 288 |
-
return response.choices[0].message.content
|
| 289 |
-
except Exception as e:
|
| 290 |
-
print(f"[LLM Error] {e}")
|
| 291 |
-
return "THOUGHT: LLM error, trying look.\nTOOL: play_action\nARGS: {\"action\": \"look\"}"
|
| 292 |
-
|
| 293 |
def _parse_response(self, response: str, valid_tools: list[str]) -> tuple[str, str, dict]:
|
| 294 |
"""Parse the LLM response to extract thought, tool, and arguments."""
|
| 295 |
thought = "No reasoning provided"
|
|
@@ -298,7 +272,7 @@ class MCPReActAgent:
|
|
| 298 |
|
| 299 |
lines = response.strip().split("\n")
|
| 300 |
|
| 301 |
-
for
|
| 302 |
line_clean = line.strip()
|
| 303 |
line_upper = line_clean.upper()
|
| 304 |
|
|
@@ -307,7 +281,6 @@ class MCPReActAgent:
|
|
| 307 |
|
| 308 |
elif line_upper.startswith("TOOL:"):
|
| 309 |
raw_tool = line_clean.split(":", 1)[1].strip().lower()
|
| 310 |
-
# Clean up common issues
|
| 311 |
raw_tool = raw_tool.replace("**", "").replace("*", "").replace("`", "")
|
| 312 |
raw_tool = raw_tool.split()[0] if raw_tool else "play_action"
|
| 313 |
tool_name = raw_tool
|
|
@@ -315,16 +288,13 @@ class MCPReActAgent:
|
|
| 315 |
elif line_upper.startswith("ARGS:"):
|
| 316 |
args_part = line_clean.split(":", 1)[1].strip()
|
| 317 |
try:
|
| 318 |
-
# Handle various JSON formats
|
| 319 |
args_part = args_part.replace("'", '"')
|
| 320 |
tool_args = json.loads(args_part)
|
| 321 |
except json.JSONDecodeError:
|
| 322 |
-
# Try to extract action from text
|
| 323 |
match = re.search(r'"action"\s*:\s*"([^"]+)"', args_part)
|
| 324 |
if match:
|
| 325 |
tool_args = {"action": match.group(1)}
|
| 326 |
else:
|
| 327 |
-
# Fallback: try to use the whole thing as action
|
| 328 |
tool_args = {"action": "look"}
|
| 329 |
|
| 330 |
return thought, tool_name, tool_args
|
|
@@ -333,7 +303,6 @@ class MCPReActAgent:
|
|
| 333 |
"""Validate and fix common tool call issues."""
|
| 334 |
# Fix tool name
|
| 335 |
if tool_name not in valid_tools:
|
| 336 |
-
# Try common alternatives
|
| 337 |
if tool_name in ["action", "do", "command"]:
|
| 338 |
tool_name = "play_action"
|
| 339 |
elif tool_name in ["map", "location"]:
|
|
@@ -345,11 +314,10 @@ class MCPReActAgent:
|
|
| 345 |
else:
|
| 346 |
tool_name = "play_action"
|
| 347 |
|
| 348 |
-
# Fix action
|
| 349 |
if tool_name == "play_action":
|
| 350 |
action = tool_args.get("action", "look")
|
| 351 |
|
| 352 |
-
# Fix invalid verbs
|
| 353 |
invalid_verb_map = {
|
| 354 |
"check": "examine",
|
| 355 |
"inspect": "examine",
|
|
@@ -365,7 +333,6 @@ class MCPReActAgent:
|
|
| 365 |
words[0] = invalid_verb_map[words[0]]
|
| 366 |
action = " ".join(words)
|
| 367 |
|
| 368 |
-
# Clean up action
|
| 369 |
action = action.lower().strip()
|
| 370 |
action = action.replace("**", "").replace("*", "").replace("`", "")
|
| 371 |
action = " ".join(action.split())
|
|
@@ -378,25 +345,22 @@ class MCPReActAgent:
|
|
| 378 |
"""Extract text from MCP tool result."""
|
| 379 |
if hasattr(result, 'content') and result.content:
|
| 380 |
return result.content[0].text
|
|
|
|
|
|
|
| 381 |
return str(result)
|
| 382 |
|
| 383 |
def _update_score(self, text: str) -> None:
|
| 384 |
"""Update score from game text."""
|
| 385 |
-
# Look for score patterns
|
| 386 |
patterns = [
|
| 387 |
-
r'\+(\d+) points',
|
| 388 |
r'Score:\s*(\d+)',
|
| 389 |
-
r'
|
|
|
|
| 390 |
]
|
| 391 |
|
| 392 |
for pattern in patterns:
|
| 393 |
match = re.search(pattern, text, re.IGNORECASE)
|
| 394 |
if match:
|
| 395 |
-
score = int(match.group(1))
|
| 396 |
-
if "+" in pattern:
|
| 397 |
-
self.score += score
|
| 398 |
-
else:
|
| 399 |
-
self.score = max(self.score, score)
|
| 400 |
|
| 401 |
def _is_game_over(self, text: str) -> bool:
|
| 402 |
"""Check if the game is over."""
|
|
@@ -408,70 +372,33 @@ class MCPReActAgent:
|
|
| 408 |
]
|
| 409 |
text_lower = text.lower()
|
| 410 |
return any(phrase in text_lower for phrase in game_over_phrases)
|
| 411 |
-
|
| 412 |
-
def _print_summary(self, step: int, elapsed_time: float, game_over: bool) -> dict:
|
| 413 |
-
"""Print game summary and return results."""
|
| 414 |
-
print("\n" + "=" * 60)
|
| 415 |
-
print("GAME SUMMARY")
|
| 416 |
-
print("=" * 60)
|
| 417 |
-
print(f"Final Score: {self.score}/{self.max_score} ({100*self.score/self.max_score:.1f}%)")
|
| 418 |
-
print(f"Steps Taken: {step}")
|
| 419 |
-
print(f"Time Elapsed: {elapsed_time:.1f} seconds")
|
| 420 |
-
print(f"Game Over: {game_over}")
|
| 421 |
-
print("=" * 60)
|
| 422 |
-
|
| 423 |
-
return {
|
| 424 |
-
"final_score": self.score,
|
| 425 |
-
"max_score": self.max_score,
|
| 426 |
-
"score_percentage": 100 * self.score / self.max_score,
|
| 427 |
-
"steps": step,
|
| 428 |
-
"elapsed_time": elapsed_time,
|
| 429 |
-
"game_over": game_over,
|
| 430 |
-
}
|
| 431 |
|
| 432 |
|
| 433 |
# =============================================================================
|
| 434 |
-
#
|
| 435 |
# =============================================================================
|
| 436 |
|
| 437 |
-
async def
|
| 438 |
-
"""
|
| 439 |
-
import
|
| 440 |
|
| 441 |
-
|
| 442 |
-
parser.add_argument(
|
| 443 |
-
"--server", "-s",
|
| 444 |
-
default="mcp_server/zork_server.py",
|
| 445 |
-
help="Path to the MCP server script"
|
| 446 |
-
)
|
| 447 |
-
parser.add_argument(
|
| 448 |
-
"--max-steps", "-n",
|
| 449 |
-
type=int,
|
| 450 |
-
default=100,
|
| 451 |
-
help="Maximum steps to run"
|
| 452 |
-
)
|
| 453 |
-
parser.add_argument(
|
| 454 |
-
"--model",
|
| 455 |
-
type=str,
|
| 456 |
-
default=None,
|
| 457 |
-
help="HuggingFace model to use"
|
| 458 |
-
)
|
| 459 |
-
parser.add_argument(
|
| 460 |
-
"--verbose", "-v",
|
| 461 |
-
action="store_true",
|
| 462 |
-
default=True,
|
| 463 |
-
help="Show detailed output"
|
| 464 |
-
)
|
| 465 |
-
|
| 466 |
-
args = parser.parse_args()
|
| 467 |
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 474 |
|
| 475 |
|
| 476 |
if __name__ == "__main__":
|
| 477 |
-
asyncio
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Example: MCP ReAct Agent
|
| 3 |
|
| 4 |
+
A complete ReAct agent that uses MCP tools to play text adventure games.
|
| 5 |
+
This is a working example students can learn from.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
|
|
|
| 8 |
import json
|
| 9 |
import os
|
| 10 |
import re
|
|
|
|
| 11 |
from dataclasses import dataclass, field
|
| 12 |
+
from typing import Optional
|
| 13 |
+
|
| 14 |
from dotenv import load_dotenv
|
| 15 |
+
from huggingface_hub import InferenceClient
|
| 16 |
+
|
| 17 |
+
load_dotenv()
|
| 18 |
+
|
| 19 |
+
# =============================================================================
|
| 20 |
+
# LLM Configuration - DO NOT MODIFY
|
| 21 |
+
# =============================================================================
|
| 22 |
+
|
| 23 |
+
# Model identifier sent as `model=` on every chat-completion request.
LLM_MODEL = "Qwen/Qwen2.5-72B-Instruct"

# The Hugging Face token is read from the environment (after load_dotenv()).
_hf_token = os.getenv("HF_TOKEN")
if not _hf_token:
    # Raised at import time: this module cannot be imported without a token.
    raise ValueError("HF_TOKEN not found. Set it in your .env file.")

# Single shared client used by call_llm for all requests.
LLM_CLIENT = InferenceClient(token=_hf_token)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def call_llm(prompt: str, system_prompt: str, seed: int, max_tokens: int = 300) -> str:
    """Call the LLM with the given prompt.

    Sends a two-message chat (system message, then user message) to
    ``LLM_MODEL`` through ``LLM_CLIENT`` with temperature 0.0. Exceptions
    raised by the client are not caught here and propagate to the caller.

    Args:
        prompt: Text sent as the user message.
        system_prompt: Text sent as the system message.
        seed: Forwarded as the request's ``seed`` parameter.
        max_tokens: Forwarded as the request's ``max_tokens`` limit.

    Returns:
        The message content of the first choice in the response.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    response = LLM_CLIENT.chat.completions.create(
        model=LLM_MODEL,
        messages=messages,
        temperature=0.0,
        max_tokens=max_tokens,
        seed=seed,
    )

    # Only the first choice is used.
    return response.choices[0].message.content
|
| 48 |
|
| 49 |
|
| 50 |
@dataclass
class RunResult:
    """Result of running the agent. Do not modify this class."""
    # Score the agent reports at the end of the run.
    final_score: int
    # Maximum score for the game (the example agent reports a fixed 350).
    max_score: int
    # Number of moves made (the example agent counts every tool call).
    moves: int
    # Distinct location labels collected during the run.
    locations_visited: set[str]
    # True when the run ended with the game over.
    game_completed: bool
    # Error description when the run failed; None otherwise.
    error: Optional[str] = None
    # One tuple per step; the example agent stores
    # (thought, "tool(args)", observation excerpt).
    history: list[tuple[str, str, str]] = field(default_factory=list)
|
| 60 |
|
| 61 |
|
| 62 |
+
# =============================================================================
|
| 63 |
+
# System Prompt
|
| 64 |
+
# =============================================================================
|
| 65 |
+
|
| 66 |
SYSTEM_PROMPT = """You are an expert text adventure game player. Your goal is to explore, collect treasures, and maximize your score.
|
| 67 |
|
| 68 |
AVAILABLE TOOLS (use these via MCP):
|
|
|
|
| 70 |
2. memory - Get current game state, score, and recent history
|
| 71 |
3. get_map - See explored locations and connections
|
| 72 |
4. inventory - Check what you're carrying
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
VALID GAME COMMANDS for play_action:
|
| 75 |
- Movement: north, south, east, west, up, down, enter, exit
|
|
|
|
| 109 |
DO NOT repeat the same action multiple times in a row."""
|
| 110 |
|
| 111 |
|
| 112 |
+
# =============================================================================
|
| 113 |
+
# Student Agent Implementation
|
| 114 |
+
# =============================================================================
|
| 115 |
+
|
| 116 |
+
class StudentAgent:
|
| 117 |
"""
|
| 118 |
+
MCP ReAct Agent - A complete working example.
|
| 119 |
|
| 120 |
+
This agent demonstrates:
|
| 121 |
+
- ReAct loop (Thought -> Tool -> Observation)
|
| 122 |
- Loop detection
|
| 123 |
- Action validation
|
| 124 |
+
- Score tracking via memory tool
|
| 125 |
"""
|
| 126 |
|
| 127 |
+
def __init__(self):
|
| 128 |
+
"""Initialize the agent state."""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
self.history: list[dict] = []
|
| 130 |
+
self.recent_actions: list[str] = []
|
| 131 |
self.score: int = 0
|
|
|
|
|
|
|
| 132 |
|
| 133 |
+
async def run(
    self,
    client,
    game: str,
    max_steps: int,
    seed: int,
    verbose: bool = False,
) -> RunResult:
    """Run the agent for a game session.

    Each step builds a prompt from the latest observation, asks the LLM for
    a response, parses it into a thought, tool name and arguments, validates
    the tool call, executes it through ``client.call_tool`` and records the
    outcome. The loop ends after ``max_steps`` steps, when the game-over
    check matches the observation, or when the LLM call fails.

    Args:
        client: Connected MCP client providing ``list_tools`` and
            ``call_tool``.
        game: Name of the game being played (not used by this agent).
        max_steps: Maximum number of ReAct steps.
        seed: Base LLM seed; step ``n`` uses ``seed + n``.
        verbose: Print observations, thoughts and tool calls.

    Returns:
        A RunResult with the score, move count, visited locations and
        per-step history. If the LLM call raises, the loop stops and the
        result keeps the progress made so far, with ``error`` describing
        the failure.
    """
    locations_visited = set()
    history = []
    moves = 0
    error = None

    # Get list of available tools
    tools = await client.list_tools()
    tool_names = [t.name for t in tools]

    # Get initial observation
    result = await client.call_tool("play_action", {"action": "look"})
    observation = self._extract_result(result)

    # Track initial location (heuristic: first line of the game text)
    location = observation.split("\n")[0] if observation else "Unknown"
    locations_visited.add(location)

    if verbose:
        print(f"\n{observation}")

    # Main ReAct loop
    for step in range(1, max_steps + 1):
        # Build prompt with context
        prompt = self._build_prompt(observation)

        # Call LLM for reasoning (use step-based seed for variety).
        # A failure here stops the loop but keeps the progress made so far,
        # instead of letting the exception discard score and history.
        try:
            response = call_llm(prompt, SYSTEM_PROMPT, seed + step)
        except Exception as e:
            error = f"LLM call failed at step {step}: {type(e).__name__}: {e}"
            if verbose:
                print(f"[ERROR] {error}")
            break

        # Parse the response
        thought, tool_name, tool_args = self._parse_response(response, tool_names)

        if verbose:
            print(f"\n--- Step {step} ---")
            print(f"[THOUGHT] {thought}")
            print(f"[TOOL] {tool_name}({tool_args})")

        # Validate and fix common issues
        tool_name, tool_args = self._validate_tool_call(tool_name, tool_args, tool_names)

        # Loop detection
        if tool_name == "play_action":
            action = tool_args.get("action", "look")
            self.recent_actions.append(action)
            if len(self.recent_actions) > 5:
                self.recent_actions = self.recent_actions[-5:]

            # Detect loops - if same action 3 times, force "look"
            if len(self.recent_actions) >= 3 and len(set(self.recent_actions[-3:])) == 1:
                if verbose:
                    print("[WARNING] Loop detected - forcing 'look'")
                tool_args = {"action": "look"}
                self.recent_actions.append("look")

        moves += 1

        # Execute the tool
        tool_failed = False
        try:
            result = await client.call_tool(tool_name, tool_args)
            observation = self._extract_result(result)

            if verbose:
                print(f"[RESULT] {observation[:200]}...")
        except Exception as e:
            tool_failed = True
            observation = f"Error: {e}"
            if verbose:
                print(f"[ERROR] {e}")

        # Track location only from successful game actions. Output from
        # other tools and error text does not start with a location name
        # and would inflate the visited-location count.
        if tool_name == "play_action" and not tool_failed:
            location = observation.split("\n")[0] if observation else "Unknown"
            locations_visited.add(location)

        # Update history (bounded to the last 10 entries for prompting)
        self.history.append({
            "step": step,
            "thought": thought,
            "tool": tool_name,
            "args": tool_args,
            "result": observation[:200]
        })
        if len(self.history) > 10:
            self.history = self.history[-10:]

        # Track score from observation
        self._update_score(observation)

        # Record in result history
        history.append((thought, f"{tool_name}({tool_args})", observation[:100]))

        # Check for game over
        if self._is_game_over(observation):
            if verbose:
                print("\n*** GAME OVER ***")
            break

    return RunResult(
        final_score=self.score,
        # NOTE(review): hard-coded maximum; looks Zork-specific - confirm
        # for other games.
        max_score=350,
        moves=moves,
        locations_visited=locations_visited,
        game_completed=self._is_game_over(observation),
        error=error,
        history=history,
    )
|
| 243 |
|
| 244 |
def _build_prompt(self, observation: str) -> str:
|
| 245 |
"""Build the prompt for the LLM with context."""
|
| 246 |
parts = []
|
| 247 |
|
| 248 |
+
parts.append(f"Current Score: {self.score}")
|
|
|
|
| 249 |
|
| 250 |
+
# Recent history
|
| 251 |
if self.history:
|
| 252 |
parts.append("\nRecent actions:")
|
| 253 |
for entry in self.history[-3:]:
|
|
|
|
| 259 |
if self.recent_actions and len(set(self.recent_actions[-3:])) == 1:
|
| 260 |
parts.append(f"\n[WARNING: You've been doing '{self.recent_actions[-1]}' repeatedly. TRY SOMETHING DIFFERENT!]")
|
| 261 |
|
|
|
|
| 262 |
parts.append(f"\nCurrent situation:\n{observation}")
|
| 263 |
parts.append("\nWhat do you do next?")
|
| 264 |
|
| 265 |
return "\n".join(parts)
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
def _parse_response(self, response: str, valid_tools: list[str]) -> tuple[str, str, dict]:
|
| 268 |
"""Parse the LLM response to extract thought, tool, and arguments."""
|
| 269 |
thought = "No reasoning provided"
|
|
|
|
| 272 |
|
| 273 |
lines = response.strip().split("\n")
|
| 274 |
|
| 275 |
+
for line in lines:
|
| 276 |
line_clean = line.strip()
|
| 277 |
line_upper = line_clean.upper()
|
| 278 |
|
|
|
|
| 281 |
|
| 282 |
elif line_upper.startswith("TOOL:"):
|
| 283 |
raw_tool = line_clean.split(":", 1)[1].strip().lower()
|
|
|
|
| 284 |
raw_tool = raw_tool.replace("**", "").replace("*", "").replace("`", "")
|
| 285 |
raw_tool = raw_tool.split()[0] if raw_tool else "play_action"
|
| 286 |
tool_name = raw_tool
|
|
|
|
| 288 |
elif line_upper.startswith("ARGS:"):
|
| 289 |
args_part = line_clean.split(":", 1)[1].strip()
|
| 290 |
try:
|
|
|
|
| 291 |
args_part = args_part.replace("'", '"')
|
| 292 |
tool_args = json.loads(args_part)
|
| 293 |
except json.JSONDecodeError:
|
|
|
|
| 294 |
match = re.search(r'"action"\s*:\s*"([^"]+)"', args_part)
|
| 295 |
if match:
|
| 296 |
tool_args = {"action": match.group(1)}
|
| 297 |
else:
|
|
|
|
| 298 |
tool_args = {"action": "look"}
|
| 299 |
|
| 300 |
return thought, tool_name, tool_args
|
|
|
|
| 303 |
"""Validate and fix common tool call issues."""
|
| 304 |
# Fix tool name
|
| 305 |
if tool_name not in valid_tools:
|
|
|
|
| 306 |
if tool_name in ["action", "do", "command"]:
|
| 307 |
tool_name = "play_action"
|
| 308 |
elif tool_name in ["map", "location"]:
|
|
|
|
| 314 |
else:
|
| 315 |
tool_name = "play_action"
|
| 316 |
|
| 317 |
+
# Fix action verbs
|
| 318 |
if tool_name == "play_action":
|
| 319 |
action = tool_args.get("action", "look")
|
| 320 |
|
|
|
|
| 321 |
invalid_verb_map = {
|
| 322 |
"check": "examine",
|
| 323 |
"inspect": "examine",
|
|
|
|
| 333 |
words[0] = invalid_verb_map[words[0]]
|
| 334 |
action = " ".join(words)
|
| 335 |
|
|
|
|
| 336 |
action = action.lower().strip()
|
| 337 |
action = action.replace("**", "").replace("*", "").replace("`", "")
|
| 338 |
action = " ".join(action.split())
|
|
|
|
| 345 |
"""Extract text from MCP tool result."""
|
| 346 |
if hasattr(result, 'content') and result.content:
|
| 347 |
return result.content[0].text
|
| 348 |
+
if isinstance(result, list) and result:
|
| 349 |
+
return result[0].text if hasattr(result[0], 'text') else str(result[0])
|
| 350 |
return str(result)
|
| 351 |
|
| 352 |
def _update_score(self, text: str) -> None:
|
| 353 |
"""Update score from game text."""
|
|
|
|
| 354 |
patterns = [
|
|
|
|
| 355 |
r'Score:\s*(\d+)',
|
| 356 |
+
r'score[:\s]+(\d+)',
|
| 357 |
+
r'\[Score:\s*(\d+)',
|
| 358 |
]
|
| 359 |
|
| 360 |
for pattern in patterns:
|
| 361 |
match = re.search(pattern, text, re.IGNORECASE)
|
| 362 |
if match:
|
| 363 |
+
self.score = max(self.score, int(match.group(1)))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
|
| 365 |
def _is_game_over(self, text: str) -> bool:
|
| 366 |
"""Check if the game is over."""
|
|
|
|
| 372 |
]
|
| 373 |
text_lower = text.lower()
|
| 374 |
return any(phrase in text_lower for phrase in game_over_phrases)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 375 |
|
| 376 |
|
| 377 |
# =============================================================================
|
| 378 |
+
# Local Testing
|
| 379 |
# =============================================================================
|
| 380 |
|
| 381 |
+
async def test_agent():
    """Test the agent locally.

    Starts the ``mcp_server.py`` that sits next to this file, plays a short
    20-step session with a fixed seed, and prints a brief summary.
    """
    from pathlib import Path

    from fastmcp import Client

    # Resolve the server script relative to this file rather than the
    # current working directory, so the test works from any directory.
    server_path = Path(__file__).resolve().parent / "mcp_server.py"

    agent = StudentAgent()

    async with Client(str(server_path)) as client:
        result = await agent.run(
            client=client,
            game="zork1",
            max_steps=20,
            seed=42,
            verbose=True,
        )

    print(f"\n{'=' * 50}")
    print(f"Final Score: {result.final_score}")
    print(f"Moves: {result.moves}")
    print(f"Locations: {len(result.locations_visited)}")
|
| 400 |
|
| 401 |
|
| 402 |
if __name__ == "__main__":
|
| 403 |
+
import asyncio
|
| 404 |
+
asyncio.run(test_agent())
|
mcp_server/zork_server.py → example_submission/mcp_server.py
RENAMED
|
@@ -1,27 +1,8 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
Uses FastMCP for simple, Pythonic MCP server implementation.
|
| 8 |
-
|
| 9 |
-
Usage:
|
| 10 |
-
# Run directly (stdio transport) - default game is zork1
|
| 11 |
-
python mcp_server/zork_server.py
|
| 12 |
-
|
| 13 |
-
# Run with a different game
|
| 14 |
-
GAME=zork2 python mcp_server/zork_server.py
|
| 15 |
-
GAME=advent python mcp_server/zork_server.py
|
| 16 |
-
GAME=enchanter python mcp_server/zork_server.py
|
| 17 |
-
|
| 18 |
-
# Use with FastMCP dev tools
|
| 19 |
-
fastmcp dev mcp_server/zork_server.py
|
| 20 |
-
|
| 21 |
-
# Connect from an MCP client
|
| 22 |
-
from fastmcp import Client
|
| 23 |
-
async with Client("mcp_server/zork_server.py") as client:
|
| 24 |
-
result = await client.call_tool("play_action", {"action": "look"})
|
| 25 |
"""
|
| 26 |
|
| 27 |
import sys
|
|
@@ -49,7 +30,7 @@ class GameState:
|
|
| 49 |
self.env = TextAdventureEnv(game)
|
| 50 |
self.state = self.env.reset()
|
| 51 |
self.history: list[tuple[str, str]] = []
|
| 52 |
-
self.explored_locations: dict[str, set[str]] = {}
|
| 53 |
self.current_location: str = self._extract_location(self.state.observation)
|
| 54 |
|
| 55 |
def _extract_location(self, observation: str) -> str:
|
|
@@ -82,7 +63,7 @@ class GameState:
|
|
| 82 |
def get_memory(self) -> str:
|
| 83 |
"""Get a summary of current game state."""
|
| 84 |
recent = self.history[-5:] if self.history else []
|
| 85 |
-
recent_str = "\n".join([f" > {a}
|
| 86 |
|
| 87 |
return f"""Current State:
|
| 88 |
- Location: {self.current_location}
|
|
@@ -120,13 +101,10 @@ Current Observation:
|
|
| 120 |
item_names = []
|
| 121 |
for item in items:
|
| 122 |
item_str = str(item)
|
| 123 |
-
# Handle Jericho's object format: "leaflet Parent4 Sibling0..."
|
| 124 |
-
# Look for "Parent" (case-insensitive) to find where metadata starts
|
| 125 |
item_lower = item_str.lower()
|
| 126 |
if "parent" in item_lower:
|
| 127 |
idx = item_lower.index("parent")
|
| 128 |
name = item_str[:idx].strip()
|
| 129 |
-
# Remove leading "obj123: " if present
|
| 130 |
if ":" in name:
|
| 131 |
name = name.split(":", 1)[1].strip()
|
| 132 |
item_names.append(name)
|
|
@@ -137,19 +115,9 @@ Current Observation:
|
|
| 137 |
item_names.append(item_str)
|
| 138 |
|
| 139 |
return f"Inventory: {', '.join(item_names)}"
|
| 140 |
-
|
| 141 |
-
def get_valid_actions(self) -> str:
|
| 142 |
-
"""Get list of valid actions in current state."""
|
| 143 |
-
try:
|
| 144 |
-
valid = self.env.get_valid_actions() if hasattr(self.env, 'get_valid_actions') else []
|
| 145 |
-
if valid:
|
| 146 |
-
return f"Valid actions: {', '.join(valid[:20])}"
|
| 147 |
-
except Exception:
|
| 148 |
-
pass
|
| 149 |
-
return "Valid actions: Try standard commands like look, north, south, east, west, take <item>, open <thing>"
|
| 150 |
|
| 151 |
|
| 152 |
-
# Global game state
|
| 153 |
_game_state: GameState | None = None
|
| 154 |
|
| 155 |
|
|
@@ -161,23 +129,15 @@ def get_game() -> GameState:
|
|
| 161 |
return _game_state
|
| 162 |
|
| 163 |
|
| 164 |
-
#
|
| 165 |
# MCP Tools
|
| 166 |
-
#
|
| 167 |
|
| 168 |
@mcp.tool()
|
| 169 |
def play_action(action: str) -> str:
|
| 170 |
"""
|
| 171 |
Execute a game action in the text adventure.
|
| 172 |
|
| 173 |
-
Common commands:
|
| 174 |
-
- Movement: north, south, east, west, up, down, enter, exit (or n, s, e, w, u, d)
|
| 175 |
-
- Objects: take <item>, drop <item>, open <thing>, close <thing>, put <item> in <container>
|
| 176 |
-
- Look: look, examine <thing>, read <thing>
|
| 177 |
-
- Combat: attack <enemy> with <weapon>
|
| 178 |
-
- Light: turn on lamp, light match
|
| 179 |
-
- Other: wait, score, inventory
|
| 180 |
-
|
| 181 |
Args:
|
| 182 |
action: The command to execute (e.g., 'north', 'take lamp', 'open mailbox')
|
| 183 |
|
|
@@ -187,8 +147,9 @@ def play_action(action: str) -> str:
|
|
| 187 |
game = get_game()
|
| 188 |
result = game.take_action(action)
|
| 189 |
|
| 190 |
-
# Add score info
|
| 191 |
-
score_info = ""
|
|
|
|
| 192 |
if game.state.reward > 0:
|
| 193 |
score_info = f"\n\n+{game.state.reward} points! (Total: {game.state.score})"
|
| 194 |
|
|
@@ -204,9 +165,7 @@ def memory() -> str:
|
|
| 204 |
"""
|
| 205 |
Get a summary of the current game state.
|
| 206 |
|
| 207 |
-
Returns
|
| 208 |
-
Use this to understand where you are and what happened recently.
|
| 209 |
-
Very useful for avoiding loops and tracking progress.
|
| 210 |
"""
|
| 211 |
return get_game().get_memory()
|
| 212 |
|
|
@@ -214,10 +173,9 @@ def memory() -> str:
|
|
| 214 |
@mcp.tool()
|
| 215 |
def get_map() -> str:
|
| 216 |
"""
|
| 217 |
-
Get a map showing
|
| 218 |
|
| 219 |
-
Useful for navigation and
|
| 220 |
-
The map builds up as you explore more of the game world.
|
| 221 |
"""
|
| 222 |
return get_game().get_map()
|
| 223 |
|
|
@@ -226,195 +184,13 @@ def get_map() -> str:
|
|
| 226 |
def inventory() -> str:
|
| 227 |
"""
|
| 228 |
Check what items you are currently carrying.
|
| 229 |
-
|
| 230 |
-
Essential before trying to use, drop, or interact with items.
|
| 231 |
-
Most games have an inventory limit, so manage your items wisely.
|
| 232 |
"""
|
| 233 |
return get_game().get_inventory()
|
| 234 |
|
| 235 |
|
| 236 |
-
|
| 237 |
-
def valid_actions() -> str:
|
| 238 |
-
"""
|
| 239 |
-
Get a list of valid actions available in the current game state.
|
| 240 |
-
|
| 241 |
-
Helpful when stuck or unsure what commands the game accepts.
|
| 242 |
-
Note: This may not include all possible actions, just common ones.
|
| 243 |
-
"""
|
| 244 |
-
return get_game().get_valid_actions()
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
@mcp.tool()
|
| 248 |
-
def reset_game(game: str = "zork1") -> str:
|
| 249 |
-
"""
|
| 250 |
-
Reset the game to the beginning or switch to a different game.
|
| 251 |
-
|
| 252 |
-
Use this to start over if you get stuck, die, or want to try a different game.
|
| 253 |
-
|
| 254 |
-
Args:
|
| 255 |
-
game: Game name (e.g., 'zork1', 'zork2', 'advent', 'enchanter')
|
| 256 |
-
Use list_games() to see available options.
|
| 257 |
-
|
| 258 |
-
Returns:
|
| 259 |
-
The initial game text
|
| 260 |
-
"""
|
| 261 |
-
global _game_state
|
| 262 |
-
try:
|
| 263 |
-
_game_state = GameState(game)
|
| 264 |
-
return f"Game reset to {game}.\n\n{_game_state.state.observation}"
|
| 265 |
-
except ValueError as e:
|
| 266 |
-
return f"Error: {e}"
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
@mcp.tool()
|
| 270 |
-
def list_games() -> str:
|
| 271 |
-
"""
|
| 272 |
-
List all available text adventure games.
|
| 273 |
-
|
| 274 |
-
Returns:
|
| 275 |
-
List of game names that can be passed to reset_game()
|
| 276 |
-
"""
|
| 277 |
-
games = list_available_games()
|
| 278 |
-
return f"Available games ({len(games)} total):\n" + ", ".join(games)
|
| 279 |
-
|
| 280 |
-
|
| 281 |
-
@mcp.tool()
|
| 282 |
-
def hint() -> str:
|
| 283 |
-
"""
|
| 284 |
-
Get a hint about what to do next based on your current situation.
|
| 285 |
-
|
| 286 |
-
Provides general guidance without spoiling puzzle solutions.
|
| 287 |
-
"""
|
| 288 |
-
game = get_game()
|
| 289 |
-
location = game.current_location.lower()
|
| 290 |
-
inv = game.get_inventory().lower()
|
| 291 |
-
observation = game.state.observation.lower()
|
| 292 |
-
|
| 293 |
-
hints = []
|
| 294 |
-
|
| 295 |
-
# Darkness detection (common in many games)
|
| 296 |
-
if "dark" in location or "dark" in observation or "pitch black" in observation:
|
| 297 |
-
hints.append("It's dangerous in the dark! You need a light source.")
|
| 298 |
-
hints.append("If you have a lamp, try 'turn on lamp'.")
|
| 299 |
-
|
| 300 |
-
# Common items to look for
|
| 301 |
-
if "lamp" in observation and "lamp" not in inv:
|
| 302 |
-
hints.append("There's a lamp here - light sources are essential!")
|
| 303 |
-
if "lantern" in observation and "lantern" not in inv:
|
| 304 |
-
hints.append("There's a lantern here - you'll need light for dark areas!")
|
| 305 |
-
if "sword" in observation and "sword" not in inv:
|
| 306 |
-
hints.append("A sword might be useful for combat encounters.")
|
| 307 |
-
if "key" in observation and "key" not in inv:
|
| 308 |
-
hints.append("A key might unlock something important.")
|
| 309 |
-
|
| 310 |
-
# Container hints
|
| 311 |
-
if any(word in observation for word in ["mailbox", "chest", "box", "container", "cabinet"]):
|
| 312 |
-
hints.append("Try opening containers to find hidden items.")
|
| 313 |
-
|
| 314 |
-
# Door/window hints
|
| 315 |
-
if "door" in observation or "window" in observation:
|
| 316 |
-
hints.append("There might be a way in or out here. Try 'open' commands.")
|
| 317 |
-
|
| 318 |
-
# General hints if nothing specific found
|
| 319 |
-
if not hints:
|
| 320 |
-
hints.append("Explore all directions: north, south, east, west, up, down.")
|
| 321 |
-
hints.append("Examine interesting objects with 'examine <thing>'.")
|
| 322 |
-
hints.append("Pick up useful items with 'take <item>'.")
|
| 323 |
-
hints.append("Open containers and read documents for clues.")
|
| 324 |
-
|
| 325 |
-
return "Hints:\\n" + "\\n".join(f" - {h}" for h in hints)
|
| 326 |
-
|
| 327 |
-
|
| 328 |
-
# ============================================================================
|
| 329 |
-
# MCP Resources
|
| 330 |
-
# ============================================================================
|
| 331 |
-
|
| 332 |
-
@mcp.resource("game://state")
|
| 333 |
-
def get_state_resource() -> str:
|
| 334 |
-
"""Current game state as a resource."""
|
| 335 |
-
return get_game().get_memory()
|
| 336 |
-
|
| 337 |
-
|
| 338 |
-
@mcp.resource("game://history")
|
| 339 |
-
def get_history_resource() -> str:
|
| 340 |
-
"""Complete action history as a resource."""
|
| 341 |
-
game = get_game()
|
| 342 |
-
if not game.history:
|
| 343 |
-
return "No actions taken yet."
|
| 344 |
-
lines = [f"{i+1}. {action} -> {result[:80]}..." for i, (action, result) in enumerate(game.history)]
|
| 345 |
-
return "\n".join(lines)
|
| 346 |
-
|
| 347 |
-
|
| 348 |
-
@mcp.resource("game://map")
|
| 349 |
-
def get_map_resource() -> str:
|
| 350 |
-
"""Explored map as a resource."""
|
| 351 |
-
return get_game().get_map()
|
| 352 |
-
|
| 353 |
-
|
| 354 |
-
# ============================================================================
|
| 355 |
-
# Game Prompt (for agents)
|
| 356 |
-
# ============================================================================
|
| 357 |
-
|
| 358 |
-
GAME_PROMPT = """You are playing a classic text adventure game.
|
| 359 |
-
|
| 360 |
-
## YOUR GOAL
|
| 361 |
-
Explore the world, solve puzzles, collect treasures, and maximize your score.
|
| 362 |
-
|
| 363 |
-
## VALID COMMANDS (use ONLY these exact verbs)
|
| 364 |
-
|
| 365 |
-
Movement:
|
| 366 |
-
north, south, east, west, up, down (or n, s, e, w, u, d)
|
| 367 |
-
enter, exit, climb, cross, go <direction>
|
| 368 |
-
|
| 369 |
-
Looking:
|
| 370 |
-
look, examine <thing>, look at <thing>, look in <thing>, read <thing>
|
| 371 |
-
|
| 372 |
-
Objects:
|
| 373 |
-
take <item>, drop <item>, pick up <item>
|
| 374 |
-
open <thing>, close <thing>, unlock <thing> with <key>
|
| 375 |
-
put <item> in <container>, give <item> to <person>
|
| 376 |
-
|
| 377 |
-
Light:
|
| 378 |
-
turn on lamp, turn off lamp, light match
|
| 379 |
-
|
| 380 |
-
Combat:
|
| 381 |
-
attack <enemy> with <weapon>, kill <enemy> with <weapon>
|
| 382 |
-
|
| 383 |
-
Other:
|
| 384 |
-
inventory (or i), wait (or z), score
|
| 385 |
-
push <thing>, pull <thing>, move <thing>
|
| 386 |
-
tie <rope> to <thing>, eat <food>, wave <item>
|
| 387 |
-
|
| 388 |
-
## FORBIDDEN VERBS (these will NOT work):
|
| 389 |
-
check, inspect, search, investigate, grab, pick, use, interact,
|
| 390 |
-
go to, walk to, head to, travel, proceed
|
| 391 |
-
|
| 392 |
-
## STRATEGY TIPS
|
| 393 |
-
1. Explore systematically - check all directions
|
| 394 |
-
2. Read everything - open containers, read documents, examine objects
|
| 395 |
-
3. Use get_map() to track explored locations
|
| 396 |
-
4. Light is essential - find a light source before dark areas!
|
| 397 |
-
5. Manage inventory - you can only carry limited items
|
| 398 |
-
|
| 399 |
-
## GETTING STARTED
|
| 400 |
-
1. Call memory() to see your current state
|
| 401 |
-
2. Explore your starting area thoroughly
|
| 402 |
-
3. Pick up useful items (light sources, weapons, keys)
|
| 403 |
-
|
| 404 |
-
Good luck!
|
| 405 |
-
"""
|
| 406 |
-
|
| 407 |
-
|
| 408 |
-
def get_game_prompt(game: str = "zork1") -> str:
|
| 409 |
-
"""Get the system prompt for playing text adventures."""
|
| 410 |
-
prompt = GAME_PROMPT
|
| 411 |
-
prompt += f"\n\nNote: Currently playing {game}. Use list_games() to see all 57 available games."
|
| 412 |
-
return prompt
|
| 413 |
-
|
| 414 |
-
|
| 415 |
-
# ============================================================================
|
| 416 |
# Main
|
| 417 |
-
#
|
| 418 |
|
| 419 |
if __name__ == "__main__":
|
| 420 |
mcp.run()
|
|
|
|
| 1 |
"""
|
| 2 |
+
Example: MCP Server for Text Adventures
|
| 3 |
|
| 4 |
+
A complete MCP server that exposes text adventure games via tools.
|
| 5 |
+
This demonstrates a full-featured server with memory, mapping, and inventory.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
"""
|
| 7 |
|
| 8 |
import sys
|
|
|
|
| 30 |
self.env = TextAdventureEnv(game)
|
| 31 |
self.state = self.env.reset()
|
| 32 |
self.history: list[tuple[str, str]] = []
|
| 33 |
+
self.explored_locations: dict[str, set[str]] = {}
|
| 34 |
self.current_location: str = self._extract_location(self.state.observation)
|
| 35 |
|
| 36 |
def _extract_location(self, observation: str) -> str:
|
|
|
|
| 63 |
def get_memory(self) -> str:
|
| 64 |
"""Get a summary of current game state."""
|
| 65 |
recent = self.history[-5:] if self.history else []
|
| 66 |
+
recent_str = "\n".join([f" > {a} -> {r[:60]}..." for a, r in recent]) if recent else " (none yet)"
|
| 67 |
|
| 68 |
return f"""Current State:
|
| 69 |
- Location: {self.current_location}
|
|
|
|
| 101 |
item_names = []
|
| 102 |
for item in items:
|
| 103 |
item_str = str(item)
|
|
|
|
|
|
|
| 104 |
item_lower = item_str.lower()
|
| 105 |
if "parent" in item_lower:
|
| 106 |
idx = item_lower.index("parent")
|
| 107 |
name = item_str[:idx].strip()
|
|
|
|
| 108 |
if ":" in name:
|
| 109 |
name = name.split(":", 1)[1].strip()
|
| 110 |
item_names.append(name)
|
|
|
|
| 115 |
item_names.append(item_str)
|
| 116 |
|
| 117 |
return f"Inventory: {', '.join(item_names)}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
+
# Global game state
|
| 121 |
_game_state: GameState | None = None
|
| 122 |
|
| 123 |
|
|
|
|
| 129 |
return _game_state
|
| 130 |
|
| 131 |
|
| 132 |
+
# =============================================================================
|
| 133 |
# MCP Tools
|
| 134 |
+
# =============================================================================
|
| 135 |
|
| 136 |
@mcp.tool()
|
| 137 |
def play_action(action: str) -> str:
|
| 138 |
"""
|
| 139 |
Execute a game action in the text adventure.
|
| 140 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 141 |
Args:
|
| 142 |
action: The command to execute (e.g., 'north', 'take lamp', 'open mailbox')
|
| 143 |
|
|
|
|
| 147 |
game = get_game()
|
| 148 |
result = game.take_action(action)
|
| 149 |
|
| 150 |
+
# Add score info
|
| 151 |
+
score_info = f"\n\n[Score: {game.state.score} | Moves: {game.state.moves}]"
|
| 152 |
+
|
| 153 |
if game.state.reward > 0:
|
| 154 |
score_info = f"\n\n+{game.state.reward} points! (Total: {game.state.score})"
|
| 155 |
|
|
|
|
| 165 |
"""
|
| 166 |
Get a summary of the current game state.
|
| 167 |
|
| 168 |
+
Returns location, score, moves, recent actions, and current observation.
|
|
|
|
|
|
|
| 169 |
"""
|
| 170 |
return get_game().get_memory()
|
| 171 |
|
|
|
|
| 173 |
@mcp.tool()
|
| 174 |
def get_map() -> str:
|
| 175 |
"""
|
| 176 |
+
Get a map showing explored locations and connections.
|
| 177 |
|
| 178 |
+
Useful for navigation and avoiding getting lost.
|
|
|
|
| 179 |
"""
|
| 180 |
return get_game().get_map()
|
| 181 |
|
|
|
|
| 184 |
def inventory() -> str:
|
| 185 |
"""
|
| 186 |
Check what items you are currently carrying.
|
|
|
|
|
|
|
|
|
|
| 187 |
"""
|
| 188 |
return get_game().get_inventory()
|
| 189 |
|
| 190 |
|
| 191 |
+
# =============================================================================
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
# Main
|
| 193 |
+
# =============================================================================
|
| 194 |
|
| 195 |
if __name__ == "__main__":
|
| 196 |
mcp.run()
|
function_calling/controller.py
DELETED
|
@@ -1,291 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Function-Calling Controller for Zork (API-Based)
|
| 3 |
-
|
| 4 |
-
This controller uses the HuggingFace API's native function calling feature.
|
| 5 |
-
The model is given tool schemas and can call them via the tools API.
|
| 6 |
-
|
| 7 |
-
Model: Llama 3.2 3B Instruct (supports native function calling)
|
| 8 |
-
|
| 9 |
-
Compare with simple_controller.py which uses text-based "parsing" approach.
|
| 10 |
-
"""
|
| 11 |
-
|
| 12 |
-
import os
|
| 13 |
-
import json
|
| 14 |
-
from dotenv import load_dotenv
|
| 15 |
-
from huggingface_hub import InferenceClient
|
| 16 |
-
|
| 17 |
-
from tools import ALL_TOOLS, set_game_state, add_to_history
|
| 18 |
-
|
| 19 |
-
# Add parent directory to path to import games module
|
| 20 |
-
import sys
|
| 21 |
-
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 22 |
-
from games.zork_env import ZorkEnvironment
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
# System prompt for the agent
|
| 26 |
-
SYSTEM_PROMPT = """You are playing Zork, a classic text adventure game.
|
| 27 |
-
|
| 28 |
-
## YOUR GOAL
|
| 29 |
-
Explore, collect treasures (bring them to the trophy case), and maximize your score.
|
| 30 |
-
|
| 31 |
-
## VALID COMMANDS (use ONLY these exact verbs)
|
| 32 |
-
|
| 33 |
-
Movement:
|
| 34 |
-
north, south, east, west, up, down (or n, s, e, w, u, d)
|
| 35 |
-
enter, exit, climb, cross, go <direction>
|
| 36 |
-
|
| 37 |
-
Looking:
|
| 38 |
-
look, examine <thing>, look at <thing>, look in <thing>, read <thing>
|
| 39 |
-
|
| 40 |
-
Objects:
|
| 41 |
-
take <item>, drop <item>, pick up <item>
|
| 42 |
-
open <thing>, close <thing>, unlock <thing> with <key>
|
| 43 |
-
put <item> in <container>, give <item> to <person>
|
| 44 |
-
|
| 45 |
-
Light:
|
| 46 |
-
turn on lamp, turn off lamp, light match
|
| 47 |
-
|
| 48 |
-
Combat:
|
| 49 |
-
attack <enemy> with <weapon>, kill <enemy> with <weapon>
|
| 50 |
-
|
| 51 |
-
Other:
|
| 52 |
-
inventory (or i), wait (or z), score, save, restore
|
| 53 |
-
push <thing>, pull <thing>, move <thing>, tie <rope> to <thing>
|
| 54 |
-
eat <food>, drink <liquid>, wave <item>
|
| 55 |
-
|
| 56 |
-
## FORBIDDEN (these will NOT work):
|
| 57 |
-
check, inspect, search, investigate, grab, pick, use, interact,
|
| 58 |
-
go to, walk to, head to, travel, proceed
|
| 59 |
-
|
| 60 |
-
## YOUR TOOLS
|
| 61 |
-
memory() - See current state and recent actions
|
| 62 |
-
get_map() - See explored locations
|
| 63 |
-
inventory() - Check what you're carrying
|
| 64 |
-
|
| 65 |
-
## RESPONSE FORMAT
|
| 66 |
-
When you want to take a game action, respond with:
|
| 67 |
-
ACTION: <command>
|
| 68 |
-
|
| 69 |
-
Examples:
|
| 70 |
-
ACTION: open mailbox
|
| 71 |
-
ACTION: north
|
| 72 |
-
ACTION: take lamp
|
| 73 |
-
ACTION: examine leaflet"""
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
# Valid Zork command verbs for validation
|
| 77 |
-
VALID_VERBS = {
|
| 78 |
-
"north", "south", "east", "west", "up", "down", "n", "s", "e", "w", "u", "d",
|
| 79 |
-
"look", "l", "examine", "x", "read",
|
| 80 |
-
"take", "get", "drop", "put", "give",
|
| 81 |
-
"open", "close", "unlock", "lock",
|
| 82 |
-
"turn", "light", "extinguish", "blow",
|
| 83 |
-
"attack", "kill", "fight", "hit",
|
| 84 |
-
"enter", "exit", "go", "climb", "jump",
|
| 85 |
-
"inventory", "i", "wait", "z", "score",
|
| 86 |
-
"move", "push", "pull", "tie", "untie",
|
| 87 |
-
"eat", "drink", "smell", "touch", "rub",
|
| 88 |
-
"wave", "raise", "lower", "pour",
|
| 89 |
-
"say", "answer", "yes", "no",
|
| 90 |
-
"pray", "odysseus", "echo", "hello",
|
| 91 |
-
}
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
def validate_action(action: str) -> str:
|
| 95 |
-
"""Validate and potentially fix an action."""
|
| 96 |
-
action = action.strip().lower()
|
| 97 |
-
if not action:
|
| 98 |
-
return "look"
|
| 99 |
-
|
| 100 |
-
verb = action.split()[0]
|
| 101 |
-
|
| 102 |
-
if verb in VALID_VERBS:
|
| 103 |
-
return action
|
| 104 |
-
|
| 105 |
-
# Common corrections
|
| 106 |
-
corrections = {
|
| 107 |
-
"check": "examine",
|
| 108 |
-
"inspect": "examine",
|
| 109 |
-
"search": "examine",
|
| 110 |
-
"grab": "take",
|
| 111 |
-
"pick": "take",
|
| 112 |
-
"see": "look",
|
| 113 |
-
"view": "look",
|
| 114 |
-
"walk": "go",
|
| 115 |
-
}
|
| 116 |
-
|
| 117 |
-
if verb in corrections:
|
| 118 |
-
return corrections[verb] + action[len(verb):]
|
| 119 |
-
|
| 120 |
-
return "look" # Default fallback
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
def build_tool_schemas():
|
| 124 |
-
"""Convert LangChain tools to OpenAI function schemas."""
|
| 125 |
-
schemas = []
|
| 126 |
-
for tool in ALL_TOOLS:
|
| 127 |
-
schema = {
|
| 128 |
-
"type": "function",
|
| 129 |
-
"function": {
|
| 130 |
-
"name": tool.name,
|
| 131 |
-
"description": tool.description,
|
| 132 |
-
"parameters": {
|
| 133 |
-
"type": "object",
|
| 134 |
-
"properties": {},
|
| 135 |
-
"required": []
|
| 136 |
-
}
|
| 137 |
-
}
|
| 138 |
-
}
|
| 139 |
-
schemas.append(schema)
|
| 140 |
-
return schemas
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
def run_tool(tool_name: str) -> str:
|
| 144 |
-
"""Execute a tool by name and return its result."""
|
| 145 |
-
for tool in ALL_TOOLS:
|
| 146 |
-
if tool.name == tool_name:
|
| 147 |
-
return tool.invoke({})
|
| 148 |
-
return f"Unknown tool: {tool_name}"
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
class FunctionCallingController:
|
| 152 |
-
"""Controller using LLM API-based function calling."""
|
| 153 |
-
|
| 154 |
-
def __init__(self, model: str = "meta-llama/Llama-3.2-3B-Instruct"):
|
| 155 |
-
load_dotenv()
|
| 156 |
-
token = os.getenv("HF_TOKEN")
|
| 157 |
-
if not token:
|
| 158 |
-
raise ValueError("HF_TOKEN not set in environment")
|
| 159 |
-
|
| 160 |
-
self.client = InferenceClient(token=token)
|
| 161 |
-
self.model = os.getenv("HF_MODEL", model)
|
| 162 |
-
self.tool_schemas = build_tool_schemas()
|
| 163 |
-
|
| 164 |
-
def get_action(self, observation: str, game_state) -> str:
|
| 165 |
-
"""Get the next action from the LLM."""
|
| 166 |
-
|
| 167 |
-
# Update tool state
|
| 168 |
-
set_game_state(
|
| 169 |
-
observation=observation,
|
| 170 |
-
inventory=list(game_state.inventory) if game_state.inventory else [],
|
| 171 |
-
score=game_state.score,
|
| 172 |
-
moves=game_state.moves
|
| 173 |
-
)
|
| 174 |
-
|
| 175 |
-
# Build messages fresh each time (simpler than managing tool history)
|
| 176 |
-
messages = [
|
| 177 |
-
{"role": "system", "content": SYSTEM_PROMPT},
|
| 178 |
-
{"role": "user", "content": f"Game output:\n{observation}\n\nWhat do you do?"}
|
| 179 |
-
]
|
| 180 |
-
|
| 181 |
-
# Allow up to 3 tool calls before requiring action
|
| 182 |
-
for _ in range(3):
|
| 183 |
-
response = self.client.chat.completions.create(
|
| 184 |
-
model=self.model,
|
| 185 |
-
messages=messages,
|
| 186 |
-
tools=self.tool_schemas,
|
| 187 |
-
tool_choice="auto",
|
| 188 |
-
max_tokens=300,
|
| 189 |
-
)
|
| 190 |
-
|
| 191 |
-
message = response.choices[0].message
|
| 192 |
-
|
| 193 |
-
# Check if model wants to use a tool
|
| 194 |
-
if message.tool_calls:
|
| 195 |
-
tool_call = message.tool_calls[0]
|
| 196 |
-
tool_name = tool_call.function.name
|
| 197 |
-
|
| 198 |
-
print(f" [Tool] {tool_name}")
|
| 199 |
-
tool_result = run_tool(tool_name)
|
| 200 |
-
print(f" {tool_result[:100]}...")
|
| 201 |
-
|
| 202 |
-
# Add tool interaction to messages for next iteration
|
| 203 |
-
messages.append({
|
| 204 |
-
"role": "assistant",
|
| 205 |
-
"content": None,
|
| 206 |
-
"tool_calls": [{
|
| 207 |
-
"id": tool_call.id,
|
| 208 |
-
"type": "function",
|
| 209 |
-
"function": {"name": tool_name, "arguments": "{}"}
|
| 210 |
-
}]
|
| 211 |
-
})
|
| 212 |
-
messages.append({
|
| 213 |
-
"role": "tool",
|
| 214 |
-
"tool_call_id": tool_call.id,
|
| 215 |
-
"content": tool_result
|
| 216 |
-
})
|
| 217 |
-
|
| 218 |
-
# Continue to get the actual action
|
| 219 |
-
continue
|
| 220 |
-
|
| 221 |
-
# Model responded with text - extract action
|
| 222 |
-
content = message.content or ""
|
| 223 |
-
|
| 224 |
-
# Look for ACTION: in response
|
| 225 |
-
if "ACTION:" in content.upper():
|
| 226 |
-
for line in content.split('\n'):
|
| 227 |
-
if "ACTION:" in line.upper():
|
| 228 |
-
action = line.split(":", 1)[1].strip().lower()
|
| 229 |
-
validated = validate_action(action)
|
| 230 |
-
if validated:
|
| 231 |
-
return validated
|
| 232 |
-
else:
|
| 233 |
-
print(f" [Warning] Invalid action '{action}', defaulting to 'look'")
|
| 234 |
-
return "look"
|
| 235 |
-
|
| 236 |
-
# If no ACTION found, try to extract a command from the response
|
| 237 |
-
content_lower = content.lower().strip()
|
| 238 |
-
validated = validate_action(content_lower)
|
| 239 |
-
if validated:
|
| 240 |
-
return validated
|
| 241 |
-
|
| 242 |
-
# Default
|
| 243 |
-
return "look"
|
| 244 |
-
|
| 245 |
-
# After 3 tool calls, just return look
|
| 246 |
-
return "look"
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
def main():
|
| 250 |
-
"""Run the API-based function-calling controller."""
|
| 251 |
-
print("=" * 60)
|
| 252 |
-
print("Zork - API Function Calling Controller")
|
| 253 |
-
print(" (using Llama 3.2 3B with native tool calling)")
|
| 254 |
-
print("=" * 60)
|
| 255 |
-
|
| 256 |
-
controller = FunctionCallingController()
|
| 257 |
-
env = ZorkEnvironment("zork1")
|
| 258 |
-
|
| 259 |
-
state = env.reset()
|
| 260 |
-
print(f"\n{state.observation}\n")
|
| 261 |
-
|
| 262 |
-
max_steps = 30
|
| 263 |
-
|
| 264 |
-
for step in range(max_steps):
|
| 265 |
-
print(f"\n{'─' * 50}")
|
| 266 |
-
print(f"Step {step + 1}/{max_steps} | Score: {state.score}")
|
| 267 |
-
print("─" * 50)
|
| 268 |
-
|
| 269 |
-
action = controller.get_action(state.observation, state)
|
| 270 |
-
print(f"\n> ACTION: {action}")
|
| 271 |
-
|
| 272 |
-
# Take action in game
|
| 273 |
-
state = env.step(action)
|
| 274 |
-
add_to_history(action, state.observation)
|
| 275 |
-
|
| 276 |
-
print(f"\n{state.observation}")
|
| 277 |
-
|
| 278 |
-
if state.reward > 0:
|
| 279 |
-
print(f"\n+{state.reward} points!")
|
| 280 |
-
|
| 281 |
-
if state.done:
|
| 282 |
-
print("\nGAME OVER!")
|
| 283 |
-
break
|
| 284 |
-
|
| 285 |
-
print(f"\n{'=' * 60}")
|
| 286 |
-
print(f"Final Score: {state.score}")
|
| 287 |
-
print("=" * 60)
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
if __name__ == "__main__":
|
| 291 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
function_calling/simple_controller.py
DELETED
|
@@ -1,268 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Function-Calling Controller for Zork (Text-Based)
|
| 3 |
-
|
| 4 |
-
This controller uses text-based "function calling" - the LLM outputs
|
| 5 |
-
TOOL: <name> or ACTION: <command> and we parse the text response.
|
| 6 |
-
|
| 7 |
-
Model: Qwen 2.5 7B Instruct (any chat model works)
|
| 8 |
-
|
| 9 |
-
This approach is:
|
| 10 |
-
- Simpler and more reliable than API-based function calling
|
| 11 |
-
- Works with any chat model (no special support needed)
|
| 12 |
-
|
| 13 |
-
Compare with controller.py which uses API-based tool calling.
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
import os
|
| 17 |
-
import re
|
| 18 |
-
from dotenv import load_dotenv
|
| 19 |
-
from huggingface_hub import InferenceClient
|
| 20 |
-
|
| 21 |
-
from tools import ALL_TOOLS, set_game_state, add_to_history
|
| 22 |
-
|
| 23 |
-
# Add parent directory to path
|
| 24 |
-
import sys
|
| 25 |
-
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 26 |
-
from games.zork_env import ZorkEnvironment
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
SYSTEM_PROMPT = """You are playing Zork, a classic text adventure game.
|
| 30 |
-
|
| 31 |
-
## YOUR GOAL
|
| 32 |
-
Explore, collect treasures (bring them to the trophy case), and maximize your score.
|
| 33 |
-
|
| 34 |
-
## VALID COMMANDS (use ONLY these exact verbs)
|
| 35 |
-
|
| 36 |
-
Movement:
|
| 37 |
-
north, south, east, west, up, down (or n, s, e, w, u, d)
|
| 38 |
-
enter, exit, climb, cross, go <direction>
|
| 39 |
-
|
| 40 |
-
Looking:
|
| 41 |
-
look, examine <thing>, look at <thing>, look in <thing>, read <thing>
|
| 42 |
-
|
| 43 |
-
Objects:
|
| 44 |
-
take <item>, drop <item>, pick up <item>
|
| 45 |
-
open <thing>, close <thing>, unlock <thing> with <key>
|
| 46 |
-
put <item> in <container>, give <item> to <person>
|
| 47 |
-
|
| 48 |
-
Light:
|
| 49 |
-
turn on lamp, turn off lamp, light match
|
| 50 |
-
|
| 51 |
-
Combat:
|
| 52 |
-
attack <enemy> with <weapon>, kill <enemy> with <weapon>
|
| 53 |
-
|
| 54 |
-
Other:
|
| 55 |
-
inventory (or i), wait (or z), score, save, restore
|
| 56 |
-
push <thing>, pull <thing>, move <thing>, tie <rope> to <thing>
|
| 57 |
-
eat <food>, drink <liquid>, wave <item>
|
| 58 |
-
|
| 59 |
-
## FORBIDDEN (these will NOT work):
|
| 60 |
-
check, inspect, search, investigate, grab, pick, use, interact,
|
| 61 |
-
go to, walk to, head to, travel, proceed
|
| 62 |
-
|
| 63 |
-
## YOUR TOOLS
|
| 64 |
-
TOOL: memory - See current state and recent actions
|
| 65 |
-
TOOL: get_map - See explored locations
|
| 66 |
-
TOOL: inventory - Check what you're carrying
|
| 67 |
-
|
| 68 |
-
## RESPONSE FORMAT
|
| 69 |
-
Either use a tool:
|
| 70 |
-
TOOL: memory
|
| 71 |
-
|
| 72 |
-
Or take a game action:
|
| 73 |
-
ACTION: open mailbox
|
| 74 |
-
|
| 75 |
-
Always respond with TOOL: or ACTION: followed by your choice."""
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
# Valid Zork command verbs for validation
|
| 79 |
-
VALID_VERBS = {
|
| 80 |
-
"north", "south", "east", "west", "up", "down", "n", "s", "e", "w", "u", "d",
|
| 81 |
-
"look", "l", "examine", "x", "read",
|
| 82 |
-
"take", "get", "drop", "put", "give",
|
| 83 |
-
"open", "close", "unlock", "lock",
|
| 84 |
-
"turn", "light", "extinguish", "blow",
|
| 85 |
-
"attack", "kill", "fight", "hit",
|
| 86 |
-
"enter", "exit", "go", "climb", "jump",
|
| 87 |
-
"inventory", "i", "wait", "z", "score",
|
| 88 |
-
"move", "push", "pull", "tie", "untie",
|
| 89 |
-
"eat", "drink", "smell", "touch", "rub",
|
| 90 |
-
"wave", "raise", "lower", "pour",
|
| 91 |
-
"say", "answer", "yes", "no",
|
| 92 |
-
"pray", "odysseus", "echo", "hello",
|
| 93 |
-
}
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
def run_tool(tool_name: str) -> str:
|
| 97 |
-
"""Execute a tool by name."""
|
| 98 |
-
tool_name = tool_name.strip().lower().replace(" ", "_")
|
| 99 |
-
for tool in ALL_TOOLS:
|
| 100 |
-
if tool.name == tool_name:
|
| 101 |
-
return tool.invoke({})
|
| 102 |
-
return f"Unknown tool: {tool_name}. Available: memory, get_map, inventory"
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
class SimpleController:
|
| 106 |
-
"""Controller using text-based tool calling."""
|
| 107 |
-
|
| 108 |
-
def __init__(self, model: str = "Qwen/Qwen2.5-7B-Instruct"):
|
| 109 |
-
load_dotenv()
|
| 110 |
-
token = os.getenv("HF_TOKEN")
|
| 111 |
-
if not token:
|
| 112 |
-
raise ValueError("HF_TOKEN not set in environment")
|
| 113 |
-
|
| 114 |
-
self.client = InferenceClient(token=token)
|
| 115 |
-
self.model = os.getenv("HF_MODEL", model)
|
| 116 |
-
self.messages = []
|
| 117 |
-
|
| 118 |
-
def _call_llm(self, user_message: str) -> str:
|
| 119 |
-
"""Call the LLM and get response."""
|
| 120 |
-
self.messages.append({"role": "user", "content": user_message})
|
| 121 |
-
|
| 122 |
-
# Keep conversation short
|
| 123 |
-
if len(self.messages) > 15:
|
| 124 |
-
self.messages = self.messages[-15:]
|
| 125 |
-
|
| 126 |
-
response = self.client.chat.completions.create(
|
| 127 |
-
model=self.model,
|
| 128 |
-
messages=[{"role": "system", "content": SYSTEM_PROMPT}] + self.messages,
|
| 129 |
-
max_tokens=150,
|
| 130 |
-
temperature=0.7,
|
| 131 |
-
)
|
| 132 |
-
|
| 133 |
-
reply = response.choices[0].message.content or ""
|
| 134 |
-
self.messages.append({"role": "assistant", "content": reply})
|
| 135 |
-
return reply
|
| 136 |
-
|
| 137 |
-
def _validate_action(self, action: str) -> str | None:
|
| 138 |
-
"""Validate and potentially fix an action. Returns None if invalid."""
|
| 139 |
-
action = action.strip().lower()
|
| 140 |
-
if not action:
|
| 141 |
-
return None
|
| 142 |
-
|
| 143 |
-
# Get the first word (verb)
|
| 144 |
-
verb = action.split()[0]
|
| 145 |
-
|
| 146 |
-
# Check if it's a valid verb
|
| 147 |
-
if verb in VALID_VERBS:
|
| 148 |
-
return action
|
| 149 |
-
|
| 150 |
-
# Try common corrections
|
| 151 |
-
corrections = {
|
| 152 |
-
"check": "examine",
|
| 153 |
-
"inspect": "examine",
|
| 154 |
-
"search": "examine",
|
| 155 |
-
"grab": "take",
|
| 156 |
-
"pick": "take", # "pick up" -> "take"
|
| 157 |
-
"see": "look",
|
| 158 |
-
"view": "look",
|
| 159 |
-
"walk": "go",
|
| 160 |
-
}
|
| 161 |
-
|
| 162 |
-
if verb in corrections:
|
| 163 |
-
fixed = corrections[verb] + action[len(verb):]
|
| 164 |
-
print(f" [Correcting] '{verb}' -> '{corrections[verb]}'")
|
| 165 |
-
return fixed
|
| 166 |
-
|
| 167 |
-
return None
|
| 168 |
-
|
| 169 |
-
def get_action(self, observation: str, game_state) -> str:
|
| 170 |
-
"""Get the next action, allowing tool use."""
|
| 171 |
-
|
| 172 |
-
# Update tool state
|
| 173 |
-
set_game_state(
|
| 174 |
-
observation=observation,
|
| 175 |
-
inventory=list(game_state.inventory) if game_state.inventory else [],
|
| 176 |
-
score=game_state.score,
|
| 177 |
-
moves=game_state.moves
|
| 178 |
-
)
|
| 179 |
-
|
| 180 |
-
prompt = f"Game:\n{observation}\n\nRespond with TOOL: or ACTION:"
|
| 181 |
-
|
| 182 |
-
# Allow up to 3 tool calls before requiring an action
|
| 183 |
-
for _ in range(3):
|
| 184 |
-
response = self._call_llm(prompt)
|
| 185 |
-
|
| 186 |
-
# Check for TOOL:
|
| 187 |
-
tool_match = re.search(r'TOOL:\s*(\w+)', response, re.IGNORECASE)
|
| 188 |
-
if tool_match:
|
| 189 |
-
tool_name = tool_match.group(1)
|
| 190 |
-
print(f" [Tool] {tool_name}")
|
| 191 |
-
|
| 192 |
-
result = run_tool(tool_name)
|
| 193 |
-
print(f" {result[:80]}...")
|
| 194 |
-
|
| 195 |
-
# Feed result back
|
| 196 |
-
prompt = f"Tool result:\n{result}\n\nNow respond with TOOL: or ACTION:"
|
| 197 |
-
continue
|
| 198 |
-
|
| 199 |
-
# Check for ACTION:
|
| 200 |
-
action_match = re.search(r'ACTION:\s*(.+)', response, re.IGNORECASE)
|
| 201 |
-
if action_match:
|
| 202 |
-
action = action_match.group(1).strip().lower()
|
| 203 |
-
# Clean up action (remove quotes, extra text)
|
| 204 |
-
action = action.split('\n')[0].strip('"\'')
|
| 205 |
-
|
| 206 |
-
# Validate the action
|
| 207 |
-
validated = self._validate_action(action)
|
| 208 |
-
if validated:
|
| 209 |
-
return validated
|
| 210 |
-
else:
|
| 211 |
-
print(f" [Warning] Invalid action '{action}', asking for retry...")
|
| 212 |
-
prompt = f"'{action}' is not a valid Zork command. Use verbs like: look, examine, take, open, north, south, etc.\n\nRespond with ACTION:"
|
| 213 |
-
continue
|
| 214 |
-
|
| 215 |
-
# If neither, try to extract a command
|
| 216 |
-
words = response.lower().split()
|
| 217 |
-
for cmd in ["north", "south", "east", "west", "up", "down",
|
| 218 |
-
"look", "take", "open", "enter", "examine"]:
|
| 219 |
-
if cmd in words:
|
| 220 |
-
idx = words.index(cmd)
|
| 221 |
-
return " ".join(words[idx:idx+3])
|
| 222 |
-
|
| 223 |
-
return "look"
|
| 224 |
-
|
| 225 |
-
return "look"
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
def main():
    """Play up to 30 steps of "zork1" with the text-based controller.

    Prints the observation, chosen action and score for every step, stops
    early when the environment reports the game is done, and finishes with
    the final score.
    """
    rule = "=" * 60
    print(rule)
    print("Zork - Simple Function Calling Demo")
    print(rule)

    controller = SimpleController()
    env = ZorkEnvironment("zork1")

    state = env.reset()
    print(f"\n{state.observation}\n")

    max_steps = 30
    divider = "─" * 50

    for turn in range(1, max_steps + 1):
        print(f"\n{divider}")
        print(f"Step {turn}/{max_steps} | Score: {state.score}")
        print(divider)

        action = controller.get_action(state.observation, state)
        print(f"\n> ACTION: {action}")

        state = env.step(action)
        # Record the action together with the observation it produced.
        add_to_history(action, state.observation)

        print(f"\n{state.observation}")

        if state.reward > 0:
            print(f"\n+{state.reward} points!")

        if state.done:
            print("\nGAME OVER!")
            break

    print(f"\n{rule}")
    print(f"Final Score: {state.score}")
    print(rule)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
function_calling/tools.py
DELETED
|
@@ -1,127 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Simple tools for the Zork agent using LangChain's tool decorator.
|
| 3 |
-
"""
|
| 4 |
-
|
| 5 |
-
from langchain_core.tools import tool
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
# Game state that tools can access (set by the controller)
|
| 9 |
-
_game_state = {
|
| 10 |
-
"observation": "",
|
| 11 |
-
"inventory": [],
|
| 12 |
-
"score": 0,
|
| 13 |
-
"moves": 0,
|
| 14 |
-
"history": [], # List of (action, result) tuples
|
| 15 |
-
}
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
def set_game_state(observation: str, inventory: list, score: int, moves: int):
    """Store the latest observation, inventory, score and move count.

    Only these four entries of the shared state are overwritten; the
    recorded history is left as it is.
    """
    _game_state.update(
        observation=observation,
        inventory=inventory,
        score=score,
        moves=moves,
    )
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
def add_to_history(action: str, result: str):
    """Append an ``(action, result)`` pair, keeping only the newest 10."""
    history = _game_state["history"]
    history.append((action, result))
    if len(history) > 10:
        # Rebind to a trimmed copy holding the ten most recent entries.
        _game_state["history"] = history[-10:]
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
@tool
def memory() -> str:
    """Get a summary of the current game state including location, score, and recent actions."""
    # NOTE: the docstring above is left untouched on purpose; presumably the
    # @tool decorator exposes it as the tool description - confirm before
    # rewording it.
    obs = _game_state["observation"]
    score = _game_state["score"]
    moves = _game_state["moves"]

    # Extract location (first line of observation). str.split always returns
    # at least one element, so an empty observation has to be detected by
    # testing the first line itself rather than the list.
    first_line = obs.strip().split('\n')[0]
    location = first_line or "Unknown"

    # Recent actions: the last five history entries, each result cut to its
    # first 50 characters.
    recent = _game_state["history"][-5:]
    if recent:
        recent_str = "\n".join(f" > {a} → {r[:50]}..." for a, r in recent)
    else:
        recent_str = " (none yet)"

    return f"""Current State:
- Location: {location}
- Score: {score} points
- Moves: {moves}

Recent Actions:
{recent_str}

Current Observation:
{obs}"""
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
@tool
def get_map() -> str:
    """Get a map showing known locations and connections based on exploration history."""
    # NOTE: the docstring above is left untouched on purpose; presumably the
    # @tool decorator exposes it as the tool description - confirm before
    # rewording it.
    movement_actions = ("north", "south", "east", "west", "up", "down", "enter", "exit")

    # Build a simple map from history
    locations = set()
    connections = []

    prev_loc = None
    for action, result in _game_state["history"]:
        # The first line of each result is taken as the location name.
        # NOTE(review): this also applies to non-movement replies, whose
        # first line may not be a room name.
        loc = result.strip().split('\n')[0]
        if not loc:
            # An empty result carries no location: skip it instead of
            # listing a blank entry and forgetting the previous location.
            continue
        locations.add(loc)

        # If this was a movement action, record connection
        if action in movement_actions and prev_loc and prev_loc != loc:
            connections.append(f" {prev_loc} --{action}--> {loc}")
        prev_loc = loc

    if not locations:
        return "Map: No locations explored yet. Try moving around!"

    loc_list = "\n".join(f" - {loc}" for loc in sorted(locations))
    # Only the ten most recent connections are shown.
    conn_list = "\n".join(connections[-10:]) if connections else " (no connections recorded)"

    return f"""Known Locations:
{loc_list}

Connections:
{conn_list}"""
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
@tool
def inventory() -> str:
    """Get the list of items currently in your inventory."""
    # NOTE: the docstring above is left untouched on purpose; presumably the
    # @tool decorator exposes it as the tool description - confirm before
    # rewording it.
    import re

    items = _game_state["inventory"]

    if not items:
        return "Inventory: You are empty-handed."

    # Clean up item names (Jericho returns objects with metadata)
    item_names = []
    for item in items:
        name = str(item)
        # Handle Jericho's object format: "leaflet Parent4 Sibling0..."
        # Cut at a standalone "Parent<number>" token (case-insensitive).
        # A plain substring test would also cut names that merely contain
        # the letters, such as "transparent bottle".
        metadata = re.search(r"\bparent\s*\d", name, re.IGNORECASE)
        if metadata:
            name = name[:metadata.start()].strip()
        # Remove leading "obj123: " if present; split once so a colon inside
        # the item name itself is preserved.
        if ":" in name:
            name = name.split(":", 1)[1].strip()
        item_names.append(name)

    return f"Inventory: {', '.join(item_names)}"
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
# Registry of every tool defined in this module.
ALL_TOOLS = [
    memory,
    get_map,
    inventory,
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mcp_server/README.md
DELETED
|
@@ -1,83 +0,0 @@
|
|
| 1 |
-
# Zork MCP Server
|
| 2 |
-
|
| 3 |
-
This directory contains an MCP (Model Context Protocol) server that exposes Zork game tools to LLM agents.
|
| 4 |
-
|
| 5 |
-
## Overview
|
| 6 |
-
|
| 7 |
-
The MCP server wraps the Jericho Zork environment and provides tools that any MCP-compatible agent (like Mini SWE Agent) can use to play the game.
|
| 8 |
-
|
| 9 |
-
## Tools Available
|
| 10 |
-
|
| 11 |
-
| Tool | Description |
|
| 12 |
-
|------|-------------|
|
| 13 |
-
| `play_action(action)` | Execute a game command (e.g., "north", "take lamp") |
|
| 14 |
-
| `memory()` | Get current state summary (location, score, recent actions) |
|
| 15 |
-
| `get_map()` | View explored locations and connections |
|
| 16 |
-
| `inventory()` | Check items you're carrying |
|
| 17 |
-
| `valid_actions()` | Get hints on available commands |
|
| 18 |
-
| `reset_game(game)` | Start over with zork1, zork2, or zork3 |
|
| 19 |
-
| `hint()` | Get contextual hints for your situation |
|
| 20 |
-
|
| 21 |
-
## Resources
|
| 22 |
-
|
| 23 |
-
The server also exposes MCP resources:
|
| 24 |
-
- `zork://state` - Current game state
|
| 25 |
-
- `zork://history` - Complete action history
|
| 26 |
-
- `zork://map` - Explored locations map
|
| 27 |
-
|
| 28 |
-
## Running the Server
|
| 29 |
-
|
| 30 |
-
### Standalone (for testing)
|
| 31 |
-
```bash
|
| 32 |
-
python mcp_server/zork_server.py
|
| 33 |
-
```
|
| 34 |
-
|
| 35 |
-
### With MCP Inspector (for debugging)
|
| 36 |
-
```bash
|
| 37 |
-
npx @modelcontextprotocol/inspector python mcp_server/zork_server.py
|
| 38 |
-
```
|
| 39 |
-
|
| 40 |
-
### With Mini SWE Agent
|
| 41 |
-
```bash
|
| 42 |
-
python play_zork.py
|
| 43 |
-
```
|
| 44 |
-
|
| 45 |
-
## Configuration
|
| 46 |
-
|
| 47 |
-
The `mcp_config.json` file configures the server for use with MCP clients:
|
| 48 |
-
|
| 49 |
-
```json
|
| 50 |
-
{
|
| 51 |
-
"mcpServers": {
|
| 52 |
-
"zork": {
|
| 53 |
-
"command": "python",
|
| 54 |
-
"args": ["mcp_server/zork_server.py"]
|
| 55 |
-
}
|
| 56 |
-
}
|
| 57 |
-
}
|
| 58 |
-
```
|
| 59 |
-
|
| 60 |
-
## Architecture
|
| 61 |
-
|
| 62 |
-
```
|
| 63 |
-
┌─────────────────────────────────────────┐
|
| 64 |
-
│ MCP Client (Agent) │
|
| 65 |
-
│ (Mini SWE Agent / Claude / etc.) │
|
| 66 |
-
└──────────────────┬──────────────────────┘
|
| 67 |
-
│ MCP Protocol (stdio)
|
| 68 |
-
▼
|
| 69 |
-
┌─────────────────────────────────────────┐
|
| 70 |
-
│ Zork MCP Server │
|
| 71 |
-
│ (FastMCP - zork_server.py) │
|
| 72 |
-
│ │
|
| 73 |
-
│ Tools: play_action, memory, get_map, │
|
| 74 |
-
│ inventory, valid_actions, │
|
| 75 |
-
│ reset_game, hint │
|
| 76 |
-
└──────────────────┬──────────────────────┘
|
| 77 |
-
│
|
| 78 |
-
▼
|
| 79 |
-
┌─────────────────────────────────────────┐
|
| 80 |
-
│ Jericho + Frotz │
|
| 81 |
-
│ (Z-machine game interpreter) │
|
| 82 |
-
└─────────────────────────────────────────┘
|
| 83 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
mcp_server/__init__.py
DELETED
|
@@ -1 +0,0 @@
|
|
| 1 |
-
# Text Adventure MCP Server
|
|
|
|
|
|
mcp_server/mcp_config.json
DELETED
|
@@ -1,9 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"mcpServers": {
|
| 3 |
-
"zork": {
|
| 4 |
-
"command": "python",
|
| 5 |
-
"args": ["mcp_server/zork_server.py"],
|
| 6 |
-
"cwd": "${workspaceFolder}"
|
| 7 |
-
}
|
| 8 |
-
}
|
| 9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -1,9 +1,12 @@
|
|
| 1 |
# Core dependencies
|
| 2 |
jericho
|
| 3 |
python-dotenv
|
|
|
|
| 4 |
|
| 5 |
# MCP Server
|
| 6 |
fastmcp
|
| 7 |
|
| 8 |
# Function calling (optional, for the alternative approach)
|
| 9 |
-
langchain-core
|
|
|
|
|
|
|
|
|
| 1 |
# Core dependencies
|
| 2 |
jericho
|
| 3 |
python-dotenv
|
| 4 |
+
spacy
|
| 5 |
|
| 6 |
# MCP Server
|
| 7 |
fastmcp
|
| 8 |
|
| 9 |
# Function calling (optional, for the alternative approach)
|
| 10 |
+
langchain-core
|
| 11 |
+
|
| 12 |
+
huggingface_hub
|
run_agent.py
CHANGED
|
@@ -1,258 +1,138 @@
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
-
|
| 4 |
|
| 5 |
-
Run
|
| 6 |
-
- react: Basic ReAct agent with HuggingFace models
|
| 7 |
-
- function: Function-calling controller (API-based or text-based)
|
| 8 |
-
- mcp: MCP ReAct agent using FastMCP Client
|
| 9 |
|
| 10 |
Usage:
|
| 11 |
-
python run_agent.py
|
| 12 |
-
python run_agent.py --
|
| 13 |
-
python run_agent.py --
|
|
|
|
| 14 |
|
| 15 |
Examples:
|
| 16 |
-
# Run
|
| 17 |
-
python run_agent.py
|
| 18 |
|
| 19 |
-
#
|
| 20 |
-
python run_agent.py --
|
| 21 |
|
| 22 |
-
#
|
| 23 |
-
python run_agent.py --
|
| 24 |
|
| 25 |
-
#
|
| 26 |
-
python run_agent.py --
|
| 27 |
-
|
| 28 |
-
#
|
| 29 |
-
python run_agent.py
|
| 30 |
"""
|
| 31 |
|
| 32 |
import argparse
|
| 33 |
import sys
|
| 34 |
import os
|
| 35 |
-
import
|
| 36 |
from pathlib import Path
|
| 37 |
|
| 38 |
# Add games module to path for discovering available games
|
| 39 |
sys.path.insert(0, str(Path(__file__).parent))
|
| 40 |
-
from games.zork_env import list_available_games
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
print(f"Step {step}")
|
| 81 |
-
print("─" * 40)
|
| 82 |
-
|
| 83 |
-
action = agent.choose_action(state.observation, state)
|
| 84 |
-
print(f"\n> {action}")
|
| 85 |
-
|
| 86 |
-
state = env.step(action)
|
| 87 |
-
print(f"\n{state.observation}")
|
| 88 |
-
|
| 89 |
-
if state.reward > 0:
|
| 90 |
-
print(f"\n+{state.reward} points! (Total: {state.score}/{state.max_score})")
|
| 91 |
-
elif state.reward < 0:
|
| 92 |
-
print(f"\n{state.reward} points! (Total: {state.score}/{state.max_score})")
|
| 93 |
-
else:
|
| 94 |
-
print(f"\nScore: {state.score}/{state.max_score}")
|
| 95 |
-
|
| 96 |
-
agent.update_history(action, state.observation, state)
|
| 97 |
-
|
| 98 |
-
if state.done:
|
| 99 |
-
print("\n" + "=" * 60)
|
| 100 |
-
print("GAME OVER!")
|
| 101 |
-
break
|
| 102 |
-
|
| 103 |
-
except KeyboardInterrupt:
|
| 104 |
-
print("\n\nGame interrupted by user")
|
| 105 |
-
|
| 106 |
-
elapsed_time = time.time() - start_time
|
| 107 |
-
return print_summary(env.game, state, step, elapsed_time)
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
# =============================================================================
|
| 111 |
-
# Mode: MCP ReAct Agent
|
| 112 |
-
# =============================================================================
|
| 113 |
-
|
| 114 |
-
def run_mcp_agent(args):
|
| 115 |
-
"""Run MCP ReAct Agent using FastMCP Client."""
|
| 116 |
-
import asyncio
|
| 117 |
-
from agents.mcp_react_agent import MCPReActAgent, MCPAgentConfig
|
| 118 |
-
|
| 119 |
-
print("\n[MCP] Running MCP ReAct Agent with FastMCP")
|
| 120 |
print(f" Game: {args.game}")
|
| 121 |
-
print(f" Model: {args.model}")
|
| 122 |
-
print(f" Server: mcp_server/zork_server.py")
|
| 123 |
print()
|
| 124 |
-
|
| 125 |
-
config = MCPAgentConfig(verbose=args.verbose, model=args.model, game=args.game)
|
| 126 |
-
agent = MCPReActAgent("mcp_server/zork_server.py", config)
|
| 127 |
-
|
| 128 |
-
return asyncio.run(agent.run(max_steps=args.max_steps))
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
# =============================================================================
|
| 132 |
-
# Mode: Function Calling
|
| 133 |
-
# =============================================================================
|
| 134 |
|
| 135 |
-
|
| 136 |
-
"""Run the function-calling controller."""
|
| 137 |
-
# Import the appropriate controller
|
| 138 |
-
sys.path.insert(0, str(Path(__file__).parent / "function_calling"))
|
| 139 |
-
from tools import add_to_history
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
controller = SimpleController(model=args.model)
|
| 145 |
-
else:
|
| 146 |
-
from controller import FunctionCallingController
|
| 147 |
-
print("\n[Function] Running Function Calling Controller (API-based)")
|
| 148 |
-
controller = FunctionCallingController(model=args.model)
|
| 149 |
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
state = env.reset()
|
| 156 |
-
|
| 157 |
-
print("=" * 60)
|
| 158 |
-
print(f"{args.game.upper()} - Function Calling Mode")
|
| 159 |
-
print("=" * 60)
|
| 160 |
-
print(f"\n{state.observation}\n")
|
| 161 |
-
|
| 162 |
-
start_time = time.time()
|
| 163 |
-
step = 0
|
| 164 |
-
|
| 165 |
-
try:
|
| 166 |
-
for step in range(1, args.max_steps + 1):
|
| 167 |
-
print(f"\n{'─' * 50}")
|
| 168 |
-
print(f"Step {step}/{args.max_steps} | Score: {state.score}")
|
| 169 |
-
print("─" * 50)
|
| 170 |
-
|
| 171 |
-
action = controller.get_action(state.observation, state)
|
| 172 |
-
print(f"\n> ACTION: {action}")
|
| 173 |
-
|
| 174 |
-
state = env.step(action)
|
| 175 |
-
add_to_history(action, state.observation)
|
| 176 |
-
|
| 177 |
-
print(f"\n{state.observation}")
|
| 178 |
-
|
| 179 |
-
if state.reward > 0:
|
| 180 |
-
print(f"\n+{state.reward} points!")
|
| 181 |
-
|
| 182 |
-
if state.done:
|
| 183 |
-
print("\nGAME OVER!")
|
| 184 |
-
break
|
| 185 |
-
|
| 186 |
-
except KeyboardInterrupt:
|
| 187 |
-
print("\n\nGame interrupted by user")
|
| 188 |
-
|
| 189 |
-
elapsed_time = time.time() - start_time
|
| 190 |
-
return print_summary(args.game, state, step, elapsed_time)
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
# =============================================================================
|
| 194 |
-
# Common Utilities
|
| 195 |
-
# =============================================================================
|
| 196 |
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
print(f"Steps Taken: {step}")
|
| 206 |
-
print(f"Time Elapsed: {elapsed_time:.1f} seconds")
|
| 207 |
-
print("=" * 60)
|
| 208 |
-
|
| 209 |
-
return {
|
| 210 |
-
"game": game,
|
| 211 |
-
"final_score": state.score,
|
| 212 |
-
"max_score": state.max_score,
|
| 213 |
-
"score_percentage": 100 * state.score / state.max_score,
|
| 214 |
-
"moves": state.moves,
|
| 215 |
-
"steps": step,
|
| 216 |
-
"elapsed_time": elapsed_time,
|
| 217 |
-
"game_over": state.done,
|
| 218 |
-
}
|
| 219 |
|
| 220 |
|
| 221 |
def main():
|
|
|
|
|
|
|
|
|
|
| 222 |
parser = argparse.ArgumentParser(
|
| 223 |
-
description="Run
|
| 224 |
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 225 |
-
epilog="""
|
| 226 |
-
Modes:
|
| 227 |
-
react Basic ReAct agent (direct game interaction)
|
| 228 |
-
function Function-calling controller (use --simple for text-based)
|
| 229 |
-
mcp MCP ReAct agent using FastMCP Client (recommended)
|
| 230 |
-
|
| 231 |
Examples:
|
| 232 |
-
python run_agent.py
|
| 233 |
-
python run_agent.py --
|
| 234 |
-
python run_agent.py --
|
| 235 |
-
python run_agent.py --
|
| 236 |
-
python run_agent.py --
|
| 237 |
-
python run_agent.py
|
| 238 |
"""
|
| 239 |
)
|
| 240 |
-
|
| 241 |
# Get available games for help text
|
| 242 |
available_games = list_available_games()
|
| 243 |
game_help = f"Game to play (default: zork1). {len(available_games)} games available."
|
| 244 |
-
|
|
|
|
| 245 |
parser.add_argument(
|
| 246 |
-
"--
|
| 247 |
type=str,
|
| 248 |
-
default="
|
| 249 |
-
|
| 250 |
-
help="Which agent mode to use (default: react)"
|
| 251 |
)
|
| 252 |
parser.add_argument(
|
| 253 |
"--game", "-g",
|
| 254 |
type=str,
|
| 255 |
-
default="
|
| 256 |
help=game_help
|
| 257 |
)
|
| 258 |
parser.add_argument(
|
|
@@ -260,31 +140,34 @@ Examples:
|
|
| 260 |
action="store_true",
|
| 261 |
help="List all available games and exit"
|
| 262 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
parser.add_argument(
|
| 264 |
"--max-steps", "-n",
|
| 265 |
type=int,
|
| 266 |
default=100,
|
| 267 |
help="Maximum number of steps to run (default: 100)"
|
| 268 |
)
|
| 269 |
-
parser.add_argument(
|
| 270 |
-
"--model",
|
| 271 |
-
type=str,
|
| 272 |
-
default=None,
|
| 273 |
-
help="Model to use (default: meta-llama/Llama-3.2-3B-Instruct)"
|
| 274 |
-
)
|
| 275 |
parser.add_argument(
|
| 276 |
"--verbose", "-v",
|
| 277 |
action="store_true",
|
| 278 |
help="Show detailed reasoning from the agent"
|
| 279 |
)
|
| 280 |
-
|
| 281 |
-
"--simple",
|
| 282 |
-
action="store_true",
|
| 283 |
-
help="Use text-based function calling (works with any model, only for --mode function)"
|
| 284 |
-
)
|
| 285 |
-
|
| 286 |
args = parser.parse_args()
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
# Handle --list-games
|
| 289 |
if args.list_games:
|
| 290 |
print(f"\nAvailable games ({len(available_games)} total):\n")
|
|
@@ -295,41 +178,32 @@ Examples:
|
|
| 295 |
print(" " + " ".join(f"{g:<15}" for g in row))
|
| 296 |
print()
|
| 297 |
sys.exit(0)
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
# Validate game choice
|
| 300 |
if args.game.lower() not in available_games:
|
| 301 |
print(f"\nError: Unknown game '{args.game}'")
|
| 302 |
print(f"Use --list-games to see {len(available_games)} available options.")
|
| 303 |
sys.exit(1)
|
| 304 |
-
|
| 305 |
-
# Get default model from environment
|
| 306 |
-
default_model = os.getenv("HF_MODEL", "meta-llama/Llama-3.2-3B-Instruct")
|
| 307 |
-
|
| 308 |
-
# Set model if not specified
|
| 309 |
-
if args.model is None:
|
| 310 |
-
args.model = default_model
|
| 311 |
-
|
| 312 |
print("\n" + "=" * 60)
|
| 313 |
-
print("Text Adventure
|
| 314 |
print("=" * 60)
|
| 315 |
-
print(f"
|
| 316 |
print(f"Game: {args.game}")
|
| 317 |
print(f"Max Steps: {args.max_steps}")
|
| 318 |
-
print(f"Model: {args.model}")
|
| 319 |
print(f"Verbose: {args.verbose}")
|
| 320 |
-
|
| 321 |
-
# Run the
|
| 322 |
try:
|
| 323 |
-
|
| 324 |
-
|
| 325 |
-
elif args.mode == "function":
|
| 326 |
-
results = run_function_calling(args)
|
| 327 |
-
elif args.mode == "mcp":
|
| 328 |
-
results = run_mcp_agent(args)
|
| 329 |
-
else:
|
| 330 |
-
print(f"Unknown mode: {args.mode}")
|
| 331 |
-
sys.exit(1)
|
| 332 |
-
|
| 333 |
except FileNotFoundError as e:
|
| 334 |
print(f"\n[Error] {e}")
|
| 335 |
sys.exit(1)
|
|
@@ -344,7 +218,7 @@ Examples:
|
|
| 344 |
print("\nMake sure to install dependencies:")
|
| 345 |
print(" pip install -r requirements.txt")
|
| 346 |
sys.exit(1)
|
| 347 |
-
|
| 348 |
return results
|
| 349 |
|
| 350 |
|
|
|
|
| 1 |
#!/usr/bin/env python3
|
| 2 |
"""
|
| 3 |
+
Text Adventure Agent Runner
|
| 4 |
|
| 5 |
+
Run the MCP ReAct agent to play text adventure games like Zork.
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
Usage:
|
| 8 |
+
python run_agent.py
|
| 9 |
+
python run_agent.py --game advent
|
| 10 |
+
python run_agent.py --max-steps 50
|
| 11 |
+
python run_agent.py --agent hidden_submission
|
| 12 |
|
| 13 |
Examples:
|
| 14 |
+
# Run on Zork 1 with example agent (default)
|
| 15 |
+
python run_agent.py
|
| 16 |
|
| 17 |
+
# Play a different game
|
| 18 |
+
python run_agent.py --game advent
|
| 19 |
|
| 20 |
+
# Use a different agent folder
|
| 21 |
+
python run_agent.py --agent hidden_submission
|
| 22 |
|
| 23 |
+
# List all available games
|
| 24 |
+
python run_agent.py --list-games
|
| 25 |
+
|
| 26 |
+
# Run with verbose output
|
| 27 |
+
python run_agent.py -v
|
| 28 |
"""
|
| 29 |
|
| 30 |
import argparse
|
| 31 |
import sys
|
| 32 |
import os
|
| 33 |
+
import asyncio
|
| 34 |
from pathlib import Path
|
| 35 |
|
| 36 |
# Add games module to path for discovering available games
|
| 37 |
sys.path.insert(0, str(Path(__file__).parent))
|
| 38 |
+
from games.zork_env import list_available_games
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def find_agent_folders(project_root: "Path | None" = None) -> list[str]:
    """Find all folders containing agent.py and mcp_server.py.

    Args:
        project_root: Directory whose immediate sub-folders are inspected.
            Defaults to the directory that contains this file.

    Returns:
        Names of the qualifying sub-folders, sorted alphabetically.
    """
    root = Path(__file__).parent if project_root is None else Path(project_root)
    return sorted(
        entry.name
        for entry in root.iterdir()
        if entry.is_dir()
        and (entry / "agent.py").exists()
        and (entry / "mcp_server.py").exists()
    )
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
async def run_mcp_agent(args):
    """Load ``StudentAgent`` from the ``args.agent`` folder and run it.

    The folder is resolved relative to this file and must contain
    ``agent.py`` and ``mcp_server.py``. The server script is launched with
    the current Python interpreter over stdio, with ``GAME`` set in its
    environment, and the agent is run against a client connected to it.

    Args:
        args: Parsed command-line arguments; ``agent``, ``game``,
            ``max_steps`` and ``verbose`` are read.

    Returns:
        Whatever ``StudentAgent.run`` returns.

    Raises:
        FileNotFoundError: If the folder or one of the two files is missing.
    """
    folder = Path(__file__).parent / args.agent
    agent_py = folder / "agent.py"
    server_py = folder / "mcp_server.py"

    # Check the folder first, then each required file, and stop at the
    # first one that is missing.
    required = (
        (folder, f"Agent folder not found: {folder}"),
        (agent_py, f"agent.py not found in {folder}"),
        (server_py, f"mcp_server.py not found in {folder}"),
    )
    for path, message in required:
        if not path.exists():
            raise FileNotFoundError(message)

    # Put the chosen folder first on sys.path so "agent" resolves to it.
    sys.path.insert(0, str(folder))
    from agent import StudentAgent
    from fastmcp import Client
    from fastmcp.client.transports import StdioTransport

    print("\n[MCP] Running Student Agent with FastMCP")
    print(f" Agent: {args.agent}/")
    print(f" Game: {args.game}")
    print()

    agent = StudentAgent()

    # The server process receives a copy of the current environment plus
    # the selected game.
    server_env = dict(os.environ)
    server_env["GAME"] = args.game

    transport = StdioTransport(
        command=sys.executable,
        args=[str(server_py)],
        env=server_env,
    )

    async with Client(transport) as client:
        outcome = await agent.run(
            client=client,
            game=args.game,
            max_steps=args.max_steps,
            seed=42,  # Using a fixed seed for direct running
            verbose=args.verbose,
        )
        return outcome
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
|
| 103 |
def main():
|
| 104 |
+
# Find available agent folders
|
| 105 |
+
agent_folders = find_agent_folders()
|
| 106 |
+
|
| 107 |
parser = argparse.ArgumentParser(
|
| 108 |
+
description="Run the MCP ReAct agent to play text adventure games",
|
| 109 |
formatter_class=argparse.RawDescriptionHelpFormatter,
|
| 110 |
+
epilog=f"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 111 |
Examples:
|
| 112 |
+
python run_agent.py # Play the default game (lostpig) with the example agent
|
| 113 |
+
python run_agent.py --game advent # Play Adventure
|
| 114 |
+
python run_agent.py --agent hidden_submission # Use hidden agent
|
| 115 |
+
python run_agent.py --list-games # List all games
|
| 116 |
+
python run_agent.py --list-agents # List all agent folders
|
| 117 |
+
python run_agent.py -v # Verbose output
|
| 118 |
"""
|
| 119 |
)
|
| 120 |
+
|
| 121 |
# Get available games for help text
|
| 122 |
available_games = list_available_games()
|
| 123 |
game_help = f"Game to play (default: lostpig). {len(available_games)} games available."
|
| 124 |
+
agent_help = f"Agent folder to use (default: example_submission). Available: {', '.join(agent_folders)}"
|
| 125 |
+
|
| 126 |
parser.add_argument(
|
| 127 |
+
"--agent", "-a",
|
| 128 |
type=str,
|
| 129 |
+
default="example_submission",
|
| 130 |
+
help=agent_help
|
|
|
|
| 131 |
)
|
| 132 |
parser.add_argument(
|
| 133 |
"--game", "-g",
|
| 134 |
type=str,
|
| 135 |
+
default="lostpig",
|
| 136 |
help=game_help
|
| 137 |
)
|
| 138 |
parser.add_argument(
|
|
|
|
| 140 |
action="store_true",
|
| 141 |
help="List all available games and exit"
|
| 142 |
)
|
| 143 |
+
parser.add_argument(
|
| 144 |
+
"--list-agents",
|
| 145 |
+
action="store_true",
|
| 146 |
+
help="List all available agent folders and exit"
|
| 147 |
+
)
|
| 148 |
parser.add_argument(
|
| 149 |
"--max-steps", "-n",
|
| 150 |
type=int,
|
| 151 |
default=100,
|
| 152 |
help="Maximum number of steps to run (default: 100)"
|
| 153 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
parser.add_argument(
|
| 155 |
"--verbose", "-v",
|
| 156 |
action="store_true",
|
| 157 |
help="Show detailed reasoning from the agent"
|
| 158 |
)
|
| 159 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
args = parser.parse_args()
|
| 161 |
+
|
| 162 |
+
# Handle --list-agents
|
| 163 |
+
if args.list_agents:
|
| 164 |
+
print(f"\nAvailable agent folders ({len(agent_folders)} total):\n")
|
| 165 |
+
for folder in agent_folders:
|
| 166 |
+
print(f" {folder}/")
|
| 167 |
+
print("\nEach folder must contain agent.py and mcp_server.py")
|
| 168 |
+
print()
|
| 169 |
+
sys.exit(0)
|
| 170 |
+
|
| 171 |
# Handle --list-games
|
| 172 |
if args.list_games:
|
| 173 |
print(f"\nAvailable games ({len(available_games)} total):\n")
|
|
|
|
| 178 |
print(" " + " ".join(f"{g:<15}" for g in row))
|
| 179 |
print()
|
| 180 |
sys.exit(0)
|
| 181 |
+
|
| 182 |
+
# Validate agent choice
|
| 183 |
+
if args.agent not in agent_folders:
|
| 184 |
+
print(f"\nError: Unknown agent folder '{args.agent}'")
|
| 185 |
+
print(f"Available: {', '.join(agent_folders)}")
|
| 186 |
+
print("Use --list-agents to see details.")
|
| 187 |
+
sys.exit(1)
|
| 188 |
+
|
| 189 |
# Validate game choice
|
| 190 |
if args.game.lower() not in available_games:
|
| 191 |
print(f"\nError: Unknown game '{args.game}'")
|
| 192 |
print(f"Use --list-games to see {len(available_games)} available options.")
|
| 193 |
sys.exit(1)
|
| 194 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
print("\n" + "=" * 60)
|
| 196 |
+
print("Text Adventure MCP Agent Runner")
|
| 197 |
print("=" * 60)
|
| 198 |
+
print(f"Agent: {args.agent}/")
|
| 199 |
print(f"Game: {args.game}")
|
| 200 |
print(f"Max Steps: {args.max_steps}")
|
|
|
|
| 201 |
print(f"Verbose: {args.verbose}")
|
| 202 |
+
|
| 203 |
+
# Run the agent
|
| 204 |
try:
|
| 205 |
+
results = asyncio.run(run_mcp_agent(args))
|
| 206 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 207 |
except FileNotFoundError as e:
|
| 208 |
print(f"\n[Error] {e}")
|
| 209 |
sys.exit(1)
|
|
|
|
| 218 |
print("\nMake sure to install dependencies:")
|
| 219 |
print(" pip install -r requirements.txt")
|
| 220 |
sys.exit(1)
|
| 221 |
+
|
| 222 |
return results
|
| 223 |
|
| 224 |
|
submission_template/README.md
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Student Submission: Text Adventure Agent
|
| 2 |
+
|
| 3 |
+
> Replace this with your name and student ID
|
| 4 |
+
|
| 5 |
+
## Overview
|
| 6 |
+
|
| 7 |
+
This is my submission for the Text Adventure Agent assignment. My agent uses the ReAct pattern to play text adventure games via MCP.
|
| 8 |
+
|
| 9 |
+
## Approach
|
| 10 |
+
|
| 11 |
+
<!-- Describe your approach here -->
|
| 12 |
+
|
| 13 |
+
- What strategy does your agent use?
|
| 14 |
+
- What tools did you implement in your MCP server?
|
| 15 |
+
- Any interesting techniques or optimizations?
|
| 16 |
+
|
| 17 |
+
## Files
|
| 18 |
+
|
| 19 |
+
- `agent.py` - ReAct agent implementation
|
| 20 |
+
- `mcp_server.py` - MCP server with game tools
|
| 21 |
+
- `requirements.txt` - Additional dependencies (if any)
|
| 22 |
+
|
| 23 |
+
## Local Testing
|
| 24 |
+
|
| 25 |
+
```bash
|
| 26 |
+
# Test the MCP server
|
| 27 |
+
fastmcp dev mcp_server.py
|
| 28 |
+
|
| 29 |
+
# Run the agent
|
| 30 |
+
python agent.py
|
| 31 |
+
```
|
submission_template/agent.py
ADDED
|
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Student Agent for Text Adventure Games
|
| 3 |
+
|
| 4 |
+
This is your submission file. Implement the StudentAgent class to play
|
| 5 |
+
text adventure games using the MCP server you also implement.
|
| 6 |
+
|
| 7 |
+
Your agent should:
|
| 8 |
+
1. Connect to the MCP server via the provided client
|
| 9 |
+
2. Use the ReAct pattern (Thought -> Action -> Observation)
|
| 10 |
+
3. Call MCP tools to interact with the game
|
| 11 |
+
4. Maximize the game score within the step limit
|
| 12 |
+
|
| 13 |
+
Required method:
|
| 14 |
+
async def run(self, client, game, max_steps, seed, verbose) -> RunResult
|
| 15 |
+
|
| 16 |
+
The 'client' is a FastMCP Client already connected to your MCP server.
|
| 17 |
+
Use it to call tools like: await client.call_tool("play_action", {"action": "look"})
|
| 18 |
+
|
| 19 |
+
Tips:
|
| 20 |
+
- Start by looking around and understanding your environment
|
| 21 |
+
- Keep track of visited locations to avoid loops
|
| 22 |
+
- Pick up useful items (lamp, sword, etc.)
|
| 23 |
+
- The seed parameter should be used to set your LLM's seed for reproducibility
|
| 24 |
+
"""
|
| 25 |
+
|
| 26 |
+
import json
|
| 27 |
+
import os
|
| 28 |
+
import re
|
| 29 |
+
from dataclasses import dataclass, field
|
| 30 |
+
from typing import Optional
|
| 31 |
+
|
| 32 |
+
from dotenv import load_dotenv
|
| 33 |
+
from huggingface_hub import InferenceClient
|
| 34 |
+
|
| 35 |
+
# Load environment variables
|
| 36 |
+
load_dotenv()
|
| 37 |
+
|
| 38 |
+
# =============================================================================
|
| 39 |
+
# LLM Configuration - DO NOT MODIFY
|
| 40 |
+
# =============================================================================
|
| 41 |
+
|
| 42 |
+
# Model to use (fixed for fair evaluation)
|
| 43 |
+
LLM_MODEL = "Qwen/Qwen2.5-72B-Instruct"
|
| 44 |
+
|
| 45 |
+
# Initialize the LLM client (uses HF_TOKEN from environment)
|
| 46 |
+
_hf_token = os.getenv("HF_TOKEN")
|
| 47 |
+
if not _hf_token:
|
| 48 |
+
raise ValueError("HF_TOKEN not found. Set it in your .env file.")
|
| 49 |
+
|
| 50 |
+
LLM_CLIENT = InferenceClient(token=_hf_token)
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def call_llm(prompt: str, system_prompt: str, seed: int, max_tokens: int = 300) -> str:
|
| 54 |
+
"""
|
| 55 |
+
Call the LLM with the given prompt. Use this function in your agent.
|
| 56 |
+
|
| 57 |
+
Args:
|
| 58 |
+
prompt: The user prompt (current game state, history, etc.)
|
| 59 |
+
system_prompt: The system prompt (instructions for the agent)
|
| 60 |
+
seed: Random seed for reproducibility
|
| 61 |
+
max_tokens: Maximum tokens in response (default: 300)
|
| 62 |
+
|
| 63 |
+
Returns:
|
| 64 |
+
The LLM's response text
|
| 65 |
+
|
| 66 |
+
Example:
|
| 67 |
+
response = call_llm(
|
| 68 |
+
prompt="You are in a forest. What do you do?",
|
| 69 |
+
system_prompt=SYSTEM_PROMPT,
|
| 70 |
+
seed=42,
|
| 71 |
+
)
|
| 72 |
+
"""
|
| 73 |
+
messages = [
|
| 74 |
+
{"role": "system", "content": system_prompt},
|
| 75 |
+
{"role": "user", "content": prompt},
|
| 76 |
+
]
|
| 77 |
+
|
| 78 |
+
response = LLM_CLIENT.chat.completions.create(
|
| 79 |
+
model=LLM_MODEL,
|
| 80 |
+
messages=messages,
|
| 81 |
+
temperature=0.0, # Deterministic for reproducibility
|
| 82 |
+
max_tokens=max_tokens,
|
| 83 |
+
seed=seed,
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
return response.choices[0].message.content
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
@dataclass
|
| 90 |
+
class RunResult:
|
| 91 |
+
"""Result of running the agent. Do not modify this class."""
|
| 92 |
+
final_score: int
|
| 93 |
+
max_score: int
|
| 94 |
+
moves: int
|
| 95 |
+
locations_visited: set[str]
|
| 96 |
+
game_completed: bool
|
| 97 |
+
error: Optional[str] = None
|
| 98 |
+
history: list[tuple[str, str, str]] = field(default_factory=list)
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
# =============================================================================
|
| 102 |
+
# System Prompt - Customize this for your agent
|
| 103 |
+
# =============================================================================
|
| 104 |
+
|
| 105 |
+
SYSTEM_PROMPT = """You are playing a classic text adventure game.
|
| 106 |
+
|
| 107 |
+
GOAL: Explore the world, solve puzzles, and maximize your score.
|
| 108 |
+
|
| 109 |
+
AVAILABLE TOOLS (use via MCP):
|
| 110 |
+
- play_action: Execute a game command (north, take lamp, open mailbox, etc.)
|
| 111 |
+
- memory: Get current game state and history (if implemented)
|
| 112 |
+
- inventory: Check what you're carrying (if implemented)
|
| 113 |
+
|
| 114 |
+
VALID GAME COMMANDS for play_action:
|
| 115 |
+
- Movement: north, south, east, west, up, down, enter, exit
|
| 116 |
+
- Objects: take <item>, drop <item>, open <thing>, close <thing>, examine <thing>
|
| 117 |
+
- Other: look, inventory, read <thing>, turn on lamp
|
| 118 |
+
|
| 119 |
+
RESPOND IN THIS EXACT FORMAT (no markdown):
|
| 120 |
+
THOUGHT: <your reasoning about what to do next>
|
| 121 |
+
TOOL: <tool_name>
|
| 122 |
+
ARGS: <JSON arguments, e.g., {"action": "look"}>
|
| 123 |
+
|
| 124 |
+
Example:
|
| 125 |
+
THOUGHT: I should look around to see where I am.
|
| 126 |
+
TOOL: play_action
|
| 127 |
+
ARGS: {"action": "look"}
|
| 128 |
+
"""
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
# =============================================================================
|
| 132 |
+
# Student Agent - IMPLEMENT THIS CLASS
|
| 133 |
+
# =============================================================================
|
| 134 |
+
|
| 135 |
+
class StudentAgent:
|
| 136 |
+
"""
|
| 137 |
+
Your ReAct agent implementation.
|
| 138 |
+
|
| 139 |
+
TODO:
|
| 140 |
+
1. Implement the run() method with the ReAct loop
|
| 141 |
+
2. Parse LLM responses to extract tool calls
|
| 142 |
+
3. Track state and avoid loops
|
| 143 |
+
|
| 144 |
+
Use the provided call_llm() function to interact with the LLM.
|
| 145 |
+
"""
|
| 146 |
+
|
| 147 |
+
def __init__(self):
|
| 148 |
+
"""Initialize your agent here."""
|
| 149 |
+
# TODO: Initialize any state tracking you need
|
| 150 |
+
# self.history = []
|
| 151 |
+
# self.visited_locations = set()
|
| 152 |
+
pass
|
| 153 |
+
|
| 154 |
+
async def run(
|
| 155 |
+
self,
|
| 156 |
+
client, # FastMCP Client connected to your MCP server
|
| 157 |
+
game: str,
|
| 158 |
+
max_steps: int,
|
| 159 |
+
seed: int,
|
| 160 |
+
verbose: bool = False,
|
| 161 |
+
) -> RunResult:
|
| 162 |
+
"""
|
| 163 |
+
Run the agent for a game session.
|
| 164 |
+
|
| 165 |
+
Args:
|
| 166 |
+
client: FastMCP Client connected to your MCP server
|
| 167 |
+
game: Name of the game being played (e.g., "zork1")
|
| 168 |
+
max_steps: Maximum number of steps to take
|
| 169 |
+
seed: Random seed for reproducibility (use for LLM calls)
|
| 170 |
+
verbose: Whether to print detailed output
|
| 171 |
+
|
| 172 |
+
Returns:
|
| 173 |
+
RunResult with final score and statistics
|
| 174 |
+
"""
|
| 175 |
+
# TODO: Implement your ReAct loop here
|
| 176 |
+
#
|
| 177 |
+
# Basic structure:
|
| 178 |
+
# 1. Get initial observation (call play_action with "look")
|
| 179 |
+
# 2. Loop for max_steps:
|
| 180 |
+
# a. Build prompt with current observation and history
|
| 181 |
+
# b. Call LLM to get thought and action
|
| 182 |
+
# c. Parse the response to extract tool and args
|
| 183 |
+
# d. Call the tool via client.call_tool(tool_name, args)
|
| 184 |
+
# e. Update history and state
|
| 185 |
+
# f. Check for game over
|
| 186 |
+
# 3. Return RunResult with final statistics
|
| 187 |
+
|
| 188 |
+
# Example of calling a tool:
|
| 189 |
+
# result = await client.call_tool("play_action", {"action": "look"})
|
| 190 |
+
# observation = result[0].text if result else "No response"
|
| 191 |
+
|
| 192 |
+
# Example of calling the LLM:
|
| 193 |
+
# response = call_llm(
|
| 194 |
+
# prompt="Current observation: " + observation,
|
| 195 |
+
# system_prompt=SYSTEM_PROMPT,
|
| 196 |
+
# seed=seed,
|
| 197 |
+
# )
|
| 198 |
+
|
| 199 |
+
# Placeholder implementation - replace with your code
|
| 200 |
+
locations_visited = set()
|
| 201 |
+
history = []
|
| 202 |
+
final_score = 0
|
| 203 |
+
moves = 0
|
| 204 |
+
|
| 205 |
+
# TODO: Your implementation here
|
| 206 |
+
# ...
|
| 207 |
+
|
| 208 |
+
return RunResult(
|
| 209 |
+
final_score=final_score,
|
| 210 |
+
max_score=350, # Zork1 max score, adjust if needed
|
| 211 |
+
moves=moves,
|
| 212 |
+
locations_visited=locations_visited,
|
| 213 |
+
game_completed=False,
|
| 214 |
+
history=history,
|
| 215 |
+
)
|
| 216 |
+
|
| 217 |
+
def _build_prompt(self, observation: str, history: list) -> str:
|
| 218 |
+
"""
|
| 219 |
+
Build the prompt for the LLM.
|
| 220 |
+
|
| 221 |
+
TODO: Implement this to create effective prompts
|
| 222 |
+
"""
|
| 223 |
+
# TODO: Combine system prompt, history, and current observation
|
| 224 |
+
pass
|
| 225 |
+
|
| 226 |
+
def _parse_response(self, response: str) -> tuple[str, str, dict]:
|
| 227 |
+
"""
|
| 228 |
+
Parse LLM response to extract thought, tool name, and arguments.
|
| 229 |
+
|
| 230 |
+
TODO: Implement robust parsing
|
| 231 |
+
|
| 232 |
+
Returns:
|
| 233 |
+
Tuple of (thought, tool_name, args_dict)
|
| 234 |
+
"""
|
| 235 |
+
# TODO: Parse the response format:
|
| 236 |
+
# THOUGHT: ...
|
| 237 |
+
# TOOL: ...
|
| 238 |
+
# ARGS: {...}
|
| 239 |
+
pass
|
| 240 |
+
|
| 241 |
+
def _call_llm(self, prompt: str, system_prompt: str, seed: int) -> str:
|
| 242 |
+
"""
|
| 243 |
+
Call the LLM with the given prompt.
|
| 244 |
+
|
| 245 |
+
This is a convenience wrapper - you can also use call_llm() directly.
|
| 246 |
+
"""
|
| 247 |
+
return call_llm(prompt, system_prompt, seed)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
+
# =============================================================================
|
| 251 |
+
# For local testing
|
| 252 |
+
# =============================================================================
|
| 253 |
+
|
| 254 |
+
async def test_agent():
|
| 255 |
+
"""Test the agent locally."""
|
| 256 |
+
from fastmcp import Client
|
| 257 |
+
|
| 258 |
+
# Path to your MCP server
|
| 259 |
+
server_path = "mcp_server.py"
|
| 260 |
+
|
| 261 |
+
agent = StudentAgent()
|
| 262 |
+
|
| 263 |
+
async with Client(server_path) as client:
|
| 264 |
+
result = await agent.run(
|
| 265 |
+
client=client,
|
| 266 |
+
game="zork1",
|
| 267 |
+
max_steps=10,
|
| 268 |
+
seed=42,
|
| 269 |
+
verbose=True,
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
print(f"\nFinal Score: {result.final_score}")
|
| 273 |
+
print(f"Moves: {result.moves}")
|
| 274 |
+
print(f"Locations: {result.locations_visited}")
|
| 275 |
+
|
| 276 |
+
|
| 277 |
+
if __name__ == "__main__":
|
| 278 |
+
import asyncio
|
| 279 |
+
asyncio.run(test_agent())
|
submission_template/app.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Hugging Face Space - Text Adventure Agent Submission
|
| 3 |
+
|
| 4 |
+
This is a code-only Space for submitting your agent implementation.
|
| 5 |
+
The evaluation is run separately.
|
| 6 |
+
|
| 7 |
+
Files in this submission:
|
| 8 |
+
- agent.py: Your ReAct agent implementation
|
| 9 |
+
- mcp_server.py: Your MCP server implementation
|
| 10 |
+
- requirements.txt: Additional dependencies
|
| 11 |
+
|
| 12 |
+
To test locally:
|
| 13 |
+
fastmcp dev mcp_server.py
|
| 14 |
+
python agent.py
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import gradio as gr
|
| 18 |
+
from pathlib import Path
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def read_readme():
|
| 22 |
+
"""Read the README content."""
|
| 23 |
+
readme_path = Path(__file__).parent / "README.md"
|
| 24 |
+
if readme_path.exists():
|
| 25 |
+
return readme_path.read_text()
|
| 26 |
+
return "# Submission\n\nNo README.md found."
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def read_file_content(filename: str) -> str:
|
| 30 |
+
"""Read a source file's content."""
|
| 31 |
+
file_path = Path(__file__).parent / filename
|
| 32 |
+
if file_path.exists():
|
| 33 |
+
return file_path.read_text()
|
| 34 |
+
return f"# File not found: {filename}"
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Create the Gradio interface
|
| 38 |
+
with gr.Blocks(title="Text Adventure Agent Submission") as demo:
|
| 39 |
+
gr.Markdown("# Text Adventure Agent Submission")
|
| 40 |
+
gr.Markdown(
|
| 41 |
+
"This Space contains a student submission for the Text Adventure Agent assignment. "
|
| 42 |
+
"Use the tabs below to view the submitted code."
|
| 43 |
+
)
|
| 44 |
+
|
| 45 |
+
with gr.Tabs():
|
| 46 |
+
with gr.Tab("README"):
|
| 47 |
+
gr.Markdown(read_readme())
|
| 48 |
+
|
| 49 |
+
with gr.Tab("Agent Code"):
|
| 50 |
+
gr.Code(
|
| 51 |
+
value=read_file_content("agent.py"),
|
| 52 |
+
language="python",
|
| 53 |
+
label="agent.py",
|
| 54 |
+
)
|
| 55 |
+
|
| 56 |
+
with gr.Tab("MCP Server Code"):
|
| 57 |
+
gr.Code(
|
| 58 |
+
value=read_file_content("mcp_server.py"),
|
| 59 |
+
language="python",
|
| 60 |
+
label="mcp_server.py",
|
| 61 |
+
)
|
| 62 |
+
|
| 63 |
+
gr.Markdown(
|
| 64 |
+
"---\n"
|
| 65 |
+
"**Note:** This is a code submission Space. "
|
| 66 |
+
"Evaluation is performed using the evaluation script."
|
| 67 |
+
)
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
if __name__ == "__main__":
|
| 71 |
+
demo.launch()
|
templates/mcp_server_template.py → submission_template/mcp_server.py
RENAMED
|
@@ -1,15 +1,27 @@
|
|
| 1 |
"""
|
| 2 |
-
MCP Server
|
| 3 |
|
| 4 |
-
This is
|
| 5 |
-
|
| 6 |
|
| 7 |
-
|
|
|
|
|
|
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
"""
|
| 14 |
|
| 15 |
import sys
|
|
@@ -26,122 +38,172 @@ from games.zork_env import TextAdventureEnv
|
|
| 26 |
# Create the MCP Server
|
| 27 |
# =============================================================================
|
| 28 |
|
| 29 |
-
|
| 30 |
-
# Hint: mcp = FastMCP("Your Server Name")
|
| 31 |
-
mcp = FastMCP("Text Adventure Server")
|
| 32 |
|
| 33 |
|
| 34 |
# =============================================================================
|
| 35 |
# Game State Management
|
| 36 |
# =============================================================================
|
| 37 |
|
| 38 |
-
class
|
| 39 |
"""
|
| 40 |
Manages the text adventure game state.
|
| 41 |
|
| 42 |
-
TODO:
|
| 43 |
-
- Action history (for
|
| 44 |
- Explored locations (for mapping)
|
| 45 |
-
- Current
|
| 46 |
"""
|
| 47 |
|
| 48 |
-
def __init__(self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
self.game_name = game
|
| 50 |
self.env = TextAdventureEnv(game)
|
| 51 |
self.state = self.env.reset()
|
| 52 |
-
# TODO:
|
| 53 |
-
|
| 54 |
-
# self.explored_locations = {}
|
| 55 |
|
| 56 |
-
def
|
| 57 |
-
"""Execute
|
|
|
|
|
|
|
|
|
|
| 58 |
self.state = self.env.step(action)
|
|
|
|
| 59 |
# TODO: Update your state tracking here
|
|
|
|
|
|
|
|
|
|
| 60 |
return self.state.observation
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
|
| 62 |
|
| 63 |
-
# Global game
|
| 64 |
-
_game
|
| 65 |
|
| 66 |
|
| 67 |
-
def get_game() ->
|
| 68 |
-
"""Get or
|
| 69 |
global _game
|
| 70 |
-
if _game is None:
|
| 71 |
-
|
|
|
|
|
|
|
| 72 |
return _game
|
| 73 |
|
| 74 |
|
| 75 |
# =============================================================================
|
| 76 |
-
# MCP Tools - IMPLEMENT THESE
|
| 77 |
# =============================================================================
|
| 78 |
|
| 79 |
@mcp.tool()
|
| 80 |
def play_action(action: str) -> str:
|
| 81 |
"""
|
| 82 |
-
Execute a game
|
| 83 |
|
| 84 |
This is the main tool for interacting with the game.
|
| 85 |
|
| 86 |
-
Common commands:
|
| 87 |
-
- Movement: north, south, east, west, up, down
|
| 88 |
-
- Objects: take <item>, drop <item>, open <thing>
|
| 89 |
-
- Look: look, examine <thing>
|
| 90 |
-
|
| 91 |
Args:
|
| 92 |
-
action: The command to execute (e.g.,
|
| 93 |
-
|
| 94 |
Returns:
|
| 95 |
-
The game's response to
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
"""
|
| 97 |
-
# TODO: Implement this tool
|
| 98 |
-
# Hint: Use get_game().take_action(action)
|
| 99 |
game = get_game()
|
| 100 |
-
result = game.take_action(action)
|
| 101 |
|
| 102 |
-
# TODO:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
return result
|
| 104 |
|
| 105 |
|
| 106 |
-
# TODO: Implement additional
|
| 107 |
-
# These are optional but will help your agent play better!
|
| 108 |
|
| 109 |
# @mcp.tool()
|
| 110 |
# def memory() -> str:
|
| 111 |
# """
|
| 112 |
-
# Get
|
| 113 |
#
|
| 114 |
-
# Returns
|
| 115 |
-
#
|
| 116 |
# """
|
| 117 |
-
#
|
|
|
|
| 118 |
# pass
|
| 119 |
|
| 120 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 121 |
# @mcp.tool()
|
| 122 |
# def get_map() -> str:
|
| 123 |
# """
|
| 124 |
# Get a map of explored locations.
|
| 125 |
#
|
| 126 |
-
#
|
|
|
|
| 127 |
# """
|
| 128 |
-
#
|
|
|
|
| 129 |
# pass
|
| 130 |
|
| 131 |
|
| 132 |
# @mcp.tool()
|
| 133 |
-
# def
|
| 134 |
# """
|
| 135 |
-
#
|
|
|
|
|
|
|
|
|
|
| 136 |
# """
|
| 137 |
-
# #
|
| 138 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
|
| 140 |
|
| 141 |
# =============================================================================
|
| 142 |
-
#
|
| 143 |
# =============================================================================
|
| 144 |
|
| 145 |
if __name__ == "__main__":
|
| 146 |
-
# This runs the server
|
| 147 |
mcp.run()
|
|
|
|
| 1 |
"""
|
| 2 |
+
Student MCP Server for Text Adventure Games
|
| 3 |
|
| 4 |
+
This is your MCP server submission. Implement the tools that your agent
|
| 5 |
+
will use to play text adventure games.
|
| 6 |
|
| 7 |
+
Required tool:
|
| 8 |
+
play_action(action: str) -> str
|
| 9 |
+
Execute a game command and return the result.
|
| 10 |
|
| 11 |
+
Recommended tools:
|
| 12 |
+
memory() -> str
|
| 13 |
+
Return current game state, score, and recent history.
|
| 14 |
+
|
| 15 |
+
inventory() -> str
|
| 16 |
+
Return the player's current inventory.
|
| 17 |
+
|
| 18 |
+
get_map() -> str
|
| 19 |
+
Return a map of explored locations.
|
| 20 |
+
|
| 21 |
+
Test your server with:
|
| 22 |
+
fastmcp dev submission_template/mcp_server.py
|
| 23 |
+
|
| 24 |
+
Then open the MCP Inspector in your browser to test the tools interactively.
|
| 25 |
"""
|
| 26 |
|
| 27 |
import sys
|
|
|
|
| 38 |
# Create the MCP Server
|
| 39 |
# =============================================================================
|
| 40 |
|
| 41 |
+
mcp = FastMCP("Student Text Adventure Server")
|
|
|
|
|
|
|
| 42 |
|
| 43 |
|
| 44 |
# =============================================================================
|
| 45 |
# Game State Management
|
| 46 |
# =============================================================================
|
| 47 |
|
| 48 |
+
class GameManager:
|
| 49 |
"""
|
| 50 |
Manages the text adventure game state.
|
| 51 |
|
| 52 |
+
TODO: Extend this class to track:
|
| 53 |
+
- Action history (for memory tool)
|
| 54 |
- Explored locations (for mapping)
|
| 55 |
+
- Current score and moves
|
| 56 |
"""
|
| 57 |
|
| 58 |
+
def __init__(self):
|
| 59 |
+
self.env: TextAdventureEnv = None
|
| 60 |
+
self.state = None
|
| 61 |
+
self.game_name: str = ""
|
| 62 |
+
# TODO: Add more state tracking
|
| 63 |
+
# self.history: list[tuple[str, str]] = []
|
| 64 |
+
# self.explored_locations: dict[str, set[str]] = {}
|
| 65 |
+
# self.current_location: str = ""
|
| 66 |
+
|
| 67 |
+
def initialize(self, game: str = "zork1"):
|
| 68 |
+
"""Initialize or reset the game."""
|
| 69 |
self.game_name = game
|
| 70 |
self.env = TextAdventureEnv(game)
|
| 71 |
self.state = self.env.reset()
|
| 72 |
+
# TODO: Reset your state tracking here
|
| 73 |
+
return self.state.observation
|
|
|
|
| 74 |
|
| 75 |
+
def step(self, action: str) -> str:
|
| 76 |
+
"""Execute an action and return the result."""
|
| 77 |
+
if self.env is None:
|
| 78 |
+
self.initialize()
|
| 79 |
+
|
| 80 |
self.state = self.env.step(action)
|
| 81 |
+
|
| 82 |
# TODO: Update your state tracking here
|
| 83 |
+
# self.history.append((action, self.state.observation))
|
| 84 |
+
# Update location tracking, etc.
|
| 85 |
+
|
| 86 |
return self.state.observation
|
| 87 |
+
|
| 88 |
+
def get_score(self) -> int:
|
| 89 |
+
"""Get current score."""
|
| 90 |
+
return self.state.score if self.state else 0
|
| 91 |
+
|
| 92 |
+
def get_moves(self) -> int:
|
| 93 |
+
"""Get number of moves taken."""
|
| 94 |
+
return self.state.moves if self.state else 0
|
| 95 |
|
| 96 |
|
| 97 |
+
# Global game manager
|
| 98 |
+
_game = GameManager()
|
| 99 |
|
| 100 |
|
| 101 |
+
def get_game() -> GameManager:
|
| 102 |
+
"""Get or initialize the game manager."""
|
| 103 |
global _game
|
| 104 |
+
if _game.env is None:
|
| 105 |
+
# Get game from environment variable (set by evaluator)
|
| 106 |
+
game = os.environ.get("GAME", "zork1")
|
| 107 |
+
_game.initialize(game)
|
| 108 |
return _game
|
| 109 |
|
| 110 |
|
| 111 |
# =============================================================================
|
| 112 |
+
# MCP Tools - IMPLEMENT THESE
|
| 113 |
# =============================================================================
|
| 114 |
|
| 115 |
@mcp.tool()
|
| 116 |
def play_action(action: str) -> str:
|
| 117 |
"""
|
| 118 |
+
Execute a game command and return the result.
|
| 119 |
|
| 120 |
This is the main tool for interacting with the game.
|
| 121 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 122 |
Args:
|
| 123 |
+
action: The command to execute (e.g., "north", "take lamp", "open mailbox")
|
| 124 |
+
|
| 125 |
Returns:
|
| 126 |
+
The game's response to the action
|
| 127 |
+
|
| 128 |
+
Valid commands include:
|
| 129 |
+
- Movement: north, south, east, west, up, down, enter, exit
|
| 130 |
+
- Objects: take <item>, drop <item>, open <thing>, examine <thing>
|
| 131 |
+
- Other: look, inventory, read <thing>, turn on lamp
|
| 132 |
"""
|
|
|
|
|
|
|
| 133 |
game = get_game()
|
|
|
|
| 134 |
|
| 135 |
+
# TODO: You might want to add action validation here
|
| 136 |
+
# TODO: You might want to include score changes in the response
|
| 137 |
+
|
| 138 |
+
result = game.step(action)
|
| 139 |
+
|
| 140 |
+
# Optional: Append score info
|
| 141 |
+
# result += f"\n[Score: {game.get_score()} | Moves: {game.get_moves()}]"
|
| 142 |
+
|
| 143 |
return result
|
| 144 |
|
| 145 |
|
| 146 |
+
# TODO: Implement additional tools to help your agent
|
|
|
|
| 147 |
|
| 148 |
# @mcp.tool()
|
| 149 |
# def memory() -> str:
|
| 150 |
# """
|
| 151 |
+
# Get the current game state summary.
|
| 152 |
#
|
| 153 |
+
# Returns:
|
| 154 |
+
# A summary including current location, score, moves, and recent history
|
| 155 |
# """
|
| 156 |
+
# game = get_game()
|
| 157 |
+
# # TODO: Return useful state information
|
| 158 |
# pass
|
| 159 |
|
| 160 |
|
| 161 |
+
# @mcp.tool()
|
| 162 |
+
# def inventory() -> str:
|
| 163 |
+
# """
|
| 164 |
+
# Check what the player is carrying.
|
| 165 |
+
#
|
| 166 |
+
# Returns:
|
| 167 |
+
# List of items in the player's inventory
|
| 168 |
+
# """
|
| 169 |
+
# game = get_game()
|
| 170 |
+
# result = game.step("inventory")
|
| 171 |
+
# return result
|
| 172 |
+
|
| 173 |
+
|
| 174 |
# @mcp.tool()
|
| 175 |
# def get_map() -> str:
|
| 176 |
# """
|
| 177 |
# Get a map of explored locations.
|
| 178 |
#
|
| 179 |
+
# Returns:
|
| 180 |
+
# A text representation of explored locations and connections
|
| 181 |
# """
|
| 182 |
+
# game = get_game()
|
| 183 |
+
# # TODO: Return map of explored locations
|
| 184 |
# pass
|
| 185 |
|
| 186 |
|
| 187 |
# @mcp.tool()
|
| 188 |
+
# def get_valid_actions() -> str:
|
| 189 |
# """
|
| 190 |
+
# Get a list of likely valid actions from the current location.
|
| 191 |
+
#
|
| 192 |
+
# Returns:
|
| 193 |
+
# List of actions that might work here
|
| 194 |
# """
|
| 195 |
+
# # This is a hint: Jericho provides get_valid_actions()
|
| 196 |
+
# game = get_game()
|
| 197 |
+
# if game.env and game.env.env:
|
| 198 |
+
# valid = game.env.env.get_valid_actions()
|
| 199 |
+
# return "Valid actions: " + ", ".join(valid[:20])
|
| 200 |
+
# return "Could not determine valid actions"
|
| 201 |
|
| 202 |
|
| 203 |
# =============================================================================
|
| 204 |
+
# Run the server
|
| 205 |
# =============================================================================
|
| 206 |
|
| 207 |
if __name__ == "__main__":
|
| 208 |
+
# This runs the server with stdio transport (for MCP clients)
|
| 209 |
mcp.run()
|
submission_template/requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies (provided by course infrastructure)
|
| 2 |
+
# jericho
|
| 3 |
+
# python-dotenv
|
| 4 |
+
# fastmcp
|
| 5 |
+
# huggingface_hub
|
| 6 |
+
|
| 7 |
+
# Required for HF Space
|
| 8 |
+
gradio
|
templates/README.md
DELETED
|
@@ -1,129 +0,0 @@
|
|
| 1 |
-
# Text Adventure LLM Agent Templates
|
| 2 |
-
|
| 3 |
-
This folder contains starter templates for building your own AI agent to play text adventure games.
|
| 4 |
-
|
| 5 |
-
## Assignment Overview
|
| 6 |
-
|
| 7 |
-
You need to implement two components:
|
| 8 |
-
|
| 9 |
-
1. **MCP Server** (`mcp_server_template.py`) - Exposes game functionality as tools
|
| 10 |
-
2. **ReAct Agent** (`react_agent_template.py`) - Uses the MCP server to play the game
|
| 11 |
-
|
| 12 |
-
## Architecture
|
| 13 |
-
|
| 14 |
-
```
|
| 15 |
-
+-------------------+ MCP Protocol +------------------+
|
| 16 |
-
| | <------------------> | |
|
| 17 |
-
| ReAct Agent | (tools/calls) | MCP Server |
|
| 18 |
-
| (Your Agent) | | (Your Server) |
|
| 19 |
-
| | | |
|
| 20 |
-
+-------------------+ +------------------+
|
| 21 |
-
| |
|
| 22 |
-
| LLM API | Game API
|
| 23 |
-
v v
|
| 24 |
-
+-------------------+ +------------------+
|
| 25 |
-
| | | |
|
| 26 |
-
| HuggingFace | | Text Adventure |
|
| 27 |
-
| Inference API | | (Jericho) |
|
| 28 |
-
+-------------------+ +------------------+
|
| 29 |
-
```
|
| 30 |
-
|
| 31 |
-
## Getting Started
|
| 32 |
-
|
| 33 |
-
### 1. Set Up Environment
|
| 34 |
-
|
| 35 |
-
```bash
|
| 36 |
-
# Create virtual environment
|
| 37 |
-
uv venv
|
| 38 |
-
source .venv/bin/activate
|
| 39 |
-
|
| 40 |
-
# Install dependencies
|
| 41 |
-
uv pip install -r requirements.txt
|
| 42 |
-
|
| 43 |
-
# Copy environment file and add your HuggingFace token
|
| 44 |
-
cp .env.example .env
|
| 45 |
-
# Edit .env and add HF_TOKEN=your_token_here
|
| 46 |
-
```
|
| 47 |
-
|
| 48 |
-
### 2. Implement the MCP Server
|
| 49 |
-
|
| 50 |
-
Start with `mcp_server_template.py`. Your server needs to:
|
| 51 |
-
|
| 52 |
-
1. Create a FastMCP server instance
|
| 53 |
-
2. Implement at least the `play_action` tool to send commands to the game
|
| 54 |
-
3. Optionally add helper tools (memory, map, inventory, hints)
|
| 55 |
-
|
| 56 |
-
Test your server:
|
| 57 |
-
```bash
|
| 58 |
-
# Run the server directly (will use stdio transport)
|
| 59 |
-
python templates/mcp_server_template.py
|
| 60 |
-
|
| 61 |
-
# Or use FastMCP's development tools
|
| 62 |
-
fastmcp dev templates/mcp_server_template.py
|
| 63 |
-
```
|
| 64 |
-
|
| 65 |
-
### 3. Implement the ReAct Agent
|
| 66 |
-
|
| 67 |
-
Start with `react_agent_template.py`. Your agent needs to:
|
| 68 |
-
|
| 69 |
-
1. Connect to your MCP server using FastMCP Client
|
| 70 |
-
2. Implement a ReAct loop (Thought -> Action -> Observation)
|
| 71 |
-
3. Use the LLM to decide what tools to call
|
| 72 |
-
4. Parse the LLM's response and execute the chosen tool
|
| 73 |
-
|
| 74 |
-
Test your agent:
|
| 75 |
-
```bash
|
| 76 |
-
python templates/react_agent_template.py
|
| 77 |
-
```
|
| 78 |
-
|
| 79 |
-
## MCP Protocol Basics
|
| 80 |
-
|
| 81 |
-
MCP (Model Context Protocol) is a standard for LLM-tool communication:
|
| 82 |
-
|
| 83 |
-
- **Tools**: Functions the LLM can call (e.g., `play_action`, `inventory`)
|
| 84 |
-
- **Resources**: Read-only data (e.g., game state, map)
|
| 85 |
-
- **Prompts**: Reusable prompt templates
|
| 86 |
-
|
| 87 |
-
FastMCP makes it easy:
|
| 88 |
-
|
| 89 |
-
```python
|
| 90 |
-
# Server side - define a tool
|
| 91 |
-
from fastmcp import FastMCP
|
| 92 |
-
|
| 93 |
-
mcp = FastMCP("My Server")
|
| 94 |
-
|
| 95 |
-
@mcp.tool()
|
| 96 |
-
def my_tool(arg: str) -> str:
|
| 97 |
-
"""Tool description for the LLM."""
|
| 98 |
-
return f"Result: {arg}"
|
| 99 |
-
|
| 100 |
-
# Client side - call a tool
|
| 101 |
-
from fastmcp import Client
|
| 102 |
-
|
| 103 |
-
async with Client(mcp) as client:
|
| 104 |
-
result = await client.call_tool("my_tool", {"arg": "hello"})
|
| 105 |
-
```
|
| 106 |
-
|
| 107 |
-
## Evaluation Criteria
|
| 108 |
-
|
| 109 |
-
Your implementation will be evaluated on:
|
| 110 |
-
|
| 111 |
-
1. **Correctness**: Does it work? Can it play text adventure games?
|
| 112 |
-
2. **Score**: How many points does your agent achieve?
|
| 113 |
-
3. **Code Quality**: Is your code clean, documented, and well-structured?
|
| 114 |
-
4. **Creativity**: Did you add interesting features or optimizations?
|
| 115 |
-
|
| 116 |
-
## Tips
|
| 117 |
-
|
| 118 |
-
1. Start simple - get a basic loop working first
|
| 119 |
-
2. Use `memory()` and `get_map()` tools to help the agent track state
|
| 120 |
-
3. Add loop detection to avoid repeating the same actions
|
| 121 |
-
4. Test with verbose output to debug the agent's reasoning
|
| 122 |
-
5. The LLM may generate invalid commands - handle errors gracefully
|
| 123 |
-
|
| 124 |
-
## Resources
|
| 125 |
-
|
| 126 |
-
- [FastMCP Documentation](https://gofastmcp.com/)
|
| 127 |
-
- [MCP Protocol Specification](https://modelcontextprotocol.io/)
|
| 128 |
-
- [Jericho (Text Adventures)](https://github.com/microsoft/jericho)
|
| 129 |
-
- [HuggingFace Inference API](https://huggingface.co/docs/huggingface_hub/guides/inference)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
templates/react_agent_template.py
DELETED
|
@@ -1,303 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
ReAct Agent Template for Text Adventure Games
|
| 3 |
-
|
| 4 |
-
This is a starter template for building a ReAct agent that plays text adventures using MCP.
|
| 5 |
-
|
| 6 |
-
ReAct (Reasoning + Acting) is a simple but effective agent pattern:
|
| 7 |
-
1. THINK: Reason about the current situation
|
| 8 |
-
2. ACT: Choose and execute a tool
|
| 9 |
-
3. OBSERVE: See the result
|
| 10 |
-
4. Repeat until goal is achieved
|
| 11 |
-
|
| 12 |
-
Your task is to implement:
|
| 13 |
-
1. Connect to the MCP server
|
| 14 |
-
2. Implement the ReAct loop
|
| 15 |
-
3. Use the LLM to generate thoughts and choose actions
|
| 16 |
-
|
| 17 |
-
TODO:
|
| 18 |
-
1. Set up the MCP client connection
|
| 19 |
-
2. Implement the agent loop
|
| 20 |
-
3. Parse LLM responses to extract tool calls
|
| 21 |
-
"""
|
| 22 |
-
|
| 23 |
-
import asyncio
|
| 24 |
-
import os
|
| 25 |
-
from huggingface_hub import InferenceClient
|
| 26 |
-
from dotenv import load_dotenv
|
| 27 |
-
|
| 28 |
-
# FastMCP client for connecting to MCP servers
|
| 29 |
-
from fastmcp import Client
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
# =============================================================================
|
| 33 |
-
# Configuration
|
| 34 |
-
# =============================================================================
|
| 35 |
-
|
| 36 |
-
# Load environment variables
|
| 37 |
-
load_dotenv()
|
| 38 |
-
|
| 39 |
-
# LLM Configuration
|
| 40 |
-
MODEL = os.getenv("HF_MODEL", "meta-llama/Llama-3.2-3B-Instruct")
|
| 41 |
-
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 42 |
-
|
| 43 |
-
if not HF_TOKEN:
|
| 44 |
-
raise ValueError("HF_TOKEN not found. Set it in your .env file.")
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
# =============================================================================
|
| 48 |
-
# System Prompt - Instructions for the LLM
|
| 49 |
-
# =============================================================================
|
| 50 |
-
|
| 51 |
-
SYSTEM_PROMPT = """You are playing a classic text adventure game.
|
| 52 |
-
|
| 53 |
-
GOAL: Explore the world, solve puzzles, collect treasures, and maximize your score.
|
| 54 |
-
|
| 55 |
-
AVAILABLE TOOLS:
|
| 56 |
-
- play_action: Execute a game command (north, take lamp, open mailbox, etc.)
|
| 57 |
-
- memory: Get current game state summary (optional, if implemented)
|
| 58 |
-
- get_map: See explored locations (optional, if implemented)
|
| 59 |
-
- inventory: Check your items (optional, if implemented)
|
| 60 |
-
|
| 61 |
-
VALID GAME COMMANDS:
|
| 62 |
-
- Movement: north, south, east, west, up, down
|
| 63 |
-
- Objects: take <item>, drop <item>, open <thing>, examine <thing>
|
| 64 |
-
- Light: turn on lamp
|
| 65 |
-
|
| 66 |
-
RESPOND IN THIS EXACT FORMAT:
|
| 67 |
-
THOUGHT: <your reasoning>
|
| 68 |
-
TOOL: <tool_name>
|
| 69 |
-
ARGS: <arguments as JSON, or empty {} if no args>
|
| 70 |
-
|
| 71 |
-
Example:
|
| 72 |
-
THOUGHT: I see a container. I should open it to see what's inside.
|
| 73 |
-
TOOL: play_action
|
| 74 |
-
ARGS: {"action": "open container"}
|
| 75 |
-
"""
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
# =============================================================================
|
| 79 |
-
# ReAct Agent Class
|
| 80 |
-
# =============================================================================
|
| 81 |
-
|
| 82 |
-
class ReActAgent:
    """
    A ReAct agent that uses MCP tools to play text adventures.

    Each step asks the LLM for a THOUGHT / TOOL / ARGS reply, executes the
    chosen tool on the MCP server and records the outcome in ``history``.

    TODO: Complete this implementation!
    """

    def __init__(self, mcp_server_path: str):
        """
        Initialize the agent.

        Args:
            mcp_server_path: Path to the MCP server script
        """
        self.mcp_server_path = mcp_server_path
        self.llm = InferenceClient(token=HF_TOKEN)
        # One entry per executed step: thought, tool, args and result.
        self.history: list[dict] = []

    async def run(self, max_steps: int = 50, verbose: bool = True):
        """
        Run the ReAct agent loop.

        TODO: Implement the main agent loop!

        Steps:
        1. Connect to MCP server using FastMCP Client
        2. Get initial observation (call play_action with "look")
        3. Loop:
           a. Build prompt with current observation
           b. Call LLM to get thought and tool choice
           c. Parse the response
           d. Execute the chosen tool via MCP
           e. Update history with observation
           f. Check if done

        Args:
            max_steps: Upper bound on the number of think/act iterations.
            verbose: When true, print the parsed thought, tool and args.
        """
        print("=" * 60)
        print("Starting Text Adventure ReAct Agent")
        print("=" * 60)

        # Connect to the MCP server
        async with Client(self.mcp_server_path) as client:
            # List available tools
            tools = await client.list_tools()
            print(f"\nAvailable tools: {[t.name for t in tools]}")

            # Get initial observation
            result = await client.call_tool("play_action", {"action": "look"})
            observation = self._result_text(result)
            print(f"\nInitial observation:\n{observation}\n")

            # Main loop
            for step in range(1, max_steps + 1):
                print(f"\n{'─' * 40}")
                print(f"Step {step}")
                print("─" * 40)

                # TODO: Build prompt for LLM
                prompt = self._build_prompt(observation)

                # TODO: Call LLM
                response = self._call_llm(prompt)

                # TODO: Parse response to get tool and arguments
                thought, tool_name, tool_args = self._parse_response(response)

                if verbose:
                    print(f"\nTHOUGHT: {thought}")
                    print(f"TOOL: {tool_name}")
                    print(f"ARGS: {tool_args}")

                # TODO: Execute the tool via MCP
                # Tool failures (unknown tool, bad arguments) are deliberately
                # fed back to the LLM as the next observation instead of
                # aborting the run.
                try:
                    result = await client.call_tool(tool_name, tool_args)
                    observation = self._result_text(result)
                    print(f"\nRESULT:\n{observation}")
                except Exception as e:
                    observation = f"Error: {e}"
                    print(f"\nERROR: {e}")

                # TODO: Update history
                self.history.append({
                    "thought": thought,
                    "tool": tool_name,
                    "args": tool_args,
                    "result": observation,
                })

                # Check for game over
                if "GAME OVER" in observation.upper():
                    print("\n\nGame Over!")
                    break

        print("\n" + "=" * 60)
        print("Agent finished")
        print("=" * 60)

    @staticmethod
    def _result_text(result) -> str:
        """
        Return the text carried by a tool result.

        Reads ``result.content`` and joins the ``text`` of every item that
        has one. A result with no content yields an empty string instead of
        raising ``IndexError``.
        """
        content = getattr(result, "content", None) or []
        texts = [item.text for item in content if getattr(item, "text", None)]
        return "\n".join(texts)

    def _build_prompt(self, observation: str) -> str:
        """
        Build the prompt for the LLM.

        TODO: Customize this to include relevant context!

        Consider including:
        - Current observation
        - Recent history (last few actions and results)
        - Warnings about repeated actions
        """
        parts = []

        # Add recent history (last 3 actions)
        if self.history:
            parts.append("Recent actions:")
            for entry in self.history[-3:]:
                parts.append(f"  > {entry['tool']}({entry['args']}) -> {entry['result'][:100]}...")
            parts.append("")

        # Current observation
        parts.append(f"Current observation:\n{observation}")
        parts.append("\nWhat do you do next?")

        return "\n".join(parts)

    def _call_llm(self, prompt: str) -> str:
        """
        Call the LLM to get the next action.

        Returns the raw reply text. On any LLM error a canned reply that
        performs ``look`` is returned so the loop can continue.

        TODO: Customize LLM parameters if needed.
        """
        try:
            messages = [
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": prompt},
            ]

            response = self.llm.chat.completions.create(
                model=MODEL,
                messages=messages,
                temperature=0.7,
                max_tokens=200,
            )
            # The message content may be None; always hand back a string.
            return response.choices[0].message.content or ""
        except Exception as e:
            print(f"LLM Error: {e}")
            return "THOUGHT: Error occurred.\nTOOL: play_action\nARGS: {\"action\": \"look\"}"

    @staticmethod
    def _coerce_args(args_str: str):
        """
        Turn the text after ``ARGS:`` into a tool-argument dict.

        Accepts a JSON object, a bare JSON string (treated as the action),
        or malformed JSON that still contains a quoted ``action`` value
        (e.g. single-quoted keys). Returns ``None`` when nothing usable is
        found so the caller can keep its current value.
        """
        import json
        import re

        try:
            parsed = json.loads(args_str)
        except json.JSONDecodeError:
            parsed = None

        if isinstance(parsed, dict):
            return parsed
        if isinstance(parsed, str) and parsed.strip():
            return {"action": parsed.strip()}

        # Salvage the intended command from not-quite-JSON such as
        # {'action': 'open mailbox'} rather than discarding it. The
        # backreference keeps apostrophes inside double-quoted values intact.
        match = re.search(
            r"""["']?action["']?\s*[:=]\s*(["'])(.+?)\1""",
            args_str,
            re.IGNORECASE,
        )
        if match:
            return {"action": match.group(2).strip()}
        return None

    def _parse_response(self, response: str) -> tuple[str, str, dict]:
        """
        Parse the LLM response to extract thought, tool, and arguments.

        Expected format:
        THOUGHT: <reasoning>
        TOOL: <tool_name>
        ARGS: <json args>

        Missing pieces fall back to safe defaults: the tool defaults to
        ``play_action``; arguments default to ``{"action": "look"}`` for
        ``play_action`` and to ``{}`` for every other tool, so tools that
        take no arguments are not handed a stray ``action``.

        Returns:
            A ``(thought, tool_name, tool_args)`` tuple.
        """
        thought = ""
        tool_name = "play_action"
        tool_args = None  # None means "no usable ARGS line seen yet"

        for raw_line in (response or "").strip().splitlines():
            label, sep, value = raw_line.strip().partition(":")
            if not sep:
                continue
            label = label.strip().upper()
            value = value.strip()

            if label == "THOUGHT":
                thought = value
            elif label == "TOOL":
                # An empty TOOL line keeps the default instead of producing
                # an empty tool name.
                if value:
                    tool_name = value.lower()
            elif label == "ARGS":
                coerced = self._coerce_args(value)
                if coerced is not None:
                    tool_args = coerced

        if tool_args is None:
            tool_args = {"action": "look"} if tool_name == "play_action" else {}

        return thought, tool_name, tool_args
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
# =============================================================================
|
| 270 |
-
# Main - Run the agent
|
| 271 |
-
# =============================================================================
|
| 272 |
-
|
| 273 |
-
async def main():
    """
    Run the ReAct agent.

    Parses the command line (server script path, step limit, verbosity),
    builds a ``ReActAgent`` and runs it.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Run the ReAct Text Adventure Agent")
    parser.add_argument(
        "--server", "-s",
        default="templates/mcp_server_template.py",
        help="Path to the MCP server script",
    )
    parser.add_argument(
        "--max-steps", "-n",
        type=int,
        default=50,
        help="Maximum steps to run",
    )
    # A store_true flag that defaults to True can never be turned off.
    # BooleanOptionalAction keeps --verbose/-v working and adds --no-verbose.
    parser.add_argument(
        "--verbose", "-v",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Show detailed output",
    )

    args = parser.parse_args()

    agent = ReActAgent(args.server)
    await agent.run(max_steps=args.max_steps, verbose=args.verbose)


if __name__ == "__main__":
    asyncio.run(main())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|