""" vLLM Model Wrapper for HuggingFace Models This file creates an AI agent that uses vLLM to run large language models from HuggingFace. vLLM provides fast GPU-accelerated inference for LLMs. """ # Import Python's operating system module to interact with environment variables import os # Import random module for fallback random move selection import random # Import Optional type hint for parameters that can be None from typing import Optional # Import vLLM's main classes: LLM for model loading, SamplingParams for generation settings from vllm import LLM, SamplingParams # Import the base protocol that defines what an agent should look like from .base import AgentLike # Import utility functions for parsing game observations and moves from ..utils.parsing import ( extract_legal_moves, # Function: gets list of valid moves from observation text extract_forbidden, # Function: gets list of forbidden moves slice_board_and_moves, # Function: creates compact version of board state strip_think, # Function: removes thinking text from model output MOVE_RE # Regular expression: pattern to find moves like "[A0 B0]" ) # Import prompt management classes from ..prompts import PromptPack, get_prompt_pack class VLLMAgent(AgentLike): """ Agent class powered by vLLM for fast GPU inference. This agent can load any HuggingFace model and use it to play Stratego. Inherits from: AgentLike (protocol defining agent interface) """ def __init__( self, model_name: str, # String: HuggingFace model ID (e.g., "google/gemma-2-2b-it") system_prompt: Optional[str] = None, # String or None: custom system prompt override prompt_pack: Optional[PromptPack | str] = None, # PromptPack object or string: prompt configuration temperature: float = 0.2, # Float: controls randomness (0.0=deterministic, 1.0=creative) top_p: float = 0.9, # Float: nucleus sampling threshold max_tokens: int = 64, # Integer: maximum tokens to generate per response gpu_memory_utilization: float = 0.3, # Check Check checking Float: fraction of GPU memory to use (0.0-1.0) tensor_parallel_size: int = 1, # Integer: number of GPUs to use for this model download_dir: str = "/scratch/hm24/.cache/huggingface", # String: where to cache model files **kwargs, # Dictionary: additional vLLM arguments ): """ Initialize the vLLM agent by loading a model from HuggingFace. This constructor: 1. Sets up prompt configuration 2. Configures cache directories 3. Loads the model into GPU memory using vLLM 4. Configures generation parameters """ # Store the model name as an instance variable (self.model_name) # This is used later for displaying which model made a move self.model_name = model_name # Handle prompt_pack parameter which can be a string name or PromptPack object if isinstance(prompt_pack, str) or prompt_pack is None: # If it's a string or None, load the prompt pack by name # get_prompt_pack() returns a PromptPack object with system prompts and guidance self.prompt_pack: PromptPack = get_prompt_pack(prompt_pack) else: # If it's already a PromptPack object, use it directly self.prompt_pack = prompt_pack # Set system prompt: use custom if provided, otherwise use from prompt pack # The system prompt tells the model how to behave (e.g., "You are a Stratego player") self.system_prompt = system_prompt if system_prompt is not None else self.prompt_pack.system # Force HuggingFace to cache models in /scratch instead of home directory # Environment variables control where transformers library saves downloaded models os.environ["HF_HOME"] = download_dir os.environ["TRANSFORMERS_CACHE"] = download_dir # Print status messages to show progress print(f"🤖 Loading {model_name} with vLLM...") print(f"📁 Cache directory: {download_dir}") # Create vLLM engine instance # This loads the model from HuggingFace and prepares it for inference self.llm = LLM( model=model_name, # Which model to load download_dir=download_dir, # Where to save/load model files gpu_memory_utilization=gpu_memory_utilization, # How much GPU memory to use tensor_parallel_size=tensor_parallel_size, # How many GPUs to split model across trust_remote_code=True, # Allow custom model code from HuggingFace **kwargs # Pass any additional vLLM parameters ) # Create sampling parameters object # This controls how the model generates text (temperature, length, etc.) self.sampling_params = SamplingParams( temperature=temperature, # Randomness in generation top_p=top_p, # Nucleus sampling parameter max_tokens=max_tokens, # Maximum length of generated response stop=["\n\n", "Player", "Legal moves:"], # List of strings that stop generation ) # Print success message print(f"✅ Model loaded successfully!") def _llm_once(self, prompt: str) -> str: """ Generate a single response from the model. This is a private method (starts with _) used internally. Args: prompt (str): The input text to send to the model Returns: str: The model's response text, cleaned of thinking markers """ # Combine system prompt and user prompt into full prompt # Format: "System: \n\nUser: " full_prompt = f"{self.system_prompt}\n\n{prompt}" # Call vLLM to generate response # Returns a list of output objects (we only generate 1, so index [0]) outputs = self.llm.generate([full_prompt], self.sampling_params) # Extract text from first output's first completion # Structure: outputs[request_index].outputs[completion_index].text response = outputs[0].outputs[0].text.strip() # Remove any thinking markers (like ...) from response # strip_think() is a utility function that cleans the text return strip_think(response) def __call__(self, observation: str) -> str: """ Main method called when agent needs to make a move. This makes the agent callable like a function: agent(observation) Args: observation (str): The current game state as text from TextArena Returns: str: A move in format "[A0 B0]" representing from-square to-square """ # Step 1: Extract list of legal moves from observation text # Returns list like ["[A0 B0]", "[A1 B1]", ...] legal = extract_legal_moves(observation) # If no legal moves exist, return empty string (game might be over) if not legal: return "" # Step 2: Get forbidden moves (moves that were already tried and failed) # Returns set of move strings to avoid forbidden = set(extract_forbidden(observation)) # Filter legal moves to remove forbidden ones # List comprehension: keep only moves NOT in forbidden set # If all moves forbidden, fall back to full legal list legal_filtered = [m for m in legal if m not in forbidden] or legal[:] # Step 3: Create compact version of observation for model # slice_board_and_moves() removes unnecessary text to save tokens slim = slice_board_and_moves(observation) # Get guidance prompt from prompt pack # This wraps the slim observation with instructions guidance = self.prompt_pack.guidance(slim) # Step 4: Try to get valid move with retry loop (max 3 attempts) for attempt in range(3): # First strategy: Use guidance prompt # Call model with full game context and instructions raw = self._llm_once(guidance) # Search for move pattern in response using regex # MOVE_RE.search() looks for pattern like "[A0 B0]" m = MOVE_RE.search(raw) # If regex found a match if m: # Extract the matched move string mv = m.group(0) # Check if this move is in our legal filtered list if mv in legal_filtered: # Valid move found! Return it return mv # Second strategy (attempts 1 and 2): Direct instruction if attempt > 0: # Ask model directly for move without game context raw2 = self._llm_once("Output exactly one legal move [A0 B0].") # Search for move pattern in this response m2 = MOVE_RE.search(raw2) if m2: mv2 = m2.group(0) # Check validity if mv2 in legal_filtered: return mv2 # Step 5: Fallback - all attempts failed # Choose random legal move to ensure game continues # random.choice() picks one item from list randomly return random.choice(legal_filtered) def cleanup(self): """ Free GPU memory by deleting model. Call this when done with agent to release resources. """ # Delete the LLM object, freeing VRAM del self.llm # Import torch to access CUDA functions import torch # Force PyTorch to release all unused GPU memory torch.cuda.empty_cache()