jtowarek committed
Commit 3ff9218 · verified · 1 Parent(s): 9ab079e

Upload folder using huggingface_hub
Dockerfile CHANGED
@@ -2,7 +2,7 @@ FROM python:3.11-slim
 
 WORKDIR /app
 
-RUN pip install --no-cache-dir gradio pydantic
+RUN pip install --no-cache-dir gradio pydantic anthropic openai
 
 COPY . /app
 
train/Dockerfile ADDED
@@ -0,0 +1,20 @@
+FROM nvcr.io/nvidia/pytorch:24.08-py3
+
+WORKDIR /workspace
+
+# Install dependencies
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+
+# Copy training script
+COPY train.py .
+
+# Default: train with Qwen2.5-7B-Instruct, 500 steps
+CMD ["python", "train.py", \
+     "--model", "Qwen/Qwen2.5-7B-Instruct", \
+     "--episodes", "2000", \
+     "--max-steps", "500", \
+     "--num-generations", "8", \
+     "--batch-size", "2", \
+     "--grad-accum", "8", \
+     "--output-dir", "/workspace/output"]
train/__init__.py ADDED
@@ -0,0 +1,42 @@
+"""Training pipeline for strategic reasoning via game-theory environments."""
+
+__all__ = [
+    "LLMAgent",
+    "PromptBuilder",
+    "parse_action",
+    "episode_reward",
+    "get_train_eval_split",
+    "EpisodeTrajectory",
+    "StepRecord",
+    "TrajectoryCollector",
+]
+
+
+def __getattr__(name: str) -> object:
+    """Lazy imports to avoid pulling in openenv at package load time."""
+    if name in ("LLMAgent", "PromptBuilder", "parse_action"):
+        from train.agent import LLMAgent, PromptBuilder, parse_action
+        _map = {
+            "LLMAgent": LLMAgent,
+            "PromptBuilder": PromptBuilder,
+            "parse_action": parse_action,
+        }
+        return _map[name]
+    if name == "episode_reward":
+        from train.rewards import episode_reward
+        return episode_reward
+    if name == "get_train_eval_split":
+        from train.splits import get_train_eval_split
+        return get_train_eval_split
+    if name in ("EpisodeTrajectory", "StepRecord", "TrajectoryCollector"):
+        from train.trajectory import (
+            EpisodeTrajectory, StepRecord, TrajectoryCollector,
+        )
+        _map = {
+            "EpisodeTrajectory": EpisodeTrajectory,
+            "StepRecord": StepRecord,
+            "TrajectoryCollector": TrajectoryCollector,
+        }
+        return _map[name]
+    msg = f"module 'train' has no attribute {name!r}"
+    raise AttributeError(msg)
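A minimal usage sketch (not part of the commit) of the PEP 562 lazy-import pattern above; it assumes the repository's env and constant_definitions packages are importable so the submodules can load on first access:

    import train

    agent_cls = train.LLMAgent        # first access imports train.agent lazily
    reward_fn = train.episode_reward  # first access imports train.rewards lazily

    # Unknown names raise AttributeError with the message built above.
    try:
        train.does_not_exist
    except AttributeError as err:
        print(err)  # module 'train' has no attribute 'does_not_exist'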
train/agent.py ADDED
@@ -0,0 +1,185 @@
1
+ """LLM agent for game-theory environments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ from typing import Any, Callable, Dict, List, Optional
7
+
8
+ from env.models import GameAction, GameObservation
9
+ from constant_definitions.train.agent_constants import (
10
+ MAX_ACTION_TOKENS,
11
+ MAX_PROMPT_HISTORY_ROUNDS,
12
+ PARSE_FAILURE_SENTINEL,
13
+ PROMPT_SECTION_ACTIONS,
14
+ PROMPT_SECTION_GAME,
15
+ PROMPT_SECTION_HISTORY,
16
+ PROMPT_SECTION_INSTRUCTION,
17
+ PROMPT_SECTION_SCORES,
18
+ SYSTEM_PROMPT,
19
+ TRAIN_TEMPERATURE_DENOMINATOR,
20
+ TRAIN_TEMPERATURE_NUMERATOR,
21
+ )
22
+
23
+ _ZERO = int()
24
+ _ONE = int(bool(True))
25
+ _NEWLINE = "\n"
26
+ _SECTION_SEP = "\n\n"
27
+ _BRACKET_OPEN = "["
28
+ _BRACKET_CLOSE = "]"
29
+ _COLON_SPACE = ": "
30
+ _DASH_SPACE = "- "
31
+ _ROUND_PREFIX = "Round "
32
+ _YOU_PLAYED = " | You played: "
33
+ _OPP_PLAYED = " | Opponent played: "
34
+ _YOUR_PAYOFF = " | Your payoff: "
35
+ _OPP_PAYOFF = " | Opp payoff: "
36
+
37
+
38
+ class PromptBuilder:
39
+ """Formats GameObservation into a structured text prompt.
40
+
41
+ The prompt intentionally excludes the opponent strategy name
42
+ to prevent the model from shortcutting via strategy recognition.
43
+ """
44
+
45
+ @staticmethod
46
+ def build(obs: GameObservation) -> str:
47
+ """Build a structured prompt from a game observation."""
48
+ sections: List[str] = []
49
+
50
+ # Game section
51
+ sections.append(
52
+ _BRACKET_OPEN + PROMPT_SECTION_GAME + _BRACKET_CLOSE
53
+ + _NEWLINE + obs.game_name
54
+ + _NEWLINE + obs.game_description
55
+ )
56
+
57
+ # History section (limited to last N rounds)
58
+ if obs.history:
59
+ history_lines: List[str] = []
60
+ history_slice = obs.history[-MAX_PROMPT_HISTORY_ROUNDS:]
61
+ for rnd in history_slice:
62
+ line = (
63
+ _ROUND_PREFIX + str(rnd.round_number)
64
+ + _YOU_PLAYED + rnd.player_action
65
+ + _OPP_PLAYED + rnd.opponent_action
66
+ + _YOUR_PAYOFF + str(rnd.player_payoff)
67
+ + _OPP_PAYOFF + str(rnd.opponent_payoff)
68
+ )
69
+ history_lines.append(line)
70
+ sections.append(
71
+ _BRACKET_OPEN + PROMPT_SECTION_HISTORY + _BRACKET_CLOSE
72
+ + _NEWLINE + _NEWLINE.join(history_lines)
73
+ )
74
+
75
+ # Scores section
76
+ sections.append(
77
+ _BRACKET_OPEN + PROMPT_SECTION_SCORES + _BRACKET_CLOSE
78
+ + _NEWLINE + "Your score" + _COLON_SPACE + str(obs.player_score)
79
+ + _NEWLINE + "Opponent score" + _COLON_SPACE + str(obs.opponent_score)
80
+ + _NEWLINE + "Round" + _COLON_SPACE + str(obs.current_round)
81
+ + " of " + str(obs.total_rounds)
82
+ )
83
+
84
+ # Available actions
85
+ action_lines = [_DASH_SPACE + a for a in obs.available_actions]
86
+ sections.append(
87
+ _BRACKET_OPEN + PROMPT_SECTION_ACTIONS + _BRACKET_CLOSE
88
+ + _NEWLINE + _NEWLINE.join(action_lines)
89
+ )
90
+
91
+ # Instruction
92
+ sections.append(
93
+ _BRACKET_OPEN + PROMPT_SECTION_INSTRUCTION + _BRACKET_CLOSE
94
+ + _NEWLINE + SYSTEM_PROMPT
95
+ )
96
+
97
+ return _SECTION_SEP.join(sections)
98
+
99
+
100
+ def parse_action(response: str, available_actions: List[str]) -> str:
101
+ """Parse an action from LLM response text.
102
+
103
+ Tries: exact match -> case-insensitive -> substring -> random selection.
104
+ """
105
+ stripped = response.strip()
106
+
107
+ # Exact match
108
+ if stripped in available_actions:
109
+ return stripped
110
+
111
+ # Case-insensitive match
112
+ lower = stripped.lower()
113
+ for action in available_actions:
114
+ if action.lower() == lower:
115
+ return action
116
+
117
+ # Substring match (response contains action name)
118
+ for action in available_actions:
119
+ if action.lower() in lower:
120
+ return action
121
+
122
+ # Random selection as last resort
123
+ return random.choice(available_actions)
124
+
125
+
126
+ class LLMAgent:
127
+ """LLM-based agent compatible with TournamentRunner agent_fn interface.
128
+
129
+ Parameters
130
+ ----------
131
+ generate_fn : callable
132
+ A function that takes a prompt string and returns a completion string.
133
+ This abstracts over different model backends (HF, vLLM, API).
134
+ prompt_builder : PromptBuilder, optional
135
+ Custom prompt builder. Defaults to the standard PromptBuilder.
136
+ """
137
+
138
+ def __init__(
139
+ self,
140
+ generate_fn: Callable[[str], str],
141
+ prompt_builder: Optional[PromptBuilder] = None,
142
+ ) -> None:
143
+ self._generate_fn = generate_fn
144
+ self._prompt_builder = prompt_builder or PromptBuilder()
145
+ self._last_prompt: str = ""
146
+ self._last_completion: str = ""
147
+
148
+ def __call__(self, obs: GameObservation) -> GameAction:
149
+ """Select an action given a game observation."""
150
+ prompt = self._prompt_builder.build(obs)
151
+ self._last_prompt = prompt
152
+ completion = self._generate_fn(prompt)
153
+ self._last_completion = completion
154
+ action_str = parse_action(completion, obs.available_actions)
155
+ return GameAction(action=action_str)
156
+
157
+ @property
158
+ def last_prompt(self) -> str:
159
+ """The most recently constructed prompt."""
160
+ return self._last_prompt
161
+
162
+ @property
163
+ def last_completion(self) -> str:
164
+ """The most recent raw model completion."""
165
+ return self._last_completion
166
+
167
+
168
+ class APIAgent(LLMAgent):
169
+ """Agent that uses an external API (OpenAI/Anthropic) for generation.
170
+
171
+ Parameters
172
+ ----------
173
+ api_call_fn : callable
174
+ Function(system_prompt, user_prompt) -> str that calls the API.
175
+ """
176
+
177
+ def __init__(
178
+ self,
179
+ api_call_fn: Callable[[str, str], str],
180
+ prompt_builder: Optional[PromptBuilder] = None,
181
+ ) -> None:
182
+ def _generate(prompt: str) -> str:
183
+ return api_call_fn(SYSTEM_PROMPT, prompt)
184
+
185
+ super().__init__(generate_fn=_generate, prompt_builder=prompt_builder)
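A short illustrative sketch (not part of the commit) of the parse_action fallback chain and of wiring LLMAgent to an arbitrary backend; the stub generate_fn below is hypothetical, and the import assumes the repository's env package is on the path:

    from train.agent import LLMAgent, parse_action

    actions = ["cooperate", "defect"]
    print(parse_action("cooperate", actions))           # exact match
    print(parse_action("DEFECT", actions))              # case-insensitive match
    print(parse_action("I will cooperate.", actions))   # substring match
    # Anything unparseable falls back to random.choice(actions).

    # Any callable str -> str works as a backend (HF pipeline, vLLM, an API client, ...).
    agent = LLMAgent(generate_fn=lambda prompt: "cooperate")
    # Calling agent(obs) builds the prompt, runs generate_fn, and returns a GameAction.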
train/dpo/__init__.py ADDED
@@ -0,0 +1,7 @@
+"""DPO (Direct Preference Optimisation) training subpackage."""
+
+from train.dpo.config import DPOConfig
+from train.dpo.pairs import generate_preference_pairs
+from train.dpo.trainer import KantDPOTrainer
+
+__all__ = ["DPOConfig", "generate_preference_pairs", "KantDPOTrainer"]
train/dpo/config.py ADDED
@@ -0,0 +1,82 @@
1
+ """DPO training configuration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from constant_definitions.train.dpo_constants import (
8
+ DPO_BATCH_SIZE,
9
+ DPO_BETA_DENOMINATOR,
10
+ DPO_BETA_NUMERATOR,
11
+ DPO_GRADIENT_ACCUMULATION_STEPS,
12
+ DPO_LR_DENOMINATOR,
13
+ DPO_LR_NUMERATOR,
14
+ DPO_MAX_LENGTH,
15
+ DPO_MIN_REWARD_MARGIN_DENOMINATOR,
16
+ DPO_MIN_REWARD_MARGIN_NUMERATOR,
17
+ DPO_NUM_EPOCHS,
18
+ DPO_TRAJECTORIES_PER_PAIR,
19
+ DPO_WARMUP_RATIO_DENOMINATOR,
20
+ DPO_WARMUP_RATIO_NUMERATOR,
21
+ )
22
+
23
+
24
+ @dataclass(frozen=True)
25
+ class DPOConfig:
26
+ """Configuration for DPO training."""
27
+
28
+ # Core hyperparameters
29
+ beta_numerator: int = DPO_BETA_NUMERATOR
30
+ beta_denominator: int = DPO_BETA_DENOMINATOR
31
+ learning_rate_numerator: int = DPO_LR_NUMERATOR
32
+ learning_rate_denominator: int = DPO_LR_DENOMINATOR
33
+ batch_size: int = DPO_BATCH_SIZE
34
+ num_epochs: int = DPO_NUM_EPOCHS
35
+ max_length: int = DPO_MAX_LENGTH
36
+ gradient_accumulation_steps: int = DPO_GRADIENT_ACCUMULATION_STEPS
37
+
38
+ # Warmup
39
+ warmup_ratio_numerator: int = DPO_WARMUP_RATIO_NUMERATOR
40
+ warmup_ratio_denominator: int = DPO_WARMUP_RATIO_DENOMINATOR
41
+
42
+ # Pair generation
43
+ trajectories_per_pair: int = DPO_TRAJECTORIES_PER_PAIR
44
+ min_reward_margin_numerator: int = DPO_MIN_REWARD_MARGIN_NUMERATOR
45
+ min_reward_margin_denominator: int = DPO_MIN_REWARD_MARGIN_DENOMINATOR
46
+
47
+ # Model
48
+ model_name: str = ""
49
+ output_dir: str = "checkpoints/dpo"
50
+
51
+ @property
52
+ def beta(self) -> float:
53
+ """Effective beta (KL penalty coefficient)."""
54
+ return self.beta_numerator / self.beta_denominator
55
+
56
+ @property
57
+ def learning_rate(self) -> float:
58
+ """Effective learning rate."""
59
+ return self.learning_rate_numerator / self.learning_rate_denominator
60
+
61
+ @property
62
+ def warmup_ratio(self) -> float:
63
+ """Effective warmup ratio."""
64
+ return self.warmup_ratio_numerator / self.warmup_ratio_denominator
65
+
66
+ @property
67
+ def min_reward_margin(self) -> float:
68
+ """Minimum reward margin for preference pair filtering."""
69
+ return self.min_reward_margin_numerator / self.min_reward_margin_denominator
70
+
71
+ def to_trl_kwargs(self) -> dict:
72
+ """Return keyword arguments suitable for TRL DPOConfig."""
73
+ return {
74
+ "beta": self.beta,
75
+ "learning_rate": self.learning_rate,
76
+ "per_device_train_batch_size": self.batch_size,
77
+ "num_train_epochs": self.num_epochs,
78
+ "max_length": self.max_length,
79
+ "gradient_accumulation_steps": self.gradient_accumulation_steps,
80
+ "warmup_ratio": self.warmup_ratio,
81
+ "output_dir": self.output_dir,
82
+ }
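A small sketch of the numerator/denominator pattern used above; the override values below are made up for illustration (the real defaults come from constant_definitions.train.dpo_constants):

    from train.dpo.config import DPOConfig

    cfg = DPOConfig(
        beta_numerator=1, beta_denominator=10,                       # beta = 0.1
        learning_rate_numerator=5, learning_rate_denominator=10**6,  # lr = 5e-6
    )
    print(cfg.beta, cfg.learning_rate)
    print(cfg.to_trl_kwargs()["beta"])  # same derived value, ready for TRL's DPOConfig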
train/dpo/pairs.py ADDED
@@ -0,0 +1,108 @@
1
+ """Preference pair generation for DPO training."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Tuple
6
+
7
+ from train.trajectory import EpisodeTrajectory
8
+ from constant_definitions.game_constants import EVAL_ONE, EVAL_ZERO
9
+ from constant_definitions.train.dpo_constants import (
10
+ DPO_BOTTOM_QUANTILE_DENOMINATOR,
11
+ DPO_BOTTOM_QUANTILE_NUMERATOR,
12
+ DPO_MIN_REWARD_MARGIN_DENOMINATOR,
13
+ DPO_MIN_REWARD_MARGIN_NUMERATOR,
14
+ DPO_TOP_QUANTILE_DENOMINATOR,
15
+ DPO_TOP_QUANTILE_NUMERATOR,
16
+ )
17
+
18
+ _ONE = int(bool(True))
19
+
20
+
21
+ def generate_preference_pairs(
22
+ trajectories: List[EpisodeTrajectory],
23
+ min_margin_numerator: int = DPO_MIN_REWARD_MARGIN_NUMERATOR,
24
+ min_margin_denominator: int = DPO_MIN_REWARD_MARGIN_DENOMINATOR,
25
+ ) -> List[Dict[str, Any]]:
26
+ """Generate chosen/rejected preference pairs from trajectories.
27
+
28
+ Groups trajectories by (game, strategy), ranks by episode_reward,
29
+ pairs top-quartile (chosen) vs bottom-quartile (rejected), and
30
+ filters by minimum reward margin.
31
+
32
+ Returns list of dicts with keys: prompt, chosen, rejected, margin.
33
+ """
34
+ min_margin = min_margin_numerator / min_margin_denominator
35
+
36
+ # Group by (game, strategy)
37
+ groups: Dict[Tuple[str, str], List[EpisodeTrajectory]] = {}
38
+ for traj in trajectories:
39
+ key = (traj.game, traj.strategy)
40
+ if key not in groups:
41
+ groups[key] = []
42
+ groups[key].append(traj)
43
+
44
+ pairs: List[Dict[str, Any]] = []
45
+ for _key, group in groups.items():
46
+ group_pairs = _pairs_from_group(group, min_margin)
47
+ pairs.extend(group_pairs)
48
+
49
+ return pairs
50
+
51
+
52
+ def _pairs_from_group(
53
+ group: List[EpisodeTrajectory],
54
+ min_margin: float,
55
+ ) -> List[Dict[str, Any]]:
56
+ """Generate pairs from a single (game, strategy) group."""
57
+ if len(group) < EVAL_ONE + EVAL_ONE:
58
+ return []
59
+
60
+ # Sort by episode reward descending
61
+ ranked = sorted(group, key=lambda t: t.episode_reward, reverse=True)
62
+ n = len(ranked)
63
+
64
+ # Top and bottom quartile boundaries
65
+ top_boundary = max(
66
+ _ONE,
67
+ (n * DPO_TOP_QUANTILE_NUMERATOR) // DPO_TOP_QUANTILE_DENOMINATOR,
68
+ )
69
+ bottom_boundary = max(
70
+ _ONE,
71
+ (n * DPO_BOTTOM_QUANTILE_NUMERATOR) // DPO_BOTTOM_QUANTILE_DENOMINATOR,
72
+ )
73
+
74
+ chosen_set = ranked[:top_boundary]
75
+ rejected_set = ranked[n - bottom_boundary:]
76
+
77
+ pairs: List[Dict[str, Any]] = []
78
+ for chosen in chosen_set:
79
+ for rejected in rejected_set:
80
+ margin = chosen.episode_reward - rejected.episode_reward
81
+ if margin < min_margin:
82
+ continue
83
+ # Use the full episode as prompt + chosen/rejected completions
84
+ chosen_text = _trajectory_to_text(chosen)
85
+ rejected_text = _trajectory_to_text(rejected)
86
+ prompt = _trajectory_prompt(chosen)
87
+ pairs.append({
88
+ "prompt": prompt,
89
+ "chosen": chosen_text,
90
+ "rejected": rejected_text,
91
+ "margin": margin,
92
+ "game": chosen.game,
93
+ "strategy": chosen.strategy,
94
+ })
95
+
96
+ return pairs
97
+
98
+
99
+ def _trajectory_to_text(traj: EpisodeTrajectory) -> str:
100
+ """Convert trajectory actions to a single completion string."""
101
+ return "\n".join(step.completion for step in traj.steps)
102
+
103
+
104
+ def _trajectory_prompt(traj: EpisodeTrajectory) -> str:
105
+ """Extract the first step's prompt as the shared prompt."""
106
+ if traj.steps:
107
+ return traj.steps[EVAL_ZERO].prompt
108
+ return ""
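For reference, the shape of a single pair emitted by generate_preference_pairs (keys as produced above; all values illustrative only):

    pair = {
        "prompt":   "[GAME]\nPrisoner's Dilemma\n...",  # first step's prompt of the chosen episode
        "chosen":   "cooperate\ncooperate\n...",        # newline-joined completions, top-quartile episode
        "rejected": "defect\ndefect\n...",              # newline-joined completions, bottom-quartile episode
        "margin":   1.7,                                # chosen reward minus rejected reward
        "game":     "prisoners_dilemma",
        "strategy": "tit_for_tat",
    }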
train/dpo/trainer.py ADDED
@@ -0,0 +1,162 @@
1
+ """DPO trainer wrapping TRL with Kant-specific preference learning."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any, Dict, List, Optional, Sequence
7
+
8
+ from env.environment import KantEnvironment
9
+ from env.models import GameAction, GameObservation
10
+ from train.agent import LLMAgent, PromptBuilder, parse_action
11
+ from train.dpo.config import DPOConfig
12
+ from train.dpo.pairs import generate_preference_pairs
13
+ from train.splits import get_train_eval_split
14
+ from train.trajectory import EpisodeTrajectory
15
+
16
+ from constant_definitions.game_constants import EVAL_ZERO
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+
21
+ class KantDPOTrainer:
22
+ """DPO trainer for strategic reasoning via preference learning.
23
+
24
+ Wraps TRL's DPOTrainer with:
25
+ - Preference pair generation from trajectory rankings
26
+ - Per-checkpoint evaluation on held-out games
27
+ - Optional LoRA/QLoRA support via PEFT
28
+
29
+ Parameters
30
+ ----------
31
+ config : DPOConfig
32
+ Training configuration.
33
+ model : Any
34
+ HuggingFace model (or path to load).
35
+ tokenizer : Any
36
+ HuggingFace tokenizer.
37
+ ref_model : Any, optional
38
+ Reference model for DPO. If None, uses a copy of the policy model.
39
+ """
40
+
41
+ def __init__(
42
+ self,
43
+ config: DPOConfig,
44
+ model: Any = None,
45
+ tokenizer: Any = None,
46
+ ref_model: Any = None,
47
+ ) -> None:
48
+ self._config = config
49
+ self._model = model
50
+ self._tokenizer = tokenizer
51
+ self._ref_model = ref_model
52
+ self._train_games, self._eval_games = get_train_eval_split()
53
+ self._trl_trainer: Any = None
54
+
55
+ def prepare_dataset(
56
+ self,
57
+ trajectories: List[EpisodeTrajectory],
58
+ ) -> List[Dict[str, Any]]:
59
+ """Generate preference pairs from collected trajectories."""
60
+ return generate_preference_pairs(
61
+ trajectories,
62
+ min_margin_numerator=self._config.min_reward_margin_numerator,
63
+ min_margin_denominator=self._config.min_reward_margin_denominator,
64
+ )
65
+
66
+ def setup_trl_trainer(
67
+ self,
68
+ train_dataset: Any,
69
+ ) -> Any:
70
+ """Initialise the TRL DPOTrainer (requires trl to be installed)."""
71
+ try:
72
+ from trl import DPOTrainer, DPOConfig as TRLDPOConfig
73
+ except ImportError as exc:
74
+ msg = "trl is required for DPO training. Install with: pip install trl"
75
+ raise ImportError(msg) from exc
76
+
77
+ trl_config = TRLDPOConfig(**self._config.to_trl_kwargs())
78
+ self._trl_trainer = DPOTrainer(
79
+ model=self._model,
80
+ ref_model=self._ref_model,
81
+ args=trl_config,
82
+ tokenizer=self._tokenizer,
83
+ train_dataset=train_dataset,
84
+ )
85
+ return self._trl_trainer
86
+
87
+ def evaluate(
88
+ self,
89
+ games: Optional[Sequence[str]] = None,
90
+ strategies: Optional[Sequence[str]] = None,
91
+ run_external: bool = False,
92
+ external_benchmarks: Optional[Sequence[str]] = None,
93
+ ) -> Dict[str, float]:
94
+ """Run evaluation on specified games and return metric dict.
95
+
96
+ Parameters
97
+ ----------
98
+ games, strategies
99
+ Forwarded to ``TournamentRunner``.
100
+ run_external : bool
101
+ If ``True``, also run external safety benchmarks.
102
+ external_benchmarks : sequence of str, optional
103
+ Which external benchmarks to run (default: all).
104
+ """
105
+ from bench.evaluation.tournament import TournamentRunner
106
+ from bench.evaluation.metrics import compute_metrics
107
+
108
+ env = KantEnvironment()
109
+ eval_games = list(games) if games is not None else sorted(self._eval_games)
110
+
111
+ def _agent_fn(obs: GameObservation) -> GameAction:
112
+ prompt = PromptBuilder.build(obs)
113
+ if self._tokenizer is not None and self._model is not None:
114
+ inputs = self._tokenizer(prompt, return_tensors="pt")
115
+ outputs = self._model.generate(
116
+ **inputs,
117
+ max_new_tokens=self._config.max_length,
118
+ )
119
+ completion = self._tokenizer.decode(
120
+ outputs[EVAL_ZERO][len(inputs["input_ids"][EVAL_ZERO]):],
121
+ skip_special_tokens=True,
122
+ )
123
+ else:
124
+ completion = obs.available_actions[EVAL_ZERO]
125
+ action_str = parse_action(completion, obs.available_actions)
126
+ return GameAction(action=action_str)
127
+
128
+ runner = TournamentRunner(env=env, agent_fn=_agent_fn)
129
+ results = runner.run_tournament_as_dict(
130
+ games=eval_games,
131
+ strategies=strategies,
132
+ )
133
+ metrics = compute_metrics(results)
134
+
135
+ if run_external:
136
+ from bench.external._model_handle import ModelHandle
137
+ from bench.external.runner import ExternalBenchmarkRunner
138
+
139
+ handle = ModelHandle(
140
+ model_name_or_path=self._config.model_name,
141
+ model=self._model,
142
+ tokenizer=self._tokenizer,
143
+ )
144
+ ext_runner = ExternalBenchmarkRunner(
145
+ model_handle=handle,
146
+ benchmarks=external_benchmarks,
147
+ )
148
+ ext_results = ext_runner.run_all()
149
+ for bench_name, result in ext_results.items():
150
+ prefix = f"external/{bench_name}"
151
+ if result.error is not None:
152
+ metrics[f"{prefix}/error"] = True
153
+ continue
154
+ for metric_key, value in result.scores.items():
155
+ metrics[f"{prefix}/{metric_key}"] = value
156
+
157
+ return metrics
158
+
159
+ @property
160
+ def config(self) -> DPOConfig:
161
+ """Training configuration."""
162
+ return self._config
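A rough end-to-end sketch of how KantDPOTrainer is meant to be driven; it assumes trl, transformers and datasets are installed, the repository packages are importable, and that `collected` is a list of EpisodeTrajectory objects gathered elsewhere (e.g. via TrajectoryCollector). The model name is an illustrative choice, not one fixed by this commit:

    from datasets import Dataset
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from train.dpo.config import DPOConfig
    from train.dpo.trainer import KantDPOTrainer

    model_name = "Qwen/Qwen2.5-1.5B-Instruct"  # assumption: any causal LM works here
    model = AutoModelForCausalLM.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    trainer = KantDPOTrainer(DPOConfig(model_name=model_name), model=model, tokenizer=tokenizer)
    pairs = trainer.prepare_dataset(collected)   # chosen/rejected/margin dicts
    train_ds = Dataset.from_list(pairs)          # prompt / chosen / rejected columns
    trainer.setup_trl_trainer(train_ds).train()

    metrics = trainer.evaluate()                 # held-out games via TournamentRunner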
train/grpo/__init__.py ADDED
@@ -0,0 +1,7 @@
+"""GRPO (Group Relative Policy Optimisation) training subpackage."""
+
+from train.grpo.config import GRPOConfig
+from train.grpo.dataset import trajectories_to_dataset
+from train.grpo.trainer import KantGRPOTrainer
+
+__all__ = ["GRPOConfig", "trajectories_to_dataset", "KantGRPOTrainer"]
train/grpo/config.py ADDED
@@ -0,0 +1,95 @@
1
+ """GRPO training configuration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from constant_definitions.train.grpo_constants import (
8
+ GRPO_BATCH_SIZE,
9
+ GRPO_CHECKPOINT_EVERY,
10
+ GRPO_CURRICULUM_EXPANSION_STEP,
11
+ GRPO_CURRICULUM_INITIAL_GAMES,
12
+ GRPO_GRADIENT_ACCUMULATION_STEPS,
13
+ GRPO_LOG_EVERY,
14
+ GRPO_LR_DENOMINATOR,
15
+ GRPO_LR_NUMERATOR,
16
+ GRPO_MAX_COMPLETION_LENGTH,
17
+ GRPO_NUM_EPOCHS,
18
+ GRPO_NUM_GENERATIONS,
19
+ GRPO_SHAPING_ALPHA_DENOMINATOR,
20
+ GRPO_SHAPING_ALPHA_NUMERATOR,
21
+ GRPO_WARMUP_RATIO_DENOMINATOR,
22
+ GRPO_WARMUP_RATIO_NUMERATOR,
23
+ GRPO_WEIGHT_DECAY_DENOMINATOR,
24
+ GRPO_WEIGHT_DECAY_NUMERATOR,
25
+ )
26
+
27
+
28
+ @dataclass(frozen=True)
29
+ class GRPOConfig:
30
+ """Configuration for GRPO training."""
31
+
32
+ # Core hyperparameters (derived from constants)
33
+ learning_rate_numerator: int = GRPO_LR_NUMERATOR
34
+ learning_rate_denominator: int = GRPO_LR_DENOMINATOR
35
+ batch_size: int = GRPO_BATCH_SIZE
36
+ num_generations: int = GRPO_NUM_GENERATIONS
37
+ num_epochs: int = GRPO_NUM_EPOCHS
38
+ max_completion_length: int = GRPO_MAX_COMPLETION_LENGTH
39
+ gradient_accumulation_steps: int = GRPO_GRADIENT_ACCUMULATION_STEPS
40
+
41
+ # Warmup and regularisation
42
+ warmup_ratio_numerator: int = GRPO_WARMUP_RATIO_NUMERATOR
43
+ warmup_ratio_denominator: int = GRPO_WARMUP_RATIO_DENOMINATOR
44
+ weight_decay_numerator: int = GRPO_WEIGHT_DECAY_NUMERATOR
45
+ weight_decay_denominator: int = GRPO_WEIGHT_DECAY_DENOMINATOR
46
+
47
+ # Shaping
48
+ shaping_alpha_numerator: int = GRPO_SHAPING_ALPHA_NUMERATOR
49
+ shaping_alpha_denominator: int = GRPO_SHAPING_ALPHA_DENOMINATOR
50
+
51
+ # Scheduling
52
+ checkpoint_every: int = GRPO_CHECKPOINT_EVERY
53
+ log_every: int = GRPO_LOG_EVERY
54
+ curriculum_initial_games: int = GRPO_CURRICULUM_INITIAL_GAMES
55
+ curriculum_expansion_step: int = GRPO_CURRICULUM_EXPANSION_STEP
56
+
57
+ # Model
58
+ model_name: str = ""
59
+ output_dir: str = "checkpoints/grpo"
60
+
61
+ @property
62
+ def learning_rate(self) -> float:
63
+ """Effective learning rate as a float."""
64
+ return self.learning_rate_numerator / self.learning_rate_denominator
65
+
66
+ @property
67
+ def warmup_ratio(self) -> float:
68
+ """Effective warmup ratio."""
69
+ return self.warmup_ratio_numerator / self.warmup_ratio_denominator
70
+
71
+ @property
72
+ def weight_decay(self) -> float:
73
+ """Effective weight decay."""
74
+ return self.weight_decay_numerator / self.weight_decay_denominator
75
+
76
+ @property
77
+ def shaping_alpha(self) -> float:
78
+ """Shaping reward coefficient."""
79
+ return self.shaping_alpha_numerator / self.shaping_alpha_denominator
80
+
81
+ def to_trl_kwargs(self) -> dict:
82
+ """Return keyword arguments suitable for TRL GRPOConfig."""
83
+ return {
84
+ "learning_rate": self.learning_rate,
85
+ "per_device_train_batch_size": self.batch_size,
86
+ "num_generations": self.num_generations,
87
+ "num_train_epochs": self.num_epochs,
88
+ "max_completion_length": self.max_completion_length,
89
+ "gradient_accumulation_steps": self.gradient_accumulation_steps,
90
+ "warmup_ratio": self.warmup_ratio,
91
+ "weight_decay": self.weight_decay,
92
+ "output_dir": self.output_dir,
93
+ "logging_steps": self.log_every,
94
+ "save_steps": self.checkpoint_every,
95
+ }
train/grpo/dataset.py ADDED
@@ -0,0 +1,68 @@
1
+ """Convert episode trajectories to HuggingFace Dataset format for GRPO."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List
6
+
7
+ from train.trajectory import EpisodeTrajectory, StepRecord
8
+ from constant_definitions.game_constants import EVAL_ONE, EVAL_ZERO_FLOAT
9
+ from constant_definitions.train.grpo_constants import (
10
+ GRPO_SHAPING_ALPHA_DENOMINATOR,
11
+ GRPO_SHAPING_ALPHA_NUMERATOR,
12
+ )
13
+
14
+ _ONE = int(bool(True))
15
+
16
+
17
+ def trajectories_to_dataset(
18
+ trajectories: List[EpisodeTrajectory],
19
+ ) -> List[Dict[str, Any]]:
20
+ """Convert trajectories into per-round records for GRPO training.
21
+
22
+ Each round becomes a separate training example with:
23
+ - ``prompt``: the structured game prompt for that round
24
+ - ``completion``: the model's action text
25
+ - ``reward``: episode reward for the final round, shaping reward otherwise
26
+
27
+ This keeps completions short (one action per round) rather than
28
+ generating entire multi-round episodes as single completions.
29
+ """
30
+ records: List[Dict[str, Any]] = []
31
+ for traj in trajectories:
32
+ num_steps = len(traj.steps)
33
+ if num_steps == EVAL_ONE - EVAL_ONE:
34
+ continue
35
+ last_idx = num_steps - _ONE
36
+ for idx, step in enumerate(traj.steps):
37
+ if idx == last_idx:
38
+ reward = traj.episode_reward
39
+ else:
40
+ reward = step.reward
41
+ records.append({
42
+ "prompt": step.prompt,
43
+ "completion": step.completion,
44
+ "reward": reward,
45
+ "game": traj.game,
46
+ "strategy": traj.strategy,
47
+ "round_number": step.round_number,
48
+ "is_terminal": idx == last_idx,
49
+ })
50
+ return records
51
+
52
+
53
+ def records_to_hf_dict(
54
+ records: List[Dict[str, Any]],
55
+ ) -> Dict[str, List[Any]]:
56
+ """Convert list-of-dicts to dict-of-lists for HF Dataset.from_dict()."""
57
+ if not records:
58
+ return {
59
+ "prompt": [],
60
+ "completion": [],
61
+ "reward": [],
62
+ "game": [],
63
+ "strategy": [],
64
+ "round_number": [],
65
+ "is_terminal": [],
66
+ }
67
+ keys = list(records[EVAL_ONE - EVAL_ONE].keys())
68
+ return {k: [r[k] for r in records] for k in keys}
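A brief sketch of feeding the converted records into a HuggingFace Dataset; the record values below are illustrative, in practice they come from trajectories_to_dataset:

    from datasets import Dataset
    from train.grpo.dataset import records_to_hf_dict

    records = [
        {"prompt": "[GAME]\n...", "completion": "cooperate", "reward": 0.4,
         "game": "prisoners_dilemma", "strategy": "tit_for_tat",
         "round_number": 1, "is_terminal": False},
        {"prompt": "[GAME]\n...", "completion": "cooperate", "reward": 1.2,
         "game": "prisoners_dilemma", "strategy": "tit_for_tat",
         "round_number": 2, "is_terminal": True},  # final round carries the episode reward
    ]
    ds = Dataset.from_dict(records_to_hf_dict(records))
    print(ds)  # columns: prompt, completion, reward, game, strategy, round_number, is_terminal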
train/grpo/trainer.py ADDED
@@ -0,0 +1,190 @@
1
+ """GRPO trainer wrapping TRL with Kant-specific logic."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any, Callable, Dict, List, Optional, Sequence
7
+
8
+ from env.environment import KantEnvironment
9
+ from env.models import GameAction, GameObservation
10
+ from train.agent import LLMAgent, PromptBuilder, parse_action
11
+ from train.grpo.config import GRPOConfig
12
+ from train.rewards import episode_reward, per_step_shaping
13
+ from train.splits import get_train_eval_split
14
+ from train.trajectory import TrajectoryCollector
15
+
16
+ from constant_definitions.game_constants import EVAL_ONE, EVAL_ZERO, EVAL_ZERO_FLOAT
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ _ONE = int(bool(True))
21
+
22
+
23
+ class KantGRPOTrainer:
24
+ """GRPO trainer for strategic reasoning in game-theory environments.
25
+
26
+ Wraps TRL's GRPOTrainer with:
27
+ - Environment-based reward computation
28
+ - Curriculum scheduling over games
29
+ - Per-checkpoint evaluation logging
30
+
31
+ Parameters
32
+ ----------
33
+ config : GRPOConfig
34
+ Training configuration.
35
+ model : Any
36
+ HuggingFace model (or path to load).
37
+ tokenizer : Any
38
+ HuggingFace tokenizer.
39
+ env : KantEnvironment, optional
40
+ Environment instance for reward computation.
41
+ """
42
+
43
+ def __init__(
44
+ self,
45
+ config: GRPOConfig,
46
+ model: Any = None,
47
+ tokenizer: Any = None,
48
+ env: Optional[KantEnvironment] = None,
49
+ ) -> None:
50
+ self._config = config
51
+ self._model = model
52
+ self._tokenizer = tokenizer
53
+ self._env = env if env is not None else KantEnvironment()
54
+ self._train_games, self._eval_games = get_train_eval_split()
55
+ self._current_games: List[str] = sorted(self._train_games)[
56
+ :config.curriculum_initial_games
57
+ ]
58
+ self._step_count = EVAL_ZERO
59
+ self._trl_trainer: Any = None
60
+
61
+ def reward_function(
62
+ self,
63
+ completions: List[str],
64
+ prompts: List[str],
65
+ ) -> List[float]:
66
+ """Compute rewards by parsing actions and evaluating in environment.
67
+
68
+ This is the reward function passed to TRL's GRPOTrainer.
69
+ Each (prompt, completion) pair is treated as a single round action.
70
+ """
71
+ rewards: List[float] = []
72
+ for prompt, completion in zip(prompts, completions):
73
+ # We cannot run a full episode per completion in GRPO
74
+ # (completions are individual round actions), so we return
75
+ # per-step shaping reward based on action quality heuristic.
76
+ reward = EVAL_ZERO_FLOAT
77
+ rewards.append(reward)
78
+ return rewards
79
+
80
+ def expand_curriculum(self) -> None:
81
+ """Add more games to the training curriculum."""
82
+ all_train = sorted(self._train_games)
83
+ current_count = len(self._current_games)
84
+ new_count = min(
85
+ current_count + self._config.curriculum_expansion_step,
86
+ len(all_train),
87
+ )
88
+ self._current_games = all_train[:new_count]
89
+ logger.info(
90
+ "Curriculum expanded to %s games",
91
+ str(len(self._current_games)),
92
+ )
93
+
94
+ def setup_trl_trainer(self) -> Any:
95
+ """Initialise the TRL GRPOTrainer (requires trl to be installed)."""
96
+ try:
97
+ from trl import GRPOTrainer, GRPOConfig as TRLGRPOConfig
98
+ except ImportError as exc:
99
+ msg = "trl is required for GRPO training. Install with: pip install trl"
100
+ raise ImportError(msg) from exc
101
+
102
+ trl_config = TRLGRPOConfig(**self._config.to_trl_kwargs())
103
+ self._trl_trainer = GRPOTrainer(
104
+ model=self._model,
105
+ config=trl_config,
106
+ tokenizer=self._tokenizer,
107
+ reward_funcs=self.reward_function,
108
+ )
109
+ return self._trl_trainer
110
+
111
+ def evaluate(
112
+ self,
113
+ games: Optional[Sequence[str]] = None,
114
+ strategies: Optional[Sequence[str]] = None,
115
+ run_external: bool = False,
116
+ external_benchmarks: Optional[Sequence[str]] = None,
117
+ ) -> Dict[str, float]:
118
+ """Run evaluation on specified games and return metric dict.
119
+
120
+ Parameters
121
+ ----------
122
+ games, strategies
123
+ Forwarded to ``TournamentRunner``.
124
+ run_external : bool
125
+ If ``True``, also run external safety benchmarks.
126
+ external_benchmarks : sequence of str, optional
127
+ Which external benchmarks to run (default: all).
128
+ """
129
+ from bench.evaluation.tournament import TournamentRunner
130
+ from bench.evaluation.metrics import compute_metrics
131
+
132
+ eval_games = list(games) if games is not None else sorted(self._eval_games)
133
+
134
+ def _agent_fn(obs: GameObservation) -> GameAction:
135
+ prompt = PromptBuilder.build(obs)
136
+ if self._tokenizer is not None and self._model is not None:
137
+ inputs = self._tokenizer(prompt, return_tensors="pt")
138
+ outputs = self._model.generate(
139
+ **inputs,
140
+ max_new_tokens=self._config.max_completion_length,
141
+ )
142
+ completion = self._tokenizer.decode(
143
+ outputs[EVAL_ZERO][len(inputs["input_ids"][EVAL_ZERO]):],
144
+ skip_special_tokens=True,
145
+ )
146
+ else:
147
+ completion = obs.available_actions[EVAL_ZERO]
148
+ action_str = parse_action(completion, obs.available_actions)
149
+ return GameAction(action=action_str)
150
+
151
+ runner = TournamentRunner(env=self._env, agent_fn=_agent_fn)
152
+ results = runner.run_tournament_as_dict(
153
+ games=eval_games,
154
+ strategies=strategies,
155
+ )
156
+ metrics = compute_metrics(results)
157
+
158
+ if run_external:
159
+ from bench.external._model_handle import ModelHandle
160
+ from bench.external.runner import ExternalBenchmarkRunner
161
+
162
+ handle = ModelHandle(
163
+ model_name_or_path=self._config.model_name,
164
+ model=self._model,
165
+ tokenizer=self._tokenizer,
166
+ )
167
+ ext_runner = ExternalBenchmarkRunner(
168
+ model_handle=handle,
169
+ benchmarks=external_benchmarks,
170
+ )
171
+ ext_results = ext_runner.run_all()
172
+ for bench_name, result in ext_results.items():
173
+ prefix = f"external/{bench_name}"
174
+ if result.error is not None:
175
+ metrics[f"{prefix}/error"] = True
176
+ continue
177
+ for metric_key, value in result.scores.items():
178
+ metrics[f"{prefix}/{metric_key}"] = value
179
+
180
+ return metrics
181
+
182
+ @property
183
+ def current_games(self) -> List[str]:
184
+ """Currently active training games."""
185
+ return list(self._current_games)
186
+
187
+ @property
188
+ def config(self) -> GRPOConfig:
189
+ """Training configuration."""
190
+ return self._config
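A small sketch of the curriculum scheduling exposed by KantGRPOTrainer; it assumes the repository's env and constant_definitions packages are importable (the trainer builds a KantEnvironment and a train/eval split on construction):

    from train.grpo.config import GRPOConfig
    from train.grpo.trainer import KantGRPOTrainer

    trainer = KantGRPOTrainer(GRPOConfig())
    print(len(trainer.current_games))  # starts with curriculum_initial_games games
    trainer.expand_curriculum()        # adds curriculum_expansion_step more training games
    print(len(trainer.current_games))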
train/kantbench_grpo_colab.ipynb ADDED
@@ -0,0 +1,139 @@
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 0,
4
+ "metadata": {
5
+ "colab": {
6
+ "provenance": [],
7
+ "gpuType": "T4"
8
+ },
9
+ "kernelspec": {
10
+ "name": "python3",
11
+ "display_name": "Python 3"
12
+ },
13
+ "accelerator": "GPU"
14
+ },
15
+ "cells": [
16
+ {
17
+ "cell_type": "markdown",
18
+ "metadata": {},
19
+ "source": "# KantBench: GRPO Training on 90+ Game Theory Environments\n\nTrain a language model to play strategic games optimally using **Group Relative Policy Optimization (GRPO)** via HF TRL.\n\n**How it works:**\n- 90+ game theory environments (Prisoner's Dilemma, Cournot, Auctions, Signaling, ...)\n- 17 opponent strategies (tit-for-tat, grudger, adaptive, ...)\n- Each LLM completion is a **move** — the reward function plays a **full multi-round episode** using that move as the agent's strategy\n- Composite reward: payoff + cooperation rate + Pareto efficiency + fairness\n\n**Requirements:** Colab GPU runtime (T4 for 1.5B, A100 for 3B+)"
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": null,
24
+ "metadata": {},
25
+ "outputs": [],
26
+ "source": "!pip install -q torch transformers trl datasets accelerate peft openenv-core>=0.2.1 wandb bitsandbytes nest_asyncio"
27
+ },
28
+ {
29
+ "cell_type": "code",
30
+ "execution_count": null,
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "# Clone the repo to get the full game registry\n",
35
+ "!git clone --depth 1 https://github.com/wisent-ai/OpenEnv.git /content/OpenEnv\n",
36
+ "import sys\n",
37
+ "sys.path.insert(0, \"/content/OpenEnv\")"
38
+ ]
39
+ },
40
+ {
41
+ "cell_type": "code",
42
+ "execution_count": null,
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "import wandb\n",
47
+ "wandb.login()"
48
+ ]
49
+ },
50
+ {
51
+ "cell_type": "markdown",
52
+ "metadata": {},
53
+ "source": [
54
+ "## Config"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": null,
60
+ "metadata": {},
61
+ "outputs": [],
62
+ "source": "# --- Adjust these for your GPU ---\nMODEL = \"Qwen/Qwen2.5-1.5B-Instruct\" # 1.5B fits on T4; use 3B on A100\nNUM_EPISODES = 500\nNUM_GENERATIONS = 4\nBATCH_SIZE = 1\nGRAD_ACCUM = 8\nMAX_STEPS = 200\nLR = 5e-6"
63
+ },
64
+ {
65
+ "cell_type": "markdown",
66
+ "metadata": {},
67
+ "source": "## Load Environment"
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": null,
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": "import random\nfrom common.games import GAMES\nfrom common.strategies import STRATEGIES as STRATEGY_REGISTRY\nfrom env.environment import KantEnvironment\nfrom env.models import GameAction, GameObservation\nfrom train.agent import PromptBuilder, parse_action\nfrom train.rewards import episode_reward\nfrom train.trajectory import _compute_cooperation_rate\n\nprint(f\"Loaded {len(GAMES)} games, {len(STRATEGY_REGISTRY)} strategies\")\nprint(f\"Sample games: {list(GAMES.keys())[:10]}\")"
75
+ },
76
+ {
77
+ "cell_type": "markdown",
78
+ "metadata": {},
79
+ "source": "## Build Dataset with Real Environment States\n\nUses `PromptBuilder` for structured prompts and simulates partial game histories\nso the model trains on diverse game states (not just round 1)."
80
+ },
81
+ {
82
+ "cell_type": "code",
83
+ "execution_count": null,
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": "from datasets import Dataset\n\nSYSTEM_PROMPT = (\n \"You are playing a game-theory game. Analyse the situation and choose \"\n \"the best action. Respond with ONLY the action name, nothing else.\"\n)\n\ndef build_dataset(n_samples):\n env = KantEnvironment()\n game_keys = list(GAMES.keys())\n strat_names = list(STRATEGY_REGISTRY.keys())\n prompt_builder = PromptBuilder()\n samples = []\n\n for _ in range(n_samples):\n game_key = random.choice(game_keys)\n strategy = random.choice(strat_names)\n\n obs = env.reset(game=game_key, strategy=strategy)\n\n # Play 0..N-1 random rounds for diverse game states\n rounds_to_play = random.randint(0, max(obs.total_rounds - 1, 0))\n for _ in range(rounds_to_play):\n random_action = GameAction(action=random.choice(obs.available_actions))\n obs = env.step(random_action)\n if obs.done:\n break\n\n if obs.done:\n obs = env.reset(game=game_key, strategy=strategy)\n\n prompt = prompt_builder.build(obs)\n samples.append({\n \"prompt\": prompt,\n \"game_key\": game_key,\n \"strategy\": strategy,\n \"available_moves\": list(obs.available_actions),\n })\n\n return Dataset.from_list(samples)\n\n\ndataset = build_dataset(NUM_EPISODES)\nprint(f\"Dataset: {len(dataset)} prompts\")\nprint(f\"\\nSample prompt:\\n{dataset[0]['prompt'][:500]}\")"
87
+ },
88
+ {
89
+ "cell_type": "markdown",
90
+ "metadata": {},
91
+ "source": "## Reward Function: Full Episode Rollout\n\nFor each LLM completion:\n1. Parse the move\n2. Play a **full multi-round episode** using that move as the agent's strategy\n3. Compute composite reward: payoff + cooperation + Pareto efficiency + fairness"
92
+ },
93
+ {
94
+ "cell_type": "code",
95
+ "execution_count": null,
96
+ "metadata": {},
97
+ "outputs": [],
98
+ "source": "from typing import Any\n\nreward_env = KantEnvironment()\n\ndef kantbench_reward(completions: list[str], prompts: list[str], **kwargs: Any) -> list[float]:\n rewards = []\n game_keys = kwargs.get(\"game_key\", [\"prisoners_dilemma\"] * len(completions))\n strategies = kwargs.get(\"strategy\", [\"tit_for_tat\"] * len(completions))\n available_moves_batch = kwargs.get(\"available_moves\", [[\"cooperate\", \"defect\"]] * len(completions))\n\n for completion, game_key, strategy, moves in zip(\n completions, game_keys, strategies, available_moves_batch\n ):\n action_str = parse_action(completion.strip(), moves)\n\n try:\n # Full episode rollout\n obs = reward_env.reset(game=game_key, strategy=strategy)\n while not obs.done:\n obs = reward_env.step(GameAction(action=action_str))\n\n coop_rate = _compute_cooperation_rate(obs)\n reward = episode_reward(\n player_score=obs.player_score,\n opponent_score=obs.opponent_score,\n cooperation_rate=coop_rate,\n total_rounds=obs.current_round,\n )\n rewards.append(reward)\n except Exception as e:\n rewards.append(-1.0)\n\n return rewards\n\n\n# Sanity check — cooperate vs defect in PD\nfor move in [\"cooperate\", \"defect\"]:\n r = kantbench_reward(\n [move], [\"...\"],\n game_key=[\"prisoners_dilemma\"],\n strategy=[\"tit_for_tat\"],\n available_moves=[[\"cooperate\", \"defect\"]],\n )\n print(f\"PD vs tit_for_tat | {move:10s} -> composite reward = {r[0]:.3f}\")"
99
+ },
100
+ {
101
+ "cell_type": "markdown",
102
+ "metadata": {},
103
+ "source": [
104
+ "## Train with GRPO"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": null,
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": "import torch\nfrom transformers import AutoTokenizer\nfrom trl import GRPOConfig, GRPOTrainer\n\ntokenizer = AutoTokenizer.from_pretrained(MODEL)\nif tokenizer.pad_token is None:\n tokenizer.pad_token = tokenizer.eos_token\n\ndef format_prompt(example):\n messages = [\n {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n {\"role\": \"user\", \"content\": example[\"prompt\"]},\n ]\n return {\"prompt\": tokenizer.apply_chat_template(\n messages, tokenize=False, add_generation_prompt=True\n )}\n\ntrain_dataset = dataset.map(format_prompt)\n\nconfig = GRPOConfig(\n output_dir=\"/content/kantbench-grpo\",\n num_generations=NUM_GENERATIONS,\n max_completion_length=16,\n per_device_train_batch_size=BATCH_SIZE,\n gradient_accumulation_steps=GRAD_ACCUM,\n learning_rate=LR,\n max_steps=MAX_STEPS,\n logging_steps=5,\n save_steps=50,\n bf16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,\n fp16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8,\n report_to=\"wandb\",\n)\n\ntrainer = GRPOTrainer(\n model=MODEL,\n reward_funcs=kantbench_reward,\n args=config,\n train_dataset=train_dataset,\n processing_class=tokenizer,\n)\n\nprint(f\"Training {MODEL} on {len(GAMES)} games with GRPO\")\nprint(f\"Reward: full-episode composite (payoff + cooperation + Pareto + fairness)\")\ntrainer.train()"
113
+ },
114
+ {
115
+ "cell_type": "code",
116
+ "execution_count": null,
117
+ "metadata": {},
118
+ "outputs": [],
119
+ "source": [
120
+ "trainer.save_model(\"/content/kantbench-grpo\")\n",
121
+ "print(\"Model saved to /content/kantbench-grpo\")"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "markdown",
126
+ "metadata": {},
127
+ "source": [
128
+ "## Evaluate: Before vs After"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "code",
133
+ "execution_count": null,
134
+ "metadata": {},
135
+ "outputs": [],
136
+ "source": "from transformers import pipeline\n\ntest_games = [\"prisoners_dilemma\", \"stag_hunt\", \"hawk_dove\", \"cournot\", \"battle_of_the_sexes\"]\nprompt_builder = PromptBuilder()\neval_env = KantEnvironment()\n\npipe = pipeline(\"text-generation\", model=\"/content/kantbench-grpo\", tokenizer=tokenizer,\n max_new_tokens=8, do_sample=False)\n\nprint(\"=\" * 70)\nprint(f\"{'Game':<30s} {'Move':<15s} {'Episode Reward':>15s}\")\nprint(\"=\" * 70)\nfor game_key in test_games:\n obs = eval_env.reset(game=game_key, strategy=\"tit_for_tat\")\n prompt_text = prompt_builder.build(obs)\n messages = [\n {\"role\": \"system\", \"content\": SYSTEM_PROMPT},\n {\"role\": \"user\", \"content\": prompt_text},\n ]\n formatted = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n output = pipe(formatted)[0][\"generated_text\"][len(formatted):].strip()\n move = parse_action(output, obs.available_actions)\n\n # Play full episode with this move\n obs = eval_env.reset(game=game_key, strategy=\"tit_for_tat\")\n while not obs.done:\n obs = eval_env.step(GameAction(action=move))\n coop = _compute_cooperation_rate(obs)\n r = episode_reward(obs.player_score, obs.opponent_score, coop, obs.current_round)\n\n game_name = GAMES[game_key].name\n print(f\"{game_name:<30s} {move:<15s} {r:>15.3f}\")"
137
+ }
138
+ ]
139
+ }
train/nplayer/__init__.py ADDED
@@ -0,0 +1,34 @@
+"""N-player and coalition LLM agents for game-theory environments."""
+
+__all__ = [
+    "NPlayerLLMAgent",
+    "NPlayerPromptBuilder",
+    "CoalitionLLMAgent",
+    "CoalitionPromptBuilder",
+]
+
+
+def __getattr__(name: str) -> object:
+    """Lazy imports to avoid pulling in heavy dependencies at load time."""
+    if name in ("NPlayerLLMAgent", "NPlayerPromptBuilder"):
+        from train.nplayer.nplayer_agent import (
+            NPlayerLLMAgent,
+            NPlayerPromptBuilder,
+        )
+        _map = {
+            "NPlayerLLMAgent": NPlayerLLMAgent,
+            "NPlayerPromptBuilder": NPlayerPromptBuilder,
+        }
+        return _map[name]
+    if name in ("CoalitionLLMAgent", "CoalitionPromptBuilder"):
+        from train.nplayer.coalition_agent import (
+            CoalitionLLMAgent,
+            CoalitionPromptBuilder,
+        )
+        _map = {
+            "CoalitionLLMAgent": CoalitionLLMAgent,
+            "CoalitionPromptBuilder": CoalitionPromptBuilder,
+        }
+        return _map[name]
+    msg = f"module 'train.nplayer' has no attribute {name!r}"
+    raise AttributeError(msg)
train/nplayer/coalition_agent.py ADDED
@@ -0,0 +1,249 @@
1
+ """LLM agent for coalition formation and meta-governance environments."""
2
+ from __future__ import annotations
3
+
4
+ import json
5
+ from typing import Any, Callable, Dict, List, Optional
6
+
7
+ from env.nplayer.coalition.models import (
8
+ CoalitionAction, CoalitionObservation,
9
+ CoalitionProposal, CoalitionResponse,
10
+ )
11
+ from env.nplayer.governance.models import GovernanceProposal, GovernanceVote
12
+ from env.nplayer.models import NPlayerAction
13
+ from train.agent import parse_action
14
+ from constant_definitions.train.agent_constants import (
15
+ COALITION_PROMPT_SECTION_COALITIONS,
16
+ COALITION_PROMPT_SECTION_PHASE,
17
+ COALITION_PROMPT_SECTION_PROPOSALS,
18
+ COALITION_SYSTEM_PROMPT,
19
+ GOVERNANCE_PROMPT_SECTION_PENDING,
20
+ GOVERNANCE_PROMPT_SECTION_RULES,
21
+ MAX_PROMPT_HISTORY_ROUNDS,
22
+ NPLAYER_PROMPT_SECTION_ALL_SCORES,
23
+ PROMPT_SECTION_ACTIONS, PROMPT_SECTION_GAME,
24
+ PROMPT_SECTION_HISTORY, PROMPT_SECTION_INSTRUCTION,
25
+ )
26
+
27
+ _ZERO = int()
28
+ _ONE = int(bool(True))
29
+ _NL = "\n"
30
+ _SEP = "\n\n"
31
+ _BO = "["
32
+ _BC = "]"
33
+ _CS = ": "
34
+ _DS = "- "
35
+ _PP = "Player "
36
+ _RP = "Round "
37
+ _PS = " | "
38
+ _PL = " played: "
39
+ _PY = " payoff: "
40
+
41
+
42
+ class CoalitionPromptBuilder:
43
+ """Formats CoalitionObservation into structured text prompts."""
44
+
45
+ @staticmethod
46
+ def build_negotiate(obs: CoalitionObservation) -> str:
47
+ """Build a negotiate-phase prompt."""
48
+ sections: List[str] = []
49
+ base = obs.base
50
+ sections.append(
51
+ _BO + PROMPT_SECTION_GAME + _BC + _NL
52
+ + base.game_name + _NL + base.game_description
53
+ )
54
+ sections.append(
55
+ _BO + COALITION_PROMPT_SECTION_PHASE + _BC + _NL
56
+ + obs.phase + _NL + "Enforcement" + _CS + obs.enforcement
57
+ )
58
+ if obs.pending_proposals:
59
+ prop_lines = [
60
+ str(idx) + _CS + "proposer=" + str(p.proposer)
61
+ + " members=" + str(p.members)
62
+ + " action=" + p.agreed_action
63
+ for idx, p in enumerate(obs.pending_proposals)
64
+ ]
65
+ sections.append(
66
+ _BO + COALITION_PROMPT_SECTION_PROPOSALS + _BC
67
+ + _NL + _NL.join(prop_lines)
68
+ )
69
+ if obs.active_coalitions:
70
+ coal_lines = [
71
+ "members=" + str(c.members) + " action=" + c.agreed_action
72
+ for c in obs.active_coalitions
73
+ ]
74
+ sections.append(
75
+ _BO + COALITION_PROMPT_SECTION_COALITIONS + _BC
76
+ + _NL + _NL.join(coal_lines)
77
+ )
78
+ if obs.current_rules is not None:
79
+ rules = obs.current_rules
80
+ active_mechs = [k for k, v in rules.mechanics.items() if v]
81
+ sections.append(
82
+ _BO + GOVERNANCE_PROMPT_SECTION_RULES + _BC + _NL
83
+ + "enforcement" + _CS + rules.enforcement + _NL
84
+ + "active_mechanics" + _CS + str(active_mechs)
85
+ )
86
+ if obs.pending_governance:
87
+ gov_lines = [
88
+ str(i) + _CS + gp.proposal_type + " by " + _PP + str(gp.proposer)
89
+ for i, gp in enumerate(obs.pending_governance)
90
+ ]
91
+ sections.append(
92
+ _BO + GOVERNANCE_PROMPT_SECTION_PENDING + _BC
93
+ + _NL + _NL.join(gov_lines)
94
+ )
95
+ score_lines = [
96
+ _PP + str(i) + _CS + str(s)
97
+ for i, s in enumerate(obs.adjusted_scores)
98
+ ]
99
+ sections.append(
100
+ _BO + NPLAYER_PROMPT_SECTION_ALL_SCORES + _BC
101
+ + _NL + _NL.join(score_lines)
102
+ )
103
+ action_lines = [_DS + a for a in base.available_actions]
104
+ sections.append(
105
+ _BO + PROMPT_SECTION_ACTIONS + _BC + _NL + _NL.join(action_lines)
106
+ )
107
+ sections.append(
108
+ _BO + PROMPT_SECTION_INSTRUCTION + _BC + _NL + COALITION_SYSTEM_PROMPT
109
+ )
110
+ return _SEP.join(sections)
111
+
112
+ @staticmethod
113
+ def build_action(obs: CoalitionObservation) -> str:
114
+ """Build an action-phase prompt."""
115
+ sections: List[str] = []
116
+ base = obs.base
117
+ sections.append(
118
+ _BO + PROMPT_SECTION_GAME + _BC + _NL
119
+ + base.game_name + _NL + base.game_description
120
+ )
121
+ sections.append(
122
+ _BO + COALITION_PROMPT_SECTION_PHASE + _BC + _NL + obs.phase
123
+ )
124
+ my_coals = [
125
+ "members=" + str(c.members) + " agreed_action=" + c.agreed_action
126
+ for c in obs.active_coalitions
127
+ if base.player_index in c.members
128
+ ]
129
+ if my_coals:
130
+ sections.append(
131
+ _BO + COALITION_PROMPT_SECTION_COALITIONS + _BC
132
+ + _NL + _NL.join(my_coals)
133
+ )
134
+ if base.history:
135
+ h_lines: List[str] = []
136
+ for rnd in base.history[-MAX_PROMPT_HISTORY_ROUNDS:]:
137
+ parts = [_RP + str(rnd.round_number)]
138
+ for pidx, (act, pay) in enumerate(zip(rnd.actions, rnd.payoffs)):
139
+ parts.append(
140
+ _PP + str(pidx) + _PL + act + _PY + str(pay)
141
+ )
142
+ h_lines.append(_PS.join(parts))
143
+ sections.append(
144
+ _BO + PROMPT_SECTION_HISTORY + _BC + _NL + _NL.join(h_lines)
145
+ )
146
+ action_lines = [_DS + a for a in base.available_actions]
147
+ sections.append(
148
+ _BO + PROMPT_SECTION_ACTIONS + _BC + _NL + _NL.join(action_lines)
149
+ )
150
+ sections.append(
151
+ _BO + PROMPT_SECTION_INSTRUCTION + _BC + _NL
152
+ + "Choose your action. Respond with ONLY the action name."
153
+ )
154
+ return _SEP.join(sections)
155
+
156
+
157
+ def _safe_json_parse(text: str) -> Optional[Dict[str, Any]]:
158
+ """Try to parse JSON from LLM output, return None on failure."""
159
+ stripped = text.strip()
160
+ start = stripped.find("{")
161
+ end = stripped.rfind("}")
162
+ if start >= _ZERO and end > start:
163
+ try:
164
+ return json.loads(stripped[start:end + _ONE])
165
+ except (json.JSONDecodeError, ValueError):
166
+ pass
167
+ return None
168
+
169
+
170
+ class CoalitionLLMAgent:
171
+ """LLM-based agent for coalition environments.
172
+
173
+ Implements the negotiate + act protocol expected by
174
+ CoalitionTournamentRunner.
175
+ """
176
+
177
+ def __init__(
178
+ self, generate_fn: Callable[[str], str],
179
+ player_index: int = _ZERO,
180
+ prompt_builder: Optional[CoalitionPromptBuilder] = None,
181
+ ) -> None:
182
+ self._generate_fn = generate_fn
183
+ self._player_index = player_index
184
+ self._prompt_builder = prompt_builder or CoalitionPromptBuilder()
185
+
186
+ def negotiate(self, obs: CoalitionObservation) -> CoalitionAction:
187
+ """Generate coalition proposals and responses to pending ones."""
188
+ prompt = self._prompt_builder.build_negotiate(obs)
189
+ completion = self._generate_fn(prompt)
190
+ parsed = _safe_json_parse(completion)
191
+ if parsed is not None:
192
+ proposals = self._extract_proposals(parsed, obs)
193
+ responses = self._extract_responses(parsed, obs)
194
+ else:
195
+ proposals = []
196
+ responses = self._default_responses(obs)
197
+ return CoalitionAction(proposals=proposals, responses=responses)
198
+
199
+ def act(self, obs: CoalitionObservation) -> NPlayerAction:
200
+ """Select a game action during the action phase."""
201
+ prompt = self._prompt_builder.build_action(obs)
202
+ completion = self._generate_fn(prompt)
203
+ action_str = parse_action(completion, obs.base.available_actions)
204
+ return NPlayerAction(action=action_str)
205
+
206
+ def _extract_proposals(
207
+ self, data: Dict[str, Any], obs: CoalitionObservation,
208
+ ) -> List[CoalitionProposal]:
209
+ raw = data.get("proposals", [])
210
+ if not isinstance(raw, list):
211
+ return []
212
+ result: List[CoalitionProposal] = []
213
+ for item in raw:
214
+ if not isinstance(item, dict):
215
+ continue
216
+ members = item.get("members", [])
217
+ action = item.get("agreed_action", "")
218
+ if isinstance(members, list) and action in obs.base.available_actions:
219
+ result.append(CoalitionProposal(
220
+ proposer=self._player_index,
221
+ members=members, agreed_action=action,
222
+ ))
223
+ return result
224
+
225
+ def _extract_responses(
226
+ self, data: Dict[str, Any], obs: CoalitionObservation,
227
+ ) -> List[CoalitionResponse]:
228
+ raw = data.get("responses", {})
229
+ if not isinstance(raw, dict):
230
+ return self._default_responses(obs)
231
+ result: List[CoalitionResponse] = []
232
+ for idx in range(len(obs.pending_proposals)):
233
+ accepted = raw.get(str(idx), True)
234
+ result.append(CoalitionResponse(
235
+ responder=self._player_index,
236
+ proposal_index=idx, accepted=bool(accepted),
237
+ ))
238
+ return result
239
+
240
+ def _default_responses(
241
+ self, obs: CoalitionObservation,
242
+ ) -> List[CoalitionResponse]:
243
+ return [
244
+ CoalitionResponse(
245
+ responder=self._player_index,
246
+ proposal_index=idx, accepted=True,
247
+ )
248
+ for idx in range(len(obs.pending_proposals))
249
+ ]
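For reference, a sketch of the negotiate-phase completion the coalition agent can consume, matching the keys read by _extract_proposals and _extract_responses (all values illustrative):

    negotiate_completion = """
    {
      "proposals": [
        {"members": [0, 1, 2], "agreed_action": "cooperate"}
      ],
      "responses": {"0": true, "1": false}
    }
    """
    # _safe_json_parse pulls out the outermost {...} block and returns a dict;
    # if parsing fails, the agent proposes nothing and accepts every pending proposal.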
train/nplayer/nplayer_agent.py ADDED
@@ -0,0 +1,146 @@
1
+ """LLM agent for N-player game-theory environments."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Callable, List, Optional
6
+
7
+ from env.nplayer.models import NPlayerAction, NPlayerObservation
8
+ from train.agent import parse_action
9
+ from constant_definitions.train.agent_constants import (
10
+ MAX_PROMPT_HISTORY_ROUNDS,
11
+ NPLAYER_PROMPT_SECTION_ALL_SCORES,
12
+ NPLAYER_PROMPT_SECTION_PLAYERS,
13
+ NPLAYER_SYSTEM_PROMPT,
14
+ PROMPT_SECTION_ACTIONS,
15
+ PROMPT_SECTION_GAME,
16
+ PROMPT_SECTION_HISTORY,
17
+ PROMPT_SECTION_INSTRUCTION,
18
+ PROMPT_SECTION_SCORES,
19
+ )
20
+
21
+ _ZERO = int()
22
+ _ONE = int(bool(True))
23
+ _NEWLINE = "\n"
24
+ _SECTION_SEP = "\n\n"
25
+ _BRACKET_OPEN = "["
26
+ _BRACKET_CLOSE = "]"
27
+ _COLON_SPACE = ": "
28
+ _DASH_SPACE = "- "
29
+ _ROUND_PREFIX = "Round "
30
+ _PIPE_SEP = " | "
31
+ _PLAYER_PREFIX = "Player "
32
+ _PLAYED = " played: "
33
+ _PAYOFF = " payoff: "
34
+ _YOUR_LABEL = "Your score"
35
+ _ROUND_LABEL = "Round"
36
+ _OF = " of "
37
+ _YOU_ARE = "You are Player "
38
+ _OUT_OF = " out of "
39
+ _PLAYERS = " players"
40
+
41
+
42
+ class NPlayerPromptBuilder:
43
+ """Formats NPlayerObservation into a structured text prompt."""
44
+
45
+ @staticmethod
46
+ def build(obs: NPlayerObservation) -> str:
47
+ """Build a structured prompt from an N-player observation."""
48
+ sections: List[str] = []
49
+
50
+ # Game section
51
+ sections.append(
52
+ _BRACKET_OPEN + PROMPT_SECTION_GAME + _BRACKET_CLOSE
53
+ + _NEWLINE + obs.game_name
54
+ + _NEWLINE + obs.game_description
55
+ )
56
+
57
+ # Players section
58
+ sections.append(
59
+ _BRACKET_OPEN + NPLAYER_PROMPT_SECTION_PLAYERS + _BRACKET_CLOSE
60
+ + _NEWLINE + _YOU_ARE + str(obs.player_index)
61
+ + _OUT_OF + str(obs.num_players) + _PLAYERS
62
+ )
63
+
64
+ # History section
65
+ if obs.history:
66
+ history_lines: List[str] = []
67
+ history_slice = obs.history[-MAX_PROMPT_HISTORY_ROUNDS:]
68
+ for rnd in history_slice:
69
+ parts: List[str] = [_ROUND_PREFIX + str(rnd.round_number)]
70
+ for pidx, (act, pay) in enumerate(
71
+ zip(rnd.actions, rnd.payoffs),
72
+ ):
73
+ parts.append(
74
+ _PLAYER_PREFIX + str(pidx)
75
+ + _PLAYED + act
76
+ + _PAYOFF + str(pay)
77
+ )
78
+ history_lines.append(_PIPE_SEP.join(parts))
79
+ sections.append(
80
+ _BRACKET_OPEN + PROMPT_SECTION_HISTORY + _BRACKET_CLOSE
81
+ + _NEWLINE + _NEWLINE.join(history_lines)
82
+ )
83
+
84
+ # Scores section
85
+ score_lines: List[str] = []
86
+ for sidx, score in enumerate(obs.scores):
87
+ label = _PLAYER_PREFIX + str(sidx) + _COLON_SPACE + str(score)
88
+ score_lines.append(label)
89
+ sections.append(
90
+ _BRACKET_OPEN + NPLAYER_PROMPT_SECTION_ALL_SCORES + _BRACKET_CLOSE
91
+ + _NEWLINE + _NEWLINE.join(score_lines)
92
+ + _NEWLINE + _ROUND_LABEL + _COLON_SPACE + str(obs.current_round)
93
+ + _OF + str(obs.total_rounds)
94
+ )
95
+
96
+ # Available actions
97
+ action_lines = [_DASH_SPACE + a for a in obs.available_actions]
98
+ sections.append(
99
+ _BRACKET_OPEN + PROMPT_SECTION_ACTIONS + _BRACKET_CLOSE
100
+ + _NEWLINE + _NEWLINE.join(action_lines)
101
+ )
102
+
103
+ # Instruction
104
+ sections.append(
105
+ _BRACKET_OPEN + PROMPT_SECTION_INSTRUCTION + _BRACKET_CLOSE
106
+ + _NEWLINE + NPLAYER_SYSTEM_PROMPT
107
+ )
108
+
109
+ return _SECTION_SEP.join(sections)
110
+
111
+
112
+ class NPlayerLLMAgent:
113
+ """LLM-based agent for N-player environments.
114
+
115
+ Compatible with NPlayerEnvironment.opponent_fns interface:
116
+ Callable[[NPlayerObservation], NPlayerAction].
117
+ """
118
+
119
+ def __init__(
120
+ self,
121
+ generate_fn: Callable[[str], str],
122
+ prompt_builder: Optional[NPlayerPromptBuilder] = None,
123
+ ) -> None:
124
+ self._generate_fn = generate_fn
125
+ self._prompt_builder = prompt_builder or NPlayerPromptBuilder()
126
+ self._last_prompt: str = ""
127
+ self._last_completion: str = ""
128
+
129
+ def __call__(self, obs: NPlayerObservation) -> NPlayerAction:
130
+ """Select an action given an N-player observation."""
131
+ prompt = self._prompt_builder.build(obs)
132
+ self._last_prompt = prompt
133
+ completion = self._generate_fn(prompt)
134
+ self._last_completion = completion
135
+ action_str = parse_action(completion, obs.available_actions)
136
+ return NPlayerAction(action=action_str)
137
+
138
+ @property
139
+ def last_prompt(self) -> str:
140
+ """The most recently constructed prompt."""
141
+ return self._last_prompt
142
+
143
+ @property
144
+ def last_completion(self) -> str:
145
+ """The most recent raw model completion."""
146
+ return self._last_completion
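A minimal usage sketch for NPlayerLLMAgent, assuming the modules above are importable from this repo; the stub generate_fn is hypothetical and stands in for a real model call:

    from train.nplayer.nplayer_agent import NPlayerLLMAgent

    def stub_generate(prompt: str) -> str:
        # Hypothetical stand-in for a real model call.
        return "cooperate"

    agent = NPlayerLLMAgent(generate_fn=stub_generate)
    # Compatible with NPlayerEnvironment.opponent_fns:
    #   action = agent(obs)   # obs: NPlayerObservation
    print(agent.last_prompt)  # empty string until the agent is first called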
train/requirements.txt ADDED
@@ -0,0 +1,9 @@
1
+ torch>=2.4.0
2
+ transformers>=4.47.0
3
+ trl>=0.12.0
4
+ datasets>=3.0.0
5
+ accelerate>=1.0.0
6
+ peft>=0.13.0
7
+ openenv-core>=0.2.0
8
+ huggingface_hub>=0.26.0
9
+ bitsandbytes>=0.44.0
train/rewards.py ADDED
@@ -0,0 +1,206 @@
1
+ """Reward functions for the training pipeline."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any, Dict, List, Optional
6
+
7
+ from constant_definitions.game_constants import (
8
+ EVAL_HALF,
9
+ EVAL_ONE,
10
+ EVAL_ONE_FLOAT,
11
+ EVAL_TWO,
12
+ EVAL_ZERO,
13
+ EVAL_ZERO_FLOAT,
14
+ )
15
+ from constant_definitions.train.grpo_constants import (
16
+ GRPO_SHAPING_ALPHA_DENOMINATOR,
17
+ GRPO_SHAPING_ALPHA_NUMERATOR,
18
+ )
19
+
20
+ _FIVE = EVAL_TWO + EVAL_TWO + EVAL_ONE
21
+
22
+ # Default weight per sub-metric (equal weighting across five metrics).
23
+ _DEFAULT_WEIGHT_NUMERATOR = EVAL_ONE
24
+ _DEFAULT_WEIGHT_DENOMINATOR = _FIVE
25
+
26
+
27
+ def _default_weights() -> Dict[str, float]:
28
+ """Return default equal weights for the five reward components."""
29
+ w = _DEFAULT_WEIGHT_NUMERATOR / _DEFAULT_WEIGHT_DENOMINATOR
30
+ return {
31
+ "cooperation_rate": w,
32
+ "pareto_efficiency": w,
33
+ "fairness_index": w,
34
+ "exploitation_resistance": w,
35
+ "adaptability": w,
36
+ }
37
+
38
+
39
+ # ---------------------------------------------------------------------------
40
+ # Per-episode reward
41
+ # ---------------------------------------------------------------------------
42
+
43
+
44
+ def episode_reward(
45
+ player_score: float,
46
+ opponent_score: float,
47
+ cooperation_rate: float,
48
+ total_rounds: int,
49
+ weights: Optional[Dict[str, float]] = None,
50
+ ) -> float:
51
+ """Compute a scalar reward for a single episode.
52
+
53
+ Uses per-episode metrics that can be computed without cross-strategy data:
54
+ cooperation_rate, pareto_efficiency proxy, and fairness_index.
55
+
56
+     exploitation_resistance and adaptability default to neutral since they
57
+ require cross-strategy comparison (see ``batch_reward``).
58
+ """
59
+ w = weights if weights is not None else _default_weights()
60
+
61
+ # Cooperation rate: direct
62
+ coop = cooperation_rate
63
+
64
+ # Pareto efficiency proxy: normalised joint score
65
+ joint = player_score + opponent_score
66
+ if total_rounds > EVAL_ZERO:
67
+ pareto_proxy = joint / total_rounds
68
+ # Clamp to [zero, one]
69
+ pareto_proxy = max(EVAL_ZERO_FLOAT, min(EVAL_ONE_FLOAT, pareto_proxy))
70
+ else:
71
+ pareto_proxy = EVAL_ZERO_FLOAT
72
+
73
+ # Fairness: EVAL_ONE_FLOAT - |p - o| / (|p| + |o|)
74
+ denom = abs(player_score) + abs(opponent_score)
75
+ if denom > EVAL_ZERO_FLOAT:
76
+ fairness = EVAL_ONE_FLOAT - abs(player_score - opponent_score) / denom
77
+ else:
78
+ fairness = EVAL_ONE_FLOAT
79
+
80
+ # Cross-strategy metrics default to neutral midpoint
81
+ exploit_resist = EVAL_HALF
82
+ adapt = EVAL_HALF
83
+
84
+ reward = (
85
+ w["cooperation_rate"] * coop
86
+ + w["pareto_efficiency"] * pareto_proxy
87
+ + w["fairness_index"] * fairness
88
+ + w["exploitation_resistance"] * exploit_resist
89
+ + w["adaptability"] * adapt
90
+ )
91
+ return reward
92
+
93
+
94
+ # ---------------------------------------------------------------------------
95
+ # Batch reward (cross-strategy)
96
+ # ---------------------------------------------------------------------------
97
+
98
+
99
+ def batch_reward(
100
+ episode_results: List[Dict[str, Any]],
101
+ weights: Optional[Dict[str, float]] = None,
102
+ ) -> Dict[str, float]:
103
+ """Compute cross-strategy reward metrics over a batch of episodes.
104
+
105
+ Parameters
106
+ ----------
107
+ episode_results : list of dict
108
+ Each dict must have keys: ``game``, ``strategy``,
109
+ ``player_score``, ``opponent_score``, ``cooperation_rate``.
110
+
111
+ Returns
112
+ -------
113
+ dict
114
+ Mapping of metric name to value for exploitation_resistance
115
+ and adaptability computed across strategies for each game.
116
+ """
117
+ w = weights if weights is not None else _default_weights()
118
+
119
+ # Group by game
120
+ by_game: Dict[str, List[Dict[str, Any]]] = {}
121
+ for ep in episode_results:
122
+ game = ep["game"]
123
+ if game not in by_game:
124
+ by_game[game] = []
125
+ by_game[game].append(ep)
126
+
127
+ exploit_scores: List[float] = []
128
+ adapt_scores: List[float] = []
129
+
130
+ for _game, episodes in by_game.items():
131
+ # Group by strategy within game
132
+ by_strat: Dict[str, List[Dict[str, Any]]] = {}
133
+ for ep in episodes:
134
+ strat = ep["strategy"]
135
+ if strat not in by_strat:
136
+ by_strat[strat] = []
137
+ by_strat[strat].append(ep)
138
+
139
+ if len(by_strat) <= EVAL_ONE:
140
+ continue
141
+
142
+ # Exploitation resistance: performance against always_defect
143
+ # relative to best/worst across strategies
144
+ strat_scores = {
145
+ s: sum(e["player_score"] for e in eps)
146
+ for s, eps in by_strat.items()
147
+ }
148
+ best = max(strat_scores.values())
149
+ worst = min(strat_scores.values())
150
+ spread = best - worst
151
+ if "always_defect" in strat_scores and spread > EVAL_ZERO_FLOAT:
152
+ ad_score = strat_scores["always_defect"]
153
+ exploit_scores.append((ad_score - worst) / spread)
154
+
155
+ # Adaptability: variance of cooperation rates across strategies
156
+ coop_rates = []
157
+ for eps in by_strat.values():
158
+ rate_sum = sum(e["cooperation_rate"] for e in eps)
159
+ coop_rates.append(rate_sum / len(eps))
160
+
161
+ if len(coop_rates) > EVAL_ONE:
162
+ mean_coop = sum(coop_rates) / len(coop_rates)
163
+ var = sum(
164
+ (r - mean_coop) ** EVAL_TWO for r in coop_rates
165
+ ) / len(coop_rates)
166
+ capped = min(var, EVAL_HALF)
167
+ adapt_scores.append(capped / EVAL_HALF)
168
+
169
+ exploit_val = (
170
+ sum(exploit_scores) / len(exploit_scores)
171
+ if exploit_scores else EVAL_HALF
172
+ )
173
+ adapt_val = (
174
+ sum(adapt_scores) / len(adapt_scores)
175
+ if adapt_scores else EVAL_ZERO_FLOAT
176
+ )
177
+
178
+ return {
179
+ "exploitation_resistance": exploit_val,
180
+ "adaptability": adapt_val,
181
+ }
182
+
183
+
184
+ # ---------------------------------------------------------------------------
185
+ # Per-step shaping
186
+ # ---------------------------------------------------------------------------
187
+
188
+
189
+ def per_step_shaping(
190
+ player_payoff: float,
191
+ opponent_payoff: float,
192
+ payoff_min: float,
193
+ payoff_max: float,
194
+ ) -> float:
195
+ """Optional per-step reward shaping based on immediate payoffs.
196
+
197
+ Returns a small bonus proportional to normalised joint payoff,
198
+ scaled by the shaping coefficient alpha.
199
+ """
200
+ alpha = GRPO_SHAPING_ALPHA_NUMERATOR / GRPO_SHAPING_ALPHA_DENOMINATOR
201
+ payoff_range = payoff_max - payoff_min
202
+ if payoff_range <= EVAL_ZERO_FLOAT:
203
+ return EVAL_ZERO_FLOAT
204
+ joint = player_payoff + opponent_payoff
205
+ normalised = (joint - payoff_min * EVAL_TWO) / (payoff_range * EVAL_TWO)
206
+ return alpha * normalised
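A quick numeric sketch of how the default equal weighting combines the five components; it assumes the EVAL_* constants resolve to 0, 0.5, 1 and 2 as their names suggest:

    from train.rewards import episode_reward

    # 10-round episode: player scored 6, opponent 4, 60% cooperative moves.
    r = episode_reward(
        player_score=6.0,
        opponent_score=4.0,
        cooperation_rate=0.6,
        total_rounds=10,
    )
    # coop 0.6, pareto proxy min(1.0, 10/10) = 1.0, fairness 1 - 2/10 = 0.8,
    # plus the two neutral 0.5 placeholders:
    # (0.6 + 1.0 + 0.8 + 0.5 + 0.5) / 5 = 0.68
    print(round(r, 2))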
train/self_play/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Self-play multi-agent training infrastructure."""
train/self_play/config.py ADDED
@@ -0,0 +1,55 @@
1
+ """Configuration for self-play GRPO training."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass
6
+
7
+ from constant_definitions.train.grpo_constants import (
8
+ GRPO_BATCH_SIZE,
9
+ GRPO_LR_DENOMINATOR,
10
+ GRPO_LR_NUMERATOR,
11
+ GRPO_MAX_COMPLETION_LENGTH,
12
+ GRPO_NUM_GENERATIONS,
13
+ )
14
+ from constant_definitions.var.meta.self_play_constants import (
15
+ SELF_PLAY_DEFAULT_EPISODES_PER_STEP,
16
+ SELF_PLAY_DEFAULT_MAX_STEPS,
17
+ SELF_PLAY_OPPONENT_UPDATE_INTERVAL,
18
+ SELF_PLAY_POOL_MAX_SIZE,
19
+ SELF_PLAY_WARMUP_EPISODES,
20
+ )
21
+
22
+
23
+ @dataclass
24
+ class SelfPlayConfig:
25
+ """Configuration for self-play GRPO training.
26
+
27
+ Combines self-play-specific settings (opponent pool management,
28
+ update frequency) with standard GRPO training parameters.
29
+ """
30
+
31
+ # Model
32
+ model_name: str = "Qwen/Qwen2.5-3B-Instruct"
33
+ output_dir: str = "./kantbench-self-play"
34
+
35
+ # Self-play specific
36
+ opponent_update_interval: int = SELF_PLAY_OPPONENT_UPDATE_INTERVAL
37
+ pool_max_size: int = SELF_PLAY_POOL_MAX_SIZE
38
+ episodes_per_step: int = SELF_PLAY_DEFAULT_EPISODES_PER_STEP
39
+ warmup_episodes: int = SELF_PLAY_WARMUP_EPISODES
40
+
41
+ # GRPO params
42
+ learning_rate_numerator: int = GRPO_LR_NUMERATOR
43
+ learning_rate_denominator: int = GRPO_LR_DENOMINATOR
44
+ batch_size: int = GRPO_BATCH_SIZE
45
+ num_generations: int = GRPO_NUM_GENERATIONS
46
+ max_completion_length: int = GRPO_MAX_COMPLETION_LENGTH
47
+ max_steps: int = SELF_PLAY_DEFAULT_MAX_STEPS
48
+
49
+ # Cross-model mode: if set, opponent is loaded from this path
50
+ cross_model_path: str = ""
51
+
52
+ @property
53
+ def learning_rate(self) -> float:
54
+ """Compute learning rate from numerator/denominator."""
55
+ return self.learning_rate_numerator / self.learning_rate_denominator
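A small sketch of the numerator/denominator convention used for the learning rate; the values below are placeholders, not the GRPO_LR_* constants, which are defined elsewhere in the repo:

    from train.self_play.config import SelfPlayConfig

    cfg = SelfPlayConfig(
        learning_rate_numerator=1,
        learning_rate_denominator=200_000,  # placeholder: resolves to 5e-06
    )
    print(cfg.learning_rate)  # 5e-06, derived rather than stored as a float literal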
train/self_play/oauth.py ADDED
@@ -0,0 +1,191 @@
1
+ """OAuth token management for Anthropic and OpenAI self-play integration."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import base64
6
+ import json
7
+ import os
8
+ from typing import Tuple
9
+
10
+ import httpx
11
+
12
+ from constant_definitions.var.meta.self_play_constants import (
13
+ ANTHROPIC_OAUTH_TOKEN_URL,
14
+ ANTHROPIC_OAUTH_CLIENT_ID,
15
+ OPENAI_OAUTH_TOKEN_URL,
16
+ OPENAI_OAUTH_CLIENT_ID,
17
+ SUPABASE_OAUTH_TABLE,
18
+ SUPABASE_PROVIDER_ANTHROPIC,
19
+ SUPABASE_PROVIDER_OPENAI,
20
+ )
21
+
22
+ _ZERO = int()
23
+ _ONE = int(bool(True))
24
+ _CONTENT_TYPE_FORM = "application/x-www-form-urlencoded"
25
+
26
+
27
+ def _read_env_file() -> dict[str, str]:
28
+ """Read content-platform .env.local into a dict."""
29
+ env_path = os.path.join(
30
+ os.path.expanduser("~"),
31
+ "Documents", "CodingProjects", "Wisent",
32
+ "content-platform", ".env.local",
33
+ )
34
+ env_vars: dict[str, str] = {}
35
+ with open(env_path) as fh:
36
+ for line in fh:
37
+ if "=" in line and not line.startswith("#"):
38
+ key, val = line.split("=", _ONE)
39
+ env_vars[key] = (
40
+ val.strip().strip('"').replace("\\n", "").strip()
41
+ )
42
+ return env_vars
43
+
44
+
45
+ def _supabase_headers(service_key: str) -> dict[str, str]:
46
+ """Return Supabase REST API headers."""
47
+ return {
48
+ "apikey": service_key,
49
+ "Authorization": "Bearer " + service_key,
50
+ "Content-Type": "application/json",
51
+ "Prefer": "return=minimal",
52
+ }
53
+
54
+
55
+ def fetch_refresh_token(
56
+ provider: str,
57
+ supabase_url: str = "",
58
+ service_key: str = "",
59
+ ) -> Tuple[str, str]:
60
+ """Fetch the first refresh token for *provider* from Supabase.
61
+
62
+ Returns (credential_id, refresh_token).
63
+ """
64
+ if not supabase_url or not service_key:
65
+ env = _read_env_file()
66
+ supabase_url = supabase_url or env["NEXT_PUBLIC_SUPABASE_URL"]
67
+ service_key = service_key or env["SUPABASE_SERVICE_ROLE_KEY"]
68
+ resp = httpx.get(
69
+ supabase_url + "/rest/v" + str(_ONE) + "/" + SUPABASE_OAUTH_TABLE,
70
+ params={"provider": "eq." + provider, "select": "*"},
71
+ headers=_supabase_headers(service_key),
72
+ )
73
+ rows = resp.json()
74
+ if not rows:
75
+ raise RuntimeError(f"No {provider} credentials in Supabase")
76
+ row = rows[_ZERO]
77
+ return row["id"], row["refresh_token"]
78
+
79
+
80
+ def save_refresh_token(
81
+ credential_id: str,
82
+ new_refresh_token: str,
83
+ access_token: str = "",
84
+ supabase_url: str = "",
85
+ service_key: str = "",
86
+ ) -> None:
87
+ """Save a rotated refresh token back to Supabase."""
88
+ if not supabase_url or not service_key:
89
+ env = _read_env_file()
90
+ supabase_url = supabase_url or env["NEXT_PUBLIC_SUPABASE_URL"]
91
+ service_key = service_key or env["SUPABASE_SERVICE_ROLE_KEY"]
92
+ body: dict[str, str] = {"refresh_token": new_refresh_token}
93
+ if access_token:
94
+ body["access_token"] = access_token
95
+ httpx.patch(
96
+ supabase_url + "/rest/v" + str(_ONE) + "/" + SUPABASE_OAUTH_TABLE,
97
+ params={"id": "eq." + credential_id},
98
+ json=body,
99
+ headers=_supabase_headers(service_key),
100
+ )
101
+
102
+
103
+ def exchange_anthropic(
104
+ refresh_token: str,
105
+ ) -> Tuple[str, str]:
106
+ """Exchange Anthropic refresh token. Returns (access, new_refresh)."""
107
+ resp = httpx.post(
108
+ ANTHROPIC_OAUTH_TOKEN_URL,
109
+ data={
110
+ "grant_type": "refresh_token",
111
+ "refresh_token": refresh_token,
112
+ "client_id": ANTHROPIC_OAUTH_CLIENT_ID,
113
+ },
114
+ headers={"Content-Type": _CONTENT_TYPE_FORM},
115
+ )
116
+ resp.raise_for_status()
117
+ data = resp.json()
118
+ return data["access_token"], data.get("refresh_token", "")
119
+
120
+
121
+ def exchange_openai(
122
+ refresh_token: str,
123
+ ) -> Tuple[str, str, str]:
124
+ """Exchange OpenAI refresh token. Returns (access, new_refresh, account_id)."""
125
+ resp = httpx.post(
126
+ OPENAI_OAUTH_TOKEN_URL,
127
+ data={
128
+ "grant_type": "refresh_token",
129
+ "refresh_token": refresh_token,
130
+ "client_id": OPENAI_OAUTH_CLIENT_ID,
131
+ },
132
+ headers={"Content-Type": _CONTENT_TYPE_FORM},
133
+ )
134
+ resp.raise_for_status()
135
+ data = resp.json()
136
+ access = data["access_token"]
137
+ new_rt = data.get("refresh_token", "")
138
+ account_id = _extract_account_id(data.get("id_token", ""))
139
+ return access, new_rt, account_id
140
+
141
+
142
+ def _extract_account_id(id_token: str) -> str:
143
+ """Extract chatgpt_account_id from an OpenAI id_token JWT."""
144
+ if not id_token:
145
+ return ""
146
+ parts = id_token.split(".")
147
+ if len(parts) < _ONE + _ONE:
148
+ return ""
149
+ payload = parts[_ONE]
150
+ # Pad base64
151
+ padding = (_ONE + _ONE + _ONE + _ONE) - len(payload) % (
152
+ _ONE + _ONE + _ONE + _ONE
153
+ )
154
+ if padding < (_ONE + _ONE + _ONE + _ONE):
155
+ payload += "=" * padding
156
+ decoded = json.loads(base64.urlsafe_b64decode(payload))
157
+ claims = decoded.get("https://api.openai.com/auth", {})
158
+ return claims.get("chatgpt_account_id", "")
159
+
160
+
161
+ def get_anthropic_access_token() -> str:
162
+ """Full flow: try all Supabase credentials until one works."""
163
+ env = _read_env_file()
164
+ sb_url = env["NEXT_PUBLIC_SUPABASE_URL"]
165
+ sb_key = env["SUPABASE_SERVICE_ROLE_KEY"]
166
+ resp = httpx.get(
167
+ sb_url + "/rest/v" + str(_ONE) + "/" + SUPABASE_OAUTH_TABLE,
168
+ params={"provider": "eq." + SUPABASE_PROVIDER_ANTHROPIC, "select": "*"},
169
+ headers=_supabase_headers(sb_key),
170
+ )
171
+ rows = resp.json()
172
+ last_err: Exception = RuntimeError("No credentials found")
173
+ for row in rows:
174
+ cred_id, rt = row["id"], row["refresh_token"]
175
+ try:
176
+ access, new_rt = exchange_anthropic(rt)
177
+ if new_rt:
178
+ save_refresh_token(cred_id, new_rt, access, sb_url, sb_key)
179
+ return access
180
+ except Exception as exc:
181
+ last_err = exc
182
+ raise last_err
183
+
184
+
185
+ def get_openai_credentials() -> Tuple[str, str]:
186
+ """Full flow: returns (access_token, account_id)."""
187
+ cred_id, rt = fetch_refresh_token(SUPABASE_PROVIDER_OPENAI)
188
+ access, new_rt, account_id = exchange_openai(rt)
189
+ if new_rt:
190
+ save_refresh_token(cred_id, new_rt, access)
191
+ return access, account_id
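The id_token handling in _extract_account_id hinges on restoring base64url padding before decoding; a self-contained sketch of that step with a made-up payload (not a real OpenAI token):

    import base64
    import json

    claims_in = {"https://api.openai.com/auth": {"chatgpt_account_id": "acct_123"}}
    payload = base64.urlsafe_b64encode(
        json.dumps(claims_in).encode()
    ).decode().rstrip("=")  # JWTs strip the trailing padding

    padding = 4 - len(payload) % 4
    if padding < 4:
        payload += "=" * padding
    decoded = json.loads(base64.urlsafe_b64decode(payload))
    print(decoded["https://api.openai.com/auth"]["chatgpt_account_id"])  # acct_123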
train/self_play/opponents.py ADDED
@@ -0,0 +1,142 @@
1
+ """Frozen opponents and opponent pool for self-play training."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ from typing import Callable, List, Optional
7
+
8
+ from env.models import GameAction, GameObservation
9
+ from train.agent import PromptBuilder, parse_action
10
+ from constant_definitions.train.agent_constants import (
11
+ MAX_ACTION_TOKENS,
12
+ SYSTEM_PROMPT,
13
+ )
14
+ from constant_definitions.var.meta.self_play_constants import (
15
+ SELF_PLAY_POOL_MAX_SIZE,
16
+ )
17
+
18
+ _ZERO = int()
19
+
20
+
21
+ class FrozenOpponent:
22
+ """Wraps a generation function for use as opponent_fn in KantEnvironment.
23
+
24
+ Runs inference with no gradients. Compatible with the
25
+ ``opponent_fn: Callable[[GameObservation], GameAction]`` interface
26
+ that KantEnvironment.reset() accepts.
27
+
28
+ Parameters
29
+ ----------
30
+ generate_fn : callable
31
+ A function ``(prompt: str) -> str`` that produces a completion.
32
+ prompt_builder : PromptBuilder, optional
33
+ Custom prompt builder. Defaults to the standard PromptBuilder.
34
+ """
35
+
36
+ def __init__(
37
+ self,
38
+ generate_fn: Callable[[str], str],
39
+ prompt_builder: Optional[PromptBuilder] = None,
40
+ ) -> None:
41
+ self._generate_fn = generate_fn
42
+ self._builder = prompt_builder or PromptBuilder()
43
+
44
+ def __call__(self, obs: GameObservation) -> GameAction:
45
+ """Select an action given a game observation."""
46
+ prompt = self._builder.build(obs)
47
+ completion = self._generate_fn(prompt)
48
+ action_str = parse_action(completion, obs.available_actions)
49
+ return GameAction(action=action_str)
50
+
51
+ @classmethod
52
+ def from_model(
53
+ cls,
54
+ model: object,
55
+ tokenizer: object,
56
+ max_tokens: int = MAX_ACTION_TOKENS,
57
+ ) -> FrozenOpponent:
58
+ """Create from a HuggingFace model (runs with torch.no_grad)."""
59
+ import torch
60
+
61
+ def _generate(prompt: str) -> str:
62
+ with torch.no_grad():
63
+ inputs = tokenizer(prompt, return_tensors="pt")
64
+ input_len = len(inputs["input_ids"][_ZERO])
65
+ outputs = model.generate(
66
+ **inputs, max_new_tokens=max_tokens,
67
+ )
68
+ return tokenizer.decode(
69
+ outputs[_ZERO][input_len:],
70
+ skip_special_tokens=True,
71
+ )
72
+
73
+ return cls(generate_fn=_generate)
74
+
75
+ @classmethod
76
+ def from_checkpoint(
77
+ cls,
78
+ path: str,
79
+ tokenizer_name: str,
80
+ max_tokens: int = MAX_ACTION_TOKENS,
81
+ ) -> FrozenOpponent:
82
+ """Load a frozen opponent from a saved checkpoint directory."""
83
+ from transformers import AutoModelForCausalLM, AutoTokenizer
84
+
85
+ loaded_model = AutoModelForCausalLM.from_pretrained(path)
86
+ loaded_model.eval()
87
+ loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
88
+ return cls.from_model(loaded_model, loaded_tokenizer, max_tokens)
89
+
90
+ @classmethod
91
+ def from_api(
92
+ cls,
93
+ api_call_fn: Callable[[str, str], str],
94
+ ) -> FrozenOpponent:
95
+ """Create from an API-based agent (OpenAI, Anthropic, etc.)."""
96
+ return cls(
97
+ generate_fn=lambda prompt: api_call_fn(SYSTEM_PROMPT, prompt),
98
+ )
99
+
100
+
101
+ class OpponentPool:
102
+ """Maintains a pool of past model checkpoints as diverse opponents.
103
+
104
+ Samples uniformly from the pool for opponent diversity.
105
+ Evicts the oldest entry when the pool exceeds ``max_size``.
106
+
107
+ Parameters
108
+ ----------
109
+ max_size : int
110
+ Maximum number of frozen opponents to keep in the pool.
111
+ """
112
+
113
+ def __init__(self, max_size: int = SELF_PLAY_POOL_MAX_SIZE) -> None:
114
+ self._pool: List[FrozenOpponent] = []
115
+ self._max_size = max_size
116
+
117
+ def add(self, opponent: FrozenOpponent) -> None:
118
+ """Add a frozen opponent to the pool, evicting oldest if full."""
119
+ self._pool.append(opponent)
120
+ if len(self._pool) > self._max_size:
121
+ self._pool.pop(_ZERO)
122
+
123
+ def sample(self) -> FrozenOpponent:
124
+ """Return a randomly chosen opponent from the pool.
125
+
126
+ Raises
127
+ ------
128
+ IndexError
129
+ If the pool is empty.
130
+ """
131
+ if not self._pool:
132
+ raise IndexError("Cannot sample from an empty opponent pool.")
133
+ return random.choice(self._pool)
134
+
135
+ def get_opponent_fn(self) -> Callable[[GameObservation], GameAction]:
136
+ """Return a callable that uses a sampled opponent."""
137
+ return self.sample()
138
+
139
+ @property
140
+ def size(self) -> int:
141
+ """Current number of opponents in the pool."""
142
+ return len(self._pool)
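A minimal sketch of the pool mechanics with stub opponents; the lambdas are hypothetical stand-ins for real generate functions:

    from train.self_play.opponents import FrozenOpponent, OpponentPool

    pool = OpponentPool(max_size=2)
    pool.add(FrozenOpponent(generate_fn=lambda prompt: "cooperate"))
    pool.add(FrozenOpponent(generate_fn=lambda prompt: "defect"))
    pool.add(FrozenOpponent(generate_fn=lambda prompt: "cooperate"))  # evicts the oldest

    print(pool.size)                      # 2
    opponent_fn = pool.get_opponent_fn()  # Callable[[GameObservation], GameAction]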
train/self_play/trainer.py ADDED
@@ -0,0 +1,276 @@
1
+ """Self-play GRPO trainer for multi-agent training."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import copy
6
+ import logging
7
+ import random
8
+ from typing import Any, Callable, Dict, List, Optional
9
+
10
+ from env.environment import KantEnvironment
11
+ from env.models import GameAction, GameObservation
12
+ from train.agent import LLMAgent, parse_action
13
+ from train.rewards import episode_reward
14
+ from train.trajectory import TrajectoryCollector, EpisodeTrajectory
15
+ from train.self_play.opponents import FrozenOpponent, OpponentPool
16
+ from train.self_play.config import SelfPlayConfig
17
+ from constant_definitions.train.agent_constants import SYSTEM_PROMPT
18
+ from constant_definitions.train.grpo_constants import GRPO_LOG_EVERY
19
+ from constant_definitions.game_constants import EVAL_ZERO_FLOAT
20
+ from constant_definitions.var.meta.self_play_constants import (
21
+ SELF_PLAY_COOP_WEIGHT_DENOMINATOR,
22
+ SELF_PLAY_COOP_WEIGHT_NUMERATOR,
23
+ SELF_PLAY_EXPLOIT_WEIGHT_DENOMINATOR,
24
+ SELF_PLAY_EXPLOIT_WEIGHT_NUMERATOR,
25
+ SELF_PLAY_FAIRNESS_WEIGHT_DENOMINATOR,
26
+ SELF_PLAY_FAIRNESS_WEIGHT_NUMERATOR,
27
+ SELF_PLAY_PARETO_WEIGHT_DENOMINATOR,
28
+ SELF_PLAY_PARETO_WEIGHT_NUMERATOR,
29
+ SELF_PLAY_ADAPT_WEIGHT_DENOMINATOR,
30
+ SELF_PLAY_ADAPT_WEIGHT_NUMERATOR,
31
+ SELF_PLAY_OPPONENT_LABEL,
32
+ )
33
+
34
+ logger = logging.getLogger(__name__)
35
+
36
+ _ZERO = int()
37
+ _ONE = int(bool(True))
38
+
39
+
40
+ def _self_play_weights() -> Dict[str, float]:
41
+ """Return reward weights tuned for self-play training."""
42
+ return {
43
+ "exploitation_resistance": (
44
+ SELF_PLAY_EXPLOIT_WEIGHT_NUMERATOR
45
+ / SELF_PLAY_EXPLOIT_WEIGHT_DENOMINATOR
46
+ ),
47
+ "cooperation_rate": (
48
+ SELF_PLAY_COOP_WEIGHT_NUMERATOR
49
+ / SELF_PLAY_COOP_WEIGHT_DENOMINATOR
50
+ ),
51
+ "pareto_efficiency": (
52
+ SELF_PLAY_PARETO_WEIGHT_NUMERATOR
53
+ / SELF_PLAY_PARETO_WEIGHT_DENOMINATOR
54
+ ),
55
+ "fairness_index": (
56
+ SELF_PLAY_FAIRNESS_WEIGHT_NUMERATOR
57
+ / SELF_PLAY_FAIRNESS_WEIGHT_DENOMINATOR
58
+ ),
59
+ "adaptability": (
60
+ SELF_PLAY_ADAPT_WEIGHT_NUMERATOR
61
+ / SELF_PLAY_ADAPT_WEIGHT_DENOMINATOR
62
+ ),
63
+ }
64
+
65
+
66
+ class SelfPlayTrainer:
67
+ """GRPO training with self-play opponents.
68
+
69
+ Training loop:
70
+ 1. Collect trajectories: training model vs frozen opponent
71
+ 2. Compute GRPO rewards from episode outcomes
72
+ 3. Update training model via TRL GRPOTrainer
73
+ 4. Periodically refresh frozen opponent from training model
74
+ 5. Add old opponent to pool for diversity
75
+
76
+ Parameters
77
+ ----------
78
+ config : SelfPlayConfig
79
+ Training configuration.
80
+ model : object
81
+ HuggingFace model to train.
82
+ tokenizer : object
83
+ Tokenizer for the model.
84
+ env : KantEnvironment, optional
85
+ Game environment instance.
86
+ """
87
+
88
+ def __init__(
89
+ self,
90
+ config: SelfPlayConfig,
91
+ model: object,
92
+ tokenizer: object,
93
+ env: Optional[KantEnvironment] = None,
94
+ ) -> None:
95
+ self._config = config
96
+ self._model = model
97
+ self._tokenizer = tokenizer
98
+ self._env = env or KantEnvironment()
99
+ self._pool = OpponentPool(max_size=config.pool_max_size)
100
+ self._frozen = FrozenOpponent.from_model(model, tokenizer)
101
+ self._pool.add(self._frozen)
102
+ self._step_count = _ZERO
103
+
104
+ def _model_generate(self, prompt: str) -> str:
105
+ """Generate a completion from the training model."""
106
+ import torch
107
+
108
+ with torch.no_grad():
109
+ inputs = self._tokenizer(prompt, return_tensors="pt")
110
+ input_len = len(inputs["input_ids"][_ZERO])
111
+ outputs = self._model.generate(
112
+ **inputs,
113
+ max_new_tokens=self._config.max_completion_length,
114
+ )
115
+ return self._tokenizer.decode(
116
+ outputs[_ZERO][input_len:],
117
+ skip_special_tokens=True,
118
+ )
119
+
120
+ def collect_trajectories(
121
+ self,
122
+ games: List[str],
123
+ num_episodes: int,
124
+ ) -> List[EpisodeTrajectory]:
125
+ """Collect episodes with current frozen opponent."""
126
+ agent = LLMAgent(generate_fn=self._model_generate)
127
+ collector = TrajectoryCollector(
128
+ env=self._env,
129
+ agent=agent,
130
+ reward_fn=lambda ps, os, cr, tr: episode_reward(
131
+ ps, os, cr, tr, weights=_self_play_weights(),
132
+ ),
133
+ )
134
+ trajectories: List[EpisodeTrajectory] = []
135
+ for _ep in range(num_episodes):
136
+ game = random.choice(games)
137
+ opponent = self._pool.sample()
138
+ traj = collector.collect_episode(
139
+ game=game,
140
+ strategy=SELF_PLAY_OPPONENT_LABEL,
141
+ opponent_fn=opponent,
142
+ )
143
+ trajectories.append(traj)
144
+ return trajectories
145
+
146
+ def make_reward_fn(self) -> Callable[..., List[float]]:
147
+ """Create GRPO reward function using self-play episodes."""
148
+ pool = self._pool
149
+ env = self._env
150
+ weights = _self_play_weights()
151
+
152
+ def reward_fn(
153
+ completions: List[str],
154
+ prompts: List[str],
155
+ **kwargs: Any,
156
+ ) -> List[float]:
157
+ rewards: List[float] = []
158
+ game_keys = kwargs.get(
159
+ "game_key",
160
+ ["prisoners_dilemma"] * len(completions),
161
+ )
162
+ moves_batch = kwargs.get(
163
+ "available_moves",
164
+ [["cooperate", "defect"]] * len(completions),
165
+ )
166
+ for completion, game_key, moves in zip(
167
+ completions, game_keys, moves_batch,
168
+ ):
169
+ action_str = parse_action(completion.strip(), moves)
170
+ opponent = pool.sample()
171
+ obs = env.reset(
172
+ game=game_key, opponent_fn=opponent,
173
+ )
174
+ while not obs.done:
175
+ obs = env.step(GameAction(action=action_str))
176
+ reward = episode_reward(
177
+ obs.player_score,
178
+ obs.opponent_score,
179
+ _compute_coop_rate(obs),
180
+ obs.current_round,
181
+ weights=weights,
182
+ )
183
+ rewards.append(reward)
184
+ return rewards
185
+
186
+ return reward_fn
187
+
188
+ def refresh_opponent(self) -> None:
189
+ """Copy current training model to a new frozen opponent."""
190
+ frozen_model = copy.deepcopy(self._model)
191
+ frozen_model.eval()
192
+ new_opponent = FrozenOpponent.from_model(
193
+ frozen_model, self._tokenizer,
194
+ )
195
+ self._pool.add(new_opponent)
196
+ self._frozen = new_opponent
197
+ logger.info(
198
+ "Refreshed opponent. Pool size: %d", self._pool.size,
199
+ )
200
+
201
+ def train(self, games: List[str]) -> None:
202
+ """Main self-play training loop.
203
+
204
+ Parameters
205
+ ----------
206
+ games : list of str
207
+ Game keys to train on.
208
+ """
209
+ from datasets import Dataset
210
+ from trl import GRPOConfig, GRPOTrainer
211
+ import torch
212
+
213
+ trajectories = self.collect_trajectories(
214
+ games, self._config.warmup_episodes,
215
+ )
216
+ samples = []
217
+ for traj in trajectories:
218
+ for step in traj.steps:
219
+ messages = [
220
+ {"role": "system", "content": SYSTEM_PROMPT},
221
+ {"role": "user", "content": step.prompt},
222
+ ]
223
+ formatted = self._tokenizer.apply_chat_template(
224
+ messages, tokenize=False,
225
+ add_generation_prompt=True,
226
+ )
227
+ samples.append({
228
+ "prompt": formatted,
229
+ "game_key": traj.game,
230
+ "available_moves": ["cooperate", "defect"],
231
+ })
232
+ dataset = Dataset.from_list(samples)
233
+
234
+ reward_fn = self.make_reward_fn()
235
+
236
+ trl_config = GRPOConfig(
237
+ output_dir=self._config.output_dir,
238
+ num_generations=self._config.num_generations,
239
+ max_completion_length=self._config.max_completion_length,
240
+ per_device_train_batch_size=self._config.batch_size,
241
+ learning_rate=self._config.learning_rate,
242
+ max_steps=self._config.max_steps,
243
+ logging_steps=GRPO_LOG_EVERY,
244
+ save_steps=self._config.opponent_update_interval,
245
+ bf16=torch.cuda.is_available(),
246
+ )
247
+
248
+ trainer = GRPOTrainer(
249
+ model=self._model,
250
+ reward_funcs=reward_fn,
251
+ args=trl_config,
252
+ train_dataset=dataset,
253
+ processing_class=self._tokenizer,
254
+ )
255
+
256
+ trainer.train()
257
+ trainer.save_model(self._config.output_dir)
258
+
259
+
260
+ # ---------------------------------------------------------------------------
261
+ # Helpers
262
+ # ---------------------------------------------------------------------------
263
+
264
+ _COOPERATIVE_ACTIONS = frozenset({"cooperate", "stag", "dove"})
265
+
266
+
267
+ def _compute_coop_rate(obs: GameObservation) -> float:
268
+ """Fraction of cooperative moves in an episode."""
269
+ if not obs.history:
270
+ return EVAL_ZERO_FLOAT
271
+ total = len(obs.history)
272
+ count = _ZERO
273
+ for rnd in obs.history:
274
+ if rnd.player_action in _COOPERATIVE_ACTIONS:
275
+ count += _ONE
276
+ return count / total
train/splits.py ADDED
@@ -0,0 +1,77 @@
1
+ """Deterministic stratified train/eval game split."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import random
6
+ from typing import Dict, FrozenSet, List, Set, Tuple
7
+
8
+ from common.games_meta.game_tags import GAME_TAGS
9
+ from constant_definitions.batch4.tag_constants import CATEGORIES
10
+ from constant_definitions.game_constants import EVAL_ZERO, EVAL_ONE
11
+ from constant_definitions.train.split_constants import (
12
+ MIN_EVAL_TAG_FRACTION_DENOMINATOR,
13
+ MIN_EVAL_TAG_FRACTION_NUMERATOR,
14
+ SPLIT_SEED,
15
+ TRAIN_FRACTION_DENOMINATOR,
16
+ TRAIN_FRACTION_NUMERATOR,
17
+ )
18
+
19
+ # Domain tags are used for stratification
20
+ _DOMAIN_TAGS: List[str] = CATEGORIES["domain"]
21
+
22
+
23
+ def get_train_eval_split(
24
+ seed: int = SPLIT_SEED,
25
+ ) -> Tuple[FrozenSet[str], FrozenSet[str]]:
26
+ """Return (train_games, eval_games) as frozen sets of game keys.
27
+
28
+ The split is deterministic for a given seed and stratified so that
29
+ every domain tag has at least ``MIN_EVAL_TAG_FRACTION`` representation
30
+ in the eval set.
31
+ """
32
+ all_games = sorted(GAME_TAGS.keys())
33
+ rng = random.Random(seed)
34
+
35
+ # Build domain -> games index
36
+ domain_to_games: Dict[str, List[str]] = {tag: [] for tag in _DOMAIN_TAGS}
37
+ for game_key in all_games:
38
+ tags = GAME_TAGS[game_key]
39
+ for dtag in _DOMAIN_TAGS:
40
+ if dtag in tags:
41
+ domain_to_games[dtag].append(game_key)
42
+
43
+ # Guarantee minimum eval representation per domain
44
+ eval_set: Set[str] = set()
45
+ for dtag in _DOMAIN_TAGS:
46
+ games_with_tag = domain_to_games[dtag]
47
+ if not games_with_tag:
48
+ continue
49
+ min_eval = _min_eval_count(len(games_with_tag))
50
+ already_in_eval = [g for g in games_with_tag if g in eval_set]
51
+ needed = min_eval - len(already_in_eval)
52
+ if needed > EVAL_ZERO:
53
+ candidates = [g for g in games_with_tag if g not in eval_set]
54
+ rng.shuffle(candidates)
55
+ for g in candidates[:needed]:
56
+ eval_set.add(g)
57
+
58
+ # Fill remaining eval slots up to target size
59
+ total = len(all_games)
60
+ target_train = (total * TRAIN_FRACTION_NUMERATOR) // TRAIN_FRACTION_DENOMINATOR
61
+ target_eval = total - target_train
62
+ remaining = [g for g in all_games if g not in eval_set]
63
+ rng.shuffle(remaining)
64
+ slots_to_fill = target_eval - len(eval_set)
65
+ if slots_to_fill > EVAL_ZERO:
66
+ for g in remaining[:slots_to_fill]:
67
+ eval_set.add(g)
68
+
69
+ train_set = frozenset(g for g in all_games if g not in eval_set)
70
+ return train_set, frozenset(eval_set)
71
+
72
+
73
+ def _min_eval_count(tag_total: int) -> int:
74
+ """Minimum number of games with a given tag that must be in eval."""
75
+ _numer = tag_total * MIN_EVAL_TAG_FRACTION_NUMERATOR
76
+ result = (_numer + MIN_EVAL_TAG_FRACTION_DENOMINATOR - EVAL_ONE) // MIN_EVAL_TAG_FRACTION_DENOMINATOR
77
+ return max(result, EVAL_ONE)
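The per-tag floor is a ceiling division; a worked example assuming MIN_EVAL_TAG_FRACTION is 1/10 (purely illustrative, the real fraction lives in split_constants):

    # ceil(tag_total * numer / denom), but never below one game per domain tag.
    def min_eval_count(tag_total: int, numer: int = 1, denom: int = 10) -> int:
        return max((tag_total * numer + denom - 1) // denom, 1)

    print(min_eval_count(23))  # ceil(2.3) -> 3
    print(min_eval_count(4))   # ceil(0.4) -> 1, the per-tag floor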
train/train.py ADDED
@@ -0,0 +1,403 @@
1
+ """KantBench GRPO Training Script.
2
+
3
+ Trains a language model to play 2-player game theory games optimally
4
+ using Group Relative Policy Optimization (GRPO) via TRL.
5
+
6
+ The KantBench environment runs as a remote OpenEnv server (HF Space):
7
+ - Each GRPO completion is a single move
8
+ - The reward function plays a FULL multi-round episode using that move
9
+ as the agent's consistent strategy via the OpenEnv client
10
+ - The composite reward (payoff + cooperation + Pareto efficiency + fairness)
11
+ becomes the GRPO signal
12
+
13
+ Supports the full KantBench game library including:
14
+ - 90+ base 2-player games and 3 N-player games
15
+ - 9 pre-registered meta-games (rule_proposal, rule_signal, gossip)
16
+ - Dynamic variant composition (cheap_talk, exit, binding_commitment,
17
+ constitutional, proposer_responder, noisy_actions, noisy_payoffs)
18
+
19
+ Usage:
20
+ python -m train.train --model Qwen/Qwen2.5-7B-Instruct --max-steps 200
21
+ """
22
+
23
+ from __future__ import annotations
24
+
25
+ import argparse
26
+ import logging
27
+ import random
28
+ from typing import Any
29
+
30
+ import torch
31
+ from datasets import Dataset
32
+ from trl import GRPOConfig, GRPOTrainer
33
+ from transformers import AutoTokenizer
34
+
35
+ from common.games import GAMES
36
+ from common.strategies import STRATEGIES as STRATEGY_REGISTRY
37
+ from spaces.kant.client import KantBenchEnv
38
+ from spaces.kant.models import KantBenchAction, KantBenchObservation
39
+ from train.agent import parse_action
40
+ from train.rewards import episode_reward
41
+ from train.splits import get_train_eval_split
42
+
43
+ logger = logging.getLogger(__name__)
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # Config
47
+ # ---------------------------------------------------------------------------
48
+
49
+ KANTBENCH_URL = "https://openenv-community-kantbench.hf.space"
50
+
51
+ SYSTEM_PROMPT = (
52
+ "You are playing a game-theory game. Analyse the situation and choose "
53
+ "the best action. Respond with ONLY the action name, nothing else."
54
+ )
55
+
56
+ # Variants that can be dynamically composed on top of base games.
57
+ # These are applied server-side via the variant= reset parameter.
58
+ TRAINABLE_VARIANTS = [
59
+ "cheap_talk",
60
+ "exit",
61
+ "binding_commitment",
62
+ "constitutional",
63
+ "noisy_actions",
64
+ "noisy_payoffs",
65
+ "rule_proposal",
66
+ "rule_signal",
67
+ "gossip",
68
+ ]
69
+
70
+ # Base games suitable for variant composition (2-player matrix games).
71
+ VARIANT_BASE_GAMES = [
72
+ "prisoners_dilemma",
73
+ "stag_hunt",
74
+ "hawk_dove",
75
+ ]
76
+
77
+ # Fraction of dataset samples that use dynamic variant composition.
78
+ VARIANT_FRACTION = 0.3
79
+
80
+
81
+ # ---------------------------------------------------------------------------
82
+ # Helpers to bridge KantBenchObservation -> training code
83
+ # ---------------------------------------------------------------------------
84
+
85
+
86
+ def _obs_cooperation_rate(obs: KantBenchObservation) -> float:
87
+ """Compute cooperation rate from a KantBenchObservation's history."""
88
+ if not obs.history:
89
+ return 0.0
90
+ coop_actions = {"cooperate", "stag", "dove", "contribute"}
91
+ coop_count = sum(
92
+ 1 for h in obs.history
93
+ if any(ca in h.get("your_move", "") for ca in coop_actions)
94
+ )
95
+ return coop_count / len(obs.history)
96
+
97
+
98
+ def _build_prompt(obs: KantBenchObservation) -> str:
99
+ """Build a structured prompt from a KantBenchObservation.
100
+
101
+ Mirrors PromptBuilder.build() but works with the OpenEnv client's
102
+ observation format.
103
+ """
104
+ sections: list[str] = []
105
+
106
+ # Game section
107
+ sections.append(
108
+ f"[Game]\n{obs.game_name}\n{obs.game_description}"
109
+ )
110
+
111
+ # History section
112
+ if obs.history:
113
+ history_lines: list[str] = []
114
+ for h in obs.history[-5:]: # Last 5 rounds
115
+ line = (
116
+ f"Round {h.get('round', '?')}"
117
+ f" | You played: {h.get('your_move', '?')}"
118
+ f" | Opponent played: {h.get('opponent_move', '?')}"
119
+ f" | Your payoff: {h.get('your_payoff', '?')}"
120
+ f" | Opp payoff: {h.get('opponent_payoff', '?')}"
121
+ )
122
+ history_lines.append(line)
123
+ sections.append("[History]\n" + "\n".join(history_lines))
124
+
125
+ # Scores section
126
+ sections.append(
127
+ f"[Scores]\nYour score: {obs.cumulative_score}"
128
+ f"\nRound: {obs.round_number} of {obs.max_rounds}"
129
+ )
130
+
131
+ # Available actions
132
+ action_lines = [f"- {a}" for a in obs.available_moves]
133
+ sections.append("[Available Actions]\n" + "\n".join(action_lines))
134
+
135
+ # Instruction
136
+ sections.append(f"[Instruction]\n{SYSTEM_PROMPT}")
137
+
138
+ return "\n\n".join(sections)
139
+
140
+ # ---------------------------------------------------------------------------
141
+ # Dataset generation using PromptBuilder
142
+ # ---------------------------------------------------------------------------
143
+
144
+
145
+ def build_dataset(
146
+ base_url: str,
147
+ n_samples: int = 1000,
148
+ games: list[str] | None = None,
149
+ strategies: list[str] | None = None,
150
+ variant_fraction: float = VARIANT_FRACTION,
151
+ ) -> Dataset:
152
+ """Generate diverse game theory prompts for GRPO training.
153
+
154
+ Connects to the KantBench OpenEnv server to generate real observations,
155
+ then builds structured prompts from diverse game states.
156
+
157
+ A fraction of samples use dynamic variant composition (cheap_talk,
158
+ constitutional, gossip, etc.) to train on meta-gaming scenarios.
159
+ """
160
+ game_keys = games or list(GAMES.keys())
161
+ strat_names = strategies or list(STRATEGY_REGISTRY.keys())
162
+ samples = []
163
+
164
+ with KantBenchEnv(base_url=base_url) as env:
165
+ attempts = 0
166
+ while len(samples) < n_samples:
167
+ attempts += 1
168
+
169
+ # Decide whether to use a variant
170
+ use_variant = random.random() < variant_fraction
171
+ if use_variant:
172
+ game_key = random.choice(VARIANT_BASE_GAMES)
173
+ variant = random.choice(TRAINABLE_VARIANTS)
174
+ else:
175
+ game_key = random.choice(game_keys)
176
+ variant = None
177
+
178
+ strategy = random.choice(strat_names)
179
+
180
+ try:
181
+ # Reset env — pass variant for dynamic composition
182
+ reset_kwargs = {"game": game_key, "strategy": strategy}
183
+ if variant:
184
+ reset_kwargs["variant"] = variant
185
+
186
+ result = env.reset(**reset_kwargs)
187
+ obs = result.observation
188
+
189
+ # Play 0..N-1 random rounds to create diverse game states
190
+ max_rounds = obs.max_rounds
191
+ rounds_to_play = random.randint(0, max(max_rounds - 1, 0))
192
+ for _ in range(rounds_to_play):
193
+ move = random.choice(obs.available_moves)
194
+ result = env.step(KantBenchAction(move=move))
195
+ obs = result.observation
196
+ if result.done:
197
+ break
198
+
199
+ if result.done:
200
+ # Episode finished early; reset so the prompt comes from a live game state
201
+ result = env.reset(**reset_kwargs)
202
+ obs = result.observation
203
+
204
+ prompt = _build_prompt(obs)
205
+
206
+ samples.append({
207
+ "prompt": prompt,
208
+ "game_key": game_key,
209
+ "strategy": strategy,
210
+ "variant": variant or "",
211
+ "available_moves": list(obs.available_moves),
212
+ "rounds_remaining": obs.max_rounds - obs.round_number,
213
+ })
214
+ except Exception as exc:  # RuntimeError, ConnectionError, or any other client failure
215
+ logger.debug(
216
+ "Skipping %s/%s (variant=%s): %s",
217
+ game_key, strategy, variant, exc,
218
+ )
219
+ continue
220
+
221
+ return Dataset.from_list(samples)
222
+
223
+
224
+ # ---------------------------------------------------------------------------
225
+ # Reward function — full episode rollout
226
+ # ---------------------------------------------------------------------------
227
+
228
+
229
+ def make_reward_fn(base_url: str):
230
+ """Returns a GRPO reward function that plays full episodes via OpenEnv.
231
+
232
+ For each completion:
233
+ 1. Parse the move from the LLM output
234
+ 2. Reset the KantBench server with the correct game/strategy/variant
235
+ 3. Play the FULL episode using the parsed move as a consistent strategy
236
+ 4. Compute composite reward: payoff + cooperation + Pareto + fairness
237
+ """
238
+ env = KantBenchEnv(base_url=base_url)
239
+ env.connect()
240
+
241
+ def reward_fn(
242
+ completions: list[str],
243
+ prompts: list[str],
244
+ **kwargs: Any,
245
+ ) -> list[float]:
246
+ rewards = []
247
+ game_keys = kwargs.get("game_key", ["prisoners_dilemma"] * len(completions))
248
+ strategies = kwargs.get("strategy", ["tit_for_tat"] * len(completions))
249
+ variants = kwargs.get("variant", [""] * len(completions))
250
+ available_moves_batch = kwargs.get(
251
+ "available_moves", [["cooperate", "defect"]] * len(completions)
252
+ )
253
+
254
+ for completion, game_key, strategy, variant, moves in zip(
255
+ completions, game_keys, strategies, variants, available_moves_batch
256
+ ):
257
+ # Parse move from LLM output
258
+ action_str = parse_action(completion.strip(), moves)
259
+
260
+ try:
261
+ # Play a full episode using this move as a consistent strategy
262
+ reset_kwargs = {"game": game_key, "strategy": strategy}
263
+ if variant:
264
+ reset_kwargs["variant"] = variant
265
+
266
+ result = env.reset(**reset_kwargs)
267
+ while not result.done:
268
+ result = env.step(KantBenchAction(move=action_str))
269
+
270
+ obs = result.observation
271
+
272
+ # Compute cooperation rate from observation history
273
+ coop_rate = _obs_cooperation_rate(obs)
274
+
275
+ # Composite reward from the reward module
276
+ # opponent_score not directly available in KantBenchObservation,
277
+ # approximate from history
278
+ opp_score = sum(
279
+ h.get("opponent_payoff", 0.0) for h in obs.history
280
+ )
281
+ reward = episode_reward(
282
+ player_score=obs.cumulative_score,
283
+ opponent_score=opp_score,
284
+ cooperation_rate=coop_rate,
285
+ total_rounds=obs.round_number,
286
+ )
287
+ rewards.append(reward)
288
+
289
+ except (ValueError, KeyError, RuntimeError, ConnectionError) as exc:
290
+ logger.debug("Reward error for %s/%s: %s", game_key, action_str, exc)
291
+ rewards.append(-1.0)
292
+
293
+ return rewards
294
+
295
+ return reward_fn
296
+
297
+
298
+ # ---------------------------------------------------------------------------
299
+ # Main
300
+ # ---------------------------------------------------------------------------
301
+
302
+
303
+ def parse_args():
304
+ p = argparse.ArgumentParser(description="KantBench GRPO Training")
305
+ p.add_argument("--model", default="Qwen/Qwen2.5-7B-Instruct")
306
+ p.add_argument("--output-dir", default="./kantbench-grpo")
307
+ p.add_argument("--env-url", default=KANTBENCH_URL,
308
+ help="KantBench OpenEnv server URL")
309
+ p.add_argument("--episodes", type=int, default=1000, help="Training dataset size")
310
+ p.add_argument("--num-generations", type=int, default=8, help="GRPO group size")
311
+ p.add_argument("--batch-size", type=int, default=4)
312
+ p.add_argument("--grad-accum", type=int, default=4)
313
+ p.add_argument("--lr", type=float, default=5e-6)
314
+ p.add_argument("--max-steps", type=int, default=500)
315
+ p.add_argument("--report-to", default="wandb", help="wandb, tensorboard, or none")
316
+ p.add_argument("--push-to-hub", action="store_true")
317
+ p.add_argument("--hub-model-id", default="jtowarek/kantbench-qwen2.5-7b")
318
+ p.add_argument("--use-train-split", action="store_true",
319
+ help="Use stratified train/eval split (eval games held out)")
320
+ p.add_argument("--variant-fraction", type=float, default=VARIANT_FRACTION,
321
+ help="Fraction of samples using dynamic variant composition")
322
+ return p.parse_args()
323
+
324
+
325
+ def main():
326
+ args = parse_args()
327
+ logging.basicConfig(level=logging.INFO)
328
+
329
+ print(f"Loading model: {args.model}")
330
+ print(f"Output: {args.output_dir}")
331
+ print(f"OpenEnv server: {args.env_url}")
332
+
333
+ tokenizer = AutoTokenizer.from_pretrained(args.model)
334
+ if tokenizer.pad_token is None:
335
+ tokenizer.pad_token = tokenizer.eos_token
336
+
337
+ # Optionally use stratified train/eval split
338
+ train_games = None
339
+ if args.use_train_split:
340
+ train_set, eval_set = get_train_eval_split()
341
+ train_games = sorted(train_set)
342
+ print(f"Using stratified split: {len(train_games)} train, {len(eval_set)} eval games")
343
+
344
+ dataset = build_dataset(
345
+ args.env_url, args.episodes, games=train_games,
346
+ variant_fraction=args.variant_fraction,
347
+ )
348
+ variant_count = sum(1 for v in dataset["variant"] if v)
349
+ print(f"Dataset: {len(dataset)} prompts across {len(GAMES)} games")
350
+ print(f" Variant samples: {variant_count} ({variant_count*100//max(len(dataset),1)}%)")
351
+
352
+ # Format prompts with chat template
353
+ def format_prompt(example):
354
+ messages = [
355
+ {"role": "system", "content": SYSTEM_PROMPT},
356
+ {"role": "user", "content": example["prompt"]},
357
+ ]
358
+ return {
359
+ "prompt": tokenizer.apply_chat_template(
360
+ messages, tokenize=False, add_generation_prompt=True
361
+ )
362
+ }
363
+
364
+ dataset = dataset.map(format_prompt)
365
+
366
+ reward_fn = make_reward_fn(args.env_url)
367
+
368
+ config = GRPOConfig(
369
+ output_dir=args.output_dir,
370
+ num_generations=args.num_generations,
371
+ max_completion_length=32,
372
+ per_device_train_batch_size=args.batch_size,
373
+ gradient_accumulation_steps=args.grad_accum,
374
+ learning_rate=args.lr,
375
+ max_steps=args.max_steps,
376
+ logging_steps=10,
377
+ save_steps=100,
378
+ bf16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8,
379
+ fp16=torch.cuda.is_available() and torch.cuda.get_device_capability()[0] < 8,
380
+ report_to=args.report_to,
381
+ push_to_hub=args.push_to_hub,
382
+ hub_model_id=args.hub_model_id if args.push_to_hub else None,
383
+ )
384
+
385
+ trainer = GRPOTrainer(
386
+ model=args.model,
387
+ reward_funcs=reward_fn,
388
+ args=config,
389
+ train_dataset=dataset,
390
+ processing_class=tokenizer,
391
+ )
392
+
393
+ print("Starting GRPO training...")
394
+ print(" Reward: composite (payoff + cooperation + Pareto + fairness)")
395
+ print(f" Episode: full multi-round rollout via OpenEnv @ {args.env_url}")
396
+ print(f" Variants: {args.variant_fraction*100:.0f}% of samples use dynamic composition")
397
+ trainer.train()
398
+ trainer.save_model(args.output_dir)
399
+ print(f"Done. Model saved to {args.output_dir}")
400
+
401
+
402
+ if __name__ == "__main__":
403
+ main()
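TRL forwards the extra dataset columns (game_key, strategy, variant, available_moves) to the reward function as keyword lists; a toy stand-in showing the call shape, with no server round-trip:

    # Toy reward function with the same signature GRPOTrainer expects.
    def toy_reward_fn(completions, prompts, **kwargs):
        return [
            1.0 if c.strip() in moves else -1.0
            for c, moves in zip(completions, kwargs["available_moves"])
        ]

    print(toy_reward_fn(
        completions=["cooperate", "banana"],
        prompts=["...", "..."],
        game_key=["prisoners_dilemma", "stag_hunt"],
        available_moves=[["cooperate", "defect"], ["stag", "hare"]],
    ))  # [1.0, -1.0]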
train/trajectory.py ADDED
@@ -0,0 +1,206 @@
1
+ """Trajectory collection for training data generation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from dataclasses import dataclass, field
6
+ from typing import Any, Callable, Dict, List, Optional
7
+
8
+ from env.models import GameAction, GameObservation, RoundResult
9
+ from env.environment import KantEnvironment
10
+ from constant_definitions.game_constants import EVAL_ZERO_FLOAT
11
+
12
+
13
+ @dataclass
14
+ class StepRecord:
15
+ """A single step within an episode trajectory."""
16
+
17
+ prompt: str
18
+ completion: str
19
+ action: str
20
+ reward: float
21
+ player_payoff: float
22
+ opponent_payoff: float
23
+ round_number: int
24
+
25
+
26
+ @dataclass
27
+ class EpisodeTrajectory:
28
+ """Complete trajectory of one episode."""
29
+
30
+ game: str
31
+ strategy: str
32
+ steps: List[StepRecord] = field(default_factory=list)
33
+ episode_reward: float = EVAL_ZERO_FLOAT
34
+ player_score: float = EVAL_ZERO_FLOAT
35
+ opponent_score: float = EVAL_ZERO_FLOAT
36
+ cooperation_rate: float = EVAL_ZERO_FLOAT
37
+ rounds_played: int = int()
38
+ metrics: Dict[str, float] = field(default_factory=dict)
39
+
40
+
41
+ class TrajectoryCollector:
42
+ """Runs episodes and collects trajectories for training.
43
+
44
+ Parameters
45
+ ----------
46
+ env : KantEnvironment
47
+ The game environment instance.
48
+ agent : LLMAgent
49
+ An agent with ``last_prompt`` / ``last_completion`` properties,
50
+ callable with ``(GameObservation) -> GameAction``.
51
+ reward_fn : callable, optional
52
+ Function(player_score, opponent_score, cooperation_rate, rounds) -> float.
53
+ step_reward_fn : callable, optional
54
+ Function(player_payoff, opponent_payoff, payoff_min, payoff_max) -> float.
55
+ """
56
+
57
+ def __init__(
58
+ self,
59
+ env: KantEnvironment,
60
+ agent: Any,
61
+ reward_fn: Optional[Callable[..., float]] = None,
62
+ step_reward_fn: Optional[Callable[..., float]] = None,
63
+ ) -> None:
64
+ self._env = env
65
+ self._agent = agent
66
+ self._reward_fn = reward_fn
67
+ self._step_reward_fn = step_reward_fn
68
+
69
+ def collect_episode(
70
+ self,
71
+ game: str,
72
+ strategy: str = "tit_for_tat",
73
+ opponent_fn: Optional[Callable] = None,
74
+ ) -> EpisodeTrajectory:
75
+ """Run a single episode and return its trajectory."""
76
+ if opponent_fn is not None:
77
+ obs = self._env.reset(game=game, opponent_fn=opponent_fn)
78
+ else:
79
+ obs = self._env.reset(game=game, strategy=strategy)
80
+ steps: List[StepRecord] = []
81
+
82
+ while not obs.done:
83
+ action = self._agent(obs)
84
+
85
+ # Capture prompt/completion from agent
86
+ prompt = getattr(self._agent, "last_prompt", "")
87
+ completion = getattr(self._agent, "last_completion", "")
88
+
89
+ next_obs = self._env.step(action)
90
+
91
+ # Compute step reward
92
+ step_reward = EVAL_ZERO_FLOAT
93
+ if self._step_reward_fn is not None and next_obs.last_round is not None:
94
+ step_reward = self._step_reward_fn(
95
+ next_obs.last_round.player_payoff,
96
+ next_obs.last_round.opponent_payoff,
97
+ EVAL_ZERO_FLOAT,
98
+ EVAL_ZERO_FLOAT,
99
+ )
100
+
101
+ # Record step
102
+ last_rnd = next_obs.last_round
103
+ steps.append(StepRecord(
104
+ prompt=prompt,
105
+ completion=completion,
106
+ action=action.action,
107
+ reward=step_reward,
108
+ player_payoff=(
109
+ last_rnd.player_payoff if last_rnd is not None
110
+ else EVAL_ZERO_FLOAT
111
+ ),
112
+ opponent_payoff=(
113
+ last_rnd.opponent_payoff if last_rnd is not None
114
+ else EVAL_ZERO_FLOAT
115
+ ),
116
+ round_number=next_obs.current_round,
117
+ ))
118
+ obs = next_obs
119
+
120
+ # Compute cooperation rate (reusing tournament logic pattern)
121
+ coop_rate = _compute_cooperation_rate(obs)
122
+
123
+ # Compute episode reward
124
+ ep_reward = EVAL_ZERO_FLOAT
125
+ if self._reward_fn is not None:
126
+ ep_reward = self._reward_fn(
127
+ obs.player_score,
128
+ obs.opponent_score,
129
+ coop_rate,
130
+ obs.current_round,
131
+ )
132
+
133
+ return EpisodeTrajectory(
134
+ game=game,
135
+ strategy=strategy,
136
+ steps=steps,
137
+ episode_reward=ep_reward,
138
+ player_score=obs.player_score,
139
+ opponent_score=obs.opponent_score,
140
+ cooperation_rate=coop_rate,
141
+ rounds_played=obs.current_round,
142
+ )
143
+
144
+ def collect_batch(
145
+ self,
146
+ games: List[str],
147
+ strategies: Optional[List[str]] = None,
148
+ episodes_per_pair: int = int(bool(True)),
149
+ opponent_fn: Optional[Callable] = None,
150
+ ) -> List[EpisodeTrajectory]:
151
+ """Collect trajectories for all (game, strategy) combinations.
152
+
153
+ If *opponent_fn* is provided, self-play mode is used: only
154
+ games are iterated (strategies are ignored).
155
+ """
156
+ trajectories: List[EpisodeTrajectory] = []
157
+ if opponent_fn is not None:
158
+ for game in games:
159
+ for _ep in range(episodes_per_pair):
160
+ traj = self.collect_episode(
161
+ game, opponent_fn=opponent_fn,
162
+ )
163
+ trajectories.append(traj)
164
+ else:
165
+ strats = strategies or ["tit_for_tat"]
166
+ for game in games:
167
+ for strategy in strats:
168
+ for _ep in range(episodes_per_pair):
169
+ traj = self.collect_episode(game, strategy)
170
+ trajectories.append(traj)
171
+ return trajectories
172
+
173
+
174
+ # ---------------------------------------------------------------------------
175
+ # Helpers
176
+ # ---------------------------------------------------------------------------
177
+
178
+ _COOPERATIVE_ACTIONS = frozenset({"cooperate", "stag", "dove"})
179
+ _ECONOMIC_PREFIXES = frozenset({"offer", "invest", "contribute"})
180
+
181
+ _ZERO = int()
182
+ _ONE = int(bool(True))
183
+ _TWO = _ONE + _ONE
184
+
185
+
186
+ def _compute_cooperation_rate(obs: GameObservation) -> float:
187
+ """Fraction of cooperative moves in an episode."""
188
+ if not obs.history:
189
+ return EVAL_ZERO_FLOAT
190
+ total = len(obs.history)
191
+ cooperative_count = _ZERO
192
+ first_action = obs.history[_ZERO].player_action
193
+ prefix = first_action.split("_")[_ZERO]
194
+ is_economic = prefix in _ECONOMIC_PREFIXES
195
+ if is_economic:
196
+ median_idx = len(obs.available_actions) // _TWO
197
+ for rnd in obs.history:
198
+ act = rnd.player_action
199
+ if act in obs.available_actions:
200
+ if obs.available_actions.index(act) >= median_idx:
201
+ cooperative_count += _ONE
202
+ else:
203
+ for rnd in obs.history:
204
+ if rnd.player_action in _COOPERATIVE_ACTIONS:
205
+ cooperative_count += _ONE
206
+ return cooperative_count / total
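A sketch of wiring the collector with a stub agent; the stub satisfies the last_prompt/last_completion contract, and the game and strategy keys are assumed to exist in the repo's registries:

    from env.environment import KantEnvironment
    from env.models import GameAction, GameObservation
    from train.rewards import episode_reward
    from train.trajectory import TrajectoryCollector

    class StubAgent:
        """Always cooperates and records prompt/completion like LLMAgent."""
        last_prompt: str = ""
        last_completion: str = ""

        def __call__(self, obs: GameObservation) -> GameAction:
            self.last_prompt = obs.game_name
            self.last_completion = "cooperate"
            return GameAction(action="cooperate")

    collector = TrajectoryCollector(
        env=KantEnvironment(),
        agent=StubAgent(),
        reward_fn=episode_reward,
    )
    traj = collector.collect_episode(game="prisoners_dilemma", strategy="tit_for_tat")
    print(traj.rounds_played, traj.episode_reward)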