Muqeeth commited on
Commit
5fb294e
·
verified ·
1 Parent(s): f41f3d1

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. run.log +0 -0
  2. src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc +0 -0
  3. src_code_for_reproducibility/chat_utils/apply_template.py +12 -1
  4. src_code_for_reproducibility/chat_utils/chat_turn.py +5 -0
  5. src_code_for_reproducibility/chat_utils/template_specific.py +27 -0
  6. src_code_for_reproducibility/markov_games/__init__.py +4 -0
  7. src_code_for_reproducibility/markov_games/agent.py +18 -22
  8. src_code_for_reproducibility/markov_games/alternative_actions_runner.py +19 -11
  9. src_code_for_reproducibility/markov_games/group_timesteps.py +3 -20
  10. src_code_for_reproducibility/markov_games/linear_runner.py +13 -1
  11. src_code_for_reproducibility/markov_games/markov_game.py +35 -26
  12. src_code_for_reproducibility/markov_games/mg_utils.py +16 -8
  13. src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py +34 -11
  14. src_code_for_reproducibility/markov_games/negotiation/nego_simulation.py +14 -3
  15. src_code_for_reproducibility/markov_games/negotiation/tas_agent.py +10 -0
  16. src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py +10 -0
  17. src_code_for_reproducibility/markov_games/rollout_tree.py +10 -1
  18. src_code_for_reproducibility/markov_games/run_markov_games.py +11 -0
  19. src_code_for_reproducibility/markov_games/simulation.py +25 -18
  20. src_code_for_reproducibility/markov_games/statistics_runner.py +10 -0
  21. src_code_for_reproducibility/models/__init__.py +4 -0
  22. src_code_for_reproducibility/models/__pycache__/human_policy.cpython-312.pyc +0 -0
  23. src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc +0 -0
  24. src_code_for_reproducibility/models/__pycache__/inference_backend_dummy.cpython-312.pyc +0 -0
  25. src_code_for_reproducibility/models/__pycache__/inference_backend_vllm.cpython-312.pyc +0 -0
  26. src_code_for_reproducibility/models/__pycache__/large_language_model_api.cpython-312.pyc +0 -0
  27. src_code_for_reproducibility/models/__pycache__/large_language_model_local.cpython-312.pyc +0 -0
  28. src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-312.pyc +0 -0
  29. src_code_for_reproducibility/models/adapter_training_wrapper.py +14 -8
  30. src_code_for_reproducibility/models/human_policy.py +5 -0
  31. src_code_for_reproducibility/models/inference_backend.py +5 -0
  32. src_code_for_reproducibility/models/inference_backend_dummy.py +5 -0
  33. src_code_for_reproducibility/models/inference_backend_vllm.py +6 -12
  34. src_code_for_reproducibility/models/large_language_model_api.py +7 -4
  35. src_code_for_reproducibility/models/large_language_model_local.py +8 -31
  36. src_code_for_reproducibility/models/scalar_critic.py +14 -9
  37. src_code_for_reproducibility/training/__init__.py +4 -0
  38. src_code_for_reproducibility/training/__pycache__/__init__.cpython-312.pyc +0 -0
  39. src_code_for_reproducibility/training/__pycache__/annealing_methods.cpython-312.pyc +0 -0
  40. src_code_for_reproducibility/training/__pycache__/credit_methods.cpython-312.pyc +0 -0
  41. src_code_for_reproducibility/training/__pycache__/tally_metrics.cpython-312.pyc +0 -0
  42. src_code_for_reproducibility/training/__pycache__/tally_rollout.cpython-312.pyc +0 -0
  43. src_code_for_reproducibility/training/__pycache__/tally_tokenwise.cpython-312.pyc +0 -0
  44. src_code_for_reproducibility/training/__pycache__/tokenize_chats.cpython-312.pyc +0 -0
  45. src_code_for_reproducibility/training/__pycache__/trainer_ad_align.cpython-312.pyc +0 -0
  46. src_code_for_reproducibility/training/__pycache__/trainer_common.cpython-312.pyc +0 -0
  47. src_code_for_reproducibility/training/__pycache__/trainer_independent.cpython-312.pyc +0 -0
  48. src_code_for_reproducibility/training/__pycache__/trainer_sum_rewards.cpython-312.pyc +0 -0
  49. src_code_for_reproducibility/training/__pycache__/training_data_utils.cpython-312.pyc +0 -0
  50. src_code_for_reproducibility/training/annealing_methods.py +15 -1
run.log CHANGED
The diff for this file is too large to render. See raw diff
 
src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc and b/src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc differ
 
src_code_for_reproducibility/chat_utils/apply_template.py CHANGED
@@ -1,10 +1,17 @@
 
 
 
 
 
1
  import torch
2
 
3
  from mllm.chat_utils.chat_turn import ChatTurn
4
  from mllm.chat_utils.template_specific import (
 
5
  custom_llama3_template,
6
  custom_qwen2_template,
7
  custom_qwen3_template,
 
8
  qwen2_assistant_postfix,
9
  qwen3_assistant_postfix,
10
  )
@@ -20,6 +27,8 @@ def get_custom_chat_template(tokenizer) -> str:
20
  return custom_llama3_template
21
  elif "qwen3" in tokenizer.name_or_path.lower():
22
  return custom_qwen3_template
 
 
23
  else:
24
  raise ValueError(f"Tokenizer {tokenizer.name_or_path} not supported")
25
 
@@ -32,13 +41,15 @@ def get_custom_assistant_postfix(tokenizer) -> torch.Tensor:
32
  return qwen2_assistant_postfix
33
  elif "qwen3" in tokenizer.name_or_path.lower():
34
  return qwen3_assistant_postfix
 
 
35
  return torch.tensor([], dtype=torch.long)
36
 
37
 
38
  def tokenize_chats(chats: list[ChatTurn], tokenizer, enable_thinking) -> None:
39
  """
40
  Set the chat_template_token_ids for each chat turn.
41
- # TODO: use engine tokens if available
42
  """
43
  custom_template = get_custom_chat_template(tokenizer)
44
  custom_assistant_postfix: torch.Tensor = get_custom_assistant_postfix(tokenizer)
 
1
+ """
2
+ File: mllm/chat_utils/apply_template.py
3
+ Summary: Applies tokenizer-specific chat templates and stitches chat token IDs.
4
+ """
5
+
6
  import torch
7
 
8
  from mllm.chat_utils.chat_turn import ChatTurn
9
  from mllm.chat_utils.template_specific import (
10
+ custom_gemma3_template,
11
  custom_llama3_template,
12
  custom_qwen2_template,
13
  custom_qwen3_template,
14
+ gemma3_assistant_postfix,
15
  qwen2_assistant_postfix,
16
  qwen3_assistant_postfix,
17
  )
 
27
  return custom_llama3_template
28
  elif "qwen3" in tokenizer.name_or_path.lower():
29
  return custom_qwen3_template
30
+ elif "gemma" in tokenizer.name_or_path.lower():
31
+ return custom_gemma3_template
32
  else:
33
  raise ValueError(f"Tokenizer {tokenizer.name_or_path} not supported")
34
 
 
41
  return qwen2_assistant_postfix
42
  elif "qwen3" in tokenizer.name_or_path.lower():
43
  return qwen3_assistant_postfix
44
+ elif "gemma" in tokenizer.name_or_path.lower():
45
+ return gemma3_assistant_postfix
46
  return torch.tensor([], dtype=torch.long)
47
 
48
 
49
  def tokenize_chats(chats: list[ChatTurn], tokenizer, enable_thinking) -> None:
50
  """
51
  Set the chat_template_token_ids for each chat turn.
52
+ We rely on tokenizer-side templates because engine-provided cached tokens are not exposed yet.
53
  """
54
  custom_template = get_custom_chat_template(tokenizer)
55
  custom_assistant_postfix: torch.Tensor = get_custom_assistant_postfix(tokenizer)
src_code_for_reproducibility/chat_utils/chat_turn.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import json
 
1
+ """
2
+ File: mllm/chat_utils/chat_turn.py
3
+ Summary: Defines the ChatTurn schema plus helpers for serialization and validation.
4
+ """
5
+
6
  from __future__ import annotations
7
 
8
  import json
src_code_for_reproducibility/chat_utils/template_specific.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import huggingface_hub
2
  import torch
3
  from transformers import AutoTokenizer
@@ -25,6 +30,11 @@ qwen3_assistant_postfix = (
25
  .encode("\n", return_tensors="pt")
26
  .flatten()
27
  )
 
 
 
 
 
28
  custom_qwen2_template = """
29
  {%- if add_system_prompt %}
30
  {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
@@ -85,3 +95,20 @@ custom_qwen3_template = """
85
  {%- endif %}
86
  {%- endif %}
87
  """
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: mllm/chat_utils/template_specific.py
3
+ Summary: Stores chat template variants and assistant postfix tensors per tokenizer.
4
+ """
5
+
6
  import huggingface_hub
7
  import torch
8
  from transformers import AutoTokenizer
 
30
  .encode("\n", return_tensors="pt")
31
  .flatten()
32
  )
33
+ gemma3_assistant_postfix = (
34
+ AutoTokenizer.from_pretrained("google/gemma-3-4b-it")
35
+ .encode("\n", return_tensors="pt")
36
+ .flatten()
37
+ )
38
  custom_qwen2_template = """
39
  {%- if add_system_prompt %}
40
  {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
 
95
  {%- endif %}
96
  {%- endif %}
97
  """
98
+
99
+ custom_gemma3_template = """
100
+ {%- if add_system_prompt %}
101
+ {{- bos_token -}}
102
+ {%- endif %}
103
+ {%- for message in messages -%}
104
+ {%- if message['role'] == 'assistant' -%}
105
+ {%- set role = 'model' -%}
106
+ {%- else -%}
107
+ {%- set role = message['role'] -%}
108
+ {%- endif -%}
109
+ {{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}
110
+ {%- endfor -%}
111
+ {%- if add_generation_prompt -%}
112
+ {{ '<start_of_turn>model\n' }}
113
+ {%- endif -%}
114
+ """
src_code_for_reproducibility/markov_games/__init__.py CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ File: mllm/markov_games/__init__.py
3
+ Summary: Makes Markov-game subpackages importable from the top-level namespace.
4
+ """
src_code_for_reproducibility/markov_games/agent.py CHANGED
@@ -1,11 +1,6 @@
1
  """
2
- In simple RL paradise, where the action dimensions are constant and well defined,
3
- Agent classes are not necessary. But in MARL, with LLM's, there isn't always
4
- a direct path from policy to action. For instance, from the observation of the environment,
5
- a prompt must be created. Then, the outputs of the policy might be incorrect, so a second
6
- request to the LLM must be sent before the action is well defined. This is why this Agent class exists.
7
- It acts as a mini environment, bridging the gap between the core simulation and
8
- the LLM policies.
9
  """
10
 
11
  from abc import ABC, abstractmethod
@@ -18,6 +13,8 @@ from mllm.markov_games.rollout_tree import AgentActLog
18
 
19
 
20
  class Agent(ABC):
 
 
21
  @abstractmethod
22
  def __init__(
23
  self,
@@ -29,7 +26,10 @@ class Agent(ABC):
29
  **kwargs,
30
  ):
31
  """
32
- Initialize the agent state.
 
 
 
33
  """
34
  self.seed = seed
35
  self.agent_id = agent_id
@@ -40,37 +40,33 @@ class Agent(ABC):
40
 
41
  async def act(self, observation) -> Tuple[Any, AgentActLog]:
42
  """
43
- Query (possibly multiple times) a policy (or possibly a pool of policies) to
44
- obtain the action of the agent.
45
 
46
- Example:
47
- action = None
48
- prompt = self.observation_to_prompt(observation)
49
- while not self.valid(action):
50
- output = await self.policy.generate(prompt)
51
- action = self.policy_output_to_action(output)
52
- return action
53
-
54
- Returns:
55
- action
56
- step_info
57
  """
58
  raise NotImplementedError
59
 
60
  def get_safe_copy(self):
61
  """
62
- Return copy of the agent object that is decorrelated from the original object.
 
 
63
  """
64
  raise NotImplementedError
65
 
66
  def reset(self):
 
67
  raise NotImplementedError
68
 
69
  def render(self):
 
70
  raise NotImplementedError
71
 
72
  def close(self):
 
73
  raise NotImplementedError
74
 
75
  def get_agent_info(self):
 
76
  raise NotImplementedError
 
1
  """
2
+ File: mllm/markov_games/agent.py
3
+ Summary: Declares the base Agent interface connecting simulations to policy calls.
 
 
 
 
 
4
  """
5
 
6
  from abc import ABC, abstractmethod
 
13
 
14
 
15
  class Agent(ABC):
16
+ """Abstract policy wrapper that bridges simulations with arbitrary backends."""
17
+
18
  @abstractmethod
19
  def __init__(
20
  self,
 
26
  **kwargs,
27
  ):
28
  """
29
+ Initialize the agent state and seed its RNG.
30
+
31
+ Subclasses typically store extra handles (tokenizers, inference clients, etc.)
32
+ but they should always call ``super().__init__`` so sampling remains reproducible.
33
  """
34
  self.seed = seed
35
  self.agent_id = agent_id
 
40
 
41
  async def act(self, observation) -> Tuple[Any, AgentActLog]:
42
  """
43
+ Produce the next action (and associated chat log) given an environment observation.
 
44
 
45
+ Implementations can iterate with rejection sampling, multi-call deliberation, etc.
46
+ Returns both the chosen action and an `AgentActLog` describing how it was produced.
 
 
 
 
 
 
 
 
 
47
  """
48
  raise NotImplementedError
49
 
50
  def get_safe_copy(self):
51
  """
52
+ Return a deep copy whose future calls do not mutate the original agent.
53
+
54
+ Needed for branch exploration/reruns with alternative actions.
55
  """
56
  raise NotImplementedError
57
 
58
  def reset(self):
59
+ """Reset any internal state between rollouts."""
60
  raise NotImplementedError
61
 
62
  def render(self):
63
+ """Optional human-readable visualization of the agent (CLI/UI)."""
64
  raise NotImplementedError
65
 
66
  def close(self):
67
+ """Release any external resources (network sockets, subprocesses, etc.)."""
68
  raise NotImplementedError
69
 
70
  def get_agent_info(self):
71
+ """Return diagnostic metadata to embed inside rollout logs."""
72
  raise NotImplementedError
src_code_for_reproducibility/markov_games/alternative_actions_runner.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import asyncio
2
  import copy
3
  import json
@@ -16,7 +21,6 @@ from mllm.markov_games.rollout_tree import (
16
  AgentId = str
17
 
18
 
19
-
20
  async def run_with_unilateral_alt_action(
21
  markov_game: MarkovGame,
22
  agent_id: AgentId,
@@ -25,7 +29,11 @@ async def run_with_unilateral_alt_action(
25
  max_depth: int,
26
  ):
27
  """
28
- This function is used to generate a new branch for a given agent.
 
 
 
 
29
  """
30
 
31
  # Generate alternative action and take a step
@@ -65,20 +73,20 @@ async def AlternativeActionsRunner(
65
  branch_only_on_new_round: bool = False,
66
  ):
67
  """
68
- This method generates a trajectory with partially completed branches,
69
- where the branching comes from taking unilateraly different actions.
70
- The resulting data is used to estimate the updated advantage alignment policy gradient terms.
71
- Let k := nb_sub_steps. Then the number of steps generated is O(Tk), where T is
72
- the maximum trajectory length.
 
 
 
73
  """
74
 
75
  tasks = []
76
  time_step = 0
77
  terminated = False
78
- root = RolloutTreeRootNode(
79
- id=markov_game.get_id(),
80
- crn_id=markov_game.get_crn_id()
81
- )
82
  previous_node = root
83
 
84
  while not terminated:
 
1
+ """
2
+ File: mllm/markov_games/alternative_actions_runner.py
3
+ Summary: Generates rollout branches by replaying trajectories with unilateral action changes.
4
+ """
5
+
6
  import asyncio
7
  import copy
8
  import json
 
21
  AgentId = str
22
 
23
 
 
24
  async def run_with_unilateral_alt_action(
25
  markov_game: MarkovGame,
26
  agent_id: AgentId,
 
29
  max_depth: int,
30
  ):
31
  """
32
+ Roll out a counterfactual branch where ``agent_id`` deviates unilaterally.
33
+
34
+ Starting from ``branch_node`` (which already contains the main trajectory),
35
+ we replay the simulation with the deviating agent's action while freezing
36
+ all other agents/actions, then continue for ``max_depth`` steps.
37
  """
38
 
39
  # Generate alternative action and take a step
 
73
  branch_only_on_new_round: bool = False,
74
  ):
75
  """
76
+ Generate a rollout tree containing the main path plus unilateral deviation branches.
77
+
78
+ For each timestep we:
79
+ 1. Cache agent actions without side effects.
80
+ 2. Advance the main trajectory.
81
+ 3. Spawn ``nb_alternative_actions`` asynchronous deviations per agent,
82
+ each replaying up to ``max_depth`` steps from the cached pre-action state.
83
+ The resulting branches feed advantage-alignment estimators.
84
  """
85
 
86
  tasks = []
87
  time_step = 0
88
  terminated = False
89
+ root = RolloutTreeRootNode(id=markov_game.get_id(), crn_id=markov_game.get_crn_id())
 
 
 
90
  previous_node = root
91
 
92
  while not terminated:
src_code_for_reproducibility/markov_games/group_timesteps.py CHANGED
@@ -1,6 +1,8 @@
1
  """
2
- This module contains the logic for grouping time steps.
 
3
  """
 
4
  import copy
5
  from typing import Callable
6
 
@@ -84,25 +86,6 @@ def group_time_steps(
84
  raise Exception(
85
  "Grouping timesteps by round is not supported for branching trajectories yet."
86
  )
87
- # Special recursive case for branches
88
- # if isinstance(current_node, RolloutTreeBranchNode):
89
- # branches = {}
90
- # for agent_id, branch_nodes in current_node.branches.items():
91
- # branch_group_nodes = []
92
- # for branch_node in branch_nodes:
93
- # branch_group_node = group_time_steps_rec(
94
- # current_node=branch_node,
95
- # group_time_step=group_time_step,
96
- # accumulation_step_logs=copy.deepcopy(accumulation_step_logs))
97
- # branch_group_nodes.append(branch_group_node)
98
- # branches[agent_id] = branch_group_nodes
99
-
100
- # main_child_group_node = group_time_steps_rec(
101
- # current_node=current_node.main_child,
102
- # group_time_step=group_time_step,
103
- # accumulation_step_logs=copy.deepcopy(accumulation_step_logs))
104
-
105
- # return RolloutTreeBranchNode(main_child=main_child_group_node, branches=branches)
106
 
107
  # Accumulate
108
  accumulation_step_logs.append(current_node.step_log)
 
1
  """
2
+ File: mllm/markov_games/group_timesteps.py
3
+ Summary: Provides timestep-grouping utilities for rollout trees and training.
4
  """
5
+
6
  import copy
7
  from typing import Callable
8
 
 
86
  raise Exception(
87
  "Grouping timesteps by round is not supported for branching trajectories yet."
88
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  # Accumulate
91
  accumulation_step_logs.append(current_node.step_log)
src_code_for_reproducibility/markov_games/linear_runner.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import asyncio
2
  import json
3
  import os.path
@@ -10,7 +15,14 @@ async def LinearRunner(
10
  markov_game: MarkovGame, output_folder: str
11
  ) -> RolloutTreeRootNode:
12
  """
13
- This method generates a trajectory without branching.
 
 
 
 
 
 
 
14
  """
15
  time_step = 0
16
  terminated = False
 
1
+ """
2
+ File: mllm/markov_games/linear_runner.py
3
+ Summary: Simulates a single unbranched Markov-game rollout and records it.
4
+ """
5
+
6
  import asyncio
7
  import json
8
  import os.path
 
15
  markov_game: MarkovGame, output_folder: str
16
  ) -> RolloutTreeRootNode:
17
  """
18
+ Generate a single main-path rollout (no branching) for the provided Markov game.
19
+
20
+ Parameters
21
+ ----------
22
+ markov_game:
23
+ Initialized ``MarkovGame`` with agents + simulation ready to step.
24
+ output_folder:
25
+ Unused placeholder in the legacy API (kept for compatibility).
26
  """
27
  time_step = 0
28
  terminated = False
src_code_for_reproducibility/markov_games/markov_game.py CHANGED
@@ -1,18 +1,8 @@
1
  """
2
- This class unifies a simulation, and the agents acting in it (see `simulation.py` & `agent.py`).
3
- In a MarkovGame step,
4
- 1) each agent takes an action,
5
- 2) the state transitions with respect to these actions,
6
- 3) all relevant data of the step is appended to the historical data list
7
-
8
- In order to perform 3), the agents and the simulation are expected, at each time step,
9
- to return a log of the state transition (from their perspective).
10
- For instance, the Simulation might send rewards and the agents might send prompting contexts to be used later to generate the training data.
11
- A different approach would be to simply have the agents keep their data private and log it upon completion of a trajectory.
12
- The approach we use here centralizes the data gathering aspect,
13
- making it easy to create sub-trajectories (in the `runners` defined in `runners.py`) descriptions that
14
- only log information for step transitions occuring after the branching out.
15
  """
 
16
  import asyncio
17
  import copy
18
  import json
@@ -31,6 +21,8 @@ AgentId = str
31
 
32
  @dataclass
33
  class AgentAndActionSafeCopy:
 
 
34
  action: Any
35
  action_info: AgentActLog
36
  agent_after_action: type[Agent]
@@ -45,12 +37,18 @@ class MarkovGame(object):
45
  crn_id: int,
46
  ):
47
  """
48
- Args:
49
- agents:
50
- output_path:
51
- Path where the step infos are saved.
52
- simulation:
53
- Simulation object. Example: IPDSimulation
 
 
 
 
 
 
54
  """
55
  self.agents = agents
56
  self.agent_ids = self.agents.keys()
@@ -131,7 +129,7 @@ class MarkovGame(object):
131
 
132
  async def set_action_of_agent(self, agent_id: AgentId):
133
  """
134
- TOWRITE
135
  """
136
  agent = self.agents[agent_id]
137
  obs = self.simulation.get_obs_agent(agent_id)
@@ -141,7 +139,7 @@ class MarkovGame(object):
141
 
142
  async def set_actions(self):
143
  """
144
- TOWRITE
145
  """
146
  # background_tasks = set()
147
  tasks = []
@@ -152,16 +150,27 @@ class MarkovGame(object):
152
 
153
  def take_simulation_step(self):
154
  """
155
- TOWRITE
156
  """
157
  terminated, self.simulation_step_log = self.simulation.step(self.actions)
158
  return terminated
159
 
160
  def get_step_log(self) -> StepLog:
161
  """
162
- TOWRITE
163
- TODO: assert actions and simulation have taken step
164
  """
 
 
 
 
 
 
 
 
 
 
 
 
165
  step_log = StepLog(
166
  simulation_step_log=self.simulation_step_log,
167
  action_logs=self.agent_step_logs,
@@ -170,7 +179,7 @@ class MarkovGame(object):
170
 
171
  async def step(self) -> Tuple[bool, StepLog]:
172
  """
173
- TOWRITE
174
  """
175
  await self.set_actions()
176
  terminated = self.take_simulation_step()
@@ -179,7 +188,7 @@ class MarkovGame(object):
179
 
180
  def get_safe_copy(self):
181
  """
182
- TOWRITE
183
  """
184
 
185
  new_markov_game = copy.copy(self)
 
1
  """
2
+ File: mllm/markov_games/markov_game.py
3
+ Summary: Defines the MarkovGame base class plus shared simulation interfaces.
 
 
 
 
 
 
 
 
 
 
 
4
  """
5
+
6
  import asyncio
7
  import copy
8
  import json
 
21
 
22
  @dataclass
23
  class AgentAndActionSafeCopy:
24
+ """Snapshot of an agent, its action, and metadata used for branch replay."""
25
+
26
  action: Any
27
  action_info: AgentActLog
28
  agent_after_action: type[Agent]
 
37
  crn_id: int,
38
  ):
39
  """
40
+ Initialize the Markov game wrapper.
41
+
42
+ Parameters
43
+ ----------
44
+ id:
45
+ Unique rollout identifier (logged into rollout trees).
46
+ agents:
47
+ Mapping of agent_id -> Agent instance.
48
+ simulation:
49
+ Environment implementing the ``Simulation`` interface (IPD, TAS, etc.).
50
+ crn_id:
51
+ Identifier for the common random number stream used by this rollout.
52
  """
53
  self.agents = agents
54
  self.agent_ids = self.agents.keys()
 
129
 
130
  async def set_action_of_agent(self, agent_id: AgentId):
131
  """
132
+ Query a single agent for its next action and store the result locally.
133
  """
134
  agent = self.agents[agent_id]
135
  obs = self.simulation.get_obs_agent(agent_id)
 
139
 
140
  async def set_actions(self):
141
  """
142
+ Query every agent concurrently and populate the cached actions/logs.
143
  """
144
  # background_tasks = set()
145
  tasks = []
 
150
 
151
  def take_simulation_step(self):
152
  """
153
+ Advance the simulation by one step using the cached actions.
154
  """
155
  terminated, self.simulation_step_log = self.simulation.step(self.actions)
156
  return terminated
157
 
158
  def get_step_log(self) -> StepLog:
159
  """
160
+ Package the most recent simulation step and agent logs into a StepLog.
 
161
  """
162
+ if self.simulation_step_log is None:
163
+ raise RuntimeError(
164
+ "Simulation step log is empty; call take_simulation_step() first."
165
+ )
166
+ missing_logs = [
167
+ agent_id for agent_id, log in self.agent_step_logs.items() if log is None
168
+ ]
169
+ if missing_logs:
170
+ raise RuntimeError(
171
+ f"Agent action logs missing for: {', '.join(missing_logs)}. "
172
+ "Ensure set_actions() ran before requesting the step log."
173
+ )
174
  step_log = StepLog(
175
  simulation_step_log=self.simulation_step_log,
176
  action_logs=self.agent_step_logs,
 
179
 
180
  async def step(self) -> Tuple[bool, StepLog]:
181
  """
182
+ Convenience step that collects actions, advances the simulation, and returns the log.
183
  """
184
  await self.set_actions()
185
  terminated = self.take_simulation_step()
 
188
 
189
  def get_safe_copy(self):
190
  """
191
+ Create a shallow copy of the game with deep-copied agents/simulation for branching.
192
  """
193
 
194
  new_markov_game = copy.copy(self)
src_code_for_reproducibility/markov_games/mg_utils.py CHANGED
@@ -1,9 +1,18 @@
 
 
 
 
 
1
  import asyncio
2
  import copy
3
  from collections.abc import Callable
4
  from dataclasses import dataclass
5
 
6
  from mllm.markov_games.ipd.ipd_agent import IPDAgent
 
 
 
 
7
  from mllm.markov_games.ipd.ipd_simulation import IPD
8
  from mllm.markov_games.markov_game import MarkovGame
9
  from mllm.markov_games.negotiation.dond_agent import DealNoDealAgent
@@ -12,17 +21,10 @@ from mllm.markov_games.negotiation.nego_hard_coded_policies import (
12
  HardCodedNegoGreedyPolicy,
13
  HardCodedNegoWelfareMaximizingPolicy,
14
  )
15
- from mllm.markov_games.ipd.Ipd_hard_coded_agents import AlwaysCooperateIPDAgent, AlwaysDefectIPDAgent
16
  from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent
17
  from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressSimulation
18
- from mllm.markov_games.negotiation.tas_agent import TrustAndSplitAgent
19
  from mllm.markov_games.negotiation.tas_rps_agent import TrustAndSplitRPSAgent
20
  from mllm.markov_games.negotiation.tas_rps_simulation import TrustAndSplitRPSSimulation
21
- from mllm.markov_games.negotiation.tas_simple_agent import TrustAndSplitSimpleAgent
22
- from mllm.markov_games.negotiation.tas_simple_simulation import (
23
- TrustAndSplitSimpleSimulation,
24
- )
25
- from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitSimulation
26
  from mllm.markov_games.rollout_tree import (
27
  AgentActLog,
28
  RolloutTreeBranchNode,
@@ -37,6 +39,8 @@ AgentId = str
37
 
38
  @dataclass
39
  class AgentConfig:
 
 
40
  agent_id: str
41
  agent_name: str
42
  agent_class_name: str
@@ -46,6 +50,8 @@ class AgentConfig:
46
 
47
  @dataclass
48
  class MarkovGameConfig:
 
 
49
  id: int
50
  seed: int
51
  simulation_class_name: str
@@ -57,7 +63,9 @@ def init_markov_game_components(
57
  config: MarkovGameConfig, policies: dict[str, Callable[[list[dict]], str]]
58
  ):
59
  """
60
- TOWRITE
 
 
61
  """
62
  agents = {}
63
  agent_names = []
 
1
+ """
2
+ File: mllm/markov_games/mg_utils.py
3
+ Summary: Holds miscellaneous helpers shared across Markov-game modules.
4
+ """
5
+
6
  import asyncio
7
  import copy
8
  from collections.abc import Callable
9
  from dataclasses import dataclass
10
 
11
  from mllm.markov_games.ipd.ipd_agent import IPDAgent
12
+ from mllm.markov_games.ipd.Ipd_hard_coded_agents import (
13
+ AlwaysCooperateIPDAgent,
14
+ AlwaysDefectIPDAgent,
15
+ )
16
  from mllm.markov_games.ipd.ipd_simulation import IPD
17
  from mllm.markov_games.markov_game import MarkovGame
18
  from mllm.markov_games.negotiation.dond_agent import DealNoDealAgent
 
21
  HardCodedNegoGreedyPolicy,
22
  HardCodedNegoWelfareMaximizingPolicy,
23
  )
 
24
  from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent
25
  from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressSimulation
 
26
  from mllm.markov_games.negotiation.tas_rps_agent import TrustAndSplitRPSAgent
27
  from mllm.markov_games.negotiation.tas_rps_simulation import TrustAndSplitRPSSimulation
 
 
 
 
 
28
  from mllm.markov_games.rollout_tree import (
29
  AgentActLog,
30
  RolloutTreeBranchNode,
 
39
 
40
  @dataclass
41
  class AgentConfig:
42
+ """Configuration blob describing one agent in a Markov game spec."""
43
+
44
  agent_id: str
45
  agent_name: str
46
  agent_class_name: str
 
50
 
51
  @dataclass
52
  class MarkovGameConfig:
53
+ """Top-level config that ties together simulation settings and agent configs."""
54
+
55
  id: int
56
  seed: int
57
  simulation_class_name: str
 
63
  config: MarkovGameConfig, policies: dict[str, Callable[[list[dict]], str]]
64
  ):
65
  """
66
+ Materialize Agents and the Simulation described by ``config`` and return a MarkovGame.
67
+
68
+ `policies` is a mapping of policy_id -> callable retrieved from the hosting trainer.
69
  """
70
  agents = {}
71
  agent_names = []
src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py CHANGED
@@ -1,30 +1,45 @@
 
 
 
 
 
1
  import copy
2
  from dataclasses import dataclass
3
  from typing import Any, Dict, List, Tuple
4
 
5
  from numpy.random import default_rng
6
 
 
 
 
 
 
 
7
  from mllm.markov_games.rollout_tree import SimulationStepLog
8
- from mllm.markov_games.negotiation.nego_simulation import Split, NegotiationState, NegotiationObs, NegotiationSimulation
9
  from mllm.utils.get_coagent_id import get_coagent_id
10
 
11
-
12
  AgentId = str
13
 
14
 
15
  @dataclass
16
  class DealNoDealState(NegotiationState):
 
 
17
  item_types: List[str]
18
  values: Dict[AgentId, Dict[str, int]]
19
 
 
20
  @dataclass
21
  class DealNoDealObs(NegotiationObs):
 
 
22
  my_values: Dict[str, int]
23
  item_types: List[str]
24
  previous_values_coagent: Dict[str, int] | None
25
 
26
 
27
  def random_partition_integer(rng, total: int, parts: int) -> List[int]:
 
28
  if parts <= 0:
29
  return []
30
  if total <= 0:
@@ -37,7 +52,9 @@ def random_partition_integer(rng, total: int, parts: int) -> List[int]:
37
  prev = c
38
  return vals
39
 
 
40
  class DealNoDealSimulation(NegotiationSimulation):
 
41
 
42
  def __init__(
43
  self,
@@ -75,7 +92,9 @@ class DealNoDealSimulation(NegotiationSimulation):
75
  if ok1 and ok2:
76
  return {self.agent_ids[0]: a, self.agent_ids[1]: b}
77
 
78
- def _is_valid_allocation(self, allocation: Dict[str, int], stock: Dict[str, int]) -> bool:
 
 
79
  for t in self.item_types:
80
  v = allocation.get(t)
81
  if v is None:
@@ -85,16 +104,18 @@ class DealNoDealSimulation(NegotiationSimulation):
85
  if v < 0 or v > int(stock.get(t, 0)):
86
  return False
87
  return True
88
-
89
  def set_new_round_of_variant(self):
90
  # Keep same values, resample stock
91
  self.state.quantities = self._sample_stock()
92
 
93
- def get_info_of_variant(self, state: NegotiationState, actions: Dict[AgentId, Any]) -> Dict[str, Any]:
 
 
94
  return {
95
  "quantities": copy.deepcopy(state.quantities),
96
  "values": copy.deepcopy(state.values),
97
- 'splits': copy.deepcopy(state.splits),
98
  }
99
 
100
  def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
@@ -105,11 +126,15 @@ class DealNoDealSimulation(NegotiationSimulation):
105
  split_b = splits[self.agent_ids[1]].items_given_to_self
106
  rewards = {self.agent_ids[0]: 0, self.agent_ids[1]: 0}
107
  for t in self.item_types:
108
- # If not complementary, return 0!
109
  if not split_a[t] + split_b[t] == self.state.quantities[t]:
110
  return {self.agent_ids[0]: 0, self.agent_ids[1]: 0}
111
- rewards[self.agent_ids[0]] += split_a[t] * self.state.values[self.agent_ids[0]][t]
112
- rewards[self.agent_ids[1]] += split_b[t] * self.state.values[self.agent_ids[1]][t]
 
 
 
 
113
  return rewards
114
 
115
  def get_obs(self):
@@ -149,5 +174,3 @@ class DealNoDealSimulation(NegotiationSimulation):
149
  item_types=list(self.item_types),
150
  )
151
  return self.get_obs()
152
-
153
-
 
1
+ """
2
+ File: mllm/markov_games/negotiation/dond_simulation.py
3
+ Summary: Simulates Deal-or-No-Deal negotiation games and logs rollouts.
4
+ """
5
+
6
  import copy
7
  from dataclasses import dataclass
8
  from typing import Any, Dict, List, Tuple
9
 
10
  from numpy.random import default_rng
11
 
12
+ from mllm.markov_games.negotiation.nego_simulation import (
13
+ NegotiationObs,
14
+ NegotiationSimulation,
15
+ NegotiationState,
16
+ Split,
17
+ )
18
  from mllm.markov_games.rollout_tree import SimulationStepLog
 
19
  from mllm.utils.get_coagent_id import get_coagent_id
20
 
 
21
  AgentId = str
22
 
23
 
24
  @dataclass
25
  class DealNoDealState(NegotiationState):
26
+ """NegotiationState with per-agent value tables and item taxonomy."""
27
+
28
  item_types: List[str]
29
  values: Dict[AgentId, Dict[str, int]]
30
 
31
+
32
  @dataclass
33
  class DealNoDealObs(NegotiationObs):
34
+ """Observation that reveals own values and (lagged) opponent values."""
35
+
36
  my_values: Dict[str, int]
37
  item_types: List[str]
38
  previous_values_coagent: Dict[str, int] | None
39
 
40
 
41
  def random_partition_integer(rng, total: int, parts: int) -> List[int]:
42
+ """Sample non-negative integers summing to ``total`` across ``parts`` buckets."""
43
  if parts <= 0:
44
  return []
45
  if total <= 0:
 
52
  prev = c
53
  return vals
54
 
55
+
56
  class DealNoDealSimulation(NegotiationSimulation):
57
+ """NegotiationSimulation variant implementing the Rubinstein-style Deal-or-No-Deal."""
58
 
59
  def __init__(
60
  self,
 
92
  if ok1 and ok2:
93
  return {self.agent_ids[0]: a, self.agent_ids[1]: b}
94
 
95
+ def _is_valid_allocation(
96
+ self, allocation: Dict[str, int], stock: Dict[str, int]
97
+ ) -> bool:
98
  for t in self.item_types:
99
  v = allocation.get(t)
100
  if v is None:
 
104
  if v < 0 or v > int(stock.get(t, 0)):
105
  return False
106
  return True
107
+
108
  def set_new_round_of_variant(self):
109
  # Keep same values, resample stock
110
  self.state.quantities = self._sample_stock()
111
 
112
+ def get_info_of_variant(
113
+ self, state: NegotiationState, actions: Dict[AgentId, Any]
114
+ ) -> Dict[str, Any]:
115
  return {
116
  "quantities": copy.deepcopy(state.quantities),
117
  "values": copy.deepcopy(state.values),
118
+ "splits": copy.deepcopy(state.splits),
119
  }
120
 
121
  def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
 
126
  split_b = splits[self.agent_ids[1]].items_given_to_self
127
  rewards = {self.agent_ids[0]: 0, self.agent_ids[1]: 0}
128
  for t in self.item_types:
129
+ # If not complementary, return 0!
130
  if not split_a[t] + split_b[t] == self.state.quantities[t]:
131
  return {self.agent_ids[0]: 0, self.agent_ids[1]: 0}
132
+ rewards[self.agent_ids[0]] += (
133
+ split_a[t] * self.state.values[self.agent_ids[0]][t]
134
+ )
135
+ rewards[self.agent_ids[1]] += (
136
+ split_b[t] * self.state.values[self.agent_ids[1]][t]
137
+ )
138
  return rewards
139
 
140
  def get_obs(self):
 
174
  item_types=list(self.item_types),
175
  )
176
  return self.get_obs()
 
 
src_code_for_reproducibility/markov_games/negotiation/nego_simulation.py CHANGED
@@ -1,7 +1,8 @@
1
  """
2
- Negotiation simulation environment
3
- other agent is set at the start of every round. Even though current agent changes over message turns in a round.
4
  """
 
5
  import copy
6
  from abc import abstractmethod
7
  from dataclasses import dataclass
@@ -18,16 +19,22 @@ AgentId = str
18
 
19
  @dataclass
20
  class Split:
 
 
21
  items_given_to_self: Dict[str, int]
22
 
23
 
24
  @dataclass
25
  class Message:
 
 
26
  message: str
27
 
28
 
29
  @dataclass # gets extended by variants
30
  class NegotiationState:
 
 
31
  round_nb: int
32
  last_message: str
33
  current_agent: AgentId
@@ -44,6 +51,8 @@ class NegotiationState:
44
 
45
  @dataclass # gets extended by variants
46
  class NegotiationObs:
 
 
47
  round_nb: int
48
  last_message: str
49
  quota_messages_per_agent_per_round: int
@@ -134,12 +143,14 @@ class NegotiationSimulation(Simulation):
134
 
135
  @abstractmethod
136
  def set_new_round_of_variant(self):
 
137
  pass
138
 
139
  @abstractmethod
140
  def get_info_of_variant(
141
  self, state: NegotiationState, actions: Dict[AgentId, Any]
142
  ) -> Dict[str, Any]:
 
143
  pass
144
 
145
  def step(self, actions: Any) -> Tuple[bool, SimulationStepLog]:
@@ -190,7 +201,7 @@ class NegotiationSimulation(Simulation):
190
  is_last_timestep_in_round = True
191
  done = self.state.round_nb >= self.nb_of_rounds
192
 
193
- # Message phase
194
  elif isinstance(action, Message):
195
  self.state.last_message = action.message
196
  self.state.nb_messages_sent[current_agent] += 1
 
1
  """
2
+ File: mllm/markov_games/negotiation/nego_simulation.py
3
+ Summary: Simulation harness for general negotiation environments.
4
  """
5
+
6
  import copy
7
  from abc import abstractmethod
8
  from dataclasses import dataclass
 
19
 
20
  @dataclass
21
  class Split:
22
+ """Structured proposal describing how many units of each item an agent keeps."""
23
+
24
  items_given_to_self: Dict[str, int]
25
 
26
 
27
  @dataclass
28
  class Message:
29
+ """Single chat utterance exchanged during the negotiation phase."""
30
+
31
  message: str
32
 
33
 
34
  @dataclass # gets extended by variants
35
  class NegotiationState:
36
+ """Full simulator state snapshot shared by all negotiation variants."""
37
+
38
  round_nb: int
39
  last_message: str
40
  current_agent: AgentId
 
51
 
52
  @dataclass # gets extended by variants
53
  class NegotiationObs:
54
+ """Observation presented to agents each turn (base fields; variants extend)."""
55
+
56
  round_nb: int
57
  last_message: str
58
  quota_messages_per_agent_per_round: int
 
143
 
144
  @abstractmethod
145
  def set_new_round_of_variant(self):
146
+ """Variant hook: sample new private values / stock before each round."""
147
  pass
148
 
149
  @abstractmethod
150
  def get_info_of_variant(
151
  self, state: NegotiationState, actions: Dict[AgentId, Any]
152
  ) -> Dict[str, Any]:
153
+ """Variant hook: populate SimulationStepLog.info with custom diagnostics."""
154
  pass
155
 
156
  def step(self, actions: Any) -> Tuple[bool, SimulationStepLog]:
 
201
  is_last_timestep_in_round = True
202
  done = self.state.round_nb >= self.nb_of_rounds
203
 
204
+ # Message phase: roll the conversation forward a single turn.
205
  elif isinstance(action, Message):
206
  self.state.last_message = action.message
207
  self.state.nb_messages_sent[current_agent] += 1
src_code_for_reproducibility/markov_games/negotiation/tas_agent.py CHANGED
@@ -1,9 +1,16 @@
 
 
 
 
 
1
  from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
2
  from mllm.markov_games.negotiation.nego_simulation import Split
3
  from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitObs
4
 
5
 
6
  class TrustAndSplitAgent(NegotiationAgent):
 
 
7
  def __init__(self, num_message_chars, *args, **kwargs):
8
  self.num_message_chars = num_message_chars
9
  super().__init__(*args, **kwargs)
@@ -58,12 +65,14 @@ class TrustAndSplitAgent(NegotiationAgent):
58
  self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
59
 
60
  def get_message_regex(self, observation: TrustAndSplitObs) -> str:
 
61
  return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
62
 
63
  # def get_message_regex(self, observation: TrustAndSplitObs) -> str:
64
  # return rf"(?s).{{0,{self.num_message_chars}}}"
65
 
66
  def get_split_regex(self, observation: TrustAndSplitObs) -> str:
 
67
  items = list(observation.quantities.keys())
68
  # Accept both singular and plural forms
69
  item_pattern = "|".join(
@@ -75,6 +84,7 @@ class TrustAndSplitAgent(NegotiationAgent):
75
  def get_split_action(
76
  self, policy_output: str, observation: TrustAndSplitObs
77
  ) -> Split:
 
78
  items = list(observation.quantities.keys())
79
  import re as _re
80
 
 
1
+ """
2
+ File: mllm/markov_games/negotiation/tas_agent.py
3
+ Summary: Agent implementation for Take-and-Split negotiations.
4
+ """
5
+
6
  from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
7
  from mllm.markov_games.negotiation.nego_simulation import Split
8
  from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitObs
9
 
10
 
11
  class TrustAndSplitAgent(NegotiationAgent):
12
+ """Prompt/template wrapper for the classic multi-item Take-and-Split benchmark."""
13
+
14
  def __init__(self, num_message_chars, *args, **kwargs):
15
  self.num_message_chars = num_message_chars
16
  super().__init__(*args, **kwargs)
 
65
  self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
66
 
67
  def get_message_regex(self, observation: TrustAndSplitObs) -> str:
68
+ """Constrain chat to bounded XML tags for stable parsing."""
69
  return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
70
 
71
  # def get_message_regex(self, observation: TrustAndSplitObs) -> str:
72
  # return rf"(?s).{{0,{self.num_message_chars}}}"
73
 
74
  def get_split_regex(self, observation: TrustAndSplitObs) -> str:
75
+ """Allow natural-language item names while still returning machine-parsable XML."""
76
  items = list(observation.quantities.keys())
77
  # Accept both singular and plural forms
78
  item_pattern = "|".join(
 
84
  def get_split_action(
85
  self, policy_output: str, observation: TrustAndSplitObs
86
  ) -> Split:
87
+ """Convert human-readable allocation text back into canonical item IDs."""
88
  items = list(observation.quantities.keys())
89
  import re as _re
90
 
src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import copy
2
  from collections.abc import Callable
3
  from dataclasses import dataclass
@@ -15,6 +20,8 @@ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
15
 
16
 
17
  class TrustAndSplitRPSAgent(NegotiationAgent):
 
 
18
  def __init__(
19
  self,
20
  num_message_chars: int,
@@ -88,6 +95,7 @@ class TrustAndSplitRPSAgent(NegotiationAgent):
88
  self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
89
 
90
  def get_message_regex(self, observation: TrustAndSplitRPSObs) -> str:
 
91
  if self.message_start_end_format:
92
  return (
93
  rf"<<message_start>>[\s\S]{{0,{self.num_message_chars}}}<<message_end>>"
@@ -96,6 +104,7 @@ class TrustAndSplitRPSAgent(NegotiationAgent):
96
  return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
97
 
98
  def get_split_regex(self, observation: TrustAndSplitRPSObs) -> str:
 
99
  if self.proposal_start_end_format:
100
  return r"<<proposal_start>> ?(10|[0-9]) ?<<proposal_end>>"
101
  else:
@@ -104,6 +113,7 @@ class TrustAndSplitRPSAgent(NegotiationAgent):
104
  def get_split_action(
105
  self, policy_output: str, observation: TrustAndSplitRPSObs
106
  ) -> Split:
 
107
  import re as _re
108
 
109
  if self.proposal_start_end_format:
 
1
+ """
2
+ File: mllm/markov_games/negotiation/tas_rps_agent.py
3
+ Summary: Agent logic for TAS Rock-Paper-Scissors blended game.
4
+ """
5
+
6
  import copy
7
  from collections.abc import Callable
8
  from dataclasses import dataclass
 
20
 
21
 
22
  class TrustAndSplitRPSAgent(NegotiationAgent):
23
+ """NegotiationAgent that reasons about hidden hands before submitting TAS splits."""
24
+
25
  def __init__(
26
  self,
27
  num_message_chars: int,
 
95
  self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
96
 
97
  def get_message_regex(self, observation: TrustAndSplitRPSObs) -> str:
98
+ """Switch between <message>...</message> and <<message_start>> formats on demand."""
99
  if self.message_start_end_format:
100
  return (
101
  rf"<<message_start>>[\s\S]{{0,{self.num_message_chars}}}<<message_end>>"
 
104
  return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
105
 
106
  def get_split_regex(self, observation: TrustAndSplitRPSObs) -> str:
107
+ """Force single-number proposals inside whichever tag style the config selected."""
108
  if self.proposal_start_end_format:
109
  return r"<<proposal_start>> ?(10|[0-9]) ?<<proposal_end>>"
110
  else:
 
113
  def get_split_action(
114
  self, policy_output: str, observation: TrustAndSplitRPSObs
115
  ) -> Split:
116
+ """Parse the proposal tag (or raw integer fallback) into a Split."""
117
  import re as _re
118
 
119
  if self.proposal_start_end_format:
src_code_for_reproducibility/markov_games/rollout_tree.py CHANGED
@@ -1,5 +1,6 @@
1
  """
2
- TODO: add parent to nodes so that some verification can be done. For instance, to ensure that node reward keys match the parent node.
 
3
  """
4
 
5
  from __future__ import annotations
@@ -18,11 +19,15 @@ AgentId = str
18
 
19
 
20
  class SimulationStepLog(BaseModel):
 
 
21
  rewards: dict[AgentId, float]
22
  info: Any = None
23
 
24
 
25
  class AgentActLog(BaseModel):
 
 
26
  chat_turns: list[ChatTurn] | None
27
  info: Any = None
28
 
@@ -55,6 +60,8 @@ class StepLog(BaseModel):
55
 
56
 
57
  class RolloutTreeNode(BaseModel):
 
 
58
  step_log: StepLog
59
  time_step: int
60
  child: RolloutTreeNode | RolloutTreeBranchNode | None = None
@@ -70,6 +77,8 @@ class RolloutTreeBranchNode(BaseModel):
70
 
71
 
72
  class RolloutTreeRootNode(BaseModel):
 
 
73
  id: int
74
  crn_id: int # ID of the rng used to generate this rollout tree
75
  child: RolloutTreeNode | RolloutTreeBranchNode | None = None
 
1
  """
2
+ File: mllm/markov_games/rollout_tree.py
3
+ Summary: Defines rollout tree data structures and serialization helpers.
4
  """
5
 
6
  from __future__ import annotations
 
19
 
20
 
21
  class SimulationStepLog(BaseModel):
22
+ """Minimal snapshot of environment-side rewards and auxiliary info."""
23
+
24
  rewards: dict[AgentId, float]
25
  info: Any = None
26
 
27
 
28
  class AgentActLog(BaseModel):
29
+ """LLM-side provenance for an action (chat turns + metadata)."""
30
+
31
  chat_turns: list[ChatTurn] | None
32
  info: Any = None
33
 
 
60
 
61
 
62
  class RolloutTreeNode(BaseModel):
63
+ """Single timestep of the main trajectory (or a branch) plus linkage."""
64
+
65
  step_log: StepLog
66
  time_step: int
67
  child: RolloutTreeNode | RolloutTreeBranchNode | None = None
 
77
 
78
 
79
  class RolloutTreeRootNode(BaseModel):
80
+ """Entry point for serialized rollouts (main path plus optional branches)."""
81
+
82
  id: int
83
  crn_id: int # ID of the rng used to generate this rollout tree
84
  child: RolloutTreeNode | RolloutTreeBranchNode | None = None
src_code_for_reproducibility/markov_games/run_markov_games.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import asyncio
2
  from collections.abc import Callable
3
  from dataclasses import dataclass
@@ -14,6 +19,12 @@ async def run_markov_games(
14
  output_folder: str,
15
  markov_games: list[MarkovGame],
16
  ) -> list[RolloutTreeRootNode]:
 
 
 
 
 
 
17
  tasks = []
18
  for mg in markov_games:
19
  tasks.append(
 
1
+ """
2
+ File: mllm/markov_games/run_markov_games.py
3
+ Summary: CLI entry point for running configured Markov-game experiments.
4
+ """
5
+
6
  import asyncio
7
  from collections.abc import Callable
8
  from dataclasses import dataclass
 
19
  output_folder: str,
20
  markov_games: list[MarkovGame],
21
  ) -> list[RolloutTreeRootNode]:
22
+ """
23
+ Kick off multiple Markov game rollouts concurrently and return their trees.
24
+
25
+ Parameters mirror the Hydra configs (runner callable + kwargs) so callers can
26
+ choose ``LinearRunner``, ``AlternativeActionsRunner`` or future variants.
27
+ """
28
  tasks = []
29
  for mg in markov_games:
30
  tasks.append(
src_code_for_reproducibility/markov_games/simulation.py CHANGED
@@ -1,8 +1,6 @@
1
  """
2
- A Simulation is the environment of a Markov Game.
3
- The Simulation is not responsible for properly checking / formatting the responses of LLM's.
4
- This is the job of the `Agent` class.
5
- Simulations expect clean actions, and are defined similarly to `gymnasium` environments, except that they are adapted for the Multi-agent setting.
6
  """
7
 
8
  from abc import ABC, abstractmethod
@@ -22,59 +20,68 @@ class Simulation(ABC):
22
  @abstractmethod
23
  def step(self, actions: Any) -> Tuple[bool, SimulationStepLog]:
24
  """
25
- Returns terminated, info
 
 
 
 
 
 
 
26
  """
27
  raise NotImplementedError
28
 
29
  def get_obs(self):
30
- """Returns all agent observations in dict
31
-
32
- Returns:
33
- observations
34
- """
35
  raise NotImplementedError
36
 
37
  def get_obs_agent(self, agent_id):
38
- """Returns observation for agent_id"""
39
  raise NotImplementedError
40
 
41
  def get_obs_size(self):
42
- """Returns the shape of the observation"""
43
  raise NotImplementedError
44
 
45
  def get_state(self):
 
46
  raise NotImplementedError
47
 
48
  def get_state_size(self):
49
- """Returns the shape of the state"""
50
  raise NotImplementedError
51
 
52
  def get_avail_actions(self):
 
53
  raise NotImplementedError
54
 
55
  def get_avail_agent_actions(self, agent_id):
56
- """Returns the available actions for agent_id"""
57
  raise NotImplementedError
58
 
59
  def get_total_actions(self):
60
- """Returns the total number of actions an agent could ever take"""
61
- # TODO: This is only suitable for a discrete 1 dimensional action space for each agent
 
 
62
  raise NotImplementedError
63
 
64
  def get_safe_copy(self):
65
  """
66
- Return copy of the agent object that is decorrelated from the original object.
67
  """
68
  raise NotImplementedError
69
 
70
  def reset(self):
71
- """Returns initial observations and states"""
72
  raise NotImplementedError
73
 
74
  def render(self):
 
75
  raise NotImplementedError
76
 
77
  def close(self):
 
78
  raise NotImplementedError
79
 
80
  # def seed(self):
 
1
  """
2
+ File: mllm/markov_games/simulation.py
3
+ Summary: Core simulation loop utilities and step logging for Markov games.
 
 
4
  """
5
 
6
  from abc import ABC, abstractmethod
 
20
  @abstractmethod
21
  def step(self, actions: Any) -> Tuple[bool, SimulationStepLog]:
22
  """
23
+ Advance the environment by one logical tick using ``actions``.
24
+
25
+ Returns
26
+ -------
27
+ terminated: bool
28
+ Whether the episode has finished.
29
+ SimulationStepLog
30
+ Reward/info bundle describing this transition.
31
  """
32
  raise NotImplementedError
33
 
34
  def get_obs(self):
35
+ """Return a dict mapping agent_id -> observation for *all* agents."""
 
 
 
 
36
  raise NotImplementedError
37
 
38
  def get_obs_agent(self, agent_id):
39
+ """Return the observation for a single agent."""
40
  raise NotImplementedError
41
 
42
  def get_obs_size(self):
43
+ """Describe the observation tensor shape (useful for critic heads)."""
44
  raise NotImplementedError
45
 
46
  def get_state(self):
47
+ """Return the privileged simulator state if available."""
48
  raise NotImplementedError
49
 
50
  def get_state_size(self):
51
+ """Describe the state tensor shape."""
52
  raise NotImplementedError
53
 
54
  def get_avail_actions(self):
55
+ """Return the global action mask/tensor if the space is discrete."""
56
  raise NotImplementedError
57
 
58
  def get_avail_agent_actions(self, agent_id):
59
+ """Return the available action mask for a given agent."""
60
  raise NotImplementedError
61
 
62
  def get_total_actions(self):
63
+ """Returns the total number of actions an agent could ever take.
64
+
65
+ Implementations currently assume a discrete, one-dimensional action space per agent.
66
+ """
67
  raise NotImplementedError
68
 
69
  def get_safe_copy(self):
70
  """
71
+ Return copy of the simulator that shares no mutable state with the original.
72
  """
73
  raise NotImplementedError
74
 
75
  def reset(self):
76
+ """Reset to the initial state and return the starting observations."""
77
  raise NotImplementedError
78
 
79
  def render(self):
80
+ """Optional human-facing visualization."""
81
  raise NotImplementedError
82
 
83
  def close(self):
84
+ """Release any owned resources (files, processes, etc.)."""
85
  raise NotImplementedError
86
 
87
  # def seed(self):
src_code_for_reproducibility/markov_games/statistics_runner.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import gc
@@ -36,17 +41,20 @@ def _iterate_main_nodes(root: RolloutTreeRootNode) -> Iterator[RolloutTreeNode]:
36
  def iterate_main_simulation_logs(
37
  root: RolloutTreeRootNode,
38
  ) -> Iterator[SimulationStepLog]:
 
39
  for node in _iterate_main_nodes(root):
40
  yield node.step_log.simulation_step_log
41
 
42
 
43
  def stream_rollout_files(iteration_folder: Path) -> Iterator[Path]:
 
44
  for p in iteration_folder.rglob("*.rt.pkl"):
45
  if p.is_file():
46
  yield p
47
 
48
 
49
  def load_root(path: Path) -> RolloutTreeRootNode:
 
50
  with open(path, "rb") as f:
51
  data = pickle.load(f)
52
  return RolloutTreeRootNode.model_validate(data)
@@ -54,6 +62,8 @@ def load_root(path: Path) -> RolloutTreeRootNode:
54
 
55
  @dataclass
56
  class StatRecord:
 
 
57
  mgid: int
58
  crn_id: Optional[int]
59
  iteration: str
 
1
+ """
2
+ File: mllm/markov_games/statistics_runner.py
3
+ Summary: Executes multiple rollouts to compute experiment statistics.
4
+ """
5
+
6
  from __future__ import annotations
7
 
8
  import gc
 
41
  def iterate_main_simulation_logs(
42
  root: RolloutTreeRootNode,
43
  ) -> Iterator[SimulationStepLog]:
44
+ """Yield ``SimulationStepLog`` objects along the main (non-branch) path."""
45
  for node in _iterate_main_nodes(root):
46
  yield node.step_log.simulation_step_log
47
 
48
 
49
  def stream_rollout_files(iteration_folder: Path) -> Iterator[Path]:
50
+ """Iterate over every ``*.rt.pkl`` file under an iteration directory."""
51
  for p in iteration_folder.rglob("*.rt.pkl"):
52
  if p.is_file():
53
  yield p
54
 
55
 
56
  def load_root(path: Path) -> RolloutTreeRootNode:
57
+ """Load and validate a rollout tree from disk."""
58
  with open(path, "rb") as f:
59
  data = pickle.load(f)
60
  return RolloutTreeRootNode.model_validate(data)
 
62
 
63
  @dataclass
64
  class StatRecord:
65
+ """Convenience container for serialized stat rows."""
66
+
67
  mgid: int
68
  crn_id: Optional[int]
69
  iteration: str
src_code_for_reproducibility/models/__init__.py CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ File: mllm/models/__init__.py
3
+ Summary: Exports model-layer utilities from the models package.
4
+ """
src_code_for_reproducibility/models/__pycache__/human_policy.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/models/__pycache__/human_policy.cpython-312.pyc and b/src_code_for_reproducibility/models/__pycache__/human_policy.cpython-312.pyc differ
 
src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc and b/src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc differ
 
src_code_for_reproducibility/models/__pycache__/inference_backend_dummy.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/models/__pycache__/inference_backend_dummy.cpython-312.pyc and b/src_code_for_reproducibility/models/__pycache__/inference_backend_dummy.cpython-312.pyc differ
 
src_code_for_reproducibility/models/__pycache__/inference_backend_vllm.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/models/__pycache__/inference_backend_vllm.cpython-312.pyc and b/src_code_for_reproducibility/models/__pycache__/inference_backend_vllm.cpython-312.pyc differ
 
src_code_for_reproducibility/models/__pycache__/large_language_model_api.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/models/__pycache__/large_language_model_api.cpython-312.pyc and b/src_code_for_reproducibility/models/__pycache__/large_language_model_api.cpython-312.pyc differ
 
src_code_for_reproducibility/models/__pycache__/large_language_model_local.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/models/__pycache__/large_language_model_local.cpython-312.pyc and b/src_code_for_reproducibility/models/__pycache__/large_language_model_local.cpython-312.pyc differ
 
src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-312.pyc and b/src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-312.pyc differ
 
src_code_for_reproducibility/models/adapter_training_wrapper.py CHANGED
@@ -1,11 +1,14 @@
1
- import torch
2
- import torch.nn as nn
 
 
 
3
  import logging
4
  from typing import Union
5
- from peft import (
6
- LoraConfig,
7
- get_peft_model,
8
- )
9
 
10
  logger = logging.getLogger(__name__)
11
 
@@ -18,13 +21,14 @@ class AdapterWrapper(nn.Module):
18
  • exposes only the parameters that should be trained for that adapter
19
  (plus whatever extra modules you name).
20
  """
 
21
  def __init__(
22
  self,
23
  shared_llm: nn.Module,
24
  adapter_id: str,
25
  lora_config: dict,
26
  path: Union[str, None] = None,
27
- ):
28
  super().__init__()
29
  self.shared_llm = shared_llm
30
  self.adapter_id = adapter_id
@@ -47,7 +51,9 @@ class AdapterWrapper(nn.Module):
47
  adapter_name=adapter_id,
48
  )
49
  loaded_from = path
50
- except Exception as exc: # noqa: BLE001 - want to log any load failure context
 
 
51
  logger.warning(
52
  f"Adapter '{adapter_id}': failed to load from '{path}': {exc}"
53
  )
 
1
+ """
2
+ File: mllm/models/adapter_training_wrapper.py
3
+ Summary: Wraps a shared LLM with adapter-specific PEFT handling for training.
4
+ """
5
+
6
  import logging
7
  from typing import Union
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ from peft import LoraConfig, get_peft_model
12
 
13
  logger = logging.getLogger(__name__)
14
 
 
21
  • exposes only the parameters that should be trained for that adapter
22
  (plus whatever extra modules you name).
23
  """
24
+
25
  def __init__(
26
  self,
27
  shared_llm: nn.Module,
28
  adapter_id: str,
29
  lora_config: dict,
30
  path: Union[str, None] = None,
31
+ ):
32
  super().__init__()
33
  self.shared_llm = shared_llm
34
  self.adapter_id = adapter_id
 
51
  adapter_name=adapter_id,
52
  )
53
  loaded_from = path
54
+ except (
55
+ Exception
56
+ ) as exc: # noqa: BLE001 - want to log any load failure context
57
  logger.warning(
58
  f"Adapter '{adapter_id}': failed to load from '{path}': {exc}"
59
  )
src_code_for_reproducibility/models/human_policy.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import asyncio
2
  import os
3
  import re
 
1
+ """
2
+ File: mllm/models/human_policy.py
3
+ Summary: Implements an interactive human-in-the-loop policy for experiments.
4
+ """
5
+
6
  import asyncio
7
  import os
8
  import re
src_code_for_reproducibility/models/inference_backend.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  from abc import ABC, abstractmethod
2
  from dataclasses import dataclass
3
  from typing import Any, Optional
 
1
+ """
2
+ File: mllm/models/inference_backend.py
3
+ Summary: Declares the inference backend interface and shared dataclasses.
4
+ """
5
+
6
  from abc import ABC, abstractmethod
7
  from dataclasses import dataclass
8
  from typing import Any, Optional
src_code_for_reproducibility/models/inference_backend_dummy.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import asyncio
2
  from typing import Optional
3
 
 
1
+ """
2
+ File: mllm/models/inference_backend_dummy.py
3
+ Summary: Stub inference backend that returns synthetic completions for tests.
4
+ """
5
+
6
  import asyncio
7
  from typing import Optional
8
 
src_code_for_reproducibility/models/inference_backend_vllm.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  import asyncio
2
  import re
3
  from typing import Optional
@@ -23,19 +28,12 @@ class VLLMAsyncBackend(LLMInferenceBackend):
23
  sampling_params: dict = {},
24
  ):
25
  self.model_name = model_name
26
- # self.adapter_paths = adapter_paths or {}
27
- # self.current_adapter = None
28
- # self.vllm_adapter_ids = {
29
- # adapter_id: generate_short_id() for adapter_id in adapter_paths.keys()
30
- # }
31
  self.vllm_adapter_ids = {}
32
  ea = dict(model=model_name, **engine_init_kwargs)
33
- # ea["enable_lora"] = True
34
- # ea["max_loras"] = len(self.vllm_adapter_ids)
35
- # ea["enable_sleep_mode"] = True
36
  self.engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**ea))
37
 
38
  self.sampling_params = sampling_params
 
39
 
40
  def prepare_adapter(
41
  self,
@@ -43,7 +41,6 @@ class VLLMAsyncBackend(LLMInferenceBackend):
43
  adapter_path: Optional[str],
44
  weights_got_updated: bool,
45
  ) -> None:
46
- # self.current_adapter = adapter_id
47
  if weights_got_updated:
48
  self.vllm_adapter_ids[adapter_id] = generate_short_id()
49
  self.current_lora_request = LoRARequest(
@@ -96,9 +93,6 @@ class VLLMAsyncBackend(LLMInferenceBackend):
96
  ]
97
  log_probs = torch.tensor(log_probs)
98
  out_token_ids = torch.tensor(out_token_ids, dtype=torch.long)
99
- # for out_token_id, logprob_dict in zip(out_token_ids, res.outputs[0].logprobs):
100
- # if logprob_dict[out_token_id].logprob < -1:
101
- # print(f"High negative logprob {logprob_dict[out_token_id].logprob} for {logprob_dict}")
102
  content = raw_text
103
  reasoning_content = None
104
 
 
1
+ """
2
+ File: mllm/models/inference_backend_vllm.py
3
+ Summary: Connects to in-process vLLM instances for batched generation.
4
+ """
5
+
6
  import asyncio
7
  import re
8
  from typing import Optional
 
28
  sampling_params: dict = {},
29
  ):
30
  self.model_name = model_name
 
 
 
 
 
31
  self.vllm_adapter_ids = {}
32
  ea = dict(model=model_name, **engine_init_kwargs)
 
 
 
33
  self.engine = AsyncLLMEngine.from_engine_args(AsyncEngineArgs(**ea))
34
 
35
  self.sampling_params = sampling_params
36
+ self.tokenizer = tokenizer
37
 
38
  def prepare_adapter(
39
  self,
 
41
  adapter_path: Optional[str],
42
  weights_got_updated: bool,
43
  ) -> None:
 
44
  if weights_got_updated:
45
  self.vllm_adapter_ids[adapter_id] = generate_short_id()
46
  self.current_lora_request = LoRARequest(
 
93
  ]
94
  log_probs = torch.tensor(log_probs)
95
  out_token_ids = torch.tensor(out_token_ids, dtype=torch.long)
 
 
 
96
  content = raw_text
97
  reasoning_content = None
98
 
src_code_for_reproducibility/models/large_language_model_api.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  import asyncio
@@ -13,7 +18,7 @@ from openai import AsyncOpenAI, OpenAIError
13
  from mllm.markov_games.rollout_tree import ChatTurn
14
  from mllm.models.inference_backend import LLMInferenceOutput
15
 
16
- # TODO: Get this automatically from OpenAI
17
  reasoning_models = [
18
  "gpt-5-nano",
19
  "gpt-5-mini",
@@ -119,9 +124,7 @@ class LargeLanguageModelOpenAI:
119
  agent_id: str,
120
  regex: Optional[str] = None,
121
  ) -> LLMInferenceOutput:
122
- # Remove any non-role/content keys from the prompt else openai will error
123
-
124
- # TODO:
125
  prompt = [{"role": p.role, "content": p.content} for p in state]
126
 
127
  # if self.sleep_between_requests:
 
1
+ """
2
+ File: mllm/models/large_language_model_api.py
3
+ Summary: Implements API-based large-language-model inference adapters.
4
+ """
5
+
6
  from __future__ import annotations
7
 
8
  import asyncio
 
18
  from mllm.markov_games.rollout_tree import ChatTurn
19
  from mllm.models.inference_backend import LLMInferenceOutput
20
 
21
+ # Static list copied from the public OpenAI docs until a discovery endpoint is exposed.
22
  reasoning_models = [
23
  "gpt-5-nano",
24
  "gpt-5-mini",
 
124
  agent_id: str,
125
  regex: Optional[str] = None,
126
  ) -> LLMInferenceOutput:
127
+ # Remove any non-role/content keys from the prompt else openai will error.
 
 
128
  prompt = [{"role": p.role, "content": p.content} for p in state]
129
 
130
  # if self.sleep_between_requests:
src_code_for_reproducibility/models/large_language_model_local.py CHANGED
@@ -1,5 +1,6 @@
1
  """
2
- TODO: Figure out how to tweak SGlang not to go OOM when batch size is 32. See https://github.com/sgl-project/sglang/issues/6309.
 
3
  """
4
 
5
  import logging
@@ -16,23 +17,14 @@ import httpx
16
  import requests
17
  import torch
18
  import torch.nn as nn
19
-
20
- # from sglang.utils import (
21
- # launch_server_cmd,
22
- # print_highlight,
23
- # terminate_process,
24
- # wait_for_server,
25
- # )
26
  from torch.optim import SGD, Adam, AdamW, RMSprop
27
  from transformers import AutoModelForCausalLM, AutoTokenizer
28
- from trl import AutoModelForCausalLMWithValueHead
29
 
30
  from mllm.chat_utils.apply_template import chat_turns_to_token_ids
31
  from mllm.markov_games.rollout_tree import ChatTurn
32
  from mllm.models.adapter_training_wrapper import AdapterWrapper
33
  from mllm.models.inference_backend import LLMInferenceOutput
34
  from mllm.models.inference_backend_dummy import DummyInferenceBackend
35
- from mllm.models.inference_backend_sglang import SGLangOfflineBackend
36
  from mllm.models.inference_backend_vllm import VLLMAsyncBackend
37
 
38
  logger = logging.getLogger(__name__)
@@ -44,7 +36,7 @@ PolicyID = str
44
 
45
  class LeanLocalLLM:
46
  """
47
- TOWRITE
48
  """
49
 
50
  def __init__(
@@ -55,7 +47,7 @@ class LeanLocalLLM:
55
  hf_kwargs: dict = {},
56
  adapter_configs: dict = {},
57
  output_directory: str = "./models/",
58
- inference_backend: Literal["vllm", "sglang", "dummy"] = "vllm",
59
  inference_backend_sampling_params: dict = {},
60
  inference_backend_init_kwargs: dict = {},
61
  initial_adapter_paths: dict[str, str] | None = None,
@@ -180,15 +172,7 @@ class LeanLocalLLM:
180
  # Init inference backend
181
  # ---------------------------------------------------------
182
 
183
- if inference_backend == "sglang":
184
- self.inference_backend = SGLangOfflineBackend(
185
- model_name=self.model_name,
186
- save_path=self.save_path,
187
- adapter_paths=self.adapter_paths,
188
- tokenizer=self.tokenizer,
189
- kwargs=inference_backend_init_kwargs,
190
- )
191
- elif inference_backend == "vllm":
192
  self.inference_backend = VLLMAsyncBackend(
193
  model_name=self.model_name,
194
  # adapter_paths=self.adapter_paths,
@@ -206,7 +190,7 @@ class LeanLocalLLM:
206
 
207
  def get_inference_policies(self) -> dict[PolicyID, Callable]:
208
  """
209
- TOWRITE
210
  """
211
  policies = {}
212
  for adapter_id in self.adapter_ids:
@@ -242,8 +226,8 @@ class LeanLocalLLM:
242
  """
243
  Returns wrappers over the adapters, which allow them to be
244
  interfaced like regular PyTorch models.
245
- # TODO: create the adapter wrappers here
246
- See adapter_wrapper.py
247
  """
248
  trainable_objects = {an: self.hf_adapters[an] for an in self.adapter_ids}
249
  return trainable_objects
@@ -297,13 +281,11 @@ class LeanLocalLLM:
297
  tokenizer=self.tokenizer,
298
  enable_thinking=self.enable_thinking,
299
  )
300
- # print(f"context is {self.tokenizer.decode(context_token_ids)}")
301
  policy_output = await self.inference_backend.generate(
302
  input_token_ids=context_token_ids.tolist(),
303
  extract_thinking=(self.max_thinking_characters > 0),
304
  regex=current_regex,
305
  )
306
- # print(f"generated: {self.tokenizer.decode(policy_output.out_token_ids)}")
307
  if (
308
  pattern is None
309
  or (pattern.fullmatch(policy_output.content))
@@ -347,11 +329,6 @@ class LeanLocalLLM:
347
  for adapter_id in self.past_agent_adapter_ids:
348
  self.weights_got_updated[adapter_id] = True
349
 
350
- # import random
351
- # self.save_path = self.save_path + str(random.randint(1,500))
352
- # print(f"Save path: {self.save_path}")
353
- # self.adapter_paths = {adapter_id:os.path.join(self.save_path, adapter_id) for adapter_id in self.adapter_ids}
354
-
355
  adapter_id = self.adapter_ids[0]
356
  self.hf_adapters[adapter_id].save_pretrained(self.save_path)
357
 
 
1
  """
2
+ File: mllm/models/large_language_model_local.py
3
+ Summary: Provides a local large language model wrapper over inference backends.
4
  """
5
 
6
  import logging
 
17
  import requests
18
  import torch
19
  import torch.nn as nn
 
 
 
 
 
 
 
20
  from torch.optim import SGD, Adam, AdamW, RMSprop
21
  from transformers import AutoModelForCausalLM, AutoTokenizer
 
22
 
23
  from mllm.chat_utils.apply_template import chat_turns_to_token_ids
24
  from mllm.markov_games.rollout_tree import ChatTurn
25
  from mllm.models.adapter_training_wrapper import AdapterWrapper
26
  from mllm.models.inference_backend import LLMInferenceOutput
27
  from mllm.models.inference_backend_dummy import DummyInferenceBackend
 
28
  from mllm.models.inference_backend_vllm import VLLMAsyncBackend
29
 
30
  logger = logging.getLogger(__name__)
 
36
 
37
  class LeanLocalLLM:
38
  """
39
+ Wrapper that manages local HuggingFace models, adapters, and inference backends.
40
  """
41
 
42
  def __init__(
 
47
  hf_kwargs: dict = {},
48
  adapter_configs: dict = {},
49
  output_directory: str = "./models/",
50
+ inference_backend: Literal["vllm", "dummy"] = "vllm",
51
  inference_backend_sampling_params: dict = {},
52
  inference_backend_init_kwargs: dict = {},
53
  initial_adapter_paths: dict[str, str] | None = None,
 
172
  # Init inference backend
173
  # ---------------------------------------------------------
174
 
175
+ if inference_backend == "vllm":
 
 
 
 
 
 
 
 
176
  self.inference_backend = VLLMAsyncBackend(
177
  model_name=self.model_name,
178
  # adapter_paths=self.adapter_paths,
 
190
 
191
  def get_inference_policies(self) -> dict[PolicyID, Callable]:
192
  """
193
+ Build async policy callables keyed by adapter id for inference-only usage.
194
  """
195
  policies = {}
196
  for adapter_id in self.adapter_ids:
 
226
  """
227
  Returns wrappers over the adapters, which allow them to be
228
  interfaced like regular PyTorch models.
229
  + AdapterWrapper lives in adapter_training_wrapper.py; the huggingface modules already wrap
230
+ parameters here, so we surface them directly until an extra shim is required.
231
  """
232
  trainable_objects = {an: self.hf_adapters[an] for an in self.adapter_ids}
233
  return trainable_objects
 
281
  tokenizer=self.tokenizer,
282
  enable_thinking=self.enable_thinking,
283
  )
 
284
  policy_output = await self.inference_backend.generate(
285
  input_token_ids=context_token_ids.tolist(),
286
  extract_thinking=(self.max_thinking_characters > 0),
287
  regex=current_regex,
288
  )
 
289
  if (
290
  pattern is None
291
  or (pattern.fullmatch(policy_output.content))
 
329
  for adapter_id in self.past_agent_adapter_ids:
330
  self.weights_got_updated[adapter_id] = True
331
 
 
 
 
 
 
332
  adapter_id = self.adapter_ids[0]
333
  self.hf_adapters[adapter_id].save_pretrained(self.save_path)
334
 
src_code_for_reproducibility/models/scalar_critic.py CHANGED
@@ -1,6 +1,13 @@
1
- import torch, torch.nn as nn, torch.optim as optim
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
 
 
 
 
 
 
3
  from peft import LoraConfig, get_peft_model
 
4
 
5
  from mllm.models.adapter_training_wrapper import AdapterWrapper
6
 
@@ -11,18 +18,16 @@ class ScalarCritic(nn.Module):
11
  V_φ(s) = wᵀ h_last + b
12
  Only LoRA adapters (inside critic_adapter) and the value head are trainable.
13
  """
 
14
  def __init__(self, critic_adapter: AdapterWrapper):
15
  super().__init__()
16
  self.critic_adapter = critic_adapter
17
  hidden_size = self.critic_adapter.shared_llm.config.hidden_size
18
  self.value_head = nn.Linear(hidden_size, 1).to(
19
- dtype=critic_adapter.dtype,
20
- device=critic_adapter.device)
21
 
22
- def forward(self,
23
- input_ids,
24
- attention_mask=None,
25
- **kwargs):
26
  # AdapterWrapper activates its own adapter internally
27
  outputs = self.critic_adapter(
28
  input_ids=input_ids,
@@ -30,7 +35,7 @@ class ScalarCritic(nn.Module):
30
  output_hidden_states=True,
31
  **kwargs,
32
  )
33
- h_last = outputs.hidden_states[-1] # (B, S, H)
34
  values = self.value_head(h_last).squeeze(-1) # (B, S)
35
  return values
36
 
 
1
+ """
2
+ File: mllm/models/scalar_critic.py
3
+ Summary: Defines a scalar critic network and helper utilities.
4
+ """
5
+
6
+ import torch
7
+ import torch.nn as nn
8
+ import torch.optim as optim
9
  from peft import LoraConfig, get_peft_model
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer
11
 
12
  from mllm.models.adapter_training_wrapper import AdapterWrapper
13
 
 
18
  V_φ(s) = wᵀ h_last + b
19
  Only LoRA adapters (inside critic_adapter) and the value head are trainable.
20
  """
21
+
22
  def __init__(self, critic_adapter: AdapterWrapper):
23
  super().__init__()
24
  self.critic_adapter = critic_adapter
25
  hidden_size = self.critic_adapter.shared_llm.config.hidden_size
26
  self.value_head = nn.Linear(hidden_size, 1).to(
27
+ dtype=critic_adapter.dtype, device=critic_adapter.device
28
+ )
29
 
30
+ def forward(self, input_ids, attention_mask=None, **kwargs):
 
 
 
31
  # AdapterWrapper activates its own adapter internally
32
  outputs = self.critic_adapter(
33
  input_ids=input_ids,
 
35
  output_hidden_states=True,
36
  **kwargs,
37
  )
38
+ h_last = outputs.hidden_states[-1] # (B, S, H)
39
  values = self.value_head(h_last).squeeze(-1) # (B, S)
40
  return values
41
 
src_code_for_reproducibility/training/__init__.py CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """
2
+ File: mllm/training/__init__.py
3
+ Summary: Exposes training submodules through the package namespace.
4
+ """
src_code_for_reproducibility/training/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/__init__.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/__init__.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/annealing_methods.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/annealing_methods.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/annealing_methods.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/credit_methods.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/credit_methods.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/credit_methods.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/tally_metrics.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/tally_metrics.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/tally_metrics.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/tally_rollout.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/tally_rollout.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/tally_rollout.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/tally_tokenwise.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/tally_tokenwise.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/tally_tokenwise.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/tokenize_chats.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/tokenize_chats.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/tokenize_chats.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/trainer_ad_align.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/trainer_ad_align.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/trainer_ad_align.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/trainer_common.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/trainer_common.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/trainer_common.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/trainer_independent.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/trainer_independent.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/trainer_independent.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/trainer_sum_rewards.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/trainer_sum_rewards.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/trainer_sum_rewards.cpython-312.pyc differ
 
src_code_for_reproducibility/training/__pycache__/training_data_utils.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/training/__pycache__/training_data_utils.cpython-312.pyc and b/src_code_for_reproducibility/training/__pycache__/training_data_utils.cpython-312.pyc differ
 
src_code_for_reproducibility/training/annealing_methods.py CHANGED
@@ -1,6 +1,20 @@
 
 
 
 
 
1
  import numpy as np
2
 
3
 
4
  def sigmoid_annealing(step: int, temperature: float) -> float:
5
- return 2 / (1 + np.exp(-step / temperature)) - 1
 
6
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: mllm/training/annealing_methods.py
3
+ Summary: Implements annealing schedules used across training loops.
4
+ """
5
+
6
  import numpy as np
7
 
8
 
9
  def sigmoid_annealing(step: int, temperature: float) -> float:
10
+ """
11
  + Smoothly ramp a scalar from 0 toward 1 (for non-negative steps) using a temperature-controlled sigmoid.
12
 
13
+ Args:
14
+ step: Current training step or iteration.
15
+ temperature: Controls how sharp the transition is; larger values flatten the curve.
16
+
17
+ Returns:
18
  + Float in [0, 1) for non-negative steps ((-1, 1) over all real steps) that can be rescaled for annealing schedules.
19
+ """
20
+ return 2 / (1 + np.exp(-step / temperature)) - 1