diff --git a/src_code_for_reproducibility/__pycache__/__init__.cpython-311.pyc b/src_code_for_reproducibility/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3ad66d4eacef55e556e51e2c58108384ab017db9 Binary files /dev/null and b/src_code_for_reproducibility/__pycache__/__init__.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/docs/source/conf.py b/src_code_for_reproducibility/docs/source/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..5c7512678928b6b7580c812cd62d1c22df9945ba --- /dev/null +++ b/src_code_for_reproducibility/docs/source/conf.py @@ -0,0 +1,48 @@ +# Configuration file for the Sphinx documentation builder. +import os +import sys +sys.path.insert(0, os.path.abspath('../..')) + +# -- Project information ----------------------------------------------------- +project = 'llm_negotiation' +copyright = '2023, Your Name' +author = 'Your Name' + +# -- General configuration --------------------------------------------------- +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.viewcode', + 'sphinx.ext.napoleon', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', + 'sphinx.ext.mathjax', + 'sphinxcontrib.mermaid', + 'sphinx_rtd_theme', +] + +templates_path = ['_templates'] +exclude_patterns = [] + +# -- Options for HTML output ------------------------------------------------- +html_theme = 'sphinx_rtd_theme' +html_static_path = ['_static'] + +# -- Napoleon settings ------------------------------------------------------- +napoleon_google_docstring = True +napoleon_numpy_docstring = False +napoleon_include_init_with_doc = True +napoleon_include_private_with_doc = False +napoleon_include_special_with_doc = True +napoleon_use_admonition_for_examples = False +napoleon_use_admonition_for_notes = False +napoleon_use_admonition_for_references = False +napoleon_use_ivar = False +napoleon_use_param = True +napoleon_use_rtype = True +napoleon_preprocess_types = False 
+napoleon_type_aliases = None +napoleon_attr_annotations = True + +# -- Path setup -------------------------------------------------------------- +# Make sure the project's modules can be found by Sphinx +sys.path.insert(0, os.path.abspath('../../src')) \ No newline at end of file diff --git a/src_code_for_reproducibility/docs/source/environments/diplomacy.rst b/src_code_for_reproducibility/docs/source/environments/diplomacy.rst new file mode 100644 index 0000000000000000000000000000000000000000..c2121d08ecd6e5e13691c05624d22ddadef1f0c3 --- /dev/null +++ b/src_code_for_reproducibility/docs/source/environments/diplomacy.rst @@ -0,0 +1,459 @@ +================= +Diplomacy +================= + +The Diplomacy environment provides a multi-agent negotiation interface for the classic board game Diplomacy, +based on DeepMind's implementation. This document describes the API for interacting with the Diplomacy environment +and its associated agent handler. + +Overview +-------- + +Diplomacy is a strategic board game set in Europe before World War I, where players control one of seven European powers +and negotiate with each other to gain control of supply centers. The game is played in turns, with each turn consisting +of movement phases, retreat phases, and build phases. + +Our implementation adapts DeepMind's Diplomacy code to the Multi-Agent Negotiation Environment standard, allowing it +to be used with LLM agents through a text-based interface. + +Game Rules +---------- + +### Game Board and Powers + +Diplomacy is played on a map of Europe divided into provinces. The game features seven Great Powers that players can control: + +- England (blue) +- France (light blue) +- Germany (black) +- Italy (green) +- Austria-Hungary (red) +- Russia (white) +- Turkey (yellow) + +Each power begins with three supply centers (except Russia, which starts with four) and an equal number of units. 
+ +### Units and Movement + +There are two types of units in Diplomacy: +- **Armies (A)**: Can move to adjacent land provinces or be convoyed across water by fleets +- **Fleets (F)**: Can move to adjacent coastal provinces and sea regions + +During movement phases, each unit can execute one of these orders: +- **Hold**: The unit remains in its current province (e.g., "A PAR H") + - Format: [Unit Type] [Province] H + - Example: "A PAR H" means "Army in Paris holds its position" + +- **Move**: The unit attempts to move to an adjacent province (e.g., "A PAR - BUR") + - Format: [Unit Type] [Current Province] - [Destination Province] + - Example: "A PAR - BUR" means "Army in Paris moves to Burgundy" + - Example: "F BRE - ENG" means "Fleet in Brest moves to the English Channel" + +- **Support**: The unit supports another unit's move or hold (e.g., "A PAR S A MAR - BUR") + - Format for supporting a move: [Unit Type] [Province] S [Unit Type] [Province] - [Destination] + - Format for supporting a hold: [Unit Type] [Province] S [Unit Type] [Province] + - Example: "A PAR S A MAR - BUR" means "Army in Paris supports the Army in Marseille's move to Burgundy" + - Example: "F LON S F NTH" means "Fleet in London supports the Fleet in North Sea holding its position" + +- **Convoy**: A fleet can convoy an army across water (e.g., "F ENG C A LON - BRE") + - Format: [Fleet] [Sea Province] C [Army] [Coastal Province] - [Coastal Province] + - Example: "F ENG C A LON - BRE" means "Fleet in English Channel convoys the Army in London to Brest" + +All orders are executed simultaneously, and conflicts are resolved based on strength (number of supporting units). + +### Common Province Abbreviations + +Diplomacy uses three-letter abbreviations for provinces. 
Some common ones include: +- **PAR**: Paris +- **LON**: London +- **BER**: Berlin +- **MUN**: Munich +- **BUR**: Burgundy +- **MAR**: Marseilles +- **BRE**: Brest +- **ENG**: English Channel +- **NTH**: North Sea +- **VIE**: Vienna +- **ROM**: Rome +- **VEN**: Venice +- **MOW**: Moscow +- **CON**: Constantinople + +### Example: Movement and Conflicts + +For example, if France orders "A PAR - BUR" and Germany orders "A MUN - BUR", neither move succeeds as they have equal strength. However, if France also orders "A MAR S A PAR - BUR", then the French army from Paris would successfully move to Burgundy with strength of 2 against Germany's strength of 1. + +### Turn Structure + +A game year consists of five phases: +1. **Spring Movement**: All powers submit orders for their units +2. **Spring Retreat**: Units dislodged in the movement phase must retreat or be disbanded +3. **Fall Movement**: Another round of movement orders +4. **Fall Retreat**: Retreat orders for dislodged units +5. **Winter Adjustment**: Powers gain or lose units based on the number of supply centers they control + +### Supply Centers and Building + +Supply centers (marked on the map) are key to victory. When a power occupies a supply center during a Fall turn, they gain control of it. During the Winter Adjustment phase: +- If you control more supply centers than you have units, you can build new units in your home supply centers +- If you control fewer supply centers than you have units, you must remove excess units + +### Example: Building and Removing Units + +If France controls 5 supply centers but only has 4 units, during the Winter phase they can build one new unit in an unoccupied home supply center (Paris, Marseilles, or Brest). Conversely, if France controls only 3 supply centers but has 4 units, they must remove one unit of their choice. + +### Negotiation + +A critical component of Diplomacy is the negotiation between players. 
Before submitting orders, players can communicate freely to form alliances, coordinate attacks, or mislead opponents. These negotiations are not binding, and betrayal is a common strategy. + +### Example: Alliance and Betrayal + +England and France might agree to an alliance against Germany, with England promising to support France's move into Belgium. However, England could secretly order their fleet to move into Belgium themselves or support a German move instead. + +### Victory Conditions + +The game ends when one power controls 18 or more supply centers (majority of the 34 total centers), or when players agree to a draw. In tournament settings, games may also end after a predetermined number of game years. + +DiplomacyEnv +------------ + +The ``DiplomacyEnv`` class provides an interface to the Diplomacy game environment that follows the Multi-Agent +Negotiation Environment standard. + +.. code-block:: python + + class DiplomacyEnv: + """ + Multi-Agent Negotiation Environment for Diplomacy, adapting Deepmind's implementation + to the MarlEnvironment standard. + """ + def __init__(self, + initial_state: Optional[DiplomacyState] = None, + max_turns: int = 100, + points_per_supply_centre: bool = True, + forced_draw_probability: float = 0.0, + min_years_forced_draw: int = 35): + """Initialize the Diplomacy environment. + + Args: + initial_state: Initial DiplomacyState (optional) + max_turns: Maximum number of turns in the game + points_per_supply_centre: Whether to award points per supply center in case of a draw + forced_draw_probability: Probability of forcing a draw after min_years_forced_draw + min_years_forced_draw: Minimum years before considering a forced draw + """ + # ... + + def reset(self): + """Reset the environment to an initial state and return the initial observation. + + Returns: + observation (dict): A dictionary where keys are agent identifiers and values are observations. 
+ Each observation contains: + - board_state: Current state of the board + - current_season: Current season in the game + - player_index: Index of the player's power + - possible_actions: List of possible actions in DeepMind's format + - human_readable_actions: List of human-readable action descriptions + - supply_centers: List of supply centers owned by the player + - units: List of units owned by the player + - year: Current year in the game + """ + # ... + + def step(self, actions): + """Take a step in the environment using the provided actions. + + Args: + actions (dict): A dictionary where keys are agent identifiers and values are actions. + Actions can be: + - List of integer actions in DeepMind's format + - List of string actions in text format (e.g., "A MUN - BER") + + Returns: + observations (dict): A dictionary where keys are agent identifiers and values are observations. + Each observation has the same structure as in reset(). + done (bool): Whether the episode has ended. + info (dict): Additional information about the environment, including: + - turn: Current turn number + - returns: Game returns if the game is done, otherwise None + - waiting_for: List of agents that still need to provide actions (if not all actions are provided) + """ + # ... + + def get_log_info(self): + """Get additional information about the environment for logging. + + Returns: + log_info (dict): Information about the environment required to log the game, including: + - power_names: List of power names + - game_history: History of the game + - current_turn: Current turn number + - current_season: Current season name + - supply_centers: Dictionary mapping power names to supply center counts + """ + # ... + + def render(self): + """Render the current state of the environment. + + Displays a visualization of the current game state. + """ + # ... + + def close(self): + """Perform any necessary cleanup.""" + # ... 
+ + +Key Implementation Details +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``DiplomacyEnv`` class implements several key features: + +1. **Multi-Agent Support**: The environment tracks multiple agents (powers) and manages their interactions. + +2. **Turn-Based Gameplay**: The environment enforces the turn structure of Diplomacy, including different phases. + +3. **Action Processing**: The environment can handle actions in both text format and DeepMind's integer format. + +4. **Observation Generation**: The environment generates detailed observations for each agent, including board state, supply centers, and possible actions. + +5. **Game Termination**: The environment tracks game termination conditions, including supply center victory and maximum turn limits. + +Observation Structure +~~~~~~~~~~~~~~~~~~~~ + +Each agent receives an observation dictionary with the following structure: + +.. code-block:: python + + { + "board_state": np.ndarray, # Board state representation + "current_season": int, # Season index (0-4) + "player_index": int, # Index of the player's power (0-6) + "possible_actions": [int], # List of possible actions in DeepMind's format + "human_readable_actions": [str], # List of human-readable action descriptions + "supply_centers": [str], # List of supply centers owned by the player + "units": [dict], # List of units owned by the player + "year": int # Current year in the game + } + +Action Structure +~~~~~~~~~~~~~~~ + +Actions can be provided in two formats: + +1. **Text Format**: String actions like ``"A MUN - BER"`` or ``"F NTH C A LON - BEL"``. + +2. **Integer Format**: Lists of integers corresponding to DeepMind's action representation. + +The environment will convert text actions to the internal format as needed. + +DiplomacyAgent +-------------- + +The ``DiplomacyAgent`` class implements the agent handler interface for Diplomacy, processing observations from the environment and generating actions through an LLM. + +.. 
code-block:: python + + class DiplomacyAgent: + """ + Agent handler for Diplomacy, implementing the AgentState interface + for the multi-agent negotiation standard. + """ + + def __init__(self, + power_name: str, + use_text_interface: bool = True, + system_prompt: Optional[str] = None): + """Initialize the Diplomacy agent handler. + + Args: + power_name: Name of the power this agent controls + use_text_interface: Whether to use text-based interface (vs. structured) + system_prompt: Optional system prompt to use for the LLM + """ + # ... + + def step(self, observation_from_env, policy_output=None): + """Update the agent state based on the observation and action. + + Args: + observation_from_env: The observation from the environment, with structure: + - board_state: Current state of the board + - current_season: Current season in the game + - player_index: Index of the player's power + - possible_actions: List of possible actions + - human_readable_actions: List of human-readable action descriptions + - supply_centers: List of supply centers owned by the player + - units: List of units owned by the player + - year: Current year in the game + + policy_output: The output of the policy (LLM response), or None for initial prompt + + Returns: + policy_id (str): The policy identifier ("llm_policy") + policy_input (dict): The input to the policy, with structure: + - messages: List of conversation messages in the format: + [{"role": "system", "content": "..."}, + {"role": "user", "content": "..."}] + action: The official action to be sent to the environment, or None if not ready + done (bool): Whether the LLM action is ready to be sent to the environment + info (dict): Additional information about the agent: + - valid_action: Whether the extracted action is valid + """ + # ... + + def get_log_info(self): + """Get information about the agent required to log a trajectory. 
+ + Returns: + log_info (dict): Information about the agent required to log a trajectory: + - power_name: Name of the power this agent controls + - conversation_history: List of conversation messages + - current_action: The current action, if any + """ + # ... + + def render(self): + """Render the current state of the agent. + + Displays the agent's current state, including conversation history. + """ + # ... + + def close(self): + """Perform any necessary cleanup.""" + # ... + + +Key Implementation Details +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``DiplomacyAgent`` class implements several key features: + +1. **LLM Interaction**: The agent generates prompts for an LLM and processes the LLM's responses to extract actions. + +2. **Conversation Management**: The agent maintains a conversation history for coherent interactions with the LLM. + +3. **Action Validation**: The agent validates extracted actions against the set of possible actions provided by the environment. + +4. **Error Handling**: The agent generates clarification prompts when invalid actions are detected. + +5. **Text-Based Interface**: The agent formats game state information into human-readable text for the LLM. + +Prompt Structure +~~~~~~~~~~~~~~~ + +The agent generates prompts that include: + +1. **System Prompt**: Instructions and context for the LLM, explaining its role as a Diplomacy player. + +2. **Game State Description**: A text description of the current game state, including: + - Current year and season + - Supply centers owned + - Units controlled + - Possible actions + +3. **Action Request**: Instructions on how to format actions. + +Example system prompt: + +.. code-block:: text + + You are playing the role of FRANCE in a game of Diplomacy. + Your goal is to control as many supply centers as possible. + You can negotiate with other players and form alliances, but remember that + these alliances are not binding. 
When you need to submit orders for your units, + write them in the correct format, with each order on a new line. + +Example game state description: + +.. code-block:: text + + Year: 1901, Season: SPRING_MOVES + You are playing as FRANCE. + You currently control 3 supply centers: PAR, MAR, BRE. + Your units are: A PAR, A MAR, F BRE. + + Please provide orders for your units. Here are your possible actions: + A PAR - BUR + A PAR - GAS + A PAR - PIC + A PAR H + ... + + Submit your orders, one per line, in the format like: "A MUN - BER" or "F NTH C A LON - BEL" + +Running Diplomacy Games +----------------------- + +To run Diplomacy games with LLM agents, you can use the ``run_batched_matches`` function with the ``DiplomacyEnv`` and ``DiplomacyAgent`` classes: + +.. code-block:: python + + from mllm.environments.diplomacy.diplomacy_env import DiplomacyEnv + from mllm.environments.diplomacy.diplomacy_agent import DiplomacyAgent + from mllm.run_matches import run_batched_matches + + # Create environment and agent handlers + env = DiplomacyEnv(max_turns=30) + + agent_handlers = { + "AUSTRIA": DiplomacyAgent(power_name="AUSTRIA"), + "ENGLAND": DiplomacyAgent(power_name="ENGLAND"), + "FRANCE": DiplomacyAgent(power_name="FRANCE"), + "GERMANY": DiplomacyAgent(power_name="GERMANY"), + "ITALY": DiplomacyAgent(power_name="ITALY"), + "RUSSIA": DiplomacyAgent(power_name="RUSSIA"), + "TURKEY": DiplomacyAgent(power_name="TURKEY") + } + + # Define policy mapping (mapping from policy IDs to actual policy functions) + policy_mapping = { + "llm_policy": my_llm_policy_function + } + + # Run the game + game_results = run_batched_matches( + envs=[env], + agent_handlers_per_env=[agent_handlers], + policy_mapping=policy_mapping, + max_parallel_matches=1 + ) + + # Process results + for result in game_results: + print(f"Game finished. 
Winner: {result['winner']}") + print(f"Supply centers: {result['supply_centers']}") + +This setup allows you to run Diplomacy games with LLM agents using the Multi-Agent Negotiation Environment standard. + +Limitations and Considerations +------------------------------ + +1. **Performance**: Processing observations and actions for seven powers using LLMs can be computationally intensive. + +2. **Action Parsing**: Extracting valid actions from LLM outputs may require sophisticated parsing and error handling. + +3. **Game Complexity**: Diplomacy is a complex game with many rules and edge cases, which may be challenging for LLMs to fully grasp. + +4. **Turn Duration**: Real Diplomacy games include negotiation phases of variable duration, which are not fully captured in this implementation. + +5. **Text Formatting**: The quality of LLM interactions depends heavily on the formatting and clarity of text prompts. + +Advanced Usage +-------------- + +For advanced usage, you can customize: + +1. **System Prompts**: Modify agent behavior by providing custom system prompts. + +2. **Observation Processing**: Extend the observation processing to include additional information. + +3. **Action Parsing**: Implement more sophisticated action parsing for complex orders. + +4. **Visualization**: Add custom visualization methods to the environment's render function. + +5. **Logging**: Extend the logging capabilities to capture additional information about the game state. 
\ No newline at end of file diff --git a/src_code_for_reproducibility/docs/source/environments/dond.rst b/src_code_for_reproducibility/docs/source/environments/dond.rst new file mode 100644 index 0000000000000000000000000000000000000000..d23865259b01fc0c654eaa64fc2e5306cd5c681c --- /dev/null +++ b/src_code_for_reproducibility/docs/source/environments/dond.rst @@ -0,0 +1,410 @@ +================= +Deal or No Deal +================= + +The Deal or No Deal (DoND) environment provides a multi-agent negotiation interface where players trade +items with different values. This document describes the API for interacting with the DoND environment +and its associated agent handler. + +Overview +-------- + +Deal or No Deal is a negotiation game where two agents must agree on how to divide a set of items, +each of which has different values to each agent. The agents engage in a back-and-forth dialogue to +determine an allocation of the items, with each trying to maximize their own total value. + +Our implementation follows the Multi-Agent Negotiation Environment standard, allowing it to be used +with LLM agents through a text-based interface. + +Game Rules +---------- + +### Basic Structure + +The core mechanics of Deal or No Deal are: + +1. Two agents negotiate over a set of items (e.g., books, balls, hats) +2. Each item has: + - A specific quantity (how many of each item is available) + - A value for each agent (which may differ between agents) +3. Agents take turns sending messages to negotiate how to split the items +4. Once an agreement is reached, agents finalize the deal +5. 
Points are awarded based on the value of items each agent receives + +### Detailed Gameplay + +#### Setup Phase + +The game begins with: +- A set of items (e.g., "book", "hat", "ball") +- Each item has a quantity (e.g., 6 books, 2 hats, 4 balls) +- Each agent has private values for each item (e.g., books might be worth 5 points to one agent but only 2 points to the other) +- Agents are assigned roles (starting negotiator and responding negotiator) + +#### Negotiation Phase + +1. Agents take turns sending free-form text messages to each other +2. Messages can include offers, counter-offers, questions, or strategic communication +3. There is a maximum number of messages permitted (preventing endless negotiations) +4. Either agent can propose to finalize an agreement at any time + +For example: +- Agent 1: "I propose I get all the books and you get all the hats and balls." +- Agent 2: "That doesn't work for me. How about you get 3 books and I get 3 books, all the hats, and all the balls?" +- Agent 1: "Let me counter-offer: I get 4 books and 2 balls, you get 2 books, all hats, and 2 balls." + +#### Finalization Phase + +1. When an agent wants to finalize a deal, they must specify the exact allocation: + - How many of each item they receive + - How many of each item the other agent receives +2. The other agent must then either agree (by submitting the same allocation) or reject the finalization +3. If both agents submit matching finalizations, the deal is executed +4. If finalizations don't match, no agreement is reached, and both agents receive 0 points + +#### Scoring + +1. Each agent's score is calculated based on the value of items they receive +2. The formula is: Sum(quantity_of_item_i × value_of_item_i_to_agent) +3. 
If no agreement is reached, both agents receive 0 points + +### Example Game + +Let's walk through a simple example: + +**Setup:** +- Items: Books (4), Hats (2), Balls (6) +- Agent 1 values: Books=5, Hats=1, Balls=2 +- Agent 2 values: Books=3, Hats=6, Balls=1 + +**Negotiation (simplified):** +1. Agent 1: "I would like all the books and balls. You can have the hats." +2. Agent 2: "That doesn't work for me. Books are valuable. I propose I get all the hats and 2 books, you get 2 books and all the balls." +3. Agent 1: "How about I get 3 books and all the balls, and you get 1 book and all the hats?" +4. Agent 2: "I accept your proposal." + +**Finalization:** +- Agent 1 submits: Agent 1 gets (Books: 3, Hats: 0, Balls: 6), Agent 2 gets (Books: 1, Hats: 2, Balls: 0) +- Agent 2 submits the same allocation, confirming agreement + +**Scoring:** +- Agent 1 score: (3 books × 5) + (0 hats × 1) + (6 balls × 2) = 15 + 0 + 12 = 27 points +- Agent 2 score: (1 book × 3) + (2 hats × 6) + (0 balls × 1) = 3 + 12 + 0 = 15 points + +### Game Variations + +The DoND environment supports several variations through configuration parameters: + +#### Different Value Distributions + +The environment offers multiple ways to assign values to items: + +1. **Standard Random Setup (dond_random_setup)**: + - Items have even-numbered quantities + - Each agent receives distinct random values for each item + - Values are drawn from a uniform distribution + +2. **Independent Random Values (independent_random_vals)**: + - Item quantities can be any number in the specified range + - Values for each agent are drawn independently + - Creates more varied negotiation scenarios + +3. 
**Bicameral Value Distribution (bicameral_vals_assignator)**: + - Creates a "high value" and "low value" distribution for each item + - Each agent values approximately half the items highly and half lowly + - Values are drawn from normal distributions with different means + - Creates scenarios with clear trade opportunities + +#### Visibility Options + +1. **Finalization Visibility**: + - When enabled, both agents can see each other's finalization proposals + - When disabled, finalization proposals remain private until both are submitted + +2. **Other Values Visibility**: + - When enabled, agents can see each other's value functions + - When disabled, agents only know their own values + - Creates information asymmetry and richer negotiation dynamics + +#### Game Modes + +1. **Cooperative Mode ("coop")**: + - Agents are encouraged to find mutually beneficial solutions + - Success is measured by the sum of both agents' scores + +2. **Competitive Mode ("comp")**: + - Agents aim to maximize their individual scores + - Creates more adversarial negotiations + +#### Round Structure + +1. **Single Round**: + - One negotiation session between the same agents + - Simple evaluation of negotiation skills + +2. **Multiple Rounds**: + - Agents negotiate multiple times with different item setups + - Allows for learning and adaptation over time + - Roles can be swapped between rounds + +DondEnv +------------ + +The ``DondEnv`` class provides an interface to the Deal or No Deal environment that follows the Multi-Agent +Negotiation Environment standard. + +.. code-block:: python + + class DondEnv: + """ + Multi-Agent Negotiation Environment for Deal or No Deal. 
+ """ + def __init__( + self, + agents, + mode="coop", + max_messages=None, + min_messages=None, + max_chars_per_message=None, + rounds_per_game=1, + random_setup_func=None, + random_setup_kwargs=None, + role_assignator_func=None, + role_assignator_func_kwargs=None, + finalization_visibility=False, + other_values_visibility=False, + random_seed=None + ): + """Initialize the Deal or No Deal environment. + + Args: + agents: List of agent IDs participating in the game + mode: Game mode ("coop" or "comp") + max_messages: Maximum number of messages per agent per round + min_messages: Minimum number of messages per agent per round + max_chars_per_message: Maximum characters per message + rounds_per_game: Number of negotiation rounds to play + random_setup_func: Function to generate item quantities and values + random_setup_kwargs: Arguments for the random setup function + role_assignator_func: Function to assign roles to agents + role_assignator_func_kwargs: Arguments for the role assignator + finalization_visibility: Whether agents can see each other's finalizations + other_values_visibility: Whether agents can see each other's values + random_seed: Seed for reproducibility + """ + # ... + + def reset(self): + """Reset the environment to an initial state and return the initial observation. + + Returns: + observation (dict): A dictionary where keys are agent identifiers and values are observations. + """ + # ... + + def step(self, actions): + """Take a step in the environment using the provided actions. + + Args: + actions (dict): A dictionary where keys are agent identifiers and values are actions. + Actions can be messages or finalization proposals. + + Returns: + observations (dict): A dictionary where keys are agent identifiers and values are observations. + done (bool): Whether the episode has ended. + info (dict): Additional information about the environment. + """ + # ... + + def get_state(self): + """Retrieve the current state of the game. 
+ + Returns: + state (dict): The current state of the game, including items, quantities, values, etc. + """ + # ... + +Key Implementation Details +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``DondEnv`` class implements several key features: + +1. **Multi-Agent Support**: The environment tracks two agents and manages their alternating messages. + +2. **Turn-Based Dialogue**: The environment enforces turn structure and limits on message count. + +3. **Finalization Processing**: The environment validates and processes finalization proposals. + +4. **Random Setup**: The environment supports multiple methods of generating negotiation scenarios. + +5. **Round Management**: The environment can handle multiple rounds with different setups. + +Observation Structure +~~~~~~~~~~~~~~~~~~~~ + +Each agent receives an observation (state) dictionary with rich information about the game: + +.. code-block:: python + + { + "mode": str, # Game mode ("coop" or "comp") + "role_values": dict, # Value mappings for each role + "role_props": dict, # Properties for each role + "agent_to_role": dict, # Mapping from agent IDs to roles + "is_new_round": bool, # Whether this is the start of a new round + "is_new_game": bool, # Whether this is the start of a new game + "game_over": bool, # Whether the game is over + "items": list, # List of item names + "quantities": dict, # Quantities of each item + "has_finalized": bool, # Whether finalization has been proposed + "last_message": dict, # The last message sent + "messages_remaining": dict, # Number of messages each agent can still send + # And various history tracking fields + } + +Action Structure +~~~~~~~~~~~~~~~ + +Actions can be: + +1. **Text Messages**: Free-form text for negotiation. +2. **Finalization Proposals**: Structured data specifying the exact allocation of items. + +Example finalization format: + +.. 
code-block:: python + + { + "type": "finalize", + "allocation": { + "agent1": {"book": 3, "hat": 0, "ball": 6}, + "agent2": {"book": 1, "hat": 2, "ball": 0} + } + } + +Value Setup Functions +-------------------- + +The DoND environment provides several functions for setting up item values: + +.. code-block:: python + + def dond_random_setup(items, min_quant, max_quant, min_val, max_val, random_seed=None): + """ + Generates items, even-numbered quantities and distinct random values for each category for both agents. + + Args: + items (list): List of items. + min_quant (int): Minimum quantity per item. + max_quant (int): Maximum quantity per item. + min_val (int): Minimum value per item. + max_val (int): Maximum value per item. + random_seed (int, optional): Seed for random generation. + + Returns: + tuple: (items, quantities, (val_starting_negotiator, val_responding_negotiator)) + """ + # ... + + def independent_random_vals(items, min_quant, max_quant, min_val, max_val, random_seed=None): + """ + Generates random quantities and independent random values for both agents. + + Args: + Similar to dond_random_setup + + Returns: + tuple: (items, quantities, (val_starting_negotiator, val_responding_negotiator)) + """ + # ... + + def bicameral_vals_assignator(items, min_quant, max_quant, low_val_mean, low_val_std, high_val_mean, high_val_std, random_seed=None): + """ + Generates values with a bicameral distribution - each agent values half the items highly. + + Args: + items (list): List of items. + min_quant, max_quant: Range for quantities + low_val_mean, low_val_std: Mean and standard deviation for the "low value" distribution + high_val_mean, high_val_std: Mean and standard deviation for the "high value" distribution + random_seed: Seed for reproducibility + + Returns: + tuple: (items, quantities, (val_starting_negotiator, val_responding_negotiator)) + """ + # ... 
+ +Running DoND Games +---------------------- + +To run Deal or No Deal games with LLM agents, you can use the following structure: + +.. code-block:: python + + from mllm.environments.dond.dond_game import DondEnv + from mllm.environments.dond.dond_agent import DondAgent + from mllm.run_matches import run_batched_matches + + # Create environment + env = DondEnv( + agents=["agent1", "agent2"], + mode="coop", + max_messages=10, + rounds_per_game=1, + random_setup_func="dond_random_setup", + random_setup_kwargs={ + "items": ["book", "hat", "ball"], + "min_quant": 2, + "max_quant": 8, + "min_val": 1, + "max_val": 10 + }, + finalization_visibility=False + ) + + # Create agent handlers (implementation details would vary) + agent_handlers = { + "agent1": DondAgent(agent_id="agent1"), + "agent2": DondAgent(agent_id="agent2") + } + + # Define policy mapping + policy_mapping = { + "llm_policy": my_llm_policy_function + } + + # Run the game + game_results = run_batched_matches( + envs=[env], + agent_handlers_per_env=[agent_handlers], + policy_mapping=policy_mapping, + max_parallel_matches=1 + ) + +Limitations and Considerations +------------------------------ + +1. **Negotiation Complexity**: The open-ended nature of negotiations can be challenging for some LLM agents. + +2. **Parsing Challenges**: Extracting structured finalization proposals from free-form text requires robust parsing. + +3. **Optimization Opportunities**: Different agents may employ different negotiation strategies to optimize outcomes. + +4. **Fairness Evaluation**: The environment allows research into questions of fair division and Pareto optimality. + +5. **Strategic Deception**: Agents might strategically misrepresent their true values, adding complexity to negotiations. + +Advanced Usage +-------------- + +For advanced usage, you can: + +1. **Custom Value Functions**: Create more complex distributions of item values for specific research questions. + +2. 
**Novel Negotiation Scenarios**: Design item sets and values to test specific negotiation skills. + +3. **Curriculum Learning**: Create progressively more difficult negotiation scenarios. + +4. **Communication Analysis**: Analyze the language and strategies used in successful negotiations. + +5. **Multi-Round Dynamics**: Study how agents adapt their strategies over multiple rounds. \ No newline at end of file diff --git a/src_code_for_reproducibility/docs/source/environments/ipd.rst b/src_code_for_reproducibility/docs/source/environments/ipd.rst new file mode 100644 index 0000000000000000000000000000000000000000..98e55d0c72f29f026c2c5d27650f51f60a7e7601 --- /dev/null +++ b/src_code_for_reproducibility/docs/source/environments/ipd.rst @@ -0,0 +1,411 @@ +=========================== +Iterated Prisoner's Dilemma +=========================== + +The Iterated Prisoner's Dilemma environment provides a classic game theory setting for studying cooperation +and competition between agents. This document describes the API for interacting with the IPD environment +and its associated agent handler. + +Overview +-------- + +The Prisoner's Dilemma is a fundamental problem in game theory that demonstrates why two rational individuals might not +cooperate, even when it appears in their best interest to do so. In the iterated version, the same two players +repeatedly face the same dilemma, allowing for the development of trust or retaliation based on previous interactions. + +Our implementation follows the Multi-Agent Negotiation Environment standard, allowing it to be used with +LLM agents through a text-based interface. + +Game Rules +---------- + +### Basic Premise + +The scenario behind the Prisoner's Dilemma is as follows: + +Two criminals are arrested and imprisoned. Each prisoner is in solitary confinement with no means of communicating with +the other. The prosecutors lack sufficient evidence to convict the pair on the principal charge, but they have enough +to convict both on a lesser charge. 
Simultaneously, the prosecutors offer each prisoner a bargain: + +- If both prisoners betray each other, each serves 2 years in prison (the "punishment" payoff) +- If one betrays the other while the other remains silent, the betrayer goes free (the "temptation" payoff) while the + silent accomplice serves 3 years (the "sucker" payoff) +- If both remain silent, each serves only 1 year in prison (the "reward" payoff) + +### Game Mechanics + +In our implementation, the choices are simplified to: +- **C**: Cooperate (remain silent) +- **D**: Defect (betray the other prisoner) + +Each round, both players simultaneously choose either C or D, and receive points based on the combination of their choices: + +- Both choose C: Both receive the "reward" payoff (3 points by default) +- Both choose D: Both receive the "punishment" payoff (1 point by default) +- One chooses C, one chooses D: The defector receives the "temptation" payoff (5 points by default), while the cooperator + receives the "sucker" payoff (0 points by default) + +### Example: Single Round + +Let's see how a single round plays out: + +1. Alice and Bob simultaneously make their choices +2. If Alice chooses C and Bob chooses C: + - Alice receives 3 points + - Bob receives 3 points +3. If Alice chooses C and Bob chooses D: + - Alice receives 0 points + - Bob receives 5 points +4. If Alice chooses D and Bob chooses C: + - Alice receives 5 points + - Bob receives 0 points +5. If Alice chooses D and Bob chooses D: + - Alice receives 1 point + - Bob receives 1 point + +### Iterated Game Structure + +The iterated version repeats this basic game for a fixed number of rounds. The key features are: + +1. Players know the total number of rounds in advance +2. After each round, players learn what choice the other player made +3. Players maintain a cumulative score across all rounds +4. 
Players can adjust their strategy based on the history of previous interactions + +### Game Variations + +The IPD environment supports several variations through configuration parameters: + +#### Different Payoff Matrices + +The standard payoff values can be modified to create different incentive structures: +- **Traditional PD**: reward=3, punishment=1, temptation=5, sucker=0 +- **Weak Temptation**: reward=3, punishment=1, temptation=4, sucker=0 (reduces the incentive to defect) +- **Harsh Punishment**: reward=3, punishment=0, temptation=5, sucker=0 (increases the cost of mutual defection) +- **Generous**: reward=4, punishment=2, temptation=5, sucker=1 (cushions the blow of being betrayed) + +#### Game Length Variations + +The number of rounds can significantly impact strategy: +- **Short Games** (5-10 rounds): Incentivizes more defection, especially near the end +- **Medium Games** (20-50 rounds): Allows for the development of tit-for-tat and forgiveness strategies +- **Long Games** (100+ rounds): Favors steady cooperation with occasional "probing" defections + +### Common Strategies + +While not enforced by the environment, several well-known strategies can emerge: +- **Always Cooperate**: Always choose C +- **Always Defect**: Always choose D +- **Tit for Tat**: Start with C, then copy what the opponent did in the previous round +- **Forgiving Tit for Tat**: Like Tit for Tat, but occasionally cooperate even after being defected against +- **Grudger**: Cooperate until the opponent defects once, then always defect +- **Random**: Choose randomly between C and D + +IPDEnv +------ + +The ``IPDEnv`` class provides an interface to the Iterated Prisoner's Dilemma environment that follows the +Multi-Agent Negotiation Environment standard. + +.. code-block:: python + + class IPDEnv: + """ + Iterated Prisoner's Dilemma environment following the MarlEnvironment standard. + + In each round of the game, two agents simultaneously choose to either cooperate (C) or defect (D). 
+ The payoffs are as follows: + - If both cooperate: Both receive the "reward" (usually 3 points) + - If both defect: Both receive the "punishment" (usually 1 point) + - If one cooperates and one defects: The defector receives the "temptation" (usually 5 points) + and the cooperator receives the "sucker" payoff (usually 0 points) + + The game is played for a specified number of rounds. + """ + + def __init__( + self, + rounds_per_game: int = 10, + reward: float = 3.0, # Both cooperate + punishment: float = 1.0, # Both defect + temptation: float = 5.0, # Defector's reward when other cooperates + sucker: float = 0.0, # Cooperator's reward when other defects + random_seed: Optional[int] = None, + ): + """ + Initialize the Iterated Prisoner's Dilemma environment. + + Args: + rounds_per_game: Number of rounds to play + reward: Payoff when both agents cooperate + punishment: Payoff when both agents defect + temptation: Payoff for defecting when other agent cooperates + sucker: Payoff for cooperating when other agent defects + random_seed: Random seed for reproducibility + """ + # ... + + def reset(self) -> Dict[str, Dict[str, Any]]: + """ + Reset the environment to an initial state and return the initial observation. + + Returns: + observation (dict): A dictionary where keys are agent identifiers and values are observations. + """ + # ... + + def step(self, actions: Dict[str, str]) -> Tuple[Dict[str, Dict[str, Any]], bool, Dict[str, Any]]: + """ + Take a step in the environment using the provided actions. + + Args: + actions (dict): A dictionary where keys are agent identifiers and values are actions ('C' or 'D'). + + Returns: + observations (dict): A dictionary where keys are agent identifiers and values are observations. + done (bool): Whether the episode has ended. + info (dict): Additional information about the environment. + """ + # ... + +Key Implementation Details +~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``IPDEnv`` class implements several key features: + +1. 
**Two-Agent Support**: The environment tracks two agents ("alice" and "bob") and manages their interactions. + +2. **Round-Based Play**: The environment enforces turn structure and tracks game history. + +3. **Payoff Matrix**: The environment calculates rewards based on the standard prisoner's dilemma payoff matrix. + +4. **Observation Generation**: The environment generates detailed observations for each agent, including action history and rewards. + +5. **Game Termination**: The environment tracks game termination after the specified number of rounds. + +Observation Structure +~~~~~~~~~~~~~~~~~~~~ + +Each agent receives an observation dictionary with the following structure: + +.. code-block:: python + + { + "current_round": int, # Current round number (0-indexed) + "rounds_per_game": int, # Total number of rounds in the game + "history": List[Dict], # Complete game history so far + "last_round_actions": Dict[str, str], # Actions from the previous round (if any) + "last_round_reward": float, # Reward received in the previous round (if any) + "total_reward": float, # Cumulative reward so far + "payoff_matrix": Dict[str, float], # The game's payoff matrix values + } + +Action Structure +~~~~~~~~~~~~~~~ + +Actions are simple strings: + +1. ``"C"`` for Cooperate +2. ``"D"`` for Defect + +IPDAgent +-------------- + +The ``IPDAgent`` class implements the agent handler interface for the Iterated Prisoner's Dilemma, processing observations from the environment and generating actions through an LLM. + +.. code-block:: python + + class IPDAgent: + """ + Agent handler for Iterated Prisoner's Dilemma, implementing the AgentState interface + for the multi-agent negotiation standard. + """ + + def __init__( + self, + agent_id: str, + policy_id: str = "llm_policy", + system_prompt: Optional[str] = None, + max_errors: int = 3, + opponent_id: Optional[str] = None, + ): + """ + Initialize the IPD agent handler. 
+ + Args: + agent_id: Identifier for this agent ("alice" or "bob") + policy_id: Identifier for the policy this agent uses + system_prompt: Optional custom system prompt for the LLM + max_errors: Maximum number of parsing errors before defaulting to cooperate + opponent_id: Optional identifier of the opponent (inferred if not provided) + """ + # ... + + def step(self, observation_from_env: Dict[str, Any], policy_output: str = None) -> Tuple[str, Dict[str, Any], str, bool, Dict[str, Any]]: + """ + Update the agent state based on the observation and process the policy output. + + Args: + observation_from_env: The observation from the environment + policy_output: The output from the policy (LLM response) + + Returns: + policy_id: The policy identifier + policy_input: The input to the policy + action: The action to be sent to the environment + done: Whether the action is ready to be sent to the environment + info: Additional information about the agent + """ + # ... + +Key Implementation Details +~~~~~~~~~~~~~~~~~~~~~~~~~ + +The ``IPDAgent`` class implements several key features: + +1. **LLM Interaction**: The agent generates prompts for an LLM and processes the LLM's responses. + +2. **Action Extraction**: The agent parses the LLM's output to extract valid actions (C or D). + +3. **Error Handling**: The agent provides helpful error messages when parsing fails and defaults to cooperation after multiple failures. + +4. **History Tracking**: The agent maintains and provides the complete game history in its prompts. + +5. **Strategy Explanation**: The agent can extract and log the reasoning behind an LLM's decisions. + +Prompt Structure +~~~~~~~~~~~~~~~ + +The agent generates prompts that include: + +1. **System Prompt**: Instructions and context for the LLM, explaining its role and the rules of the Prisoner's Dilemma. + +2. 
**Game State Description**: A text description of the current game state, including: + - Current round number + - History of previous rounds (if any) + - Cumulative score + +3. **Action Request**: Instructions on how to format the response, requiring an explicit action tag. + +Example system prompt: + +.. code-block:: text + + You are playing as Alice in an Iterated Prisoner's Dilemma game against Bob. + In each round, you must choose to either Cooperate (C) or Defect (D). + + The payoffs are: + - If both players Cooperate: You each get 3 points + - If both players Defect: You each get 1 point + - If you Cooperate and Bob Defects: You get 0 points, Bob gets 5 points + - If you Defect and Bob Cooperates: You get 5 points, Bob gets 0 points + + Your goal is to maximize your total points across all rounds. + The game will last for exactly 10 rounds, and both players know this. + +Example game state prompt: + +.. code-block:: text + + Current round: 3/10 + + History: + Round 1: You chose C, Bob chose C. You earned 3 points. + Round 2: You chose C, Bob chose D. You earned 0 points. + + Your total score so far: 3 points + + What is your choice for round 3? + Please respond with C to cooperate or D to defect, + and explain your reasoning. + +Running IPD Games +---------------------- + +To run Iterated Prisoner's Dilemma games with LLM agents, you can use the following code structure: + +.. 
code-block:: python + + from mllm.environments.ipd.ipd_game import IPDEnv + from mllm.environments.ipd.ipd_agent import IPDAgent + from mllm.run_matches import run_batched_matches + + # Create environment + env = IPDEnv( + rounds_per_game=10, + reward=3.0, + punishment=1.0, + temptation=5.0, + sucker=0.0 + ) + + # Create agent handlers + agent_handlers = { + "alice": IPDAgent(agent_id="alice"), + "bob": IPDAgent(agent_id="bob") + } + + # Define policy mapping + policy_mapping = { + "llm_policy": my_llm_policy_function + } + + # Run the game + game_results = run_batched_matches( + envs=[env], + agent_handlers_per_env=[agent_handlers], + policy_mapping=policy_mapping, + max_parallel_matches=1 + ) + + # Process results + for result in game_results: + print(f"Game finished. Scores: {result['total_rewards']}") + +Statistics and Analysis +---------------------- + +The IPD environment includes utility functions for analyzing game outcomes: + +1. **Cooperation Rates**: Percentage of rounds where each agent cooperated. +2. **Mutual Cooperation/Defection**: Percentage of rounds where both agents made the same choice. +3. **Score Distribution**: Analysis of how points were accumulated over the game. + +These statistics can be calculated using the ``gather_ipd_statistics`` function: + +.. code-block:: python + + from mllm.environments.ipd.ipd_statistics_funcs import gather_ipd_statistics + + stats = gather_ipd_statistics(match_info, env_info) + print(f"Cooperation rates: {stats['cooperation_rate']}") + print(f"Mutual cooperation rate: {stats['mutual_cooperation_rate']}") + print(f"Mutual defection rate: {stats['mutual_defection_rate']}") + +Limitations and Considerations +----------------------------- + +1. **Determinism**: The environment is deterministic, with randomness only in initialization if a seed is provided. + +2. **Limited Player Count**: The IPD environment only supports exactly two players. + +3. 
**Perfect Information**: Both players have perfect information about the game history. + +4. **Simultaneous Actions**: Both players act simultaneously, which requires adaptations for some LLM interfaces. + +5. **Fixed Game Length**: The total number of rounds is fixed and known to both players from the start. + +Advanced Usage +------------ + +For advanced usage, you can customize: + +1. **Payoff Matrix**: Modify reward values to create different incentive structures. + +2. **System Prompts**: Customize the LLM's understanding of the game and potential strategies. + +3. **Error Handling**: Adjust how the agent responds to invalid LLM outputs. + +4. **Analysis**: Create custom statistics gathering for specific research questions. + +5. **Integration**: Connect the IPD environment to other negotiation frameworks or tournament systems. \ No newline at end of file diff --git a/src_code_for_reproducibility/docs/source/media/runbatch.png b/src_code_for_reproducibility/docs/source/media/runbatch.png new file mode 100644 index 0000000000000000000000000000000000000000..e7572fa514d9e029a6c08e7061fa88b03bc63de2 Binary files /dev/null and b/src_code_for_reproducibility/docs/source/media/runbatch.png differ diff --git a/src_code_for_reproducibility/docs/source/src.environments.dond.dond_game.rst b/src_code_for_reproducibility/docs/source/src.environments.dond.dond_game.rst new file mode 100644 index 0000000000000000000000000000000000000000..d0e595aad169a5a8456f83afe5029e7475d7c9e7 --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.environments.dond.dond_game.rst @@ -0,0 +1,7 @@ +src.environments.dond.dond\_game module +======================================= + +.. 
automodule:: src.environments.dond.dond_game + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.environments.dond.dond_log_funcs.rst b/src_code_for_reproducibility/docs/source/src.environments.dond.dond_log_funcs.rst new file mode 100644 index 0000000000000000000000000000000000000000..cf96327d1bcbc7f0f8785804a49a6975eef889c2 --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.environments.dond.dond_log_funcs.rst @@ -0,0 +1,7 @@ +src.environments.dond.dond\_log\_funcs module +============================================= + +.. automodule:: src.environments.dond.dond_log_funcs + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.environments.dond.dond_player.rst b/src_code_for_reproducibility/docs/source/src.environments.dond.dond_player.rst new file mode 100644 index 0000000000000000000000000000000000000000..bab97f1009eb2d5c4e387ac6a83982a51e33c9e3 --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.environments.dond.dond_player.rst @@ -0,0 +1,7 @@ +src.environments.dond.dond\_agent module +========================================= + +.. automodule:: src.environments.dond.dond_agent + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.environments.env_imports.rst b/src_code_for_reproducibility/docs/source/src.environments.env_imports.rst new file mode 100644 index 0000000000000000000000000000000000000000..4354ba27eee9f0e0fa3f4f0e5d9131c256a4be57 --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.environments.env_imports.rst @@ -0,0 +1,7 @@ +src.environments.env\_imports module +==================================== + +.. 
automodule:: src.environments.env_imports + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.environments.ipd.ipd_log_funcs.rst b/src_code_for_reproducibility/docs/source/src.environments.ipd.ipd_log_funcs.rst new file mode 100644 index 0000000000000000000000000000000000000000..edec187f4876cdf653ae4f91035f43bc877a7d40 --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.environments.ipd.ipd_log_funcs.rst @@ -0,0 +1,7 @@ +src.environments.ipd.ipd\_log\_funcs module +=========================================== + +.. automodule:: src.environments.ipd.ipd_log_funcs + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.environments.ipd.rst b/src_code_for_reproducibility/docs/source/src.environments.ipd.rst new file mode 100644 index 0000000000000000000000000000000000000000..af26091b3a87dee4d6993f0ae09bdb1c380a130e --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.environments.ipd.rst @@ -0,0 +1,19 @@ +src.environments.ipd package +============================ + +.. automodule:: src.environments.ipd + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + src.environments.ipd.ipd_agent + src.environments.ipd.ipd_game + src.environments.ipd.ipd_log_funcs + src.environments.ipd.ipd_statistics_funcs + src.environments.ipd.ipd_training_data_funcs diff --git a/src_code_for_reproducibility/docs/source/src.environments.rst b/src_code_for_reproducibility/docs/source/src.environments.rst new file mode 100644 index 0000000000000000000000000000000000000000..221ed1c07ebea145cd23bc06c6474d34b1d8a33e --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.environments.rst @@ -0,0 +1,25 @@ +src.environments package +======================== + +.. automodule:: src.environments + :members: + :undoc-members: + :show-inheritance: + +Subpackages +----------- + +.. 
toctree:: + :maxdepth: 4 + + src.environments.dond + src.environments.ipd + +Submodules +---------- + +.. toctree:: + :maxdepth: 4 + + src.environments.env_imports + src.environments.environment_imports diff --git a/src_code_for_reproducibility/docs/source/src.experiments.dond_run_train.rst b/src_code_for_reproducibility/docs/source/src.experiments.dond_run_train.rst new file mode 100644 index 0000000000000000000000000000000000000000..6c94e4bc508836338d5d6393858d403e746b5d2d --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.experiments.dond_run_train.rst @@ -0,0 +1,7 @@ +src.experiments.dond\_run\_train module +======================================= + +.. automodule:: src.experiments.dond_run_train + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.generation.run_games.rst b/src_code_for_reproducibility/docs/source/src.generation.run_games.rst new file mode 100644 index 0000000000000000000000000000000000000000..dbf42d3f821df187cdd4a8bb9d093839ce6b608a --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.generation.run_games.rst @@ -0,0 +1,7 @@ +src.generation.run\_games module +================================ + +.. automodule:: src.generation.run_games + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.models.hf_agent.rst b/src_code_for_reproducibility/docs/source/src.models.hf_agent.rst new file mode 100644 index 0000000000000000000000000000000000000000..e96abf4e21b1523cd016c6172eed0ddf51b4954a --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.models.hf_agent.rst @@ -0,0 +1,7 @@ +src.models.hf\_agent module +=========================== + +.. 
automodule:: src.models.hf_agent + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.models.vllm_worker_wrap.rst b/src_code_for_reproducibility/docs/source/src.models.vllm_worker_wrap.rst new file mode 100644 index 0000000000000000000000000000000000000000..466f755e064534683462b4259876d874678290ed --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.models.vllm_worker_wrap.rst @@ -0,0 +1,7 @@ +src.models.vllm\_worker\_wrap module +==================================== + +.. automodule:: src.models.vllm_worker_wrap + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.run.rst b/src_code_for_reproducibility/docs/source/src.run.rst new file mode 100644 index 0000000000000000000000000000000000000000..379615bab324e654401411db23058b5eec87e277 --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.run.rst @@ -0,0 +1,7 @@ +src.run module +============== + +.. automodule:: src.run + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.training.reinforce_training.rst b/src_code_for_reproducibility/docs/source/src.training.reinforce_training.rst new file mode 100644 index 0000000000000000000000000000000000000000..5daf4b7250022f523242d6239d0921f362df6d24 --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.training.reinforce_training.rst @@ -0,0 +1,7 @@ +src.training.reinforce\_training module +======================================= + +.. 
automodule:: src.training.reinforce_training + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.training.rl_convs_processing.rst b/src_code_for_reproducibility/docs/source/src.training.rl_convs_processing.rst new file mode 100644 index 0000000000000000000000000000000000000000..cf5db1aa0cb6d010fc70f86c341467ba5e9b485e --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.training.rl_convs_processing.rst @@ -0,0 +1,7 @@ +src.training.rl\_convs\_processing module +========================================= + +.. automodule:: src.training.rl_convs_processing + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.utils.inherit_args.rst b/src_code_for_reproducibility/docs/source/src.utils.inherit_args.rst new file mode 100644 index 0000000000000000000000000000000000000000..72d8fc0ff082f4e6d37a852f3dd18a23f58f036d --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.utils.inherit_args.rst @@ -0,0 +1,7 @@ +src.utils.inherit\_args module +============================== + +.. automodule:: src.utils.inherit_args + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.utils.log_gpu_usage.rst b/src_code_for_reproducibility/docs/source/src.utils.log_gpu_usage.rst new file mode 100644 index 0000000000000000000000000000000000000000..44b83082b6eb027ef402603e034c712ccc2cbfcc --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.utils.log_gpu_usage.rst @@ -0,0 +1,7 @@ +src.utils.log\_gpu\_usage module +================================ + +.. 
automodule:: src.utils.log_gpu_usage + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.utils.log_statistics.rst b/src_code_for_reproducibility/docs/source/src.utils.log_statistics.rst new file mode 100644 index 0000000000000000000000000000000000000000..fde4c1553a35b932ba59aef8d29ceb90b20df48c --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.utils.log_statistics.rst @@ -0,0 +1,7 @@ +src.utils.log\_statistics module +================================ + +.. automodule:: src.utils.log_statistics + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.utils.parallel_shuffle.rst b/src_code_for_reproducibility/docs/source/src.utils.parallel_shuffle.rst new file mode 100644 index 0000000000000000000000000000000000000000..3ea9301dcc1ccb245e1c565a135447348244cd65 --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.utils.parallel_shuffle.rst @@ -0,0 +1,7 @@ +src.utils.parallel\_shuffle module +================================== + +.. automodule:: src.utils.parallel_shuffle + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/src.utils.quick_stats.rst b/src_code_for_reproducibility/docs/source/src.utils.quick_stats.rst new file mode 100644 index 0000000000000000000000000000000000000000..76a504eeda8c9b608e1b331cd932fd2f26bffafb --- /dev/null +++ b/src_code_for_reproducibility/docs/source/src.utils.quick_stats.rst @@ -0,0 +1,7 @@ +src.utils.quick\_stats module +============================= + +.. 
automodule:: src.utils.quick_stats + :members: + :undoc-members: + :show-inheritance: diff --git a/src_code_for_reproducibility/docs/source/usage.rst b/src_code_for_reproducibility/docs/source/usage.rst new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src_code_for_reproducibility/markov_games/__init__.py b/src_code_for_reproducibility/markov_games/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src_code_for_reproducibility/markov_games/alternative_actions_runner.py b/src_code_for_reproducibility/markov_games/alternative_actions_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..c64db2deda539a1a71e045309cfdf257d2cbc614 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/alternative_actions_runner.py @@ -0,0 +1,138 @@ +import asyncio +import copy +import json +import os.path +from typing import Any, Tuple + +from mllm.markov_games.markov_game import AgentAndActionSafeCopy, MarkovGame +from mllm.markov_games.rollout_tree import ( + AgentActLog, + RolloutTreeBranchNode, + RolloutTreeNode, + RolloutTreeRootNode, + StepLog, +) + +AgentId = str + + + +async def run_with_unilateral_alt_action( + markov_game: MarkovGame, + agent_id: AgentId, + time_step: int, + branch_node: RolloutTreeBranchNode, + max_depth: int, +): + """ + This function is used to generate a new branch for a given agent. 
+ """ + + # Generate alternative action and take a step + await markov_game.set_action_of_agent(agent_id) + terminated: bool = markov_game.take_simulation_step() + step_log = markov_game.get_step_log() + first_alternative_node = RolloutTreeNode( + step_log=step_log, + time_step=time_step, + ) + + # Generate rest of trajectory up to max depth + time_step += 1 + counter = 1 + previous_node = first_alternative_node + while not terminated and counter <= max_depth: + terminated, step_log = await markov_game.step() + current_node = RolloutTreeNode(step_log=step_log, time_step=time_step) + previous_node.child = current_node + previous_node = current_node + counter += 1 + time_step += 1 + + if branch_node.branches == None: + branch_node.branches = {agent_id: [first_alternative_node]} + else: + agent_branches = branch_node.branches.get(agent_id, []) + agent_branches.append(first_alternative_node) + branch_node.branches[agent_id] = agent_branches + + +async def AlternativeActionsRunner( + markov_game: MarkovGame, + output_folder: str, + nb_alternative_actions: int, + max_depth: int, + branch_only_on_new_round: bool = False, +): + """ + This method generates a trajectory with partially completed branches, + where the branching comes from taking unilateraly different actions. + The resulting data is used to estimate the updated advantage alignment policy gradient terms. + Let k := nb_sub_steps. Then the number of steps generated is O(Tk), where T is + the maximum trajectory length. 
+ """ + + tasks = [] + time_step = 0 + terminated = False + root = RolloutTreeRootNode( + id=markov_game.get_id(), + crn_id=markov_game.get_crn_id() + ) + previous_node = root + + while not terminated: + mg_before_action = markov_game.get_safe_copy() + + # Get safe copies for main branch + agent_action_safe_copies: dict[ + AgentId, AgentAndActionSafeCopy + ] = await markov_game.get_actions_of_agents_without_side_effects() + + markov_game.set_actions_of_agents_manually(agent_action_safe_copies) + terminated = markov_game.take_simulation_step() + main_node = RolloutTreeNode( + step_log=markov_game.get_step_log(), time_step=time_step + ) + branch_node = RolloutTreeBranchNode(main_child=main_node) + previous_node.child = branch_node + previous_node = main_node + + # Get alternative branches by generating new unilateral actions + for agent_id in markov_game.agent_ids: + for _ in range(nb_alternative_actions): + # Get safe copies for branches + branch_agent_action_safe_copies: dict[ + AgentId, AgentAndActionSafeCopy + ] = { + agent_id: AgentAndActionSafeCopy( + action=copy.deepcopy(agent_action_safe_copy.action), + action_info=copy.deepcopy(agent_action_safe_copy.action_info), + agent_after_action=agent_action_safe_copy.agent_after_action.get_safe_copy(), + ) + for agent_id, agent_action_safe_copy in agent_action_safe_copies.items() + } + mg_branch: MarkovGame = mg_before_action.get_safe_copy() + other_agent_id = [id for id in mg_branch.agent_ids if id != agent_id][0] + mg_branch.set_action_and_agent_after_action_manually( + agent_id=other_agent_id, + agent_action_safe_copy=branch_agent_action_safe_copies[ + other_agent_id + ], + ) + task = asyncio.create_task( + run_with_unilateral_alt_action( + markov_game=mg_branch, + time_step=time_step, + agent_id=agent_id, + branch_node=branch_node, + max_depth=max_depth, + ) + ) + tasks.append(task) + time_step += 1 + + # wait for all branches to complete + await asyncio.gather(*tasks) + + return root diff --git 
class DiplomacyEnv:
    """Multi-Agent Reinforcement Learning environment for Diplomacy.

    This class wraps the Diplomacy game engine to provide an interface
    compliant with the MARL standard.
    """

    def __init__(self, random_seed=None, map_name="standard", game_id=None, rules=None, max_steps=50):
        """Initialize the Diplomacy environment.

        Args:
            random_seed: Seed stored on the instance. Nothing in this class reads it.
            map_name: The name of the map to use (default: "standard")
            game_id: Optional game ID
            rules: Optional rules to apply to the game
            max_steps: Maximum number of steps before forcing game end (default: 50)
        """
        self.random_seed = random_seed
        self.map_name = map_name
        self.game_id = game_id
        self.rules = rules or []
        self.game = None
        self.active_powers = []
        self.render_mode = None
        self.max_steps = max_steps
        self.current_steps = 0

    def _refresh_active_powers(self):
        """Recompute ``self.active_powers`` as the names of all non-eliminated powers."""
        self.active_powers = [
            name
            for name, power in self.game.powers.items()
            if not power.is_eliminated()
        ]

    def _observe_active_powers(self):
        """Return ``{power_name: observation}`` for every currently active power."""
        return {
            power_name: self._create_observation(power_name)
            for power_name in self.active_powers
        }

    def reset(self):
        """Reset the environment to an initial state and return the initial observation.

        Returns:
            observation: A dictionary where keys are agent identifiers and values are observations.
        """
        # Initialize a new game
        self.game = Game(game_id=self.game_id, map_name=self.map_name)

        # Apply rules
        for rule in self.rules:
            self.game.add_rule(rule)

        self._refresh_active_powers()
        self.current_steps = 0
        return self._observe_active_powers()

    def step(self, actions):
        """Take a step in the environment using the provided actions.

        Args:
            actions: A dictionary where keys are agent identifiers and values are
                action dicts. Each action may carry ``"orders"`` (list, default
                empty) and ``"wait"`` (bool, default True). Actions for powers that
                are not active are ignored.

        Returns:
            observations: A dictionary where keys are agent identifiers and values are observations.
            done: Whether the episode has ended.
            info: Additional information about the environment.
        """
        print(f"stepping {self.current_steps}")
        self.current_steps += 1

        # Apply actions (orders) for each power
        for power_name, action in actions.items():
            if power_name not in self.active_powers:
                continue
            orders = action.get("orders", [])
            wait = action.get("wait", True)
            if orders:
                self.game.set_orders(power_name, orders)
            self.game.set_wait(power_name, wait)

        # The phase is only processed once no power is still waiting.
        if self.game.does_not_wait():
            self.game.process()

        # Powers may have been eliminated by the processed phase.
        self._refresh_active_powers()
        observations = self._observe_active_powers()

        # The episode ends either naturally or when the step budget is exhausted.
        max_steps_reached = self.current_steps >= self.max_steps
        done = self.game.is_game_done or max_steps_reached

        info = {
            "phase": self.game.get_current_phase(),
            "active_powers": self.active_powers,
            "centers": self.game.get_centers(),
            "units": self.game.get_units(),
            "current_steps": self.current_steps,
            "max_steps_reached": max_steps_reached,
        }

        return observations, done, info

    def _create_observation(self, power_name):
        """Create observation for a specific power.

        Args:
            power_name: The name of the power

        Returns:
            An observation dictionary
        """
        return {
            "phase": self.game.get_current_phase(),
            "units": self.game.get_units(),
            "centers": self.game.get_centers(),
            "orderable_locations": self.game.get_orderable_locations(power_name),
            "order_status": self.game.get_order_status(power_name),
            "possible_orders": self._get_possible_orders_for_power(power_name),
        }

    def _get_possible_orders_for_power(self, power_name):
        """Get all possible orders for a power's units.

        Locations are matched on their three-letter province code on both
        sides. The previous implementation truncated only the order-table key
        (``loc[:3]``) and compared it with untruncated unit locations, so a unit
        standing on a coast (for example ``"F STP/SC"``) could never match and
        received no orders.

        Args:
            power_name: The name of the power

        Returns:
            A dictionary mapping locations to their possible orders
        """
        all_possible_orders = self.game.get_all_possible_orders()

        # Unit strings look like "A PAR" or "F STP/SC": skip the two-character
        # unit-type prefix and keep the three-letter province code.
        provinces = {unit[2:5] for unit in self.game.get_units(power_name)}

        if self.game.phase_type == 'R':
            # For retreat phases, include retreating units
            power = self.game.get_power(power_name)
            provinces.update(unit[2:5] for unit in power.retreats)
        elif self.game.phase_type == 'A':
            # For adjustment phases, include buildable locations when the power
            # owns more centers than units. When it must remove units instead,
            # its existing units are already included.
            power = self.game.get_power(power_name)
            if len(power.centers) > len(power.units):
                provinces.update(site[:3] for site in self.game._build_sites(power))

        # Keep only the entries whose province belongs to this power.
        return {
            loc: orders
            for loc, orders in all_possible_orders.items()
            if loc[:3] in provinces
        }

    def get_log_info(self):
        """Get additional information about the environment for logging.

        Returns:
            log_info: Information about the environment required to log the game.
                Empty when no game has been created yet.
        """
        if not self.game:
            return {}

        return {
            "game_id": self.game.game_id,
            "phase": self.game.get_current_phase(),
            "map_name": self.game.map_name,
            "centers": self.game.get_centers(),
            "units": self.game.get_units(),
            "powers": {name: {
                "units": power.units,
                "centers": power.centers,
                "is_eliminated": power.is_eliminated(),
                "order_status": self.game.get_order_status(name)
            } for name, power in self.game.powers.items()},
            "orders": self.game.get_orders(),
            "active_powers": self.active_powers,
            "is_game_done": self.game.is_game_done,
            "outcome": self.game.outcome if self.game.is_game_done else None
        }

    def render(self, mode='human'):
        """Render the current state of the environment.

        Args:
            mode: The rendering mode ('human', 'svg', etc.)

        Returns:
            The SVG rendering for ``mode == 'svg'``; ``None`` otherwise,
            including when no game exists.
        """
        self.render_mode = mode
        if not self.game:
            return None

        if mode == 'svg':
            # Return SVG representation
            return self.game.render(output_format='svg')

        if mode == 'human':
            # Just print basic game state
            print(f"Game: {self.game.game_id}")
            print(f"Phase: {self.game.get_current_phase()}")
            print(f"Active Powers: {self.active_powers}")
            print("Supply Centers:")
            for power_name, centers in self.game.get_centers().items():
                print(f"  {power_name}: {centers}")
            print("Units:")
            for power_name, units in self.game.get_units().items():
                print(f"  {power_name}: {units}")
        return None

    def close(self):
        """Perform any necessary cleanup."""
        self.game = None
def get_rollout_tree_paths(
    root: RolloutTreeRootNode, mgid: Optional[str] = None
) -> Tuple[RolloutNodeList, List[RolloutNodeList]]:
    """
    Flatten a rollout tree into its main path and its branch paths.

    Args:
        root: Root of the rollout tree.
        mgid: Identifier used in the generated path ids. Falls back to
            ``str(root.id)`` when omitted or empty.

    Returns:
        main_path: The main path from the root to the end of the tree.
        branch_paths: A list of all branch paths from the root to the end of the tree.
        Each branch path contains a list of nodes that are part of the branch,
        including the nodes from the main path before the branch was taken.

    Branch path ids are tagged with the time step of the branch's own first
    node. Previously the id used the time step of the preceding main node
    (and 0 for the first branch node), so branches taken at the first and at
    the second branch node both received ``time_step:0`` and their ids
    collided.
    """
    mgid_str = mgid or str(root.id)

    def collect_path_nodes(current) -> List[RolloutTreeNode]:
        """Collect the nodes of a path starting at ``current``, following main children only."""
        nodes: List[RolloutTreeNode] = []
        while current is not None:
            if isinstance(current, RolloutTreeNode):
                nodes.append(current)
                current = current.child
            elif isinstance(current, RolloutTreeBranchNode):
                # For branch nodes, only the main_child belongs to the path.
                current = current.main_child
            else:
                break
        return nodes

    main_path_nodes: List[RolloutTreeNode] = []
    branch_paths: List[RolloutNodeList] = []

    # Single iterative pass along the main trajectory. ``main_path_nodes``
    # doubles as the prefix shared by every branch found at the current position.
    current = root.child
    while current is not None:
        if isinstance(current, RolloutTreeNode):
            main_path_nodes.append(current)
            current = current.child
        elif isinstance(current, RolloutTreeBranchNode):
            for agent_id, branch_node_list in (current.branches or {}).items():
                if not branch_node_list:
                    continue
                # Start with the main path prefix, then append the branch nodes.
                # NOTE(review): when an agent has several alternatives at this
                # step they are concatenated into one path, as before. Confirm
                # whether one path per alternative is wanted.
                branch_path_nodes = list(main_path_nodes)
                for branch_node in branch_node_list:
                    branch_path_nodes.extend(collect_path_nodes(branch_node))

                branch_time_step = branch_node_list[0].time_step
                branch_path_id = f"mgid:{mgid_str}_type:branch_agent:{agent_id}_time_step:{branch_time_step}"
                branch_paths.append(
                    RolloutNodeList(id=branch_path_id, nodes=branch_path_nodes)
                )
            # Continue along the main trajectory.
            current = current.main_child
        else:
            break

    main_path = RolloutNodeList(id=f"mgid:{mgid_str}_type:main", nodes=main_path_nodes)

    return main_path, branch_paths
+ per_agent_pairs: Dict[str, List[List[ChatTurnLog]]] = {} + + for agent_id in agent_ids: + action_log = node.step_log.action_logs.get(agent_id) + pairs: List[List[ChatTurnLog]] = [] + current_pair: List[ChatTurnLog] = [] + + if action_log and action_log.chat_turns: + for chat_turn in action_log.chat_turns: + turn_log = ChatTurnLog( + time_step=node.time_step, + agent_id=agent_id, + role=chat_turn.role, + content=chat_turn.content, + reasoning=getattr(chat_turn, "reasoning_content", None), + is_state_end=chat_turn.is_state_end, + reward=node.step_log.simulation_step_log.rewards.get( + agent_id, 0 + ), + ) + + if chat_turn.role == "user": + # If a previous pair is open, close it and start a new one + if current_pair: + pairs.append(current_pair) + current_pair = [] + current_pair = [turn_log] + else: + # assistant: attach to an open user message if present; otherwise stand alone + if ( + current_pair + and len(current_pair) == 1 + and current_pair[0].role == "user" + ): + current_pair.append(turn_log) + pairs.append(current_pair) + current_pair = [] + else: + # No preceding user or already paired; treat as its own unit + pairs.append([turn_log]) + + if current_pair: + # Unpaired trailing user message + pairs.append(current_pair) + + per_agent_pairs[agent_id] = pairs + + # Interleave pairs across agents: A1, B1, A2, B2, ... 
def gather_agent_main_rewards(agent_id: str, path: RolloutNodeList) -> List[float]:
    """Return the reward of ``agent_id`` at every node of ``path``, in path order.

    Raises:
        KeyError: If a node's simulation step log has no reward entry for ``agent_id``.
    """
    return [
        node.step_log.simulation_step_log.rewards[agent_id] for node in path.nodes
    ]
def export_rewards_to_csv(path: Path, outdir: Path, first_file: bool):
    """Merge the main-path rewards of one rollout tree into per-agent CSV files.

    For every agent found in the first main-path step, the file
    ``outdir / "agent:{agent_id}_rewards.render.csv"`` is read if it exists,
    a row ``[mgid, group_seed, r_t0, r_t1, ...]`` for this rollout is added,
    all rows are sorted by ``(group_seed, mgid)`` and the file is rewritten
    with a header. Shorter rows are padded with empty cells.

    Args:
        path: Rollout tree file, loaded with ``load_rollout_tree``.
        outdir: Output directory. Created if missing.
        first_file: Not used by this function. Kept for interface compatibility.

    Raises:
        KeyError: If a later step lacks a reward for an agent present in the first step.
    """
    root = load_rollout_tree(path)
    mgid = root.id
    group_seed = getattr(root, "crn_id", None)

    main_path, _branch_paths = get_rollout_tree_paths(root)
    outdir.mkdir(parents=True, exist_ok=True)

    rewards_dict_list = gather_all_rewards(main_path)
    if not rewards_dict_list:
        # Empty main path: there are no rewards to export. This used to raise
        # IndexError on rewards_dict_list[0].
        return

    agent_ids = list(rewards_dict_list[0].keys())
    rewards_list = defaultdict(list)
    for rewards_dict in rewards_dict_list:
        for agent_id in agent_ids:
            rewards_list[agent_id].append(rewards_dict[agent_id])

    def try_int(val: str):
        try:
            return int(val)
        except Exception:
            return None

    def sort_key(r: List[str]):
        seed_raw = r[1] if len(r) > 1 else ""
        mgid_raw = r[0] if len(r) > 0 else ""
        seed_num = try_int(seed_raw)
        mgid_num = try_int(mgid_raw)
        # Sort numerically when possible; otherwise fall back to string.
        # The leading 0/1 flag keeps ints and strings from being compared directly.
        return (
            0 if seed_num is not None else 1,
            seed_num if seed_num is not None else seed_raw,
            0 if mgid_num is not None else 1,
            mgid_num if mgid_num is not None else mgid_raw,
        )

    for agent_id in agent_ids:
        output_file = outdir / f"agent:{agent_id}_rewards.render.csv"

        # Build current row: [mgid, group_seed] + rewards
        formatted_rewards = [f"{round(x, 1):>5}" for x in rewards_list[agent_id]]
        current_row = [str(mgid), str(group_seed)] + formatted_rewards

        # Read existing rows (if any), skipping blank rows and the header
        existing_rows: List[List[str]] = []
        if output_file.exists():
            with open(output_file, "r", newline="", encoding="utf-8") as rf:
                for row in csv.reader(rf):
                    if not any(cell.strip() for cell in row):
                        continue
                    if (
                        len(row) >= 2
                        and row[0].strip().lower() == "mgid"
                        and row[1].strip().lower() == "group_seed"
                    ):
                        continue
                    existing_rows.append(row)

        # Append and sort by (group_seed, mgid)
        # NOTE(review): exporting the same rollout twice adds a duplicate row.
        existing_rows.append(current_row)
        existing_rows.sort(key=sort_key)

        # Longest reward sequence decides the header width and the padding.
        max_reward_len = max(0, max(len(r) - 2 for r in existing_rows))
        row_width = 2 + max_reward_len
        padded_rows = [r + [""] * (row_width - len(r)) for r in existing_rows]

        header = ["mgid", "group_seed"] + [f"r_t{t}" for t in range(max_reward_len)]

        # Rewrite the file with header to avoid extra/blank rows
        with open(output_file, "w", newline="", encoding="utf-8") as wf:
            writer = csv.writer(wf)
            writer.writerow(header)
            writer.writerows(padded_rows)
html_from_chat_turns(chat_turns: List[ChatTurnLog]) -> str: + """ + Render chat turns as a single, wrapping sequence of messages in time order. + Keep badge and message bubble styles, include time on every badge and + include rewards on assistant badges. Each message is individually + hide/show by click; when hidden, only the badge remains and "(...)" is + shown inline (not inside a bubble). + """ + import html + + # Prepare ordering: sort by (time_step, original_index) to keep stable order within same step + indexed_turns = list(enumerate(chat_turns)) + indexed_turns.sort(key=lambda t: (t[1].time_step, t[0])) + + # CSS styles (simplified layout; no time-step or agent-column backgrounds) + css = """ + + """ + + # HTML structure + html_parts = [ + "", + "", + "", + "", + "Chat Turns", + css, + "", + "", + "", + '
', + '
', + '
', + '', + '', + "timesteps", + '', + '', + '', + '', + "to", + '', + '', + '', + "
", + "
", + '
', + ] + + last_time_step = None + for original_index, turn in indexed_turns: + # Build classes + agent_class = f"agent-{re.sub('[^a-z0-9_-]', '-', turn.agent_id.lower())}" + role_class = f"role-{turn.role}" + # Segments default collapsed for user role + segment_collapsed_class = " collapsed" if turn.role == "user" else "" + + # Badge content + if turn.role == "assistant": + name = html.escape(turn.agent_id) + emoji = '🤖' + raw_val = turn.reward + if isinstance(raw_val, (int, float)): + reward_val = f"{raw_val:.4f}".rstrip("0").rstrip(".") + if len(reward_val) > 8: + reward_val = reward_val[:8] + "…" + else: + reward_val = str(raw_val) + # Format: "🤖 Alice 💬 • Reward: 5.5556 • " + badge_inner = ( + f'{emoji} {name}' + f' {reward_val} r' + f' ' + ) + else: + # For user messages, show "User of {Agent ID}" in the badge + name = "User of " + html.escape(turn.agent_id) + emoji = '⚙️' + # Format (no reward): "⚙️ User of Alice • " + badge_inner = f'{emoji} {name}' + + badge = f'{badge_inner}' + + # Inline timestep distinction badge at step boundaries (render before first message) + ts_badge_html = "" + if last_time_step is None or turn.time_step != last_time_step: + ts_badge_html = f'⏱ {turn.time_step}' + last_time_step = turn.time_step + + escaped_content = html.escape(turn.content) + collapsed_text = re.sub(r"\s+", " ", escaped_content).strip() + # Optional reasoning + reasoning_val = getattr(turn, "reasoning", None) + reasoning_html = "" + if reasoning_val: + escaped_reasoning = html.escape(reasoning_val) + reasoning_text = re.sub(r"\s+", " ", escaped_reasoning).strip() + reasoning_html = ( + f'' + f'💭 ' + f'{reasoning_text} ' + f"" + ) + + html_parts.append( + f'
' + f'
{ts_badge_html}{badge}' + f"{reasoning_html}" + f'💬 {collapsed_text}' + f"
" + f"
" + ) + + html_parts.extend(["
def export_html_from_rollout_tree(path: Path, outdir: Path, main_only: bool = False):
    """Process a rollout tree file and generate HTML files for each path.

    Creates separate HTML files for the main path and each branch path.
    The main path is saved in the root output directory, while branch paths
    are saved in a ``mgid:{mgid}_branches_html_renders`` subdirectory.

    Args:
        path: Path to the rollout tree file, loaded with ``load_rollout_tree``
        outdir: Output directory for HTML files
        main_only: If True, only export the main trajectory (default: False)
    """
    root = load_rollout_tree(path)
    mgid = root.id

    main_path, branch_paths = get_rollout_tree_paths(root)

    outdir.mkdir(parents=True, exist_ok=True)

    # Generate HTML for the main path
    chat_turns = gather_all_chat_turns_for_path(main_path)
    html_content = html_from_chat_turns(chat_turns)
    output_file = outdir / f"mgid:{mgid}_main_html_render.render.html"
    with open(output_file, "w", encoding="utf-8") as f:
        f.write(html_content)

    # Stop here when branches are not wanted or do not exist. Previously the
    # branch loop ran even with main_only=True and failed on an unbound
    # `branches_dir`, which was only created when main_only was False.
    if main_only or not branch_paths:
        return

    branches_dir = outdir / f"mgid:{mgid}_branches_html_renders"
    branches_dir.mkdir(parents=True, exist_ok=True)

    # Generate HTML for each branch path
    for path_obj in branch_paths:
        chat_turns = gather_all_chat_turns_for_path(path_obj)
        html_content = html_from_chat_turns(chat_turns)

        path_id: str = path_obj.id
        output_file = branches_dir / f"{path_id}_html_render.render.html"
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(html_content)
def init_markov_game_components(
    config: MarkovGameConfig, policies: dict[str, Callable[[list[dict]], str]]
):
    """
    Build a ``MarkovGame`` (simulation plus agents) from a configuration.

    The simulation class and every agent class are resolved from their
    configured class names. The simulation and all agents receive
    ``config.seed``, which is also passed to the game as ``crn_id``.

    Args:
        config: Game configuration (ids, seed, class names and init arguments).
        policies: Mapping from policy id to policy callable. Each agent gets
            ``policies[agent_config.policy_id]``.

    Returns:
        The assembled ``MarkovGame``.

    Raises:
        KeyError: If an agent's ``policy_id`` is missing from ``policies``.
    """
    # SECURITY NOTE(review): eval() executes whatever expression the config
    # supplies. Only use trusted configs; consider an explicit name-to-class registry.
    simulation_class = eval(config.simulation_class_name)
    simulation = simulation_class(seed=config.seed, **config.simulation_init_args)

    agents = {}
    for agent_cfg in config.agent_configs:
        # Same caveat as above for the agent class name.
        agent_cls = eval(agent_cfg.agent_class_name)
        agents[agent_cfg.agent_id] = agent_cls(
            seed=config.seed,
            agent_id=agent_cfg.agent_id,
            policy=policies[agent_cfg.policy_id],
            **agent_cfg.init_kwargs,
        )

    return MarkovGame(
        id=config.id,
        crn_id=config.seed,
        simulation=simulation,
        agents=agents,
    )
def get_total_actions(self): + """Returns the total number of actions an agent could ever take""" + # TODO: This is only suitable for a discrete 1 dimensional action space for each agent + raise NotImplementedError + + def get_safe_copy(self): + """ + Return copy of the agent object that is decorrelated from the original object. + """ + raise NotImplementedError + + def reset(self): + """Returns initial observations and states""" + raise NotImplementedError + + def render(self): + raise NotImplementedError + + def close(self): + raise NotImplementedError + + # def seed(self): + # raise NotImplementedError + + def save_replay(self): + raise NotImplementedError + + def get_simulation_info(self): + raise NotImplementedError diff --git a/src_code_for_reproducibility/markov_games/statistics_runner.py b/src_code_for_reproducibility/markov_games/statistics_runner.py new file mode 100644 index 0000000000000000000000000000000000000000..eb30800f4aa4e5f919292023df77fa29e43d9c24 --- /dev/null +++ b/src_code_for_reproducibility/markov_games/statistics_runner.py @@ -0,0 +1,405 @@ +from __future__ import annotations + +import gc +import json +import pickle +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional + +from basic_render import find_iteration_folders + +from mllm.markov_games.rollout_tree import ( + RolloutTreeBranchNode, + RolloutTreeNode, + RolloutTreeRootNode, + SimulationStepLog, +) + + +def _iterate_main_nodes(root: RolloutTreeRootNode) -> Iterator[RolloutTreeNode]: + """ + Iterate the main path nodes without materializing full path lists. 
def stream_rollout_files(iteration_folder: Path) -> Iterator[Path]:
    """
    Yield every rollout-tree pickle (``*.rt.pkl``) found under a folder.

    The search is recursive. Matches are yielded in sorted path order so that
    statistics files built from them are reproducible across runs and
    filesystems (``rglob`` alone returns entries in directory order).

    Args:
        iteration_folder: Folder to search; a ``str`` is accepted as well.

    Yields:
        Paths of regular files whose name ends in ``.rt.pkl``.
    """
    for candidate in sorted(Path(iteration_folder).rglob("*.rt.pkl")):
        # A directory can match the pattern too; only real files are rollouts.
        if candidate.is_file():
            yield candidate
+ """ + data_root = Path(data_root) + outdir = data_root / "statistics" + outdir.mkdir(parents=True, exist_ok=True) + # Choose extension by format + default_name = ( + f"{game_name}.stats.json" + if output_format == "json" + else f"{game_name}.stats.jsonl" + ) + outfile = outdir / ( + output_filename if output_filename is not None else default_name + ) + + # Rewrite file each run to keep it clean and small + if outfile.exists(): + outfile.unlink() + + iteration_folders = find_iteration_folders(str(data_root)) + + # If writing JSONL, stream directly; otherwise accumulate minimal records + if output_format == "jsonl": + with open(outfile, "w", encoding="utf-8") as w: + for iteration_folder in iteration_folders: + iteration_name = Path(iteration_folder).name + for pkl_path in stream_rollout_files(Path(iteration_folder)): + root = load_root(pkl_path) + + computers = make_computers() + for sl in iterate_main_simulation_logs(root): + for comp in computers: + try: + comp.update(sl) + except Exception: + continue + + values: Dict[str, Any] = {} + for comp in computers: + try: + values.update(comp.finalize()) + except Exception: + continue + + rec = { + "mgid": getattr(root, "id", None), + "crn_id": getattr(root, "crn_id", None), + "iteration": iteration_name, + "stats": values, + } + w.write(json.dumps(rec, ensure_ascii=False) + "\n") + + del root + del computers + gc.collect() + else: + # Aggregate to dict-of-lists for easier plotting + records: List[Dict[str, Any]] = [] + # Process in deterministic order + for iteration_folder in iteration_folders: + iteration_name = Path(iteration_folder).name + for pkl_path in stream_rollout_files(Path(iteration_folder)): + root = load_root(pkl_path) + + computers = make_computers() + for sl in iterate_main_simulation_logs(root): + for comp in computers: + try: + comp.update(sl) + except Exception: + continue + + values: Dict[str, Any] = {} + for comp in computers: + try: + values.update(comp.finalize()) + except Exception: + continue 
def _functional_rollout_averages(
    root: RolloutTreeRootNode,
    metrics: Dict[str, Callable[[SimulationStepLog], Optional[Dict[str, float]]]],
) -> Dict[str, Dict[str, Optional[float]]]:
    """Average each metric per agent over the main-path steps of one rollout."""
    # metric name -> agent id -> values observed along the main trajectory
    collected: Dict[str, Dict[str, List[float]]] = {name: {} for name in metrics}
    for step_log in iterate_main_simulation_logs(root):
        for name, metric_fn in metrics.items():
            try:
                produced = metric_fn(step_log)
            except Exception:
                # Deliberately best-effort: a metric that fails on one step
                # simply contributes nothing for that step.
                continue
            if not produced:
                continue
            for agent_id, value in produced.items():
                if value is None:
                    continue
                # Register the agent before converting so that an agent with
                # only unusable values still appears (as None) in the output.
                samples = collected[name].setdefault(str(agent_id), [])
                try:
                    samples.append(float(value))
                except (TypeError, ValueError, OverflowError):
                    continue
    return {
        name: {
            agent_id: (sum(samples) / len(samples) if samples else None)
            for agent_id, samples in per_agent.items()
        }
        for name, per_agent in collected.items()
    }


def _functional_records_to_columns(records: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Pivot per-rollout records into the dict-of-lists layout of the JSON output."""
    stat_keys: set[str] = set()
    agent_keys: Dict[str, set[str]] = {}
    for record in records:
        for key, value in (record.get("stats") or {}).items():
            stat_keys.add(key)
            if isinstance(value, dict):
                agent_keys.setdefault(key, set()).update(str(a) for a in value)

    # Sorted so the written file has the same key order on every run
    # (iterating the set directly depends on string hash randomization).
    ordered_keys = sorted(stat_keys)
    stats_out: Dict[str, Any] = {
        key: (
            {agent: [] for agent in sorted(agent_keys[key])}
            if key in agent_keys
            else []
        )
        for key in ordered_keys
    }

    columns: Dict[str, Any] = {
        "mgid": [],
        "crn_id": [],
        "iteration": [],
        "stats": stats_out,
    }
    for record in records:
        columns["mgid"].append(record.get("mgid"))
        columns["crn_id"].append(record.get("crn_id"))
        columns["iteration"].append(record.get("iteration"))
        stats = record.get("stats") or {}
        for key in ordered_keys:
            value = stats.get(key)
            column = stats_out[key]
            if isinstance(column, dict):
                # Per-agent stat: pad with None so every list stays aligned
                # with the mgid / iteration columns.
                per_agent = value if isinstance(value, dict) else {}
                for agent, series in column.items():
                    series.append(per_agent.get(agent))
            else:
                column.append(value)
    return columns


def run_stats_functional(
    data_root: Path,
    game_name: str,
    metrics: Dict[str, Callable[[SimulationStepLog], Optional[Dict[str, float]]]],
    output_filename: Optional[str] = None,
    output_format: str = "json",
) -> Path:
    """
    Compute per-rollout metric averages and write them under ``data_root/statistics/``.

    ``metrics`` maps a metric name to ``f(SimulationStepLog) -> {agent_id: value}``
    (or ``None`` when the metric does not apply to a step). For each rollout,
    every metric is averaged per agent over the main-path steps where it
    produced a value; an agent with no usable value gets ``None``.

    Args:
        data_root: Experiment folder containing the iteration folders.
        game_name: Used to build the default output file name.
        metrics: Metric name -> per-step metric function.
        output_filename: Overrides the default ``<game_name>.stats.<ext>`` name.
        output_format: ``"jsonl"`` streams one record per rollout; any other
            value writes a single JSON document in dict-of-lists form.

    Returns:
        Path of the file that was written (any previous file is replaced).
    """
    data_root = Path(data_root)
    outdir = data_root / "statistics"
    outdir.mkdir(parents=True, exist_ok=True)
    default_name = (
        f"{game_name}.stats.json"
        if output_format == "json"
        else f"{game_name}.stats.jsonl"
    )
    outfile = outdir / (
        output_filename if output_filename is not None else default_name
    )

    # Rewrite the file on each run so stale results never linger.
    if outfile.exists():
        outfile.unlink()

    iteration_folders = find_iteration_folders(str(data_root))

    def iter_records() -> Iterator[Dict[str, Any]]:
        """Yield one record per rollout file, holding one tree in memory at a time."""
        for iteration_folder in iteration_folders:
            iteration_name = Path(iteration_folder).name
            for pkl_path in stream_rollout_files(Path(iteration_folder)):
                root = load_root(pkl_path)
                record = {
                    "mgid": getattr(root, "id", None),
                    "crn_id": getattr(root, "crn_id", None),
                    "iteration": iteration_name,
                    "stats": _functional_rollout_averages(root, metrics),
                }
                # Rollout trees can be large; release before loading the next.
                del root
                gc.collect()
                yield record

    with open(outfile, "w", encoding="utf-8") as w:
        if output_format == "jsonl":
            for record in iter_records():
                w.write(json.dumps(record, ensure_ascii=False) + "\n")
        else:
            json.dump(
                _functional_records_to_columns(list(iter_records())),
                w,
                ensure_ascii=False,
            )

    return outfile
a/src_code_for_reproducibility/models/__pycache__/inference_backend_sglang.cpython-311.pyc b/src_code_for_reproducibility/models/__pycache__/inference_backend_sglang.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..587e4c342abb9f345d154fb82529a1b29e289f7d Binary files /dev/null and b/src_code_for_reproducibility/models/__pycache__/inference_backend_sglang.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/models/__pycache__/inference_backend_vllm.cpython-311.pyc b/src_code_for_reproducibility/models/__pycache__/inference_backend_vllm.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..40691a5d9d07ee835df62bb775e2acf28874c73d Binary files /dev/null and b/src_code_for_reproducibility/models/__pycache__/inference_backend_vllm.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/models/__pycache__/large_language_model_api.cpython-311.pyc b/src_code_for_reproducibility/models/__pycache__/large_language_model_api.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1909ba141042cf9fee7cc2ecf9d26862e60d80ef Binary files /dev/null and b/src_code_for_reproducibility/models/__pycache__/large_language_model_api.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/models/__pycache__/large_language_model_local.cpython-311.pyc b/src_code_for_reproducibility/models/__pycache__/large_language_model_local.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d0b213008dfc53f58c90c6db07082df0c817bd3a Binary files /dev/null and b/src_code_for_reproducibility/models/__pycache__/large_language_model_local.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-311.pyc b/src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..be4986f25beb0d366d665efbca7c2be0e27957a5 Binary files 
class AdapterWrapper(nn.Module):
    """
    Per-adapter view onto a PEFT-wrapped language model shared by several adapters.

    The wrapper registers a LoRA adapter named ``adapter_id`` on the shared
    model, activates that adapter before every forward pass and before
    listing parameters, and reports only the parameters that currently
    require gradients.
    """

    def __init__(
        self,
        shared_llm: nn.Module,
        adapter_id: str,
        lora_config: dict,
        path: Union[str, None] = None,
    ):
        """
        Args:
            shared_llm: Model that receives the adapter.
            adapter_id: Name under which the adapter is registered.
            lora_config: Keyword arguments for ``LoraConfig``.
            path: Optional location of initial adapter weights; a failed load
                is logged and the freshly initialised adapter is kept.
        """
        super().__init__()
        self.shared_llm = shared_llm
        self.adapter_id = adapter_id
        peft_config = LoraConfig(**lora_config)
        # get_peft_model adds the LoRA adapter inside the shared model itself.
        self.shared_llm = get_peft_model(
            model=shared_llm,
            peft_config=peft_config,
            adapter_name=adapter_id,
        )
        self.shared_llm.train()

        if self._try_load_adapter(path):
            logger.info(
                f"Adapter '{adapter_id}': loaded initial weights from '{path}'."
            )
        else:
            logger.info(
                f"Adapter '{adapter_id}': initialized with fresh weights (no initial weights found)."
            )

    def _try_load_adapter(self, path: Union[str, None]) -> bool:
        """Load adapter weights from ``path``; return whether they were loaded."""
        if not path:
            return False
        try:
            # ``path`` is handed to PEFT as ``model_id``.
            self.shared_llm.load_adapter(
                is_trainable=True,
                model_id=path,
                adapter_name=self.adapter_id,
            )
        except Exception as exc:  # noqa: BLE001 - want to log any load failure context
            logger.warning(
                f"Adapter '{self.adapter_id}': failed to load from '{path}': {exc}"
            )
            return False
        return True

    def parameters(self, recurse: bool = True):
        """
        Return the trainable parameters once this adapter is active, as a list.

        ``recurse`` is accepted only for signature compatibility with
        ``nn.Module.parameters`` and is ignored.
        """
        self.shared_llm.set_adapter(self.adapter_id)
        return list(
            filter(lambda param: param.requires_grad, self.shared_llm.parameters())
        )

    def forward(self, *args, **kwargs):
        """Activate this adapter, then run the shared model."""
        self.shared_llm.set_adapter(self.adapter_id)
        output = self.shared_llm(*args, **kwargs)
        return output

    def save_pretrained(self, save_path):
        """Delegate saving to the shared model."""
        self.shared_llm.save_pretrained(save_path)

    def gradient_checkpointing_enable(self, *args, **kwargs):
        """Forward the call to the shared model."""
        self.shared_llm.gradient_checkpointing_enable(*args, **kwargs)

    @property
    def dtype(self):
        """Dtype reported by the shared model."""
        return self.shared_llm.dtype

    @property
    def device(self):
        """Device reported by the shared model."""
        return self.shared_llm.device
class DummyInferenceBackend(LLMInferenceBackend):
    """
    Stand-in backend that never loads a model.

    Without a regex, ``generate`` returns a fixed sentence. With a regex, it
    returns a random string matching the regex, produced by ``rstr.xeger``.
    All adapter and mode-switching hooks are no-ops.
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        """Accept and ignore any arguments a real backend would take."""
        pass

    def prepare_adapter(
        self, adapter_id: Optional[str], weights_got_updated: bool = False
    ) -> None:
        """
        No-op: there is no adapter to load.

        ``weights_got_updated`` defaults to ``False`` to match the signature
        declared by ``LLMInferenceBackend.prepare_adapter``.
        """
        pass

    async def toggle_training_mode(self) -> None:
        """No-op that still yields control to the event loop once."""
        await asyncio.sleep(0)

    async def toggle_eval_mode(self) -> None:
        """No-op that still yields control to the event loop once."""
        await asyncio.sleep(0)

    def shutdown(self) -> None:
        """Nothing to release."""
        pass

    async def generate(
        self, prompt_text: str, regex: Optional[str] = None
    ) -> PolicyOutput:
        """
        Produce a dummy completion; ``prompt_text`` is ignored.

        Args:
            prompt_text: Unused.
            regex: When given, the output is a random string matching it.

        Returns:
            ``PolicyOutput`` whose ``reasoning_content`` is set only when the
            generated text matches the reasoning/answer split pattern below.
        """
        content = "I am a dummy backend without a regex."
        reasoning_content = None

        if regex:
            raw_text = rstr.xeger(regex)
            content = raw_text
            # Strict split: require \n...\n\n before final content
            # NOTE(review): the pattern has no literal delimiters around the
            # reasoning part; confirm none were lost from the original regex.
            m = re.match(
                r"^\n\n([\s\S]*?)\n\n(.*)$", raw_text, flags=re.DOTALL
            )
            if m:
                reasoning_content = m.group(1)
                content = m.group(2)

        return PolicyOutput(content=content, reasoning_content=reasoning_content)
engine_kwargs.setdefault("context_length", max_model_len) + + # Launch in-process engine (no HTTP server) + self.llm = sgl.Engine(model_path=model_name, **engine_kwargs) # async-ready + # SGLang supports: generate(), async_generate(), and async streaming helpers. :contentReference[oaicite:2]{index=2} + + def is_ready(self) -> bool: + return True + + def toggle_training_mode(self) -> None: + # No explicit KV release API offline; typically you pause usage here. + pass + + def toggle_eval_mode(self) -> None: + pass + + def shutdown(self) -> None: + # Engine cleans up on GC; explicit close not required. + pass + + def prepare_adapter(self, adapter_id: Optional[str]) -> None: + # With offline Engine, when LoRA is enabled at init, + # you select adapter per request via the input batch mapping. + self.current_adapter = adapter_id + + async def generate( + self, prompt_text: str, sampling_params: dict, adapter_id: Optional[str] + ) -> str: + # Non-streaming async (batch of 1). For batched prompts, pass a list. + params = { + "temperature": sampling_params.get("temperature", 1.0), + "top_p": sampling_params.get("top_p", 1.0), + "max_new_tokens": sampling_params.get("max_new_tokens", 128), + } + if (tk := sampling_params.get("top_k", -1)) and tk > 0: + params["top_k"] = tk + if (mn := sampling_params.get("min_new_tokens")) is not None: + params["min_new_tokens"] = mn + if (fp := sampling_params.get("frequency_penalty")) is not None: + params["frequency_penalty"] = fp + + # If using multi-LoRA, SGLang lets you provide adapter names aligned to each input. 
class HttpSGLangBackend(LLMInferenceBackend):
    """
    Backend that talks over HTTP to an SGLang server launched as a subprocess.

    Recognised keyword arguments: ``model_name``, ``adapter_paths`` (logical
    adapter id -> path), ``mem_fraction_static``, ``dtype``, ``extra_cli`` and
    ``disable_radix_cache``.

    NOTE(review): ``generate`` takes ``sampling_params``/``adapter_id`` and
    returns ``str``, which differs from the abstract ``generate`` signature;
    left unchanged so existing callers keep working.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The abstract base __init__ stores nothing, so the settings the
        # other methods read must be kept here.
        self.model_name = kwargs.get("model_name")
        self.adapter_paths = dict(kwargs.get("adapter_paths") or {})
        self.port = None
        self.proc = None
        self.urls = {}
        # track sglang adapter ids separately from the logical ids
        self.sglang_names = {aid: aid for aid in self.adapter_paths}
        self.needs_loading = {aid: True for aid in self.adapter_paths}

        self.mem_fraction = kwargs.get("mem_fraction_static", 0.6)
        self.dtype = kwargs.get("dtype", "bfloat16")
        self.extra_cli = kwargs.get("extra_cli", "")
        self.disable_radix_cache = kwargs.get("disable_radix_cache", True)

    def launch(self) -> None:
        """Start the SGLang server and record its endpoint URLs."""
        # find local hf cache path for server
        from transformers.utils import cached_file

        local_llm_path = os.path.split(cached_file(self.model_name, "config.json"))[0]

        lora_str = ""
        if self.adapter_paths:
            lora_str = "--lora-paths " + " ".join(
                f"{aid}={path}" for aid, path in self.adapter_paths.items()
            )

        cmd = f"""
        python3 -m sglang.launch_server --model-path {local_llm_path} \
        --host 0.0.0.0 {lora_str} \
        {'--disable-radix-cache' if self.disable_radix_cache else ''} \
        --mem-fraction-static {self.mem_fraction} --dtype {self.dtype} {self.extra_cli}
        """
        self.proc, self.port = launch_server_cmd(cmd)
        wait_for_server(f"http://localhost:{self.port}")
        base = f"http://localhost:{self.port}"
        self.urls = dict(
            generate=f"{base}/generate",
            release=f"{base}/release_memory_occupation",
            resume=f"{base}/resume_memory_occupation",
            load_lora=f"{base}/load_lora_adapter",
            unload_lora=f"{base}/unload_lora_adapter",
        )

    def is_ready(self) -> bool:
        """Return True when the server answers on its generate URL."""
        url = self.urls.get("generate")
        if not url:
            # launch() has not run yet, so there is nothing to probe.
            return False
        try:
            requests.get(url, timeout=2)
            return True
        except requests.RequestException:
            return False

    def prepare_adapter(
        self, adapter_id: str, weights_got_updated: bool = False
    ) -> None:
        """
        Make sure the server holds the current weights of ``adapter_id``.

        Args:
            adapter_id: Logical adapter id; ``None`` means the base model.
            weights_got_updated: When True, the adapter is reloaded from its
                path even if it had been loaded before.

        Raises:
            requests.HTTPError: If the server rejects the load request.
        """
        if adapter_id is None:
            return
        if weights_got_updated and adapter_id in self.adapter_paths:
            self.needs_loading[adapter_id] = True
        if not self.needs_loading.get(adapter_id, False):
            return

        # Unload the previous name if present; failure here is harmless
        # because the adapter is re-registered under a fresh name below.
        try:
            requests.post(
                self.urls["unload_lora"],
                json={"lora_name": self.sglang_names[adapter_id]},
                timeout=10,
            )
        except requests.RequestException:
            pass

        new_name = self._short_id()
        self.sglang_names[adapter_id] = new_name
        requests.post(
            self.urls["load_lora"],
            json={
                "lora_name": new_name,
                "lora_path": self.adapter_paths[adapter_id],
            },
        ).raise_for_status()
        self.needs_loading[adapter_id] = False

    async def generate(
        self, prompt_text: str, sampling_params: dict, adapter_id: str | None
    ) -> str:
        """
        Generate one completion for ``prompt_text``.

        Args:
            prompt_text: Prompt sent as a batch of one.
            sampling_params: Forwarded unchanged as ``sampling_params``.
            adapter_id: Logical adapter id, or ``None`` for the base model.

        Returns:
            The ``text`` field of the first result.
        """
        lora_name = self.sglang_names.get(adapter_id) if adapter_id else None
        payload = {
            "text": [prompt_text],
            "sampling_params": sampling_params,
        }
        if lora_name:
            payload["lora_path"] = [lora_name]

        timeout = httpx.Timeout(3600.0, connect=3600.0)
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(self.urls["generate"], json=payload)
            resp.raise_for_status()
            return resp.json()[0]["text"]

    def toggle_training_mode(self) -> None:
        """Ask the server to release its KV cache while adapters are trained."""
        requests.post(
            self.urls["release"], json={"tags": ["kv_cache"]}
        ).raise_for_status()

    def toggle_eval_mode(self) -> None:
        """Ask the server to re-allocate its KV cache (best-effort)."""
        try:
            requests.post(
                self.urls["resume"], json={"tags": ["kv_cache"]}
            ).raise_for_status()
        except requests.RequestException:
            # Kept best-effort as before: a failed resume is not fatal here.
            pass

    def shutdown(self) -> None:
        """Terminate the server process if one was launched."""
        from sglang.utils import terminate_process

        if self.proc:
            terminate_process(self.proc)
            self.proc = None

    def _short_id(self) -> str:
        """Return 8 digits taken from a random UUID, used as a fresh adapter name."""
        import uuid

        return str(uuid.uuid4().int)[:8]
class HttpVLLMBackend(LLMInferenceBackend):
    """
    Backend that talks to a vLLM OpenAI-compatible server run as a subprocess.

    Recognised keyword arguments: ``model_name``, ``adapter_paths`` (adapter
    id -> path), ``host``, ``port``, ``gpu_memory_utilization``,
    ``max_model_len``, ``max_num_seqs``, ``max_num_batched_tokens``, ``dtype``,
    ``trust_remote_code`` and ``lora_mode`` (``"preload"`` registers adapters
    on the command line, ``"runtime"`` loads them through the server API).

    NOTE(review): ``generate`` takes ``sampling_params``/``adapter_id`` and
    returns ``str``, which differs from the abstract ``generate`` signature;
    left unchanged so existing callers keep working.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        # The abstract base __init__ stores nothing, so the settings the
        # other methods read must be kept here.
        self.model_name = kwargs.get("model_name")
        self.adapter_paths = dict(kwargs.get("adapter_paths") or {})
        self.port = kwargs.get("port", 8000)
        self.host = kwargs.get("host", "0.0.0.0")
        self.proc = None
        # A wildcard bind address is not a portable address to connect to,
        # so the client side uses loopback in that case.
        client_host = "127.0.0.1" if self.host in ("0.0.0.0", "::", "") else self.host
        self.base_url = f"http://{client_host}:{self.port}"
        # vLLM memory safety knobs
        self.gpu_mem_util = kwargs.get("gpu_memory_utilization", 0.9)
        self.max_model_len = kwargs.get("max_model_len", None)
        self.max_num_seqs = kwargs.get("max_num_seqs", None)
        self.max_batched_tokens = kwargs.get("max_num_batched_tokens", None)
        self.dtype = kwargs.get("dtype", "bfloat16")
        self.trust_remote_code = kwargs.get("trust_remote_code", False)
        self.lora_mode = kwargs.get("lora_mode", "preload")
        self.runtime_lora_enabled = self.lora_mode == "runtime"

        # If preloading: build CLI args (adapter name -> path)
        self._preload_lora_args = []
        if self.adapter_paths and self.lora_mode == "preload":
            for aid, pth in self.adapter_paths.items():
                self._preload_lora_args += ["--lora-modules", f"{aid}={pth}"]

    def launch(self):
        """
        Start the vLLM server and block until it answers.

        Raises:
            RuntimeError: If the server exits or is not ready in time.
        """
        cmd = [
            "python3",
            "-m",
            "vllm.entrypoints.openai.api_server",
            "--model",
            self.model_name,
            "--host",
            self.host,
            "--port",
            str(self.port),
            "--dtype",
            self.dtype,
            "--gpu-memory-utilization",
            str(self.gpu_mem_util),
        ]
        if self.trust_remote_code:
            cmd += ["--trust-remote-code"]
        if self.max_model_len:
            cmd += ["--max-model-len", str(self.max_model_len)]
        if self.max_num_seqs:
            cmd += ["--max-num-seqs", str(self.max_num_seqs)]
        if self.max_batched_tokens:
            cmd += ["--max-num-batched-tokens", str(self.max_batched_tokens)]
        if self.adapter_paths:
            # LoRA serving has to be switched on for adapters to be usable.
            cmd += ["--enable-lora"]
        cmd += self._preload_lora_args

        env = dict(os.environ)
        if self.runtime_lora_enabled:
            # vLLM only exposes the runtime LoRA endpoints when this is set.
            env["VLLM_ALLOW_RUNTIME_LORA_UPDATING"] = "True"

        # The server inherits this process's stdout/stderr. Piping its output
        # without ever reading it would block the server once the pipe fills.
        self.proc = subprocess.Popen(cmd, env=env)
        self._wait_ready()

    def _wait_ready(self, timeout=120):
        """Poll ``/v1/models`` until it returns 200, the server dies, or time runs out."""
        url = f"{self.base_url}/v1/models"
        deadline = time.monotonic() + timeout
        while time.monotonic() < deadline:
            if self.proc is not None and self.proc.poll() is not None:
                # No point waiting out the timeout for a process that is gone.
                raise RuntimeError(
                    f"vLLM server exited with code {self.proc.returncode} before becoming ready"
                )
            try:
                if requests.get(url, timeout=2).status_code == 200:
                    return
            except requests.RequestException:
                pass
            time.sleep(1)
        raise RuntimeError("vLLM server did not become ready in time")

    def is_ready(self) -> bool:
        """Return True when ``/v1/models`` answers with HTTP 200."""
        try:
            return (
                requests.get(f"{self.base_url}/v1/models", timeout=2).status_code == 200
            )
        except requests.RequestException:
            return False

    def prepare_adapter(
        self, adapter_id: str, weights_got_updated: bool = False
    ) -> None:
        """
        Load ``adapter_id`` through the runtime LoRA endpoint (runtime mode only).

        Args:
            adapter_id: Adapter to make available; falsy means the base model.
            weights_got_updated: When True, the adapter is unloaded first so
                the server re-reads its weights from disk.
        """
        if not adapter_id or not self.runtime_lora_enabled:
            return
        try:
            if weights_got_updated:
                requests.post(
                    f"{self.base_url}/v1/unload_lora_adapter",
                    json={"lora_name": adapter_id},
                    timeout=10,
                )
            requests.post(
                f"{self.base_url}/v1/load_lora_adapter",
                json={
                    "lora_name": adapter_id,
                    "lora_path": self.adapter_paths[adapter_id],
                },
                timeout=10,
            ).raise_for_status()
        except requests.RequestException:
            # Kept best-effort as before: the adapter may already be loaded,
            # or the endpoint may be missing in this vLLM build.
            pass

    async def generate(
        self, prompt_text: str, sampling_params: dict, adapter_id: str | None
    ) -> str:
        """
        Generate one chat completion for ``prompt_text`` sent as a user message.

        Args:
            prompt_text: Content of the single user message.
            sampling_params: May contain ``temperature``, ``top_p``,
                ``max_new_tokens``, ``top_k``, ``min_new_tokens`` and
                ``frequency_penalty``; missing keys fall back to defaults.
            adapter_id: LoRA adapter to use, or ``None`` for the base model.

        Returns:
            The message content of the first choice.
        """
        body = {
            # A served LoRA adapter is addressed like a model, by its name.
            "model": adapter_id if adapter_id else self.model_name,
            "messages": [{"role": "user", "content": prompt_text}],
            "temperature": sampling_params.get("temperature", 1.0),
            "top_p": sampling_params.get("top_p", 1.0),
            "max_tokens": sampling_params.get("max_new_tokens", 128),
        }
        # vLLM-specific sampling fields go directly in the JSON body;
        # "extra_body" exists only in the OpenAI Python client.
        top_k = sampling_params.get("top_k")
        if top_k is not None and top_k > 0:
            body["top_k"] = top_k
        min_new_tokens = sampling_params.get("min_new_tokens")
        if min_new_tokens is not None:
            body["min_tokens"] = min_new_tokens
        frequency_penalty = sampling_params.get("frequency_penalty")
        if frequency_penalty is not None:
            body["frequency_penalty"] = frequency_penalty

        url = f"{self.base_url}/v1/chat/completions"
        timeout = httpx.Timeout(3600.0, connect=3600.0)
        async with httpx.AsyncClient(timeout=timeout) as client:
            resp = await client.post(url, json=body)
            resp.raise_for_status()
            data = resp.json()
            return data["choices"][0]["message"]["content"]

    def toggle_training_mode(self) -> None:
        """No-op: this backend does not release server memory for training."""
        pass

    def toggle_eval_mode(self) -> None:
        """No-op counterpart of ``toggle_training_mode``."""
        pass

    def shutdown(self) -> None:
        """Stop the server process and wait for it so no zombie is left behind."""
        if not self.proc:
            return
        self.proc.terminate()
        try:
            self.proc.wait(timeout=10)
        except subprocess.TimeoutExpired:
            self.proc.kill()
            self.proc.wait()
        self.proc = None
class LargeLanguageModelOpenAI:
    """Tiny async wrapper around the OpenAI Responses API.

    Exposes the same surface as the local LLM wrappers (inference policies,
    adapter and mode hooks), with the training-related hooks implemented as
    no-ops.
    """

    def __init__(
        self,
        use_reasoning: bool,
        llm_id: str,
        model: str,
        api_key: Optional[str] = None,
        base_url: Optional[str] = None,
        timeout_s: float = 300.0,
        regex_max_attempts: int = 10,
        sampling_params: Optional[Dict[str, Any]] = None,
        output_directory: Optional[str] = None,
        sleep_between_requests: bool = False,
    ) -> None:
        """Create the client and fix the per-request parameters.

        Args:
            use_reasoning: request a reasoning summary and expect the response
                to carry a reasoning item before the message item.
            llm_id: key under which :meth:`generate` is exposed as a policy.
            model: model name sent with every request.
            api_key: API key; falls back to the ``OPENAI_API_KEY`` variable.
            base_url: optional alternative endpoint.
            timeout_s: client timeout in seconds.
            regex_max_attempts: attempts for regex-constrained generation
                (values below 1 are raised to 1).
            sampling_params: extra keyword arguments for every request.
            output_directory: accepted for interface compatibility; unused here.
            sleep_between_requests: stored; the code that used it is disabled.

        Raises:
            RuntimeError: if no API key is available.
        """
        self.llm_id = llm_id
        self.model = model
        self.use_reasoning = use_reasoning
        self.sleep_between_requests = sleep_between_requests
        key = api_key or os.getenv("OPENAI_API_KEY")
        if not key:
            raise RuntimeError(
                "Set OPENAI_API_KEY as global environment variable or pass api_key."
            )
        client_kwargs: Dict[str, Any] = {"api_key": key, "timeout": timeout_s}
        if base_url:
            client_kwargs["base_url"] = base_url
        self.client = AsyncOpenAI(**client_kwargs)

        # Own copy: tolerates ``None`` and keeps the caller's dict untouched
        # when the reasoning settings are added below.
        self.sampling_params: Dict[str, Any] = dict(sampling_params or {})
        if use_reasoning:
            self.sampling_params["reasoning"] = {
                "effort": "medium",
                "summary": "detailed",
            }
        self.regex_max_attempts = max(1, int(regex_max_attempts))

    def get_inference_policies(self) -> Dict[str, Callable]:
        """Return ``{llm_id: generate}``."""
        return {self.llm_id: self.generate}

    async def prepare_adapter_for_inference(self, *args: Any, **kwargs: Any) -> None:
        """No-op (there are no adapters); yields to the event loop once."""
        await asyncio.sleep(0)

    async def toggle_eval_mode(self, *args: Any, **kwargs: Any) -> None:
        """No-op; yields to the event loop once."""
        await asyncio.sleep(0)

    async def toggle_training_mode(self, *args: Any, **kwargs: Any) -> None:
        """No-op; yields to the event loop once."""
        await asyncio.sleep(0)

    async def export_adapters(self, *args: Any, **kwargs: Any) -> None:
        """No-op; yields to the event loop once."""
        await asyncio.sleep(0)

    async def checkpoint_all_adapters(self, *args: Any, **kwargs: Any) -> None:
        """No-op; yields to the event loop once."""
        await asyncio.sleep(0)

    async def wait_random_time(
        self, min_seconds: float = 0.0, max_seconds: float = 300
    ) -> None:
        """Sleep for a uniformly random duration in ``[min_seconds, max_seconds]``."""
        await asyncio.sleep(random.uniform(min_seconds, max_seconds))

    def extract_output_from_response(self, resp: Any) -> PolicyOutput:
        """Convert a Responses API result into a ``PolicyOutput``.

        With ``use_reasoning`` the reasoning item is read from ``output[0]``
        and the message from ``output[1]``; otherwise the message is read from
        ``output[0]``.  A response laid out differently raises
        ``IndexError``/``AttributeError``, which :meth:`generate` retries.
        """
        reasoning_content = None
        if self.use_reasoning:
            summary = resp.output[0].summary
            if summary:  # also covers a missing (None) summary
                reasoning_content = f"OpenAI Reasoning Summary: {summary[0].text}"
            content = resp.output[1].content[0].text
        else:
            content = resp.output[0].content[0].text

        return PolicyOutput(
            content=content,
            reasoning_content=reasoning_content,
        )

    # Retries are effectively unbounded and cover every exception type on
    # purpose, so that a long run outlives transient API failures.
    @backoff.on_exception(backoff.expo, Exception, max_time=10**10, max_tries=10**10)
    async def generate(
        self,
        prompt: list[dict],
        regex: Optional[str] = None,
    ) -> PolicyOutput:
        """Generate a reply to ``prompt``, optionally constrained by ``regex``.

        The constraint is enforced client-side: the model is told the pattern,
        the answer is checked with ``fullmatch`` and, on a mismatch, the
        rejected answer plus a correction request are appended and the call is
        repeated up to ``regex_max_attempts`` times.

        Returns:
            The first matching output, or the last output when every attempt
            failed to match -- callers must not assume the regex holds.
        """
        # Remove any non-role/content keys from the prompt else openai will error
        prompt = [{"role": p["role"], "content": p["content"]} for p in prompt]

        # if self.sleep_between_requests:
        #     await self.wait_random_time()

        if not regex:
            # Simple, unconstrained generation
            resp = await self.client.responses.create(
                model=self.model,
                input=prompt,
                **self.sampling_params,
            )
            return self.extract_output_from_response(resp)

        constraint_msg = {
            "role": "user",
            "content": (
                f"Output must match this regex exactly: {regex} \n"
                "Return only the matching string, with no quotes or extra text."
            ),
        }
        prompt = [constraint_msg, *prompt]
        pattern = re.compile(regex)
        for _ in range(self.regex_max_attempts):
            resp = await self.client.responses.create(
                model=self.model,
                input=prompt,
                **self.sampling_params,
            )
            policy_output = self.extract_output_from_response(resp)
            if pattern.fullmatch(policy_output.content):
                return policy_output
            prompt = [
                *prompt,
                # Show the model what it answered, so the correction request
                # below has something to refer to.
                {"role": "assistant", "content": policy_output.content},
                {
                    "role": "user",
                    "content": (
                        f"Invalid response format. Expected format (regex): {regex}\n Please try again and provide ONLY a response that matches this regex."
                    ),
                },
            ]
        return policy_output

    def shutdown(self) -> None:
        """Drop the client reference; the instance cannot generate afterwards."""
        self.client = None
hf_kwargs: dict = {}, + adapter_configs: dict = {}, + output_directory: str = "./models/", + max_thinking_characters: int = 0, + inference_backend: Literal["vllm", "sglang", "dummy"] = "vllm", + inference_backend_sampling_params: dict = {}, + inference_backend_init_kwargs: dict = {}, + initial_adapter_paths: dict[str, str] | None = None, + regex_max_attempts: int = -1, + ): + self.inference_backend_name = inference_backend + self.output_directory = output_directory + self.llm_id = llm_id + self.device = torch.device(device) if device else torch.device("cuda") + self.model_name = model_name + self.adapter_configs = adapter_configs + self.adapter_ids = list(adapter_configs.keys()) + self.enable_thinking = max_thinking_characters > 0 + self.max_thinking_characters = max_thinking_characters + self.regex_max_attempts = regex_max_attempts + + # Optional user-specified initial adapter weight locations (local or HF Hub) + # Format: {adapter_id: path_or_repo_id} + self.initial_adapter_paths: dict[str, str] | None = initial_adapter_paths + + # Path management / imports + self.save_path = str(os.path.join(output_directory, model_name, "adapters")) + self.adapter_paths = { + adapter_id: os.path.join(self.save_path, adapter_id) + for adapter_id in self.adapter_ids + } + # ID management for tracking adapter versions + self.adapter_train_ids = { + adapter_id: self.short_id_generator() for adapter_id in self.adapter_ids + } + # Initialize tokenizer + self.tokenizer = AutoTokenizer.from_pretrained(self.model_name) + # Setup padding token to be same as EOS token + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + self.tokenizer.pad_token = self.tokenizer.eos_token + + self.weights_got_updated: dict[AdapterID, bool] = { + adapter_id: False for adapter_id in self.adapter_ids + } + self.current_lora_request = None + self.currently_loaded_adapter_id = None + + # --------------------------------------------------------- + # Init HF model, peft adapters + # 
--------------------------------------------------------- + self.shared_hf_llm = AutoModelForCausalLM.from_pretrained( + pretrained_model_name_or_path=model_name, + **hf_kwargs, + ) + self.hf_adapters = {} + self.optimizers = {} + for adapter_id in self.adapter_ids: + # Prefer output-folder path if it exists; else fall back to user-specified initial path if provided + output_path = os.path.join(self.save_path, adapter_id) + chosen_path: str | None = None + if os.path.isdir(output_path) and os.listdir(output_path): + chosen_path = output_path + logger.info( + f"Initializing adapter '{adapter_id}': using existing weights from output folder '{chosen_path}'." + ) + elif ( + self.initial_adapter_paths and adapter_id in self.initial_adapter_paths + ): + chosen_path = self.initial_adapter_paths[adapter_id] + logger.info( + f"Initializing adapter '{adapter_id}': using provided initial path '{chosen_path}'." + ) + else: + logger.info( + f"Initializing adapter '{adapter_id}': no initial weights provided or found; starting from scratch." 
+ ) + + hf_adapter = AdapterWrapper( + shared_llm=self.shared_hf_llm, + adapter_id=adapter_id, + lora_config=adapter_configs[adapter_id], + path=chosen_path, + ).to(device) + self.hf_adapters[adapter_id] = hf_adapter + # Persist current state of all adapters (ensures remote loads are cached to disk) + self.export_adapters() + + # --------------------------------------------------------- + # Init inference inference_backend + # --------------------------------------------------------- + + if inference_backend == "sglang": + self.inference_backend = SGLangOfflineBackend( + model_name=self.model_name, + save_path=self.save_path, + adapter_paths=self.adapter_paths, + tokenizer=self.tokenizer, + kwargs=inference_backend_init_kwargs, + ) + elif inference_backend == "vllm": + self.inference_backend = VLLMAsyncBackend( + model_name=self.model_name, + adapter_paths=self.adapter_paths, + tokenizer=self.tokenizer, + engine_init_kwargs=inference_backend_init_kwargs, + sampling_params=inference_backend_sampling_params, + ) + elif inference_backend == "dummy": + self.inference_backend = DummyInferenceBackend() + else: + raise ValueError(f"Unknown inference_backend: {inference_backend}") + + def get_inference_policies(self) -> dict[PolicyID, Callable]: + """ + TOWRITE + """ + policies = {} + for adapter_id in self.adapter_ids: + # define policy func + async def policy( + prompt: list[dict], regex: str | None = None, _adapter_id=adapter_id + ): + self.prepare_adapter_for_inference(adapter_id=_adapter_id) + response = await self.generate(prompt, regex) + return response + + policies[self.llm_id + "/" + adapter_id] = policy + return policies + + def get_adapter_modules(self) -> dict[PolicyID, nn.Module]: + """ + Returns wrappers over the adapters which allows them be + interfaced like regular PyTorch models. 
+ # TODO: create the adapter wrappers here + See adapter_wrapper.py + """ + trainable_objects = {an: self.hf_adapters[an] for an in self.adapter_ids} + return trainable_objects + + async def toggle_training_mode(self) -> None: + for adn in self.adapter_ids: + self.adapter_train_ids[adn] = self.short_id_generator() + await self.inference_backend.toggle_training_mode() + + async def toggle_eval_mode(self) -> None: + await self.inference_backend.toggle_eval_mode() + + def prepare_adapter_for_inference(self, adapter_id: AdapterID) -> None: + self.inference_backend.prepare_adapter( + adapter_id, weights_got_updated=self.weights_got_updated[adapter_id] + ) + self.currently_loaded_adapter_id = adapter_id + self.weights_got_updated[adapter_id] = False + + async def generate( + self, prompt: list[dict], regex: str | None = None + ) -> PolicyOutput: + """ + TOWRITE + """ + + if self.enable_thinking: + # REQUIRED ... block with exact line breaks: + # \n...\n\n + # No extra characters allowed between \n\n and the core output. + think_block = ( + rf"\n\n[\s\S]{{1,{self.max_thinking_characters}}}\n\n" + ) + + if regex: + core = regex + if core.startswith("^"): + core = core[1:] + if core.endswith("$"): + core = core[:-1] + # Require think_block, then the core, across the entire string. + regex = rf"^{think_block}(?:{core})$" + else: + # Require think_block, then capture the entire final output. 
def export_adapters(self) -> None:
    """Save the adapters to ``self.save_path`` and flag them all as updated.

    Only the first adapter wrapper is asked to save: any peft wrapper, by
    default, saves all adapters, not just the one currently loaded.  With no
    adapters configured there is nothing to save and the call returns quietly
    (the constructor calls this method unconditionally).
    """
    # New version of the adapters available
    for adapter_id in self.adapter_ids:
        self.weights_got_updated[adapter_id] = True

    if not self.adapter_ids:
        return
    self.hf_adapters[self.adapter_ids[0]].save_pretrained(self.save_path)


def checkpoint_all_adapters(self, checkpoint_indicator: str) -> None:
    """Checkpoint all adapters under ``<output_directory>/checkpoints``.

    The checkpoint folder is named
    ``<first adapter id>-<checkpoint_indicator>-<timestamp>``.  Does nothing
    when no adapters are configured.
    """
    if not self.adapter_ids:
        return
    first_adapter_id = self.adapter_ids[0]
    output_dir = os.path.join(self.output_directory, "checkpoints")
    os.makedirs(output_dir, exist_ok=True)
    date_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
    export_path = os.path.join(
        output_dir, f"{first_adapter_id}-{checkpoint_indicator}-{date_str}"
    )
    for adapter_id in self.adapter_ids:
        self.hf_adapters[adapter_id].save_pretrained(export_path)


def short_id_generator(self) -> str:
    """Generate a short random ID for tracking adapter versions.

    Returns:
        The first eight decimal digits of a random UUID's integer value, as a
        string.
    """
    return str(uuid.uuid4().int)[:8]
+ """ + def __init__(self, critic_adapter: AdapterWrapper): + super().__init__() + self.critic_adapter = critic_adapter + hidden_size = self.critic_adapter.shared_llm.config.hidden_size + self.value_head = nn.Linear(hidden_size, 1).to( + dtype=critic_adapter.dtype, + device=critic_adapter.device) + + def forward(self, + input_ids, + attention_mask=None, + **kwargs): + # AdapterWrapper activates its own adapter internally + outputs = self.critic_adapter( + input_ids=input_ids, + attention_mask=attention_mask, + output_hidden_states=True, + **kwargs, + ) + h_last = outputs.hidden_states[-1] # (B, S, H) + values = self.value_head(h_last).squeeze(-1) # (B, S) + return values + + def parameters(self, recurse: bool = True): + """Iterator over *trainable* parameters for this critic.""" + # 1) LoRA params for *this* adapter + for p in self.critic_adapter.parameters(): + yield p + # 2) scalar head + yield from self.value_head.parameters() + + def gradient_checkpointing_enable(self, *args, **kwargs): + self.critic_adapter.gradient_checkpointing_enable(*args, **kwargs) + + @property + def dtype(self): + return self.critic_adapter.dtype + + @property + def device(self): + return self.critic_adapter.device diff --git a/src_code_for_reproducibility/training/README.md b/src_code_for_reproducibility/training/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e1d56e2e3d2fb427f8da2488de4dd9de89650ca2 --- /dev/null +++ b/src_code_for_reproducibility/training/README.md @@ -0,0 +1,20 @@ +Suppose we have a trajectory with 3 timesteps. +token: "0 1 2 3 4 5 6 7 8 9 . . . . ." +string: "A B C a b c A a A a b c A B C" (Capitalized = User, Lowercased = Assistant) +action_mask: "x x x ✓ ✓ ✓ x ✓ x ✓ ✓ ✓ x x x" (F = False, T = True) +rewards: "r r r r r r R R R R R R r r r" +timestep: "0 0 0 0 0 0 1 1 1 1 1 1 2 2 2" +state_ends: "x x ✓ x x x ✓ x x x x x x x ✓" + +There must be one baseline flag per timestep! 
+ +Then, we might have + +A naive way to interpret this is to think of the number of assistant messages as the number of +steps in the environment. However, this is not the case in practice. Indeed, in a +single simulation step, + + + + +A subtlety arises with credit assignment. In the multi-agent case, we might diff --git a/src_code_for_reproducibility/training/__init__.py b/src_code_for_reproducibility/training/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src_code_for_reproducibility/training/__pycache__/__init__.cpython-311.pyc b/src_code_for_reproducibility/training/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..81d53ed41fbda4e4665b9040133ec2a85286dcf3 Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/__init__.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/credit_methods.cpython-311.pyc b/src_code_for_reproducibility/training/__pycache__/credit_methods.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b149c1507abfaf3f964696e7b42d1ff0bc81a2a6 Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/credit_methods.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/produce_training_stats.cpython-311.pyc b/src_code_for_reproducibility/training/__pycache__/produce_training_stats.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31878bc1116e0dc6297f39be0464a9f44838bc1e Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/produce_training_stats.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/tally_basic.cpython-311.pyc b/src_code_for_reproducibility/training/__pycache__/tally_basic.cpython-311.pyc new file mode 100644 index 
0000000000000000000000000000000000000000..7080b6dc500a674bd2e6cfd2fdf6da8a90a62d6c Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/tally_basic.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/tally_tokenwise.cpython-311.pyc b/src_code_for_reproducibility/training/__pycache__/tally_tokenwise.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4477776ad8d5dea721dadf1f31811a2eaad39455 Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/tally_tokenwise.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/tokenize_chats.cpython-311.pyc b/src_code_for_reproducibility/training/__pycache__/tokenize_chats.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..df71bcfd88cf7a5bf6fbdbf6c45c2ae5a86e2915 Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/tokenize_chats.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/trainer_ad_align.cpython-311.pyc b/src_code_for_reproducibility/training/__pycache__/trainer_ad_align.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b1d0888f1e007b0117629841ac3e3a1a346129da Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/trainer_ad_align.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/trainer_common.cpython-311.pyc b/src_code_for_reproducibility/training/__pycache__/trainer_common.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3a5311ebe8cfd0f97be8a3781e9d17fc4d8c7793 Binary files /dev/null and b/src_code_for_reproducibility/training/__pycache__/trainer_common.cpython-311.pyc differ diff --git a/src_code_for_reproducibility/training/__pycache__/trainer_independent.cpython-311.pyc 
def _record(tally, name: str, value) -> None:
    """Log ``value`` under ``[name]`` when a tally was supplied."""
    if tally is not None:
        tally.add_metric(path=[name], metric=value)


def whiten_advantages(
    advantages: torch.Tensor, tally: "Tally | None" = None
) -> torch.Tensor:
    """Whiten the advantages to zero mean and unit standard deviation.

    Statistics are taken over all elements.  A single-element input has no
    spread and is mapped to zero rather than NaN.

    Args:
        advantages: tensor of any shape.
        tally: optional metric collector; nothing is logged when ``None``.
    """
    spread = (
        torch.std(advantages) if advantages.numel() > 1 else advantages.new_zeros(())
    )
    # Centre first, then scale; the parentheses matter.
    whitened_advantages = (advantages - torch.mean(advantages)) / (spread + 1e-9)
    _record(tally, "whitened_advantages", whitened_advantages)
    return whitened_advantages


def whiten_advantages_time_step_wise(
    advantages: torch.Tensor,  # (B, T)
    tally: "Tally | None" = None,
) -> torch.Tensor:
    """Whiten the advantages separately for each time step.

    Mean and standard deviation are taken over the batch dimension, so every
    column of the ``(B, T)`` input is standardised on its own.  With a batch
    of one the result is zero rather than NaN.
    """
    assert advantages.dim() == 2, "Wrong dimensions."
    center = advantages.mean(dim=0, keepdim=True)
    if advantages.shape[0] > 1:
        spread = advantages.std(dim=0, keepdim=True)
    else:
        spread = torch.zeros_like(center)
    whitened_advantages_time_step_wise = (advantages - center) / (spread + 1e-9)
    _record(
        tally,
        "whitened_advantages_time_step_wise",
        whitened_advantages_time_step_wise,
    )
    return whitened_advantages_time_step_wise


def get_discounted_state_visitation_credits(
    credits: torch.Tensor,  # (B, T)
    discount_factor: float,
    tally: "Tally | None" = None,
) -> torch.Tensor:
    """Scale the credit at time step ``t`` by ``discount_factor ** t``."""
    discounted_state_visitation_credits = credits * (
        discount_factor ** torch.arange(credits.shape[1], device=credits.device)
    )
    # Logged before returning; placed after the return it would never run.
    _record(
        tally,
        "discounted_state_visitation_credits",
        discounted_state_visitation_credits,
    )
    return discounted_state_visitation_credits


def get_discounted_returns(
    rewards: torch.Tensor,  # (B, T)
    discount_factor: float,
    reward_normalizing_constant: float,
    tally: "Tally | None" = None,
) -> torch.Tensor:
    """Compute Monte Carlo discounted returns.

    Args:
        rewards: per-step rewards of shape ``(B, T)``.
        discount_factor: multiplier applied to the future return at each step.
        reward_normalizing_constant: rewards are divided by this value first.
        tally: optional metric collector.

    Returns:
        Tensor of shape ``(B, T)`` with
        ``G_t = r_t + discount_factor * G_{t+1}`` on the normalised rewards.
    """
    assert rewards.dim() == 2, "Wrong dimensions."
    rewards = rewards / reward_normalizing_constant
    B, T = rewards.shape
    discounted_returns = torch.zeros_like(rewards)
    accumulator = torch.zeros(B, device=rewards.device, dtype=rewards.dtype)
    # Walk backwards so each step reuses the return of the step after it.
    for t in reversed(range(T)):
        accumulator = rewards[:, t] + discount_factor * accumulator
        discounted_returns[:, t] = accumulator
    _record(tally, "monte_carlo_discounted_returns", discounted_returns)
    return discounted_returns


def get_rloo_credits(credits: torch.Tensor, tally: "Tally | None" = None):  # (B, S)
    """Subtract a leave-one-out baseline computed over the batch dimension.

    The baseline of each row is the mean of all *other* rows at the same
    position.

    Returns:
        ``(rloo_credits, rloo_baselines)``, both shaped like ``credits``.  A
        batch of one has no other rows: the credits are returned unchanged
        with a zero baseline, and nothing is logged.
    """
    assert credits.dim() == 2, "Wrong dimensions."
    rloo_baselines = torch.zeros_like(credits)
    n = credits.shape[0]
    if n == 1:
        return credits, rloo_baselines
    rloo_baselines = (torch.sum(credits, dim=0, keepdim=True) - credits) / (n - 1)
    rloo_credits = credits - rloo_baselines
    _record(tally, "rloo_credits", rloo_credits)
    _record(tally, "rloo_baselines", rloo_baselines)
    return rloo_credits, rloo_baselines


def get_generalized_advantage_estimates(
    rewards: torch.Tensor,  # (B, T)
    value_estimates: torch.Tensor,  # (B, T+1)
    discount_factor: float,
    lambda_coef: float,
) -> torch.Tensor:
    """Compute Generalized Advantage Estimates (GAE).

    See https://arxiv.org/pdf/1506.02438 for details.

    Args:
        rewards: per-step rewards of shape ``(B, T)``.
        value_estimates: value estimates of shape ``(B, T + 1)``; the extra
            column is the value after the final step.
        discount_factor: discount applied to the next value estimate.
        lambda_coef: GAE mixing coefficient.

    Returns:
        Tensor of shape ``(B, T)`` with the GAE values.
    """
    assert rewards.dim() == value_estimates.dim() == 2, "Wrong dimensions."

    assert (
        rewards.shape[0] == value_estimates.shape[0]
    ), f"Got shapes {rewards.shape} and {value_estimates.shape} of rewards and value estimates."
    assert (
        rewards.shape[1] == value_estimates.shape[1] - 1
    ), f"Got shapes {rewards.shape} and {value_estimates.shape} of rewards and value estimates."

    T = rewards.shape[1]
    # One-step temporal-difference residuals.
    tds = rewards + discount_factor * value_estimates[:, 1:] - value_estimates[:, :-1]
    gaes = torch.zeros_like(tds)
    acc = 0.0
    for t in reversed(range(T)):
        acc = tds[:, t] + lambda_coef * discount_factor * acc
        gaes[:, t] = acc
    return gaes


def get_advantage_alignment_weights(
    advantages: torch.Tensor,  # (B, T)
    exclude_k_equals_t: bool,
    gamma: float,
) -> torch.Tensor:
    r"""Compute the discounted running sum of past advantages.

    The advantage alignment credit is

    \[
    A^*(s_t, a_t, b_t) = A^1(s_t, a_t, b_t) + \beta \cdot
    \left( \sum_{k < t} \gamma^{t-k} A^1(s_k, a_k, b_k) \right)
    A^2(s_t, a_t, b_t)
    \]

    This function returns only the inner sum,
    \( w_t = \sum_{k} \gamma^{t-k} A^1(s_k, a_k, b_k) \), taken over
    \( k < t \) when ``exclude_k_equals_t`` is true and over \( k \leq t \)
    otherwise.  The factor \( \beta \) is not applied here.

    Args:
        advantages: tensor of shape ``(B, T)``.
        exclude_k_equals_t: drop the ``k == t`` term from the sum.
        gamma: discount applied per step of lag.

    Returns:
        Tensor of shape ``(B, T)`` with the weights ``w_t``.
    """
    T = advantages.shape[1]
    steps = torch.arange(T, device=advantages.device)
    lag = steps.unsqueeze(0) - steps.unsqueeze(1)  # lag[k, t] = t - k
    contributes = lag > 0 if exclude_k_equals_t else lag >= 0

    # Only non-negative powers of gamma are ever formed.  Multiplying by
    # gamma**(-k) and then by gamma**t gives the same sum on paper, but the
    # first factor overflows for long horizons.
    compute_dtype = torch.promote_types(advantages.dtype, torch.float32)
    exponents = lag.clamp(min=0).to(compute_dtype)
    decay = torch.where(contributes, gamma**exponents, torch.zeros_like(exponents))
    return advantages @ decay.to(advantages.dtype)
= False, + whiten_adalign_advantages_time_step_wise: bool = False, + tally: Tally = Tally(), +) -> torch.Tensor: + """ + Calculate the advantage alignment credits with vectorization, as described in https://arxiv.org/abs/2406.14662. + + Recall that the advantage opponent shaping term of the AdAlign policy gradient is: + \[ + \beta \mathbb{E}_{\substack{ + \tau \sim \text{Pr}_{\mu}^{\pi^1, \pi^2} \\ + a_t' \sim \pi^1(\cdot \mid s_t) + }} + \left[\sum_{t=0}^\infty \gamma^{t}\left( \sum_{k\leq t} A^1(s_k,a^{\prime}_k,b_k) \right) A^{2}(s_t,a_t, b_t)\nabla_{\theta^1}\text{log } \pi^1(a_t|s_t) \right] + \] + + This method computes the following: + \[ + Credit(s_t, a_t, b_t) = \gamma^t \left[ A^1(s_t, a_t, b_t) + \beta \left( \sum_{k\leq t} A^1(s_k,a^{\prime}_k,b_k) \right) A^{2}(s_t,a_t, b_t) \right] + \] + + Args: + a1: Advantages of the main trajectories for the current agent. + a1_alternative: Advantages of the alternative trajectories for the current agent. + a2: Advantages of the main trajectories for the other agent. + discount_factor: Discount factor for the advantage alignment. + beta: Beta parameter for the advantage alignment. + gamma: Gamma parameter for the advantage alignment. + use_sign_in_ad_align: Whether to use sign in the advantage alignment. + + Returns: + torch.Tensor: The advantage alignment credits. 
+ """ + assert a1.dim() == a2.dim() == 2, "Advantages must be of shape (B, S)" + if a1_alternative is not None: + assert ( + a1_alternative.dim() == 3 + ), "Alternative advantages must be of shape (B, S, A)" + B, T, A = a1_alternative.shape + assert a1.shape == a2.shape, "Not the same shape" + + tally.add_metric(path=["regular_advantages"], metric=a1) + tally.add_metric(path=["regular_advantages_other"], metric=a2) + if a1_alternative is not None: + tally.add_metric(path=["alternative_advantages"], metric=a1_alternative) + + if use_old_ad_align: + ad_align_weights = get_advantage_alignment_weights( + advantages=a1, exclude_k_equals_t=exclude_k_equals_t, gamma=gamma + ) + if exclude_k_equals_t: + ad_align_weights = gamma * ad_align_weights + else: + assert a1_alternative is not None, "Alternative advantages must be provided" + if rloo_branch: + a1_alternative = torch.cat([a1.unsqueeze(2), a1_alternative], dim=2) + a1_alternative = a1_alternative.mean(dim=2) + # print(f"a1_alternative: {a1_alternative}, a1: {a1}\n") + a1, baseline = get_rloo_credits(a1) + if reuse_baseline: + a1_alternative = a1_alternative - baseline + else: + a1_alternative, _ = get_rloo_credits(a1_alternative) + assert a1.shape == a1_alternative.shape, "Not the same shape" + ad_align_weights = get_advantage_alignment_weights( + advantages=a1_alternative, + exclude_k_equals_t=exclude_k_equals_t, + gamma=gamma, + ) + + # Log raw weights before further processing + tally.add_metric(path=["raw_advantage_alignment_weights"], metric=ad_align_weights) + + # Use sign + if use_sign: + assert beta == 1.0, "beta should be 1.0 when using sign" + positive_signs = ad_align_weights > 0 + negative_signs = ad_align_weights < 0 + ad_align_weights[positive_signs] = 1 + ad_align_weights[negative_signs] = -1 + tally.add_metric( + path=["ad_align_weights_ratio_positive_signs"], + metric=positive_signs.sum() / ad_align_weights.size, + ) + tally.add_metric( + path=["ad_align_weights_ratio_negative_signs"], + 
metric=negative_signs.sum() / ad_align_weights.size, + ) + # (rest are 0) + + tally.add_metric( + path=["ad_align_weights_after_using_sign"], metric=ad_align_weights + ) + + ################### + # Process weights + ################### + + # Use clipping + if clipping not in [0.0, None]: + upper_mask = ad_align_weights > 1 + lower_mask = ad_align_weights < -1 + + ad_align_weights = torch.clip( + ad_align_weights, + -clipping, + clipping, + ) + clipping_ratio = ( + torch.sum(upper_mask) + torch.sum(lower_mask) + ) / upper_mask.size + + tally.add_metric(path=["ad_align_clipping_ratio"], metric=clipping_ratio) + + tally.add_metric( + path=["ad_align_weights_after_clipping"], metric=ad_align_weights + ) + + # 1/1+t Regularization + if use_time_regularization: + t_values = torch.arange(1, T + 1) + ad_align_weights = ad_align_weights / t_values + tally.add_metric( + path=["ad_align_weights_after_1_over_t_reg"], metric=ad_align_weights + ) + + # Use coop on t=0 + if force_coop_first_step: + ad_align_weights[:, 0] = 1 + tally.add_metric( + path=["ad_align_weights_after_force_coop_first_step"], + metric=ad_align_weights, + ) + + # # Normalize alignment terms (across same time step) + # if use_variance_regularization_in_ad_align: + # # TODO: verify + # reg_coef = torch.std(a1[:, -1]) / (torch.std(opp_shaping_terms[:, -1]) + 1e-9) + # opp_shaping_terms *= reg_coef + # tally.add_metric( + # path=["opp_shaping_terms_after_var_reg"], metric=opp_shaping_terms + # ) + + #################################### + # Compose elements together + #################################### + + opp_shaping_terms = beta * ad_align_weights * a2 + + tally.add_metric(path=["ad_align_opp_shaping_terms"], metric=opp_shaping_terms) + + credits = a1 + opp_shaping_terms + + if mean_normalize_ad_align: + tally.add_metric(path=["ad_align_credits_before_mean"], metric=credits) + credits = credits - credits.mean(dim=0) + if whiten_adalign_advantages: + tally.add_metric(path=["ad_align_credits_before_whiten"], 
metric=credits) + credits = (credits - credits.mean()) / (credits.std() + 1e-9) + if whiten_adalign_advantages_time_step_wise: + tally.add_metric( + path=["ad_align_credits_before_whiten_time_step_wise"], metric=credits + ) + credits = (credits - credits.mean(dim=0, keepdim=True)) / ( + credits.std(dim=0, keepdim=True) + 1e-9 + ) + + tally.add_metric(path=["final_advantage_alignment_credits"], metric=credits) + + return credits diff --git a/src_code_for_reproducibility/training/produce_training_stats.py b/src_code_for_reproducibility/training/produce_training_stats.py new file mode 100644 index 0000000000000000000000000000000000000000..9599ccf784f6cb6cbec49babf109dedb3c981861 --- /dev/null +++ b/src_code_for_reproducibility/training/produce_training_stats.py @@ -0,0 +1,277 @@ +import copy +import gc +import json +import logging +import os +import pickle +import random +import re +import subprocess +import sys +import time +from datetime import datetime +from statistics import mean + +import hydra +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +from omegaconf import OmegaConf + + +def get_from_nested_dict(dictio: dict, path: list[str]): + for sp in path[:-1]: + dictio = dictio[sp] + return dictio.get(path[-1]) + + +def set_at_path(dictio: dict, path: list[str], value): + for sp in path[:-1]: + if sp not in dictio: + dictio[sp] = {} + dictio = dictio[sp] + dictio[path[-1]] = value + + +def produce_tabular_render(inpath: str, outpath: str = None): + """ + TODO: docstring + """ + with open(inpath, "r") as f: + data = json.load(f) + rollout_paths = data.keys() + for rollout_path in rollout_paths: + if outpath is None: + m_path = rollout_path.replace("/", "|") + m_path = m_path.replace(".json", "") + m_path = ( + os.path.split(inpath)[0] + + "/contextualized_tabular_renders/" + + m_path + + "_tabular_render.render.csv" + ) + # import pdb; pdb.set_trace() + os.makedirs(os.path.split(m_path)[0], exist_ok=True) + metrics = data[rollout_path] + d = 
{k: [] for k in metrics[0].keys()} + for m in metrics: + for k, v in m.items(): + d[k].append(v) + d = pd.DataFrame(d) + d.to_csv(m_path) + + +def get_metric_paths(data: list[dict]): + d = data[0] + paths = [] + + def traverse_dict(d, current_path=[]): + for key, value in d.items(): + new_path = current_path + [key] + if isinstance(value, dict): + traverse_dict(value, new_path) + else: + paths.append(new_path) + + traverse_dict(d) + return paths + + +def print_metric_paths(data: list[dict]): + paths = get_metric_paths(data) + for p in paths: + print(p) + + +def get_metric_iteration_list(data: list[dict], metric_path: list[str]): + if isinstance(metric_path, str): + metric_path = [metric_path] + sgl = [] + for d in data: + sgl.append(get_from_nested_dict(d, metric_path)) + return sgl + + +def to_1d_numeric(x): + """Return a 1-D float array (or None if not numeric). Accepts scalars, numpy arrays, or nested list/tuple of them.""" + if x is None: + return None + if isinstance(x, (int, float, np.number)): + return np.array([float(x)], dtype=float) + if isinstance(x, np.ndarray): + try: + return x.astype(float).ravel() + except Exception: + return None + if isinstance(x, (list, tuple)): + parts = [] + for e in x: + arr = to_1d_numeric(e) + if arr is not None and arr.size > 0: + parts.append(arr) + if parts: + return np.concatenate(parts) + return None + return None + + +def get_single_metric_vector(data, metric_path, iterations=None): + if isinstance(metric_path, str): + metric_path = [metric_path] + if iterations == None: + iterations = len(data) + vecs = [] + for d in data: + ar = get_from_nested_dict(d, metric_path) + arr = to_1d_numeric(ar) + if arr is not None: + vecs.append(arr) + + return np.concatenate(vecs) if vecs else np.empty(0, dtype=float) + + +def _load_metrics_file(file_path: str): + if not (file_path.endswith(".tally.pkl") or file_path.endswith(".pkl")): + raise ValueError("Only *.tally.pkl files are supported.") + import pickle + + with open(file_path, 
"rb") as f: + tree = pickle.load(f) + return tree + + +def get_iterations_data(iterations_path: str): + iterations_data = [] + more_iterations = True + n = 0 + iteration_path = os.path.join(iterations_path, f"iteration_{n:03d}") + while more_iterations: + if os.path.isdir(iteration_path): + for root, dirs, files in os.walk(iteration_path): + for file in sorted([f for f in files if f.endswith(".tally.pkl")]): + file_path = os.path.join(root, file) + iterations_data.append(_load_metrics_file(file_path)) + else: + more_iterations = False + n += 1 + iteration_path = os.path.join(iterations_path, f"iteration_{n:03d}") + return iterations_data + + +def _traverse_array_tally(array_tally: dict, prefix: list[str] = None): + if prefix is None: + prefix = [] + for key, value in array_tally.items(): + next_prefix = prefix + [str(key)] + if isinstance(value, dict): + yield from _traverse_array_tally(value, next_prefix) + else: + yield next_prefix, value + + +def _sanitize_filename_part(part: str) -> str: + s = part.replace("/", "|") + s = s.replace(" ", "_") + return s + + +def render_tally_pkl_to_csvs(pkl_path: str, outdir: str): + with open(pkl_path, "rb") as f: + payload = pickle.load(f) + # Backward compatibility: older tallies stored the dict directly + if isinstance(payload, dict) and "array_tally" in payload: + array_tally = payload.get("array_tally", {}) + rowmeta = payload.get("rowmeta", {}) + row_ids = payload.get("row_ids", []) + else: + array_tally = payload + rowmeta = {} + row_ids = [] + os.makedirs(outdir, exist_ok=True) + trainer_id = os.path.basename(pkl_path).replace(".tally.pkl", "") + for path_list, array_list in _traverse_array_tally(array_tally): + # Build datapoints by expanding element-wise: vectors → 1 row, matrices → per-row + datapoints = [] + for item in array_list: + # Normalize item + if isinstance(item, (int, float)): + arr = np.asarray([item]) + else: + try: + arr = np.asarray(item) + except Exception: + arr = np.array([item], dtype=object) + + # 
If object array (ragged), iterate elements directly + if isinstance(arr, np.ndarray) and arr.dtype == object: + for sub in arr: + sub_arr = np.asarray(sub) + if sub_arr.ndim <= 1: + datapoints.append(sub_arr.reshape(-1)) + else: + # Use first axis as rows, flatten the rest + for i in range(sub_arr.shape[0]): + datapoints.append(sub_arr[i].reshape(-1)) + continue + + # Numeric arrays + if arr.ndim == 0: + datapoints.append(arr.reshape(1)) + elif arr.ndim == 1: + datapoints.append(arr) + else: + for i in range(arr.shape[0]): + datapoints.append(arr[i].reshape(-1)) + # Build filename + path_part = ".".join(_sanitize_filename_part(p) for p in path_list) + filename = f"{trainer_id}__{path_part}.render.csv" + out_path = os.path.join(outdir, filename) + # Determine alignment with global row_ids + aligned_with_ids = len(row_ids) > 0 and len(datapoints) == len(row_ids) + # Write CSV + with open(out_path, "w", newline="") as f: + import csv + + writer = csv.writer(f) + # Determine max length after expansion for header + max_len = 0 + for r in datapoints: + max_len = max(max_len, int(np.asarray(r).size)) + # Header: include id columns only if aligned (agent_id, crn_id, rollout_id) + header = ( + ["agent_id", "crn_id", "rollout_id"] if aligned_with_ids else [] + ) + [f"c{j}" for j in range(max_len)] + if header: + writer.writerow(header) + for i, r in enumerate(datapoints): + r_arr = np.asarray(r) + if r_arr.size < max_len: + pad = np.empty((max_len - r_arr.size,), dtype=r_arr.dtype) + if pad.dtype == object: + pad[:] = "" + else: + pad[:] = np.nan + r_arr = np.concatenate([r_arr, pad]) + row_vals = [ + x if not isinstance(x, (np.floating, np.integer)) else x.item() + for x in r_arr + ] + if aligned_with_ids: + row_prefix = [ + row_ids[i].get("agent_id", ""), + row_ids[i].get("crn_id", ""), + row_ids[i].get("rollout_id", ""), + ] + writer.writerow(row_prefix + row_vals) + else: + writer.writerow(row_vals) + + +def render_iteration_trainer_stats(iteration_dir: str, outdir: str 
class Tally:
    """
    Tally is a utility class for collecting and storing training metrics.
    It supports adding metrics at specified paths and saving them to disk.
    """

    def __init__(self):
        """Create an empty tally."""
        # Array-preserving structure (leaf lists hold numpy arrays / scalars)
        self.array_tally = {}
        # Global ordered list of sample identifiers (crn_id, rollout_id) added in the order samples are processed
        self.sample_row_ids = []

    def reset(self):
        """Drop all collected metrics and row ids."""
        self.array_tally = {}
        self.sample_row_ids = []

    def get_from_nested_dict(self, dictio: dict, path: list):
        """
        Retrieves the value at a nested path in a dictionary.

        The dictionary is not modified: a missing intermediate key yields None.

        Args:
            dictio (dict): The dictionary to search.
            path (list): List of keys representing the path.

        Returns:
            Any: The value at the specified path, or None if not found.
        """
        assert isinstance(path, list), "Path must be list."
        node = dictio
        for key in path[:-1]:
            node = node.get(key)
            if not isinstance(node, dict):
                return None
        return node.get(path[-1], None)

    def set_at_path(self, dictio: dict, path: list, value):
        """
        Sets a value at a nested path in a dictionary, creating intermediate dictionaries as needed.

        Args:
            dictio (dict): The dictionary to modify.
            path (list): List of keys representing the path.
            value (Any): The value to set at the specified path.
        """
        for sp in path[:-1]:
            dictio = dictio.setdefault(sp, {})
        dictio[path[-1]] = value

    def add_metric(
        self,
        path: list,
        metric: Union[float, int, np.number, np.ndarray, torch.Tensor, list],
    ):
        """
        Appends a metric to the list stored at ``path``.

        The stored value is always an independent numpy array, so later
        in-place changes to ``metric`` do not alter what was logged.

        Args:
            path (list): List of keys representing the path in the tally.
            metric: Number, numpy array, tensor or (possibly ragged) list.
                Tensors with more than zero dimensions are stored as float32.

        Raises:
            TypeError: If ``metric`` is of an unsupported type (e.g. a string).
        """
        if isinstance(metric, torch.Tensor):
            if metric.dim() == 0:
                array_metric = np.asarray(metric.item())
            else:
                # detach() first: deep-copying a non-leaf autograd tensor
                # raises. copy(): numpy() may share memory with the tensor.
                array_metric = (
                    metric.detach().cpu().to(torch.float32).numpy().copy()
                )
        elif isinstance(metric, np.ndarray):
            array_metric = np.array(metric, copy=True)
        elif isinstance(metric, (float, int, np.number)):
            array_metric = np.asarray(metric)
        elif isinstance(metric, list):
            # convert lists to numpy arrays; may be object dtype for ragged
            items = deepcopy(metric)
            try:
                array_metric = np.asarray(items)
            except (ValueError, TypeError):
                array_metric = np.array(items, dtype=object)
        else:
            raise TypeError(f"Metric of incorrect type: {type(metric).__name__}")

        array_list = self.get_from_nested_dict(dictio=self.array_tally, path=path)
        if array_list is None:
            self.set_at_path(dictio=self.array_tally, path=path, value=[array_metric])
        else:
            array_list.append(array_metric)

    def add_row_ids(self, crn_ids, rollout_ids, agent_ids=None):
        """
        Append an ordered list of (crn_id, rollout_id[, agent_id]) entries to
        the global sample list.

        Accepts tensors, numpy arrays, or lists; anything else is treated as a
        single id. An argument holding exactly one id is broadcast to the
        longest argument.

        Raises:
            ValueError: If an argument has more than one id but fewer than the
                longest argument (or none at all), since no alignment exists.
        """

        def to_list(x):
            if isinstance(x, torch.Tensor):
                x = x.detach().cpu().tolist()
            elif isinstance(x, np.ndarray):
                x = x.tolist()
            # tolist() of a 0-d tensor/array gives a bare scalar
            return x if isinstance(x, list) else [x]

        columns = {"crn_id": to_list(crn_ids), "rollout_id": to_list(rollout_ids)}
        if agent_ids is not None:
            columns["agent_id"] = to_list(agent_ids)

        n = max(len(values) for values in columns.values())
        for name, values in columns.items():
            if len(values) == n:
                continue
            if len(values) != 1:
                raise ValueError(
                    f"{name} has {len(values)} entries, expected 1 or {n}"
                )
            columns[name] = values * n

        for i in range(n):
            self.sample_row_ids.append(
                {name: values[i] for name, values in columns.items()}
            )

    def save(self, identifier: str, folder: str):
        """
        Pickles the tally to ``<folder>/<identifier>.tally.pkl``.

        The payload is ``{"array_tally": ..., "row_ids": ...}``. Saving is
        best effort: a failure while writing is logged, not raised.

        Args:
            identifier (str): File name prefix.
            folder (str): Directory where the pickle is written (created if missing).
        """
        import logging
        import pickle

        os.makedirs(name=folder, exist_ok=True)
        pkl_path = os.path.join(folder, f"{identifier}.tally.pkl")
        payload = {"array_tally": self.array_tally, "row_ids": self.sample_row_ids}
        try:
            with open(pkl_path, "wb") as f:
                pickle.dump(payload, f, protocol=pickle.HIGHEST_PROTOCOL)
        except Exception:
            # Metrics must never abort training, but a lost file must be visible.
            logging.getLogger(__name__).exception(
                "Failed to save tally to %s", pkl_path
            )
class ContextualizedTokenwiseTally:
    """
    Collect, store, and save token-level metrics per rollout.

    - One DataFrame per rollout_id in `paths`
    - Index = timestep (int)
    - Columns are added incrementally via `add_contexts()` and `add_data()`
    - Cells may contain scalars, strings, or lists (dtype=object)
    """

    def __init__(
        self,
        tokenizer: "AutoTokenizer",
        paths: List[str],
        max_context_length: int = 30,
    ):
        """
        Args:
            tokenizer: HuggingFace tokenizer used to convert tids -> tokens
            paths: rollout identifiers (parallel to batch dimension)
            max_context_length: truncate context token lists to this length
        """
        self.tokenizer = tokenizer
        self.paths = paths
        self.max_context_length = max_context_length
        self.tally: Dict[str, pd.DataFrame] = {path: pd.DataFrame() for path in paths}

        # set later by setters
        self.contexts: torch.Tensor | None = None
        self.action_mask: torch.Tensor | None = None
        self.range: Tuple[int, int] | None = None

    # --------- Utilities ---------

    def tids_to_str(self, tids: List[int]) -> List[str]:
        """Convert a list of token IDs to a list of token strings."""
        return self.tokenizer.convert_ids_to_tokens(tids)

    def _ensure_ready(self):
        assert self.action_mask is not None, "call set_action_mask(mask) first"
        assert self.range is not None, "call set_range((start, end)) first"

    @staticmethod
    def _sanitize_filename(name: Any) -> str:
        """Make a safe filename from any rollout_id."""
        bad = {os.sep, " ", ":", "|", "<", ">", '"', "'"}
        if os.altsep is not None:
            bad.add(os.altsep)
        return str(name).translate(str.maketrans({ch: "_" for ch in bad}))

    @staticmethod
    def _pad_left(seq: List[Any], length: int, pad_val: Any = "") -> List[Any]:
        """Left-pad a sequence to `length` with `pad_val`."""
        if len(seq) >= length:
            return seq[-length:]
        return [pad_val] * (length - len(seq)) + list(seq)

    def _valid_steps(self, row: int) -> torch.Tensor:
        """Return the 1-D tensor of timesteps where the action mask of `row` is set."""
        return torch.nonzero(self.action_mask[row].bool(), as_tuple=False).squeeze(-1)

    def _frame_for_steps(self, rollout_id, idx_list: List[int]) -> pd.DataFrame:
        """Return the rollout's frame with its index extended to cover `idx_list`."""
        df = self.tally.get(rollout_id, pd.DataFrame())
        if df.empty:
            return pd.DataFrame(index=idx_list)
        new_index = sorted(set(df.index.tolist()) | set(idx_list))
        if list(df.index) != new_index:
            df = df.reindex(new_index)
        return df

    # --------- Setters ---------

    def set_action_mask(self, action_mask: torch.Tensor):
        """
        action_mask: (B, S) bool or 0/1 indicating valid steps
        """
        self.action_mask = action_mask

    def set_range(self, range: Tuple[int, int]):
        """
        range: slice (start, end) into self.paths for current batch
        """
        self.range = range

    # --------- Column builders ---------

    def add_contexts(self, contexts: torch.Tensor):
        """
        Add a single 'context' column (list[str]) for valid steps.

        Expects `contexts` with shape (B, S): token id at each timestep.
        For each valid timestep t, we use the last N tokens BEFORE t:
            window = contexts[i, max(0, t-N) : t]
        The list is left-padded with "" to always be length N.
        """
        self._ensure_ready()

        current_paths = self.paths[self.range[0] : self.range[1]]
        B, S = contexts.shape
        N = self.max_context_length

        # to CPU once, outside the loops
        contexts_cpu = contexts.detach().to("cpu")

        for i in range(B):
            rollout_id = current_paths[i]
            valid_idx = self._valid_steps(i)
            if valid_idx.numel() == 0:
                # Still register the rollout so it appears in the tally.
                self.tally[rollout_id] = self.tally.get(rollout_id, pd.DataFrame())
                continue

            idx_list = valid_idx.tolist()
            df = self._frame_for_steps(rollout_id, idx_list)

            ctx_token_lists = []
            for t in idx_list:
                window_ids = contexts_cpu[i, max(0, t - N) : t].tolist()  # tokens BEFORE t
                window_toks = self.tids_to_str([int(x) for x in window_ids])
                ctx_token_lists.append(self._pad_left(window_toks, N))

            if "context" not in df.columns:
                df["context"] = pd.Series(index=df.index, dtype=object)
            df.loc[idx_list, "context"] = pd.Series(
                ctx_token_lists, index=idx_list, dtype=object
            )

            self.tally[rollout_id] = df

    def add_data(
        self,
        metric_id: str,
        metrics: torch.Tensor,
        to_tids: bool = False,
    ):
        """
        Add a metric column for valid steps.

        Args:
            metric_id: column name
            metrics: shape (B, S) for scalars/ids or (B, S, K) for top-k vectors
            to_tids: if True, treat ints/lists of ints as tids and convert to tokens

        Raises:
            ValueError: if `metrics` is neither 2-D nor 3-D.
        """
        self._ensure_ready()
        current_paths = self.paths[self.range[0] : self.range[1]]

        if metrics.dim() not in (2, 3):
            raise ValueError("metrics must be (B, S) or (B, S, K)")
        B = metrics.shape[0]

        def _to_tokish(x):
            if isinstance(x, list):
                return self.tids_to_str([int(v) for v in x])
            return self.tids_to_str([int(x)])[0]

        for i in range(B):
            rollout_id = current_paths[i]
            valid_idx = self._valid_steps(i)
            if valid_idx.numel() == 0:
                self.tally[rollout_id] = self.tally.get(rollout_id, pd.DataFrame())
                continue

            idx_list = valid_idx.detach().cpu().tolist()
            df = self._frame_for_steps(rollout_id, idx_list)

            # The mask may live on another device than the metrics.
            m_valid = metrics[i][valid_idx.to(metrics.device)]

            # -> pure python lists (1D list or list-of-lists)
            values = m_valid.detach().cpu().tolist()
            if to_tids:
                values = [_to_tokish(v) for v in values]

            if len(values) != len(idx_list):
                raise ValueError(
                    f"Length mismatch for '{metric_id}': values={len(values)} vs idx_list={len(idx_list)}"
                )

            # Ensure column exists with object dtype, then assign via aligned Series
            if metric_id not in df.columns:
                df[metric_id] = pd.Series(index=df.index, dtype=object)
            df.loc[idx_list, metric_id] = pd.Series(
                values, index=idx_list, dtype=object
            )
            self.tally[rollout_id] = df

    # --------- Saving ---------

    def save(self, path: str):
        """
        Write a manifest JSON and one CSV per rollout.

        - Manifest includes metadata only (safe to JSON).
        - Each rollout CSV is written with index label 'timestep'.
        - The 'context' column, when present, is written first.
        - Nothing is written when every rollout frame is empty.
        """
        if not self.tally or all(df.empty for df in self.tally.values()):
            return

        os.makedirs(path, exist_ok=True)
        from datetime import datetime

        now = datetime.now()

        manifest = {
            "created_at": f"{now:%Y-%m-%d %H:%M:%S}",
            "max_context_length": self.max_context_length,
            "num_rollouts": len(self.tally),
            "rollouts": [],
        }

        for rid, df in self.tally.items():
            rid_str = str(rid)
            safe_name = self._sanitize_filename(rid_str)
            csv_path = os.path.join(path, f"{safe_name}_tokenwise.csv")

            # 'context' first when it exists; a rollout without it (no valid
            # steps, or add_contexts never called) must not abort the save.
            others = [c for c in df.columns if c != "context"]
            cols = (["context"] if "context" in df.columns else []) + others
            frame = df[cols] if cols else df
            frame.to_csv(csv_path, index=True, index_label="timestep")

            manifest["rollouts"].append(
                {
                    "rollout_id": rid_str,
                    "csv": csv_path,
                    "num_rows": int(df.shape[0]),
                    "columns": cols,
                }
            )

        manifest_path = os.path.join(
            path, f"tokenwise_manifest_{now:%Y-%m-%d___%H-%M-%S}.json"
        )
        with open(manifest_path, "w") as fp:
            json.dump(manifest, fp, indent=2)
chat] + return chat_dicts + + +# TODO: expand / test for different model classes +custom_qwen_template = """ +{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %} +{%- for message in messages %} + {%- if message.content is string %} + {%- set content = message.content %} + {%- else %} + {%- set content = '' %} + {%- endif %} + {%- if (message.role == "user") %} + {{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" %} + {%- set reasoning_content = '' %} + {%- if message.reasoning_content is string %} + {%- set reasoning_content = message.reasoning_content %} + {%- else %} + {%- if '' in content %} + {%- set reasoning_content = content.split('')[0].rstrip('\n').split('')[-1].lstrip('\n') %} + {%- set content = content.split('')[-1].lstrip('\n') %} + {%- endif %} + {%- endif %} + {%- if reasoning_content %} + {{- '<|im_start|>' + message.role + '\n\n' + reasoning_content.strip('\n') + '\n\n\n' + content.lstrip('\n') }} + {%- else %} + {{- '<|im_start|>' + message.role + '\n' + content }} + {%- endif %} + {{- '<|im_end|>\n' }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} + {%- if enable_thinking is defined and enable_thinking is false %} + {{- '\n\n\n\n' }} + {%- endif %} +{%- endif %} +""" + + +def get_qwen_reasoning_limit_tuple( + tokenizer: AutoTokenizer, chat_turn: TrainingChatTurn +) -> ReasoningLimits: + """ """ + encoded = tokenizer.apply_chat_template( + [chat_turn], return_tensors=None, chat_template=custom_qwen_template + ) + if chat_turn.role != "assistant" or chat_turn.reasoning_content is None: + return None + open_reasoning_ids = tokenizer.encode("\n\n", add_special_tokens=False) + close_reasoning_ids = tokenizer.encode("\n\n", add_special_tokens=False) + reasoning_start = find_subsequence(encoded, open_reasoning_ids) + reasoning_end = find_subsequence(encoded, close_reasoning_ids) + len( + 
def process_training_chat(
    tokenizer: AutoTokenizer,
    chat_history: list[TrainingChatTurn],
    use_qwen_reasoning_mask: bool = False,
) -> tuple[
    torch.IntTensor,
    torch.BoolTensor,
    torch.IntTensor,
    torch.BoolTensor,
    list[ReasoningLimits],
]:
    """Tokenize a single training chat and build aligned per-token masks.

    Each turn is passed on its own (as a single-message list) to
    `tokenizer.apply_chat_template`; the per-turn token sequences are then
    concatenated in the original order. No truncation or padding is done.

    Args:
        tokenizer: A Hugging Face tokenizer supporting `apply_chat_template`.
        chat_history: Ordered list of `TrainingChatTurn` forming one dialogue.
        use_qwen_reasoning_mask: If True, tokenize with `custom_qwen_template`
            and collect the reasoning limits of each turn.

    Returns:
        A 5-tuple; the four tensors are 1-D and share the same length N (the
        total number of tokens across all turns):
            - input_ids: token ids of the whole chat
            - action_mask (bool): True for tokens of assistant turns
            - timesteps: the turn's `time_step` repeated for each of its
              tokens (built from a float tensor of ones)
            - state_ends_mask (bool): True on the last token of every turn
              whose `is_state_end` is True
            - reasoning_limit_tuples: `ReasoningLimits` of the turns that have
              them, with offsets shifted to positions in the concatenated
              sequence; empty unless `use_qwen_reasoning_mask` is True
        An empty `chat_history` yields four empty tensors and an empty list.
    """
    chat_template = None
    if use_qwen_reasoning_mask:
        # NOTE(review): relies on the tokenizer exposing `model_type` - confirm.
        assert tokenizer.model_type == "QwenForCausalLM"
        chat_template = custom_qwen_template

    if not chat_history:
        # torch.cat rejects an empty list, so build the empty result directly.
        return (
            torch.empty(0, dtype=torch.long),
            torch.empty(0, dtype=torch.bool),
            torch.empty(0),
            torch.empty(0, dtype=torch.bool),
            [],
        )

    state_ends_mask = []
    input_ids = []
    action_mask = []
    timesteps = []
    reasoning_limit_tuples = []
    token_counter = 0  # number of tokens emitted by the previous turns

    for train_chat_turn in chat_history:
        chat_turn = {
            "role": train_chat_turn.role,
            "content": train_chat_turn.content,
            "reasoning_content": train_chat_turn.reasoning_content,
        }
        chat_turn_ids = tokenizer.apply_chat_template(
            [chat_turn], return_tensors="pt", chat_template=chat_template
        ).flatten()
        nb_chat_turns_ids = chat_turn_ids.numel()

        turn_ends = torch.zeros(nb_chat_turns_ids, dtype=torch.bool)
        if train_chat_turn.is_state_end and nb_chat_turns_ids > 0:
            turn_ends[-1] = True  # last token is state end
        state_ends_mask.append(turn_ends)

        if use_qwen_reasoning_mask:
            reasoning_limit_tuple = get_qwen_reasoning_limit_tuple(
                tokenizer, train_chat_turn
            )
            # The helper returns None for turns without reasoning (e.g. every
            # non-assistant turn); asserting on it made any ordinary chat
            # fail, so such turns are skipped.
            # NOTE(review): assumes consumers use the absolute offsets and do
            # not expect one entry per turn - confirm.
            if reasoning_limit_tuple is not None:
                reasoning_limit_tuple.reasoning_start += token_counter
                reasoning_limit_tuple.reasoning_end += token_counter
                reasoning_limit_tuple.content_end += token_counter
                reasoning_limit_tuples.append(reasoning_limit_tuple)

        input_ids.append(chat_turn_ids)
        action_mask.append(
            torch.full(
                (nb_chat_turns_ids,),
                train_chat_turn.role == "assistant",
                dtype=torch.bool,
            )
        )
        timesteps.append(torch.ones(nb_chat_turns_ids) * train_chat_turn.time_step)
        token_counter += nb_chat_turns_ids

    return (
        torch.cat(input_ids),
        torch.cat(action_mask),
        torch.cat(timesteps),
        torch.cat(state_ends_mask),
        reasoning_limit_tuples,
    )
@dataclass
class TrainerAnnealingState:
    """
    Small picklable record of how far annealing schedules have progressed.

    The trainer loads this object from ``trainer_annealing_state.pkl`` when
    that file exists, feeds ``annealing_step_counter`` to the GAE-lambda
    annealing schedule, and increments it once per advantage computation.
    """

    # Number of advantage computations performed so far (schedule position).
    annealing_step_counter: int = 0
+ tokenizer (AutoTokenizer): Tokenizer for the model. + optimizer (torch.optim.Optimizer): Optimizer for the policy model. + lr_scheduler (torch.optim.lr_scheduler.LRScheduler): Learning rate scheduler for the policy model. + critic (AutoModelForCausalLM or None): Critic model for value estimation (optional). + critic_optimizer (torch.optim.Optimizer or None): Optimizer for the critic model (optional). + critic_lr_scheduler (torch.optim.lr_scheduler.LRScheduler or None): LR scheduler for the critic (optional). + config (RtConfig): Configuration object for training. + """ + self.tokenizer = tokenizer + # self.tokenizer.padding_side = "left" # needed for flash attention + if self.tokenizer.pad_token_id is None: + self.tokenizer.pad_token_id = self.tokenizer.eos_token_id + self.lr_scheduler = lr_scheduler + self.accelerator = Accelerator() + ( + self.policy, + self.policy_optimizer, + self.critic, + self.critic_optimizer, + ) = self.accelerator.prepare(policy, policy_optimizer, critic, critic_optimizer) + + self.critic_lr_scheduler = critic_lr_scheduler + self.tally = Tally() + + if use_gradient_checkpointing == True: + self.policy.gradient_checkpointing_enable(dict(use_reentrant=False)) + if critic is not None: + self.critic.gradient_checkpointing_enable(dict(use_reentrant=False)) + + self.save_path = save_path + + # Load trainer state if it exists + self.trainer_annealing_state_path = os.path.join( + self.save_path, "trainer_annealing_state.pkl" + ) + if os.path.exists(self.trainer_annealing_state_path): + logger.info( + f"Loading trainer state from {self.trainer_annealing_state_path}" + ) + self.trainer_annealing_state = pickle.load( + open(self.trainer_annealing_state_path, "rb") + ) + else: + self.trainer_annealing_state = TrainerAnnealingState() + + # Load policy optimizer state if it exists + self.policy_optimizer_path = os.path.join( + self.save_path, "policy_optimizer_state.pt" + ) + if os.path.exists(self.policy_optimizer_path): + logger.info( + f"Loading 
def mask_non_restricted_token_logits(self, logits: torch.Tensor) -> torch.Tensor:
    """
    Restrict logits to the whitelisted tokens plus EOS.

    When ``self.restrict_tokens`` is set, every vocabulary entry other than
    the first token id of each restricted string and the tokenizer's EOS id
    is set to ``-inf``, which removes it from any subsequent softmax.

    Args:
        logits (torch.Tensor): Logits whose last dimension indexes the
            vocabulary, e.g. shape (B, S, V).

    Returns:
        torch.Tensor: ``logits`` itself when no restriction is configured,
        otherwise a new tensor of the same shape and dtype with the
        disallowed entries replaced by ``-inf``.

    Raises:
        ValueError: If a restricted token string tokenizes to no ids.
    """
    if self.restrict_tokens is None:
        return logits

    allowed_token_ids = []
    for token in self.restrict_tokens:
        token_ids = self.tokenizer(token, add_special_tokens=False)["input_ids"]
        if len(token_ids) == 0:
            raise ValueError(
                f"Restricted token {token!r} does not map to any token id."
            )
        # Only the first sub-token of each restricted string is whitelisted.
        allowed_token_ids.append(token_ids[0])
    # EOS should always stay active.
    allowed_token_ids.append(self.tokenizer.eos_token_id)

    # A vocabulary-sized mask broadcasts over the leading dimensions, which
    # avoids materialising a full (B, S, V) mask on every call.
    blocked = torch.ones(logits.shape[-1], dtype=torch.bool, device=logits.device)
    blocked[
        torch.tensor(allowed_token_ids, dtype=torch.long, device=logits.device)
    ] = False

    # masked_fill is out-of-place: gradients still flow to the kept entries.
    return logits.masked_fill(blocked, float("-inf"))
+ Handles batching, loss computation (including entropy and KL regularization), gradient accumulation, and optimizer step. + Optionally logs various metrics and statistics. + + Args: + paths (list[str]): List of game complete file paths for each rollout. + contexts (list[torch.Tensor]): List of context tensors for each rollout. + credits (list[torch.Tensor]): List of credit tensors (rewards/advantages) for each rollout. + action_masks (list[torch.Tensor]): List of action mask tensors for each rollout. + """ + with resource_logger_context(logger, "Apply reinforce step"): + self.policy.train() + mb_size = self.mini_batch_size + nb_rollouts = len(training_batch) + self.tally.add_metric(path=["nb_rollouts"], metric=nb_rollouts) + + # Get total number of tokens generated + total_tokens_generated = 0 + for att_mask in training_batch.batch_action_mask: + total_tokens_generated += att_mask.sum() + + # Obtain loss normalization + if self.pg_loss_normalization == "nb_tokens": + normalization_factor = total_tokens_generated + elif self.pg_loss_normalization == "batch": + normalization_factor = np.ceil(nb_rollouts / mb_size).astype(int) + else: + raise ValueError( + f"Invalid pg_loss_normalization: {self.pg_loss_normalization}" + ) + + # Gradient accumulation for each mini-batch + for mb in range(0, nb_rollouts, mb_size): + loss = 0.0 + training_mb = training_batch[mb : mb + mb_size] + training_mb = training_mb.get_padded_tensors() + training_mb.to(self.device) + tokens_mb, action_mask_mb, credits_mb, causal_reasoning_mask_mb = ( + training_mb.batch_input_ids, + training_mb.batch_action_mask, + training_mb.batch_credits, + training_mb.batch_causal_reasoning_mask, + ) + + # Next token prediction + contexts_mb = tokens_mb[:, :-1] + causal_reasoning_mask_mb = causal_reasoning_mask_mb[ + :, :-1, :-1 + ] # TODO: verify + shifted_contexts_mb = tokens_mb[:, 1:] + action_mask_mb = action_mask_mb[:, 1:] + credits_mb = credits_mb[:, 1:] + + if self.enable_tokenwise_logging: + 
self.tokenwise_tally.set_action_mask(action_mask=action_mask_mb) + self.tokenwise_tally.set_range(range=(mb, mb + mb_size)) + self.tokenwise_tally.add_contexts(contexts=contexts_mb) + self.tokenwise_tally.add_data( + metric_id="next_token", + metrics=shifted_contexts_mb, + to_tids=True, + ) + + if self.enable_tokenwise_logging: + self.tokenwise_tally.add_data( + metric_id="next_token_credit", metrics=credits_mb + ) + + # Forward pass + cast to FP-32 for higher prec. + if self.use_qwen_reasoning_mask: + try: + causal_reasoning_mask_mb = causal_reasoning_mask_mb.unsqueeze( + 1 + ) # (B, 1, S, S) -- to broadcast over attention heads + logits = self.policy( + input_ids=contexts_mb, + attention_mask=causal_reasoning_mask_mb, + )[ + 0 + ] # (B, S, V) + except Exception as e: + print( + f"Attn implementation not supported with reasoning mask. Should be 'eager' or 'flex_attention'." + ) + raise e + else: + logits = self.policy(input_ids=contexts_mb)[0] # (B, S, V) + + # Mask non-restricted tokens + if self.restrict_tokens is not None: + logits = self.mask_non_restricted_token_logits(logits) + + logits /= self.temperature # (B, S, V) + + # Compute new log probabilities + log_probs = F.log_softmax(logits, dim=-1) # (B, S, V) + + # Get log probabilities of actions taken during rollouts + action_log_probs = log_probs.gather( + dim=-1, index=shifted_contexts_mb.unsqueeze(-1) + ).squeeze( + -1 + ) # (B, S) + + if self.enable_tokenwise_logging: + self.tokenwise_tally.add_data( + metric_id="next_token_log_prob", + metrics=action_log_probs, + ) + self.tokenwise_tally.add_data( + metric_id="next_token_prob", + metrics=torch.exp(action_log_probs), + ) + top_k_indices = torch.topk(logits, k=5, dim=-1).indices + self.tokenwise_tally.add_data( + metric_id=f"top_{5}_tids", + metrics=top_k_indices, + to_tids=True, + ) + self.tokenwise_tally.add_data( + metric_id=f"top_{5}_probs", + metrics=torch.exp(log_probs).gather( + dim=-1, index=top_k_indices + ), + ) + + rewarded_action_log_probs = 
( + action_mask_mb * credits_mb * action_log_probs + ) + # (B, S) + + if self.enable_tokenwise_logging: + self.tokenwise_tally.add_data( + metric_id="next_token_clogπ", + metrics=rewarded_action_log_probs, + ) + + # Add value term to loss + if self.pg_loss_normalization == "batch": + nb_act_tokens = action_mask_mb.sum() + mb_value = -rewarded_action_log_probs.sum() / nb_act_tokens + else: + mb_value = -rewarded_action_log_probs.sum() + + # if self.enable_tokenwise_logging: + # self.tally.add_metric( + # path=["gradient_term_magnitudes", "value"], + # metric=self.get_gradient_magnitude(loss_term=mb_value), + # ) + loss += mb_value + self.tally.add_metric( + path=["loss_mb_total", "value_mb_total"], metric=mb_value.item() + ) + # ------------------------------------------------- + # Entropy Regularization + # ------------------------------------------------- + if self.entropy_coeff != 0.0: + token_entropy_terms = -F.softmax(logits, dim=-1) * F.log_softmax( + logits, dim=-1 + ) + # (B, S, T) + # We only take the entropy of actions + token_entropy_terms *= action_mask_mb[:, :, None] + mb_entropy = token_entropy_terms.sum(dim=-1) + if self.enable_tokenwise_logging: + self.tally.add_contextualized_token_metrics( + metric_id="entropy", + metrics=mb_entropy, + ) + + if self.pg_loss_normalization == "batch": + nb_act_tokens = action_mask_mb.sum() + mb_entropy = -mb_entropy.sum() / nb_act_tokens + else: + mb_entropy = -mb_entropy.sum() + + mb_entropy *= self.entropy_coeff + self.tally.add_metric( + path=["loss_mb_total", "entropy_mb_total"], + metric=mb_entropy.item(), + ) + + # if self.enable_tokenwise_logging: + # self.tally.add_metric( + # path=["gradient_term_magnitudes", "entropy"], + # metric=self.get_gradient_magnitude(loss_term=mb_entropy), + # ) + loss += mb_entropy + + # ------------------------------------------------- + # KL-DIVERGENCE + # ------------------------------------------------- + if self.kl_coeff != 0.0: + with torch.no_grad(): + with 
self.policy.disable_adapter(): + ref_model_logits = self.policy( + input_ids=contexts_mb, # attention_mask=attention_mask + )[0] + ref_model_logits = ref_model_logits / self.temperature + # (B, S, V) + ref_model_logits = self.mask_non_restricted_token_logits( + logits=ref_model_logits + ) + # (B, S, V) + ref_model_log_probs = F.log_softmax(ref_model_logits, dim=-1) + # (B, S, V) + ref_model_action_log_probs = ref_model_log_probs.gather( + dim=-1, index=shifted_contexts_mb.unsqueeze(-1) + ).squeeze( + -1 + ) # (B,S) + # Approximating KL Divergence (see refs in docstring) + # Ref 1: http://joschu.net/blog/kl-approx.html + # Ref 2: https://github.dev/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py#L1332 + kl_div = ( + torch.exp(ref_model_action_log_probs - action_log_probs) + - (ref_model_action_log_probs - action_log_probs) + - 1 + ) + + if self.enable_tokenwise_logging: + self.tally.add_contextualized_token_metrics( + metric_id="kl", + metrics=kl_div, + ) + + # We only care about KLD of action tokens + kl_div *= action_mask_mb + mb_kl = kl_div.sum() + + mb_kl *= self.kl_coeff + + self.tally.add_metric( + path=["mb_kl_loss_terms"], metric=mb_kl.item() + ) + + if self.enable_tokenwise_logging: + self.tally.add_metric( + path=["gradient_term_magnitudes", "kl"], + metric=self.get_gradient_magnitude(loss_term=mb_kl), + ) + + loss += mb_kl + + # Accumulate gradient + loss /= normalization_factor + self.accelerator.backward(loss) + + # ensure gpu memory is freed + del training_mb + del log_probs + del logits + del loss + del action_log_probs + del rewarded_action_log_probs + + logger.info( + f"Accumulated the policy gradient loss for {total_tokens_generated} tokens." 
def get_advantages_with_critic_gradient_accumulation(
    self, trajectories: TrajectoryBatch, critic_loss_scaling_factor: float = 2.0
) -> list[torch.Tensor]:
    """
    Compute per-rollout advantages, accumulating critic gradients if GAE is on.

    With ``self.use_gae`` the critic is run mini-batch by mini-batch, GAE
    advantages are computed from its (detached) value estimates, and a Huber
    loss against ``advantage + value`` is back-propagated. The critic
    optimizer is NOT stepped here; only gradients are accumulated.
    Without GAE, discounted Monte Carlo returns are used, optionally with an
    RLOO baseline (applied per ``crn_id`` group when ids repeat).
    Afterwards the advantages are optionally whitened (time-step-wise and/or
    globally) and the annealing step counter is incremented.

    Args:
        trajectories: Batch of trajectories providing ``rollout_ids``,
            ``crn_ids``, ``batch_rewards`` and critic inputs.
        critic_loss_scaling_factor: Extra divisor applied to the critic loss
            on top of the number of mini-batches (original author's note:
            a single critic is learned for two agents).

    Returns:
        list[torch.Tensor]: One 1-D advantage tensor per rollout, each with
        that rollout's own number of time steps (jagged).

    Raises:
        ValueError: If padded value estimates and rewards disagree in length.
    """
    mb_size = self.mini_batch_size
    batch_size = trajectories.rollout_ids.shape[0]
    batch_rewards = trajectories.batch_rewards
    self.tally.add_metric(path=["batch_rewards"], metric=batch_rewards)

    ######################################
    # use critic for advantage estimation
    ######################################
    if self.use_gae:
        self.critic.train()
        advantages = []
        normalization_factor = (
            np.ceil(batch_size / mb_size).astype(int) * critic_loss_scaling_factor
        )
        for mb in range(0, batch_size, mb_size):
            trajectory_mb = trajectories[mb : mb + mb_size]
            trajectory_mb.to(self.device)
            rewards_mb = trajectory_mb.batch_rewards
            # The causal reasoning mask is returned but not consumed by the
            # critic forward pass below.
            (
                tokens_mb,
                state_ends_mask_mb,
                timestep_counts,
                _causal_reasoning_mask_mb,
            ) = trajectory_mb.get_padded_tensors_for_critic()

            vals_estimate_full = self.critic(tokens_mb)

            # Keep only the positions where a state ends, per sample.
            B = tokens_mb.shape[0]
            vals_list = [
                vals_estimate_full[b][state_ends_mask_mb[b]] for b in range(B)
            ]

            # Pad the jagged per-sample values/rewards to (B, max_jT).
            vals_estimate_mb = pad_sequence(
                vals_list, batch_first=True, padding_value=0.0
            )
            dtype = vals_estimate_mb.dtype
            rewards_mb = pad_sequence(
                rewards_mb, batch_first=True, padding_value=0.0
            ).to(dtype=dtype)

            det_vals_estimate_mb = vals_estimate_mb.detach()  # (B, max_jT)
            self.tally.add_metric(
                path=["mb_value_estimates_critic"], metric=det_vals_estimate_mb
            )

            if det_vals_estimate_mb.shape[1] != rewards_mb.shape[1]:
                raise ValueError(
                    "Incompatible shapes for value estimates and rewards."
                )
            # Append a 0 value after the last state -> (B, max_jT + 1).
            det_vals_estimate_mb = torch.cat(
                [
                    det_vals_estimate_mb,
                    det_vals_estimate_mb.new_zeros(
                        (det_vals_estimate_mb.shape[0], 1)
                    ),
                ],
                dim=1,
            )

            # Get (possibly annealed) lambda
            if self.use_gae_lambda_annealing:
                annealing_constant = self.gae_lambda_annealing_method(
                    step=self.trainer_annealing_state.annealing_step_counter
                )
                annealed_lambda = (
                    self.gae_lambda_annealing_limit * annealing_constant
                )
                self.tally.add_metric(
                    path=["annealed_lambda"], metric=annealed_lambda
                )
            else:
                annealed_lambda = self.gae_lambda_annealing_limit

            gae_advantages = get_generalized_advantage_estimates(
                rewards=rewards_mb,
                value_estimates=det_vals_estimate_mb,
                discount_factor=self.discount_factor,
                lambda_coef=annealed_lambda,
            )  # (B, max_jT)
            self.tally.add_metric(path=["mb_gae_advantages"], metric=gae_advantages)

            # A(s, a, b) + V(s) = Q(s, a, b)
            targets = (
                gae_advantages.to(dtype=dtype) + det_vals_estimate_mb[:, :-1]
            )  # (B, max_jT)
            self.tally.add_metric(path=["mb_targets_critic"], metric=targets)

            loss = F.huber_loss(
                input=vals_estimate_mb,
                target=targets,
            )
            self.tally.add_metric(path=["mb_critic_loss"], metric=loss.item())
            # Accumulate gradient
            loss /= normalization_factor
            self.accelerator.backward(loss)

            # Get jagged advantages back using timestep_counts
            advantages.extend(
                [gae_advantages[i, : timestep_counts[i]] for i in range(B)]
            )

    ######################################
    # use exclusively Monte Carlo returns & rloo for advantage estimation
    ######################################
    else:
        lengths = [len(c) for c in batch_rewards]
        padded_rewards = pad_sequence(
            batch_rewards, batch_first=True, padding_value=0.0
        )
        padded_advantages = get_discounted_returns(
            rewards=padded_rewards,
            discount_factor=self.discount_factor,
            reward_normalizing_constant=self.reward_normalizing_constant,
            tally=self.tally,
        )  # no baseline for now
        if self.use_rloo:
            # Repeated crn ids mean rollouts are grouped by common random
            # numbers; the baseline is then computed within each group.
            is_grouped_by_rng = (
                trajectories.crn_ids.unique().shape[0]
                != trajectories.crn_ids.shape[0]
            )
            if is_grouped_by_rng:
                for crn_id in trajectories.crn_ids.unique():
                    rng_mask = trajectories.crn_ids == crn_id
                    rng_advantages = padded_advantages[rng_mask]
                    rng_advantages, _ = get_rloo_credits(
                        credits=rng_advantages, tally=self.tally
                    )
                    padded_advantages[rng_mask] = rng_advantages
            else:
                padded_advantages, _ = get_rloo_credits(
                    credits=padded_advantages, tally=self.tally
                )
        advantages = [
            padded_advantages[i, : lengths[i]]
            for i in range(padded_advantages.shape[0])
        ]

    if self.whiten_advantages_time_step_wise:
        lengths = [len(c) for c in advantages]
        padded_advantages = pad_sequence(
            advantages, batch_first=True, padding_value=0.0
        )
        whitened_padded_advantages = whiten_advantages_time_step_wise(
            padded_advantages, tally=self.tally
        )
        advantages = [
            whitened_padded_advantages[i, : lengths[i]].flatten()
            for i in range(whitened_padded_advantages.shape[0])
        ]

    if self.whiten_advantages:
        lengths = [len(c) for c in advantages]
        # Concatenate rather than stack: rollouts may differ in length, and
        # stacking jagged tensors raises.
        whitened_advantages = whiten_advantages(
            torch.cat([a.flatten() for a in advantages]), tally=self.tally
        )
        # torch.split returns a tuple; callers update entries in place, so
        # hand back a list like every other code path.
        advantages = list(torch.split(whitened_advantages, lengths))

    self.trainer_annealing_state.annealing_step_counter += 1

    return advantages
def train(self) -> None:
    """
    Finish the critic update (if a critic optimizer exists), then run the
    REINFORCE step on the prepared policy-gradient batch.

    Requires ``self.policy_gradient_data`` to have been set beforehand.
    """
    assert self.policy_gradient_data is not None, "Policy gradient data is not set"

    critic_opt = self.critic_optimizer
    if critic_opt is not None:
        max_norm = self.gradient_clipping
        if max_norm is not None:
            critic_grad_norm = self.accelerator.clip_grad_norm_(
                self.critic.parameters(), max_norm
            )
            self.tally.add_metric(
                path=["gradient_norm_critic"], metric=critic_grad_norm.item()
            )

        # Apply the accumulated critic gradients, then reset them.
        critic_opt.step()
        critic_opt.zero_grad()

        # Release memory before the (larger) policy update.
        self.accelerator.clear(self.critic, critic_opt)
        import gc

        gc.collect()
        torch.cuda.empty_cache()

    self.apply_reinforce_step(training_batch=self.policy_gradient_data)
def export_trainer_annealing_state(self) -> None:
    """
    Pickle the trainer annealing state to ``self.trainer_annealing_state_path``.

    The save directory is created first (as ``export_optimizer_states``
    does), so this method also works when called on its own before any
    other export has created ``self.save_path``.
    """
    os.makedirs(self.save_path, exist_ok=True)
    with open(self.trainer_annealing_state_path, "wb") as f:
        pickle.dump(self.trainer_annealing_state, f)
    logger.info(f"Saved trainer state to {self.trainer_annealing_state_path}")
class TrainerNaive(BaseTrainer):
    """
    Trainer in which every agent is credited with its own advantages only.
    """

    def set_agent_trajectory_data(
        self, agent_id: str, roots: list[RolloutTreeRootNode]
    ) -> None:
        """
        Tokenize one agent's rollouts, compute its advantages and store them.

        For every rollout tree the agent's main chat is tokenized, the
        resulting tensors are gathered into a ``TrajectoryBatch``, advantages
        are computed (accumulating critic gradients when GAE is enabled) and,
        unless ``skip_discounted_state_visitation`` is set, discounted by
        state visitation. The batch is stored in
        ``self.training_data[agent_id]``.

        Args:
            agent_id: Identifier of the agent whose data is extracted.
            roots: Rollout trees to extract the agent's chats and rewards from.
        """
        # TODO: append to current batch data instead, else we will only train for one agent!
        self.policy_gradient_data = None

        # Tensorize chats
        rollout_ids = []
        crn_ids = []  # common random number id
        batch_input_ids = []
        batch_action_mask = []
        batch_timesteps = []
        batch_state_ends_mask = []
        batch_reasoning_limits = []
        batch_rewards = []
        for root in roots:
            rollout_id = root.id
            self.debug_path_list.append(
                "mgid:" + str(rollout_id) + "_agent_id:" + agent_id
            )
            rollout_ids.append(rollout_id)
            crn_ids.append(root.crn_id)
            chat, rewards = get_main_chat_list_and_rewards(agent_id=agent_id, root=root)
            (
                input_ids,
                action_mask,
                timesteps,
                state_ends_mask,
                reasoning_limit_tuples,
            ) = process_training_chat(
                tokenizer=self.tokenizer,
                chat_history=chat,
                use_qwen_reasoning_mask=self.use_qwen_reasoning_mask,
            )
            batch_input_ids.append(input_ids)
            batch_action_mask.append(action_mask)
            batch_timesteps.append(timesteps)
            batch_state_ends_mask.append(state_ends_mask)
            batch_rewards.append(rewards)
            batch_reasoning_limits.append(reasoning_limit_tuples)

        trajectory_batch = TrajectoryBatch(
            rollout_ids=torch.tensor(rollout_ids, dtype=torch.int32),
            crn_ids=torch.tensor(crn_ids, dtype=torch.int32),
            agent_ids=[agent_id] * len(rollout_ids),
            batch_input_ids=batch_input_ids,
            batch_action_mask=batch_action_mask,
            batch_timesteps=batch_timesteps,
            batch_state_ends_mask=batch_state_ends_mask,
            batch_rewards=batch_rewards,
            batch_reasoning_limits=batch_reasoning_limits,
        )

        # Get advantages. Normalise to a list: the whitening path of the
        # advantage routine can yield a tuple (torch.split), which would not
        # support per-rollout replacement.
        batch_advantages = list(
            self.get_advantages_with_critic_gradient_accumulation(trajectory_batch)
        )
        # NOTE(review): the critic is stepped here, once per agent and without
        # gradient clipping, while BaseTrainer.train() clips and steps the
        # critic again afterwards -- confirm which of the two is intended.
        if self.critic_optimizer is not None:
            self.critic_optimizer.step()
            self.critic_optimizer.zero_grad()

        # Discount state visitation (the mathematically correct way)
        if not self.skip_discounted_state_visitation:
            batch_advantages = [
                get_discounted_state_visitation_credits(
                    advantage.unsqueeze(0),
                    self.discount_factor,
                ).squeeze(0)
                for advantage in batch_advantages
            ]

        # Store the final credits explicitly rather than relying on the list
        # being aliased and mutated after assignment.
        trajectory_batch.batch_credits = batch_advantages
        self.training_data[agent_id] = trajectory_batch

    def receive_advantage_data(self, advantage_packets: list[AdvantagePacket]) -> None:
        """
        This trainer ignores the advantages of the other trainers.
        """
        pass

    def share_advantage_data(self) -> list[AdvantagePacket]:
        """
        Share the advantage data with other agents.

        Returns:
            list[AdvantagePacket]: One packet per agent in
            ``self.training_data``, carrying that agent's rollout ids and
            credits.
        """
        logger.info("Sharing advantage data.")
        return [
            AdvantagePacket(
                agent_id=agent_id,
                rollout_ids=agent_data.rollout_ids,
                main_advantages=agent_data.batch_credits,
            )
            for agent_id, agent_data in self.training_data.items()
        ]
process_training_chat +from mllm.training.trainer_common import BaseTrainer +from mllm.training.trainer_independent import TrainerNaive +from mllm.training.training_data_utils import * +from mllm.training.training_data_utils import ( + AdvantagePacket, + TrainingBatch, + TrajectoryBatch, + get_tokenwise_credits, +) +from mllm.utils.resource_context import resource_logger_context + +logger = logging.getLogger(__name__) +logger.addHandler(logging.StreamHandler(sys.stdout)) + + +class TrainerSumRewards(TrainerNaive): + def receive_advantage_data(self, advantage_packets: list[AdvantagePacket]): + """ + Sums the advantages of the other trainers + """ + logger.info(f"Receiving advantage packets.") + + assert ( + 2 >= len(advantage_packets) > 0 + ), "At least one advantage packet must be provided." + + for agent_id, agent_data in self.training_data.items(): + for co_agent_packet in advantage_packets: + co_agent_id = co_agent_packet.agent_id + if agent_id == co_agent_id: + continue + agent_rollout_ids = agent_data.rollout_ids + agent_advantages = agent_data.batch_credits + co_agent_advantages = co_agent_packet.main_advantages + co_agent_rollout_ids = co_agent_packet.rollout_ids + B = len(agent_advantages) + # Get co-agent advantages in the right order + permutation = [] + for id in agent_rollout_ids: + permutation.append( + torch.where(id == co_agent_rollout_ids)[0].item() + ) + co_agent_advantages = [co_agent_advantages[i] for i in permutation] + assert all( + a.shape[0] == b.shape[0] + for a, b in zip(co_agent_advantages, agent_advantages) + ), "Number of advantages must match in order to sum them up." 
def get_causal_reasoning_mask(
    shape: Tuple[int, int],
    reasoning_limits: "list[list[ReasoningLimits]]",
    device: "torch.device | str | None" = None,
) -> torch.Tensor:
    """
    Build a causal attention mask that hides reasoning tokens outside their block.

    Entry ``[b, i, j]`` of the result is True when token ``i`` of sample ``b``
    may attend to token ``j``. The mask starts as plain causal attention and,
    for every reasoning block of a sample:

    * the columns ``reasoning_start:reasoning_end`` are hidden from every token;
    * they are made visible again to the rows ``reasoning_start:content_end``,
      i.e. to the reasoning tokens themselves and to the content that follows.

    Args:
        shape: ``(B, S)`` batch size and (padded) sequence length.
        reasoning_limits: For each of the ``B`` samples, an iterable of objects
            exposing ``reasoning_start``, ``reasoning_end`` and ``content_end``
            token indices.
        device: Device on which the mask is created. When omitted, ``"cuda"``
            is used if available (the historical behaviour) and ``"cpu"``
            otherwise, so the function no longer crashes on CPU-only hosts.

    Returns:
        Boolean tensor of shape ``(B, S, S)``.
    """
    assert len(shape) == 2, "Must have batch and sequence dimensions"
    B, S = shape
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Start fully visible; causality is enforced once at the end because the
    # block-wise writes below also touch entries above the diagonal.
    mask = torch.ones((B, S, S), dtype=torch.bool, device=device)
    for b in range(B):
        for block in reasoning_limits[b]:
            start, end = block.reasoning_start, block.reasoning_end
            # Hide the reasoning tokens from every token...
            mask[b, :, start:end] = False
            # ...except from the block itself and its associated content.
            mask[b, start : block.content_end, start:end] = True
    return torch.tril(mask, diagonal=0)
def get_tokenwise_credits(
    # B := batch size, S := number of tokens / seq. length, T := number of states. `j` stands for jagged (see pytorch nested tensors.)
    batch_timesteps: "list[torch.Tensor] | torch.Tensor",  # (B, jS),
    batch_credits: "list[torch.Tensor] | torch.Tensor",  # (B, jT)
) -> "list[torch.Tensor]":  # (B, jS)
    """
    Broadcast per-timestep credits onto the tokens of each sample.

    For every sample, token ``s`` receives ``credits[timesteps[s]]``. Tokens
    whose timestep has no matching credit (negative, or ``>= len(credits)``)
    are left at zero.

    Args:
        batch_timesteps: Per sample, the timestep index of every token.
        batch_credits: Per sample, one credit per timestep.

    Returns:
        A list with one tensor per sample, shaped like that sample's
        timesteps, with the dtype of its credits and the device of its
        timesteps.
    """
    batch_token_credits = []
    for credits, timesteps in zip(batch_credits, batch_timesteps):
        token_credits = torch.zeros_like(
            timesteps,
            dtype=credits.dtype,
            device=timesteps.device,
        )
        step_index = timesteps.long()
        # Only gather where a credit exists, so out-of-range timesteps keep
        # their zero credit instead of raising an indexing error.
        has_credit = (step_index >= 0) & (step_index < credits.shape[0])
        token_credits[has_credit] = credits.to(timesteps.device)[
            step_index[has_credit]
        ]
        batch_token_credits.append(token_credits)
    return batch_token_credits
+ rollout_ids: torch.IntTensor # (B,) + crn_ids: torch.IntTensor # (B,) + agent_ids: list[str] # (B,) + batch_input_ids: list[torch.LongTensor] # List[(jS,)] + batch_reasoning_limits: ReasoningLimits # List[(jS,)] + batch_action_mask: list[torch.BoolTensor] # List[(jS,)] + batch_timesteps: list[torch.IntTensor] # List[(jS,)] + batch_state_ends_mask: list[torch.BoolTensor] # List[(jS,)] + batch_rewards: list[torch.FloatTensor] # List[(jT,)] + batch_credits: Optional[list[torch.FloatTensor]] = None # List[(jS,)] + + def __post_init__(self): + """ + Validate per-sample consistency. + """ + B = self.rollout_ids.shape[0] + assert ( + self.crn_ids.shape[0] == B + ), "RNG IDs must have length equal to batch size." + assert ( + len(self.agent_ids) == B + ), "agent_ids must have length equal to batch size." + assert ( + len(self.batch_input_ids) + == len(self.batch_reasoning_limits) + == len(self.batch_action_mask) + == len(self.batch_timesteps) + == len(self.batch_state_ends_mask) + == len(self.batch_rewards) + == B + ), "Jagged lists must all have length equal to batch size." + + for b in range(B): + nb_rewards = int(self.batch_rewards[b].shape[0]) + nb_timesteps = int(torch.max(self.batch_timesteps[b]).item()) + 1 + assert ( + nb_rewards == nb_timesteps + ), "Number of rewards and timesteps mismatch." + assert ( + self.batch_input_ids[b].shape[0] + == self.batch_action_mask[b].shape[0] + == self.batch_timesteps[b].shape[0] + ), "Tensors must have the same shape along the jagged dimension." + assert ( + int(self.batch_state_ends_mask[b].sum()) + == self.batch_rewards[b].shape[0] + ), "Number of rewards must match number of state ends." + + """ + Entries: + Here, we ignore the batch dimension. + input_ids: + All of the tokens of both the user and the assistant, flattened. + action_mask: + Set to true on the tokens of the assistant (tokens generated by the model). + timesteps: + Therefore, max(timesteps) = Ns - 1. 
+ state_ends_idx: + Indices of the tokens at which state descriptions end. + rewards: + rewards[t] := R_t(s_t, a_t) + Example: + position: "0 1 2 3 4 5 6 7 8 9 10 11 12 13 14" + input_ids: "U U U a a a U a U a a a U U U" (U := User, a := Assistant) + action_mask: "x x x ✓ ✓ ✓ x ✓ x ✓ ✓ ✓ x x x" + timestep: "0 0 0 0 0 0 1 1 1 1 1 1 2 2 2" + state_ends_dx: [2, 6, 14] + rewards: [r0, r1, r2] + """ + + def __getitem__(self, key) -> "TrajectoryBatch": + if isinstance(key, slice): + return TrajectoryBatch( + rollout_ids=self.rollout_ids.__getitem__(key), + crn_ids=self.crn_ids.__getitem__(key), + agent_ids=self.agent_ids[key], + batch_input_ids=self.batch_input_ids[key], + batch_reasoning_limits=self.batch_reasoning_limits[key], + batch_action_mask=self.batch_action_mask[key], + batch_timesteps=self.batch_timesteps[key], + batch_state_ends_mask=self.batch_state_ends_mask[key], + batch_rewards=self.batch_rewards[key], + ) + + def __len__(self): + return len(self.batch_input_ids) + + def to(self, device): + self.rollout_ids = self.rollout_ids.to(device) + self.crn_ids = self.crn_ids.to(device) + self.batch_input_ids = [t.to(device) for t in self.batch_input_ids] + self.batch_action_mask = [t.to(device) for t in self.batch_action_mask] + self.batch_timesteps = [t.to(device) for t in self.batch_timesteps] + self.batch_state_ends_mask = [t.to(device) for t in self.batch_state_ends_mask] + self.batch_rewards = [t.to(device) for t in self.batch_rewards] + + def get_padded_tensors_for_critic(self): + """ + Returns: + padded_batch_input_ids: (B, P) + padded_batch_state_ends_mask: (B, P) + timestep_counts: (B,) tensor of ints indicating number of states per sample + causal_reasoning_mask: (B, P, P) + """ + padded_batch_input_ids = pad_sequence( + self.batch_input_ids, batch_first=True, padding_value=0 + ) + padded_batch_state_ends_mask = pad_sequence( + self.batch_state_ends_mask, batch_first=True, padding_value=0 + ).bool() + # number of states equals number of True in 
@dataclass
class TrainingBatch:
    """
    Jagged (list-of-tensors) batch of tokenized samples with per-token credits.

    Notation: B := batch size, S := number of tokens of a sample (jagged),
    Tb := number of thinking blocks of a sample (jagged).
    """

    rollout_ids: torch.IntTensor  # (B,)
    batch_input_ids: list[torch.LongTensor]  # List[(jS,)]
    batch_action_mask: list[torch.BoolTensor]  # List[(jS,)]
    batch_reasoning_limits: "list[list[ReasoningLimits]]"  # List[(jTb,)] (Tb is number of thinking blocks)
    batch_credits: list[torch.FloatTensor]  # List[(jS,)]

    def __post_init__(self):
        """
        Validate that all jagged lists agree on batch size and token counts.
        """
        assert (
            len(self.batch_input_ids)
            == len(self.batch_action_mask)
            == len(self.batch_credits)
            == self.rollout_ids.shape[0]
        ), "Jagged lists must all have length equal to batch size."
        for inp, mask, cred in zip(
            self.batch_input_ids, self.batch_action_mask, self.batch_credits
        ):
            assert (
                inp.shape[0] == mask.shape[0] == cred.shape[0]
            ), "Tensors must have the same shapes along the jagged dimension."

    def __getitem__(self, key) -> "TrainingBatch":
        """
        Return a sub-batch.

        Args:
            key: A slice, or an integer (negative values count from the end)
                which selects a single-sample batch.

        Raises:
            IndexError: If an integer key is out of range.
            TypeError: If the key is neither a slice nor an integer. The
                previous implementation silently returned None here.
        """
        if isinstance(key, int):
            size = len(self)
            if not -size <= key < size:
                raise IndexError("TrainingBatch index out of range")
            position = key % size
            key = slice(position, position + 1)
        elif not isinstance(key, slice):
            raise TypeError(
                f"TrainingBatch indices must be integers or slices, not {type(key).__name__}"
            )
        return TrainingBatch(
            rollout_ids=self.rollout_ids[key],
            batch_input_ids=self.batch_input_ids[key],
            batch_action_mask=self.batch_action_mask[key],
            batch_reasoning_limits=self.batch_reasoning_limits[key],
            batch_credits=self.batch_credits[key],
        )

    def __len__(self):
        """Return the number of samples in the batch."""
        return len(self.batch_input_ids)

    def to(self, device):
        """
        Move every tensor of the batch to ``device`` in place (returns None).
        """
        self.rollout_ids = self.rollout_ids.to(device)
        self.batch_input_ids = [t.to(device) for t in self.batch_input_ids]
        self.batch_action_mask = [t.to(device) for t in self.batch_action_mask]
        self.batch_credits = [t.to(device) for t in self.batch_credits]

    def get_padded_tensors(self, padding: float = 0.0):
        """
        Convert the jagged lists into right-padded dense tensors.

        Args:
            padding: Fill value for input ids (cast to int) and credits (cast
                to float). The action mask is always padded with False.

        Returns:
            A PaddedTensorTrainingBatch holding the padded input ids, action
            mask and credits, plus the causal reasoning mask built from the
            padded shape and ``batch_reasoning_limits``.
        """
        padded_batch_input_ids = pad_sequence(
            self.batch_input_ids,
            batch_first=True,
            padding_value=int(padding),
            padding_side="right",
        )
        padded_batch_action_mask = pad_sequence(
            [m.to(dtype=torch.bool) for m in self.batch_action_mask],
            batch_first=True,
            padding_value=False,
            padding_side="right",
        )
        padded_batch_credits = pad_sequence(
            self.batch_credits,
            batch_first=True,
            padding_value=float(padding),
            padding_side="right",
        )

        # Reasoning-limit indices stay valid because padding is on the right.
        batch_causal_reasoning_mask = get_causal_reasoning_mask(
            padded_batch_input_ids.shape, self.batch_reasoning_limits
        )

        return PaddedTensorTrainingBatch(
            padded_batch_input_ids,
            padded_batch_action_mask,
            padded_batch_credits,
            batch_causal_reasoning_mask,
        )

    def append(self, other: "TrainingBatch"):
        """
        Extend this batch in place with the samples of ``other``.
        """
        self.rollout_ids = torch.cat([self.rollout_ids, other.rollout_ids])
        self.batch_input_ids.extend(other.batch_input_ids)
        self.batch_action_mask.extend(other.batch_action_mask)
        self.batch_reasoning_limits.extend(other.batch_reasoning_limits)
        self.batch_credits.extend(other.batch_credits)