Muqeeth committed on Nov 28, 2025

Commit

0caadff

verified ·

1 Parent(s): 1f4f273

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

.hydra/config.yaml +178 -0
.hydra/hydra.yaml +154 -0
.hydra/overrides.yaml +1 -0
run.log +0 -0
seed_123/Qwen/Qwen2.5-7B-Instruct/adapters/README.md +207 -0
seed_123/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_config.json +42 -0
seed_123/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_config.json +42 -0
src_code_for_reproducibility/__init__.py +0 -0
src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc +0 -0
src_code_for_reproducibility/chat_utils/__pycache__/apply_template.cpython-312.pyc +0 -0
src_code_for_reproducibility/chat_utils/__pycache__/chat_turn.cpython-312.pyc +0 -0
src_code_for_reproducibility/chat_utils/__pycache__/template_specific.cpython-312.pyc +0 -0
src_code_for_reproducibility/docs/source/contributing.rst +0 -0
src_code_for_reproducibility/docs/source/environments/dond.rst +410 -0
src_code_for_reproducibility/docs/source/environments/ipd.rst +411 -0
src_code_for_reproducibility/docs/source/marl_standard.rst +141 -0
src_code_for_reproducibility/docs/source/src.environments.dond.dond_game.rst +7 -0
src_code_for_reproducibility/docs/source/src.environments.ipd.ipd_statistics_funcs.rst +7 -0
src_code_for_reproducibility/docs/source/src.experiments.generate_and_train.rst +7 -0
src_code_for_reproducibility/docs/source/src.generation.run_games.rst +7 -0
src_code_for_reproducibility/docs/source/src.models.local_llm.rst +7 -0
src_code_for_reproducibility/docs/source/src.models.new_local_llm.rst +7 -0
src_code_for_reproducibility/docs/source/src.models.rst +20 -0
src_code_for_reproducibility/docs/source/src.training.reinforce_training.rst +7 -0
src_code_for_reproducibility/docs/source/src.training.rl_convs_processing.rst +7 -0
src_code_for_reproducibility/docs/source/src.training.train_main.rst +7 -0
src_code_for_reproducibility/docs/source/src.utils.export_ppo_training_set.rst +7 -0
src_code_for_reproducibility/docs/source/src.utils.extra_stats.rst +7 -0
src_code_for_reproducibility/docs/source/src.utils.log_gpu_usage.rst +7 -0
src_code_for_reproducibility/docs/source/src.utils.parallel_shuffle.rst +7 -0
src_code_for_reproducibility/markov_games/__pycache__/__init__.cpython-311.pyc +0 -0
src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/__pycache__/mg_utils.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/__pycache__/rollout_tree.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/diplomacy/diplomacy_agent.py +259 -0
src_code_for_reproducibility/markov_games/diplomacy/diplomacy_logging_for_training.py +0 -0
src_code_for_reproducibility/markov_games/ipd/__pycache__/__init__.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/__pycache__/no_press_nego_simulation.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/__pycache__/tas_simple_agent.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/__pycache__/tas_simulation.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/tas_agent.py +108 -0
src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-312.pyc +0 -0
src_code_for_reproducibility/training/credit_methods.py +295 -0
src_code_for_reproducibility/training/tally_tokenwise.py +276 -0
src_code_for_reproducibility/training/tokenize_chats.py +128 -0
src_code_for_reproducibility/training/trainer_ad_align.py +492 -0
src_code_for_reproducibility/training/trainer_common.py +1054 -0
src_code_for_reproducibility/training/trainer_independent.py +155 -0
src_code_for_reproducibility/training/trainer_sum_rewards.py +127 -0
src_code_for_reproducibility/training/training_data_utils.py +394 -0

.hydra/config.yaml ADDED Viewed

	@@ -0,0 +1,178 @@

+experiment:
+  wandb_enabled: true
+  nb_epochs: 3000
+  nb_matches_per_iteration: 64
+  reinit_matches_each_it: true
+  checkpoint_every_n_iterations: 10
+  start_epoch: 0
+  resume_experiment: true
+  base_seed: 123
+  seed_group_size: 8
+  train: true
+  stat_methods_for_live_wandb: mllm.markov_games.negotiation.negotiation_statistics
+  name: no_press_10_1_ties_ad_align_nocurrtimestep_seed123
+  agent_buffer: true
+  keep_agent_buffer_count: ${lora_count}
+  agent_buffer_recent_k: -1
+logging:
+  wandb:
+    enabled: false
+    project: llm-negotiation
+    entity: null
+    mode: online
+    name: null
+    group: null
+    tags: []
+    notes: null
+temperature: 1.0
+markov_games:
+  runner_method_name: LinearRunner
+  runner_kwargs: {}
+  group_by_round: true
+  simulation_class_name: NoPressSimulation
+  simulation_init_args:
+    nb_of_rounds: 10
+    quota_messages_per_agent_per_round: 0
+    game_type: 10-1-ties
+    atleast_one_conflict: true
+    item_types:
+    - hats
+    - books
+    - balls
+  agents:
+    0:
+      agent_id: ${agent_0_id}
+      agent_name: Alice
+      agent_class_name: NoPressAgent
+      policy_id: base_llm/agent_adapter
+      init_kwargs:
+        goal: Maximize your total points over the whole game.
+    1:
+      agent_id: ${agent_1_id}
+      agent_name: Bob
+      agent_class_name: NoPressAgent
+      policy_id: base_llm/agent_adapter
+      init_kwargs:
+        goal: Maximize your total points over the whole game.
+models:
+  base_llm:
+    class: LeanLocalLLM
+    init_args:
+      llm_id: base_llm
+      model_name: Qwen/Qwen2.5-7B-Instruct
+      inference_backend: vllm
+      hf_kwargs:
+        device_map: auto
+        torch_dtype: bfloat16
+        max_memory:
+          0: 20GiB
+        attn_implementation: flash_attention_2
+      inference_backend_init_kwargs:
+        enable_lora: true
+        seed: ${experiment.base_seed}
+        enable_prefix_caching: true
+        max_model_len: 10000.0
+        gpu_memory_utilization: 0.5
+        dtype: bfloat16
+        trust_remote_code: true
+        max_lora_rank: 32
+        enforce_eager: false
+        max_loras: ${lora_count}
+        max_cpu_loras: ${lora_count}
+        enable_sleep_mode: true
+      inference_backend_sampling_params:
+        temperature: ${temperature}
+        top_p: 1.0
+        max_tokens: 400
+        top_k: -1
+        logprobs: 0
+      adapter_configs:
+        agent_adapter:
+          task_type: CAUSAL_LM
+          r: 32
+          lora_alpha: 64
+          lora_dropout: 0.0
+          target_modules: all-linear
+        critic_adapter:
+          task_type: CAUSAL_LM
+          r: 32
+          lora_alpha: 64
+          lora_dropout: 0.0
+          target_modules: all-linear
+      enable_thinking: null
+      regex_max_attempts: 3
+critics:
+  agent_critic:
+    module_pointer:
+    - base_llm
+    - critic_adapter
+optimizers:
+  agent_optimizer:
+    module_pointer:
+    - base_llm
+    - agent_adapter
+    optimizer_class_name: torch.optim.Adam
+    init_args:
+      lr: 3.0e-06
+      weight_decay: 0.0
+  critic_optimizer:
+    module_pointer: agent_critic
+    optimizer_class_name: torch.optim.Adam
+    init_args:
+      lr: 3.0e-06
+      weight_decay: 0.0
+trainers:
+  agent_trainer:
+    class: TrainerAdAlign
+    module_pointers:
+      policy:
+      - base_llm
+      - agent_adapter
+      policy_optimizer: agent_optimizer
+      critic: agent_critic
+      critic_optimizer: critic_optimizer
+    kwargs:
+      entropy_coeff: 0.0
+      entropy_topk: null
+      entropy_mask_regex: null
+      kl_coeff: 0.001
+      gradient_clipping: 1.0
+      restrict_tokens: null
+      mini_batch_size: 1
+      use_gradient_checkpointing: false
+      temperature: ${temperature}
+      device: cuda:0
+      use_gae: false
+      whiten_advantages: false
+      whiten_advantages_time_step_wise: false
+      skip_discounted_state_visitation: true
+      use_gae_lambda_annealing: false
+      gae_lambda_annealing_method: None
+      gae_lambda_annealing_method_params: None
+      gae_lambda_annealing_limit: 0.95
+      discount_factor: 0.9
+      use_rloo: true
+      enable_tokenwise_logging: false
+      pg_loss_normalization: nb_tokens
+      truncated_importance_sampling_ratio_cap: 2.0
+      reward_normalizing_constant: 100.0
+      ad_align_force_coop_first_step: false
+      ad_align_clipping: null
+      ad_align_gamma: 0.9
+      ad_align_exclude_k_equals_t: true
+      ad_align_use_sign: false
+      ad_align_beta: 1.0
+      use_old_ad_align: true
+      use_time_regularization: false
+      rloo_branch: false
+      reuse_baseline: false
+train_on_which_data:
+  agent_trainer: ${agent_ids}
+lora_count: 30
+common_agent_kwargs:
+  goal: Maximize your total points over the whole game.
+agent_0_id: Alice
+agent_1_id: Bob
+agent_ids:
+- Alice
+- Bob

.hydra/hydra.yaml ADDED Viewed

	@@ -0,0 +1,154 @@

+hydra:
+  run:
+    dir: ${oc.env:SCRATCH}/llm_negotiation/${now:%Y_%m}/${experiment.name}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+      Use --hydra-help to view Hydra specific help
+      '
+    template: '${hydra.help.header}
+      == Configuration groups ==
+      Compose your configuration from those groups (group=option)
+      $APP_CONFIG_GROUPS
+      == Config ==
+      Override anything in the config (foo.bar=value)
+      $CONFIG
+      ${hydra.help.footer}
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+      See https://hydra.cc for more info.
+      == Flags ==
+      $FLAGS_HELP
+      == Configuration groups ==
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+      $HYDRA_CONFIG_GROUPS
+      Use ''--cfg hydra'' to Show the Hydra config.
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: run
+    chdir: false
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: no_press_10_1_ties_ad_align_nocurrtimestep_seed123.yaml
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /scratch/m/muqeeth/llm_negotiation
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /scratch/m/muqeeth/llm_negotiation/configs
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /scratch/m/muqeeth/llm_negotiation/2025_11/no_press_10_1_ties_ad_align_nocurrtimestep_seed123
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false

.hydra/overrides.yaml ADDED Viewed

	@@ -0,0 +1 @@


1	+ []

run.log ADDED Viewed

The diff for this file is too large to render. See raw diff

seed_123/Qwen/Qwen2.5-7B-Instruct/adapters/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.17.1

seed_123/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

seed_123/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_config.json ADDED Viewed

	@@ -0,0 +1,42 @@

+{
+  "alpha_pattern": {},
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "o_proj",
+    "v_proj",
+    "up_proj",
+    "q_proj",
+    "k_proj",
+    "gate_proj",
+    "down_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

src_code_for_reproducibility/__init__.py ADDED Viewed

File without changes

src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (148 Bytes). View file

src_code_for_reproducibility/chat_utils/__pycache__/apply_template.cpython-312.pyc ADDED Viewed

Binary file (3.64 kB). View file

src_code_for_reproducibility/chat_utils/__pycache__/chat_turn.cpython-312.pyc ADDED Viewed

Binary file (1.32 kB). View file

src_code_for_reproducibility/chat_utils/__pycache__/template_specific.cpython-312.pyc ADDED Viewed

Binary file (3.61 kB). View file

src_code_for_reproducibility/docs/source/contributing.rst ADDED Viewed

File without changes

src_code_for_reproducibility/docs/source/environments/dond.rst ADDED Viewed

	@@ -0,0 +1,410 @@

+=================
+Deal or No Deal
+=================
+The Deal or No Deal (DoND) environment provides a multi-agent negotiation interface where players trade
+items with different values. This document describes the API for interacting with the DoND environment
+and its associated agent handler.
+Overview
+--------
+Deal or No Deal is a negotiation game where two agents must agree on how to divide a set of items,
+each of which has different values to each agent. The agents engage in a back-and-forth dialogue to
+determine an allocation of the items, with each trying to maximize their own total value.
+Our implementation follows the Multi-Agent Negotiation Environment standard, allowing it to be used
+with LLM agents through a text-based interface.
+Game Rules
+----------
+### Basic Structure
+The core mechanics of Deal or No Deal are:
+1. Two agents negotiate over a set of items (e.g., books, balls, hats)
+2. Each item has:
+   - A specific quantity (how many units of that item are available)
+   - A value for each agent (which may differ between agents)
+3. Agents take turns sending messages to negotiate how to split the items
+4. Once an agreement is reached, agents finalize the deal
+5. Points are awarded based on the value of items each agent receives
+### Detailed Gameplay
+#### Setup Phase
+The game begins with:
+- A set of items (e.g., "book", "hat", "ball")
+- Each item has a quantity (e.g., 6 books, 2 hats, 4 balls)
+- Each agent has private values for each item (e.g., books might be worth 5 points to one agent but only 2 points to the other)
+- Agents are assigned roles (starting negotiator and responding negotiator)
+#### Negotiation Phase
+1. Agents take turns sending free-form text messages to each other
+2. Messages can include offers, counter-offers, questions, or strategic communication
+3. There is a maximum number of messages permitted (preventing endless negotiations)
+4. Either agent can propose to finalize an agreement at any time
+For example:
+- Agent 1: "I propose I get all the books and you get all the hats and balls."
+- Agent 2: "That doesn't work for me. How about you get 3 books and I get 3 books, all the hats, and all the balls?"
+- Agent 1: "Let me counter-offer: I get 4 books and 2 balls, you get 2 books, all hats, and 2 balls."
+#### Finalization Phase
+1. When an agent wants to finalize a deal, they must specify the exact allocation:
+   - How many of each item they receive
+   - How many of each item the other agent receives
+2. The other agent must then either agree (by submitting the same allocation) or reject the finalization
+3. If both agents submit matching finalizations, the deal is executed
+4. If finalizations don't match, no agreement is reached, and both agents receive 0 points
+#### Scoring
+1. Each agent's score is calculated based on the value of items they receive
+2. The formula is: Sum(quantity_of_item_i × value_of_item_i_to_agent)
+3. If no agreement is reached, both agents receive 0 points
+### Example Game
+Let's walk through a simple example:
+**Setup:**
+- Items: Books (4), Hats (2), Balls (6)
+- Agent 1 values: Books=5, Hats=1, Balls=2
+- Agent 2 values: Books=3, Hats=6, Balls=1
+**Negotiation (simplified):**
+1. Agent 1: "I would like all the books and balls. You can have the hats."
+2. Agent 2: "That doesn't work for me. Books are valuable. I propose I get all the hats and 2 books, you get 2 books and all the balls."
+3. Agent 1: "How about I get 3 books and all the balls, and you get 1 book and all the hats?"
+4. Agent 2: "I accept your proposal."
+**Finalization:**
+- Agent 1 submits: Agent 1 gets (Books: 3, Hats: 0, Balls: 6), Agent 2 gets (Books: 1, Hats: 2, Balls: 0)
+- Agent 2 submits the same allocation, confirming agreement
+**Scoring:**
+- Agent 1 score: (3 books × 5) + (0 hats × 1) + (6 balls × 2) = 15 + 0 + 12 = 27 points
+- Agent 2 score: (1 book × 3) + (2 hats × 6) + (0 balls × 1) = 3 + 12 + 0 = 15 points
+### Game Variations
+The DoND environment supports several variations through configuration parameters:
+#### Different Value Distributions
+The environment offers multiple ways to assign values to items:
+1. **Standard Random Setup (dond_random_setup)**:
+   - Items have even-numbered quantities
+   - Each agent receives distinct random values for each item
+   - Values are drawn from a uniform distribution
+2. **Independent Random Values (independent_random_vals)**:
+   - Item quantities can be any number in the specified range
+   - Values for each agent are drawn independently
+   - Creates more varied negotiation scenarios
+3. **Bicameral Value Distribution (bicameral_vals_assignator)**:
+   - Creates a "high value" and "low value" distribution for each item
+   - Each agent assigns high values to approximately half the items and low values to the other half
+   - Values are drawn from normal distributions with different means
+   - Creates scenarios with clear trade opportunities
+#### Visibility Options
+1. **Finalization Visibility**:
+   - When enabled, both agents can see each other's finalization proposals
+   - When disabled, finalization proposals remain private until both are submitted
+2. **Other Values Visibility**:
+   - When enabled, agents can see each other's value functions
+   - When disabled, agents only know their own values
+   - Creates information asymmetry and richer negotiation dynamics
+#### Game Modes
+1. **Cooperative Mode ("coop")**:
+   - Agents are encouraged to find mutually beneficial solutions
+   - Success is measured by the sum of both agents' scores
+2. **Competitive Mode ("comp")**:
+   - Agents aim to maximize their individual scores
+   - Creates more adversarial negotiations
+#### Round Structure
+1. **Single Round**:
+   - One negotiation session between the same agents
+   - Simple evaluation of negotiation skills
+2. **Multiple Rounds**:
+   - Agents negotiate multiple times with different item setups
+   - Allows for learning and adaptation over time
+   - Roles can be swapped between rounds
+DondEnv
+------------
+The ``DondEnv`` class provides an interface to the Deal or No Deal environment that follows the Multi-Agent
+Negotiation Environment standard.
+.. code-block:: python
+    class DondEnv:
+        """
+        Multi-Agent Negotiation Environment for Deal or No Deal.
+        """
+        def __init__(
+            self,
+            agents,
+            mode="coop",
+            max_messages=None,
+            min_messages=None,
+            max_chars_per_message=None,
+            rounds_per_game=1,
+            random_setup_func=None,
+            random_setup_kwargs=None,
+            role_assignator_func=None,
+            role_assignator_func_kwargs=None,
+            finalization_visibility=False,
+            other_values_visibility=False,
+            random_seed=None
+        ):
+            """Initialize the Deal or No Deal environment.
+            Args:
+                agents: List of agent IDs participating in the game
+                mode: Game mode ("coop" or "comp")
+                max_messages: Maximum number of messages per agent per round
+                min_messages: Minimum number of messages per agent per round
+                max_chars_per_message: Maximum characters per message
+                rounds_per_game: Number of negotiation rounds to play
+                random_setup_func: Function to generate item quantities and values
+                random_setup_kwargs: Arguments for the random setup function
+                role_assignator_func: Function to assign roles to agents
+                role_assignator_func_kwargs: Arguments for the role assignator
+                finalization_visibility: Whether agents can see each other's finalizations
+                other_values_visibility: Whether agents can see each other's values
+                random_seed: Seed for reproducibility
+            """
+            # ...
+        def reset(self):
+            """Reset the environment to an initial state and return the initial observation.
+            Returns:
+                observation (dict): A dictionary where keys are agent identifiers and values are observations.
+            """
+            # ...
+        def step(self, actions):
+            """Take a step in the environment using the provided actions.
+            Args:
+                actions (dict): A dictionary where keys are agent identifiers and values are actions.
+                    Actions can be messages or finalization proposals.
+            Returns:
+                observations (dict): A dictionary where keys are agent identifiers and values are observations.
+                done (bool): Whether the episode has ended.
+                info (dict): Additional information about the environment.
+            """
+            # ...
+        def get_state(self):
+            """Retrieve the current state of the game.
+            Returns:
+                state (dict): The current state of the game, including items, quantities, values, etc.
+            """
+            # ...
+Key Implementation Details
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+The ``DondEnv`` class implements several key features:
+1. **Multi-Agent Support**: The environment tracks two agents and manages their alternating messages.
+2. **Turn-Based Dialogue**: The environment enforces turn structure and limits on message count.
+3. **Finalization Processing**: The environment validates and processes finalization proposals.
+4. **Random Setup**: The environment supports multiple methods of generating negotiation scenarios.
+5. **Round Management**: The environment can handle multiple rounds with different setups.
+Observation Structure
+~~~~~~~~~~~~~~~~~~~~~
+Each agent receives an observation (state) dictionary with rich information about the game:
+.. code-block:: python
+    {
+        "mode": str,                 # Game mode ("coop" or "comp")
+        "role_values": dict,         # Value mappings for each role
+        "role_props": dict,          # Properties for each role
+        "agent_to_role": dict,       # Mapping from agent IDs to roles
+        "is_new_round": bool,        # Whether this is the start of a new round
+        "is_new_game": bool,         # Whether this is the start of a new game
+        "game_over": bool,           # Whether the game is over
+        "items": list,               # List of item names
+        "quantities": dict,          # Quantities of each item
+        "has_finalized": bool,       # Whether finalization has been proposed
+        "last_message": dict,        # The last message sent
+        "messages_remaining": dict,  # Number of messages each agent can still send
+        # And various history tracking fields
+    }
+Action Structure
+~~~~~~~~~~~~~~~
+Actions can be:
+1. **Text Messages**: Free-form text for negotiation.
+2. **Finalization Proposals**: Structured data specifying the exact allocation of items.
+Example finalization format:
+.. code-block:: python
+    {
+        "type": "finalize",
+        "allocation": {
+            "agent1": {"book": 3, "hat": 0, "ball": 6},
+            "agent2": {"book": 1, "hat": 2, "ball": 0}
+        }
+    }
+Value Setup Functions
+--------------------
+The DoND environment provides several functions for setting up item values:
+.. code-block:: python
+    def dond_random_setup(items, min_quant, max_quant, min_val, max_val, random_seed=None):
+        """
+        Generates items, even-numbered quantities and distinct random values for each category for both agents.
+        Args:
+            items (list): List of items.
+            min_quant (int): Minimum quantity per item.
+            max_quant (int): Maximum quantity per item.
+            min_val (int): Minimum value per item.
+            max_val (int): Maximum value per item.
+            random_seed (int, optional): Seed for random generation.
+        Returns:
+            tuple: (items, quantities, (val_starting_negotiator, val_responding_negotiator))
+        """
+        # ...
+    def independent_random_vals(items, min_quant, max_quant, min_val, max_val, random_seed=None):
+        """
+        Generates random quantities and independent random values for both agents.
+        Args:
+            Similar to dond_random_setup
+        Returns:
+            tuple: (items, quantities, (val_starting_negotiator, val_responding_negotiator))
+        """
+        # ...
+    def bicameral_vals_assignator(items, min_quant, max_quant, low_val_mean, low_val_std, high_val_mean, high_val_std, random_seed=None):
+        """
+        Generates values with a bicameral distribution - each agent values half the items highly.
+        Args:
+            items (list): List of items.
+            min_quant, max_quant: Range for quantities
+            low_val_mean, low_val_std: Mean and standard deviation for the "low value" distribution
+            high_val_mean, high_val_std: Mean and standard deviation for the "high value" distribution
+            random_seed: Seed for reproducibility
+        Returns:
+            tuple: (items, quantities, (val_starting_negotiator, val_responding_negotiator))
+        """
+        # ...
+Running DoND Games
+----------------------
+To run Deal or No Deal games with LLM agents, you can use the following structure:
+.. code-block:: python
+    from mllm.environments.dond.dond_game import DondEnv
+    from mllm.environments.dond.dond_agent import DondAgent
+    from mllm.run_matches import run_batched_matches
+    # Create environment
+    env = DondEnv(
+        agents=["agent1", "agent2"],
+        mode="coop",
+        max_messages=10,
+        rounds_per_game=1,
+        random_setup_func="dond_random_setup",
+        random_setup_kwargs={
+            "items": ["book", "hat", "ball"],
+            "min_quant": 2,
+            "max_quant": 8,
+            "min_val": 1,
+            "max_val": 10
+        },
+        finalization_visibility=False
+    )
+    # Create agent handlers (implementation details would vary)
+    agent_handlers = {
+        "agent1": DondAgent(agent_id="agent1"),
+        "agent2": DondAgent(agent_id="agent2")
+    }
+    # Define policy mapping
+    policy_mapping = {
+        "llm_policy": my_llm_policy_function
+    }
+    # Run the game
+    game_results = run_batched_matches(
+        envs=[env],
+        agent_handlers_per_env=[agent_handlers],
+        policy_mapping=policy_mapping,
+        max_parallel_matches=1
+    )
+Limitations and Considerations
+-----------------------------
+1. **Negotiation Complexity**: The open-ended nature of negotiations can be challenging for some LLM agents.
+2. **Parsing Challenges**: Extracting structured finalization proposals from free-form text requires robust parsing.
+3. **Optimization Opportunities**: Different agents may employ different negotiation strategies to optimize outcomes.
+4. **Fairness Evaluation**: The environment allows research into questions of fair division and Pareto optimality.
+5. **Strategic Deception**: Agents might strategically misrepresent their true values, adding complexity to negotiations.
+Advanced Usage
+--------------
+For advanced usage, you can:
+1. **Custom Value Functions**: Create more complex distributions of item values for specific research questions.
+2. **Novel Negotiation Scenarios**: Design item sets and values to test specific negotiation skills.
+3. **Curriculum Learning**: Create progressively more difficult negotiation scenarios.
+4. **Communication Analysis**: Analyze the language and strategies used in successful negotiations.
+5. **Multi-Round Dynamics**: Study how agents adapt their strategies over multiple rounds.

src_code_for_reproducibility/docs/source/environments/ipd.rst ADDED Viewed

	@@ -0,0 +1,411 @@

+===========================
+Iterated Prisoner's Dilemma
+===========================
+The Iterated Prisoner's Dilemma environment provides a classic game theory setting for studying cooperation
+and competition between agents. This document describes the API for interacting with the IPD environment
+and its associated agent handler.
+Overview
+--------
+The Prisoner's Dilemma is a fundamental problem in game theory that demonstrates why two rational individuals might not
+cooperate, even when it appears in their best interest to do so. In the iterated version, the same two players
+repeatedly face the same dilemma, allowing for the development of trust or retaliation based on previous interactions.
+Our implementation follows the Multi-Agent Negotiation Environment standard, allowing it to be used with
+LLM agents through a text-based interface.
+Game Rules
+----------
+Basic Premise
+~~~~~~~~~~~~~
+The scenario behind the Prisoner's Dilemma is as follows:
+Two criminals are arrested and imprisoned. Each prisoner is in solitary confinement with no means of communicating with
+the other. The prosecutors lack sufficient evidence to convict the pair on the principal charge, but they have enough
+to convict both on a lesser charge. Simultaneously, the prosecutors offer each prisoner a bargain:
+- If both prisoners betray each other, each serves 2 years in prison (the "punishment" payoff)
+- If one betrays the other while the other remains silent, the betrayer goes free (the "temptation" payoff) while the
+  silent accomplice serves 3 years (the "sucker" payoff)
+- If both remain silent, each serves only 1 year in prison (the "reward" payoff)
+Game Mechanics
+~~~~~~~~~~~~~~
+In our implementation, the choices are simplified to:
+- **C**: Cooperate (remain silent)
+- **D**: Defect (betray the other prisoner)
+Each round, both players simultaneously choose either C or D, and receive points based on the combination of their choices:
+- Both choose C: Both receive the "reward" payoff (3 points by default)
+- Both choose D: Both receive the "punishment" payoff (1 point by default)
+- One chooses C, one chooses D: The defector receives the "temptation" payoff (5 points by default), while the cooperator
+  receives the "sucker" payoff (0 points by default)
+### Example: Single Round
+Let's see how a single round plays out:
+1. Alice and Bob simultaneously make their choices
+2. If Alice chooses C and Bob chooses C:
+   - Alice receives 3 points
+   - Bob receives 3 points
+3. If Alice chooses C and Bob chooses D:
+   - Alice receives 0 points
+   - Bob receives 5 points
+4. If Alice chooses D and Bob chooses C:
+   - Alice receives 5 points
+   - Bob receives 0 points
+5. If Alice chooses D and Bob chooses D:
+   - Alice receives 1 point
+   - Bob receives 1 point
+### Iterated Game Structure
+The iterated version repeats this basic game for a fixed number of rounds. The key features are:
+1. Players know the total number of rounds in advance
+2. After each round, players learn what choice the other player made
+3. Players maintain a cumulative score across all rounds
+4. Players can adjust their strategy based on the history of previous interactions
+Game Variations
+~~~~~~~~~~~~~~~
+The IPD environment supports several variations through configuration parameters:
+.. rubric:: Different Payoff Matrices
+The standard payoff values can be modified to create different incentive structures:
+- **Traditional PD**: reward=3, punishment=1, temptation=5, sucker=0
+- **Weak Temptation**: reward=3, punishment=1, temptation=4, sucker=0 (reduces the incentive to defect)
+- **Harsh Punishment**: reward=3, punishment=0, temptation=5, sucker=0 (increases the cost of mutual defection)
+- **Generous**: reward=4, punishment=2, temptation=5, sucker=1 (cushions the blow of being betrayed)
+.. rubric:: Game Length Variations
+The number of rounds can significantly impact strategy:
+- **Short Games** (5-10 rounds): Incentivizes more defection, especially near the end
+- **Medium Games** (20-50 rounds): Allows for the development of tit-for-tat and forgiveness strategies
+- **Long Games** (100+ rounds): Favors steady cooperation with occasional "probing" defections
+Common Strategies
+~~~~~~~~~~~~~~~~~
+While not enforced by the environment, several well-known strategies can emerge:
+- **Always Cooperate**: Always choose C
+- **Always Defect**: Always choose D
+- **Tit for Tat**: Start with C, then copy what the opponent did in the previous round
+- **Forgiving Tit for Tat**: Like Tit for Tat, but occasionally cooperate even after being defected against
+- **Grudger**: Cooperate until the opponent defects once, then always defect
+- **Random**: Choose randomly between C and D
+IPDEnv
+------
+The ``IPDEnv`` class provides an interface to the Iterated Prisoner's Dilemma environment that follows the
+Multi-Agent Negotiation Environment standard.
+.. code-block:: python
+    class IPDEnv:
+        """
+        Iterated Prisoner's Dilemma environment following the MarlEnvironment standard.
+        In each round of the game, two agents simultaneously choose to either cooperate (C) or defect (D).
+        The payoffs are as follows:
+        - If both cooperate: Both receive the "reward" (usually 3 points)
+        - If both defect: Both receive the "punishment" (usually 1 point)
+        - If one cooperates and one defects: The defector receives the "temptation" (usually 5 points)
+          and the cooperator receives the "sucker" payoff (usually 0 points)
+        The game is played for a specified number of rounds.
+        """
+        def __init__(
+            self,
+            rounds_per_game: int = 10,
+            reward: float = 3.0,           # Both cooperate
+            punishment: float = 1.0,       # Both defect
+            temptation: float = 5.0,       # Defector's reward when other cooperates
+            sucker: float = 0.0,           # Cooperator's reward when other defects
+            random_seed: Optional[int] = None,
+        ):
+            """
+            Initialize the Iterated Prisoner's Dilemma environment.
+            Args:
+                rounds_per_game: Number of rounds to play
+                reward: Payoff when both agents cooperate
+                punishment: Payoff when both agents defect
+                temptation: Payoff for defecting when other agent cooperates
+                sucker: Payoff for cooperating when other agent defects
+                random_seed: Random seed for reproducibility
+            """
+            # ...
+        def reset(self) -> Dict[str, Dict[str, Any]]:
+            """
+            Reset the environment to an initial state and return the initial observation.
+            Returns:
+                observation (dict): A dictionary where keys are agent identifiers and values are observations.
+            """
+            # ...
+        def step(self, actions: Dict[str, str]) -> Tuple[Dict[str, Dict[str, Any]], bool, Dict[str, Any]]:
+            """
+            Take a step in the environment using the provided actions.
+            Args:
+                actions (dict): A dictionary where keys are agent identifiers and values are actions ('C' or 'D').
+            Returns:
+                observations (dict): A dictionary where keys are agent identifiers and values are observations.
+                done (bool): Whether the episode has ended.
+                info (dict): Additional information about the environment.
+            """
+            # ...
+Key Implementation Details
+~~~~~~~~~~~~~~~~~~~~~~~~~
+The ``IPDEnv`` class implements several key features:
+1. **Two-Agent Support**: The environment tracks two agents ("alice" and "bob") and manages their interactions.
+2. **Round-Based Play**: The environment enforces turn structure and tracks game history.
+3. **Payoff Matrix**: The environment calculates rewards based on the standard prisoner's dilemma payoff matrix.
+4. **Observation Generation**: The environment generates detailed observations for each agent, including action history and rewards.
+5. **Game Termination**: The environment tracks game termination after the specified number of rounds.
+Observation Structure
+~~~~~~~~~~~~~~~~~~~~
+Each agent receives an observation dictionary with the following structure:
+.. code-block:: python
+    {
+        "current_round": int,                # Current round number (0-indexed)
+        "rounds_per_game": int,              # Total number of rounds in the game
+        "history": List[Dict],               # Complete game history so far
+        "last_round_actions": Dict[str, str], # Actions from the previous round (if any)
+        "last_round_reward": float,          # Reward received in the previous round (if any)
+        "total_reward": float,               # Cumulative reward so far
+        "payoff_matrix": Dict[str, float],   # The game's payoff matrix values
+    }
+Action Structure
+~~~~~~~~~~~~~~~
+Actions are simple strings:
+1. ``"C"`` for Cooperate
+2. ``"D"`` for Defect
+IPDAgent
+--------------
+The ``IPDAgent`` class implements the agent handler interface for the Iterated Prisoner's Dilemma, processing observations from the environment and generating actions through an LLM.
+.. code-block:: python
+    class IPDAgent:
+        """
+        Agent handler for Iterated Prisoner's Dilemma, implementing the AgentState interface
+        for the multi-agent negotiation standard.
+        """
+        def __init__(
+            self,
+            agent_id: str,
+            policy_id: str = "llm_policy",
+            system_prompt: Optional[str] = None,
+            max_errors: int = 3,
+            opponent_id: Optional[str] = None,
+        ):
+            """
+            Initialize the IPD agent handler.
+            Args:
+                agent_id: Identifier for this agent ("alice" or "bob")
+                policy_id: Identifier for the policy this agent uses
+                system_prompt: Optional custom system prompt for the LLM
+                max_errors: Maximum number of parsing errors before defaulting to cooperate
+                opponent_id: Optional identifier of the opponent (inferred if not provided)
+            """
+            # ...
+        def step(self, observation_from_env: Dict[str, Any], policy_output: str = None) -> Tuple[str, Dict[str, Any], str, bool, Dict[str, Any]]:
+            """
+            Update the agent state based on the observation and process the policy output.
+            Args:
+                observation_from_env: The observation from the environment
+                policy_output: The output from the policy (LLM response)
+            Returns:
+                policy_id: The policy identifier
+                policy_input: The input to the policy
+                action: The action to be sent to the environment
+                done: Whether the action is ready to be sent to the environment
+                info: Additional information about the agent
+            """
+            # ...
+Key Implementation Details
+~~~~~~~~~~~~~~~~~~~~~~~~~
+The ``IPDAgent`` class implements several key features:
+1. **LLM Interaction**: The agent generates prompts for an LLM and processes the LLM's responses.
+2. **Action Extraction**: The agent parses the LLM's output to extract valid actions (C or D).
+3. **Error Handling**: The agent provides helpful error messages when parsing fails and defaults to cooperation after multiple failures.
+4. **History Tracking**: The agent maintains and provides the complete game history in its prompts.
+5. **Strategy Explanation**: The agent can extract and log the reasoning behind an LLM's decisions.
+Prompt Structure
+~~~~~~~~~~~~~~~
+The agent generates prompts that include:
+1. **System Prompt**: Instructions and context for the LLM, explaining its role and the rules of the Prisoner's Dilemma.
+2. **Game State Description**: A text description of the current game state, including:
+   - Current round number
+   - History of previous rounds (if any)
+   - Cumulative score
+3. **Action Request**: Instructions on how to format the response, requiring an explicit action tag.
+Example system prompt:
+.. code-block:: text
+    You are playing as Alice in an Iterated Prisoner's Dilemma game against Bob.
+    In each round, you must choose to either Cooperate (C) or Defect (D).
+    The payoffs are:
+    - If both players Cooperate: You each get 3 points
+    - If both players Defect: You each get 1 point
+    - If you Cooperate and Bob Defects: You get 0 points, Bob gets 5 points
+    - If you Defect and Bob Cooperates: You get 5 points, Bob gets 0 points
+    Your goal is to maximize your total points across all rounds.
+    The game will last for exactly 10 rounds, and both players know this.
+Example game state prompt:
+.. code-block:: text
+    Current round: 3/10
+    History:
+    Round 1: You chose C, Bob chose C. You earned 3 points.
+    Round 2: You chose C, Bob chose D. You earned 0 points.
+    Your total score so far: 3 points
+    What is your choice for round 3?
+    Please respond with <action>C</action> to cooperate or <action>D</action> to defect,
+    and explain your reasoning.
+Running IPD Games
+----------------------
+To run Iterated Prisoner's Dilemma games with LLM agents, you can use the following code structure:
+.. code-block:: python
+    from mllm.environments.ipd.ipd_game import IPDEnv
+    from mllm.environments.ipd.ipd_agent import IPDAgent
+    from mllm.run_matches import run_batched_matches
+    # Create environment
+    env = IPDEnv(
+        rounds_per_game=10,
+        reward=3.0,
+        punishment=1.0,
+        temptation=5.0,
+        sucker=0.0
+    )
+    # Create agent handlers
+    agent_handlers = {
+        "alice": IPDAgent(agent_id="alice"),
+        "bob": IPDAgent(agent_id="bob")
+    }
+    # Define policy mapping
+    policy_mapping = {
+        "llm_policy": my_llm_policy_function
+    }
+    # Run the game
+    game_results = run_batched_matches(
+        envs=[env],
+        agent_handlers_per_env=[agent_handlers],
+        policy_mapping=policy_mapping,
+        max_parallel_matches=1
+    )
+    # Process results
+    for result in game_results:
+        print(f"Game finished. Scores: {result['total_rewards']}")
+Statistics and Analysis
+----------------------
+The IPD environment includes utility functions for analyzing game outcomes:
+1. **Cooperation Rates**: Percentage of rounds where each agent cooperated.
+2. **Mutual Cooperation/Defection**: Percentage of rounds where both agents made the same choice.
+3. **Score Distribution**: Analysis of how points were accumulated over the game.
+These statistics can be calculated using the ``gather_ipd_statistics`` function:
+.. code-block:: python
+    from mllm.environments.ipd.ipd_statistics_funcs import gather_ipd_statistics
+    stats = gather_ipd_statistics(match_info, env_info)
+    print(f"Cooperation rates: {stats['cooperation_rate']}")
+    print(f"Mutual cooperation rate: {stats['mutual_cooperation_rate']}")
+    print(f"Mutual defection rate: {stats['mutual_defection_rate']}")
+Limitations and Considerations
+-----------------------------
+1. **Determinism**: The environment is deterministic, with randomness only in initialization if a seed is provided.
+2. **Limited Player Count**: The IPD environment only supports exactly two players.
+3. **Perfect Information**: Both players have perfect information about the game history.
+4. **Simultaneous Actions**: Both players act simultaneously, which requires adaptations for some LLM interfaces.
+5. **Fixed Game Length**: The total number of rounds is fixed and known to both players from the start.
+Advanced Usage
+--------------
+For advanced usage, you can customize:
+1. **Payoff Matrix**: Modify reward values to create different incentive structures.
+2. **System Prompts**: Customize the LLM's understanding of the game and potential strategies.
+3. **Error Handling**: Adjust how the agent responds to invalid LLM outputs.
+4. **Analysis**: Create custom statistics gathering for specific research questions.
+5. **Integration**: Connect the IPD environment to other negotiation frameworks or tournament systems.

src_code_for_reproducibility/docs/source/marl_standard.rst ADDED Viewed

	@@ -0,0 +1,141 @@

+==========================================================
+Abstract Standard for Multi-Agent Negotiation Environments
+==========================================================
+Multi-Agent Negotiation Environments require more features than gymnasium environments in order to be used as interfaces in general game running code.
+The four fundamental differences between gymnasium environments and Multi-Agent Negotiation Environments are:
+1. Response from the LLM is a text action, not a discrete action. Therefore, appropriate parsing of the text is required. The model may need to be run multiple times to get the full action.
+    This is why we introduce the `AgentHandler` class, which is responsible for parsing the LLM's response.
+2. The environment needs to be able to handle multi-agent interactions.
+    This is why we introduce the `NegotiationEnvironment` class, which is responsible for handling the multi-agent interactions.
+3. MARL environments are complex to describe. In different contexts, the same environment may be described differently. Therefore, both the environment and the agent handlers are
+    responsible for describing a particular trajectory. This information is given by the `get_log_info` method.
+4. There might be a lot of overlap between the neural networks used by each agent. For instance, the same model may be used for all agents. This motivates a requirement for a
+    policy identifier for each agent.
+Taking inspiration from the `gymnasium <https://gymnasium.farama.org/>`_ library, we introduce a new standard for Multi-Agent Negotiation Environments.
+Our standard is based on the following features:
+Environments are of the form:
+.. code-block:: python
+    class MarlEnvironment():
+        def __init__(self):
+            """Initialize the environment."""
+            pass
+        def reset(self):
+            """Reset the environment to an initial state and return the initial observation.
+            Returns:
+                observation (dict): A dictionary where keys are agent identifiers and values are observations.
+            """
+            # (...)
+            return observation
+        def step(self, actions):
+            """Take a step in the environment using the provided actions.
+            Args:
+                actions (dict): A dictionary where keys are agent identifiers and values are actions.
+            Returns:
+                observations (dict): A dictionary where keys are agent identifiers and values are observations.
+                reward (dict): A dictionary where keys are agent identifiers and values are rewards.
+                done (bool): Whether the episode has ended.
+                info (dict): Additional information about the environment.
+            """
+            # (...)
+            return observations, done, info
+        def get_log_info(self):
+            """Get additional information about the environment. This information is used to log the game.
+            Returns:
+                log_info (dict): Information about the environment required to log the game.
+            """
+            # (...)
+            return log_info
+        def render(self):
+            """Render the current state of the environment."""
+            pass
+        def close(self):
+            """Perform any necessary cleanup."""
+            pass
+    class AgentState():
+        def __init__(self):
+            """Initialize the agent state."""
+            pass
+        def step(self, observation_from_env, policy_output=None):
+            """Update the agent state based on the observation and action.
+            The action is the output of the LLM.
+            Args:
+                observation_from_env (dict): The observation of the environment.
+                policy_output : The output of the policy.
+            Returns:
+                policy_id (str): The policy identifier.
+                policy_input (dict): The input to the policy.
+                action : The official action to be sent to the environment.
+                done (bool): Whether the LLM action is ready to be sent to the environment.
+                info (dict): Additional information about the agent.
+            """
+            # (...)
+            return policy_id, policy_input, action, done, info
+        def get_log_info(self):
+            """Get information about the agent required to log a trajectory.
+            Returns:
+                log_info (dict): Information about the agent required to log a trajectory.
+            """
+            # (...)
+            return log_info
+        def render(self):
+            """Render the current state of the agent."""
+            pass
+        def close(self):
+            """Perform any necessary cleanup."""
+            pass
+Implicitly, the keys of the `observations` in the `step` method of the `MarlEnvironment` interface represent the set of agents from which an action is expected at the current step. The next step should only expect actions from the agents in the `observations` dictionary.
+As you can see, both classes have a `get_log_info` method. This method is used to log the game. It returns a dictionary with keys being the agent identifiers and values being the information to log. The reason we need this is because the environment and the agent handler may need to log different information. It makes it easier to log from the perspective of each agent. The core environment class should not need to know about the details of the agent handler.
+Running Environments in Parallel
+--------------------------------
+This standard allows the use of the `run_batched_matches` function (TODO: link) to run environments in an efficient way. The core idea is to batch the policy calls for all agents in the environment.
+.. note::
+   The ``run_batched_matches`` function allows you to run multiple negotiation games, or "matches," in parallel.
+   After each environment is initialized, the function continuously loops over all active matches and checks which agents
+   are still pending actions. Each agent's logic can require multiple calls to the policy (e.g., an LLM) before an action
+   becomes "ready" to be sent to the environment. (For instance, an agent might need multiple policy calls before having a string which can be parsed into a valid action.) While an agent is waiting for a policy output, these calls for all agents across all matches are grouped together by unique policy identifier and processed in batch for efficiency. This is the core functionality of the ``run_batched_matches`` function.
+   Only once all actions from the required agents at a given step for an environment are ready does the function make a single ``env.step(...)`` call; this ensures
+   every match moves forward in lockstep for all its active agents. As soon as an environment signals it is done, the function
+   retrieves logged information from both the environment and the agent states before removing this match from the active set.
+   If there are more matches waiting to be processed, they are then started one by one to maintain the specified degree of parallelism.
+   This batching approach provides an efficient mechanism to handle multi-agent or multi-policy environments, ensuring minimal
+   overhead and a clear, unified flow for stepping through matches.
+Here is a diagram that shows how the `run_batched_matches` function works at a high level:
+.. image:: media/runbatch.png
+   :alt: High-level diagram of how run_batched_matches batches policy calls and steps environments
+   :width: 1000px

src_code_for_reproducibility/docs/source/src.environments.dond.dond_game.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.environments.dond.dond\_game module
+=======================================
+.. automodule:: src.environments.dond.dond_game
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.environments.ipd.ipd_statistics_funcs.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.environments.ipd.ipd\_statistics\_funcs module
+==================================================
+.. automodule:: src.environments.ipd.ipd_statistics_funcs
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.experiments.generate_and_train.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.experiments.generate\_and\_train module
+===========================================
+.. automodule:: src.experiments.generate_and_train
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.generation.run_games.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.generation.run\_games module
+================================
+.. automodule:: src.generation.run_games
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.models.local_llm.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.models.local\_llm module
+============================
+.. automodule:: src.models.local_llm
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.models.new_local_llm.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.models.new\_local\_llm module
+=================================
+.. automodule:: src.models.new_local_llm
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.models.rst ADDED Viewed

	@@ -0,0 +1,20 @@

+src.models package
+==================
+.. automodule:: src.models
+   :members:
+   :undoc-members:
+   :show-inheritance:
+Submodules
+----------
+.. toctree::
+   :maxdepth: 4
+   src.models.dummy_local_llm
+   src.models.local_llm
+   src.models.new_local_llm
+   src.models.server_llm
+   src.models.updatable_worker
+   src.models.vllm_worker_wrap

src_code_for_reproducibility/docs/source/src.training.reinforce_training.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.training.reinforce\_training module
+=======================================
+.. automodule:: src.training.reinforce_training
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.training.rl_convs_processing.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.training.rl\_convs\_processing module
+=========================================
+.. automodule:: src.training.rl_convs_processing
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.training.train_main.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.training.train\_main module
+===============================
+.. automodule:: src.training.train_main
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.utils.export_ppo_training_set.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.utils.export\_ppo\_training\_set module
+===========================================
+.. automodule:: src.utils.export_ppo_training_set
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.utils.extra_stats.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.utils.extra\_stats module
+=============================
+.. automodule:: src.utils.extra_stats
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.utils.log_gpu_usage.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.utils.log\_gpu\_usage module
+================================
+.. automodule:: src.utils.log_gpu_usage
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/docs/source/src.utils.parallel_shuffle.rst ADDED Viewed

	@@ -0,0 +1,7 @@

+src.utils.parallel\_shuffle module
+==================================
+.. automodule:: src.utils.parallel_shuffle
+   :members:
+   :undoc-members:
+   :show-inheritance:

src_code_for_reproducibility/markov_games/__pycache__/__init__.cpython-311.pyc ADDED Viewed

Binary file (173 Bytes). View file

src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-312.pyc ADDED Viewed

Binary file (6.17 kB). View file

src_code_for_reproducibility/markov_games/__pycache__/mg_utils.cpython-312.pyc ADDED Viewed

Binary file (3.98 kB). View file

src_code_for_reproducibility/markov_games/__pycache__/rollout_tree.cpython-312.pyc ADDED Viewed

Binary file (3.67 kB). View file

src_code_for_reproducibility/markov_games/diplomacy/diplomacy_agent.py ADDED Viewed

	@@ -0,0 +1,259 @@

+from typing import Dict, List, Tuple, Optional, Any
+import copy
+class DiplomacyAgent:
+    """Agent handler for Diplomacy game that follows the MARL standard.
+    This class is responsible for parsing LLM output into valid Diplomacy orders,
+    managing the agent state, and providing information for logging.
+    """
+    def __init__(self, policy_id: str, power_name: str, random_valid_move=False):
+        """Initialize the agent handler for a power in the Diplomacy game.
+        Args:
+            power_name: The name of the power this agent controls (e.g., 'FRANCE', 'ENGLAND')
+            policy_id: The identifier for the policy this agent uses
+            random_valid_move: If True, will select random valid moves instead of using LLM (default: False)
+        """
+        self.policy_id = policy_id
+        self.power_name = power_name
+        self.orders = []
+        self.wait = True
+        self.processing_state = "WAITING_FOR_ORDERS"
+        self.parsed_orders = []
+        self.order_status = {}
+        self.message_history = []
+        self.random_valid_move = random_valid_move
+    def step(self, observation_from_env, policy_output=None):
+        """Update the agent state based on the observation and LLM output.
+        Args:
+            observation_from_env: The observation from the environment
+            policy_output: The output from the LLM
+        Returns:
+            policy_id: The policy identifier
+            policy_input: The input to the policy
+            action: The official action to be sent to the environment
+            done: Whether the LLM action is ready to be sent to the environment
+            info: Additional information about the agent
+        """
+        info = {}
+        # If random_valid_move is enabled, select random valid moves
+        if self.random_valid_move:
+            valid_orders = self._select_random_valid_moves(observation_from_env)
+            self.orders = valid_orders
+            self.wait = False
+            action = {
+                "orders": valid_orders,
+                "wait": False
+            }
+            return self.policy_id, {}, action, True, info
+        # If no policy output, this is the initial step - prepare prompt
+        if policy_output is None:
+            # Create initial prompt for the LLM
+            phase = observation_from_env.get('phase', '')
+            units = observation_from_env.get('units', {}).get(self.power_name, [])
+            centers = observation_from_env.get('centers', {}).get(self.power_name, [])
+            orderable_locations = observation_from_env.get('orderable_locations', {})
+            prompt = self._create_prompt(phase, units, centers, orderable_locations)
+            return self.policy_id, {"prompt": prompt}, None, False, info
+        # Process the LLM output to extract orders
+        success, parsed_orders = self._parse_llm_output(policy_output)
+        self.parsed_orders = parsed_orders
+        if not success:
+            # Need more information from LLM
+            clarification_prompt = self._create_clarification_prompt(policy_output, parsed_orders)
+            return self.policy_id, {"prompt": clarification_prompt}, None, False, info
+        # Validate if the orders are valid for the current phase
+        valid_orders = self._validate_orders(parsed_orders, observation_from_env)
+        if valid_orders:
+            # Orders are valid, prepare action for environment
+            self.orders = valid_orders
+            self.wait = False
+            action = {
+                "orders": valid_orders,
+                "wait": False
+            }
+            return self.policy_id, {}, action, True, info
+        else:
+            # Orders are invalid, ask for new ones
+            error_prompt = self._create_error_prompt(parsed_orders, observation_from_env)
+            return self.policy_id, {"prompt": error_prompt}, None, False, info
+    def _create_prompt(self, phase, units, centers, orderable_locations):
+        """Create the initial prompt for the LLM.
+        Args:
+            phase: The current game phase
+            units: List of units controlled by this power
+            centers: List of supply centers controlled by this power
+            orderable_locations: List of locations where orders can be issued
+        Returns:
+            A prompt string for the LLM
+        """
+        prompt = f"You are playing as {self.power_name} in Diplomacy. The current phase is {phase}.\n\n"
+        prompt += f"Your units: {', '.join(units)}\n"
+        prompt += f"Your supply centers: {', '.join(centers)}\n"
+        prompt += f"Locations you can order: {', '.join(orderable_locations)}\n\n"
+        if phase.endswith('M'):  # Movement phase
+            prompt += "Please provide orders for your units in the form:\n"
+            prompt += "- A LON H (hold)\n"
+            prompt += "- F NTH - NWY (move)\n"
+            prompt += "- A WAL S F LON (support)\n"
+            prompt += "- F NWG C A NWY - EDI (convoy)\n"
+        elif phase.endswith('R'):  # Retreat phase
+            prompt += "Please provide retreat orders for your dislodged units:\n"
+            prompt += "- A PAR R MAR (retreat to MAR)\n"
+            prompt += "- A PAR D (disband)\n"
+        elif phase.endswith('A'):  # Adjustment phase
+            if len(units) < len(centers):
+                prompt += "You can build units. Please provide build orders:\n"
+                prompt += "- A PAR B (build army in PAR)\n"
+                prompt += "- F BRE B (build fleet in BRE)\n"
+                prompt += "- WAIVE (waive a build)\n"
+            elif len(units) > len(centers):
+                prompt += "You must remove units. Please provide disbandment orders:\n"
+                prompt += "- A PAR D (disband army in PAR)\n"
+                prompt += "- F BRE D (disband fleet in BRE)\n"
+        prompt += "\nProvide your orders as a list, one per line."
+        return prompt
+    def _parse_llm_output(self, llm_output):
+        """Parse the LLM output to extract orders.
+        Args:
+            llm_output: The raw output from the LLM
+        Returns:
+            success: Whether parsing was successful
+            parsed_orders: List of parsed orders
+        """
+        # Simple parsing for now - extract lines that look like orders
+        lines = llm_output.strip().split('\n')
+        orders = []
+        for line in lines:
+            # Remove list markers, hyphens, etc.
+            line = line.strip('- *•').strip()
+            # Skip empty lines and lines that don't look like orders
+            if not line or line.startswith('I ') or line.startswith('Let\'s'):
+                continue
+            # Check if it looks like a Diplomacy order
+            if (' H' in line or ' -' in line or ' S ' in line or ' C ' in line or
+                ' R ' in line or ' D' in line or ' B' in line or line == 'WAIVE'):
+                orders.append(line)
+        return len(orders) > 0, orders
+    def _validate_orders(self, orders, observation):
+        """Validate if the orders are valid for the current phase.
+        Args:
+            orders: List of orders to validate
+            observation: Current observation from the environment
+        Returns:
+            List of valid orders or None if invalid
+        """
+        # For simplicity, we'll assume all parsed orders are valid
+        # In a real implementation, we would use the game's validation logic
+        return orders
+    def _create_clarification_prompt(self, previous_output, parsed_orders):
+        """Create a prompt asking for clarification when orders couldn't be parsed.
+        Args:
+            previous_output: The previous LLM output
+            parsed_orders: Any orders that were successfully parsed
+        Returns:
+            A prompt string for the LLM
+        """
+        prompt = f"I couldn't fully understand your orders for {self.power_name}. "
+        if parsed_orders:
+            prompt += f"I understood these orders:\n"
+            for order in parsed_orders:
+                prompt += f"- {order}\n"
+        prompt += "\nPlease provide clear, valid Diplomacy orders in the format:\n"
+        prompt += "- A LON H\n- F NTH - NWY\n- etc.\n"
+        return prompt
+    def _create_error_prompt(self, invalid_orders, observation):
+        """Create a prompt when orders are invalid.
+        Args:
+            invalid_orders: The invalid orders
+            observation: Current observation from the environment
+        Returns:
+            A prompt string for the LLM
+        """
+        prompt = f"The following orders for {self.power_name} are invalid:\n"
+        for order in invalid_orders:
+            prompt += f"- {order}\n"
+        prompt += "\nPlease provide valid orders for your units."
+        return prompt
+    def get_log_info(self):
+        """Get information about the agent required to log a trajectory.
+        Returns:
+            log_info: Information about the agent required to log a trajectory.
+        """
+        return {
+            "power_name": self.power_name,
+            "orders": self.orders,
+            "wait": self.wait,
+            "parsing_state": self.processing_state,
+            "message_history": self.message_history
+        }
+    def render(self):
+        """Render the current state of the agent."""
+        print(f"Power: {self.power_name}")
+        print(f"Orders: {self.orders}")
+        print(f"Wait: {self.wait}")
+    def close(self):
+        """Perform any necessary cleanup."""
+        pass
+    def _select_random_valid_moves(self, observation):
+        """Select random valid moves for all units.
+        Args:
+            observation: Current observation from the environment
+        Returns:
+            List of valid orders
+        """
+        import random
+        possible_orders = observation.get('possible_orders', {})
+        valid_orders = []
+        # For each location with possible orders, select one randomly
+        for location, orders in possible_orders.items():
+            if orders:  # If there are any possible orders for this location
+                valid_orders.append(random.choice(orders))
+        return valid_orders

src_code_for_reproducibility/markov_games/diplomacy/diplomacy_logging_for_training.py ADDED Viewed

File without changes

src_code_for_reproducibility/markov_games/ipd/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (310 Bytes). View file

src_code_for_reproducibility/markov_games/negotiation/__pycache__/no_press_nego_simulation.cpython-312.pyc ADDED Viewed

Binary file (9.06 kB). View file

src_code_for_reproducibility/markov_games/negotiation/__pycache__/tas_simple_agent.cpython-312.pyc ADDED Viewed

Binary file (4.89 kB). View file

src_code_for_reproducibility/markov_games/negotiation/__pycache__/tas_simulation.cpython-312.pyc ADDED Viewed

Binary file (9.03 kB). View file

src_code_for_reproducibility/markov_games/negotiation/tas_agent.py ADDED Viewed

	@@ -0,0 +1,108 @@

+from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
+from mllm.markov_games.negotiation.nego_simulation import Split
+from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitObs
+class TrustAndSplitAgent(NegotiationAgent):
+    def __init__(self, num_message_chars, *args, **kwargs):
+        self.num_message_chars = num_message_chars
+        super().__init__(*args, **kwargs)
+        self.intro_prompt = (
+            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
+            "Setup:\n"
+            "1. The game has multiple independent rounds.\n"
+            "2. In each round, there are multiple items to split between the two agents.\n"
+            "3. Both agents are assigned a per-item value between 1 and 20 (inclusive) in each round.\n"
+            "4. You can only observe your own per-item values.\n"
+            "5. Because assignments are random, both agents are equally likely to have same expected per-item value.\n"
+            "\n"
+            "Protocol:\n"
+            "1. At the start of the round, one agent begins the conversation. The starting role alternates each round.\n"
+            "2. Agents exchange a short chat ({quota_messages_per_agent_per_round} messages per round per agent) to negotiate how to split the item.\n"
+            "   - Use this chat to communicate your private per-item value to make informed proposals.\n"
+            "3. After the chat, both agents simultaneously propose the amount of each item they will keep.\n"
+            "4. If the total sum of proposals is less than or equal to the item quantity, both agents receive their proposed amounts.\n"
+            "5. If the total sum of proposals exceeds the item quantity, they are allocated proportionally.\n"
+            "6. Your points for the round = (amount you receive per item) x (your per-item value for that round), added across all items.\n"
+            "7. Points are accumulated across rounds.\n"
+            "Your goal: {goal}\n"
+        )
+        self.new_round_prompt = (
+            "A New Round Begins\n"
+            "The items to split are {quantities}.\n"
+            "Your per-item values are {value}."
+        )
+        self.last_round_prompt = (
+            "Last Round Summary:\n"
+            "   - Items to split: {last_quantities}\n"
+            "   - Your per-item values: {last_value_agent}\n"
+            "   - {other_agent}'s per-item values: {last_value_coagent}\n"
+            "   - You proposed: {last_split_agent}\n"
+            "   - You earned: {last_points_agent} points\n"
+            "   - {other_agent} proposed: {last_split_coagent}\n"
+            "   - {other_agent} earned: {last_points_coagent} points\n"
+            "   - Round Complete.\n"
+        )
+        self.send_split_prompt = (
+            "Message quota is finished for this round.\n"
+            "{other_agent} has finalized their proposal.\n"
+            "Submit your finalization now\n"
+            "Respond with {proposal_style2}"
+        )
+        # self.wait_for_message_prompt = "Wait for {other_agent} to send a message..."
+        self.wait_for_message_prompt = ""
+        self.last_message_prompt = "{other_agent} said: {last_message}"
+        # self.send_message_prompt = (
+        #     f"Send your message now (max {self.num_message_chars} chars)."
+        # )
+        self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
+    def get_message_regex(self, observation: TrustAndSplitObs) -> str:
+        return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
+    # def get_message_regex(self, observation: TrustAndSplitObs) -> str:
+    #     return rf"(?s).{{0,{self.num_message_chars}}}"
+    def get_split_regex(self, observation: TrustAndSplitObs) -> str:
+        items = list(observation.quantities.keys())
+        # Accept both singular and plural forms
+        item_pattern = "|".join(
+            [f"{item[:-1]}s?" if item.endswith("s") else f"{item}s?" for item in items]
+        )
+        regex = rf"(?i)<items_to_self> ?((?:\s*(?P<num>(10|[0-9]))\s*(?P<item>{item_pattern})\s*,?)+) ?</items_to_self>"
+        return regex
+    def get_split_action(
+        self, policy_output: str, observation: TrustAndSplitObs
+    ) -> Split:
+        items = list(observation.quantities.keys())
+        import re as _re
+        split_regex = self.get_split_regex(observation)
+        items_given_to_self = {item: 0 for item in items}
+        m = _re.match(split_regex, policy_output.strip())
+        if m:
+            # Find all (number, item) pairs
+            item_pattern = "|".join(
+                [
+                    f"{item[:-1]}s?" if item.endswith("s") else f"{item}s?"
+                    for item in items
+                ]
+            )
+            inner_regex = rf"(?i)(10|[0-9])\s*({item_pattern})"
+            def normalize_item_name(item_str):
+                for orig in items:
+                    if item_str.lower() == orig.lower():
+                        return orig
+                    if orig.endswith("s") and item_str.lower() == orig[:-1].lower():
+                        return orig
+                    if (
+                        not orig.endswith("s")
+                        and item_str.lower() == orig.lower() + "s"
+                    ):
+                        return orig
+            for num, item in _re.findall(inner_regex, m.group(1)):
+                items_given_to_self[normalize_item_name(item)] = int(num)
+        return Split(items_given_to_self=items_given_to_self)

src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-312.pyc ADDED Viewed

Binary file (3.21 kB). View file

src_code_for_reproducibility/training/credit_methods.py ADDED Viewed

	@@ -0,0 +1,295 @@

+import torch
+def whiten_advantages(advantages: torch.Tensor) -> torch.Tensor:
+    """
+    Whitens the advantages.
+    """
+    whitened_advantages = (advantages - torch.mean(advantages)) / (
+        torch.std(advantages) + 1e-9
+    )
+    return whitened_advantages
+def whiten_advantages_time_step_wise(
+    advantages: torch.Tensor,  # (B, T)
+) -> torch.Tensor:
+    """
+    Whitens the advantages.
+    """
+    assert advantages.dim() == 2, "Wrong dimensions."
+    whitened_advantages_time_step_wise = (
+        advantages - advantages.mean(dim=0, keepdim=True)
+    ) / (advantages.std(dim=0, keepdim=True) + 1e-9)
+    return whitened_advantages_time_step_wise
+def get_discounted_state_visitation_credits(
+    credits: torch.Tensor, discount_factor: float  # (B, T)
+) -> torch.Tensor:
+    """
+    Computes discounted state visitation credits for a sequence of credits.
+    """
+    return credits * (
+        discount_factor ** torch.arange(credits.shape[1], device=credits.device)
+    )
+def get_discounted_returns(
+    rewards: torch.Tensor,  # (B, T)
+    discount_factor: float,
+) -> torch.Tensor:
+    """
+    Computes Monte Carlo discounted returns for a sequence of rewards.
+    Args:
+        rewards (torch.Tensor): Array of rewards for each timestep.
+    Returns:
+        torch.Tensor: Array of discounted returns.
+    """
+    assert rewards.dim() == 2, "Wrong dimensions."
+    B, T = rewards.shape
+    discounted_returns = torch.zeros_like(rewards)
+    accumulator = torch.zeros(B, device=rewards.device, dtype=rewards.dtype)
+    for t in reversed(range(T)):
+        accumulator = rewards[:, t] + discount_factor * accumulator
+        discounted_returns[:, t] = accumulator
+    return discounted_returns
+def get_rloo_credits(credits: torch.Tensor):  # (B, S)
+    assert credits.dim() == 2, "Wrong dimensions."
+    rloo_baselines = torch.zeros_like(credits)
+    n = credits.shape[0]
+    if n == 1:
+        return credits, rloo_baselines
+    rloo_baselines = (torch.sum(credits, dim=0, keepdim=True) - credits) / (n - 1)
+    rloo_credits = credits - rloo_baselines
+    return rloo_credits, rloo_baselines
+def get_generalized_advantage_estimates(
+    rewards: torch.Tensor,  # (B, T)
+    value_estimates: torch.Tensor,  # (B, T+1)
+    discount_factor: float,
+    lambda_coef: float,
+) -> torch.Tensor:
+    """
+    Computes Generalized Advantage Estimates (GAE) for a sequence of rewards and value estimates.
+    See https://arxiv.org/pdf/1506.02438 for details.
+    Returns:
+        torch.Tensor: Array of GAE values.
+    """
+    assert rewards.dim() == value_estimates.dim() == 2, "Wrong dimensions."
+    assert (
+        rewards.shape[0] == value_estimates.shape[0]
+    ), f"Got shapes {rewards.shape} and {value_estimates.shape} of rewards and value estimates."
+    assert (
+        rewards.shape[1] == value_estimates.shape[1] - 1
+    ), f"Got shapes {rewards.shape} and {value_estimates.shape} of rewards and value estimates."
+    T = rewards.shape[1]
+    tds = rewards + discount_factor * value_estimates[:, 1:] - value_estimates[:, :-1]
+    gaes = torch.zeros_like(tds)
+    acc = 0.0
+    for t in reversed(range(T)):
+        acc = tds[:, t] + lambda_coef * discount_factor * acc
+        gaes[:, t] = acc
+    return gaes
+def get_advantage_alignment_weights(
+    advantages: torch.Tensor,  # (B, T)
+    exclude_k_equals_t: bool,
+    gamma: float,
+) -> torch.Tensor:
+    """
+    The advantage alignment credit is calculated as
+    \[
+        A^*(s_t, a_t, b_t) = A^1(s_t, a_t, b_t) + \beta \cdot
+        \left( \sum_{k < t} \gamma^{t-k} A^1(s_k, a_k, b_k) \right)
+        A^2(s_t, a_t, b_t)
+    \]
+    Here, the weights are defined as \( \beta \cdot
+        \left( \sum_{k < t} \gamma^{t-k} A^1(s_k, a_k, b_k) \)
+    """
+    T = advantages.shape[1]
+    discounted_advantages = advantages * (
+        gamma * torch.ones((1, T), device=advantages.device)
+    ) ** (-torch.arange(0, T, 1, device=advantages.device))
+    if exclude_k_equals_t:
+        sub = torch.eye(T, device=advantages.device)
+    else:
+        sub = torch.zeros((T, T), device=advantages.device)
+    # Identity is for \( k < t \), remove for \( k \leq t \)
+    ad_align_weights = discounted_advantages @ (
+        torch.triu(torch.ones((T, T), device=advantages.device)) - sub
+    )
+    t_discounts = (gamma * torch.ones((1, T), device=advantages.device)) ** (
+        torch.arange(0, T, 1, device=advantages.device)
+    )
+    ad_align_weights = t_discounts * ad_align_weights
+    return ad_align_weights
+def get_advantage_alignment_credits(
+    a1: torch.Tensor,  # (B, S)
+    a1_alternative: torch.Tensor,  # (B, S, A)
+    a2: torch.Tensor,  # (B, S)
+    exclude_k_equals_t: bool,
+    beta: float,
+    gamma: float = 1.0,
+    use_old_ad_align: bool = False,
+    use_sign: bool = False,
+    clipping: float | None = None,
+    use_time_regularization: bool = False,
+    force_coop_first_step: bool = False,
+    use_variance_regularization: bool = False,
+    rloo_branch: bool = False,
+    reuse_baseline: bool = False,
+    mean_normalize_ad_align: bool = False,
+    whiten_adalign_advantages: bool = False,
+    whiten_adalign_advantages_time_step_wise: bool = False,
+) -> torch.Tensor:
+    """
+    Calculate the advantage alignment credits with vectorization, as described in https://arxiv.org/abs/2406.14662.
+    Recall that the advantage opponent shaping term of the AdAlign policy gradient is:
+    \[
+        \beta \mathbb{E}_{\substack{
+        \tau \sim \text{Pr}_{\mu}^{\pi^1, \pi^2} \\
+        a_t' \sim \pi^1(\cdot \mid s_t)
+        }}
+        \left[\sum_{t=0}^\infty  \gamma^{t}\left( \sum_{k\leq t} A^1(s_k,a^{\prime}_k,b_k) \right) A^{2}(s_t,a_t, b_t)\nabla_{\theta^1}\text{log } \pi^1(a_t|s_t) \right]
+    \]
+    This method computes the following:
+    \[
+        Credit(s_t, a_t, b_t) = \gamma^t \left[ A^1(s_t, a_t, b_t) + \beta \left( \sum_{k\leq t} A^1(s_k,a^{\prime}_k,b_k) \right) A^{2}(s_t,a_t, b_t) \right]
+    \]
+    Args:
+        a1: Advantages of the main trajectories for the current agent.
+        a1_alternative: Advantages of the alternative trajectories for the current agent.
+        a2: Advantages of the main trajectories for the other agent.
+        discount_factor: Discount factor for the advantage alignment.
+        beta: Beta parameter for the advantage alignment.
+        gamma: Gamma parameter for the advantage alignment.
+        use_sign_in_ad_align: Whether to use sign in the advantage alignment.
+    Returns:
+        torch.Tensor: The advantage alignment credits.
+    """
+    assert a1.dim() == a2.dim() == 2, "Advantages must be of shape (B, S)"
+    if a1_alternative is not None:
+        assert (
+            a1_alternative.dim() == 3
+        ), "Alternative advantages must be of shape (B, S, A)"
+        B, T, A = a1_alternative.shape
+    else:
+        B, T = a1.shape
+    assert a1.shape == a2.shape, "Not the same shape"
+    sub_tensors = {}
+    if use_old_ad_align:
+        ad_align_weights = get_advantage_alignment_weights(
+            advantages=a1, exclude_k_equals_t=exclude_k_equals_t, gamma=gamma
+        )
+        sub_tensors["ad_align_weights_prev"] = ad_align_weights
+        if exclude_k_equals_t:
+            ad_align_weights = gamma * ad_align_weights
+    else:
+        assert a1_alternative is not None, "Alternative advantages must be provided"
+        if rloo_branch:
+            a1_alternative = torch.cat([a1.unsqueeze(2), a1_alternative], dim=2)
+            a1_alternative = a1_alternative.mean(dim=2)
+            # print(f"a1_alternative: {a1_alternative}, a1: {a1}\n")
+            a1, baseline = get_rloo_credits(a1)
+            if reuse_baseline:
+                a1_alternative = a1_alternative - baseline
+            else:
+                a1_alternative, _ = get_rloo_credits(a1_alternative)
+        assert a1.shape == a1_alternative.shape, "Not the same shape"
+        ad_align_weights = get_advantage_alignment_weights(
+            advantages=a1_alternative,
+            exclude_k_equals_t=exclude_k_equals_t,
+            gamma=gamma,
+        )
+        sub_tensors["ad_align_weights"] = ad_align_weights
+    # Use sign
+    if use_sign:
+        assert beta == 1.0, "beta should be 1.0 when using sign"
+        positive_signs = ad_align_weights > 0
+        negative_signs = ad_align_weights < 0
+        ad_align_weights[positive_signs] = 1
+        ad_align_weights[negative_signs] = -1
+        sub_tensors["ad_align_weights_sign"] = ad_align_weights
+        # (rest are 0)
+    ###################
+    # Process weights
+    ###################
+    # Use clipping
+    if clipping not in [0.0, None]:
+        upper_mask = ad_align_weights > 1
+        lower_mask = ad_align_weights < -1
+        ad_align_weights = torch.clip(
+            ad_align_weights,
+            -clipping,
+            clipping,
+        )
+        clipping_ratio = (
+            torch.sum(upper_mask) + torch.sum(lower_mask)
+        ) / upper_mask.size
+        sub_tensors["clipped_ad_align_weights"] = ad_align_weights
+    # 1/1+t Regularization
+    if use_time_regularization:
+        t_values = torch.arange(1, T + 1).to(ad_align_weights.device)
+        ad_align_weights = ad_align_weights / t_values
+        sub_tensors["time_regularized_ad_align_weights"] = ad_align_weights
+    # Use coop on t=0
+    if force_coop_first_step:
+        ad_align_weights[:, 0] = 1
+        sub_tensors["coop_first_step_ad_align_weights"] = ad_align_weights
+    # # Normalize alignment terms (across same time step)
+    # if use_variance_regularization_in_ad_align:
+    #     # TODO: verify
+    #     reg_coef = torch.std(a1[:, -1]) / (torch.std(opp_shaping_terms[:, -1]) + 1e-9)
+    #     opp_shaping_terms *= reg_coef
+    ####################################
+    # Compose elements together
+    ####################################
+    opp_shaping_terms = beta * ad_align_weights * a2
+    sub_tensors["ad_align_opp_shaping_terms"] = opp_shaping_terms
+    credits = a1 + opp_shaping_terms
+    if mean_normalize_ad_align:
+        credits = credits - credits.mean(dim=0)
+        sub_tensors["mean_normalized_ad_align_credits"] = credits
+    if whiten_adalign_advantages:
+        credits = (credits - credits.mean()) / (credits.std() + 1e-9)
+        sub_tensors["whitened_ad_align_credits"] = credits
+    if whiten_adalign_advantages_time_step_wise:
+        credits = (credits - credits.mean(dim=0, keepdim=True)) / (
+            credits.std(dim=0, keepdim=True) + 1e-9
+        )
+        sub_tensors["whitened_ad_align_credits_time_step_wise"] = credits
+    sub_tensors["final_ad_align_credits"] = credits
+    return credits, sub_tensors

src_code_for_reproducibility/training/tally_tokenwise.py ADDED Viewed

	@@ -0,0 +1,276 @@

+import json
+import os
+from typing import Any, Dict, List, Tuple, Union
+import numpy as np
+import pandas as pd
+import torch
+from transformers import AutoTokenizer
+class ContextualizedTokenwiseTally:
+    """
+    Collect, store, and save token-level metrics per rollout.
+    - One DataFrame per rollout_id in `paths`
+    - Index = timestep (int)
+    - Columns are added incrementally via `add_contexts()` and `add_data()`
+    - Cells may contain scalars, strings, or lists (dtype=object)
+    """
+    def __init__(
+        self,
+        tokenizer: AutoTokenizer,
+        paths: List[str],
+        max_context_length: int = 30,
+    ):
+        """
+        Args:
+            tokenizer: HuggingFace tokenizer used to convert tids -> tokens
+            paths: rollout identifiers (parallel to batch dimension)
+            max_context_length: truncate context token lists to this length
+        """
+        self.tokenizer = tokenizer
+        self.paths = paths
+        self.max_context_length = max_context_length
+        self.tally: Dict[str, pd.DataFrame] = {path: pd.DataFrame() for path in paths}
+        # set later by setters
+        self.contexts: torch.Tensor | None = None
+        self.action_mask: torch.Tensor | None = None
+        self.range: Tuple[int, int] | None = None
+    # --------- Utilities ---------
+    def tids_to_str(self, tids: List[int]) -> List[str]:
+        """Convert a list of token IDs to a list of token strings."""
+        return self.tokenizer.convert_ids_to_tokens(tids)
+    def _ensure_ready(self):
+        assert self.action_mask is not None, "call set_action_mask(mask) first"
+        assert self.range is not None, "call set_range((start, end)) first"
+    @staticmethod
+    def _sanitize_filename(name: Any) -> str:
+        """Make a safe filename from any rollout_id."""
+        s = str(name)
+        bad = {os.sep, " ", ":", "|", "<", ">", '"', "'"}
+        if os.altsep is not None:
+            bad.add(os.altsep)
+        for ch in bad:
+            s = s.replace(ch, "_")
+        return s
+    @staticmethod
+    def _pad_left(seq: List[Any], length: int, pad_val: Any = "") -> List[Any]:
+        """Left-pad a sequence to `length` with `pad_val`."""
+        if len(seq) >= length:
+            return seq[-length:]
+        return [pad_val] * (length - len(seq)) + list(seq)
+    # --------- Setters ---------
+    def set_action_mask(self, action_mask: torch.Tensor):
+        """
+        action_mask: (B, S) bool or 0/1 indicating valid steps
+        """
+        self.action_mask = action_mask
+    def set_range(self, range: Tuple[int, int]):
+        """
+        range: slice (start, end) into self.paths for current batch
+        """
+        self.range = range
+    # --------- Column builders ---------
+    def add_contexts(self, contexts: torch.Tensor):
+        """
+        Add a single 'context' column (list[str]) for valid steps.
+        Expects `contexts` with shape (B, S): token id at each timestep.
+        For each valid timestep t, we use the last N tokens up to and including t:
+            window = contexts[i, max(0, t - N + 1) : t + 1]
+        The list is left-padded with "" to always be length N.
+        """
+        self._ensure_ready()
+        current_paths = self.paths[self.range[0] : self.range[1]]
+        B, S = contexts.shape
+        N = self.max_context_length
+        # to CPU ints once
+        contexts_cpu = contexts.detach().to("cpu")
+        for i in range(B):
+            rollout_id = current_paths[i]
+            df = self.tally.get(rollout_id, pd.DataFrame())
+            valid_idx = torch.nonzero(
+                self.action_mask[i].bool(), as_tuple=False
+            ).squeeze(-1)
+            if valid_idx.numel() == 0:
+                self.tally[rollout_id] = df
+                continue
+            idx_list = valid_idx.tolist()
+            # ensure index contains valid steps
+            if df.empty:
+                df = pd.DataFrame(index=idx_list)
+            else:
+                new_index = sorted(set(df.index.tolist()) | set(idx_list))
+                if list(df.index) != new_index:
+                    df = df.reindex(new_index)
+            # build context windows
+            ctx_token_lists = []
+            for t in idx_list:
+                start = max(0, t - N + 1)
+                window_ids = contexts_cpu[i, start : t + 1].tolist()
+                window_toks = self.tids_to_str([int(x) for x in window_ids])
+                if len(window_toks) < N:
+                    window_toks = [""] * (N - len(window_toks)) + window_toks
+                else:
+                    window_toks = window_toks[-N:]
+                ctx_token_lists.append(window_toks)
+            # single 'context' column
+            if "context" not in df.columns:
+                df["context"] = pd.Series(index=df.index, dtype=object)
+            df.loc[idx_list, "context"] = pd.Series(
+                ctx_token_lists, index=idx_list, dtype=object
+            )
+            self.tally[rollout_id] = df
+    def add_data(
+        self,
+        metric_id: str,
+        metrics: torch.Tensor,
+        to_tids: bool = False,
+    ):
+        """
+        Add a metric column for valid steps.
+        Args:
+            metric_id: column name
+            metrics: shape (B, S) for scalars/ids or (B, S, K) for top-k vectors
+            to_tids: if True, treat ints/lists of ints as tids and convert to tokens
+        """
+        self._ensure_ready()
+        current_paths = self.paths[self.range[0] : self.range[1]]
+        if metrics.dim() == 2:
+            B, S = metrics.shape
+        elif metrics.dim() == 3:
+            B, S, _ = metrics.shape
+        else:
+            raise ValueError("metrics must be (B, S) or (B, S, K)")
+        for i in range(B):
+            rollout_id = current_paths[i]
+            df = self.tally.get(rollout_id, pd.DataFrame())
+            valid_idx = torch.nonzero(
+                self.action_mask[i].bool(), as_tuple=False
+            ).squeeze(-1)
+            if valid_idx.numel() == 0:
+                self.tally[rollout_id] = df
+                continue
+            idx_list = valid_idx.detach().cpu().tolist()
+            # Ensure index contains valid steps
+            if df.empty:
+                df = pd.DataFrame(index=idx_list)
+            else:
+                new_index = sorted(set(df.index.tolist()) | set(idx_list))
+                if list(df.index) != new_index:
+                    df = df.reindex(new_index)
+            # Slice metrics at valid steps
+            m_valid = metrics[i][valid_idx]
+            # -> pure python lists (1D list or list-of-lists)
+            values = m_valid.detach().cpu().tolist()
+            # optional tids -> tokens
+            if to_tids:
+                def _to_tokish(x):
+                    if isinstance(x, list):
+                        return self.tids_to_str([int(v) for v in x])
+                    else:
+                        return self.tids_to_str([int(x)])[0]
+                values = [_to_tokish(v) for v in values]
+            # Ensure column exists with object dtype, then assign via aligned Series
+            if metric_id not in df.columns:
+                df[metric_id] = pd.Series(index=df.index, dtype=object)
+            if isinstance(values, np.ndarray):
+                values = values.tolist()
+            if len(values) != len(idx_list):
+                raise ValueError(
+                    f"Length mismatch for '{metric_id}': values={len(values)} vs idx_list={len(idx_list)}"
+                )
+            df.loc[idx_list, metric_id] = pd.Series(
+                values, index=idx_list, dtype=object
+            )
+            self.tally[rollout_id] = df
+    # --------- Saving ---------
+    def save(self, path: str):
+        """
+        Write a manifest JSON and one CSV per rollout.
+        - Manifest includes metadata only (safe to JSON).
+        - Each rollout CSV is written with index label 'timestep'.
+        - Only a single 'context' column (list[str]).
+        """
+        if not self.tally or all(df.empty for df in self.tally.values()):
+            return
+        os.makedirs(path, exist_ok=True)
+        from datetime import datetime
+        now = datetime.now()
+        manifest = {
+            "created_at": f"{now:%Y-%m-%d %H:%M:%S}",
+            "max_context_length": self.max_context_length,
+            "num_rollouts": len(self.tally),
+            "rollouts": [],
+        }
+        for rid, df in self.tally.items():
+            rid_str = str(rid)
+            safe_name = self._sanitize_filename(rid_str)
+            csv_path = os.path.join(path, f"{safe_name}_tokenwise.csv")
+            # Put 'context' first, then the rest
+            cols = ["context"] + [c for c in df.columns if c != "context"]
+            try:
+                df[cols].to_csv(csv_path, index=True, index_label="timestep")
+            except Exception as e:
+                continue
+            manifest["rollouts"].append(
+                {
+                    "rollout_id": rid_str,
+                    "csv": csv_path,
+                    "num_rows": int(df.shape[0]),
+                    "columns": cols,
+                }
+            )
+        manifest_path = os.path.join(
+            path, f"tokenwise_manifest_{now:%Y-%m-%d___%H-%M-%S}.json"
+        )
+        with open(manifest_path, "w") as fp:
+            json.dump(manifest, fp, indent=2)

src_code_for_reproducibility/training/tokenize_chats.py ADDED Viewed

	@@ -0,0 +1,128 @@

+import logging
+import sys
+import regex
+import torch
+from transformers import AutoTokenizer
+from mllm.training.training_data_utils import TrainingChatTurn, TrajectoryBatch
+# Module-level logger; a stdout handler is attached so records are also echoed to stdout.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+# def get_chat_dicts(chat: list[TrainingChatTurn]) -> list[dict]:
+#     chat_dicts = [chat_turn.dict() for chat_turn in chat]
+#     return chat_dicts
+def process_training_chat(
+    tokenizer: AutoTokenizer,
+    chat_history: list[TrainingChatTurn],
+    entropy_mask_regex: str | None = None,
+    exploration_prompts_to_remove: list[str] = [],
+    use_engine_out_token_ids: bool = False,
+) -> tuple[torch.IntTensor, torch.BoolTensor, torch.IntTensor, torch.BoolTensor]:
+    """Tokenize a single training chat and build aligned per-token masks.
+    Given an ordered list of `TrainingChatTurn`, this function tokenizes each
+    turn independently using the tokenizer's chat template, then concatenates
+    all resulting token sequences. It also constructs three parallel 1D masks
+    that align with the concatenated tokens:
+    - input_ids: token ids for the entire chat, turn by turn
+    - action_mask: True for tokens that belong to assistant turns (i.e., model
+      actions), False for tokens from other roles
+    - timesteps: per-token time step copied from the originating turn's
+      `time_step`
+    - state_ends_mask: True for the last token of any turn where
+      `is_state_end` is True, otherwise False
+    Important details:
+    - Each turn is passed as a single-message list to
+      `tokenizer.apply_chat_template` and flattened; the per-turn outputs are
+      then concatenated in the original order.
+    - Turn boundaries are not explicitly encoded beyond what the chat template
+      inserts; masks provide alignment for learning signals and state endings.
+    - No truncation or padding is performed here; downstream code should handle
+      batching/padding as needed.
+    - Note on dtypes: `input_ids` will be a LongTensor (int64). `action_mask`
+      and `state_ends_mask` are BoolTensors. `timesteps` is currently created
+      as a float tensor; adjust the implementation if integer dtype is
+      required downstream.
+    Args:
+        tokenizer: A Hugging Face tokenizer supporting `apply_chat_template`.
+        chat_history: Ordered list of `TrainingChatTurn` forming one dialogue.
+    Returns:
+        A tuple of four 1D tensors, all of equal length N (the total number of
+        tokens across all turns), in the following order:
+        - input_ids (LongTensor)
+        - action_mask (BoolTensor)
+        - timesteps (FloatTensor as implemented; see note above)
+        - state_ends_mask (BoolTensor)
+    """
+    state_ends_mask = []
+    input_ids = []
+    action_mask = []
+    timesteps = []
+    entropy_mask = []
+    engine_log_probs = []
+    for train_chat_turn in chat_history:
+        is_state_end = train_chat_turn.is_state_end
+        time_step = train_chat_turn.time_step
+        is_action = train_chat_turn.role == "assistant"
+        # Remove exploration prompts from training data
+        for exploration_prompt in exploration_prompts_to_remove:
+            if exploration_prompt in train_chat_turn.content:
+                train_chat_turn.content = train_chat_turn.content.replace(
+                    exploration_prompt, ""
+                )
+        chat_turn = {
+            "role": train_chat_turn.role,
+            "content": train_chat_turn.content,
+        }
+        if entropy_mask_regex is not None:
+            is_entropy_mask_true = (
+                regex.search(entropy_mask_regex, train_chat_turn.content) is not None
+            )
+        else:
+            is_entropy_mask_true = True
+        if is_action:
+            chat_turn_ids = train_chat_turn.out_token_ids
+            nb_chat_turns_ids = chat_turn_ids.numel()
+            action_mask.append(torch.ones(nb_chat_turns_ids, dtype=torch.bool))
+            engine_log_probs.append(train_chat_turn.log_probs)
+        else:
+            chat_turn_ids = train_chat_turn.chat_template_token_ids
+            nb_chat_turns_ids = chat_turn_ids.numel()
+            action_mask.append(torch.zeros(nb_chat_turns_ids, dtype=torch.bool))
+            engine_log_probs.append(torch.zeros(nb_chat_turns_ids, dtype=torch.float))
+        nb_chat_turns_ids = chat_turn_ids.numel()
+        state_ends_mask.append(torch.zeros(nb_chat_turns_ids, dtype=torch.bool))
+        if is_state_end:
+            state_ends_mask[-1][-1] = True  # last token is state end
+        input_ids.append(chat_turn_ids)
+        entropy_mask.append(torch.ones(nb_chat_turns_ids, dtype=torch.bool))
+        if not is_entropy_mask_true:
+            entropy_mask[-1] = entropy_mask[-1] * False
+        timesteps.append(torch.ones(nb_chat_turns_ids) * time_step)
+    input_ids = torch.cat(input_ids)
+    action_mask = torch.cat(action_mask)
+    entropy_mask = torch.cat(entropy_mask)
+    timesteps = torch.cat(timesteps)
+    timesteps = timesteps.to(torch.long)
+    state_ends_mask = torch.cat(state_ends_mask)
+    engine_log_probs = torch.cat(engine_log_probs)
+    return (
+        input_ids,
+        action_mask,
+        entropy_mask,
+        timesteps,
+        state_ends_mask,
+        engine_log_probs,
+    )

src_code_for_reproducibility/training/trainer_ad_align.py ADDED Viewed

	@@ -0,0 +1,492 @@

+import copy
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Tuple
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from mllm.markov_games.rollout_tree import (
+    ChatTurn,
+    RolloutTreeBranchNode,
+    RolloutTreeRootNode,
+)
+from mllm.training.credit_methods import (
+    get_advantage_alignment_credits,
+    get_discounted_state_visitation_credits,
+)
+from mllm.training.tally_metrics import Tally
+from mllm.training.tally_rollout import RolloutTally, RolloutTallyItem
+from mllm.training.tally_tokenwise import ContextualizedTokenwiseTally
+from mllm.training.tokenize_chats import process_training_chat
+from mllm.training.trainer_common import BaseTrainer
+from mllm.training.training_data_utils import (
+    AdvantagePacket,
+    TrainingBatch,
+    TrainingChatTurn,
+    TrajectoryBatch,
+    get_main_chat_list_and_rewards,
+    get_tokenwise_credits,
+)
+from mllm.utils.resource_context import resource_logger_context
+# Module-level logger; a stdout handler is attached so records are also echoed to stdout.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+# Readability aliases for annotations (e.g. the key type of `training_data`).
+RolloutId = int
+AgentId = str
+@dataclass
+class AdAlignTrainingData:
+    """Per-agent data held between advantage estimation and credit computation."""
+    # Agent these trajectories and advantages belong to.
+    agent_id: str
+    # Batch built from the main (non-branched) trajectories.
+    main_data: TrajectoryBatch
+    # list-of-tensors: per rollout advantages with length jT
+    main_advantages: list[torch.FloatTensor] | None = None
+    # list-of-tensors: per rollout matrix (jT, A)
+    alternative_advantages: list[torch.FloatTensor] | None = None
+    # Per-rollout credits; not populated by the code visible in this module
+    # (presumably reserved for the advantage-alignment output; confirm).
+    advantage_alignment_credits: list[torch.FloatTensor] | None = None
+def get_alternative_chat_histories(
+    agent_id: str, root: RolloutTreeRootNode
+) -> list[list[TrainingChatTurn], list[torch.FloatTensor]]:
+    """
+    args:
+        agent_id: The agent we want to get the chat history for.
+        root: The root of the rollout tree.
+    returns:
+        alternative_chats: list[list[TrainingChatTurn]] (jT*A, jS')
+        alternative_rewards: list[torch.FloatTensor] (jT*A, jT')
+    """
+    current_node = root.child
+    branches = current_node.branches
+    pre_branch_chat = []
+    pre_branch_rewards = []
+    alternative_rewards = []
+    alternative_chats = []
+    while current_node is not None:
+        assert isinstance(
+            current_node, RolloutTreeBranchNode
+        ), "Current node should be a branch node."
+        main_node = current_node.main_child
+        branches = current_node.branches
+        current_node = main_node.child
+        # Get the `A` alternative trajectories
+        alternative_nodes = branches[agent_id]
+        for alt_node in alternative_nodes:
+            post_branch_chat, post_branch_rewards = get_main_chat_list_and_rewards(
+                agent_id=agent_id, root=alt_node
+            )
+            branch_chat = pre_branch_chat + post_branch_chat
+            alternative_chats.append(branch_chat)
+            alternative_rewards.append(
+                torch.cat([torch.tensor(pre_branch_rewards), post_branch_rewards])
+            )
+        chat_turns: list[ChatTurn] = main_node.step_log.action_logs[agent_id].chat_turns
+        chat_turns: list[TrainingChatTurn] = [
+            TrainingChatTurn(time_step=main_node.time_step, **turn.model_dump())
+            for turn in chat_turns
+        ]
+        pre_branch_chat.extend(chat_turns)
+        pre_branch_rewards.append(
+            main_node.step_log.simulation_step_log.rewards[agent_id]
+        )
+    return alternative_chats, alternative_rewards
+class TrainerAdAlign(BaseTrainer):
+    """
+    Extends the reinforce trainer to support Advantage Alignment.
+    """
+    def __init__(
+        self,
+        ad_align_beta: float,
+        ad_align_gamma: float,
+        ad_align_exclude_k_equals_t: bool,
+        ad_align_use_sign: bool,
+        ad_align_clipping: float,
+        ad_align_force_coop_first_step: bool,
+        use_old_ad_align: bool,
+        use_time_regularization: bool,
+        rloo_branch: bool,
+        reuse_baseline: bool,
+        ad_align_beta_anneal_step: int = -1,
+        ad_align_beta_anneal_rate: float = 0.5,
+        min_ad_align_beta: float = 0.1,
+        mean_normalize_ad_align: bool = False,
+        whiten_adalign_advantages: bool = False,
+        whiten_adalign_advantages_time_step_wise: bool = False,
+        *args,
+        **kwargs,
+    ):
+        """
+        Initialize the advantage alignment trainer.
+        Args:
+            ad_align_beta: Beta parameter for the advantage alignment.
+            ad_align_gamma: Gamma parameter for the advantage alignment.
+            ad_align_exclude_k_equals_t: Whether to include k = t in the advantage alignment.
+            ad_align_use_sign: Whether to use sign in the advantage alignment.
+            ad_align_clipping: Clipping value for the advantage alignment.
+            ad_align_force_coop_first_step: Whether to force coop on the first step of the advantage alignment.
+        """
+        super().__init__(*args, **kwargs)
+        self.ad_align_beta = ad_align_beta
+        self.ad_align_gamma = ad_align_gamma
+        self.ad_align_exclude_k_equals_t = ad_align_exclude_k_equals_t
+        self.ad_align_use_sign = ad_align_use_sign
+        self.ad_align_clipping = ad_align_clipping
+        self.ad_align_force_coop_first_step = ad_align_force_coop_first_step
+        self.use_old_ad_align = use_old_ad_align
+        self.use_time_regularization = use_time_regularization
+        self.rloo_branch = rloo_branch
+        self.reuse_baseline = reuse_baseline
+        self.ad_align_beta_anneal_step = ad_align_beta_anneal_step
+        self.ad_align_beta_anneal_rate = ad_align_beta_anneal_rate
+        self.min_ad_align_beta = min_ad_align_beta
+        self.past_ad_align_step = -1
+        self.mean_normalize_ad_align = mean_normalize_ad_align
+        self.whiten_adalign_advantages = whiten_adalign_advantages
+        self.whiten_adalign_advantages_time_step_wise = (
+            whiten_adalign_advantages_time_step_wise
+        )
+        self.training_data: dict[AgentId, AdAlignTrainingData] = {}
+        self.debug_path_list: list[str] = []
+    def set_agent_trajectory_data(
+        self, agent_id: str, roots: list[RolloutTreeRootNode]
+    ):
+        """
+        TOWRITE
+        Set the advantage alignment data for the trainer.
+        """
+        B = len(roots)  # Number of rollouts
+        # For main rollouts
+        batch_rollout_ids = []
+        batch_crn_ids = []
+        batch_input_ids = []
+        batch_action_mask = []
+        batch_entropy_mask = []
+        batch_timesteps = []
+        batch_state_ends_mask = []
+        batch_engine_log_probs = []
+        batch_rewards = []
+        # For alternative actions rollouts
+        batch_branching_time_steps = []
+        alternative_batch_input_ids = []
+        alternative_batch_action_mask = []
+        alternative_batch_entropy_mask = []
+        alternative_batch_timesteps = []
+        alternative_batch_state_ends_mask = []
+        alternative_batch_engine_log_probs = []
+        alternative_batch_rewards = []
+        jT_list = []
+        try:
+            A = len(roots[0].child.branches[agent_id])  # Number of alternative actions
+        except:
+            A = 0
+        for root in roots:
+            rollout_id = root.id
+            self.debug_path_list.append(
+                "mgid:" + str(rollout_id) + "_agent_id:" + agent_id
+            )
+            # Get main trajectory
+            batch_rollout_ids.append(rollout_id)
+            batch_crn_ids.append(root.crn_id)
+            main_chat, main_rewards = get_main_chat_list_and_rewards(
+                agent_id=agent_id, root=root
+            )
+            (
+                input_ids,
+                action_mask,
+                entropy_mask,
+                timesteps,
+                state_ends_mask,
+                engine_log_probs,
+            ) = process_training_chat(
+                tokenizer=self.tokenizer,
+                chat_history=main_chat,
+                entropy_mask_regex=self.entropy_mask_regex,
+                exploration_prompts_to_remove=self.exploration_prompts_to_remove,
+            )
+            batch_input_ids.append(input_ids)
+            batch_action_mask.append(action_mask)
+            batch_entropy_mask.append(entropy_mask)
+            batch_timesteps.append(timesteps)
+            batch_state_ends_mask.append(state_ends_mask)
+            batch_engine_log_probs.append(engine_log_probs)
+            batch_rewards.append(main_rewards)
+            jT = main_rewards.numel()  # TODO: better than this
+            jT_list.append(jT)
+            if A > 0:
+                # We get the branching time steps for each of the `jT` time steps in the main trajectory.
+                branching_time_steps = [bt for item in range(jT) for bt in A * [item]]
+                batch_branching_time_steps.extend(branching_time_steps)
+                # Get all of the (jT*A) alternative trajectories in the tree
+                # (jT is the number of time steps in the main trajectory, A is the number of alternative actions)
+                alternative_chats, alternative_rewards = get_alternative_chat_histories(
+                    agent_id=agent_id, root=root
+                )
+                assert (
+                    len(alternative_chats) == A * jT
+                ), "Incorrect number of alternative trajectories."
+                for chat, rewards in zip(alternative_chats, alternative_rewards):
+                    (
+                        input_ids,
+                        action_mask,
+                        entropy_mask,
+                        timesteps,
+                        state_ends_mask,
+                        engine_log_probs,
+                    ) = process_training_chat(
+                        tokenizer=self.tokenizer,
+                        chat_history=chat,
+                        entropy_mask_regex=self.entropy_mask_regex,
+                        exploration_prompts_to_remove=self.exploration_prompts_to_remove,
+                    )
+                    alternative_batch_input_ids.append(input_ids)
+                    alternative_batch_action_mask.append(action_mask)
+                    alternative_batch_entropy_mask.append(entropy_mask)
+                    alternative_batch_timesteps.append(timesteps)
+                    alternative_batch_state_ends_mask.append(state_ends_mask)
+                    alternative_batch_engine_log_probs.append(engine_log_probs)
+                    alternative_batch_rewards.append(rewards)
+        jT_list = torch.Tensor(jT_list)
+        # Assert that number of alternative actions is constant
+        # assert len(set(nb_alternative_actions)) == 1, "Number of alternative actions must be constant"
+        # A = nb_alternative_actions[0]
+        trajectory_batch = TrajectoryBatch(
+            rollout_ids=torch.tensor(batch_rollout_ids, dtype=torch.int32),  # (B,)
+            crn_ids=torch.tensor(batch_crn_ids, dtype=torch.int32),
+            agent_ids=[agent_id] * len(batch_rollout_ids),
+            batch_input_ids=batch_input_ids,
+            batch_action_mask=batch_action_mask,
+            batch_entropy_mask=batch_entropy_mask,
+            batch_timesteps=batch_timesteps,
+            batch_state_ends_mask=batch_state_ends_mask,
+            batch_engine_log_probs=batch_engine_log_probs,
+            batch_rewards=batch_rewards,
+        )
+        # Get Advantages & Train Critic
+        with resource_logger_context(
+            logger, "Get advantages with critic gradient accumulation"
+        ):
+            self.batch_advantages: torch.FloatTensor = (
+                self.get_advantages_with_critic_gradient_accumulation(trajectory_batch)
+            )  # (B, jT)
+        if A > 0:
+            # Here, `A` is the number of alternative actions / trajectories taken at each time step.
+            # For each of the `B` rollout perspectives, at each of its jT (`j` is for jagged, since each main rollout may be of a different length) steps, we take A alternate trajectories (from different actions).
+            # Therefore, we have ∑jT * A trajectories to process. If each of the main trajectories have T steps, we will have `B*T*A` to process.
+            with resource_logger_context(logger, "Create alternative trajectory batch"):
+                sum_jT = int(torch.sum(jT_list).item())
+                jT_list = (
+                    jT_list.int().tolist()
+                )  # (jT,) # (we only want the advantages where we branched out)
+                alternative_trajectory_batch = TrajectoryBatch(
+                    rollout_ids=torch.zeros(A * sum_jT, dtype=torch.int32),
+                    crn_ids=torch.zeros(A * sum_jT, dtype=torch.int32),
+                    agent_ids=[agent_id] * (A * sum_jT),
+                    batch_input_ids=alternative_batch_input_ids,
+                    batch_action_mask=alternative_batch_action_mask,
+                    batch_entropy_mask=alternative_batch_entropy_mask,
+                    batch_timesteps=alternative_batch_timesteps,
+                    batch_state_ends_mask=alternative_batch_state_ends_mask,
+                    batch_engine_log_probs=alternative_batch_engine_log_probs,
+                    batch_rewards=alternative_batch_rewards,
+                )
+            # Get alternative advantages
+            # BAAs stands for batch alternative advantages
+            # (torch nested tensors have very little api support, so we have to do some odd manual work here)
+            with resource_logger_context(
+                logger, "Compute alternative advantage estimates"
+            ):
+                BAAs_list = self.get_advantages_with_critic_gradient_accumulation(
+                    alternative_trajectory_batch
+                )  # list length (∑jT * A), each (jT',)
+                # Pad alternative advantages to (∑jT*A, P)
+                BAAs_padded = pad_sequence(
+                    BAAs_list, batch_first=True, padding_value=0.0
+                )
+                branch_idx = torch.tensor(
+                    batch_branching_time_steps,
+                    device=BAAs_padded.device,
+                    dtype=torch.long,
+                )
+                gathered = BAAs_padded.gather(
+                    dim=1, index=branch_idx.unsqueeze(1)
+                ).squeeze(1)
+                # Reshape and split per rollout, then transpose to (jT_i, A)
+                gathered = gathered.view(A, sum_jT)  # (A, ∑jT)
+                blocks = list(
+                    torch.split(gathered, jT_list, dim=1)
+                )  # len B, shapes (A, jT_i)
+                BAAs = [
+                    blk.transpose(0, 1).contiguous() for blk in blocks
+                ]  # list of (jT_i, A)
+        if self.ad_align_beta_anneal_step > 0:
+            max_rollout_id = torch.max(trajectory_batch.rollout_ids) + 1
+            if (
+                max_rollout_id % self.ad_align_beta_anneal_step == 0
+                and self.past_ad_align_step != max_rollout_id
+            ):
+                self.ad_align_beta = max(
+                    self.ad_align_beta * self.ad_align_beta_anneal_rate,
+                    self.min_ad_align_beta,
+                )
+                logger.info(f"Annealing ad_align_beta to {self.ad_align_beta}")
+                self.past_ad_align_step = max_rollout_id
+        self.training_data[agent_id] = AdAlignTrainingData(
+            agent_id=agent_id,
+            main_data=trajectory_batch,
+            main_advantages=self.batch_advantages,
+            alternative_advantages=BAAs if A > 0 else None,
+        )
+    def share_advantage_data(self) -> list[AdvantagePacket]:
+        """
+        Share the advantage alignment data with other agents.
+        Returns:
+            AdvantagePacket: The advantage packet containing the agent's advantages.
+        """
+        logger.info(f"Sharing advantage alignment data.")
+        advantage_packets = []
+        for _, agent_data in self.training_data.items():
+            advantage_packets.append(
+                AdvantagePacket(
+                    agent_id=agent_data.agent_id,
+                    rollout_ids=agent_data.main_data.rollout_ids,
+                    main_advantages=agent_data.main_advantages,
+                )
+            )
+        return advantage_packets
+    def receive_advantage_data(self, advantage_packets: list[AdvantagePacket]):
+        """
+        Receive advantage packets from other players.
+        These contain the advantages of the other players' rollouts estimated by them.
+        """
+        logger.info(f"Receiving advantage packets.")
+        assert (
+            len(advantage_packets) > 0
+        ), "At least one advantage packet must be provided."
+        for agent_id, agent_data in self.training_data.items():
+            coagent_advantage_packets = [
+                packet for packet in advantage_packets if packet.agent_id != agent_id
+            ]
+            agent_rollout_ids = agent_data.main_data.rollout_ids
+            agent_advantages = agent_data.main_advantages
+            co_agent_advantages = []
+            for rollout_id in agent_rollout_ids:
+                for co_agent_packet in coagent_advantage_packets:
+                    if rollout_id in co_agent_packet.rollout_ids:
+                        index = torch.where(rollout_id == co_agent_packet.rollout_ids)[
+                            0
+                        ].item()
+                        co_agent_advantages.append(
+                            co_agent_packet.main_advantages[index]
+                        )
+                        # assumes that its two player game, with one co-agent
+                        break
+            assert len(co_agent_advantages) == len(agent_advantages)
+            B = len(agent_advantages)
+            assert all(
+                a.shape[0] == b.shape[0]
+                for a, b in zip(co_agent_advantages, agent_advantages)
+            ), "Number of advantages must match for advantage alignment."
+            # Get padded tensors (advantage alignment is invariant to padding)
+            lengths = torch.tensor(
+                [len(t) for t in agent_advantages],
+                device=self.device,
+                dtype=torch.long,
+            )
+            padded_main_advantages = pad_sequence(
+                agent_advantages, batch_first=True, padding_value=0.0
+            )
+            if agent_data.alternative_advantages:
+                padded_alternative_advantages = pad_sequence(
+                    agent_data.alternative_advantages,
+                    batch_first=True,
+                    padding_value=0.0,
+                )  # (B, P, A)
+            else:
+                padded_alternative_advantages = None
+            padded_co_agent_advantages = pad_sequence(
+                co_agent_advantages, batch_first=True, padding_value=0.0
+            )
+            # Create training batch data
+            credits, sub_tensors = get_advantage_alignment_credits(
+                a1=padded_main_advantages,
+                a1_alternative=padded_alternative_advantages,
+                a2=padded_co_agent_advantages,
+                beta=self.ad_align_beta,
+                gamma=self.ad_align_gamma,
+                exclude_k_equals_t=self.ad_align_exclude_k_equals_t,
+                use_sign=self.ad_align_use_sign,
+                clipping=self.ad_align_clipping,
+                force_coop_first_step=self.ad_align_force_coop_first_step,
+                use_old_ad_align=self.use_old_ad_align,
+                use_time_regularization=self.use_time_regularization,
+                rloo_branch=self.rloo_branch,
+                reuse_baseline=self.reuse_baseline,
+                mean_normalize_ad_align=self.mean_normalize_ad_align,
+                whiten_adalign_advantages=self.whiten_adalign_advantages,
+                whiten_adalign_advantages_time_step_wise=self.whiten_adalign_advantages_time_step_wise,
+            )
+            for key, value in sub_tensors.items():
+                self.rollout_tally.add_metric(
+                    path=[key],
+                    rollout_tally_item=RolloutTallyItem(
+                        crn_ids=agent_data.main_data.crn_ids,
+                        rollout_ids=agent_data.main_data.rollout_ids,
+                        agent_ids=agent_data.main_data.agent_ids,
+                        metric_matrix=value,
+                    ),
+                )
+            if not self.skip_discounted_state_visitation:
+                credits = get_discounted_state_visitation_credits(
+                    credits,
+                    self.discount_factor,
+                )
+                self.rollout_tally.add_metric(
+                    path=["discounted_state_visitation_credits"],
+                    rollout_tally_item=RolloutTallyItem(
+                        crn_ids=agent_data.main_data.crn_ids,
+                        rollout_ids=agent_data.main_data.rollout_ids,
+                        agent_ids=agent_data.main_data.agent_ids,
+                        metric_matrix=sub_tensors[
+                            "discounted_state_visitation_credits"
+                        ],
+                    ),
+                )
+            # Slice back to jagged
+            advantage_alignment_credits = [credits[i, : lengths[i]] for i in range(B)]
+            # Replace stored training data for this agent by the concrete trajectory batch
+            # and attach the computed credits for policy gradient.
+            self.training_data[agent_id] = agent_data.main_data
+            self.training_data[agent_id].batch_credits = advantage_alignment_credits

src_code_for_reproducibility/training/trainer_common.py ADDED Viewed

	@@ -0,0 +1,1054 @@

+"""
+TODO: Add coefficients for losses (depend on total number of tokens or batch)
+TODO: adapt reinforce step for torch.compile
+TODO: add lr schedulers support
+"""
+import logging
+import os
+import pickle
+import sys
+from abc import ABC, abstractmethod
+from typing import Callable, Literal, Union
+import numpy as np
+import torch
+import torch.nn.functional as F
+from accelerate import Accelerator
+from pandas._libs.tslibs.offsets import CBMonthBegin
+from peft import LoraConfig
+from torch.nn.utils.rnn import pad_sequence
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from mllm.markov_games.rollout_tree import *
+from mllm.markov_games.rollout_tree import RolloutTreeRootNode
+from mllm.training.annealing_methods import sigmoid_annealing
+from mllm.training.credit_methods import (
+    get_discounted_returns,
+    get_generalized_advantage_estimates,
+    get_rloo_credits,
+    whiten_advantages,
+    whiten_advantages_time_step_wise,
+)
+from mllm.training.tally_metrics import Tally
+from mllm.training.tally_rollout import RolloutTally, RolloutTallyItem
+from mllm.training.tally_tokenwise import ContextualizedTokenwiseTally
+from mllm.training.tokenize_chats import *
+from mllm.training.tokenize_chats import process_training_chat
+from mllm.training.training_data_utils import *
+from mllm.training.training_data_utils import (
+    TrainingBatch,
+    TrajectoryBatch,
+    get_tokenwise_credits,
+)
+from mllm.utils.resource_context import resource_logger_context
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+# Small piece of trainer state that BaseTrainer.__init__ restores with pickle from
+# "trainer_annealing_state.pkl" under the trainer's save_path when that file exists.
+# NOTE(review): `dataclass` is not imported explicitly in this module; it presumably
+# arrives through one of the `from mllm... import *` lines — confirm.
+@dataclass
+class TrainerAnnealingState:
+    """Persistent step counter for the trainer's annealing logic."""
+    # Number of annealing steps taken so far; 0 for a fresh run.
+    # Presumably the `step` fed to the GAE-lambda annealing method — the code
+    # that increments/reads it is not visible in this chunk.
+    annealing_step_counter: int = 0
+class BaseTrainer(ABC):
+    """
+    Trainer
+    """
+    def __init__(
+        self,
+        policy: AutoModelForCausalLM,
+        policy_optimizer: torch.optim.Optimizer,
+        critic: Union[AutoModelForCausalLM, None],
+        critic_optimizer: Union[torch.optim.Optimizer, None],
+        tokenizer: AutoTokenizer,
+        lr_scheduler: torch.optim.lr_scheduler.LRScheduler,
+        critic_lr_scheduler: Union[torch.optim.lr_scheduler.LRScheduler, None],
+        ######################################################################
+        entropy_coeff: float,
+        entropy_topk: int,
+        entropy_mask_regex: Union[str, None],
+        kl_coeff: float,
+        gradient_clipping: Union[float, None],
+        restrict_tokens: Union[list[str], None],
+        mini_batch_size: int,
+        use_gradient_checkpointing: bool,
+        temperature: float,
+        device: str,
+        whiten_advantages: bool,
+        whiten_advantages_time_step_wise: bool,
+        use_gae: bool,
+        use_gae_lambda_annealing: bool,
+        gae_lambda_annealing_limit: float,
+        gae_lambda_annealing_method: Literal["sigmoid_annealing"],
+        gae_lambda_annealing_method_params: dict,
+        pg_loss_normalization: Literal["batch", "nb_tokens"],
+        use_rloo: bool,
+        skip_discounted_state_visitation: bool,
+        discount_factor: float,
+        enable_tokenwise_logging: bool,
+        save_path: str,
+        reward_normalizing_constant: float = 1.0,
+        critic_loss_type: Literal["mse", "huber"] = "huber",
+        exploration_prompts_to_remove: list[str] = [],
+        filter_higher_refprob_tokens_kl: bool = False,
+        truncated_importance_sampling_ratio_cap: float = 0.0,
+        importance_sampling_strategy: Literal[
+            "per_token", "per_sequence"
+        ] = "per_token",
+    ):
+        """
+        Initialize the REINFORCE trainer with reward shaping for multi-agent or single-agent training.
+        Args:
+            model (AutoModelForCausalLM): The main policy model.
+            tokenizer (AutoTokenizer): Tokenizer for the model.
+            optimizer (torch.optim.Optimizer): Optimizer for the policy model.
+            lr_scheduler (torch.optim.lr_scheduler.LRScheduler): Learning rate scheduler for the policy model.
+            critic (AutoModelForCausalLM or None): Critic model for value estimation (optional).
+            critic_optimizer (torch.optim.Optimizer or None): Optimizer for the critic model (optional).
+            critic_lr_scheduler (torch.optim.lr_scheduler.LRScheduler or None): LR scheduler for the critic (optional).
+            config (RtConfig): Configuration object for training.
+        """
+        self.tokenizer = tokenizer
+        # self.tokenizer.padding_side = "left"  # needed for flash attention
+        if self.tokenizer.pad_token_id is None:
+            self.tokenizer.pad_token_id = self.tokenizer.eos_token_id
+        self.lr_scheduler = lr_scheduler
+        self.accelerator = Accelerator()
+        (
+            self.policy,
+            self.policy_optimizer,
+            self.critic,
+            self.critic_optimizer,
+        ) = self.accelerator.prepare(policy, policy_optimizer, critic, critic_optimizer)
+        self.critic_lr_scheduler = critic_lr_scheduler
+        self.tally = Tally()
+        if use_gradient_checkpointing == True:
+            self.policy.gradient_checkpointing_enable(dict(use_reentrant=False))
+            if critic is not None:
+                self.critic.gradient_checkpointing_enable(dict(use_reentrant=False))
+        self.save_path = save_path
+        # Load trainer state if it exists
+        self.trainer_annealing_state_path = os.path.join(
+            self.save_path, "trainer_annealing_state.pkl"
+        )
+        if os.path.exists(self.trainer_annealing_state_path):
+            logger.info(
+                f"Loading trainer state from {self.trainer_annealing_state_path}"
+            )
+            self.trainer_annealing_state = pickle.load(
+                open(self.trainer_annealing_state_path, "rb")
+            )
+        else:
+            self.trainer_annealing_state = TrainerAnnealingState()
+        # Load policy optimizer state if it exists
+        self.policy_optimizer_path = os.path.join(
+            self.save_path, "policy_optimizer_state.pt"
+        )
+        if os.path.exists(self.policy_optimizer_path):
+            logger.info(
+                f"Loading policy optimizer state from {self.policy_optimizer_path}"
+            )
+            self.policy_optimizer.load_state_dict(
+                torch.load(self.policy_optimizer_path)
+            )
+        # Load critic optimizer state if it exists
+        self.critic_optimizer_path = os.path.join(
+            self.save_path, "critic_optimizer_state.pt"
+        )
+        if (
+            os.path.exists(self.critic_optimizer_path)
+            and self.critic_optimizer is not None
+        ):
+            logger.info(
+                f"Loading critic optimizer state from {self.critic_optimizer_path}"
+            )
+            self.critic_optimizer.load_state_dict(
+                torch.load(self.critic_optimizer_path)
+            )
+        self.device = self.accelerator.device
+        self.entropy_coeff = entropy_coeff
+        self.entropy_topk = entropy_topk
+        self.entropy_mask_regex = entropy_mask_regex
+        self.kl_coeff = kl_coeff
+        self.gradient_clipping = gradient_clipping
+        self.restrict_tokens = restrict_tokens
+        self.mini_batch_size = mini_batch_size
+        self.use_gradient_checkpointing = use_gradient_checkpointing
+        self.temperature = temperature
+        self.use_gae = use_gae
+        self.whiten_advantages = whiten_advantages
+        self.whiten_advantages_time_step_wise = whiten_advantages_time_step_wise
+        self.use_rloo = use_rloo
+        self.skip_discounted_state_visitation = skip_discounted_state_visitation
+        self.use_gae_lambda_annealing = use_gae_lambda_annealing
+        self.gae_lambda_annealing_limit = gae_lambda_annealing_limit
+        if use_gae_lambda_annealing:
+            self.gae_lambda_annealing_method: Callable[
+                [int], float
+            ] = lambda step: eval(gae_lambda_annealing_method)(
+                step=step, **gae_lambda_annealing_method_params
+            )
+        self.discount_factor = discount_factor
+        self.enable_tokenwise_logging = enable_tokenwise_logging
+        self.reward_normalizing_constant = reward_normalizing_constant
+        self.pg_loss_normalization = pg_loss_normalization
+        self.critic_loss_type = critic_loss_type
+        self.exploration_prompts_to_remove = exploration_prompts_to_remove
+        # Common containers used by all trainers
+        self.training_data: dict = {}
+        self.debug_path_list: list[str] = []
+        self.policy_gradient_data = None
+        self.tally = Tally()
+        self.rollout_tally = RolloutTally()
+        self.tokenwise_tally: Union[ContextualizedTokenwiseTally, None] = None
+        self.filter_higher_refprob_tokens_kl = filter_higher_refprob_tokens_kl
+        self.truncated_importance_sampling_ratio_cap = (
+            truncated_importance_sampling_ratio_cap
+        )
+        self.importance_sampling_strategy = importance_sampling_strategy
+    def mask_non_restricted_token_logits(self, logits: torch.Tensor) -> torch.Tensor:
+        """
+        Masks logits so that only allowed tokens (as specified in config.restrict_tokens)
+        and the EOS token are active.
+        All other logits are set to -inf, effectively removing them from the softmax.
+        Args:
+            logits (torch.Tensor): The logits tensor of shape (B, S, V).
+        Returns:
+            torch.Tensor: The masked logits tensor.
+        """
+        # TODO: verify. Not sure what we do here is differentiable
+        # also, we recompute for nothing
+        if self.restrict_tokens is not None:
+            allowed_token_ids = []
+            for token in self.restrict_tokens:
+                token_ids = self.tokenizer(token, add_special_tokens=False)["input_ids"]
+                allowed_token_ids.append(token_ids[0])
+            allowed_token_ids.append(
+                self.tokenizer.eos_token_id
+            )  # This token should always be active
+            allowed_token_ids = torch.tensor(allowed_token_ids, device=logits.device)
+            # Mask log_probs and probs to only allowed tokens
+            mask = torch.zeros_like(logits).bool()  # (B, S, V)
+            mask[..., allowed_token_ids] = True
+            logits = torch.where(
+                mask,
+                logits,
+                torch.tensor(-float("inf"), device=logits.device),
+            )
+        return logits
+    # def get_gradient_magnitude(self, loss_term: torch.Tensor) -> float:
+    #     """
+    #     Computes the L2 norm of the gradients of the given loss term with respect to the model parameters.
+    #     Args:
+    #         loss_term (torch.Tensor): The loss tensor to compute gradients for.
+    #     Returns:
+    #         float: The L2 norm of the gradients, or 0.0 if no gradients are present.
+    #     """
+    #     with torch.no_grad():
+    #         grads = torch.autograd.grad(
+    #             loss_term,
+    #             [p for p in self.policy.parameters() if p.requires_grad],
+    #             retain_graph=True,
+    #             allow_unused=True,
+    #         )
+    #         grads = [g for g in grads if g is not None]
+    #         if not grads:
+    #             return torch.tensor(0.0, device=loss_term.device)
+    #         return torch.norm(torch.stack([g.norm(2) for g in grads])).item()
+    def apply_reinforce_step(
+        self,
+        training_batch: TrainingBatch,
+    ) -> None:
+        """
+        Applies a single REINFORCE policy gradient step using the provided batch of rollouts.
+        The batch is processed in mini-batches of `self.mini_batch_size` rollouts; for each
+        mini-batch the loss (policy-gradient term, optional entropy term, optional KL term
+        against the base model) is computed and back-propagated, so gradients accumulate.
+        A single optimizer step (with optional gradient clipping) is taken at the end.
+        Optionally logs per-token metrics to `self.tokenwise_tally`.
+        Args:
+            training_batch (TrainingBatch): Batch of rollouts. It must support `len()`,
+                slicing, and `get_padded_tensors()`; the padded mini-batch exposes
+                `batch_input_ids`, `batch_action_mask`, `batch_entropy_mask`,
+                `batch_credits`, `batch_engine_log_probs` and `batch_timesteps`.
+        Returns:
+            The `running_mean_logs` dict of aggregated metrics, which is also pushed to
+            `self.tally`. NOTE(review): the signature is annotated `-> None` although a
+            dict is returned.
+        Raises:
+            ValueError: If `self.pg_loss_normalization` is neither "nb_tokens" nor "batch".
+        """
+        with resource_logger_context(logger, "Apply reinforce step"):
+            self.policy.train()
+            mb_size = self.mini_batch_size
+            nb_rollouts = len(training_batch)
+            # Initialize running mean logs
+            running_mean_logs = {
+                "rl_objective": 0.0,
+                "policy_gradient_loss": 0.0,
+                "policy_gradient_norm": 0.0,
+                "log_probs": 0.0,
+                "credits": 0.0,
+                "entropy": 0.0,
+                "engine_log_probs_diff_clampfrac": 0.0,
+                "tis_imp_ratio": 0.0,
+                "ref_log_probs_diff_clampfrac": 0.0,
+                "higher_refprob_frac": 0.0,
+                "tis_imp_ratio_clampfrac": 0.0,
+            }
+            # NOTE(review): "entropy" is already initialized above; this re-assignment is redundant.
+            if self.entropy_coeff != 0.0:
+                running_mean_logs["entropy"] = 0.0
+            # The KL entry only exists when the KL term is enabled.
+            if self.kl_coeff != 0.0:
+                running_mean_logs["kl_divergence"] = 0.0
+            # Get total number of tokens generated
+            total_tokens_generated = 0
+            for att_mask in training_batch.batch_action_mask:
+                total_tokens_generated += att_mask.sum()
+            # Obtain loss normalization
+            # "nb_tokens": divide by the total number of action tokens in the whole batch.
+            # "batch": divide by the number of mini-batches (each mini-batch loss is
+            # itself averaged over its own action tokens further below).
+            if self.pg_loss_normalization == "nb_tokens":
+                normalization_factor = total_tokens_generated
+            elif self.pg_loss_normalization == "batch":
+                normalization_factor = np.ceil(nb_rollouts / mb_size).astype(int)
+            else:
+                raise ValueError(
+                    f"Invalid pg_loss_normalization: {self.pg_loss_normalization}"
+                )
+            # Gradient accumulation for each mini-batch
+            for mb in range(0, nb_rollouts, mb_size):
+                logger.info(f"Processing mini-batch {mb} of {nb_rollouts}")
+                loss = 0.0
+                training_mb = training_batch[mb : mb + mb_size]
+                training_mb = training_mb.get_padded_tensors()
+                training_mb.to(self.device)
+                (
+                    tokens_mb,
+                    action_mask_mb,
+                    entropy_mask_mb,
+                    credits_mb,
+                    engine_log_probs_mb,
+                    timesteps_mb,
+                ) = (
+                    training_mb.batch_input_ids,
+                    training_mb.batch_action_mask,
+                    training_mb.batch_entropy_mask,
+                    training_mb.batch_credits,
+                    training_mb.batch_engine_log_probs,
+                    training_mb.batch_timesteps,
+                )
+                # Next token prediction
+                # Inputs are tokens [0, S-1); every per-token quantity is shifted by one
+                # so that position i describes the token predicted from context[:i+1].
+                contexts_mb = tokens_mb[:, :-1]
+                shifted_contexts_mb = tokens_mb[:, 1:]
+                action_mask_mb = action_mask_mb[:, 1:]
+                entropy_mask_mb = entropy_mask_mb[:, 1:]
+                credits_mb = credits_mb[:, 1:]
+                engine_log_probs_mb = engine_log_probs_mb[:, 1:]
+                timesteps_mb = timesteps_mb[:, 1:]
+                if self.enable_tokenwise_logging:
+                    self.tokenwise_tally.set_action_mask(action_mask=action_mask_mb)
+                    self.tokenwise_tally.set_range(range=(mb, mb + mb_size))
+                    self.tokenwise_tally.add_contexts(contexts=contexts_mb)
+                    self.tokenwise_tally.add_data(
+                        metric_id="next_token",
+                        metrics=shifted_contexts_mb,
+                        to_tids=True,
+                    )
+                    self.tokenwise_tally.add_data(
+                        metric_id="entropy_mask",
+                        metrics=entropy_mask_mb,
+                    )
+                if self.enable_tokenwise_logging:
+                    self.tokenwise_tally.add_data(
+                        metric_id="next_token_credit", metrics=credits_mb
+                    )
+                # Forward pass + cast to FP-32 for higher prec.
+                # NOTE(review): no explicit FP-32 cast is performed in this block.
+                # TODO: create attention mask if not relying on default (assume causal llm)
+                logits = self.policy(input_ids=contexts_mb)[0]  # (B, S, V)
+                # Mask non-restricted tokens
+                if self.restrict_tokens is not None:
+                    logits = self.mask_non_restricted_token_logits(logits)
+                # In-place temperature scaling of the policy logits.
+                logits /= self.temperature  # (B, S, V)
+                # Compute new log probabilities
+                log_probs = F.log_softmax(logits, dim=-1)  # (B, S, V)
+                # Get log probabilities of actions taken during rollouts
+                action_log_probs = log_probs.gather(
+                    dim=-1, index=shifted_contexts_mb.unsqueeze(-1)
+                ).squeeze(
+                    -1
+                )  # (B, S)
+                # Denominator used for every running-mean log of this mini-batch.
+                # NOTE(review): this is a tensor whenever the action mask is one, so
+                # the logged values below end up as 0-dim tensors rather than floats.
+                if self.pg_loss_normalization == "batch":
+                    den_running_mean = action_mask_mb.sum() * normalization_factor
+                else:
+                    den_running_mean = normalization_factor
+                running_mean_logs["log_probs"] += (
+                    action_log_probs * action_mask_mb
+                ).sum().item() / den_running_mean
+                running_mean_logs["credits"] += (
+                    credits_mb * action_mask_mb
+                ).sum().item() / den_running_mean
+                if self.enable_tokenwise_logging:
+                    self.tokenwise_tally.add_data(
+                        metric_id="next_token_log_prob",
+                        metrics=action_log_probs,
+                    )
+                    self.tokenwise_tally.add_data(
+                        metric_id="engine_next_token_log_prob",
+                        metrics=engine_log_probs_mb,
+                    )
+                    self.tokenwise_tally.add_data(
+                        metric_id="next_token_prob",
+                        metrics=torch.exp(action_log_probs),
+                    )
+                    top_k_indices = torch.topk(logits, k=5, dim=-1).indices
+                    self.tokenwise_tally.add_data(
+                        metric_id=f"top_{5}_tids",
+                        metrics=top_k_indices,
+                        to_tids=True,
+                    )
+                    self.tokenwise_tally.add_data(
+                        metric_id=f"top_{5}_probs",
+                        metrics=torch.exp(log_probs).gather(
+                            dim=-1, index=top_k_indices
+                        ),
+                    )
+                # REINFORCE term: credit-weighted log-probability of each action token.
+                rewarded_action_log_probs = (
+                    action_mask_mb * credits_mb * action_log_probs
+                )
+                # (B, S)
+                # Non-action positions get the same filler value in both tensors so
+                # that their difference is exactly 0 (i.e. an importance ratio of 1).
+                INVALID_LOGPROB = 1.0
+                # Log-prob differences are clamped to +/- CLAMP_VALUE before exponentiation.
+                CLAMP_VALUE = 40.0
+                masked_action_log_probs = torch.masked_fill(
+                    action_log_probs, ~action_mask_mb, INVALID_LOGPROB
+                )
+                masked_engine_log_probs = torch.masked_fill(
+                    engine_log_probs_mb, ~action_mask_mb, INVALID_LOGPROB
+                )
+                # The importance ratio is treated as a constant weight: no gradient
+                # flows through the trainer-vs-engine log-prob difference.
+                with torch.no_grad():
+                    action_engine_log_probs_diff = (
+                        masked_action_log_probs - masked_engine_log_probs
+                    ).clamp(-CLAMP_VALUE, CLAMP_VALUE)
+                # Count of tokens whose difference hit the clamp (normalized like the other logs).
+                running_mean_logs["engine_log_probs_diff_clampfrac"] += (
+                    action_engine_log_probs_diff.abs()
+                    .eq(CLAMP_VALUE)
+                    .float()
+                    .sum()
+                    .item()
+                    / den_running_mean
+                )
+                if self.importance_sampling_strategy == "per_sequence":
+                    # One ratio per (rollout, timestep): sum the token log-prob
+                    # differences sharing a timestep id, exponentiate, and broadcast the
+                    # result back to every action token of that timestep.
+                    # Non-action positions keep the 0 from zeros_like.
+                    tis_imp_ratio = torch.zeros_like(action_engine_log_probs_diff)
+                    for mb_idx in range(action_engine_log_probs_diff.shape[0]):
+                        valid_token_mask = action_mask_mb[mb_idx]
+                        timestep_ids = timesteps_mb[mb_idx][valid_token_mask]
+                        timestep_logprob_diffs = action_engine_log_probs_diff[mb_idx][
+                            valid_token_mask
+                        ]
+                        # NOTE(review): .max() on an empty tensor raises, so a rollout
+                        # without any action token would fail here — confirm upstream
+                        # guarantees at least one action token per rollout.
+                        max_timestep = int(timestep_ids.max().item()) + 1
+                        timestep_sums = torch.zeros(
+                            max_timestep,
+                            device=action_engine_log_probs_diff.device,
+                            dtype=action_engine_log_probs_diff.dtype,
+                        )
+                        timestep_sums.scatter_add_(
+                            0, timestep_ids, timestep_logprob_diffs
+                        )
+                        timestep_ratios = torch.exp(timestep_sums)
+                        tis_imp_ratio[
+                            mb_idx, valid_token_mask
+                        ] = timestep_ratios.gather(0, timestep_ids)
+                else:
+                    # Per-token ratio: exp of each token's own log-prob difference.
+                    tis_imp_ratio = torch.exp(action_engine_log_probs_diff)
+                running_mean_logs["tis_imp_ratio"] += (
+                    tis_imp_ratio * action_mask_mb
+                ).sum().item() / den_running_mean
+                # Truncated importance sampling is only applied when a positive cap is set;
+                # otherwise the ratio is computed for logging but never weights the loss.
+                if self.truncated_importance_sampling_ratio_cap > 0.0:
+                    tis_imp_ratio = torch.clamp(
+                        tis_imp_ratio, max=self.truncated_importance_sampling_ratio_cap
+                    )
+                    # NOTE(review): this count is not restricted to action tokens.
+                    running_mean_logs["tis_imp_ratio_clampfrac"] += (
+                        tis_imp_ratio.eq(self.truncated_importance_sampling_ratio_cap)
+                        .float()
+                        .sum()
+                        .item()
+                    ) / den_running_mean
+                    rewarded_action_log_probs = (
+                        rewarded_action_log_probs * tis_imp_ratio
+                    )
+                if self.enable_tokenwise_logging:
+                    self.tokenwise_tally.add_data(
+                        metric_id="next_token_clogπ",
+                        metrics=rewarded_action_log_probs,
+                    )
+                # Add value term to loss
+                # Negated because the optimizer minimizes while the objective is maximized.
+                if self.pg_loss_normalization == "batch":
+                    nb_act_tokens = action_mask_mb.sum()
+                    mb_value = -rewarded_action_log_probs.sum() / nb_act_tokens
+                else:
+                    mb_value = -rewarded_action_log_probs.sum()
+                loss += mb_value
+                running_mean_logs["rl_objective"] += mb_value.item() / den_running_mean
+                # -------------------------------------------------
+                # Entropy Regularization
+                # -------------------------------------------------
+                # Only apply entropy on distribution defined over most probable tokens
+                if self.entropy_topk is not None:
+                    top_k_indices = torch.topk(
+                        logits, k=self.entropy_topk, dim=-1
+                    ).indices
+                    entropy_logits = logits.gather(dim=-1, index=top_k_indices)
+                else:
+                    entropy_logits = logits
+                # Per-token entropy terms -p * log p over the (possibly top-k) distribution.
+                token_entropy_terms = -F.softmax(
+                    entropy_logits, dim=-1
+                ) * F.log_softmax(
+                    entropy_logits, dim=-1
+                )  # (B, S, T)
+                token_entropy_terms *= (
+                    action_mask_mb[:, :, None] * entropy_mask_mb[:, :, None]
+                )  # only get loss on specific action tokens
+                mb_entropy = token_entropy_terms.sum(dim=-1)
+                if self.enable_tokenwise_logging:
+                    self.tokenwise_tally.add_data(
+                        metric_id="entropy",
+                        metrics=mb_entropy,
+                    )
+                # From here on mb_entropy holds the NEGATIVE entropy, so adding it to
+                # the loss rewards higher entropy; the log below flips the sign back.
+                if self.pg_loss_normalization == "batch":
+                    nb_act_tokens = action_mask_mb.sum()
+                    mb_entropy = -mb_entropy.sum() / nb_act_tokens
+                else:
+                    mb_entropy = -mb_entropy.sum()
+                running_mean_logs["entropy"] += -mb_entropy.item() / den_running_mean
+                if self.entropy_coeff != 0.0:
+                    mb_entropy *= self.entropy_coeff
+                    loss += mb_entropy
+                # -------------------------------------------------
+                # KL-DIVERGENCE
+                # -------------------------------------------------
+                if self.kl_coeff != 0.0:
+                    # The reference distribution comes from the policy wrapper's base
+                    # model, with the same temperature and token restriction applied.
+                    ref_model_logits = self.policy.get_base_model_logits(contexts_mb)
+                    ref_model_logits = ref_model_logits / self.temperature
+                    # (B, S, V)
+                    ref_model_logits = self.mask_non_restricted_token_logits(
+                        logits=ref_model_logits
+                    )
+                    # (B, S, V)
+                    ref_model_log_probs = F.log_softmax(ref_model_logits, dim=-1)
+                    # (B, S, V)
+                    ref_model_action_log_probs = ref_model_log_probs.gather(
+                        dim=-1, index=shifted_contexts_mb.unsqueeze(-1)
+                    ).squeeze(
+                        -1
+                    )  # (B,S)
+                    # Approximating KL Divergence with the estimator exp(d) - 1 - d,
+                    # where d = log p_ref - log p_policy on the sampled tokens.
+                    # Ref 1: http://joschu.net/blog/kl-approx.html
+                    # Ref 2: https://github.dev/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py#L1332
+                    masked_ref_model_action_log_probs = torch.masked_fill(
+                        ref_model_action_log_probs, ~action_mask_mb, INVALID_LOGPROB
+                    )
+                    action_log_probs_diff = (
+                        masked_ref_model_action_log_probs - masked_action_log_probs
+                    ).clamp(-CLAMP_VALUE, CLAMP_VALUE)
+                    running_mean_logs["ref_log_probs_diff_clampfrac"] += (
+                        action_log_probs_diff.abs().eq(CLAMP_VALUE).float().sum().item()
+                        / den_running_mean
+                    )
+                    if self.filter_higher_refprob_tokens_kl:
+                        # Zero the difference where the reference assigns a higher
+                        # probability than the policy; expm1(0) - 0 == 0, so those
+                        # tokens contribute nothing to the KL term.
+                        higher_refprob_tokens_mask = action_log_probs_diff > 0.0
+                        running_mean_logs["higher_refprob_frac"] += (
+                            higher_refprob_tokens_mask.sum().item() / den_running_mean
+                        )
+                        action_log_probs_diff = action_log_probs_diff * (
+                            ~higher_refprob_tokens_mask
+                        )
+                    kl_div = torch.expm1(action_log_probs_diff) - action_log_probs_diff
+                    kl_div *= action_mask_mb  # We only care about KLD of action tokens
+                    # The KL term gets the same truncated importance weight as the PG term.
+                    if self.truncated_importance_sampling_ratio_cap > 0.0:
+                        kl_div = kl_div * tis_imp_ratio
+                    kl_div *= self.kl_coeff
+                    if self.enable_tokenwise_logging:
+                        self.tokenwise_tally.add_data(
+                            metric_id="ref_model_next_token_log_prob",
+                            metrics=ref_model_action_log_probs,
+                        )
+                        self.tokenwise_tally.add_data(
+                            metric_id="kl_divergence",
+                            metrics=kl_div,
+                        )
+                    if self.pg_loss_normalization == "batch":
+                        nb_act_tokens = action_mask_mb.sum()
+                        mb_kl = kl_div.sum() / nb_act_tokens
+                    else:
+                        mb_kl = kl_div.sum()
+                    running_mean_logs["kl_divergence"] += (
+                        mb_kl.item() / den_running_mean
+                    )
+                    loss += mb_kl
+                # Accumulate gradient
+                running_mean_logs["policy_gradient_loss"] += (
+                    loss.item() / den_running_mean
+                )
+                # Scale so that the accumulated gradient matches the chosen normalization.
+                loss /= normalization_factor
+                self.accelerator.backward(loss)
+                # ensure gpu memory is freed
+                del training_mb
+                del log_probs
+                del logits
+                del loss
+                del action_log_probs
+                del rewarded_action_log_probs
+            logger.info(
+                f"Accumulated the policy gradient loss for {total_tokens_generated} tokens."
+            )
+            # Clip gradients and take step
+            # The gradient norm is only logged when clipping is enabled.
+            if self.gradient_clipping is not None:
+                grad_norm = self.accelerator.clip_grad_norm_(
+                    self.policy.parameters(), self.gradient_clipping
+                )
+                running_mean_logs["policy_gradient_norm"] += grad_norm.item()
+            # Take step
+            self.policy_optimizer.step()
+            self.policy_optimizer.zero_grad()
+            # Store logs
+            for key, value in running_mean_logs.items():
+                self.tally.add_metric(path=key, metric=value)
+            # Clear
+            # TODO: verify
+            self.accelerator.clear(self.policy, self.policy_optimizer)
+            import gc
+            gc.collect()
+            torch.cuda.empty_cache()
+            return running_mean_logs
+    def get_advantages_with_critic_gradient_accumulation(
+        self, trajectories: TrajectoryBatch, critic_loss_scaling_factor: float = 2.0
+    ) -> torch.FloatTensor:
+        """
+        TOWRITE
+        Uses GAE if enabled, otherwise uses Monte Carlo returns.
+        Optionally trains the critic if GAE is used.
+        Returns:
+            advantages: NestedFloatTensors
+        """
+        mb_size = self.mini_batch_size
+        batch_size = trajectories.rollout_ids.shape[0]
+        agent_id = trajectories.agent_ids[0]
+        batch_rewards = trajectories.batch_rewards
+        ######################################
+        # use critic for advantage estimation
+        ######################################
+        if self.use_gae:
+            if "buffer" in agent_id:
+                self.critic.eval()
+                training = False
+            else:
+                self.critic.train()
+                training = True
+            advantages = []
+            # critic_loss_scaling_factor comes learning single critic for two agents
+            normalization_factor = (
+                np.ceil(batch_size / mb_size).astype(int) * critic_loss_scaling_factor
+            )
+            # For each minibatch
+            for mb in range(0, batch_size, mb_size):
+                trajectory_mb = trajectories[mb : mb + mb_size]
+                trajectory_mb.to(self.device)
+                rewards_mb = trajectory_mb.batch_rewards
+                (
+                    tokens_mb,
+                    state_ends_mask_mb,
+                    timestep_counts,
+                ) = trajectory_mb.get_padded_tensors_for_critic()
+                # critic causal attention up to end flags
+                if training:
+                    vals_estimate_full = self.critic(tokens_mb)
+                else:
+                    with torch.no_grad():
+                        vals_estimate_full = self.critic(tokens_mb)
+                # if vals_estimate_full.dim() == 3:
+                #     vals_estimate_full = vals_estimate_full.squeeze(-1)
+                # Select only positions where states end, per sample → list of (jT,)
+                B = tokens_mb.shape[0]
+                vals_list = [
+                    vals_estimate_full[b][state_ends_mask_mb[b]] for b in range(B)
+                ]
+                # Pad to (B, max_jT) = (B, S)
+                vals_estimate_mb = pad_sequence(
+                    vals_list, batch_first=True, padding_value=0.0
+                )
+                dtype = vals_estimate_mb.dtype
+                rewards_mb = pad_sequence(
+                    rewards_mb, batch_first=True, padding_value=0.0
+                ).to(
+                    dtype=dtype
+                )  # (B, S)
+                self.rollout_tally.add_metric(
+                    path=["batch_rewards"],
+                    rollout_tally_item=RolloutTallyItem(
+                        crn_ids=trajectory_mb.crn_ids,
+                        rollout_ids=trajectory_mb.rollout_ids,
+                        agent_ids=trajectory_mb.agent_ids,
+                        metric_matrix=rewards_mb,
+                    ),
+                )
+                if self.reward_normalizing_constant != 1.0:
+                    rewards_mb /= self.reward_normalizing_constant
+                det_vals_estimate_mb = vals_estimate_mb.detach()  # (B, max_jT)
+                self.rollout_tally.add_metric(
+                    path=["mb_value_estimates_critic"],
+                    rollout_tally_item=RolloutTallyItem(
+                        crn_ids=trajectory_mb.crn_ids,
+                        rollout_ids=trajectory_mb.rollout_ids,
+                        agent_ids=trajectory_mb.agent_ids,
+                        metric_matrix=det_vals_estimate_mb,
+                    ),
+                )
+                # Append a 0 value to the end of the value estimates
+                if det_vals_estimate_mb.shape[1] == rewards_mb.shape[1]:
+                    Bsize = det_vals_estimate_mb.shape[0]
+                    device = det_vals_estimate_mb.device
+                    dtype = det_vals_estimate_mb.dtype
+                    det_vals_estimate_mb = torch.cat(
+                        [
+                            det_vals_estimate_mb,
+                            torch.zeros((Bsize, 1), device=device, dtype=dtype),
+                        ],
+                        dim=1,
+                    )  # (B, max_jT+1)
+                else:
+                    raise ValueError(
+                        "Incompatible shapes for value estimates and rewards."
+                    )
+                # Get annealed lambda
+                if self.use_gae_lambda_annealing:
+                    annealing_constant = self.gae_lambda_annealing_method(
+                        step=self.trainer_annealing_state.annealing_step_counter
+                    )
+                    annealed_lambda = (
+                        self.gae_lambda_annealing_limit * annealing_constant
+                    )
+                    self.tally.add_metric(
+                        path="annealed_lambda", metric=annealed_lambda
+                    )
+                else:
+                    annealed_lambda = self.gae_lambda_annealing_limit
+                # Get GAE advantages
+                gae_advantages = get_generalized_advantage_estimates(
+                    rewards=rewards_mb,
+                    value_estimates=det_vals_estimate_mb,
+                    discount_factor=self.discount_factor,
+                    lambda_coef=annealed_lambda,
+                )  # (B, max_jT)
+                self.rollout_tally.add_metric(
+                    path=["mb_gae_advantages"],
+                    rollout_tally_item=RolloutTallyItem(
+                        crn_ids=trajectory_mb.crn_ids,
+                        rollout_ids=trajectory_mb.rollout_ids,
+                        agent_ids=trajectory_mb.agent_ids,
+                        metric_matrix=gae_advantages,
+                    ),
+                )
+                if training:
+                    targets = (
+                        gae_advantages.to(dtype=dtype) + det_vals_estimate_mb[:, :-1]
+                    )  # (B, max_jT) # A(s, a, b) + V(s) = Q(s, a, b)
+                    self.rollout_tally.add_metric(
+                        path=["mb_targets_critic"],
+                        rollout_tally_item=RolloutTallyItem(
+                            crn_ids=trajectory_mb.crn_ids,
+                            rollout_ids=trajectory_mb.rollout_ids,
+                            agent_ids=trajectory_mb.agent_ids,
+                            metric_matrix=targets,
+                        ),
+                    )
+                    if self.critic_loss_type == "mse":
+                        loss = F.mse_loss(
+                            input=vals_estimate_mb,
+                            target=targets,
+                        )
+                    elif self.critic_loss_type == "huber":
+                        loss = F.huber_loss(
+                            input=vals_estimate_mb,
+                            target=targets,
+                        )
+                    self.tally.add_metric(path=["mb_critic_loss"], metric=loss.item())
+                    # Accumulate gradient
+                    loss /= normalization_factor
+                    self.accelerator.backward(loss)
+                    del loss
+                    del targets
+                    del vals_estimate_mb
+                del trajectory_mb
+                del vals_estimate_full
+                # Get jagged back using timestep_counts
+                advantages.extend(
+                    [gae_advantages[i, : timestep_counts[i]] for i in range(B)]
+                )
+        ######################################
+        # use exclusively Monte Carlo returns & rloo for advantage estimation
+        ######################################
+        else:
+            lengths = [len(c) for c in batch_rewards]
+            padded_rewards = pad_sequence(
+                batch_rewards, batch_first=True, padding_value=0.0
+            )
+            self.rollout_tally.add_metric(
+                path=["mb_rewards"],
+                rollout_tally_item=RolloutTallyItem(
+                    crn_ids=trajectories.crn_ids,
+                    rollout_ids=trajectories.rollout_ids,
+                    agent_ids=trajectories.agent_ids,
+                    metric_matrix=padded_rewards,
+                ),
+            )
+            if self.reward_normalizing_constant != 1.0:
+                padded_rewards /= self.reward_normalizing_constant
+            padded_advantages = get_discounted_returns(
+                rewards=padded_rewards,
+                discount_factor=self.discount_factor,
+            )  # no baseline for now
+            if self.use_rloo:
+                is_grouped_by_rng = (
+                    trajectories.crn_ids.unique().shape[0]
+                    != trajectories.crn_ids.shape[0]
+                )
+                if is_grouped_by_rng:
+                    for crn_id in trajectories.crn_ids.unique():
+                        rng_mask = trajectories.crn_ids == crn_id
+                        rng_advantages = padded_advantages[rng_mask]
+                        rng_advantages, _ = get_rloo_credits(credits=rng_advantages)
+                        padded_advantages[rng_mask] = rng_advantages
+                else:
+                    padded_advantages, _ = get_rloo_credits(credits=padded_advantages)
+                self.rollout_tally.add_metric(
+                    path=["mb_rloo_advantages"],
+                    rollout_tally_item=RolloutTallyItem(
+                        crn_ids=trajectories.crn_ids,
+                        rollout_ids=trajectories.rollout_ids,
+                        agent_ids=trajectories.agent_ids,
+                        metric_matrix=padded_advantages,
+                    ),
+                )
+            advantages = [
+                padded_advantages[i, : lengths[i]]
+                for i in range(padded_advantages.shape[0])
+            ]
+        if self.whiten_advantages_time_step_wise or self.whiten_advantages:
+            lengths = [len(c) for c in advantages]
+            padded_advantages = pad_sequence(
+                advantages, batch_first=True, padding_value=0.0
+            )
+            if self.whiten_advantages_time_step_wise:
+                whitened_padded_advantages = whiten_advantages_time_step_wise(
+                    padded_advantages
+                )
+                path = ["mb_whitened_advantages_time_step_wise"]
+            elif self.whiten_advantages:
+                whitened_padded_advantages = whiten_advantages(padded_advantages)
+                path = ["mb_whitened_advantages"]
+            self.rollout_tally.add_metric(
+                path=path,
+                rollout_tally_item=RolloutTallyItem(
+                    crn_ids=trajectories.crn_ids,
+                    rollout_ids=trajectories.rollout_ids,
+                    agent_ids=trajectories.agent_ids,
+                    metric_matrix=whitened_padded_advantages,
+                ),
+            )
+            advantages = [
+                whitened_padded_advantages[i, : lengths[i]]
+                for i in range(whitened_padded_advantages.shape[0])
+            ]
+        self.trainer_annealing_state.annealing_step_counter += 1
+        return advantages
+    @abstractmethod
+    def set_agent_trajectory_data(
+        self, agent_id: str, roots: list[RolloutTreeRootNode]
+    ) -> None:
+        """
+        TOWRITE
+        """
+        pass
+    def set_trajectory_data(
+        self, roots: list[RolloutTreeRootNode], agent_ids: list[str]
+    ) -> None:
+        """
+        TOWRITE
+        """
+        for agent_id in agent_ids:
+            self.set_agent_trajectory_data(agent_id, roots)
+    @abstractmethod
+    def share_advantage_data(self) -> list[AdvantagePacket]:
+        pass
+    @abstractmethod
+    def receive_advantage_data(self, advantage_packets: list[AdvantagePacket]) -> None:
+        pass
+    def set_policy_gradient_data(self, agent_ids: list[str]) -> None:
+        """
+        Already set earlier # TODO: make it separate and clean
+        """
+        self.policy_gradient_data = None
+        # for agent_id, trajectory_batch in self.training_data.items():
+        #     if "buffer" in agent_id:
+        #         continue
+        for agent_id in agent_ids:
+            assert "buffer" not in agent_id, "Buffer agents do not train policy"
+            trajectory_batch = self.training_data[agent_id]
+            tokenwise_batch_credits = get_tokenwise_credits(
+                batch_timesteps=trajectory_batch.batch_timesteps,
+                batch_credits=trajectory_batch.batch_credits,
+            )
+            policy_gradient_data = TrainingBatch(
+                rollout_ids=trajectory_batch.rollout_ids,
+                batch_input_ids=trajectory_batch.batch_input_ids,
+                batch_action_mask=trajectory_batch.batch_action_mask,
+                batch_entropy_mask=trajectory_batch.batch_entropy_mask,
+                batch_credits=tokenwise_batch_credits,
+                batch_engine_log_probs=trajectory_batch.batch_engine_log_probs,
+                batch_timesteps=trajectory_batch.batch_timesteps,
+            )
+            if self.policy_gradient_data is None:
+                self.policy_gradient_data = policy_gradient_data
+            else:
+                self.policy_gradient_data.append(policy_gradient_data)
+        self.training_data = {}
+        self.tokenwise_tally = ContextualizedTokenwiseTally(
+            tokenizer=self.tokenizer,
+            paths=self.debug_path_list,
+        )
+    def train(self) -> None:
+        """
+        TOWRITE
+        """
+        assert self.policy_gradient_data is not None, "Policy gradient data is not set"
+        if self.critic_optimizer is not None:
+            if self.gradient_clipping is not None:
+                grad_norm = self.accelerator.clip_grad_norm_(
+                    self.critic.parameters(), self.gradient_clipping
+                )
+                self.tally.add_metric(
+                    path="gradient_norm_critic", metric=grad_norm.item()
+                )
+            # Take step
+            self.critic_optimizer.step()
+            self.critic_optimizer.zero_grad()
+            self.accelerator.clear(self.critic, self.critic_optimizer)
+            import gc
+            gc.collect()
+            torch.cuda.empty_cache()
+        running_mean_logs = self.apply_reinforce_step(
+            training_batch=self.policy_gradient_data
+        )
+        return running_mean_logs
+    def export_training_tally(self, identifier: str, folder: str) -> None:
+        """
+        Saves and resets the collected training metrics using the tally object.
+        """
+        os.makedirs(folder, exist_ok=True)
+        self.tally.save(identifier=identifier, folder=folder)
+        self.tokenwise_tally.save(
+            path=os.path.join(folder, f"{identifier}_tokenwise.csv")
+        )
+        self.rollout_tally.save(identifier=identifier, folder=folder)
+        self.tally.reset()
+        self.tokenwise_tally = None
+        self.rollout_tally.reset()
+        self.debug_path_list = []
+    def export_optimizer_states(self) -> None:
+        """
+        Saves the optimizer states for both the main model and critic (if it exists).
+        """
+        try:
+            os.makedirs(self.save_path, exist_ok=True)
+            torch.save(self.policy_optimizer.state_dict(), self.policy_optimizer_path)
+            logger.info(f"Saved main optimizer state to {self.policy_optimizer_path}")
+            if self.critic_optimizer is not None:
+                torch.save(
+                    self.critic_optimizer.state_dict(), self.critic_optimizer_path
+                )
+                logger.info(
+                    f"Saved critic optimizer state to {self.critic_optimizer_path}"
+                )
+        except Exception as e:
+            logger.error(f"Error saving optimizer states: {str(e)}")
+            raise
+    def export_trainer_annealing_state(self) -> None:
+        """
+        Saves the trainer state.
+        """
+        with open(self.trainer_annealing_state_path, "wb") as f:
+            pickle.dump(self.trainer_annealing_state, f)
+        logger.info(f"Saved trainer state to {self.trainer_annealing_state_path}")
+    def export_trainer_states(self) -> None:
+        """
+        Saves the trainer states.
+        """
+        self.export_optimizer_states()
+        self.export_trainer_annealing_state()

src_code_for_reproducibility/training/trainer_independent.py ADDED Viewed

	@@ -0,0 +1,155 @@

+"""
+"""
+import logging
+import os
+import sys
+from typing import Union
+import torch
+import torch.nn.functional as F
+from accelerate import Accelerator
+from pandas._libs.tslibs.offsets import CBMonthBegin
+from peft import LoraConfig
+from torch.nn.utils.rnn import pad_sequence
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from mllm.markov_games.rollout_tree import *
+from mllm.markov_games.rollout_tree import RolloutTreeRootNode
+from mllm.training.credit_methods import (
+    get_discounted_returns,
+    get_discounted_state_visitation_credits,
+    get_generalized_advantage_estimates,
+    get_rloo_credits,
+)
+from mllm.training.tally_metrics import Tally
+from mllm.training.tally_tokenwise import ContextualizedTokenwiseTally
+from mllm.training.tokenize_chats import *
+from mllm.training.tokenize_chats import process_training_chat
+from mllm.training.trainer_common import BaseTrainer
+from mllm.training.training_data_utils import *
+from mllm.training.training_data_utils import (
+    TrainingBatch,
+    TrajectoryBatch,
+    get_tokenwise_credits,
+)
+from mllm.utils.resource_context import resource_logger_context
+# Module-level logger; a stdout stream handler is attached to it at import time.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+@dataclass
+class TrainingData:
+    agent_id: str
+    main_data: TrajectoryBatch
+    # list-of-tensors: per rollout advantages with length jT
+    main_advantages: list[torch.FloatTensor] | None = None
+class TrainerNaive(BaseTrainer):
+    def set_agent_trajectory_data(
+        self, agent_id: str, roots: list[RolloutTreeRootNode]
+    ) -> None:
+        """
+        TOWRITE
+        """
+        # TODO: append to current batch data instead, else we will only train for one agent!
+        self.policy_gradient_data = None
+        # Tensorize Chats
+        rollout_ids = []
+        crn_ids = []  # common random number id
+        batch_input_ids = []
+        batch_action_mask = []
+        batch_entropy_mask = []
+        batch_timesteps = []
+        batch_state_ends_mask = []
+        batch_engine_log_probs = []
+        batch_rewards = []
+        for root in roots:
+            rollout_id = root.id
+            self.debug_path_list.append(
+                "mgid:" + str(rollout_id) + "_agent_id:" + agent_id
+            )
+            rollout_ids.append(rollout_id)
+            crn_ids.append(root.crn_id)
+            chat, rewards = get_main_chat_list_and_rewards(agent_id=agent_id, root=root)
+            (
+                input_ids,
+                action_mask,
+                entropy_mask,
+                timesteps,
+                state_ends_mask,
+                engine_log_probs,
+            ) = process_training_chat(
+                tokenizer=self.tokenizer,
+                chat_history=chat,
+                entropy_mask_regex=self.entropy_mask_regex,
+                exploration_prompts_to_remove=self.exploration_prompts_to_remove,
+            )
+            batch_input_ids.append(input_ids)
+            batch_action_mask.append(action_mask)
+            batch_entropy_mask.append(entropy_mask)
+            batch_timesteps.append(timesteps)
+            batch_state_ends_mask.append(state_ends_mask)
+            batch_engine_log_probs.append(engine_log_probs)
+            batch_rewards.append(rewards)
+        trajectory_batch = TrajectoryBatch(
+            rollout_ids=torch.tensor(rollout_ids, dtype=torch.int32),
+            crn_ids=torch.tensor(crn_ids, dtype=torch.int32),
+            agent_ids=[agent_id] * len(rollout_ids),
+            batch_input_ids=batch_input_ids,
+            batch_action_mask=batch_action_mask,
+            batch_entropy_mask=batch_entropy_mask,
+            batch_timesteps=batch_timesteps,
+            batch_state_ends_mask=batch_state_ends_mask,
+            batch_rewards=batch_rewards,
+            batch_engine_log_probs=batch_engine_log_probs,
+        )
+        # Get Advantages
+        batch_advantages: torch.FloatTensor = (
+            self.get_advantages_with_critic_gradient_accumulation(trajectory_batch)
+        )
+        # Discount state visitation (the mathematically correct way)
+        if not self.skip_discounted_state_visitation:
+            for i in range(len(batch_advantages)):
+                batch_advantages[i] = get_discounted_state_visitation_credits(
+                    batch_advantages[i].unsqueeze(0),
+                    self.discount_factor,
+                ).squeeze(0)
+        self.training_data[agent_id] = TrainingData(
+            agent_id=agent_id,
+            main_data=trajectory_batch,
+            main_advantages=batch_advantages,
+        )
+    def receive_advantage_data(self, advantage_packets: list[AdvantagePacket]):
+        """
+        This trainer ignores the advantages of the other trainers.
+        """
+        for agent_id, agent_data in self.training_data.items():
+            self.training_data[agent_id] = agent_data.main_data
+            self.training_data[agent_id].batch_credits = agent_data.main_advantages
+    def share_advantage_data(self) -> list[AdvantagePacket]:
+        """
+        Share the advantage data with other agents.
+        Returns:
+            AdvantagePacket: The advantage packet containing the agent's advantages.
+        """
+        logger.info(f"Sharing advantage data.")
+        advantage_packets = []
+        for agent_id, agent_data in self.training_data.items():
+            advantage_packets.append(
+                AdvantagePacket(
+                    agent_id=agent_id,
+                    rollout_ids=agent_data.main_data.rollout_ids,
+                    main_advantages=agent_data.main_advantages,
+                )
+            )
+        return advantage_packets

src_code_for_reproducibility/training/trainer_sum_rewards.py ADDED Viewed

	@@ -0,0 +1,127 @@

+"""
+"""
+import logging
+import os
+import sys
+from typing import Union
+import torch
+import torch.nn.functional as F
+from accelerate import Accelerator
+from pandas._libs.tslibs.offsets import CBMonthBegin
+from peft import LoraConfig
+from torch.nn.utils.rnn import pad_sequence
+from transformers import AutoModelForCausalLM, AutoTokenizer
+from mllm.markov_games.rollout_tree import *
+from mllm.markov_games.rollout_tree import RolloutTreeRootNode
+from mllm.training.credit_methods import (
+    get_discounted_returns,
+    get_discounted_state_visitation_credits,
+    get_generalized_advantage_estimates,
+    get_rloo_credits,
+)
+from mllm.training.tally_metrics import Tally
+from mllm.training.tally_rollout import RolloutTally, RolloutTallyItem
+from mllm.training.tally_tokenwise import ContextualizedTokenwiseTally
+from mllm.training.tokenize_chats import *
+from mllm.training.tokenize_chats import process_training_chat
+from mllm.training.trainer_common import BaseTrainer
+from mllm.training.trainer_independent import TrainerNaive, TrainingData
+from mllm.training.training_data_utils import *
+from mllm.training.training_data_utils import (
+    AdvantagePacket,
+    TrainingBatch,
+    TrajectoryBatch,
+    get_tokenwise_credits,
+)
+from mllm.utils.resource_context import resource_logger_context
+# Module-level logger; a stdout stream handler is attached to it at import time.
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+class TrainerSumRewards(TrainerNaive):
+    def receive_advantage_data(self, advantage_packets: list[AdvantagePacket]):
+        """
+        Sums the advantages of the other trainers
+        """
+        logger.info(f"Receiving advantage packets.")
+        assert (
+            len(advantage_packets) > 0
+        ), "At least one advantage packet must be provided."
+        for agent_id, agent_data in self.training_data.items():
+            coagent_advantage_packets = [
+                packet for packet in advantage_packets if packet.agent_id != agent_id
+            ]
+            agent_rollout_ids = agent_data.main_data.rollout_ids
+            agent_advantages = agent_data.main_advantages
+            co_agent_advantages = []
+            for rollout_id in agent_rollout_ids:
+                for co_agent_packet in coagent_advantage_packets:
+                    if rollout_id in co_agent_packet.rollout_ids:
+                        index = torch.where(rollout_id == co_agent_packet.rollout_ids)[
+                            0
+                        ].item()
+                        co_agent_advantages.append(
+                            co_agent_packet.main_advantages[index]
+                        )
+                        # assumes that its two player game, with one co-agent
+                        break
+            assert len(co_agent_advantages) == len(agent_advantages)
+            B = len(agent_advantages)
+            assert all(
+                a.shape[0] == b.shape[0]
+                for a, b in zip(co_agent_advantages, agent_advantages)
+            ), "Number of advantages must match in order to sum them up."
+            # Get padded tensors (advantage alignment is invariant to padding)
+            lengths = torch.tensor(
+                [len(t) for t in agent_advantages],
+                device=self.device,
+                dtype=torch.long,
+            )
+            padded_main_advantages = pad_sequence(
+                agent_advantages, batch_first=True, padding_value=0.0
+            )
+            padded_co_agent_advantages = pad_sequence(
+                co_agent_advantages, batch_first=True, padding_value=0.0
+            )
+            # Create training batch data
+            sum_of_ad_credits = padded_main_advantages + padded_co_agent_advantages
+            self.rollout_tally.add_metric(
+                path=["sum_of_ad_credits"],
+                rollout_tally_item=RolloutTallyItem(
+                    crn_ids=agent_data.main_data.crn_ids,
+                    rollout_ids=agent_data.main_data.rollout_ids,
+                    agent_ids=agent_data.main_data.agent_ids,
+                    metric_matrix=sum_of_ad_credits,
+                ),
+            )
+            if not self.skip_discounted_state_visitation:
+                sum_of_ad_credits = get_discounted_state_visitation_credits(
+                    sum_of_ad_credits,
+                    self.discount_factor,
+                )
+                self.rollout_tally.add_metric(
+                    path=["discounted_state_visitation_credits"],
+                    rollout_tally_item=RolloutTallyItem(
+                        crn_ids=agent_data.main_data.crn_ids,
+                        rollout_ids=agent_data.main_data.rollout_ids,
+                        agent_ids=agent_data.main_data.agent_ids,
+                        metric_matrix=sub_tensors[
+                            "discounted_state_visitation_credits"
+                        ],
+                    ),
+                )
+            # Slice back to jagged and convert to tokenwise credits
+            sum_of_ad_credits = [sum_of_ad_credits[i, : lengths[i]] for i in range(B)]
+            self.training_data[agent_id] = agent_data.main_data
+            self.training_data[agent_id].batch_credits = sum_of_ad_credits

src_code_for_reproducibility/training/training_data_utils.py ADDED Viewed

	@@ -0,0 +1,394 @@

+from dataclasses import dataclass
+from typing import Literal, Optional, Tuple
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from mllm.markov_games.rollout_tree import (
+    ChatTurn,
+    RolloutTreeBranchNode,
+    RolloutTreeNode,
+    RolloutTreeRootNode,
+)
+@dataclass
+class AdvantagePacket:
+    agent_id: str
+    rollout_ids: torch.IntTensor  # (B,)
+    # list-of-tensors
+    main_advantages: list[torch.FloatTensor]
+class TrainingChatTurn:
+    # TODO: simplify by making this a child of ChatTurn
+    """
+    This class contains the chat turns for a single agent.
+    It is like ChatTurn, but with the time step added.
+    """
+    def __init__(
+        self,
+        time_step: int,
+        role: str,
+        agent_id: str,
+        content: str,
+        chat_template_token_ids: list[int],
+        reasoning_content: str,
+        is_state_end: bool,
+        out_token_ids: Optional[list[int]] = None,
+        log_probs: Optional[list[float]] = None,
+    ) -> None:
+        self.time_step = time_step
+        self.role = role
+        self.agent_id = agent_id
+        self.content = content
+        self.chat_template_token_ids = chat_template_token_ids
+        self.reasoning_content = reasoning_content
+        self.is_state_end = is_state_end
+        self.out_token_ids = out_token_ids
+        self.log_probs = log_probs
+    def dict(self):
+        return {
+            "time_step": self.time_step,
+            "role": self.role,
+            "agent_id": self.agent_id,
+            "content": self.content,
+            "chat_template_token_ids": self.chat_template_token_ids,
+            "reasoning_content": self.reasoning_content,
+            "is_state_end": self.is_state_end,
+            "out_token_ids": self.out_token_ids,
+            "log_probs": self.log_probs,
+        }
+def get_main_chat_list_and_rewards(
+    agent_id: str, root: RolloutTreeRootNode | RolloutTreeNode
+) -> Tuple[list[TrainingChatTurn], torch.FloatTensor]:
+    """
+    This method traverses a rollout tree and returns a the list of ChatTurn
+    for an agent. If it encounters a branch node, it follows the main path.
+    """
+    # TODO; extend for all trees, not just linear
+    if isinstance(root, RolloutTreeRootNode):
+        current_node = root.child
+    else:
+        current_node = root
+    chat = []
+    rewards = []
+    while current_node is not None:
+        if isinstance(current_node, RolloutTreeBranchNode):
+            current_node = current_node.main_child
+        reward: float = current_node.step_log.simulation_step_log.rewards[agent_id]
+        rewards.append(reward)
+        chat_turns: list[TrainingChatTurn] = current_node.step_log.action_logs[
+            agent_id
+        ].chat_turns
+        chat_turns = [
+            TrainingChatTurn(time_step=current_node.time_step, **turn.model_dump())
+            for turn in chat_turns
+        ]
+        chat.extend(chat_turns)
+        current_node = current_node.child
+    return chat, torch.FloatTensor(rewards)
+def get_tokenwise_credits(
+    # B := batch size, S := number of tokens / seq. length, T := number of states. `j` stands for jagged (see pytorch nested tensors.)
+    batch_timesteps: torch.IntTensor | torch.Tensor,  # (B, jS),
+    batch_credits: torch.FloatTensor | torch.Tensor,  # (B, jT)
+) -> torch.FloatTensor | torch.Tensor:  # (B, jS)
+    """
+    TOWRITE
+    """
+    # TODO vectorize this code
+    batch_token_credits = []
+    for credits, timesteps in zip(batch_credits, batch_timesteps):
+        token_credits = torch.zeros_like(
+            timesteps,
+            dtype=credits.dtype,
+            device=timesteps.device,
+        )
+        for idx, credit in enumerate(credits):
+            token_credits[timesteps == idx] = credit
+        batch_token_credits.append(token_credits)
+    return batch_token_credits
+@dataclass
+class TrajectoryBatch:
+    """
+    Tensorized batch of trajectories using list-of-tensors for jagged dimensions.
+    """
+    # B := batch size, S := number of tokens / seq. length, T := number of states.
+    rollout_ids: torch.IntTensor  # (B,)
+    crn_ids: torch.IntTensor  # (B,)
+    agent_ids: list[str]  # (B,)
+    batch_input_ids: list[torch.LongTensor]  # List[(jS,)]
+    batch_action_mask: list[torch.BoolTensor]  # List[(jS,)]
+    batch_entropy_mask: list[torch.BoolTensor]  # List[(jS,)]
+    batch_timesteps: list[torch.IntTensor]  # List[(jS,)]
+    batch_state_ends_mask: list[torch.BoolTensor]  # List[(jS,)]
+    batch_engine_log_probs: Optional[list[torch.FloatTensor]]  # List[(jS,)]
+    batch_rewards: list[torch.FloatTensor]  # List[(jT,)]
+    batch_credits: Optional[list[torch.FloatTensor]] = None  # List[(jS,)]
+    def __post_init__(self):
+        """
+        Validate per-sample consistency.
+        """
+        B = self.rollout_ids.shape[0]
+        assert (
+            self.crn_ids.shape[0] == B
+        ), "RNG IDs must have length equal to batch size."
+        assert (
+            len(self.agent_ids) == B
+        ), "agent_ids must have length equal to batch size."
+        assert (
+            len(self.batch_input_ids)
+            == len(self.batch_action_mask)
+            == len(self.batch_entropy_mask)
+            == len(self.batch_timesteps)
+            == len(self.batch_state_ends_mask)
+            == len(self.batch_engine_log_probs)
+            == len(self.batch_rewards)
+            == B
+        ), "Jagged lists must all have length equal to batch size."
+        for b in range(B):
+            nb_rewards = int(self.batch_rewards[b].shape[0])
+            nb_timesteps = int(torch.max(self.batch_timesteps[b]).item()) + 1
+            assert (
+                nb_rewards == nb_timesteps
+            ), "Number of rewards and timesteps mismatch."
+            assert (
+                self.batch_input_ids[b].shape[0]
+                == self.batch_action_mask[b].shape[0]
+                == self.batch_entropy_mask[b].shape[0]
+                == self.batch_engine_log_probs[b].shape[0]
+                == self.batch_timesteps[b].shape[0]
+            ), "Tensors must have the same shape along the jagged dimension."
+            assert (
+                int(self.batch_state_ends_mask[b].sum())
+                == self.batch_rewards[b].shape[0]
+            ), "Number of rewards must match number of state ends."
+    """
+    Entries:
+        Here, we ignore the batch dimension.
+        input_ids:
+            All of the tokens of both the user and the assistant, flattened.
+        action_mask:
+            Set to true on the tokens of the assistant (tokens generated by the model).
+        timesteps:
+            Therefore, max(timesteps) = Ns - 1.
+        state_ends_idx:
+            Indices of the tokens at which state descriptions end.
+        rewards:
+            rewards[t] := R_t(s_t, a_t)
+    Example:
+        position:       "0  1  2  3  4  5  6  7  8  9  10 11 12 13 14"
+        input_ids:      "U  U  U  a  a  a  U  a  U  a  a  a  U  U  U" (U := User, a := Assistant)
+        action_mask:    "x  x  x  ✓  ✓  ✓  x  ✓  x  ✓  ✓  ✓  x  x  x"
+        timestep:       "0  0  0  0  0  0  1  1  1  1  1  1  2  2  2"
+        state_ends_dx:  [2, 6, 14]
+        rewards:        [r0, r1, r2]
+    """
+    def __getitem__(self, key) -> "TrajectoryBatch":
+        if isinstance(key, slice):
+            return TrajectoryBatch(
+                rollout_ids=self.rollout_ids.__getitem__(key),
+                crn_ids=self.crn_ids.__getitem__(key),
+                agent_ids=self.agent_ids[key],
+                batch_input_ids=self.batch_input_ids[key],
+                batch_action_mask=self.batch_action_mask[key],
+                batch_entropy_mask=self.batch_entropy_mask[key],
+                batch_timesteps=self.batch_timesteps[key],
+                batch_state_ends_mask=self.batch_state_ends_mask[key],
+                batch_engine_log_probs=self.batch_engine_log_probs[key],
+                batch_rewards=self.batch_rewards[key],
+                batch_credits=self.batch_credits[key] if self.batch_credits else None,
+            )
+    def __len__(self):
+        return len(self.batch_input_ids)
+    def to(self, device):
+        self.rollout_ids = self.rollout_ids.to(device)
+        self.crn_ids = self.crn_ids.to(device)
+        self.batch_input_ids = [t.to(device) for t in self.batch_input_ids]
+        self.batch_action_mask = [t.to(device) for t in self.batch_action_mask]
+        self.batch_entropy_mask = [t.to(device) for t in self.batch_entropy_mask]
+        self.batch_timesteps = [t.to(device) for t in self.batch_timesteps]
+        self.batch_state_ends_mask = [t.to(device) for t in self.batch_state_ends_mask]
+        self.batch_engine_log_probs = [
+            t.to(device) for t in self.batch_engine_log_probs
+        ]
+        self.batch_rewards = [t.to(device) for t in self.batch_rewards]
+        self.batch_credits = (
+            [t.to(device) for t in self.batch_credits] if self.batch_credits else None
+        )
+    def get_padded_tensors_for_critic(self):
+        """
+        Returns:
+            padded_batch_input_ids: (B, P)
+            padded_batch_state_ends_mask: (B, P)
+            timestep_counts: (B,) tensor of ints indicating number of states per sample
+        """
+        padded_batch_input_ids = pad_sequence(
+            self.batch_input_ids, batch_first=True, padding_value=0
+        )
+        padded_batch_state_ends_mask = pad_sequence(
+            self.batch_state_ends_mask, batch_first=True, padding_value=0
+        ).bool()
+        # number of states equals number of True in state_ends_mask
+        timestep_counts = torch.tensor(
+            [int(mask.sum().item()) for mask in self.batch_state_ends_mask],
+            device=padded_batch_input_ids.device,
+            dtype=torch.long,
+        )
+        return padded_batch_input_ids, padded_batch_state_ends_mask, timestep_counts
+# Binds the name `timestep` to the builtin `int`; presumably intended as a
+# type alias for time-step indices (TODO confirm where it is used).
+timestep = int
+@dataclass
+class PaddedTensorTrainingBatch:
+    batch_input_ids: torch.LongTensor | torch.Tensor
+    batch_action_mask: torch.BoolTensor | torch.Tensor
+    batch_entropy_mask: Optional[torch.BoolTensor | torch.Tensor]
+    batch_credits: torch.FloatTensor | torch.Tensor
+    batch_engine_log_probs: torch.FloatTensor | torch.Tensor
+    batch_timesteps: torch.IntTensor | torch.Tensor
+    def __len__(self):
+        return self.batch_input_ids.shape[0]
+    def to(self, device):
+        self.batch_input_ids = self.batch_input_ids.to(device)
+        self.batch_action_mask = self.batch_action_mask.to(device)
+        self.batch_entropy_mask = self.batch_entropy_mask.to(device)
+        self.batch_credits = self.batch_credits.to(device)
+        self.batch_engine_log_probs = self.batch_engine_log_probs.to(device)
+        self.batch_timesteps = self.batch_timesteps.to(device)
+@dataclass
+class TrainingBatch:
+    rollout_ids: torch.IntTensor | torch.Tensor  # (B,)
+    batch_input_ids: list[torch.LongTensor]  # List[(jS,)]
+    batch_action_mask: list[torch.BoolTensor]  # List[(jS,)]
+    batch_entropy_mask: Optional[list[torch.BoolTensor]]  # List[(jS,)]
+    batch_credits: list[torch.FloatTensor]  # List[(jS,)]
+    batch_engine_log_probs: list[torch.FloatTensor]  # List[(jS,)]
+    batch_timesteps: list[torch.IntTensor]  # List[(jS,)]
+    def __post_init__(self):
+        # Put everything in the right device
+        # self.rollout_ids = self.rollout_ids.to("cuda" if torch.cuda.is_available() else "cpu")
+        # self.batch_input_ids = self.batch_input_ids.to("cuda" if torch.cuda.is_available() else "cpu")
+        # self.batch_action_mask = self.batch_action_mask.to("cuda" if torch.cuda.is_available() else "cpu")
+        # self.batch_credits = self.batch_credits.to("cuda" if torch.cuda.is_available() else "cpu")
+        # Ensure batch dimension is present
+        assert (
+            len(self.batch_input_ids)
+            == len(self.batch_action_mask)
+            == len(self.batch_entropy_mask)
+            == len(self.batch_credits)
+            == len(self.batch_engine_log_probs)
+            == len(self.batch_timesteps)
+            == self.rollout_ids.shape[0]
+        ), "Jagged lists must all have length equal to batch size."
+        for inp, mask, cred, engine_log_prob, timestep in zip(
+            self.batch_input_ids,
+            self.batch_action_mask,
+            self.batch_credits,
+            self.batch_engine_log_probs,
+            self.batch_timesteps,
+        ):
+            assert (
+                inp.shape[0]
+                == mask.shape[0]
+                == cred.shape[0]
+                == engine_log_prob.shape[0]
+                == timestep.shape[0]
+            ), "Tensors must have the same shapes along the jagged dimension."
+    def __getitem__(self, key) -> "TrainingBatch":
+        if isinstance(key, slice):
+            return TrainingBatch(
+                rollout_ids=self.rollout_ids.__getitem__(key),
+                batch_input_ids=self.batch_input_ids[key],
+                batch_action_mask=self.batch_action_mask[key],
+                batch_entropy_mask=self.batch_entropy_mask[key],
+                batch_credits=self.batch_credits[key],
+                batch_engine_log_probs=self.batch_engine_log_probs[key],
+                batch_timesteps=self.batch_timesteps[key],
+            )
+    def __len__(self):
+        return len(self.batch_input_ids)
+    def to(self, device):
+        self.rollout_ids = self.rollout_ids.to(device)
+        self.batch_input_ids = [t.to(device) for t in self.batch_input_ids]
+        self.batch_action_mask = [t.to(device) for t in self.batch_action_mask]
+        self.batch_entropy_mask = [t.to(device) for t in self.batch_entropy_mask]
+        self.batch_credits = [t.to(device) for t in self.batch_credits]
+        self.batch_engine_log_probs = [
+            t.to(device) for t in self.batch_engine_log_probs
+        ]
+        self.batch_timesteps = [t.to(device) for t in self.batch_timesteps]
+    def get_padded_tensors(self, padding: float = 0.0):
+        """
+        TOWRITE
+        Always pad to the right.
+        """
+        padded_batch_input_ids = pad_sequence(
+            self.batch_input_ids, batch_first=True, padding_value=int(padding)
+        )
+        padded_batch_action_mask = pad_sequence(
+            [m.to(dtype=torch.bool) for m in self.batch_action_mask],
+            batch_first=True,
+            padding_value=False,
+        )
+        padded_batch_entropy_mask = pad_sequence(
+            self.batch_entropy_mask, batch_first=True, padding_value=False
+        )
+        padded_batch_credits = pad_sequence(
+            self.batch_credits, batch_first=True, padding_value=float(padding)
+        )
+        padded_batch_engine_log_probs = pad_sequence(
+            self.batch_engine_log_probs, batch_first=True, padding_value=float(padding)
+        )
+        padded_batch_timesteps = pad_sequence(
+            self.batch_timesteps, batch_first=True, padding_value=0
+        )
+        return PaddedTensorTrainingBatch(
+            padded_batch_input_ids,
+            padded_batch_action_mask,
+            padded_batch_entropy_mask,
+            padded_batch_credits,
+            padded_batch_engine_log_probs,
+            padded_batch_timesteps,
+        )
+    def append(self, other: "TrainingBatch"):
+        self.rollout_ids = torch.cat([self.rollout_ids, other.rollout_ids])
+        self.batch_input_ids.extend(other.batch_input_ids)
+        self.batch_action_mask.extend(other.batch_action_mask)
+        self.batch_entropy_mask.extend(other.batch_entropy_mask)
+        self.batch_credits.extend(other.batch_credits)
+        self.batch_engine_log_probs.extend(other.batch_engine_log_probs)
+        self.batch_timesteps.extend(other.batch_timesteps)
+# Binds the name `timestep` to the builtin `int`. The same binding already
+# appears earlier in this listing. NOTE(review): confirm whether this
+# repetition is intentional.
+timestep = int