Muqeeth committed on Mar 31

Commit

08eb8ef

verified ·

1 Parent(s): 687ffba

Add files using upload-large-folder tool

Browse files

Files changed (50) hide show

.hydra/config.yaml +178 -0
.hydra/hydra.yaml +154 -0
.hydra/overrides.yaml +1 -0
seed_1337/Qwen/Qwen2.5-7B-Instruct/adapters/README.md +207 -0
seed_1337/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_config.json +46 -0
src_code_for_reproducibility/__init__.py +4 -0
src_code_for_reproducibility/chat_utils/__pycache__/apply_template.cpython-312.pyc +0 -0
src_code_for_reproducibility/chat_utils/__pycache__/chat_turn.cpython-312.pyc +0 -0
src_code_for_reproducibility/chat_utils/__pycache__/template_specific.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/ipd/__pycache__/Ipd_hard_coded_agents.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/ipd/__pycache__/__init__.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/ipd/__pycache__/ipd_agent.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/ipd/__pycache__/ipd_simulation.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/ipd/__pycache__/ipd_statistics.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/README.md +27 -0
src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_agent.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_simulation.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/__pycache__/nego_hard_coded_policies.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/__pycache__/no_press_nego_agent.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/__pycache__/no_press_nego_simulation.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/__pycache__/tas_rps_agent.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/dond_agent.py +75 -0
src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py +176 -0
src_code_for_reproducibility/markov_games/negotiation/nego_agent.py +261 -0
src_code_for_reproducibility/markov_games/negotiation/nego_hard_coded_policies.py +70 -0
src_code_for_reproducibility/markov_games/negotiation/nego_simulation.py +252 -0
src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py +249 -0
src_code_for_reproducibility/markov_games/negotiation/no_press_nego_agent.py +108 -0
src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py +182 -0
src_code_for_reproducibility/markov_games/negotiation/tas_agent.py +118 -0
src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py +128 -0
src_code_for_reproducibility/markov_games/negotiation/tas_rps_simulation.py +257 -0
src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc +0 -0
src_code_for_reproducibility/models/__pycache__/adapter_training_wrapper.cpython-312.pyc +0 -0
src_code_for_reproducibility/models/__pycache__/human_policy.cpython-312.pyc +0 -0
src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc +0 -0
src_code_for_reproducibility/models/__pycache__/inference_backend_dummy.cpython-312.pyc +0 -0
src_code_for_reproducibility/models/__pycache__/inference_backend_vllm.cpython-312.pyc +0 -0
src_code_for_reproducibility/models/__pycache__/large_language_model_api.cpython-312.pyc +0 -0
src_code_for_reproducibility/models/__pycache__/large_language_model_local.cpython-312.pyc +0 -0
src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-312.pyc +0 -0
src_code_for_reproducibility/training/trainer_ad_align.py +505 -0
src_code_for_reproducibility/utils/dict_get_path.py +17 -0
src_code_for_reproducibility/utils/gather_training_stats.py +262 -0
src_code_for_reproducibility/utils/resource_context.py +83 -0
src_code_for_reproducibility/utils/rollout_tree_chat_htmls.py +1597 -0
src_code_for_reproducibility/utils/rollout_tree_gather_utils.py +314 -0
src_code_for_reproducibility/utils/short_id_gen.py +16 -0
src_code_for_reproducibility/utils/stat_pack.py +117 -0
src_code_for_reproducibility/utils/wandb_utils.py +170 -0

.hydra/config.yaml ADDED Viewed

	@@ -0,0 +1,178 @@

+experiment:
+  wandb_enabled: true
+  nb_epochs: 3000
+  nb_matches_per_iteration: 64
+  reinit_matches_each_it: true
+  checkpoint_every_n_iterations: 50
+  start_epoch: 0
+  resume_experiment: true
+  base_seed: 1337
+  seed_group_size: 8
+  train: true
+  stat_methods_for_live_wandb: mllm.markov_games.negotiation.negotiation_statistics
+  name: split_no_comm_vanilla_ad_align_no_agent_buffer_seed1337
+  agent_buffer: false
+  keep_agent_buffer_count: ${lora_count}
+  agent_buffer_recent_k: -1
+logging:
+  wandb:
+    enabled: false
+    project: llm-negotiation
+    entity: null
+    mode: online
+    name: null
+    group: null
+    tags: []
+    notes: null
+temperature: 1.0
+markov_games:
+  runner_method_name: LinearRunner
+  runner_kwargs: {}
+  group_by_round: true
+  simulation_class_name: NoPressSimulation
+  simulation_init_args:
+    nb_of_rounds: 10
+    quota_messages_per_agent_per_round: 0
+    game_type: 10-1-ties
+    atleast_one_conflict: true
+    item_types:
+    - hats
+    - books
+    - balls
+  agents:
+    0:
+      agent_id: ${agent_0_id}
+      agent_name: Alice
+      agent_class_name: NoPressAgent
+      policy_id: base_llm/agent_adapter
+      init_kwargs:
+        goal: Maximize your total points over the whole game.
+    1:
+      agent_id: ${agent_1_id}
+      agent_name: Bob
+      agent_class_name: NoPressAgent
+      policy_id: base_llm/agent_adapter
+      init_kwargs:
+        goal: Maximize your total points over the whole game.
+models:
+  base_llm:
+    class: LeanLocalLLM
+    init_args:
+      llm_id: base_llm
+      model_name: Qwen/Qwen2.5-7B-Instruct
+      inference_backend: vllm
+      hf_kwargs:
+        device_map: auto
+        torch_dtype: bfloat16
+        max_memory:
+          0: 20GiB
+        attn_implementation: flash_attention_2
+      inference_backend_init_kwargs:
+        enable_lora: true
+        seed: ${experiment.base_seed}
+        enable_prefix_caching: true
+        max_model_len: 10000.0
+        gpu_memory_utilization: 0.5
+        dtype: bfloat16
+        trust_remote_code: true
+        max_lora_rank: 32
+        enforce_eager: false
+        max_loras: ${lora_count}
+        max_cpu_loras: ${lora_count}
+        enable_sleep_mode: true
+      inference_backend_sampling_params:
+        temperature: ${temperature}
+        top_p: 1.0
+        max_tokens: 400
+        top_k: -1
+        logprobs: 0
+      adapter_configs:
+        agent_adapter:
+          task_type: CAUSAL_LM
+          r: 32
+          lora_alpha: 64
+          lora_dropout: 0.0
+          target_modules: all-linear
+        critic_adapter:
+          task_type: CAUSAL_LM
+          r: 32
+          lora_alpha: 64
+          lora_dropout: 0.0
+          target_modules: all-linear
+      enable_thinking: null
+      regex_max_attempts: 3
+critics:
+  agent_critic:
+    module_pointer:
+    - base_llm
+    - critic_adapter
+optimizers:
+  agent_optimizer:
+    module_pointer:
+    - base_llm
+    - agent_adapter
+    optimizer_class_name: torch.optim.Adam
+    init_args:
+      lr: 3.0e-06
+      weight_decay: 0.0
+  critic_optimizer:
+    module_pointer: agent_critic
+    optimizer_class_name: torch.optim.Adam
+    init_args:
+      lr: 3.0e-06
+      weight_decay: 0.0
+trainers:
+  agent_trainer:
+    class: TrainerAdAlign
+    module_pointers:
+      policy:
+      - base_llm
+      - agent_adapter
+      policy_optimizer: agent_optimizer
+      critic: agent_critic
+      critic_optimizer: critic_optimizer
+    kwargs:
+      entropy_coeff: 0.0
+      entropy_topk: null
+      entropy_mask_regex: null
+      kl_coeff: 0.001
+      gradient_clipping: 1.0
+      restrict_tokens: null
+      mini_batch_size: 1
+      use_gradient_checkpointing: false
+      temperature: ${temperature}
+      device: cuda:0
+      use_gae: false
+      whiten_advantages: false
+      whiten_advantages_time_step_wise: false
+      skip_discounted_state_visitation: true
+      use_gae_lambda_annealing: false
+      gae_lambda_annealing_method: None
+      gae_lambda_annealing_method_params: None
+      gae_lambda_annealing_limit: 0.95
+      discount_factor: 0.9
+      use_rloo: true
+      enable_tokenwise_logging: false
+      pg_loss_normalization: nb_tokens
+      truncated_importance_sampling_ratio_cap: 2.0
+      reward_normalizing_constant: 100.0
+      ad_align_force_coop_first_step: false
+      ad_align_clipping: null
+      ad_align_gamma: 0.9
+      ad_align_exclude_k_equals_t: true
+      ad_align_use_sign: false
+      ad_align_beta: 1.0
+      use_old_ad_align: true
+      use_time_regularization: false
+      rloo_branch: false
+      reuse_baseline: false
+train_on_which_data:
+  agent_trainer: ${agent_ids}
+lora_count: 30
+common_agent_kwargs:
+  goal: Maximize your total points over the whole game.
+agent_0_id: Alice
+agent_1_id: Bob
+agent_ids:
+- Alice
+- Bob

.hydra/hydra.yaml ADDED Viewed

	@@ -0,0 +1,154 @@

+hydra:
+  run:
+    dir: ${oc.env:SCRATCH}/llm_negotiation/${now:%Y_%m}/${experiment.name}
+  sweep:
+    dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
+    subdir: ${hydra.job.num}
+  launcher:
+    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
+  sweeper:
+    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
+    max_batch_size: null
+    params: null
+  help:
+    app_name: ${hydra.job.name}
+    header: '${hydra.help.app_name} is powered by Hydra.
+      '
+    footer: 'Powered by Hydra (https://hydra.cc)
+      Use --hydra-help to view Hydra specific help
+      '
+    template: '${hydra.help.header}
+      == Configuration groups ==
+      Compose your configuration from those groups (group=option)
+      $APP_CONFIG_GROUPS
+      == Config ==
+      Override anything in the config (foo.bar=value)
+      $CONFIG
+      ${hydra.help.footer}
+      '
+  hydra_help:
+    template: 'Hydra (${hydra.runtime.version})
+      See https://hydra.cc for more info.
+      == Flags ==
+      $FLAGS_HELP
+      == Configuration groups ==
+      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
+      to command line)
+      $HYDRA_CONFIG_GROUPS
+      Use ''--cfg hydra'' to Show the Hydra config.
+      '
+    hydra_help: ???
+  hydra_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][HYDRA] %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+    root:
+      level: INFO
+      handlers:
+      - console
+    loggers:
+      logging_example:
+        level: DEBUG
+    disable_existing_loggers: false
+  job_logging:
+    version: 1
+    formatters:
+      simple:
+        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
+    handlers:
+      console:
+        class: logging.StreamHandler
+        formatter: simple
+        stream: ext://sys.stdout
+      file:
+        class: logging.FileHandler
+        formatter: simple
+        filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
+    root:
+      level: INFO
+      handlers:
+      - console
+      - file
+    disable_existing_loggers: false
+  env: {}
+  mode: RUN
+  searchpath: []
+  callbacks: {}
+  output_subdir: .hydra
+  overrides:
+    hydra:
+    - hydra.mode=RUN
+    task: []
+  job:
+    name: run
+    chdir: false
+    override_dirname: ''
+    id: ???
+    num: ???
+    config_name: split_no_comm_vanilla_ad_align_no_agent_buffer_seed1337.yaml
+    env_set: {}
+    env_copy: []
+    config:
+      override_dirname:
+        kv_sep: '='
+        item_sep: ','
+        exclude_keys: []
+  runtime:
+    version: 1.3.2
+    version_base: '1.1'
+    cwd: /lustre10/scratch/muqeeth/AdAlignLLM
+    config_sources:
+    - path: hydra.conf
+      schema: pkg
+      provider: hydra
+    - path: /lustre10/scratch/muqeeth/AdAlignLLM/configs
+      schema: file
+      provider: main
+    - path: ''
+      schema: structured
+      provider: schema
+    output_dir: /scratch/muqeeth/llm_negotiation/2026_03/split_no_comm_vanilla_ad_align_no_agent_buffer_seed1337
+    choices:
+      hydra/env: default
+      hydra/callbacks: null
+      hydra/job_logging: default
+      hydra/hydra_logging: default
+      hydra/hydra_help: default
+      hydra/help: default
+      hydra/sweeper: basic
+      hydra/launcher: basic
+      hydra/output: default
+  verbose: false

.hydra/overrides.yaml ADDED Viewed

	@@ -0,0 +1 @@


1	+ []

seed_1337/Qwen/Qwen2.5-7B-Instruct/adapters/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: Qwen/Qwen2.5-7B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-7B-Instruct
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

seed_1337/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.0,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "up_proj",
+    "q_proj",
+    "gate_proj",
+    "down_proj",
+    "k_proj",
+    "v_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

src_code_for_reproducibility/__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+"""
+File: mllm/__init__.py
+Summary: Initializes the multi-agent large language model package namespace.
+"""

src_code_for_reproducibility/chat_utils/__pycache__/apply_template.cpython-312.pyc ADDED Viewed

Binary file (4.13 kB). View file

src_code_for_reproducibility/chat_utils/__pycache__/chat_turn.cpython-312.pyc ADDED Viewed

Binary file (1.46 kB). View file

src_code_for_reproducibility/chat_utils/__pycache__/template_specific.cpython-312.pyc ADDED Viewed

Binary file (4.4 kB). View file

src_code_for_reproducibility/markov_games/ipd/__pycache__/Ipd_hard_coded_agents.cpython-312.pyc ADDED Viewed

Binary file (3.05 kB). View file

src_code_for_reproducibility/markov_games/ipd/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (435 Bytes). View file

src_code_for_reproducibility/markov_games/ipd/__pycache__/ipd_agent.cpython-312.pyc ADDED Viewed

Binary file (4.97 kB). View file

src_code_for_reproducibility/markov_games/ipd/__pycache__/ipd_simulation.cpython-312.pyc ADDED Viewed

Binary file (6.87 kB). View file

src_code_for_reproducibility/markov_games/ipd/__pycache__/ipd_statistics.cpython-312.pyc ADDED Viewed

Binary file (1.42 kB). View file

src_code_for_reproducibility/markov_games/negotiation/README.md ADDED Viewed

	@@ -0,0 +1,27 @@

+## Negotiation Games: core mechanics and variants
+This family of games features two agents who, in each round, may briefly communicate and then simultaneously propose how to split a fixed resource (most commonly 10 coins). Rewards are the amount kept multiplied by an agent’s per-unit value. The starting speaker alternates deterministically across rounds.
+Communication is optional and variant-dependent: some settings encourage rich messaging to share private information, while others remove messaging entirely to focus on allocation behavior.
+Proportional splitting is used when the two proposals exceed the available total: allocations are scaled proportionally rather than discarded. This preserves a useful learning signal even when agents over-claim.
+### Variants (in increasing difficulty)
+- No‑Press Split
+  - Multiple item types (e.g., hats, balls, books)
+  - The item values for each agent are public.
+  - No communication; agents go straight to making split proposals.
+  - Motivation: mirrors no‑communication setups (e.g., Advantage Alignment) while keeping the split decision nontrivial.
+- Trust-and-Split RPS (TAS-RPS)
+  - Single item type (coins)
+  - Each round, a rock–paper–scissors hand draw creates a strong asymmetry: the winner’s per-coin value is 10, the loser’s is 1.
+  - Each agent initially sees only their own hand and must communicate to coordinate an optimal split.
+  - Motivation: enforce large value disparity so one’s own value reveals little about the other’s (avoiding ceiling effects) and incentivize meaningful communication.

src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_agent.cpython-312.pyc ADDED Viewed

Binary file (4.66 kB). View file

src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_simulation.cpython-312.pyc ADDED Viewed

Binary file (10.7 kB). View file

src_code_for_reproducibility/markov_games/negotiation/__pycache__/nego_hard_coded_policies.cpython-312.pyc ADDED Viewed

Binary file (3.39 kB). View file

src_code_for_reproducibility/markov_games/negotiation/__pycache__/no_press_nego_agent.cpython-312.pyc ADDED Viewed

Binary file (6.11 kB). View file

src_code_for_reproducibility/markov_games/negotiation/__pycache__/no_press_nego_simulation.cpython-312.pyc ADDED Viewed

Binary file (9.72 kB). View file

src_code_for_reproducibility/markov_games/negotiation/__pycache__/tas_rps_agent.cpython-312.pyc ADDED Viewed

Binary file (6.05 kB). View file

src_code_for_reproducibility/markov_games/negotiation/dond_agent.py ADDED Viewed

	@@ -0,0 +1,75 @@

+"""
+File: mllm/markov_games/negotiation/dond_agent.py
+Summary: Agent implementation for Deal-or-No-Deal style negotiations.
+"""
+import copy
+import re
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any, Dict, List, Tuple
+from mllm.markov_games.agent import Agent
+from mllm.markov_games.negotiation.dond_simulation import DealNoDealObs
+from mllm.markov_games.negotiation.nego_agent import (
+    NegotiationAgent,
+    NegotiationAgentState,
+)
+from mllm.markov_games.negotiation.nego_simulation import Split
+from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
+class DealNoDealAgent(NegotiationAgent):
+    """NegotiationAgent tailored to the Deal-or-No-Deal stock/value revelation rules."""
+    def __init__(
+        self,
+        *args,
+        **kwargs,
+    ):
+        super().__init__(*args, **kwargs)
+        self.intro_prompt = (
+            "You are {agent_id}. You are playing an iterated game. "
+            "At each round, you and other agent will try to distribute among yourselves items of types {item_types}. "
+            "You only know how much you value each item type, but not the other agent's values. "
+            "You can communicate with the other agent by sending up to {quota_messages_per_agent_per_round} short messages per round. "
+            "Each round, after exchanging messages, you and the other agent will submit a private proposal. "
+            "A deal is accepted only if both proposals match exactly and are within stock; otherwise no deal (0 points for both at that round). "
+            "The values of the items of the other agent at the previous round are revealed to you after each round. "
+            "Your goal is: {goal}."
+        )
+        self.new_round_prompt = (
+            "New round {round_nb}. Items: {stock}. Your values: {values}. "
+        )
+        self.last_round_prompt = (
+            "Last round, other agent's values: {previous_values_coagent}. "
+        )
+        self.send_split_prompt = "Respond with <split>...</split> where you propose how many items of each type you want to keep."
+    def get_message_regex(self, observation: DealNoDealObs) -> str:
+        """Allow short XML messages (<400 chars) between proposal phases."""
+        return r"<message>[\s\S]{0,400}</message>"
+    def get_split_regex(self, observation: DealNoDealObs) -> str:
+        """Constrain split proposals to per-item XML tags bounded by the current stock."""
+        parts = []
+        for t in observation.item_types:
+            s = int(observation.quantities.get(t, 0))
+            allowed = "|".join(str(k) for k in range(0, s + 1))
+            rng = f"({allowed})"
+            parts.append(rf"<{t}>{rng}</{t}>")
+        items_block = "".join(parts)
+        return rf"(<split>{items_block}</split>)"
+    def get_split_action(self, policy_output: str, observation: DealNoDealObs) -> Split:
+        """Convert the XML proposal into a Split dataclass understood by the simulator."""
+        import re as _re
+        allocations: Dict[str, int] = {}
+        for t in observation.item_types:
+            m = _re.search(rf"<{t}>([0-9]+)</{t}>", policy_output)
+            if m:
+                allocations[t] = int(m.group(1))
+            else:
+                allocations[t] = 0
+        return Split(items_given_to_self=allocations)

src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py ADDED Viewed

	@@ -0,0 +1,176 @@

+"""
+File: mllm/markov_games/negotiation/dond_simulation.py
+Summary: Simulates Deal-or-No-Deal negotiation games and logs rollouts.
+"""
+import copy
+from dataclasses import dataclass
+from typing import Any, Dict, List, Tuple
+from numpy.random import default_rng
+from mllm.markov_games.negotiation.nego_simulation import (
+    NegotiationObs,
+    NegotiationSimulation,
+    NegotiationState,
+    Split,
+)
+from mllm.markov_games.rollout_tree import SimulationStepLog
+from mllm.utils.get_coagent_id import get_coagent_id
+AgentId = str
+@dataclass
+class DealNoDealState(NegotiationState):
+    """NegotiationState with per-agent value tables and item taxonomy."""
+    # Names of the item types being split.
+    item_types: List[str]
+    # Value table of each agent: agent id -> {item type -> integer value}.
+    values: Dict[AgentId, Dict[str, int]]
+@dataclass
+class DealNoDealObs(NegotiationObs):
+    """Observation that reveals own values and (lagged) opponent values."""
+    # Values of the observing agent: item type -> integer value.
+    my_values: Dict[str, int]
+    # Names of the item types being split.
+    item_types: List[str]
+    # Values of the other agent (item type -> integer value), or None.
+    # NOTE(review): DealNoDealSimulation.get_obs_agent fills this from the
+    # current state values, so it is not lagged there -- confirm intent.
+    previous_values_coagent: Dict[str, int] | None
+def random_partition_integer(rng, total: int, parts: int) -> List[int]:
+    """Sample non-negative integers summing to ``total`` across ``parts`` buckets.
+
+    Args:
+        rng: Random generator exposing ``integers(low, high, size=...)``;
+            presumably a ``numpy.random.Generator`` -- confirm against callers.
+        total: Amount to distribute.
+        parts: Number of buckets.
+
+    Returns:
+        A list of ``parts`` integers. It is empty when ``parts <= 0`` and all
+        zeros when ``total <= 0``.
+    """
+    if parts <= 0:
+        return []
+    if total <= 0:
+        return [0 for _ in range(parts)]
+    # Draw ``parts - 1`` cut points in a single call and sort them; the gaps
+    # between consecutive cuts (closed by ``total``) are the bucket sizes.
+    cuts = sorted(rng.integers(0, total + 1, size=parts - 1).tolist())
+    vals = []
+    prev = 0
+    for c in cuts + [total]:
+        # Equal consecutive cuts yield an empty bucket.
+        vals.append(c - prev)
+        prev = c
+    return vals
+class DealNoDealSimulation(NegotiationSimulation):
+    """NegotiationSimulation variant implementing the Rubinstein-style Deal-or-No-Deal."""
+    def __init__(
+        self,
+        item_types: List[str] = ["books", "hats", "balls"],
+        *args,
+        **kwargs,
+    ):
+        super().__init__(item_types=item_types, *args, **kwargs)
+        self.reset()
+    def _other(self, agent_id: AgentId) -> AgentId:
+        return get_coagent_id(self.agent_ids, agent_id)
+    def _sample_stock(self) -> Dict[str, int]:
+        # total items between 5 and 7
+        total_items = int(self.rng.integers(5, 8))
+        # nonnegative per-type counts summing to total_items
+        parts = random_partition_integer(self.rng, total_items, len(self.item_types))
+        # allow zeros per type
+        return {t: int(c) for t, c in zip(self.item_types, parts)}
+    def _sample_values_pair(self) -> Dict[AgentId, Dict[str, int]]:
+        # Each agent has integer non-negative values that sum to 10
+        # Each item type valued by at least one agent
+        # Some item type valued by both agents
+        while True:
+            vals_a = random_partition_integer(self.rng, 10, len(self.item_types))
+            vals_b = random_partition_integer(self.rng, 10, len(self.item_types))
+            a = {t: int(v) for t, v in zip(self.item_types, vals_a)}
+            b = {t: int(v) for t, v in zip(self.item_types, vals_b)}
+            # each item valued by at least one
+            ok1 = all((a[t] > 0) or (b[t] > 0) for t in self.item_types)
+            # some item valued by both
+            ok2 = any((a[t] > 0) and (b[t] > 0) for t in self.item_types)
+            if ok1 and ok2:
+                return {self.agent_ids[0]: a, self.agent_ids[1]: b}
+    def _is_valid_allocation(
+        self, allocation: Dict[str, int], stock: Dict[str, int]
+    ) -> bool:
+        for t in self.item_types:
+            v = allocation.get(t)
+            if v is None:
+                return False
+            if not isinstance(v, int):
+                return False
+            if v < 0 or v > int(stock.get(t, 0)):
+                return False
+        return True
+    def set_new_round_of_variant(self):
+        # Keep same values, resample stock
+        self.state.quantities = self._sample_stock()
+    def get_info_of_variant(
+        self, state: NegotiationState, actions: Dict[AgentId, Any]
+    ) -> Dict[str, Any]:
+        return {
+            "quantities": copy.deepcopy(state.quantities),
+            "values": copy.deepcopy(state.values),
+            "splits": copy.deepcopy(state.splits),
+        }
+    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
+        """
+        Returns the rewards for each agent.
+        """
+        split_a = splits[self.agent_ids[0]].items_given_to_self
+        split_b = splits[self.agent_ids[1]].items_given_to_self
+        rewards = {self.agent_ids[0]: 0, self.agent_ids[1]: 0}
+        for t in self.item_types:
+            # If not complementary, return 0!
+            if not split_a[t] + split_b[t] == self.state.quantities[t]:
+                return {self.agent_ids[0]: 0, self.agent_ids[1]: 0}
+            rewards[self.agent_ids[0]] += (
+                split_a[t] * self.state.values[self.agent_ids[0]][t]
+            )
+            rewards[self.agent_ids[1]] += (
+                split_b[t] * self.state.values[self.agent_ids[1]][t]
+            )
+        return rewards
+    def get_obs(self):
+        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}
+    def get_obs_agent(self, agent_id):
+        other_id = self._other(agent_id)
+        obs = DealNoDealObs(
+            round_nb=self.state.round_nb,
+            last_message=self.state.last_message,
+            current_agent=self.state.current_agent,
+            quantities=copy.deepcopy(self.state.quantities),
+            value=0.0,  # unused in DOND
+            other_agent_split=None,  # not meaningful until split
+            split_phase=self.state.split_phase,
+            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
+            my_values=copy.deepcopy(self.state.values[agent_id]),
+            item_types=list(self.item_types),
+            previous_values_coagent=copy.deepcopy(self.state.values.get(other_id, {})),
+        )
+        return obs
+    def reset(self):
+        start_agent = self.agent_ids[self._starting_agent_index]
+        stock = self._sample_stock()
+        values = self._sample_values_pair()
+        self.state = DealNoDealState(
+            round_nb=0,
+            last_message="",
+            current_agent=start_agent,
+            quantities=stock,
+            values=values,
+            previous_values=None,
+            splits={aid: None for aid in self.agent_ids},
+            nb_messages_sent={aid: 0 for aid in self.agent_ids},
+            split_phase=False,
+            item_types=list(self.item_types),
+        )
+        return self.get_obs()

src_code_for_reproducibility/markov_games/negotiation/nego_agent.py ADDED Viewed

	@@ -0,0 +1,261 @@

+"""
+File: mllm/markov_games/negotiation/nego_agent.py
+Summary: General-purpose negotiation agent coordinating prompts and actions.
+"""
+import copy
+from abc import abstractmethod
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any, Dict, List, Tuple
+import numpy as np
+from mllm.markov_games.agent import Agent
+from mllm.markov_games.negotiation.nego_simulation import Message, NegotiationObs, Split
+from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
+@dataclass
+class NegotiationAgentState:
+    """Lightweight container tracking round progression and message history."""
+    # Last round index seen by the agent; ``act`` updates it when the
+    # observation reports a later round.
+    round_nb: int
+    # Messages this agent has sent in the current round; zeroed at each new
+    # round and incremented after every message returned by ``act``.
+    nb_messages_sent_this_round: int
+    # Length of ``chat_history`` at the end of the previous ``act`` call;
+    # used as the slice start for the turns reported in the next log.
+    chat_counter: int
+    # Every user/assistant turn appended by ``act`` since the last reset.
+    chat_history: List[ChatTurn]
+class NegotiationAgent(Agent):
+    """Base agent that manages prompt scaffolding and regex validation for variants."""
+    def __init__(
+        self,
+        seed: int,
+        agent_id: str,
+        agent_name: str,
+        policy: Callable[[List[Dict]], str],
+        goal: str,
+        exploration_prompts: List[str] = [],
+        exploration_prompt_probs: List[float] = [],
+    ):
+        self.seed = seed
+        self.agent_id = agent_id
+        self.agent_name = agent_name
+        self.policy = policy
+        self.goal = goal
+        self.exploration_prompts_toggled = len(exploration_prompts) > 0
+        if self.exploration_prompts_toggled:
+            exploration_prompts = copy.deepcopy(exploration_prompts)
+            exploration_prompts.append(None)
+            self.exploration_prompts = exploration_prompts
+            self.exploration_prompt_probs = np.array(exploration_prompt_probs)
+            assert self.exploration_prompt_probs.sum() <= 1
+            assert np.all(self.exploration_prompt_probs >= 0)
+            self.exploration_prompt_probs = np.append(
+                self.exploration_prompt_probs, 1 - self.exploration_prompt_probs.sum()
+            )
+        self.state = NegotiationAgentState(
+            round_nb=0, nb_messages_sent_this_round=0, chat_counter=0, chat_history=[]
+        )
+        # Implemented in variants
+        self.intro_prompt = ""
+        self.new_round_prompt = ""
+        self.last_round_prompt = ""
+        self.send_split_prompt = ""
+        self.wait_for_message_prompt = ""
+        self.last_message_prompt = ""
+        self.send_message_prompt = ""
+    @abstractmethod
+    def get_message_regex(self, observation: NegotiationObs) -> str:
+        """Return the regex that outgoing chat messages must satisfy."""
+        pass
+    @abstractmethod
+    def get_split_regex(self, observation: NegotiationObs) -> str:
+        """Return the regex that final split proposals must satisfy."""
+        pass
+    @abstractmethod
+    def get_split_action(
+        self, policy_output: str, observation: NegotiationObs
+    ) -> Split:
+        """Convert raw LLM output into the ``Split`` structure required by simulations."""
+        pass
+    async def act(self, observation: NegotiationObs) -> Tuple[Any, AgentActLog]:
+        """
+        Assemble the appropriate prompt, query the policy, and return message or split.
+        This handles intro text, new-round reminders, quota tracking, and post-processing
+        (regex enforcement + ChatTurn logging) so subclasses only customize prompts/regexes.
+        """
+        def dict_to_str(d: dict) -> str:
+            return ", ".join(f"{v} {k}" for k, v in d.items())
+        def dict_to_eq_str(d: dict) -> str:
+            return ", ".join(f"{k}={v}" for k, v in d.items())
+        is_our_turn = observation.current_agent == self.agent_id
+        action: Any = None
+        round_nb = observation.round_nb
+        prompt_parts: List[str] = []
+        obs_ctx = vars(observation)
+        obs_ctx_formmated = obs_ctx.copy()
+        for key in obs_ctx_formmated:
+            if isinstance(obs_ctx_formmated[key], dict) and "value" not in key:
+                obs_ctx_formmated[key] = dict_to_str(obs_ctx_formmated[key])
+            elif isinstance(obs_ctx_formmated[key], dict) and "value" in key:
+                obs_ctx_formmated[key] = dict_to_eq_str(obs_ctx_formmated[key])
+        #######################################
+        # build user prompt
+        #######################################
+        # First-ever call
+        is_intro = round_nb == 0 and self.state.chat_counter == 0
+        if is_intro:
+            prompt_parts.append(
+                self.intro_prompt.format(
+                    goal=self.goal, agent=self.agent_name, **obs_ctx_formmated
+                )
+            )
+        # New round
+        is_new_round = round_nb > self.state.round_nb
+        if is_new_round or is_intro:
+            self.state.nb_messages_sent_this_round = 0
+            if not is_intro:
+                prompt_parts.append(self.last_round_prompt.format(**obs_ctx_formmated))
+            prompt_parts.append(self.new_round_prompt.format(**obs_ctx_formmated))
+            if self.exploration_prompts_toggled:
+                exploration_prompt = self.exploration_prompts[
+                    np.random.choice(
+                        len(self.exploration_prompts), p=self.exploration_prompt_probs
+                    )
+                ]
+                if exploration_prompt is not None:
+                    prompt_parts.append(exploration_prompt)
+            self.state.round_nb = round_nb
+        # Wait for message
+        if not is_our_turn and not observation.split_phase:
+            prompt_parts.append(
+                self.wait_for_message_prompt.format(**obs_ctx_formmated)
+            )
+        # Get last message
+        if is_our_turn and not is_new_round and not is_intro:
+            prompt_parts.append(self.last_message_prompt.format(**obs_ctx_formmated))
+        # Prompt to send message
+        must_send_message = not observation.split_phase and is_our_turn
+        if must_send_message:
+            prompt_parts.append(self.send_message_prompt.format(**obs_ctx_formmated))
+        # Prompt to give split
+        must_send_split = not must_send_message and observation.split_phase
+        if must_send_split:
+            var_names = ["x", "y", "z", "w"]  # Extend as needed
+            items_str = ", ".join(
+                [
+                    f"{var_names[i]} {item}"
+                    for i, item in enumerate(obs_ctx["quantities"].keys())
+                ]
+            )
+            ranges_str = ", ".join(
+                [
+                    f"{var_names[i]}: 0-{obs_ctx['quantities'][item]} (integer)"
+                    for i, item in enumerate(obs_ctx["quantities"].keys())
+                ]
+            )
+            proposal_style = f"Proposal: {items_str} where {ranges_str}."
+            proposal_style2 = (
+                f"<items_to_self> {items_str} </items_to_self> where {ranges_str}."
+            )
+            prompt_parts.append(
+                self.send_split_prompt.format(
+                    proposal_style=proposal_style,
+                    proposal_style2=proposal_style2,
+                    **obs_ctx_formmated,
+                )
+            )
+        # Append one ChatTurn with is_state_end=True
+        user_prompt = "\n".join(prompt_parts)
+        self.state.chat_history.append(
+            ChatTurn(
+                agent_id=self.agent_id,
+                role="user",
+                content=user_prompt,
+                is_state_end=True,
+            )
+        )
+        #######################################
+        # Get policy action
+        #######################################
+        # Query policy for the appropriate format
+        if must_send_message:
+            return_regex = self.get_message_regex(observation)
+            policy_output = await self.policy(
+                state=self.state.chat_history,
+                agent_id=self.agent_id,
+                regex=return_regex,
+            )
+            self.state.chat_history.append(
+                ChatTurn(
+                    agent_id=self.agent_id,
+                    role="assistant",
+                    content=policy_output.content,
+                    reasoning_content=policy_output.reasoning_content,
+                    log_probs=policy_output.log_probs,
+                    out_token_ids=policy_output.out_token_ids,
+                    is_state_end=False,
+                )
+            )
+            action = Message(message=policy_output.content)
+            self.state.nb_messages_sent_this_round += 1
+        elif must_send_split:
+            return_regex = self.get_split_regex(observation)
+            policy_output = await self.policy(
+                state=self.state.chat_history,
+                agent_id=self.agent_id,
+                regex=return_regex,
+            )
+            self.state.chat_history.append(
+                ChatTurn(
+                    agent_id=self.agent_id,
+                    role="assistant",
+                    content=policy_output.content,
+                    reasoning_content=policy_output.reasoning_content,
+                    log_probs=policy_output.log_probs,
+                    out_token_ids=policy_output.out_token_ids,
+                    is_state_end=False,
+                )
+            )
+            action = self.get_split_action(policy_output.content, observation)
+        else:
+            action = None
+        agent_step_log = AgentActLog(
+            chat_turns=self.state.chat_history[self.state.chat_counter :], info=None
+        )
+        self.state.chat_counter = len(self.state.chat_history)
+        return action, agent_step_log
+    def get_safe_copy(self):
+        agent_copy = copy.copy(self)
+        agent_copy.state = copy.deepcopy(self.state)
+        return agent_copy
+    def reset(self):
+        self.state = NegotiationAgentState(
+            round_nb=0, nb_messages_sent_this_round=0, chat_counter=0, chat_history=[]
+        )

src_code_for_reproducibility/markov_games/negotiation/nego_hard_coded_policies.py ADDED Viewed

	@@ -0,0 +1,70 @@

+"""
+File: mllm/markov_games/negotiation/nego_hard_coded_policies.py
+Summary: Provides deterministic negotiation policies for testing and baselines.
+"""
+import asyncio
+from typing import Any, Optional, Tuple
+from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
+from mllm.markov_games.negotiation.nego_simulation import Split
+from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent
+from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs
+from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
+class HardCodedNegoWelfareMaximizingPolicy(NoPressAgent):
+    async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
+        """
+        Policy that gives all of the items to the agent who values them more.
+        If the items are equally valued, give them to the agent who values them more.
+        """
+        quantities = observation.quantities
+        my_values = observation.value
+        other_values = observation.other_value
+        items_given_to_self = {}
+        for item, qty in quantities.items():
+            my_v = float(my_values.get(item, 0))
+            other_v = float(other_values.get(item, 0))
+            if my_v == other_v:
+                items_given_to_self[item] = int(qty) / 2
+            else:
+                items_given_to_self[item] = int(qty if my_v > other_v else 0)
+        action = Split(items_given_to_self=items_given_to_self)
+        act_log = AgentActLog(
+            chat_turns=[
+                ChatTurn(
+                    agent_id=self.agent_id,
+                    role="assistant",
+                    content="Using welfare-maximizing split (all to higher-value agent).",
+                    is_state_end=True,
+                )
+            ],
+            info=None,
+        )
+        return action, act_log
+class HardCodedNegoGreedyPolicy(NoPressAgent):
+    async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
+        """
+        Always gives itself all of the items.
+        """
+        quantities = observation.quantities
+        items_given_to_self = {item: int(qty) for item, qty in quantities.items()}
+        action = Split(items_given_to_self=items_given_to_self)
+        act_log = AgentActLog(
+            chat_turns=[
+                ChatTurn(
+                    agent_id=self.agent_id,
+                    role="assistant",
+                    content="Using greedy split (keep all items).",
+                    is_state_end=True,
+                )
+            ],
+            info=None,
+        )
+        return action, act_log

src_code_for_reproducibility/markov_games/negotiation/nego_simulation.py ADDED Viewed

	@@ -0,0 +1,252 @@

+"""
+File: mllm/markov_games/negotiation/nego_simulation.py
+Summary: Simulation harness for general negotiation environments.
+"""
+import copy
+from abc import abstractmethod
+from dataclasses import dataclass
+from typing import Any, Dict, List, Tuple
+from numpy.random import default_rng
+from mllm.markov_games.rollout_tree import SimulationStepLog
+from mllm.markov_games.simulation import Simulation
+from mllm.utils.get_coagent_id import get_coagent_id
+AgentId = str
+@dataclass
+class Split:
+    """Structured proposal describing how many units of each item an agent keeps."""
+    # Item name -> number of units the proposing agent claims for itself.
+    items_given_to_self: Dict[str, int]
+@dataclass
+class Message:
+    """Single chat utterance exchanged during the negotiation phase."""
+    # Raw text of the utterance; ``NegotiationSimulation.step`` stores it as
+    # ``state.last_message``.
+    message: str
+@dataclass  # gets extended by variants
+class NegotiationState:
+    """Full simulator state snapshot shared by all negotiation variants."""
+    # Index of the current round; ``step`` increments it once both splits are scored.
+    round_nb: int
+    # Text of the most recent ``Message`` action; cleared to "" when a round ends.
+    last_message: str
+    # Agent whose action ``step`` reads during the message phase.
+    current_agent: AgentId
+    # Units available per item in the current round.
+    quantities: Dict[str, int]
+    # Per-agent item values for the current round.
+    values: Dict[AgentId, Dict[str, float]]
+    # Split recorded for each agent; ``None`` until recorded in the split phase.
+    splits: Dict[AgentId, Split | None]
+    # Messages sent by each agent this round; compared with the per-round quota.
+    nb_messages_sent: Dict[AgentId, int]
+    # The four ``previous_*`` fields are deep copies taken by ``step`` when a
+    # round ends: that round's values, splits, rewards and quantities.
+    previous_values: Dict[AgentId, Dict[str, float]] | None
+    previous_splits: Dict[AgentId, Dict[str, int] | None] | None
+    previous_points: Dict[AgentId, float] | None
+    previous_quantities: Dict[str, int] | None
+    # Set to True by ``step`` once every agent has reached its message quota;
+    # while True, ``step`` expects a ``Split`` from both agents.
+    split_phase: bool
+@dataclass  # gets extended by variants
+class NegotiationObs:
+    """Observation presented to agents each turn (base fields; variants extend)."""
+    # Round index; the agent compares it with its own counter to detect a new round.
+    round_nb: int
+    # presumably the latest message exchanged in the round -- TODO confirm
+    last_message: str
+    # presumably the number of messages each agent may send per round -- TODO confirm
+    quota_messages_per_agent_per_round: int
+    # Agent expected to act; an agent treats the turn as its own when this equals its id.
+    current_agent: AgentId
+    # presumably the co-agent's name or id -- TODO confirm against the variants
+    other_agent: str
+    # Stock per item; the agent uses it to build the split-format hint.
+    quantities: Dict[str, int]
+    item_types: List[str]
+    # presumably this agent's own per-item values -- TODO confirm (annotated
+    # as int here while the simulator state annotates values as float)
+    value: Dict[str, int]
+    # When True the agent is prompted for a split instead of a message.
+    split_phase: bool
+    # The ``last_*`` fields presumably describe the previous round for this
+    # agent / its co-agent and are None before any round has finished -- TODO confirm.
+    last_split_agent: Dict[str, int] | None
+    last_value_agent: Dict[str, int] | None
+    last_points_agent: float | None
+    last_split_coagent: Dict[str, int] | None
+    last_value_coagent: Dict[str, int] | None
+    last_points_coagent: float | None
+    last_quantities: Dict[str, int] | None
+def compute_tas_style_rewards(
+    agent_ids: List[AgentId],
+    values: Dict[AgentId, float],
+    splits: Dict[AgentId, Split],
+    quantities: Dict[str, int],
+) -> Dict[AgentId, float]:
+    """
+    TAS-like reward computation: if sum of proposed coins exceeds max_coins,
+    allocate proportionally. Otherwise, use proposed amounts directly.
+    Rewards are quantity_kept * per-coin value for each agent.
+    """
+    a0, a1 = agent_ids[0], agent_ids[1]
+    r0, r1 = 0.0, 0.0
+    for item in quantities:
+        max_item = quantities[item]
+        item_to_self_0 = int(
+            (splits[a0].items_given_to_self.get(item, 0))
+            if splits[a0] is not None
+            else 0
+        )
+        item_to_self_1 = int(
+            (splits[a1].items_given_to_self.get(item, 0))
+            if splits[a1] is not None
+            else 0
+        )
+        denom = max(int(max_item), item_to_self_0 + item_to_self_1)
+        q0 = float(max_item) * float(item_to_self_0) / float(denom)
+        q1 = float(max_item) * float(item_to_self_1) / float(denom)
+        if type(values[a0]) is not dict:
+            r0 += q0 * float(values[a0])
+            r1 += q1 * float(values[a1])
+        else:
+            r0 += q0 * float(values[a0][item])
+            r1 += q1 * float(values[a1][item])
+    return {a0: r0, a1: r1}
+class NegotiationSimulation(Simulation):
+    def __init__(
+        self,
+        agent_ids: List[AgentId],
+        agent_names: List[str],
+        seed: int,
+        nb_of_rounds: int,
+        quota_messages_per_agent_per_round: int,
+        item_types: List[str] | None = None,
+    ):
+        self.seed = seed
+        self.rng = default_rng(self.seed)
+        self.agent_ids = list(agent_ids)
+        self.agent_names = agent_names
+        self.agent_id_to_name = {
+            agent_id: agent_name for agent_id, agent_name in zip(agent_ids, agent_names)
+        }
+        self.nb_of_rounds = int(nb_of_rounds)
+        self.quota_messages_per_agent_per_round = int(
+            quota_messages_per_agent_per_round
+        )
+        if item_types is not None:
+            self.item_types = [item.lower() for item in item_types]
+        else:
+            self.item_types = ["coins"]
+        self.state: NegotiationState | None = None
+        self._starting_agent_index = self.rng.choice([0, 1])
+        self.reset()
+    def _other(self, agent_id: AgentId) -> AgentId:
+        """Return the co-agent of ``agent_id`` as resolved by ``get_coagent_id`` over ``self.agent_ids``."""
+        return get_coagent_id(self.agent_ids, agent_id)
+    @abstractmethod
+    def set_new_round_of_variant(self):
+        """Variant hook: sample new private values / stock before each round."""
+        pass
+    @abstractmethod
+    def get_info_of_variant(
+        self, state: NegotiationState, actions: Dict[AgentId, Any]
+    ) -> Dict[str, Any]:
+        """Variant hook: populate SimulationStepLog.info with custom diagnostics."""
+        pass
+    def step(self, actions: Any) -> Tuple[bool, SimulationStepLog]:
+        """
+        Returns terminated, step_log
+        """
+        assert self.state is not None
+        current_agent = self.state.current_agent
+        a0, a1 = self.agent_ids[0], self.agent_ids[1]
+        action = actions.get(current_agent)
+        # Split phase: require both splits in the same timestep
+        if self.state.split_phase:
+            action_a0 = actions.get(a0)
+            action_a1 = actions.get(a1)
+            have_both_splits = isinstance(action_a0, Split) and isinstance(
+                action_a1, Split
+            )
+            if not have_both_splits:
+                rewards = {agent_id: 0.0 for agent_id in self.agent_ids}
+                return False, SimulationStepLog(
+                    rewards=rewards, info={"type": "waiting_for_splits"}
+                )
+            # Record splits
+            self.state.splits[a0] = action_a0
+            self.state.splits[a1] = action_a1
+            # Compute rewards and end round
+            rewards = self.get_rewards(self.state.splits)
+            # Info
+            info = self.get_info_of_variant(self.state, actions)
+            # Prepare next round
+            # Alternate starting agent
+            self.state.round_nb += 1
+            self._starting_agent_index = 1 - self._starting_agent_index
+            self.state.current_agent = self.agent_ids[self._starting_agent_index]
+            self.state.previous_values = copy.deepcopy(self.state.values)
+            self.state.previous_splits = copy.deepcopy(self.state.splits)
+            self.state.previous_quantities = copy.deepcopy(self.state.quantities)
+            self.state.previous_points = copy.deepcopy(rewards)
+            self.state.last_message = ""
+            self.set_new_round_of_variant()  # variant specific
+            self.state.splits = {agent_id: None for agent_id in self.agent_ids}
+            self.state.nb_messages_sent = {agent_id: 0 for agent_id in self.agent_ids}
+            is_last_timestep_in_round = True
+            done = self.state.round_nb >= self.nb_of_rounds
+        # Message phase: roll the conversation forward a single turn.
+        elif isinstance(action, Message):
+            self.state.last_message = action.message
+            self.state.nb_messages_sent[current_agent] += 1
+            # Move turn to other agent
+            self.state.current_agent = self._other(current_agent)
+            # If both agents have reached their message quota, enter split phase
+            if all(
+                self.state.nb_messages_sent[agent_id]
+                >= self.quota_messages_per_agent_per_round
+                for agent_id in self.agent_ids
+            ):
+                self.state.split_phase = True
+            is_last_timestep_in_round = False
+            done = False
+            rewards = {agent_id: 0.0 for agent_id in self.agent_ids}
+            info = {"type": "message"}
+        info[
+            "is_last_timestep_in_round"
+        ] = is_last_timestep_in_round  # Used later to group round timesteps if needed
+        return done, SimulationStepLog(rewards=rewards, info=info)
+    def get_obs(self):
+        """Returns all agent observations in dict"""
+        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}
+    @abstractmethod
+    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
+        pass
+    @abstractmethod
+    def get_obs_agent(self, agent_id):
+        pass
+    def get_state(self):
+        """Return the live state object itself (not a copy)."""
+        return self.state
+    def get_safe_copy(self):
+        """Return a safe copy of the simulation."""
+        simulation_copy = copy.copy(self)
+        simulation_copy.state = copy.deepcopy(self.state)
+        return simulation_copy
+    @abstractmethod
+    def reset(self) -> dict[AgentId, NegotiationObs]:
+        pass

src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py ADDED Viewed

	@@ -0,0 +1,249 @@

+"""
+File: mllm/markov_games/negotiation/negotiation_statistics.py
+Summary: Aggregates and reports statistics for negotiation experiments.
+"""
+from __future__ import annotations
+from typing import Callable, Dict, List, Tuple
+from mllm.markov_games.negotiation.nego_simulation import Split
+from mllm.markov_games.rollout_tree import SimulationStepLog
+def avg_reward(sl: SimulationStepLog) -> List[Tuple[str, float]]:
+    """Average (per-step) reward for each agent and overall.
+    What it computes:
+            - Returns the raw reward for every (non-buffer) agent at the current
+                simulation step.
+            - Adds an aggregate key ``all_agents`` which is the simple arithmetic
+                mean across the agents present in ``sl.rewards``.
+    Rationale / motivation:
+            Monitoring the reward stream at each step helps:
+                * Diagnose reward shaping issues (e.g., unintended negative drift).
+                * Provide a fairness snapshot (are rewards systematically skewed?).
+                * Supply a ubiquitous baseline metric used by other higher‑level
+                    summaries (efficiency, surplus allocation, etc.).
+    Return shape:
+            { agent_id: float, ..., "all_agents": float }
+            If any agent id contains the substring "buffer" we treat this step as
+            an implementation artifact (e.g., rollout buffer) and return ``None``
+            to avoid polluting aggregates.
+    """
+    for aid in sl.rewards.keys():
+        if "buffer" in str(aid) and "live" not in str(aid):
+            return None
+    # One value per agent at each step
+    rewards_dict = {f"reward-{aid}": float(v) for aid, v in (sl.rewards or {}).items()}
+    return [(key, value) for key, value in rewards_dict.items() if value is not None]
+def split_efficiency(sl: SimulationStepLog) -> List[Tuple[str, float]] | None:
+    """Final‑round allocation efficiency relative to an upper bound.
+    What it computes (only on the last timestep of a negotiation round):
+            - Uses ``info['values']`` (per‑agent per‑item valuations) and
+                ``info['quantities']`` (available item counts) to form a greedy
+                *upper bound* on achievable total reward: allocate each unit of an
+                item to the single agent who values that item most.
+            - Compares the actually realized sum of rewards at that final
+                timestep to this constructed maximum.
+            - Emits a single scalar under key ``"all_agents"`` equal to
+                achieved / theoretical_max.
+    Motivation:
+            Efficiency (a core welfare notion) distinguishes between coordination
+            failures (low efficiency) versus strategic distributional disputes
+            (high efficiency but uneven splits). Tracking this per round helps
+            evaluate whether models learn to identify and realize joint surplus.
+    Notes / caveats:
+            - Only defined for 2+ non‑buffer agents; if a buffer agent is present
+                returns ``None`` to exclude spurious steps.
+            - Requires the environment to have populated ``values`` and
+                ``quantities``; otherwise returns ``None``.
+            - This is an optimistic bound (not necessarily reachable under
+                protocol constraints) but is simple, fast, and comparable across
+                runs.
+    """
+    info = sl.info or {}
+    if not info or not info.get("is_last_timestep_in_round"):
+        return None
+    quantities = info.get("quantities") or {}
+    values = info.get("values") or {}
+    if not values or not quantities:
+        return None
+    agent_ids = list(sl.rewards.keys())
+    if type(values[agent_ids[0]]) is dict:
+        item_keys = list(values.values())[0].keys()
+        max_vals, max_quantities = [], []
+        for item in item_keys:
+            max_val = max(float(agent_vals[item]) for agent_vals in values.values())
+            max_vals.append(max_val)
+            max_quantities.append(quantities[item])
+    else:
+        max_vals = [max(float(v) for v in values.values())]
+        max_quantities = [quantities[item] for item in quantities.keys()]
+    for aid in sl.rewards.keys():
+        if "buffer" in str(aid) and "live" not in str(aid):
+            return None
+    achieved = sum(float(v) for v in sl.rewards.values())
+    max_reward = sum(d * v for d, v in zip(max_quantities, max_vals))
+    # Efficiency is a global metric; emit same value for a special key "all"
+    return [("split_efficiency", achieved / max_reward)]
+def _extract_items_from_split(raw_split: Dict) -> Dict[str, float] | None:
+    """Return a mapping item->proposal amount from a split structure.
+    Supports both generic negotiation splits with nested structure
+    { 'items_given_to_self': {item: qty, ...}}
+    and TAS coin-only variants which may already be a flat mapping {'coins': qty}.
+    """
+    if raw_split is None:
+        return {}
+    elif isinstance(raw_split, Split):
+        return {k: float(v) for k, v in raw_split.items_given_to_self.items()}
+    elif isinstance(raw_split, dict):
+        if "items_given_to_self" in raw_split and isinstance(
+            raw_split["items_given_to_self"], dict
+        ):
+            return {k: float(v) for k, v in raw_split["items_given_to_self"].items()}
+        # Fallback: assume already flat mapping of items
+        elif hasattr(raw_split, "items_given_to_self"):
+            return {k: float(v) for k, v in raw_split["items_given_to_self"].items()}
+        return {
+            k: float(v) for k, v in raw_split.items() if isinstance(v, (int, float))
+        }
+    return {}
+def _average_proposal_relative_value(
+    sl: SimulationStepLog,
+    metric_name: str,
+    comparator: Callable[[float, float], bool],
+    opposite_comparator: Callable[[float, float], bool],
+) -> Dict[str, float | None] | None:
+    """Shared implementation for proposal size conditioned on relative value.
+    Parameters:
+            comparator: returns True when agent_0's value relation (e.g. < or >)
+                                    to agent_1 holds for an item and we should collect agent_0's
+                                    proposed quantity for that item.
+            opposite_comparator: inverse relation used to collect agent_1's items.
+    Behavior:
+            - Executes only on final timestep of a round (where the definitive
+                proposal / allocation is known via ``info['splits']``).
+            - For each item, classifies which agent's value satisfies the chosen
+                relation and records that agent's proposed quantity from the split.
+            - Averages (mean) across all qualifying items per agent; if no items
+                qualify for an agent returns ``None`` for that agent id.
+            - Adds ``all_agents`` mean across the numeric (non-None) agent values.
+    Why this matters:
+            Distinguishing how much an agent *asks for* when it subjectively
+            values items more (or less) than its counterpart reveals patterns of
+            opportunism vs. concession. This is especially useful when raw reward
+            differences are subtle but allocation *intent* differs.
+    """
+    info = sl.info or {}
+    if not info or not info.get("is_last_timestep_in_round"):
+        return None
+    quantities = info.get("quantities") or {}
+    splits = info.get("splits") or {}
+    values = info.get("values") or {}
+    agent_ids: List[str] = list(sl.rewards.keys())
+    if len(agent_ids) != 2:
+        return None  # Only defined for 2-agent case.
+    for aid in agent_ids:
+        if "buffer" in str(aid) and "live" not in str(aid):
+            return None
+    # Extract per-agent item proposals robustly
+    split_items = {aid: _extract_items_from_split(splits.get(aid)) for aid in agent_ids}
+    agent_0_vals: List[float] = []
+    agent_1_vals: List[float] = []
+    for item in quantities.keys():
+        # Values may be either a float (same for all items) or dict per item
+        v0_raw = values[agent_ids[0]]
+        v1_raw = values[agent_ids[1]]
+        v0 = float(v0_raw[item]) if isinstance(v0_raw, dict) else float(v0_raw)
+        v1 = float(v1_raw[item]) if isinstance(v1_raw, dict) else float(v1_raw)
+        if comparator(v0, v1):
+            agent_0_vals.append(split_items[agent_ids[0]].get(item, 0.0))
+        elif opposite_comparator(v0, v1):
+            agent_1_vals.append(split_items[agent_ids[1]].get(item, 0.0))
+    out: Dict[str, float | None] = {}
+    out[f"{metric_name}-{agent_ids[0]}"] = (
+        sum(agent_0_vals) / len(agent_0_vals) if agent_0_vals else None
+    )
+    out[f"{metric_name}-{agent_ids[1]}"] = (
+        sum(agent_1_vals) / len(agent_1_vals) if agent_1_vals else None
+    )
+    return [(key, value) for key, value in out.items() if value is not None]
+def average_proposal_when_agent_values_item_lower(
+    sl: SimulationStepLog,
+) -> List[Tuple[str, float | None]] | None:
+    """Mean quantity an agent proposes for items it values *less* than opponent.
+    Interpretation:
+        A higher value implies the agent still claims (or is allocated) a
+        notable share of items where it has a comparative *disadvantage* in
+        valuation, signaling either strategic over-claiming or protocol-driven
+        egalitarian splits. Conversely, very low numbers can indicate
+        efficient specialization or excessive concession.
+    Returns:
+        ``None`` when the shared helper declines to compute the metric (for
+        example the step is not the last timestep of a round, or there are
+        not exactly two agents). Otherwise a list of
+        ``("average_proposal_when_agent_values_item_lower-<agent_id>", mean)``
+        pairs; an agent with no qualifying item in the round is omitted from
+        the list rather than reported with a ``None`` value.
+    """
+    return _average_proposal_relative_value(
+        sl,
+        "average_proposal_when_agent_values_item_lower",
+        # Applied as comparator(v0, v1): selects items the first agent values less.
+        lambda a, b: a < b,
+        # Opposite test: selects items the second agent values less.
+        lambda a, b: a > b,
+    )
+def average_proposal_when_agent_values_item_higher(
+    sl: SimulationStepLog,
+) -> List[Tuple[str, float | None]] | None:
+    """Mean quantity an agent proposes for items it values *more* than opponent.
+    Interpretation:
+        Captures how aggressively an agent claims items where it holds a
+        comparative *advantage*. Elevated values can reflect rational
+        specialization (efficient exploitation of comparative advantage) or
+        potentially unfair grabs if paired with low concession in the lower
+        valuation metric. Comparing this with the 'lower' counterpart helps
+        profile negotiation style (cooperative vs. exploitative).
+    Returns:
+        ``None`` when the shared helper declines to compute the metric (for
+        example the step is not the last timestep of a round, or there are
+        not exactly two agents). Otherwise a list of
+        ``("average_proposal_when_agent_values_item_higher-<agent_id>", mean)``
+        pairs; an agent with no qualifying item in the round is omitted from
+        the list rather than reported with a ``None`` value.
+    """
+    return _average_proposal_relative_value(
+        sl,
+        "average_proposal_when_agent_values_item_higher",
+        # Applied as comparator(v0, v1): selects items the first agent values more.
+        lambda a, b: a > b,
+        # Opposite test: selects items the second agent values more.
+        lambda a, b: a < b,
+    )
+# Explicit list of metric functions exported for rendering. Helper functions
+# starting with '_' are intentionally excluded. Update this list when adding
+# new public statistics so render.py can rely on it instead of introspecting
+# every callable in the module.
+# NOTE(review): the two ``average_proposal_*`` entries are annotated as
+# returning ``List[...] | None``, which is wider than the element type
+# declared below -- confirm that the consumer tolerates a ``None`` result.
+stat_functs: list[Callable[[SimulationStepLog], List[Tuple[str, float]]]] = [
+    avg_reward,
+    average_proposal_when_agent_values_item_lower,
+    average_proposal_when_agent_values_item_higher,
+    split_efficiency,
+]

src_code_for_reproducibility/markov_games/negotiation/no_press_nego_agent.py ADDED Viewed

	@@ -0,0 +1,108 @@

+"""
+File: mllm/markov_games/negotiation/no_press_nego_agent.py
+Summary: Agent variant for no-press negotiations without explicit messaging.
+"""
+from typing import Any, Dict, List, Tuple
+from mllm.markov_games.negotiation.nego_agent import (
+    NegotiationAgent,
+    NegotiationAgentState,
+)
+from mllm.markov_games.negotiation.nego_simulation import Split
+from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs
+from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
+class NoPressAgent(NegotiationAgent):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # No communication in this variant
+        self.intro_prompt = (
+            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
+            "Setup:\n"
+            "1. The game consists of multiple independent rounds.\n"
+            "2. In each round, there are multiple items to split between the two agents.\n"
+            "3. Both agents are assigned a per-item value between 1 and 20 (inclusive) in each round.\n"
+            "4. You can observe per-item values of both agents.\n"
+            "5. Because assignments are random, both agents are equally likely to have same expected per-item value.\n"
+            "\n"
+            "Protocol:\n"
+            "1. Both agents simultaneously propose the amount of each item they will keep.\n"
+            "2. If the total sum of proposals is less than or equal to the item quantity, both agents receive their proposed amounts.\n"
+            "3. If the total sum of proposals exceeds the item quantity, they are allocated proportionally.\n"
+            "4. Your points for the round = (amount you receive per item) x (your per-item value for that round), added across all items.\n"
+            "5. Points are accumulated across rounds.\n"
+            "Your goal: {goal}\n"
+        )
+        self.new_round_prompt = (
+            "A New Round Begins\n"
+            "The items to split are {quantities}.\n"
+            "Your per-item values are {value} and {other_agent}'s per-item values are  {other_value}."
+        )
+        self.last_round_prompt = (
+            "Last Round Summary:\n"
+            "   - Items to split: {last_quantities}\n"
+            "   - Your per-item values: {last_value_agent}\n"
+            "   - {other_agent}'s per-item values: {last_value_coagent}\n"
+            "   - You proposed: {last_split_agent}\n"
+            "   - You earned: {last_points_agent} points\n"
+            "   - {other_agent} proposed: {last_split_coagent}\n"
+            "   - {other_agent} earned: {last_points_coagent} points\n"
+            "   - Round Complete.\n"
+        )
+        self.send_split_prompt = "Submit Your Proposal\n" "Respond as {proposal_style}"
+    def get_message_regex(self, observation: NoPressObs) -> str:
+        """Return an empty pattern because the no-press variant forbids chat."""
+        return r"^$"  # No messages allowed
+    def get_split_regex(self, observation: NoPressObs) -> str:
+        """Match proposals like ``Proposal: 4 coins, 6 apples`` case-insensitively."""
+        items = list(observation.quantities.keys())
+        # Accept both singular and plural forms
+        item_pattern = "|".join(
+            [f"{item[:-1]}s?" if item.endswith("s") else f"{item}s?" for item in items]
+        )
+        regex = rf"(?i)Proposal:\s*((?:\s*(?P<num>(10|[0-9]))\s*(?P<item>{item_pattern})\s*,?)+)"
+        return regex
+    def get_split_action(self, policy_output: str, observation: NoPressObs) -> Split:
+        """
+        Parse the LLM proposal into a normalized ``Split`` structure.
+        The regex-based parser is lenient (accepts pluralization variants) so that
+        prompt tweaks do not require re-training the extraction logic.
+        """
+        items = list(observation.quantities.keys())
+        import re as _re
+        split_regex = self.get_split_regex(observation)
+        items_given_to_self = {item: 0 for item in items}
+        m = _re.match(split_regex, policy_output.strip())
+        if m:
+            # Find all (number, item) pairs
+            item_pattern = "|".join(
+                [
+                    f"{item[:-1]}s?" if item.endswith("s") else f"{item}s?"
+                    for item in items
+                ]
+            )
+            inner_regex = rf"(?i)(10|[0-9])\s*({item_pattern})"
+            def normalize_item_name(item_str):
+                """Canonicalize plural/singular user text back to the config item id."""
+                for orig in items:
+                    if item_str.lower() == orig.lower():
+                        return orig
+                    if orig.endswith("s") and item_str.lower() == orig[:-1].lower():
+                        return orig
+                    if (
+                        not orig.endswith("s")
+                        and item_str.lower() == orig.lower() + "s"
+                    ):
+                        return orig
+            for num, item in _re.findall(inner_regex, m.group(1)):
+                items_given_to_self[normalize_item_name(item)] = int(num)
+        return Split(items_given_to_self=items_given_to_self)

src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py ADDED Viewed

	@@ -0,0 +1,182 @@

+"""
+File: mllm/markov_games/negotiation/no_press_nego_simulation.py
+Summary: Simulation driver for no-press negotiation scenarios.
+"""
+import copy
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any, Dict, List, Literal, Tuple
+from mllm.markov_games.negotiation.nego_simulation import (
+    NegotiationObs,
+    NegotiationSimulation,
+    NegotiationState,
+    Split,
+    compute_tas_style_rewards,
+)
+AgentId = str
+@dataclass
+class NoPressState(NegotiationState):
+    """State of a no-press negotiation; adds no fields to ``NegotiationState``.
+    The dedicated type only makes explicit that this variant runs in the
+    always-split phase (``NoPressSimulation`` creates it with
+    ``split_phase=True``).
+    """
+    pass
+@dataclass
+class NoPressObs(NegotiationObs):
+    """Observation that includes both agents' values (since there is no messaging)."""
+    # Per-item values of the *other* agent for the current round; filled from
+    # ``state.values[other_id]`` by ``NoPressSimulation.get_obs_agent``.
+    other_value: Dict[str, float]
+class NoPressSimulation(NegotiationSimulation):
+    def __init__(
+        self,
+        game_type: Literal["10-1-exclusive", "10-1-ties", "1-to-20"] = "1-to-20",
+        same_round_value: bool = True,
+        atleast_one_conflict: bool = False,
+        *args,
+        **kwargs,
+    ):
+        self.game_type = game_type
+        self.same_round_value = same_round_value
+        self.atleast_one_conflict = atleast_one_conflict
+        super().__init__(*args, **kwargs)
+    def _sample_values(self) -> Dict[AgentId, dict]:
+        """Sample per-item valuations according to the configured template."""
+        values = defaultdict(dict)
+        if self.state is None:
+            item_types = self.item_types
+        else:
+            item_types = list(self.state.quantities.keys())
+        while True:
+            for item in item_types:
+                if self.game_type == "10-1-exclusive":
+                    v = int(self.rng.choice([1, 10]))
+                    values[self.agent_ids[0]][item] = v
+                    values[self.agent_ids[1]][item] = 10 if v == 1 else 1
+                elif self.game_type == "10-1-ties":
+                    for aid in self.agent_ids:
+                        values[aid][item] = int(self.rng.choice([1, 10]))
+                elif self.game_type == "1-to-20":
+                    for aid in self.agent_ids:
+                        values[aid][item] = int(self.rng.integers(1, 21))
+            if self.atleast_one_conflict:
+                has_conflict = False
+                for item in item_types:
+                    agent_values_for_item = [
+                        values[aid][item] for aid in self.agent_ids
+                    ]
+                    if len(set(agent_values_for_item)) > 1:
+                        has_conflict = True
+                        break
+                if not has_conflict:
+                    continue
+            agent_values = [sum(v.values()) for v in values.values()]
+            if len(set(agent_values)) == 1 or not self.same_round_value:
+                break
+        return values
+    def _sample_quantities(self) -> Dict[str, int]:
+        """No-press setups use symmetric 10-unit stocks for every item."""
+        return {item.lower(): 10 for item in self.item_types}
+    def set_new_round_of_variant(self):
+        """Refresh quantities/values and jump directly into the simultaneous split."""
+        self.state.quantities = self._sample_quantities()
+        self.state.values = self._sample_values()
+        self.state.split_phase = True
+    def get_info_of_variant(
+        self, state: NegotiationState, actions: Dict[AgentId, Any]
+    ) -> Dict[str, Any]:
+        """Surface quantities/values/splits so statistics modules can read them."""
+        return {
+            "quantities": copy.deepcopy(state.quantities),
+            "values": copy.deepcopy(state.values),
+            "splits": copy.deepcopy(state.splits),
+        }
+    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
+        """Reuse TAS reward logic because the split arbitration is identical."""
+        return compute_tas_style_rewards(
+            self.agent_ids, self.state.values, splits, self.state.quantities
+        )
+    def get_obs(self):
+        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}
+    def get_obs_agent(self, agent_id):
+        other_id = self._other(agent_id)
+        last_value_coagent = (
+            None
+            if self.state.previous_values is None
+            else self.state.previous_values.get(other_id)
+        )
+        last_points_coagent = (
+            None
+            if self.state.previous_points is None
+            else round(self.state.previous_points.get(other_id), 1)
+        )
+        last_value_agent = (
+            None
+            if self.state.previous_values is None
+            else self.state.previous_values.get(agent_id)
+        )
+        last_points_agent = (
+            None
+            if self.state.previous_points is None
+            else round(self.state.previous_points.get(agent_id), 1)
+        )
+        last_split_coagent = None
+        last_split_agent = None
+        if self.state.previous_splits is not None:
+            last_split_coagent = self.state.previous_splits[
+                other_id
+            ].items_given_to_self
+            last_split_agent = self.state.previous_splits[agent_id].items_given_to_self
+        obs = NoPressObs(
+            round_nb=self.state.round_nb,
+            last_message="",
+            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
+            current_agent=self.state.current_agent,
+            other_agent=self.agent_id_to_name[other_id],
+            quantities=self.state.quantities,
+            item_types=self.item_types,
+            value=self.state.values[agent_id],
+            split_phase=self.state.split_phase,
+            last_split_agent=last_split_agent,
+            last_value_agent=last_value_agent,
+            last_points_agent=last_points_agent,
+            last_split_coagent=last_split_coagent,
+            last_value_coagent=last_value_coagent,
+            last_points_coagent=last_points_coagent,
+            other_value=self.state.values[other_id],
+            last_quantities=self.state.previous_quantities,
+        )
+        return obs
+    def reset(self):
+        start_agent = self.agent_ids[self._starting_agent_index]
+        quantities = self._sample_quantities()
+        values = self._sample_values()
+        self.state = NoPressState(
+            round_nb=0,
+            last_message="",
+            current_agent=start_agent,
+            quantities=quantities,
+            values=values,
+            previous_values=None,
+            splits={aid: None for aid in self.agent_ids},
+            nb_messages_sent={aid: 0 for aid in self.agent_ids},
+            split_phase=True,
+            previous_splits=None,
+            previous_points=None,
+            previous_quantities=None,
+        )
+        return self.get_obs()

src_code_for_reproducibility/markov_games/negotiation/tas_agent.py ADDED Viewed

	@@ -0,0 +1,118 @@

+"""
+File: mllm/markov_games/negotiation/tas_agent.py
+Summary: Agent implementation for Trust-and-Split (TAS) negotiations.
+"""
+from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
+from mllm.markov_games.negotiation.nego_simulation import Split
+from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitObs
+class TrustAndSplitAgent(NegotiationAgent):
+    """Prompt/template wrapper for the classic multi-item Take-and-Split benchmark."""
+    def __init__(self, num_message_chars, *args, **kwargs):
+        self.num_message_chars = num_message_chars
+        super().__init__(*args, **kwargs)
+        self.intro_prompt = (
+            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
+            "Setup:\n"
+            "1. The game has multiple independent rounds.\n"
+            "2. In each round, there are multiple items to split between the two agents.\n"
+            "3. Both agents are assigned a per-item value between 1 and 20 (inclusive) in each round.\n"
+            "4. You can only observe your own per-item values.\n"
+            "5. Because assignments are random, both agents are equally likely to have same expected per-item value.\n"
+            "\n"
+            "Protocol:\n"
+            "1. At the start of the round, one agent begins the conversation. The starting role alternates each round.\n"
+            "2. Agents exchange a short chat ({quota_messages_per_agent_per_round} messages per round per agent) to negotiate how to split the item.\n"
+            "   - Use this chat to communicate your private per-item value to make informed proposals.\n"
+            "3. After the chat, both agents simultaneously propose the amount of each item they will keep.\n"
+            "4. If the total sum of proposals is less than or equal to the item quantity, both agents receive their proposed amounts.\n"
+            "5. If the total sum of proposals exceeds the item quantity, they are allocated proportionally.\n"
+            "6. Your points for the round = (amount you receive per item) x (your per-item value for that round), added across all items.\n"
+            "7. Points are accumulated across rounds.\n"
+            "Your goal: {goal}\n"
+        )
+        self.new_round_prompt = (
+            "A New Round Begins\n"
+            "The items to split are {quantities}.\n"
+            "Your per-item values are {value}."
+        )
+        self.last_round_prompt = (
+            "Last Round Summary:\n"
+            "   - Items to split: {last_quantities}\n"
+            "   - Your per-item values: {last_value_agent}\n"
+            "   - {other_agent}'s per-item values: {last_value_coagent}\n"
+            "   - You proposed: {last_split_agent}\n"
+            "   - You earned: {last_points_agent} points\n"
+            "   - {other_agent} proposed: {last_split_coagent}\n"
+            "   - {other_agent} earned: {last_points_coagent} points\n"
+            "   - Round Complete.\n"
+        )
+        self.send_split_prompt = (
+            "Message quota is finished for this round.\n"
+            "{other_agent} has finalized their proposal.\n"
+            "Submit your finalization now\n"
+            "Respond with {proposal_style2}"
+        )
+        # self.wait_for_message_prompt = "Wait for {other_agent} to send a message..."
+        self.wait_for_message_prompt = ""
+        self.last_message_prompt = "{other_agent} said: {last_message}"
+        # self.send_message_prompt = (
+        #     f"Send your message now (max {self.num_message_chars} chars)."
+        # )
+        self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
+    def get_message_regex(self, observation: TrustAndSplitObs) -> str:
+        """Constrain chat to bounded XML tags for stable parsing."""
+        return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
+    # def get_message_regex(self, observation: TrustAndSplitObs) -> str:
+    #     return rf"(?s).{{0,{self.num_message_chars}}}"
+    def get_split_regex(self, observation: TrustAndSplitObs) -> str:
+        """Allow natural-language item names while still returning machine-parsable XML."""
+        items = list(observation.quantities.keys())
+        # Accept both singular and plural forms
+        item_pattern = "|".join(
+            [f"{item[:-1]}s?" if item.endswith("s") else f"{item}s?" for item in items]
+        )
+        regex = rf"(?i)<items_to_self> ?((?:\s*(?P<num>(10|[0-9]))\s*(?P<item>{item_pattern})\s*,?)+) ?</items_to_self>"
+        return regex
+    def get_split_action(
+        self, policy_output: str, observation: TrustAndSplitObs
+    ) -> Split:
+        """Convert human-readable allocation text back into canonical item IDs."""
+        items = list(observation.quantities.keys())
+        import re as _re
+        split_regex = self.get_split_regex(observation)
+        items_given_to_self = {item: 0 for item in items}
+        m = _re.match(split_regex, policy_output.strip())
+        if m:
+            # Find all (number, item) pairs
+            item_pattern = "|".join(
+                [
+                    f"{item[:-1]}s?" if item.endswith("s") else f"{item}s?"
+                    for item in items
+                ]
+            )
+            inner_regex = rf"(?i)(10|[0-9])\s*({item_pattern})"
+            def normalize_item_name(item_str):
+                for orig in items:
+                    if item_str.lower() == orig.lower():
+                        return orig
+                    if orig.endswith("s") and item_str.lower() == orig[:-1].lower():
+                        return orig
+                    if (
+                        not orig.endswith("s")
+                        and item_str.lower() == orig.lower() + "s"
+                    ):
+                        return orig
+            for num, item in _re.findall(inner_regex, m.group(1)):
+                items_given_to_self[normalize_item_name(item)] = int(num)
+        return Split(items_given_to_self=items_given_to_self)

src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py ADDED Viewed

	@@ -0,0 +1,128 @@

+"""
+File: mllm/markov_games/negotiation/tas_rps_agent.py
+Summary: Agent logic for TAS Rock-Paper-Scissors blended game.
+"""
+import copy
+from collections.abc import Callable
+from dataclasses import dataclass
+from typing import Any, Dict, List, Tuple
+from mllm.markov_games.agent import Agent
+from mllm.markov_games.negotiation.nego_agent import (
+    Message,
+    NegotiationAgent,
+    NegotiationAgentState,
+    Split,
+)
+from mllm.markov_games.negotiation.tas_rps_simulation import TrustAndSplitRPSObs
+from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
+class TrustAndSplitRPSAgent(NegotiationAgent):
+    """NegotiationAgent that reasons about hidden hands before submitting TAS splits."""
+    def __init__(
+        self,
+        num_message_chars: int,
+        message_start_end_format: bool = False,
+        proposal_start_end_format: bool = False,
+        *args,
+        **kwargs,
+    ):
+        self.num_message_chars = num_message_chars
+        self.message_start_end_format = message_start_end_format
+        self.proposal_start_end_format = proposal_start_end_format
+        super().__init__(*args, **kwargs)
+        self.intro_prompt = (
+            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
+            "\n"
+            "Setup:\n"
+            "1. The game has multiple independent rounds.\n"
+            "2. In each round, there are 10 coins to split between the two agents.\n"
+            "3. Each agent's per-coin value for that round is determined as follows:\n"
+            "   - Both agents are randomly assigned a rock, paper or scissors hands\n"
+            "   - Rock has the upper hand over scissors, scissors has the upper hand over paper and paper has the upper hand over rock.\n"
+            "   - The agent with the upper hand has a per-coin value of 10.\n"
+            "   - The agent with the lower hand has a per-coin value of 1.\n"
+            "4. You only see your own hand, but you may communicate it in messages and infer your value based on the other agent's hand.\n"
+            "5. Over many rounds both agents are equally likely to have the upper and lower hand.\n"
+            "\n"
+            "Protocol:\n"
+            "1. At the start of the round, one agent begins the conversation. The starting role alternates each round.\n"
+            "2. Agents exchange a short chat ({quota_messages_per_agent_per_round} messages per round per agent) to negotiate how to split the 10 coins.\n"
+            "   - Use this chat to communicate your hand so that both agents can determine their per-coin values.\n"
+            "3. After the chat, both agents simultaneously propose how many coins they keep.\n"
+            "4. If the total sum of proposals is less than or equal to 10, both agents receive their proposals.\n"
+            "5. If the total sum of proposals exceeds 10, the coins are allocated proportionally.\n"
+            "6. Your points for the round = (coins you receive) x (your per-coin value for that round). \n"
+            "7. The points are accumulated across rounds.\n"
+            "Your goal: {goal}\n"
+        )
+        self.new_round_prompt = (
+            "A New Round Begins\n"
+            "Your hand is {hand}. You don't know {other_agent}'s hand yet.\n"
+        )
+        # self.last_round_prompt = (
+        #     "Last Round Summary:\n"
+        #     "   - Your hand: {last_hand_agent}\n"
+        #     "   - {other_agent}'s hand: {last_hand_coagent}\n"
+        #     "   - Your value per coin: {last_value_agent}\n"
+        #     "   - {other_agent}'s value per coin: {last_value_coagent}\n"
+        #     "   - You proposed: {last_split_agent} coins\n"
+        #     "   - You earned: {last_points_agent} points\n"
+        #     "   - {other_agent} proposed: {last_split_coagent} coins\n"
+        #     "   - {other_agent} earned: {last_points_coagent} points\n"
+        #     "   - Round Complete.\n"
+        # )
+        self.last_round_prompt = "In the previous round, {other_agent} had a {last_hand_value_coagent} hand and proposed {last_split_coagent} coins.\n"
+        if self.proposal_start_end_format:
+            self.send_split_prompt = (
+                "Submit your proposal\n"
+                "Respond with <<proposal_start>> x <<proposal_end>> where x is an integer in [0, 10]."
+            )
+        else:
+            self.send_split_prompt = (
+                "Submit your proposal\n"
+                "Respond with <coins_to_self> x </coins_to_self> where x is an integer in [0, 10]."
+            )
+        self.wait_for_message_prompt = "Wait for {other_agent} to send a message..."
+        # self.wait_for_message_prompt = ""
+        self.last_message_prompt = "{other_agent} said: {last_message}"
+        if self.message_start_end_format:
+            self.send_message_prompt = f"Send your message now in <<message_start>>...<<message_end>> (<={self.num_message_chars} chars)."
+        else:
+            self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
+    def get_message_regex(self, observation: TrustAndSplitRPSObs) -> str:
+        """Switch between <message>...</message> and <<message_start>> formats on demand."""
+        if self.message_start_end_format:
+            return (
+                rf"<<message_start>>[\s\S]{{0,{self.num_message_chars}}}<<message_end>>"
+            )
+        else:
+            return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
+    def get_split_regex(self, observation: TrustAndSplitRPSObs) -> str:
+        """Force single-number proposals inside whichever tag style the config selected."""
+        if self.proposal_start_end_format:
+            return r"<<proposal_start>> ?(10|[0-9]) ?<<proposal_end>>"
+        else:
+            return r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>"
+    def get_split_action(
+        self, policy_output: str, observation: TrustAndSplitRPSObs
+    ) -> Split:
+        """Parse the proposal tag (or raw integer fallback) into a Split."""
+        import re as _re
+        if self.proposal_start_end_format:
+            m = _re.search(
+                r"<<proposal_start>> ?(10|[0-9]) ?<<proposal_end>>", policy_output
+            )
+        else:
+            m = _re.search(
+                r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>", policy_output
+            )
+        coins_int = int(m.group(1)) if m else int(policy_output)
+        return Split(items_given_to_self={"coins": coins_int})

src_code_for_reproducibility/markov_games/negotiation/tas_rps_simulation.py ADDED Viewed

	@@ -0,0 +1,257 @@

+"""
+File: mllm/markov_games/negotiation/tas_rps_simulation.py
+Summary: Simulation for TAS Rock-Paper-Scissors blended scenarios.
+"""
+import copy
+from dataclasses import dataclass
+from typing import Any, Dict, List, Literal, Tuple
+from mllm.markov_games.negotiation.nego_simulation import (
+    Message,
+    NegotiationObs,
+    NegotiationSimulation,
+    NegotiationState,
+    Split,
+    compute_tas_style_rewards,
+)
+from mllm.markov_games.rollout_tree import SimulationStepLog
+AgentId = str
+def _get_rps_winner(
+    hand1: Literal["rock", "paper", "scissors"],
+    hand2: Literal["rock", "paper", "scissors"],
+) -> Literal["rock", "paper", "scissors"]:
+    """Determine winner of rock-paper-scissors between two hands."""
+    if hand1 == hand2:
+        raise ValueError("Hands should be different")
+    if (
+        (hand1 == "rock" and hand2 == "scissors")
+        or (hand1 == "paper" and hand2 == "rock")
+        or (hand1 == "scissors" and hand2 == "paper")
+    ):
+        return hand1
+    else:
+        return hand2
+@dataclass
+class TrustAndSplitRPSState(NegotiationState):
+    """Negotiation state augmented with the current and previous RPS hands."""
+    # Hand held by each agent in the current round, keyed by agent id.
+    hands: Dict[
+        AgentId, Literal["rock", "paper", "scissors"]
+    ]  # rock, paper, or scissors
+    # Copy of ``hands`` taken by ``set_new_round_of_variant`` just before new
+    # hands are sampled; the type allows ``None`` when no earlier round exists.
+    previous_hands: Dict[AgentId, Literal["rock", "paper", "scissors"]] | None
+@dataclass
+class TrustAndSplitRPSObs(NegotiationObs):
+    """Agent-facing observation enriched with last-hand metadata."""
+    # Presumably the observing agent's hand for the current round -- TODO
+    # confirm against ``get_obs_agent``.
+    hand: Literal["rock", "paper", "scissors"]
+    # Presumably the previous-round hands of this agent and of the other
+    # agent; the types allow ``None`` when there is no previous round.
+    last_hand_agent: Literal["rock", "paper", "scissors"] | None
+    last_hand_coagent: Literal["rock", "paper", "scissors"] | None
+    # "upper"/"lower" label substituted into the agent's last-round prompt
+    # (``{last_hand_value_coagent}``).
+    last_hand_value_coagent: Literal["upper", "lower"] | None
+class TrustAndSplitRPSSimulation(NegotiationSimulation):
+    """Negotiation variant that splices TAS splitting with RPS-determined stakes."""
+    def __init__(
+        self,
+        alternating_hands: bool = False,
+        alternating_mix_ratio: float = None,
+        *args,
+        **kwargs,
+    ):
+        self.alternating_hands = alternating_hands
+        self.alternating_mix_ratio = alternating_mix_ratio
+        super().__init__(*args, **kwargs)
+        if self.alternating_mix_ratio is not None:
+            if self.rng.random() < self.alternating_mix_ratio:
+                self.alternating_hands = True
+            else:
+                self.alternating_hands = False
+    def _sample_hands_and_values(
+        self,
+        alternate_hands: bool = False,
+    ) -> Tuple[Dict[AgentId, str], Dict[AgentId, float]]:
+        """
+        Sample a rock-paper-scissors hand for each agent plus the per-hand value.
+        When ``alternate_hands`` is True we deliberately flip the previous round's
+        winner/loser roles to create nonstationary payoffs; otherwise we draw
+        uniformly without replacement.
+        """
+        hands = ["rock", "paper", "scissors"]
+        if alternate_hands:
+            previous_hands = list(self.state.previous_hands.values())
+            hand1, hand2 = self.rng.choice(hands, size=2, replace=False)
+            winner = _get_rps_winner(hand1, hand2)
+            loser = hand1 if winner == hand2 else hand2
+            previous_winner = _get_rps_winner(previous_hands[0], previous_hands[1])
+            agent_hands, values = {}, {}
+            for agent_id in self.agent_ids:
+                if self.state.previous_hands[agent_id] == previous_winner:
+                    agent_hands[agent_id] = loser
+                    values[agent_id] = 1.0
+                else:
+                    agent_hands[agent_id] = winner
+                    values[agent_id] = 10.0
+            return agent_hands, values
+        else:
+            # Assign different hands to each agent
+            hand1, hand2 = self.rng.choice(hands, size=2, replace=False)
+            agent_hands = {self.agent_ids[0]: hand1, self.agent_ids[1]: hand2}
+            # Determine winner and assign values
+            winner = _get_rps_winner(hand1, hand2)
+            values = {}
+            for agent_id in self.agent_ids:
+                if agent_hands[agent_id] == winner:
+                    values[agent_id] = 10.0  # Winner gets value 10
+                else:
+                    values[agent_id] = 1.0  # Loser gets value 1
+            return agent_hands, values
+    def set_new_round_of_variant(self):
+        """Refresh hands/values and reset round-specific state."""
+        self.state.previous_hands = copy.deepcopy(self.state.hands)
+        new_hands, new_values = self._sample_hands_and_values(
+            alternate_hands=self.alternating_hands
+        )
+        self.state.hands = new_hands
+        self.state.values = new_values
+        # Quantities are constant in TAS
+        self.state.quantities = {"coins": 10}
+        self.state.split_phase = False
+    def get_info_of_variant(
+        self, state: NegotiationState, actions: Dict[AgentId, Any]
+    ) -> Dict[str, Any]:
+        """Expose variant-specific tensors for downstream logging/analysis."""
+        return {
+            "quantities": copy.deepcopy(state.quantities),
+            "hands": copy.deepcopy(state.hands),
+            "values": copy.deepcopy(state.values),
+            "previous_hands": copy.deepcopy(state.previous_hands),
+            "previous_values": copy.deepcopy(state.previous_values),
+            "splits": copy.deepcopy(state.splits),
+        }
+    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
+        """Delegates to TAS reward helper because the payout rule is identical."""
+        return compute_tas_style_rewards(
+            self.agent_ids, self.state.values, splits, self.state.quantities
+        )
+    def get_obs_agent(self, agent_id):
+        """Return a full Trust-and-Split observation for ``agent_id``."""
+        other_id = self._other(agent_id)
+        last_value_coagent = (
+            None
+            if self.state.previous_values is None
+            else self.state.previous_values.get(other_id)
+        )
+        last_hand_coagent = (
+            None
+            if self.state.previous_hands is None
+            else self.state.previous_hands.get(other_id)
+        )
+        last_points_coagent = (
+            None
+            if self.state.previous_points is None
+            else round(self.state.previous_points.get(other_id), 1)
+        )
+        last_value_agent = (
+            None
+            if self.state.previous_values is None
+            else self.state.previous_values.get(agent_id)
+        )
+        last_hand_agent = (
+            None
+            if self.state.previous_hands is None
+            else self.state.previous_hands.get(agent_id)
+        )
+        last_points_agent = (
+            None
+            if self.state.previous_points is None
+            else round(self.state.previous_points.get(agent_id), 1)
+        )
+        last_split_coagent = None
+        last_split_agent = None
+        if self.state.previous_splits is not None:
+            last_split_coagent = self.state.previous_splits[
+                other_id
+            ].items_given_to_self["coins"]
+            last_split_agent = self.state.previous_splits[agent_id].items_given_to_self[
+                "coins"
+            ]
+        if last_hand_agent is None or last_hand_coagent is None:
+            last_hand_value_coagent = None
+        else:
+            winner = _get_rps_winner(last_hand_agent, last_hand_coagent)
+            last_hand_value_coagent = (
+                "upper" if winner == last_hand_coagent else "lower"
+            )
+        obs = TrustAndSplitRPSObs(
+            round_nb=self.state.round_nb,
+            last_message=self.state.last_message,
+            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
+            current_agent=self.state.current_agent,
+            other_agent=self.agent_id_to_name[other_id],
+            quantities={"coins": 10},
+            item_types=self.item_types,
+            value=self.state.values[agent_id],
+            split_phase=self.state.split_phase,
+            last_split_agent=last_split_agent,
+            last_value_agent=last_value_agent,
+            last_points_agent=last_points_agent,
+            last_split_coagent=last_split_coagent,
+            last_value_coagent=last_value_coagent,
+            last_points_coagent=last_points_coagent,
+            hand=self.state.hands[agent_id],
+            last_hand_coagent=last_hand_coagent,
+            last_hand_agent=last_hand_agent,
+            last_quantities=self.state.previous_quantities,
+            last_hand_value_coagent=last_hand_value_coagent,
+        )
+        return obs
+    def get_state(self):
+        return self.state
+    def get_safe_copy(self):
+        """Return a safe copy of the simulation."""
+        simulation_copy = copy.copy(self)
+        simulation_copy.state = copy.deepcopy(self.state)
+        return simulation_copy
+    def reset(self):
+        """Initialize and return initial observations"""
+        # Decide starting agent alternating across resets for determinism
+        start_agent = self.agent_ids[self._starting_agent_index]
+        hands, values = self._sample_hands_and_values()
+        self.state = TrustAndSplitRPSState(
+            round_nb=0,
+            last_message="",
+            current_agent=start_agent,
+            quantities={"coins": 10},
+            values=values,
+            splits={aid: None for aid in self.agent_ids},
+            nb_messages_sent={aid: 0 for aid in self.agent_ids},
+            previous_values=None,
+            previous_splits=None,
+            previous_points=None,
+            split_phase=False,
+            hands=hands,
+            previous_hands=None,
+            previous_quantities=None,
+        )
+        return self.get_obs()

src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (269 Bytes). View file

src_code_for_reproducibility/models/__pycache__/adapter_training_wrapper.cpython-312.pyc ADDED Viewed

Binary file (5.06 kB). View file

src_code_for_reproducibility/models/__pycache__/human_policy.cpython-312.pyc ADDED Viewed

Binary file (12.1 kB). View file

src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc ADDED Viewed

Binary file (2.38 kB). View file

src_code_for_reproducibility/models/__pycache__/inference_backend_dummy.cpython-312.pyc ADDED Viewed

Binary file (2.49 kB). View file

src_code_for_reproducibility/models/__pycache__/inference_backend_vllm.cpython-312.pyc ADDED Viewed

Binary file (5.12 kB). View file

src_code_for_reproducibility/models/__pycache__/large_language_model_api.cpython-312.pyc ADDED Viewed

Binary file (7.08 kB). View file

src_code_for_reproducibility/models/__pycache__/large_language_model_local.cpython-312.pyc ADDED Viewed

Binary file (16.5 kB). View file

src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-312.pyc ADDED Viewed

Binary file (3.32 kB). View file

src_code_for_reproducibility/training/trainer_ad_align.py ADDED Viewed

	@@ -0,0 +1,505 @@

+"""
+File: mllm/training/trainer_ad_align.py
+Summary: Trainer specialized for the advantage-alignment objective.
+"""
+import copy
+import logging
+import sys
+from dataclasses import dataclass
+from typing import Tuple
+import torch
+from torch.nn.utils.rnn import pad_sequence
+from mllm.markov_games.rollout_tree import (
+    ChatTurn,
+    RolloutTreeBranchNode,
+    RolloutTreeRootNode,
+)
+from mllm.training.credit_methods import (
+    get_advantage_alignment_credits,
+    get_discounted_state_visitation_credits,
+)
+from mllm.training.tally_metrics import Tally
+from mllm.training.tally_rollout import RolloutTally, RolloutTallyItem
+from mllm.training.tally_tokenwise import ContextualizedTokenwiseTally
+from mllm.training.tokenize_chats import process_training_chat
+from mllm.training.trainer_common import BaseTrainer
+from mllm.training.training_data_utils import (
+    AdvantagePacket,
+    TrainingBatch,
+    TrainingChatTurn,
+    TrajectoryBatch,
+    get_main_chat_list_and_rewards,
+    get_tokenwise_credits,
+)
+from mllm.utils.resource_context import resource_logger_context
+logger = logging.getLogger(__name__)
+logger.addHandler(logging.StreamHandler(sys.stdout))
+RolloutId = int
+AgentId = str
+@dataclass
+class AdAlignTrainingData:
+    """Holds tensorized rollouts plus precomputed advantages for one agent."""
+    # Identifier of the agent these rollouts belong to.
+    agent_id: str
+    # Tokenized main (non-branch) trajectories of that agent.
+    main_data: TrajectoryBatch
+    # list-of-tensors: per rollout advantages with length jT
+    main_advantages: list[torch.FloatTensor] | None = None
+    # list-of-tensors: per rollout matrix (jT, A)
+    alternative_advantages: list[torch.FloatTensor] | None = None
+    # NOTE(review): never assigned in the code visible here; confirm it is still used.
+    advantage_alignment_credits: list[torch.FloatTensor] | None = None
+def get_alternative_chat_histories(
+    agent_id: str, root: RolloutTreeRootNode
+) -> list[list[TrainingChatTurn], list[torch.FloatTensor]]:
+    """
+    Traverse every unilateral branch under ``root`` and collect chat/reward histories.
+    Returns
+    -------
+    alternative_chats:
+        Flattened list of chat turns for each branch (ordered by branch depth).
+    alternative_rewards:
+        Matching list of reward tensors aligned with the chat history.
+    """
+    current_node = root.child
+    branches = current_node.branches
+    pre_branch_chat = []
+    pre_branch_rewards = []
+    alternative_rewards = []
+    alternative_chats = []
+    while current_node is not None:
+        assert isinstance(
+            current_node, RolloutTreeBranchNode
+        ), "Current node should be a branch node."
+        main_node = current_node.main_child
+        branches = current_node.branches
+        current_node = main_node.child
+        # Get the `A` alternative trajectories
+        alternative_nodes = branches[agent_id]
+        for alt_node in alternative_nodes:
+            post_branch_chat, post_branch_rewards = get_main_chat_list_and_rewards(
+                agent_id=agent_id, root=alt_node
+            )
+            branch_chat = pre_branch_chat + post_branch_chat
+            alternative_chats.append(branch_chat)
+            alternative_rewards.append(
+                torch.cat([torch.tensor(pre_branch_rewards), post_branch_rewards])
+            )
+        chat_turns: list[ChatTurn] = main_node.step_log.action_logs[agent_id].chat_turns
+        chat_turns: list[TrainingChatTurn] = [
+            TrainingChatTurn(time_step=main_node.time_step, **turn.model_dump())
+            for turn in chat_turns
+        ]
+        pre_branch_chat.extend(chat_turns)
+        pre_branch_rewards.append(
+            main_node.step_log.simulation_step_log.rewards[agent_id]
+        )
+    return alternative_chats, alternative_rewards
+class TrainerAdAlign(BaseTrainer):
+    """
+    Extends the reinforce trainer to support Advantage Alignment.
+    """
+    def __init__(
+        self,
+        ad_align_beta: float,
+        ad_align_gamma: float,
+        ad_align_exclude_k_equals_t: bool,
+        ad_align_use_sign: bool,
+        ad_align_clipping: float,
+        ad_align_force_coop_first_step: bool,
+        use_old_ad_align: bool,
+        use_time_regularization: bool,
+        rloo_branch: bool,
+        reuse_baseline: bool,
+        ad_align_beta_anneal_step: int = -1,
+        ad_align_beta_anneal_rate: float = 0.5,
+        min_ad_align_beta: float = 0.1,
+        mean_normalize_ad_align: bool = False,
+        whiten_adalign_advantages: bool = False,
+        whiten_adalign_advantages_time_step_wise: bool = False,
+        ad_align_discount_t: bool = False,
+        *args,
+        **kwargs,
+    ):
+        """
+        Initialize the advantage alignment trainer.
+        Args:
+            ad_align_beta: Beta parameter for the advantage alignment. May be
+                annealed during training (see ``ad_align_beta_anneal_step``).
+            ad_align_gamma: Gamma parameter for the advantage alignment.
+            ad_align_exclude_k_equals_t: Whether to exclude the k = t term in the
+                advantage alignment (presumably; the flag is only forwarded as
+                ``exclude_k_equals_t`` -- confirm in ``get_advantage_alignment_credits``).
+            ad_align_use_sign: Whether to use sign in the advantage alignment.
+            ad_align_clipping: Clipping value for the advantage alignment.
+            ad_align_force_coop_first_step: Whether to force coop on the first step of the advantage alignment.
+            use_old_ad_align, use_time_regularization, rloo_branch, reuse_baseline,
+            mean_normalize_ad_align, whiten_adalign_advantages,
+            whiten_adalign_advantages_time_step_wise: Stored as-is and forwarded
+                under the same names to ``get_advantage_alignment_credits``.
+            ad_align_discount_t: Forwarded to ``get_advantage_alignment_credits``
+                as ``discount_t``.
+            ad_align_beta_anneal_step: When > 0, beta is annealed each time the
+                largest rollout id of a batch plus one is a multiple of this value.
+            ad_align_beta_anneal_rate: Factor beta is multiplied by when annealed.
+            min_ad_align_beta: Lower bound applied to beta after annealing.
+            *args, **kwargs: Passed through to ``BaseTrainer.__init__``.
+        """
+        super().__init__(*args, **kwargs)
+        self.ad_align_beta = ad_align_beta
+        self.ad_align_gamma = ad_align_gamma
+        self.ad_align_exclude_k_equals_t = ad_align_exclude_k_equals_t
+        self.ad_align_use_sign = ad_align_use_sign
+        self.ad_align_clipping = ad_align_clipping
+        self.ad_align_force_coop_first_step = ad_align_force_coop_first_step
+        self.use_old_ad_align = use_old_ad_align
+        self.use_time_regularization = use_time_regularization
+        self.rloo_branch = rloo_branch
+        self.reuse_baseline = reuse_baseline
+        self.ad_align_beta_anneal_step = ad_align_beta_anneal_step
+        self.ad_align_beta_anneal_rate = ad_align_beta_anneal_rate
+        self.min_ad_align_beta = min_ad_align_beta
+        # Remembers the step at which beta was last annealed so that it is not
+        # annealed twice for the same step.
+        self.past_ad_align_step = -1
+        self.mean_normalize_ad_align = mean_normalize_ad_align
+        self.whiten_adalign_advantages = whiten_adalign_advantages
+        self.whiten_adalign_advantages_time_step_wise = (
+            whiten_adalign_advantages_time_step_wise
+        )
+        self.ad_align_discount_t = ad_align_discount_t
+        # Per-agent data filled by ``set_agent_trajectory_data``.
+        self.training_data: dict[AgentId, AdAlignTrainingData] = {}
+        # One "mgid:<rollout id>_agent_id:<agent id>" entry per processed rollout.
+        self.debug_path_list: list[str] = []
+    def set_agent_trajectory_data(
+        self, agent_id: str, roots: list[RolloutTreeRootNode]
+    ):
+        """
+        Materialize main and alternative trajectory tensors used by the advantage-alignment trainer.
+        """
+        B = len(roots)  # Number of rollouts
+        # For main rollouts
+        batch_rollout_ids = []
+        batch_crn_ids = []
+        batch_input_ids = []
+        batch_action_mask = []
+        batch_entropy_mask = []
+        batch_timesteps = []
+        batch_state_ends_mask = []
+        batch_engine_log_probs = []
+        batch_rewards = []
+        # For alternative actions rollouts
+        batch_branching_time_steps = []
+        alternative_batch_input_ids = []
+        alternative_batch_action_mask = []
+        alternative_batch_entropy_mask = []
+        alternative_batch_timesteps = []
+        alternative_batch_state_ends_mask = []
+        alternative_batch_engine_log_probs = []
+        alternative_batch_rewards = []
+        jT_list = []
+        try:
+            A = len(roots[0].child.branches[agent_id])  # Number of alternative actions
+        except:
+            A = 0
+        for root in roots:
+            rollout_id = root.id
+            self.debug_path_list.append(
+                "mgid:" + str(rollout_id) + "_agent_id:" + agent_id
+            )
+            # Get main trajectory
+            batch_rollout_ids.append(rollout_id)
+            batch_crn_ids.append(root.crn_id)
+            main_chat, main_rewards = get_main_chat_list_and_rewards(
+                agent_id=agent_id, root=root
+            )
+            (
+                input_ids,
+                action_mask,
+                entropy_mask,
+                timesteps,
+                state_ends_mask,
+                engine_log_probs,
+            ) = process_training_chat(
+                tokenizer=self.tokenizer,
+                chat_history=main_chat,
+                entropy_mask_regex=self.entropy_mask_regex,
+                exploration_prompts_to_remove=self.exploration_prompts_to_remove,
+            )
+            batch_input_ids.append(input_ids)
+            batch_action_mask.append(action_mask)
+            batch_entropy_mask.append(entropy_mask)
+            batch_timesteps.append(timesteps)
+            batch_state_ends_mask.append(state_ends_mask)
+            batch_engine_log_probs.append(engine_log_probs)
+            batch_rewards.append(main_rewards)
+            jT = (
+                main_rewards.numel()
+            )  # Number of timesteps inferred from reward tensor length.
+            jT_list.append(jT)
+            if A > 0:
+                # We get the branching time steps for each of the `jT` time steps in the main trajectory.
+                branching_time_steps = [bt for item in range(jT) for bt in A * [item]]
+                batch_branching_time_steps.extend(branching_time_steps)
+                # Get all of the (jT*A) alternative trajectories in the tree
+                # (jT is the number of time steps in the main trajectory, A is the number of alternative actions)
+                alternative_chats, alternative_rewards = get_alternative_chat_histories(
+                    agent_id=agent_id, root=root
+                )
+                assert (
+                    len(alternative_chats) == A * jT
+                ), "Incorrect number of alternative trajectories."
+                for chat, rewards in zip(alternative_chats, alternative_rewards):
+                    (
+                        input_ids,
+                        action_mask,
+                        entropy_mask,
+                        timesteps,
+                        state_ends_mask,
+                        engine_log_probs,
+                    ) = process_training_chat(
+                        tokenizer=self.tokenizer,
+                        chat_history=chat,
+                        entropy_mask_regex=self.entropy_mask_regex,
+                        exploration_prompts_to_remove=self.exploration_prompts_to_remove,
+                    )
+                    alternative_batch_input_ids.append(input_ids)
+                    alternative_batch_action_mask.append(action_mask)
+                    alternative_batch_entropy_mask.append(entropy_mask)
+                    alternative_batch_timesteps.append(timesteps)
+                    alternative_batch_state_ends_mask.append(state_ends_mask)
+                    alternative_batch_engine_log_probs.append(engine_log_probs)
+                    alternative_batch_rewards.append(rewards)
+        jT_list = torch.Tensor(jT_list)
+        # Assert that number of alternative actions is constant
+        # assert len(set(nb_alternative_actions)) == 1, "Number of alternative actions must be constant"
+        # A = nb_alternative_actions[0]
+        trajectory_batch = TrajectoryBatch(
+            rollout_ids=torch.tensor(batch_rollout_ids, dtype=torch.int32),  # (B,)
+            crn_ids=torch.tensor(batch_crn_ids, dtype=torch.int32),
+            agent_ids=[agent_id] * len(batch_rollout_ids),
+            batch_input_ids=batch_input_ids,
+            batch_action_mask=batch_action_mask,
+            batch_entropy_mask=batch_entropy_mask,
+            batch_timesteps=batch_timesteps,
+            batch_state_ends_mask=batch_state_ends_mask,
+            batch_engine_log_probs=batch_engine_log_probs,
+            batch_rewards=batch_rewards,
+        )
+        # Get Advantages & Train Critic
+        with resource_logger_context(
+            logger, "Get advantages with critic gradient accumulation"
+        ):
+            self.batch_advantages: torch.FloatTensor = (
+                self.get_advantages_with_critic_gradient_accumulation(trajectory_batch)
+            )  # (B, jT)
+        if A > 0:
+            # Here, `A` is the number of alternative actions / trajectories taken at each time step.
+            # For each of the `B` rollout perspectives, at each of its jT (`j` is for jagged, since each main rollout may be of a different length) steps, we take A alternate trajectories (from different actions).
+            # Therefore, we have ∑jT * A trajectories to process. If each of the main trajectories have T steps, we will have `B*T*A` to process.
+            with resource_logger_context(logger, "Create alternative trajectory batch"):
+                sum_jT = int(torch.sum(jT_list).item())
+                jT_list = (
+                    jT_list.int().tolist()
+                )  # (jT,) # (we only want the advantages where we branched out)
+                alternative_trajectory_batch = TrajectoryBatch(
+                    rollout_ids=torch.zeros(A * sum_jT, dtype=torch.int32),
+                    crn_ids=torch.zeros(A * sum_jT, dtype=torch.int32),
+                    agent_ids=[agent_id] * (A * sum_jT),
+                    batch_input_ids=alternative_batch_input_ids,
+                    batch_action_mask=alternative_batch_action_mask,
+                    batch_entropy_mask=alternative_batch_entropy_mask,
+                    batch_timesteps=alternative_batch_timesteps,
+                    batch_state_ends_mask=alternative_batch_state_ends_mask,
+                    batch_engine_log_probs=alternative_batch_engine_log_probs,
+                    batch_rewards=alternative_batch_rewards,
+                )
+            # Get alternative advantages
+            # BAAs stands for batch alternative advantages
+            # (torch nested tensors have very little api support, so we have to do some odd manual work here)
+            with resource_logger_context(
+                logger, "Compute alternative advantage estimates"
+            ):
+                BAAs_list = self.get_advantages_with_critic_gradient_accumulation(
+                    alternative_trajectory_batch
+                )  # list length (∑jT * A), each (jT',)
+                # Pad alternative advantages to (∑jT*A, P)
+                BAAs_padded = pad_sequence(
+                    BAAs_list, batch_first=True, padding_value=0.0
+                )
+                branch_idx = torch.tensor(
+                    batch_branching_time_steps,
+                    device=BAAs_padded.device,
+                    dtype=torch.long,
+                )
+                gathered = BAAs_padded.gather(
+                    dim=1, index=branch_idx.unsqueeze(1)
+                ).squeeze(1)
+                # Reshape and split per rollout, then transpose to (jT_i, A)
+                gathered = gathered.view(A, sum_jT)  # (A, ∑jT)
+                blocks = list(
+                    torch.split(gathered, jT_list, dim=1)
+                )  # len B, shapes (A, jT_i)
+                BAAs = [
+                    blk.transpose(0, 1).contiguous() for blk in blocks
+                ]  # list of (jT_i, A)
+        if self.ad_align_beta_anneal_step > 0:
+            max_rollout_id = torch.max(trajectory_batch.rollout_ids) + 1
+            if (
+                max_rollout_id % self.ad_align_beta_anneal_step == 0
+                and self.past_ad_align_step != max_rollout_id
+            ):
+                self.ad_align_beta = max(
+                    self.ad_align_beta * self.ad_align_beta_anneal_rate,
+                    self.min_ad_align_beta,
+                )
+                logger.info(f"Annealing ad_align_beta to {self.ad_align_beta}")
+                self.past_ad_align_step = max_rollout_id
+        self.training_data[agent_id] = AdAlignTrainingData(
+            agent_id=agent_id,
+            main_data=trajectory_batch,
+            main_advantages=self.batch_advantages,
+            alternative_advantages=BAAs if A > 0 else None,
+        )
+    def share_advantage_data(self) -> list[AdvantagePacket]:
+        """
+        Share the advantage alignment data with other agents.
+        Returns:
+            AdvantagePacket: The advantage packet containing the agent's advantages.
+        """
+        logger.info(f"Sharing advantage alignment data.")
+        advantage_packets = []
+        for _, agent_data in self.training_data.items():
+            advantage_packets.append(
+                AdvantagePacket(
+                    agent_id=agent_data.agent_id,
+                    rollout_ids=agent_data.main_data.rollout_ids,
+                    main_advantages=agent_data.main_advantages,
+                )
+            )
+        return advantage_packets
+    def receive_advantage_data(self, advantage_packets: list[AdvantagePacket]):
+        """
+        Receive advantage packets from other players.
+        These contain the advantages of the other players' rollouts estimated by them.
+        For every agent in ``self.training_data`` the co-agent advantages are
+        matched by rollout id, advantage-alignment credits are computed, and the
+        agent's entry is replaced by its ``TrajectoryBatch`` with the credits
+        attached as ``batch_credits``.
+        """
+        logger.info(f"Receiving advantage packets.")
+        assert (
+            len(advantage_packets) > 0
+        ), "At least one advantage packet must be provided."
+        for agent_id, agent_data in self.training_data.items():
+            # Packets emitted by this very agent are ignored.
+            coagent_advantage_packets = [
+                packet for packet in advantage_packets if packet.agent_id != agent_id
+            ]
+            agent_rollout_ids = agent_data.main_data.rollout_ids
+            agent_advantages = agent_data.main_advantages
+            co_agent_advantages = []
+            # For each of our rollouts, take the advantages of the first co-agent
+            # packet that contains the same rollout id.
+            for rollout_id in agent_rollout_ids:
+                for co_agent_packet in coagent_advantage_packets:
+                    if rollout_id in co_agent_packet.rollout_ids:
+                        index = torch.where(rollout_id == co_agent_packet.rollout_ids)[
+                            0
+                        ].item()
+                        co_agent_advantages.append(
+                            co_agent_packet.main_advantages[index]
+                        )
+                        # assumes that its two player game, with one co-agent
+                        break
+            # Fails when some rollout has no matching co-agent packet.
+            assert len(co_agent_advantages) == len(agent_advantages)
+            B = len(agent_advantages)
+            assert all(
+                a.shape[0] == b.shape[0]
+                for a, b in zip(co_agent_advantages, agent_advantages)
+            ), "Number of advantages must match for advantage alignment."
+            # Get padded tensors (advantage alignment is invariant to padding)
+            lengths = torch.tensor(
+                [len(t) for t in agent_advantages],
+                device=self.device,
+                dtype=torch.long,
+            )
+            padded_main_advantages = pad_sequence(
+                agent_advantages, batch_first=True, padding_value=0.0
+            )
+            # ``alternative_advantages`` is None (or empty) when no branches were rolled out.
+            if agent_data.alternative_advantages:
+                padded_alternative_advantages = pad_sequence(
+                    agent_data.alternative_advantages,
+                    batch_first=True,
+                    padding_value=0.0,
+                )  # (B, P, A)
+            else:
+                padded_alternative_advantages = None
+            padded_co_agent_advantages = pad_sequence(
+                co_agent_advantages, batch_first=True, padding_value=0.0
+            )
+            # Create training batch data
+            credits, sub_tensors = get_advantage_alignment_credits(
+                a1=padded_main_advantages,
+                a1_alternative=padded_alternative_advantages,
+                a2=padded_co_agent_advantages,
+                beta=self.ad_align_beta,
+                gamma=self.ad_align_gamma,
+                exclude_k_equals_t=self.ad_align_exclude_k_equals_t,
+                use_sign=self.ad_align_use_sign,
+                clipping=self.ad_align_clipping,
+                force_coop_first_step=self.ad_align_force_coop_first_step,
+                use_old_ad_align=self.use_old_ad_align,
+                use_time_regularization=self.use_time_regularization,
+                rloo_branch=self.rloo_branch,
+                reuse_baseline=self.reuse_baseline,
+                mean_normalize_ad_align=self.mean_normalize_ad_align,
+                whiten_adalign_advantages=self.whiten_adalign_advantages,
+                whiten_adalign_advantages_time_step_wise=self.whiten_adalign_advantages_time_step_wise,
+                discount_t=self.ad_align_discount_t,
+            )
+            # Log every intermediate tensor returned alongside the credits.
+            for key, value in sub_tensors.items():
+                self.rollout_tally.add_metric(
+                    path=[key],
+                    rollout_tally_item=RolloutTallyItem(
+                        crn_ids=agent_data.main_data.crn_ids,
+                        rollout_ids=agent_data.main_data.rollout_ids,
+                        agent_ids=agent_data.main_data.agent_ids,
+                        metric_matrix=value,
+                    ),
+                )
+            if not self.skip_discounted_state_visitation:
+                credits = get_discounted_state_visitation_credits(
+                    credits,
+                    self.discount_factor,
+                )
+                # NOTE(review): the discounted credits were just stored in
+                # ``credits``, yet the metric below is read from ``sub_tensors``.
+                # Confirm that ``get_advantage_alignment_credits`` provides the
+                # "discounted_state_visitation_credits" key; otherwise this raises
+                # KeyError and ``credits`` is probably what was meant.
+                self.rollout_tally.add_metric(
+                    path=["discounted_state_visitation_credits"],
+                    rollout_tally_item=RolloutTallyItem(
+                        crn_ids=agent_data.main_data.crn_ids,
+                        rollout_ids=agent_data.main_data.rollout_ids,
+                        agent_ids=agent_data.main_data.agent_ids,
+                        metric_matrix=sub_tensors[
+                            "discounted_state_visitation_credits"
+                        ],
+                    ),
+                )
+            # Slice back to jagged
+            advantage_alignment_credits = [credits[i, : lengths[i]] for i in range(B)]
+            # Replace stored training data for this agent by the concrete trajectory batch
+            # and attach the computed credits for policy gradient.
+            # (From here on the entry is a TrajectoryBatch, no longer an AdAlignTrainingData.)
+            self.training_data[agent_id] = agent_data.main_data
+            self.training_data[agent_id].batch_credits = advantage_alignment_credits

src_code_for_reproducibility/utils/dict_get_path.py ADDED Viewed

	@@ -0,0 +1,17 @@

+"""
+File: mllm/utils/dict_get_path.py
+Summary: Retrieves nested dictionary values using a single key or a list of keys.
+"""
+def get_from_nested_dict(a: dict, path) -> any:
+    # path is string or list of string
+    try:
+        if isinstance(path, str):
+            return a[path]
+        else:
+            for p in path:
+                a = a[p]
+            return a
+    except Exception:
+        return None

src_code_for_reproducibility/utils/gather_training_stats.py ADDED Viewed

	@@ -0,0 +1,262 @@

+"""
+File: mllm/utils/gather_training_stats.py
+Summary: Aggregates training statistics from rollouts and exports artifacts.
+"""
+import copy
+import csv
+import gc
+import json
+import logging
+import os
+import pickle
+import random
+import re
+import subprocess
+import sys
+import time
+from datetime import datetime
+from statistics import mean
+from typing import Any, Dict
+import hydra
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import torch
+from omegaconf import OmegaConf
+from mllm.training.tally_metrics import Tally
+from mllm.utils.stat_pack import StatPack
+def get_from_nested_dict(dictio: dict, path: list[str]):
+    for sp in path[:-1]:
+        dictio = dictio[sp]
+    return dictio.get(path[-1])
+def set_at_path(dictio: dict, path: list[str], value):
+    for sp in path[:-1]:
+        if sp not in dictio:
+            dictio[sp] = {}
+        dictio = dictio[sp]
+    dictio[path[-1]] = value
+def produce_tabular_render(inpath: str, outpath: str = None):
+    """
+    Convert a JSON metrics dump into per-rollout CSV tables for easier inspection.
+    """
+    with open(inpath, "r") as f:
+        data = json.load(f)
+    rollout_paths = data.keys()
+    for rollout_path in rollout_paths:
+        if outpath is None:
+            m_path = rollout_path.replace("/", "|")
+            m_path = m_path.replace(".json", "")
+            m_path = (
+                os.path.split(inpath)[0]
+                + "/contextualized_tabular_renders/"
+                + m_path
+                + "_tabular_render.render.csv"
+            )
+        # import pdb; pdb.set_trace()
+        os.makedirs(os.path.split(m_path)[0], exist_ok=True)
+        metrics = data[rollout_path]
+        d = {k: [] for k in metrics[0].keys()}
+        for m in metrics:
+            for k, v in m.items():
+                d[k].append(v)
+        d = pd.DataFrame(d)
+        d.to_csv(m_path)
def get_metric_paths(data: list[dict]):
    """
    List the key paths of every leaf (non-dict value) in ``data[0]``.

    Only the first element of ``data`` is inspected. Paths are returned in
    depth-first order following each dictionary's key order.
    """

    def _walk(node, trail):
        # Yield one key path per leaf found below ``node``.
        for key, value in node.items():
            branch = trail + [key]
            if isinstance(value, dict):
                yield from _walk(value, branch)
            else:
                yield branch

    return list(_walk(data[0], []))
def print_metric_paths(data: list[dict]):
    """Print each metric key path found by ``get_metric_paths``, one per line."""
    for metric_path in get_metric_paths(data):
        print(metric_path)
def get_metric_iteration_list(data: list[dict], metric_path: list[str]):
    """
    Collect the value stored at ``metric_path`` from every entry of ``data``.

    A bare string is accepted as a single-key path. Entries whose leaf key is
    missing contribute ``None`` (see ``get_from_nested_dict``).
    """
    path = [metric_path] if isinstance(metric_path, str) else metric_path
    return [get_from_nested_dict(entry, path) for entry in data]
def to_1d_numeric(x):
    """
    Flatten ``x`` into a 1-D float array, or return ``None`` if it is not numeric.

    Accepted inputs are Python/numpy scalars, numpy arrays, and (possibly
    nested) lists or tuples of those. Non-numeric members of a list/tuple are
    skipped; a list/tuple with no numeric content yields ``None``.
    """
    if isinstance(x, np.ndarray):
        try:
            return x.astype(float).ravel()
        except Exception:
            # Arrays that cannot be cast to float are treated as non-numeric.
            return None
    if isinstance(x, (int, float, np.number)):
        return np.array([float(x)], dtype=float)
    if not isinstance(x, (list, tuple)):
        return None
    converted = (to_1d_numeric(item) for item in x)
    chunks = [chunk for chunk in converted if chunk is not None and chunk.size > 0]
    return np.concatenate(chunks) if chunks else None
def get_single_metric_vector(data, metric_path, iterations=None):
    """
    Concatenate the values found at ``metric_path`` across ``data`` into one vector.

    Args:
        data: Iterable of (nested) metric dictionaries.
        metric_path: Key path to read from each entry; a bare string is
            treated as a single-key path.
        iterations: Accepted for backward compatibility; it has never
            influenced the result and is ignored.

    Returns:
        A 1-D float array with every numeric value found, in order. Entries
        that are missing or non-numeric are skipped; the array is empty when
        nothing numeric is found.
    """
    if isinstance(metric_path, str):
        metric_path = [metric_path]
    vecs = []
    for entry in data:
        arr = to_1d_numeric(get_from_nested_dict(entry, metric_path))
        if arr is not None:
            vecs.append(arr)
    return np.concatenate(vecs) if vecs else np.empty(0, dtype=float)
def _load_metrics_file(file_path: str):
    """
    Unpickle and return the object stored in a ``.pkl`` metrics file.

    Raises:
        ValueError: If ``file_path`` does not end with ``.pkl``.

    NOTE: unpickling can execute arbitrary code, so only load trusted files.
    """
    if not file_path.endswith((".tally.pkl", ".pkl")):
        raise ValueError("Only *.tally.pkl files are supported.")
    with open(file_path, "rb") as handle:
        return pickle.load(handle)
def get_leaf_items(array_tally: dict, prefix: list[str] = None):
    """
    Lazily yield ``(path, value)`` for every leaf of a nested dictionary.

    ``path`` is the list of keys leading to the leaf, each converted with
    ``str`` and preceded by ``prefix`` when one is given. Leaves are produced
    in depth-first order following each dictionary's key order.
    """
    root_trail = [] if prefix is None else prefix
    # Each stack frame is a partially consumed items() iterator together with
    # the key path of the dictionary it belongs to.
    pending = [(iter(array_tally.items()), root_trail)]
    while pending:
        entries, trail = pending[-1]
        for name, node in entries:
            path = trail + [str(name)]
            if isinstance(node, dict):
                # Descend now; the parent iterator is resumed afterwards.
                pending.append((iter(node.items()), path))
                break
            yield path, node
        else:
            pending.pop()
def _sanitize_filename_part(part: str) -> str:
    """Return ``part`` with every "/" replaced by "|" and every space by "_"."""
    return part.replace("/", "|").replace(" ", "_")
def render_rt_tally_pkl_to_csvs(pkl_path: str, outdir: str):
    """
    Render a pickled rollout tally into one CSV file per metric (tokenwise logging).

    The pickle holds either the tally dictionary itself or a dictionary with
    an ``"array_tally"`` entry. Every leaf of that nested dictionary is a
    sequence of tally items exposing ``agent_ids``, ``crn_ids``,
    ``rollout_ids`` and ``metric_matrix``. Each row of ``metric_matrix``
    becomes one CSV row prefixed by its agent, crn and rollout ids.

    Args:
        pkl_path: Path of the ``*.rt_tally.pkl`` file to read.
        outdir: Directory receiving the CSV files (created if needed).

    NOTE: unpickling can execute arbitrary code, so only load trusted files.
    """
    with open(pkl_path, "rb") as f:
        payload = pickle.load(f)
    # Backward compatibility: older tallies stored the dict directly
    if isinstance(payload, dict) and "array_tally" in payload:
        array_tally = payload.get("array_tally", {})
    else:
        array_tally = payload
    os.makedirs(outdir, exist_ok=True)
    trainer_id = os.path.basename(pkl_path).replace(".rt_tally.pkl", "")
    for path_list, rollout_tally_items in get_leaf_items(array_tally):
        # Nothing recorded for this metric: skip it rather than failing on
        # the first-item lookup and leaving an empty file behind. len() is
        # used because truthiness is ambiguous for array-like containers.
        if len(rollout_tally_items) == 0:
            continue
        path_part = ".".join(_sanitize_filename_part(p) for p in path_list)
        filename = f"{trainer_id}__{path_part}.render.csv"
        out_path = os.path.join(outdir, filename)
        with open(out_path, "w", newline="") as f:
            writer = csv.writer(f)
            # Rows are flattened below, so the header needs one column per
            # element of a row: the product of all dimensions after the
            # first (1 for a 1-D matrix, shape[1] for a 2-D one).
            first_shape = tuple(rollout_tally_items[0].metric_matrix.shape)
            metric_cols = int(np.prod(first_shape[1:]))
            header = ["agent_id", "crn_id", "rollout_id"] + [
                f"t_{i}" for i in range(metric_cols)
            ]
            writer.writerow(header)
            for rollout_tally_item in rollout_tally_items:
                crn_ids = rollout_tally_item.crn_ids
                rollout_ids = rollout_tally_item.rollout_ids
                agent_ids = rollout_tally_item.agent_ids
                metric_matrix = rollout_tally_item.metric_matrix
                for i in range(metric_matrix.shape[0]):
                    row_vals = metric_matrix[i].reshape(-1)
                    # Convert to a plain list so it can be appended to the
                    # id prefix without array concatenation semantics.
                    row_vals = (
                        row_vals.tolist()
                        if hasattr(row_vals, "tolist")
                        else list(row_vals)
                    )
                    row_prefix = [agent_ids[i], crn_ids[i], rollout_ids[i]]
                    writer.writerow(row_prefix + row_vals)
def tally_to_stat_pack(tally: Dict[str, Any]):
    """
    Build a ``StatPack`` from a tally dictionary.

    When ``tally`` has an ``"array_tally"`` entry (backward-compatible
    format, to be removed later), that nested dictionary is flattened first:
    each leaf becomes one stat named by its key path joined with "_" and
    valued by ``np.mean`` of the leaf. Otherwise every top-level item of
    ``tally`` is added as-is.
    """
    stat_pack = StatPack()
    if "array_tally" in tally:

        def _iter_leaves(node, trail):
            # Yield (key path, value) for every non-dict value below node.
            for key, value in node.items():
                # Keys are stringified so that non-string keys can be joined
                # into a stat name.
                path = trail + [str(key)]
                if isinstance(value, dict):
                    yield from _iter_leaves(value, path)
                else:
                    yield path, value

        tally = {
            "_".join(path): np.mean(value)
            for path, value in _iter_leaves(tally["array_tally"], [])
        }
    for key, value in tally.items():
        stat_pack.add_stat(key, value)
    return stat_pack

src_code_for_reproducibility/utils/resource_context.py ADDED Viewed

	@@ -0,0 +1,83 @@

+"""
+File: mllm/utils/resource_context.py
+Summary: Tracks system resource usage via a context manager.
+"""
+import logging
+import time
+from contextlib import contextmanager
+import torch
def vram_usage():
    """
    Describe the CUDA memory currently allocated and reserved on every visible GPU.

    Returns:
        One "GPU i: ..." report per device, joined with "; ". The string is
        empty when no CUDA device is visible.
    """
    bytes_per_gb = 1024**3
    reports = []
    for i in range(torch.cuda.device_count()):
        gpu_memory_allocated = torch.cuda.memory_allocated(i) / bytes_per_gb
        gpu_memory_reserved = torch.cuda.memory_reserved(i) / bytes_per_gb
        reports.append(
            f"GPU {i}: Memory Allocated: {gpu_memory_allocated:.2f} GB, Memory Reserved: {gpu_memory_reserved:.2f} GB"
        )
    # An explicit separator keeps multi-GPU reports from running together.
    return "; ".join(reports)
def ram_usage():
    """Return the current process's resident set size as a "RAM Usage: X GB" string."""
    import psutil

    rss_bytes = psutil.Process().memory_info().rss
    # Convert bytes to GB
    return f"RAM Usage: {rss_bytes / (1024**3):.2f} GB"
@contextmanager
def resource_logger_context(logger: logging.Logger, task_description: str):
    """
    Context manager to log the resource usage of the current task.

    On exit (also when the block raises) one INFO record is logged with the
    change in CUDA memory on device 0, the current and peak share of device
    memory, and the elapsed wall-clock time. When CUDA is not available only
    the elapsed time is logged.

    Args:
        logger: The logger to use to log the resource usage.
        task_description: The description of the task to log.
    Yields:
        None
    """
    # Setup runs before the try block so that a failure here surfaces
    # directly instead of being masked by a second error in the finally.
    cuda_available = torch.cuda.is_available()
    initial_time = time.time()
    if cuda_available:
        # Only device 0 is tracked.
        total_mem_bytes = torch.cuda.get_device_properties(0).total_memory
        # NOTE(review): reserved memory presumably already includes allocated
        # memory, so this sum may double count; confirm the intended metric.
        initial_total_bytes = torch.cuda.memory_allocated(
            0
        ) + torch.cuda.memory_reserved(0)
        torch.cuda.reset_peak_memory_stats(0)
    try:
        yield None
    finally:
        final_time = time.time()
        # NOTE: the HH:MM:SS rendering wraps around after 24 hours.
        delta_time_str = time.strftime(
            "%H:%M:%S", time.gmtime(final_time - initial_time)
        )
        if not cuda_available:
            logger.info(
                f"For task: {task_description}, CUDA unavailable, ΔTime: {delta_time_str}"
            )
        else:
            # Ensure kernels within the block are accounted for
            torch.cuda.synchronize()
            final_allocated_bytes = torch.cuda.memory_allocated(0)
            final_reserved_bytes = torch.cuda.memory_reserved(0)
            final_total_bytes = final_allocated_bytes + final_reserved_bytes
            delta_vram_percent_total = (
                100 * (final_total_bytes - initial_total_bytes) / total_mem_bytes
                if total_mem_bytes
                else 0.0
            )
            current_percent_vram_taken = (
                100 * final_total_bytes / total_mem_bytes if total_mem_bytes else 0.0
            )
            block_peak_percent = (
                100 * torch.cuda.max_memory_allocated(0) / total_mem_bytes
                if total_mem_bytes
                else 0.0
            )
            logger.info(
                f"For task: {task_description}, ΔVRAM % (total): {delta_vram_percent_total:.2f}%, Current % of VRAM taken: {current_percent_vram_taken:.2f}%, Block Peak % of device VRAM: {block_peak_percent:.2f}%, ΔTime: {delta_time_str}"
            )

src_code_for_reproducibility/utils/rollout_tree_chat_htmls.py ADDED Viewed

	@@ -0,0 +1,1597 @@

+"""
+File: mllm/utils/rollout_tree_chat_htmls.py
+Summary: Renders rollout tree chat transcripts into HTML artifacts.
+"""
+from pathlib import Path
+from typing import List
+from mllm.utils.rollout_tree_gather_utils import *
+def html_from_chat_turns(chat_turns: List[ChatTurnLog]) -> str:
+    """
+    Render chat turns as a single, wrapping sequence of messages in time order.
+    Keep badge and message bubble styles, include time on every badge and
+    include rewards on assistant badges. Each message is individually
+    hide/show by click; when hidden, only the badge remains and "(...)" is
+    shown inline (not inside a bubble).
+    """
+    import html
+    import re as _re
+    # Prepare ordering: sort by (time_step, original_index) to keep stable order within same step
+    indexed_turns = list(enumerate(chat_turns))
+    indexed_turns.sort(key=lambda t: (t[1].time_step, t[0]))
+    # Get unique agent IDs and sort alphabetically for consistent assignment
+    # Agent with alphabetically lower name gets agent-0 (left, green)
+    # Agent with alphabetically higher name gets agent-1 (right, orange)
+    unique_agent_ids = sorted(
+        set(turn.agent_id for turn in chat_turns if turn.role == "assistant")
+    )
+    agent_id_to_index = {aid: idx for idx, aid in enumerate(unique_agent_ids)}
+    # CSS styles (simplified layout; no time-step or agent-column backgrounds)
+    css = """
+    <style>
+        :root {
+            --font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+            --bg: #ffffff;
+            --text: #1c0b00;
+            --muted-text: #2C3E50;
+            --accent-muted: #BDC3C7;
+            --accent-muted-2: #D0D7DE;
+            --panel-bg: #F8FAFC;
+            --reward-color: #3a2e00; /* dark text for reward pill */
+            --font-size: 14px;
+            --border-width: 2px;
+            --corner-radius: 6px;
+            --pill-radius-left: 999px 0 0 999px;
+            --pill-radius-right: 0 999px 999px 0;
+            --inset-shadow: 0 1px 0 rgba(0,0,0,0.03) inset;
+            /* Chat View Colors */
+            --agent-0-bg: #dcf8c6;
+            --agent-0-border: #0eb224;
+            --agent-1-bg: #ffe4cc;
+            --agent-1-border: #ef8323;
+            --user-bg: #f5f5f5;
+            --chat-bg: #ffffff;
+        }
+        body {
+            font-family: var(--font-family);
+            margin: 12px;
+            background-color: var(--bg);
+            color: var(--text);
+            font-size: var(--font-size);
+            line-height: 1.5;
+        }
+        /* Chat View Styles */
+        #flow-chat {
+            max-width: 900px;
+            margin: 0 auto;
+            background: var(--chat-bg);
+            padding: 12px 16px 12px 8px;
+            border-radius: 8px;
+        }
+        .simultaneous-messages {
+            display: flex !important;
+            flex-direction: row !important;
+            flex-wrap: nowrap !important;
+            gap: 8px;
+            margin-bottom: 4px;
+            align-items: flex-start;
+            width: 100%;
+            overflow: hidden;
+            box-sizing: border-box;
+        }
+        .simultaneous-messages .chat-message {
+            flex: 1 1 0 !important;
+            margin-bottom: 0 !important;
+            display: flex !important;
+            flex-direction: row !important;
+            align-items: flex-start !important;
+            margin-left: 0 !important;
+            min-width: 0 !important;
+            max-width: 50% !important;
+            gap: 0 !important;
+            overflow: hidden !important;
+        }
+        .simultaneous-messages .chat-message-content {
+            max-width: 100% !important;
+            width: 100%;
+            align-items: flex-start !important;
+            margin-left: 0 !important;
+            overflow: hidden !important;
+        }
+        .simultaneous-messages .chat-message.agent-0 {
+            justify-content: flex-start !important;
+        }
+        .simultaneous-messages .chat-message.agent-1 {
+            justify-content: flex-end !important;
+        }
+        .simultaneous-messages .chat-message.agent-0 .chat-message-content {
+            margin-left: 0 !important;
+            align-items: flex-start !important;
+        }
+        .simultaneous-messages .chat-message.agent-1 .chat-message-content {
+            margin-left: auto !important;
+            margin-right: 0 !important;
+            align-items: flex-end !important;
+        }
+        .simultaneous-messages .chat-bubble {
+            max-width: 100%;
+            word-break: break-word;
+            overflow-wrap: break-word;
+            box-sizing: border-box;
+        }
+        .simultaneous-messages .chat-message.agent-0 .chat-bubble {
+            border-radius: 10px;
+        }
+        .simultaneous-messages .chat-message.agent-1 .chat-bubble {
+            border-radius: 10px;
+        }
+        .simultaneous-messages .chat-message.agent-0 .chat-header {
+            justify-content: flex-start;
+            flex-shrink: 0;
+        }
+        .simultaneous-messages .chat-message.agent-1 .chat-header {
+            justify-content: flex-end;
+            flex-shrink: 0;
+        }
+        .simultaneous-messages .chat-reasoning {
+            max-width: 100%;
+            overflow-wrap: break-word;
+        }
+        /* Styling for user prompts in simultaneous-messages */
+        .simultaneous-messages .chat-message.role-user {
+            flex: 1 1 0 !important;
+            margin-bottom: 0 !important;
+            display: flex !important;
+            opacity: 0.7;
+            cursor: pointer;
+        }
+        .simultaneous-messages .chat-message.role-user:hover {
+            opacity: 1;
+        }
+        .simultaneous-messages .chat-message.role-user.collapsed .chat-bubble {
+            display: none;
+        }
+        .simultaneous-messages .chat-message.role-user.collapsed .chat-header::after {
+            content: ' (collapsed)';
+            font-weight: normal;
+            font-style: italic;
+            color: #999;
+            font-size: 0.9em;
+        }
+        .simultaneous-messages .chat-message.role-user.agent-0 {
+            justify-content: flex-start !important;
+        }
+        .simultaneous-messages .chat-message.role-user.agent-1 {
+            justify-content: flex-end !important;
+        }
+        .simultaneous-messages .chat-message.role-user.agent-0 .chat-message-content {
+            margin-left: 0 !important;
+            align-items: flex-start !important;
+        }
+        .simultaneous-messages .chat-message.role-user.agent-1 .chat-message-content {
+            margin-left: auto !important;
+            margin-right: 0 !important;
+            align-items: flex-end !important;
+        }
+        /* Styling for split-agent-context when wrapped */
+        .simultaneous-messages .split-agent-context {
+            width: 100%;
+            display: flex !important;
+        }
+        .chat-message {
+            display: flex;
+            margin-bottom: 2px;
+            align-items: flex-end;
+            gap: 6px;
+            position: relative;
+            margin-left: 36px;
+        }
+        .chat-message.agent-0 {
+            margin-left: 0;
+        }
+        .chat-message.agent-1 {
+            margin-left: 0;
+        }
+        .chat-message.agent-0::before {
+            left: 0;
+        }
+        .chat-message.agent-1::before {
+            left: 0;
+        }
+        .chat-message.role-user {
+            opacity: 0.7;
+            cursor: pointer;
+        }
+        .chat-message.role-user.collapsed .chat-bubble {
+            display: none;
+        }
+        .chat-message.role-user.collapsed .chat-header::after {
+            content: ' (collapsed)';
+            font-weight: normal;
+            font-style: italic;
+            color: #999;
+            font-size: 0.9em;
+        }
+        .chat-message.role-user:hover {
+            opacity: 1;
+        }
+        .chat-message::before {
+            content: '';
+            position: absolute;
+            left: -36px;
+            top: 0;
+            bottom: 0;
+            width: 36px;
+            pointer-events: auto;
+        }
+        .merge-btn {
+            position: absolute;
+            left: -30px;
+            top: 50%;
+            transform: translateY(-50%);
+            width: 26px;
+            height: 26px;
+            border-radius: 4px;
+            border: 1.5px solid var(--accent-muted);
+            background: white;
+            cursor: pointer;
+            font-size: var(--font-size);
+            opacity: 0;
+            display: flex;
+            align-items: center;
+            justify-content: center;
+            transition: opacity 0.2s ease, transform 0.1s ease;
+            padding: 0;
+            line-height: 1;
+            z-index: 10;
+        }
+        .chat-message:hover .merge-btn,
+        .merge-btn:hover {
+            opacity: 1;
+        }
+        .merge-btn:hover {
+            background: var(--panel-bg);
+            border-color: var(--accent-muted-2);
+            transform: translateY(-50%) scale(1.15);
+            box-shadow: 0 2px 4px rgba(0, 0, 0, 0.15);
+        }
+        .merge-btn:active {
+            transform: translateY(-50%) scale(0.95);
+        }
+        .chat-message.agent-0 .merge-btn {
+            left: -30px;
+        }
+        .chat-message.agent-1 .merge-btn {
+            left: -30px;
+        }
+        .chat-message.role-user .merge-btn {
+            display: none !important;
+        }
+        .simultaneous-messages .merge-btn {
+            opacity: 0 !important;
+            pointer-events: none;
+        }
+        .simultaneous-messages {
+            padding: 6px 0 6px 0 !important;
+            margin-left: 0 !important;
+            margin-right: 0 !important;
+            position: relative !important;
+            background: transparent !important;
+            border-radius: 0 !important;
+            box-sizing: border-box !important;
+            overflow: visible !important;
+            max-width: 100% !important;
+            border: none !important;
+            transition: padding 0.2s ease !important;
+        }
+        .simultaneous-messages:hover {
+            padding-top: 40px !important;
+        }
+        .simultaneous-messages::before {
+            content: '⇅ Merged';
+            position: absolute;
+            left: 0 !important;
+            top: 8px !important;
+            font-size: var(--font-size);
+            font-weight: 500;
+            color: #888;
+            pointer-events: none;
+            opacity: 0;
+            transition: opacity 0.2s ease;
+        }
+        .simultaneous-messages:hover::before {
+            opacity: 1;
+        }
+        .unmerge-btn {
+            position: absolute !important;
+            right: 0 !important;
+            top: 6px !important;
+            width: 36px !important;
+            height: 28px !important;
+            border-radius: 5px !important;
+            border: 2px solid #d63031 !important;
+            background: white !important;
+            cursor: pointer !important;
+            font-size: var(--font-size) !important;
+            font-weight: bold !important;
+            color: #d63031 !important;
+            display: flex !important;
+            align-items: center !important;
+            justify-content: center !important;
+            transition: all 0.2s ease !important;
+            padding: 0 !important;
+            line-height: 1 !important;
+            z-index: 1000 !important;
+            flex: none !important;
+            pointer-events: auto !important;
+            box-shadow: 0 2px 6px rgba(214, 48, 49, 0.3) !important;
+            opacity: 0 !important;
+        }
+        .simultaneous-messages:hover .unmerge-btn {
+            opacity: 1 !important;
+        }
+        .unmerge-btn:hover {
+            background: #ffe5e5 !important;
+            border-color: #b71c1c !important;
+            transform: scale(1.1) !important;
+            box-shadow: 0 3px 8px rgba(214, 48, 49, 0.4) !important;
+        }
+        .unmerge-btn:active {
+            transform: scale(0.95) !important;
+            background: #ffcccc !important;
+        }
+        .chat-message-content {
+            max-width: 72%;
+            display: flex;
+            flex-direction: column;
+            gap: 2px;
+        }
+        .chat-message.agent-0 .chat-message-content {
+            align-items: flex-start;
+        }
+        .chat-message.agent-1 .chat-message-content {
+            align-items: flex-end;
+            margin-left: auto;
+        }
+        .chat-bubble {
+            padding: 6px 10px;
+            border-radius: 10px;
+            word-wrap: break-word;
+            position: relative;
+            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.1);
+            line-height: 1.4;
+        }
+        .chat-message.agent-0 .chat-bubble {
+            background: var(--agent-0-bg);
+            border: 2px solid var(--agent-0-border);
+            border-radius: 10px 10px 10px 2px;
+        }
+        .chat-message.agent-1 .chat-bubble {
+            background: var(--agent-1-bg);
+            border: 2px solid var(--agent-1-border);
+            border-radius: 10px 10px 2px 10px;
+        }
+        .chat-message.role-user .chat-bubble {
+            background: var(--user-bg);
+            border: 2px solid #d0d0d0;
+        }
+        .chat-header {
+            display: flex;
+            align-items: center;
+            gap: 4px;
+            margin-bottom: 2px;
+            font-size: var(--font-size);
+            font-weight: 600;
+            line-height: 1.2;
+        }
+        .chat-message.agent-0 .chat-header {
+            color: var(--agent-0-border);
+        }
+        .chat-message.agent-1 .chat-header {
+            color: var(--agent-1-border);
+        }
+        .chat-timestamp {
+            font-size: var(--font-size);
+            color: var(--muted-text);
+            margin-top: 1px;
+            opacity: 0.75;
+        }
+        .chat-reward {
+            display: inline-flex;
+            align-items: center;
+            background: linear-gradient(90deg, #fffdf2 0%, #ffffff 75%);
+            color: #000000;
+            font-weight: 600;
+            font-size: var(--font-size);
+            padding: 1px 5px;
+            border-radius: 3px;
+            border: 1px solid #f4e6a8;
+            margin-left: 4px;
+            line-height: 1.3;
+        }
+        .chat-reasoning {
+            font-size: var(--font-size);
+            font-style: italic;
+            color: #555;
+            margin-bottom: 2px;
+            padding: 4px 8px;
+            background: rgba(0, 0, 0, 0.03);
+            border-radius: 5px;
+            cursor: pointer;
+            line-height: 1.3;
+        }
+        .chat-reasoning.collapsed .reasoning-text {
+            display: none;
+        }
+        .chat-reasoning.collapsed::after {
+            content: ' (click to expand)';
+            color: #777;
+        }
+        .chat-group-divider {
+            display: flex;
+            align-items: center;
+            gap: 8px;
+            width: 100%;
+            margin: 8px 0 4px 0;
+            position: relative;
+            cursor: pointer;
+            user-select: none;
+        }
+        .chat-group-divider::before,
+        .chat-group-divider::after {
+            content: "";
+            flex: 1 1 auto;
+            height: 2px;
+            background: linear-gradient(90deg, rgba(224,230,235,0), var(--accent-muted-2) 30%, var(--accent-muted-2) 70%, rgba(224,230,235,0));
+        }
+        .chat-group-label {
+            display: inline-block;
+            background: white;
+            padding: 2px 12px;
+            border-radius: 999px;
+            font-size: var(--font-size);
+            font-weight: 700;
+            color: var(--muted-text);
+            border: 1.5px solid var(--accent-muted);
+            box-shadow: 0 1px 3px rgba(0, 0, 0, 0.08);
+            line-height: 1.4;
+            position: relative;
+            transition: background 0.2s ease;
+        }
+        .chat-group-divider:hover .chat-group-label {
+            background: var(--panel-bg);
+        }
+        .chat-group-label::before {
+            content: '▼ ';
+            font-size: 0.8em;
+            display: inline-block;
+            transition: transform 0.2s ease;
+            opacity: 0;
+        }
+        .chat-group-divider:hover .chat-group-label::before {
+            opacity: 1;
+        }
+        .chat-group-divider.collapsed .chat-group-label::before {
+            content: '▶ ';
+            opacity: 1;
+        }
+        .chat-group-divider.collapsed + * {
+            display: none !important;
+        }
+        /* Hide collapsed rounds in strong hide mode */
+        .strong-hide .chat-group-divider.collapsed {
+            display: none !important;
+        }
+        /* Chat view width control */
+        #flow-chat {
+            --chat-width: 900px;
+            max-width: var(--chat-width);
+            margin: 0 auto;
+        }
+        /* Hide user messages when toggle is on */
+        #flow-chat.hide-user-messages .chat-message.role-user {
+            display: none;
+        }
+        /* Hide rewards when hiding user messages */
+        #flow-chat.hide-user-messages .chat-reward {
+            display: none;
+        }
+        /* Round context annotations */
+        .round-context {
+            text-align: center;
+            margin: 4px auto;
+            max-width: 100%;
+        }
+        .round-context-edit {
+            min-height: 20px;
+            padding: 5px 10px;
+            border: 1.5px dashed var(--accent-muted);
+            border-radius: 6px;
+            background: #fafafa;
+            cursor: text;
+            transition: all 0.2s ease;
+            outline: none;
+            font-size: var(--font-size);
+            line-height: 1.3;
+            user-select: text;
+            -webkit-user-select: text;
+            -moz-user-select: text;
+            -ms-user-select: text;
+        }
+        .round-context-edit:focus {
+            border-style: solid;
+            border-color: var(--accent-muted-2);
+            background: #ffffff;
+            box-shadow: 0 2px 8px rgba(0, 0, 0, 0.1);
+        }
+        .round-context-edit:empty:before {
+            content: attr(data-placeholder);
+            color: #999;
+            font-style: italic;
+        }
+        .round-context-controls {
+            display: none;
+            justify-content: center;
+            gap: 4px;
+            margin-top: 4px;
+            flex-wrap: wrap;
+        }
+        .round-context-edit:focus + .round-context-controls,
+        .round-context-controls:hover,
+        .round-context:focus-within .round-context-controls {
+            display: flex;
+        }
+        .context-color-btn {
+            width: 22px;
+            height: 22px;
+            border-radius: 50%;
+            border: 1.5px solid #fff;
+            box-shadow: 0 1px 2px rgba(0, 0, 0, 0.15);
+            cursor: pointer;
+            transition: transform 0.1s ease;
+        }
+        .context-color-btn:hover {
+            transform: scale(1.15);
+        }
+        .context-color-btn:active {
+            transform: scale(0.95);
+        }
+        /* Split agent context boxes */
+        .split-agent-context {
+            display: flex;
+            gap: 6px;
+            margin: 4px auto;
+            max-width: 100%;
+            align-items: flex-start;
+        }
+        .agent-context-box {
+            flex: 1;
+            min-width: 0;
+            position: relative;
+        }
+        .agent-context-box .round-context-edit {
+            margin: 0;
+            border-radius: 6px;
+            padding: 4px 8px;
+            min-height: 18px;
+        }
+        .agent-context-box.agent-0 .round-context-edit {
+            border-color: var(--agent-0-border);
+            background: rgba(14, 178, 36, 0.03);
+        }
+        .agent-context-box.agent-1 .round-context-edit {
+            border-color: var(--agent-1-border);
+            background: rgba(239, 131, 35, 0.03);
+        }
+        .agent-context-box.agent-0 .round-context-edit:focus {
+            border-color: var(--agent-0-border);
+            box-shadow: 0 2px 8px rgba(14, 178, 36, 0.2);
+            background: rgba(14, 178, 36, 0.05);
+        }
+        .agent-context-box.agent-1 .round-context-edit:focus {
+            border-color: var(--agent-1-border);
+            box-shadow: 0 2px 8px rgba(239, 131, 35, 0.2);
+            background: rgba(239, 131, 35, 0.05);
+        }
+        .agent-context-box .round-context-edit::before {
+            font-weight: 700;
+            font-size: var(--font-size);
+            margin-right: 5px;
+            letter-spacing: 0.2px;
+        }
+        .agent-context-box.agent-0 .round-context-edit::before {
+            content: 'Agent 0 Prompt Summary:';
+            color: var(--agent-0-border);
+        }
+        .agent-context-box.agent-1 .round-context-edit::before {
+            content: 'Agent 1 Prompt Summary:';
+            color: var(--agent-1-border);
+        }
+        /* Empty context boxes will be hidden by JavaScript when strong hide is enabled */
+        .toolbar {
+            display: flex;
+            align-items: center;
+            gap: 8px;
+            margin-bottom: 0;
+            font-size: var(--font-size);
+            max-height: 0;
+            overflow: hidden;
+            opacity: 0;
+            pointer-events: none;
+            transition: max-height 0.2s ease, opacity 0.2s ease;
+            flex-wrap: wrap;
+        }
+        .toolbar-wrap { position: sticky; top: 0; z-index: 10; background: var(--bg); }
+        .toolbar-hotzone { height: 6px; }
+        .toolbar-wrap:hover .toolbar { max-height: 500px; opacity: 1; pointer-events: auto; margin-bottom: 12px; }
+        .toolbar * { pointer-events: auto !important; }
+        .toolbar input,
+        .toolbar select { z-index: 100 !important; position: relative; }
+        .toolbar input[type="number"],
+        .toolbar input[type="text"],
+        .toolbar select {
+            width: 72px;
+            padding: 2px 6px;
+            border: 1px solid var(--accent-muted);
+            border-radius: var(--corner-radius);
+            background: var(--bg);
+            user-select: text !important;
+            -webkit-user-select: text !important;
+            -moz-user-select: text !important;
+            -ms-user-select: text !important;
+            pointer-events: auto !important;
+            cursor: pointer !important;
+        }
+        .toolbar input[type="text"] {
+            cursor: text !important;
+        }
+        .toolbar input[type="text"]:focus,
+        .toolbar input[type="number"]:focus,
+        .toolbar select:focus {
+            outline: 2px solid #0066cc;
+            outline-offset: 1px;
+        }
+        .toolbar button {
+            padding: 4px 8px;
+            border: 1px solid var(--accent-muted);
+            background: var(--panel-bg);
+            border-radius: var(--corner-radius);
+            cursor: pointer;
+        }
+        .emoji-bw { filter: grayscale(100%); opacity: 0.95; font-size: var(--font-size); vertical-align: baseline; margin: 0; position: relative; top: -1px; line-height: 1; display: inline-block; }
+    </style>
+    """
+    # HTML structure
+    html_parts = [
+        "<!DOCTYPE html>",
+        "<html>",
+        "<head>",
+        "<meta charset='UTF-8'>",
+        "<title>Chat Turns</title>",
+        css,
+        "<script>\n"
+        "document.addEventListener('DOMContentLoaded', function() {\n"
+        "  const chatFlow = document.getElementById('flow-chat');\n"
+        "  let strongHideOn = false;\n"
+        "  let hideUserMessages = false;\n"
+        "  const hideUserBtn = document.getElementById('toggle-hide-user-messages');\n"
+        "  const hideUserStateEl = document.getElementById('hide-user-state');\n"
+        "  const widthControl = document.getElementById('chat-width-control');\n"
+        "  const widthSlider = document.getElementById('chat-width-slider');\n"
+        "  const widthValue = document.getElementById('chat-width-value');\n"
+        "  const strongHideBtn = document.getElementById('toggle-strong-hide');\n"
+        "  const strongHideStateEl = document.getElementById('strong-hide-state');\n"
+        "  if (strongHideBtn) {\n"
+        "    const setLabel = () => { if (strongHideStateEl) { strongHideStateEl.textContent = strongHideOn ? 'On' : 'Off'; } };\n"
+        "    strongHideBtn.addEventListener('click', () => { strongHideOn = !strongHideOn; chatFlow.classList.toggle('strong-hide', strongHideOn); setLabel(); applyStrongHideToChat(); });\n"
+        "    setLabel();\n"
+        "  }\n"
+        "  if (hideUserBtn && hideUserStateEl && chatFlow) {\n"
+        "    const updateHideUser = () => { hideUserStateEl.textContent = hideUserMessages ? 'On' : 'Off'; };\n"
+        "    hideUserBtn.addEventListener('click', () => {\n"
+        "      hideUserMessages = !hideUserMessages;\n"
+        "      chatFlow.classList.toggle('hide-user-messages', hideUserMessages);\n"
+        "      updateHideUser();\n"
+        "    });\n"
+        "    updateHideUser();\n"
+        "  }\n"
+        "  if (widthSlider && widthValue && chatFlow) {\n"
+        "    const savedWidth = localStorage.getItem('chat-view-width');\n"
+        "    if (savedWidth) {\n"
+        "      widthSlider.value = savedWidth;\n"
+        "      chatFlow.style.setProperty('--chat-width', savedWidth + 'px');\n"
+        "      widthValue.textContent = savedWidth + 'px';\n"
+        "    }\n"
+        "    widthSlider.addEventListener('input', (e) => {\n"
+        "      const width = e.target.value;\n"
+        "      chatFlow.style.setProperty('--chat-width', width + 'px');\n"
+        "      widthValue.textContent = width + 'px';\n"
+        "      localStorage.setItem('chat-view-width', width);\n"
+        "    });\n"
+        "  }\n"
+        "  const fontFamilySelect = document.getElementById('font-family-select');\n"
+        "  const fontSizeInput = document.getElementById('font-size-input');\n"
+        "  if (fontFamilySelect) {\n"
+        "    const savedFont = localStorage.getItem('render-font-family');\n"
+        "    if (savedFont) {\n"
+        "      fontFamilySelect.value = savedFont;\n"
+        "      document.body.style.setProperty('--font-family', savedFont);\n"
+        "    }\n"
+        "    fontFamilySelect.addEventListener('change', (e) => {\n"
+        "      const font = e.target.value;\n"
+        "      document.body.style.setProperty('--font-family', font);\n"
+        "      localStorage.setItem('render-font-family', font);\n"
+        "    });\n"
+        "  }\n"
+        "  if (fontSizeInput) {\n"
+        "    const savedSize = localStorage.getItem('render-font-size');\n"
+        "    if (savedSize) {\n"
+        "      fontSizeInput.value = savedSize;\n"
+        "      document.body.style.setProperty('--font-size', savedSize + 'px');\n"
+        "    }\n"
+        "    fontSizeInput.addEventListener('input', (e) => {\n"
+        "      const size = e.target.value;\n"
+        "      document.body.style.setProperty('--font-size', size + 'px');\n"
+        "      localStorage.setItem('render-font-size', size);\n"
+        "    });\n"
+        "  }\n"
+        "  const agent0EmojiInput = document.getElementById('agent0-emoji-input');\n"
+        "  const agent0NameInput = document.getElementById('agent0-name-input');\n"
+        "  const agent1EmojiInput = document.getElementById('agent1-emoji-input');\n"
+        "  const agent1NameInput = document.getElementById('agent1-name-input');\n"
+        "  const applyAgentNamesBtn = document.getElementById('apply-agent-names');\n"
+        "  // Restore the saved agent emojis and names (falling back to the robot emoji\n"
+        "  // and to each name input's placeholder), show them in the toolbar inputs and\n"
+        "  // push them to the rendered page.\n"
+        "  function loadAgentNames() {\n"
+        "    const requiredInputs = [agent0EmojiInput, agent0NameInput, agent1EmojiInput, agent1NameInput];\n"
+        "    if (!requiredInputs.every(Boolean)) { return; }\n"
+        "    const stored = (key, fallback) => localStorage.getItem(key) || fallback;\n"
+        "    const emoji0 = stored('agent0-emoji', '🤖');\n"
+        "    const name0 = stored('agent0-name', agent0NameInput.placeholder);\n"
+        "    const emoji1 = stored('agent1-emoji', '🤖');\n"
+        "    const name1 = stored('agent1-name', agent1NameInput.placeholder);\n"
+        "    agent0EmojiInput.value = emoji0;\n"
+        "    agent0NameInput.value = name0;\n"
+        "    agent1EmojiInput.value = emoji1;\n"
+        "    agent1NameInput.value = name1;\n"
+        "    applyAgentNamesToDOM(emoji0, name0, emoji1, name1);\n"
+        "  }\n"
+        "  // Apply the chosen agent names and emojis to the rendered page.\n"
+        "  // Every element carrying data-agent-index 0 or 1 is updated: .agent-name\n"
+        "  // elements receive the name, .emoji-bw elements receive the emoji (only while\n"
+        "  // they still show one of the two default emojis). The 'Prompt Summary:' labels\n"
+        "  // drawn by ::before are regenerated through a dynamic style element.\n"
+        "  function applyAgentNamesToDOM(agent0Emoji, agent0Name, agent1Emoji, agent1Name) {\n"
+        "    const agentMap = { '0': { name: agent0Name, emoji: agent0Emoji }, '1': { name: agent1Name, emoji: agent1Emoji } };\n"
+        "    document.querySelectorAll('[data-agent-index]').forEach(el => {\n"
+        "      const agentIndex = el.getAttribute('data-agent-index');\n"
+        "      if (!agentMap[agentIndex]) return;\n"
+        "      if (el.classList.contains('agent-name')) {\n"
+        "        el.textContent = agentMap[agentIndex].name;\n"
+        "      } else if (el.classList.contains('emoji-bw')) {\n"
+        "        const currentEmoji = el.textContent.trim();\n"
+        "        if (currentEmoji === '🤖' || currentEmoji === '👤') {\n"
+        "          el.textContent = agentMap[agentIndex].emoji;\n"
+        "        }\n"
+        "      }\n"
+        "    });\n"
+        "    // Names are free-form user input (also persisted in localStorage), so they are\n"
+        "    // emitted as quoted, escaped CSS strings. JSON.stringify escapes quotes and\n"
+        "    // backslashes, so a name such as O'Brien can no longer terminate the string\n"
+        "    // early and break (or inject into) the generated rule.\n"
+        "    const cssLabel = (name) => JSON.stringify(String(name) + ' Prompt Summary:');\n"
+        "    // Replace the style element from a previous call instead of stacking rules.\n"
+        "    const existingStyle = document.getElementById('dynamic-agent-names-style');\n"
+        "    if (existingStyle) existingStyle.remove();\n"
+        "    const style = document.createElement('style');\n"
+        "    style.id = 'dynamic-agent-names-style';\n"
+        "    style.textContent = `\n"
+        "      .agent-context-box.agent-0 .round-context-edit::before {\n"
+        "        content: ${cssLabel(agent0Name)};\n"
+        "      }\n"
+        "      .agent-context-box.agent-1 .round-context-edit::before {\n"
+        "        content: ${cssLabel(agent1Name)};\n"
+        "      }\n"
+        "    `;\n"
+        "    document.head.appendChild(style);\n"
+        "  }\n"
+        "  if (applyAgentNamesBtn && agent0EmojiInput && agent0NameInput && agent1EmojiInput && agent1NameInput) {\n"
+        "    [agent0EmojiInput, agent0NameInput, agent1EmojiInput, agent1NameInput].forEach(input => {\n"
+        "      input.style.pointerEvents = 'auto';\n"
+        "      if (input.tagName === 'INPUT') {\n"
+        "        input.style.userSelect = 'text';\n"
+        "        input.style.webkitUserSelect = 'text';\n"
+        "        input.readOnly = false;\n"
+        "      }\n"
+        "      input.disabled = false;\n"
+        "      const stopAll = (e) => { e.stopPropagation(); e.stopImmediatePropagation(); };\n"
+        "      input.addEventListener('mousedown', stopAll, true);\n"
+        "      input.addEventListener('mouseup', stopAll, true);\n"
+        "      input.addEventListener('click', stopAll, true);\n"
+        "      input.addEventListener('dblclick', stopAll, true);\n"
+        "      input.addEventListener('focus', stopAll, true);\n"
+        "      input.addEventListener('blur', stopAll, true);\n"
+        "      input.addEventListener('paste', stopAll, true);\n"
+        "      input.addEventListener('cut', stopAll, true);\n"
+        "      input.addEventListener('copy', stopAll, true);\n"
+        "      input.addEventListener('select', stopAll, true);\n"
+        "      input.addEventListener('selectstart', stopAll, true);\n"
+        "      input.addEventListener('keydown', stopAll, true);\n"
+        "      input.addEventListener('keyup', stopAll, true);\n"
+        "      input.addEventListener('keypress', stopAll, true);\n"
+        "      input.addEventListener('input', stopAll, true);\n"
+        "      input.addEventListener('change', stopAll, true);\n"
+        "      input.addEventListener('contextmenu', stopAll, true);\n"
+        "    });\n"
+        "    const applyNames = () => {\n"
+        "      const agent0Emoji = agent0EmojiInput.value || '🤖';\n"
+        "      const agent0Name = agent0NameInput.value.trim() || agent0NameInput.placeholder;\n"
+        "      const agent1Emoji = agent1EmojiInput.value || '🤖';\n"
+        "      const agent1Name = agent1NameInput.value.trim() || agent1NameInput.placeholder;\n"
+        "      localStorage.setItem('agent0-emoji', agent0Emoji);\n"
+        "      localStorage.setItem('agent0-name', agent0Name);\n"
+        "      localStorage.setItem('agent1-emoji', agent1Emoji);\n"
+        "      localStorage.setItem('agent1-name', agent1Name);\n"
+        "      applyAgentNamesToDOM(agent0Emoji, agent0Name, agent1Emoji, agent1Name);\n"
+        "    };\n"
+        "    applyAgentNamesBtn.addEventListener('click', applyNames);\n"
+        "    [agent0NameInput, agent1NameInput].forEach(input => {\n"
+        "      input.addEventListener('keydown', (e) => {\n"
+        "        if (e.key === 'Enter') {\n"
+        "          e.preventDefault();\n"
+        "          e.stopPropagation();\n"
+        "          e.stopImmediatePropagation();\n"
+        "          applyNames();\n"
+        "        }\n"
+        "      }, true);\n"
+        "    });\n"
+        "    [agent0EmojiInput, agent1EmojiInput].forEach(select => {\n"
+        "      select.addEventListener('change', applyNames);\n"
+        "    });\n"
+        "  }\n"
+        "  loadAgentNames();\n"
+        "  // Install a document-level click handler that collapses or expands a round:\n"
+        "  // clicking a group divider toggles its 'collapsed' class and hides or shows\n"
+        "  // every following sibling up to (not including) the next divider. Clicks on\n"
+        "  // form controls, buttons, context editors and the toolbar are ignored.\n"
+        "  function setupRoundCollapse() {\n"
+        "    const isDivider = (el) => el.classList.contains('chat-group-divider') || el.classList.contains('group-divider');\n"
+        "    document.addEventListener('click', function(e) {\n"
+        "      if (e.target.closest('input, textarea, select, button, .round-context-edit, .toolbar')) { return; }\n"
+        "      const divider = e.target.closest('.chat-group-divider, .group-divider');\n"
+        "      if (!divider) return;\n"
+        "      // toggle() reports whether the class is present after the call.\n"
+        "      const isCollapsed = divider.classList.toggle('collapsed');\n"
+        "      for (let el = divider.nextElementSibling; el && !isDivider(el); el = el.nextElementSibling) {\n"
+        "        if (isCollapsed) {\n"
+        "          // Remember the display value once so repeated collapses keep the first one.\n"
+        "          if (!el.dataset.originalDisplay) {\n"
+        "            el.dataset.originalDisplay = el.style.display || getComputedStyle(el).display;\n"
+        "          }\n"
+        "          el.style.display = 'none';\n"
+        "          continue;\n"
+        "        }\n"
+        "        const saved = el.dataset.originalDisplay;\n"
+        "        if (!saved) {\n"
+        "          el.style.display = '';\n"
+        "          continue;\n"
+        "        }\n"
+        "        // NOTE(review): the second step resets the value that was just restored, so\n"
+        "        // the inline display normally ends up cleared - confirm this is intended.\n"
+        "        el.style.display = saved === 'none' ? '' : saved;\n"
+        "        if (saved !== 'none' && el.style.display === saved) {\n"
+        "          el.style.display = '';\n"
+        "        }\n"
+        "        delete el.dataset.originalDisplay;\n"
+        "      }\n"
+        "      e.stopPropagation();\n"
+        "    });\n"
+        "  }\n"
+        "  setupRoundCollapse();\n"
+        "  const strongHideBtnChat = document.getElementById('toggle-strong-hide');\n"
+        "  // Sync the chat view with the strong-hide toggle. Sets or clears the\n"
+        "  // 'strong-hide' class on the chat flow; while the mode is on, the nearest\n"
+        "  // container of every empty context editor is hidden, and a split context whose\n"
+        "  // agent boxes are all empty is hidden as a whole. Turning the mode off clears\n"
+        "  // those inline display overrides again. Does nothing without a chat flow.\n"
+        "  function applyStrongHideToChat() {\n"
+        "    if (!chatFlow) return;\n"
+        "    chatFlow.classList.toggle('strong-hide', strongHideOn);\n"
+        "    const isEmpty = (edit) => edit.textContent.trim() === '';\n"
+        "    chatFlow.querySelectorAll('.round-context-edit').forEach(edit => {\n"
+        "      const parent = edit.closest('.round-context, .agent-context-box, .split-agent-context');\n"
+        "      if (!parent) return;\n"
+        "      parent.style.display = (strongHideOn && isEmpty(edit)) ? 'none' : '';\n"
+        "    });\n"
+        "    chatFlow.querySelectorAll('.split-agent-context').forEach(split => {\n"
+        "      if (!strongHideOn) {\n"
+        "        // The loop above only resets the nearest container of each editor, so a\n"
+        "        // split hidden below would otherwise stay hidden after the mode is\n"
+        "        // switched off.\n"
+        "        split.style.display = '';\n"
+        "        return;\n"
+        "      }\n"
+        "      const boxes = Array.from(split.querySelectorAll('.agent-context-box'));\n"
+        "      const allEmpty = boxes.every(box => {\n"
+        "        const edit = box.querySelector('.round-context-edit');\n"
+        "        return edit && isEmpty(edit);\n"
+        "      });\n"
+        "      if (allEmpty) split.style.display = 'none';\n"
+        "    });\n"
+        "  }\n"
+        "  if (strongHideBtnChat && chatFlow) {\n"
+        "    strongHideBtnChat.addEventListener('click', () => {\n"
+        "      setTimeout(() => applyStrongHideToChat(), 0);\n"
+        "    });\n"
+        "  }\n"
+        "  document.addEventListener('click', function(e) {\n"
+        "    if (e.target.closest('input, textarea, select, .round-context-edit, .toolbar')) { return; }\n"
+        "    const chatReasoning = e.target.closest('.chat-reasoning');\n"
+        "    if (chatReasoning) {\n"
+        "      chatReasoning.classList.toggle('collapsed');\n"
+        "      return;\n"
+        "    }\n"
+        "    const userMessage = e.target.closest('.chat-message.role-user');\n"
+        "    if (userMessage && !e.target.closest('.merge-btn, .unmerge-btn')) {\n"
+        "      userMessage.classList.toggle('collapsed');\n"
+        "    }\n"
+        "  });\n"
+        "  // Colour the current text selection when it lies inside the given element.\n"
+        "  // Returns false when there is no selection, the selection is outside the\n"
+        "  // element or it is empty; returns true once the selection has been rewritten.\n"
+        "  // The 'default' colour replaces the selection with a plain text node; any other\n"
+        "  // colour wraps the selection in a bold, coloured span.\n"
+        "  function applyColorToSelection(color, element) {\n"
+        "    const sel = window.getSelection();\n"
+        "    const range = sel.rangeCount ? sel.getRangeAt(0) : null;\n"
+        "    if (!range || !element.contains(range.commonAncestorContainer)) return false;\n"
+        "    const text = range.toString();\n"
+        "    if (!text) return false;\n"
+        "    if (color === 'default') {\n"
+        "      // Drop the selected markup and put the bare text back in its place.\n"
+        "      const plain = document.createTextNode(text);\n"
+        "      range.deleteContents();\n"
+        "      range.insertNode(plain);\n"
+        "      return true;\n"
+        "    }\n"
+        "    const wrapper = document.createElement('span');\n"
+        "    wrapper.style.color = color;\n"
+        "    wrapper.style.fontWeight = '600';\n"
+        "    try {\n"
+        "      range.surroundContents(wrapper);\n"
+        "    } catch (err) {\n"
+        "      // surroundContents failed for this range; move the contents by hand.\n"
+        "      wrapper.appendChild(range.extractContents());\n"
+        "      range.insertNode(wrapper);\n"
+        "    }\n"
+        "    return true;\n"
+        "  }\n"
+        "  let lastFocusedContextEdit = null;\n"
+        "  document.addEventListener('focusin', function(e) {\n"
+        "    if (e.target.classList.contains('round-context-edit')) {\n"
+        "      lastFocusedContextEdit = e.target;\n"
+        "    }\n"
+        "  });\n"
+        "  document.addEventListener('mousedown', function(e) {\n"
+        "    if (e.target.classList.contains('context-color-btn')) {\n"
+        "      e.preventDefault();\n"
+        "    }\n"
+        "  });\n"
+        "  document.addEventListener('click', function(e) {\n"
+        "    if (e.target.closest('input:not(.round-context-edit), textarea, select') && !e.target.classList.contains('context-color-btn')) { return; }\n"
+        "    if (e.target.classList.contains('context-color-btn')) {\n"
+        "      e.preventDefault();\n"
+        "      const color = e.target.dataset.color;\n"
+        "      const controls = e.target.closest('.round-context-controls');\n"
+        "      const contextEdit = controls ? controls.previousElementSibling : null;\n"
+        "      if (contextEdit && contextEdit.classList.contains('round-context-edit')) {\n"
+        "        contextEdit.focus();\n"
+        "        const selection = window.getSelection();\n"
+        "        if (selection.rangeCount > 0 && selection.toString().length > 0 && contextEdit.contains(selection.anchorNode)) {\n"
+        "          if (applyColorToSelection(color, contextEdit)) {\n"
+        "            const key = contextEdit.dataset.contextKey;\n"
+        "            localStorage.setItem(key, contextEdit.innerHTML);\n"
+        "          }\n"
+        "        } else {\n"
+        "          try {\n"
+        "            if (color !== 'default') {\n"
+        "              document.execCommand('styleWithCSS', false, true);\n"
+        "              document.execCommand('foreColor', false, color);\n"
+        "            }\n"
+        "            const key = contextEdit.dataset.contextKey;\n"
+        "            setTimeout(() => localStorage.setItem(key, contextEdit.innerHTML), 10);\n"
+        "          } catch (e) {\n"
+        "            console.log('Color command failed:', e);\n"
+        "          }\n"
+        "        }\n"
+        "      }\n"
+        "    }\n"
+        "  });\n"
+        "  const contextEdits = document.querySelectorAll('.round-context-edit');\n"
+        "  contextEdits.forEach(edit => {\n"
+        "    edit.addEventListener('input', function() {\n"
+        "      const key = this.dataset.contextKey;\n"
+        "      localStorage.setItem(key, this.innerHTML);\n"
+        "    });\n"
+        "    const key = edit.dataset.contextKey;\n"
+        "    const saved = localStorage.getItem(key);\n"
+        "    if (saved) {\n"
+        "      edit.innerHTML = saved;\n"
+        "    }\n"
+        "  });\n"
+        "  document.addEventListener('click', function(e) {\n"
+        "    if (e.target.closest('input, textarea, select, .round-context-edit') && !e.target.classList.contains('merge-btn') && !e.target.classList.contains('unmerge-btn')) { return; }\n"
+        "    if (e.target.classList.contains('merge-btn')) {\n"
+        "      e.preventDefault();\n"
+        "      e.stopPropagation();\n"
+        "      const msgId = e.target.dataset.msgId;\n"
+        "      const currentMsg = e.target.closest('.chat-message');\n"
+        "      if (!currentMsg) return;\n"
+        "      if (currentMsg.classList.contains('role-user')) {\n"
+        "        alert('Cannot merge user messages');\n"
+        "        return;\n"
+        "      }\n"
+        "      let nextMsg = currentMsg.nextElementSibling;\n"
+        "      while (nextMsg && !nextMsg.classList.contains('chat-message')) {\n"
+        "        nextMsg = nextMsg.nextElementSibling;\n"
+        "      }\n"
+        "      while (nextMsg && nextMsg.classList.contains('role-user')) {\n"
+        "        nextMsg = nextMsg.nextElementSibling;\n"
+        "        while (nextMsg && !nextMsg.classList.contains('chat-message')) {\n"
+        "          nextMsg = nextMsg.nextElementSibling;\n"
+        "        }\n"
+        "      }\n"
+        "      if (!nextMsg || nextMsg.classList.contains('chat-message') === false) {\n"
+        "        alert('No next assistant message to merge with');\n"
+        "        return;\n"
+        "      }\n"
+        "      if (nextMsg.classList.contains('role-user')) {\n"
+        "        alert('Cannot merge with user messages');\n"
+        "        return;\n"
+        "      }\n"
+        "      \n"
+        "      // Find the user prompts that precede each assistant message\n"
+        "      let currentPrompt = currentMsg.previousElementSibling;\n"
+        "      while (currentPrompt && !currentPrompt.classList.contains('chat-message')) {\n"
+        "        currentPrompt = currentPrompt.previousElementSibling;\n"
+        "      }\n"
+        "      if (currentPrompt && !currentPrompt.classList.contains('role-user')) {\n"
+        "        currentPrompt = null;\n"
+        "      }\n"
+        "      \n"
+        "      let nextPrompt = nextMsg.previousElementSibling;\n"
+        "      while (nextPrompt && !nextPrompt.classList.contains('chat-message')) {\n"
+        "        nextPrompt = nextPrompt.previousElementSibling;\n"
+        "      }\n"
+        "      if (nextPrompt && !nextPrompt.classList.contains('role-user')) {\n"
+        "        nextPrompt = null;\n"
+        "      }\n"
+        "      \n"
+        "      // Find the split-agent-context that precedes the first prompt or assistant message\n"
+        "      let splitContext = null;\n"
+        "      let searchStart = currentPrompt || currentMsg;\n"
+        "      let elem = searchStart.previousElementSibling;\n"
+        "      while (elem) {\n"
+        "        if (elem.classList.contains('split-agent-context')) {\n"
+        "          splitContext = elem;\n"
+        "          break;\n"
+        "        }\n"
+        "        if (elem.classList.contains('chat-message') || elem.classList.contains('chat-group-divider')) {\n"
+        "          break;\n"
+        "        }\n"
+        "        elem = elem.previousElementSibling;\n"
+        "      }\n"
+        "      \n"
+        "      const parent = currentMsg.parentElement;\n"
+        "      if (parent.classList.contains('simultaneous-messages')) {\n"
+        "        const wrapper = parent;\n"
+        "        currentMsg.style.display = '';\n"
+        "        currentMsg.classList.remove('merged');\n"
+        "        const refNode = wrapper.nextElementSibling;\n"
+        "        parent.parentElement.insertBefore(currentMsg, refNode);\n"
+        "        if (nextMsg.parentElement === wrapper) {\n"
+        "          parent.parentElement.insertBefore(nextMsg, refNode);\n"
+        "        }\n"
+        "        if (wrapper.children.length === 0) {\n"
+        "          wrapper.remove();\n"
+        "        }\n"
+        "      } else {\n"
+        "        // If split-agent-context exists, wrap it\n"
+        "        if (splitContext && !splitContext.classList.contains('merged')) {\n"
+        "          const splitWrapper = document.createElement('div');\n"
+        "          splitWrapper.className = 'simultaneous-messages';\n"
+        "          const splitUnmergeBtn = document.createElement('button');\n"
+        "          splitUnmergeBtn.className = 'unmerge-btn';\n"
+        "          splitUnmergeBtn.innerHTML = '✕';\n"
+        "          splitUnmergeBtn.title = 'Click to unmerge messages';\n"
+        "          splitWrapper.appendChild(splitUnmergeBtn);\n"
+        "          splitWrapper.dataset.isSplitContext = 'true';\n"
+        "          parent.insertBefore(splitWrapper, splitContext);\n"
+        "          splitWrapper.appendChild(splitContext);\n"
+        "          splitContext.classList.add('merged');\n"
+        "        }\n"
+        "        \n"
+        "        // Create wrapper for prompts if both exist\n"
+        "        if (currentPrompt && nextPrompt) {\n"
+        "          const promptWrapper = document.createElement('div');\n"
+        "          promptWrapper.className = 'simultaneous-messages';\n"
+        "          const promptUnmergeBtn = document.createElement('button');\n"
+        "          promptUnmergeBtn.className = 'unmerge-btn';\n"
+        "          promptUnmergeBtn.innerHTML = '✕';\n"
+        "          promptUnmergeBtn.title = 'Click to unmerge messages';\n"
+        "          promptWrapper.appendChild(promptUnmergeBtn);\n"
+        "          promptWrapper.dataset.firstMsgId = currentPrompt.dataset.msgId;\n"
+        "          promptWrapper.dataset.secondMsgId = nextPrompt.dataset.msgId;\n"
+        "          \n"
+        "          // Determine order: agent-0 first, agent-1 second\n"
+        "          const firstPrompt = currentPrompt.classList.contains('agent-0') ? currentPrompt : nextPrompt;\n"
+        "          const secondPrompt = currentPrompt.classList.contains('agent-0') ? nextPrompt : currentPrompt;\n"
+        "          \n"
+        "          parent.insertBefore(promptWrapper, currentPrompt);\n"
+        "          promptWrapper.appendChild(firstPrompt);\n"
+        "          promptWrapper.appendChild(secondPrompt);\n"
+        "          currentPrompt.classList.add('merged');\n"
+        "          nextPrompt.classList.add('merged');\n"
+        "        }\n"
+        "        \n"
+        "        // Create wrapper for assistant messages\n"
+        "        const wrapper = document.createElement('div');\n"
+        "        wrapper.className = 'simultaneous-messages';\n"
+        "        const unmergeBtn = document.createElement('button');\n"
+        "        unmergeBtn.className = 'unmerge-btn';\n"
+        "        unmergeBtn.innerHTML = '✕';\n"
+        "        unmergeBtn.title = 'Click to unmerge messages';\n"
+        "        wrapper.appendChild(unmergeBtn);\n"
+        "        wrapper.dataset.firstMsgId = currentMsg.dataset.msgId;\n"
+        "        wrapper.dataset.secondMsgId = nextMsg.dataset.msgId;\n"
+        "        \n"
+        "        // Determine order: agent-0 first, agent-1 second\n"
+        "        const firstAssistant = currentMsg.classList.contains('agent-0') ? currentMsg : nextMsg;\n"
+        "        const secondAssistant = currentMsg.classList.contains('agent-0') ? nextMsg : currentMsg;\n"
+        "        \n"
+        "        parent.insertBefore(wrapper, currentMsg);\n"
+        "        wrapper.appendChild(firstAssistant);\n"
+        "        wrapper.appendChild(secondAssistant);\n"
+        "        currentMsg.classList.add('merged');\n"
+        "        nextMsg.classList.add('merged');\n"
+        "      }\n"
+        "    }\n"
+        "    if (e.target.classList.contains('unmerge-btn')) {\n"
+        "      const wrapper = e.target.closest('.simultaneous-messages');\n"
+        "      if (!wrapper) return;\n"
+        "      const parent = wrapper.parentElement;\n"
+        "      \n"
+        "      // Check if this is a split-context wrapper\n"
+        "      if (wrapper.dataset.isSplitContext === 'true') {\n"
+        "        const splitContext = wrapper.querySelector('.split-agent-context');\n"
+        "        if (splitContext) {\n"
+        "          splitContext.classList.remove('merged');\n"
+        "          parent.insertBefore(splitContext, wrapper.nextElementSibling);\n"
+        "        }\n"
+        "        wrapper.remove();\n"
+        "        return;\n"
+        "      }\n"
+        "      \n"
+        "      const firstMsgId = wrapper.dataset.firstMsgId;\n"
+        "      const secondMsgId = wrapper.dataset.secondMsgId;\n"
+        "      const messages = Array.from(wrapper.querySelectorAll('.chat-message'));\n"
+        "      const refNode = wrapper.nextElementSibling;\n"
+        "      const firstMsg = messages.find(m => m.dataset.msgId === firstMsgId);\n"
+        "      const secondMsg = messages.find(m => m.dataset.msgId === secondMsgId);\n"
+        "      \n"
+        "      // Check for preceding wrappers to also unmerge (prompts and split-context)\n"
+        "      let currentElem = wrapper.previousElementSibling;\n"
+        "      const wrappersToUnmerge = [];\n"
+        "      \n"
+        "      while (currentElem) {\n"
+        "        if (currentElem.classList.contains('simultaneous-messages')) {\n"
+        "          wrappersToUnmerge.push(currentElem);\n"
+        "        } else if (currentElem.classList.contains('chat-message') || currentElem.classList.contains('chat-group-divider')) {\n"
+        "          break;\n"
+        "        }\n"
+        "        currentElem = currentElem.previousElementSibling;\n"
+        "      }\n"
+        "      \n"
+        "      // Unmerge preceding wrappers\n"
+        "      for (const prevWrapper of wrappersToUnmerge) {\n"
+        "        if (prevWrapper.dataset.isSplitContext === 'true') {\n"
+        "          const splitContext = prevWrapper.querySelector('.split-agent-context');\n"
+        "          if (splitContext) {\n"
+        "            splitContext.classList.remove('merged');\n"
+        "            parent.insertBefore(splitContext, prevWrapper.nextElementSibling);\n"
+        "          }\n"
+        "          prevWrapper.remove();\n"
+        "        } else {\n"
+        "          const prevMessages = Array.from(prevWrapper.querySelectorAll('.chat-message'));\n"
+        "          const prevFirstMsgId = prevWrapper.dataset.firstMsgId;\n"
+        "          const prevSecondMsgId = prevWrapper.dataset.secondMsgId;\n"
+        "          const prevFirstMsg = prevMessages.find(m => m.dataset.msgId === prevFirstMsgId);\n"
+        "          const prevSecondMsg = prevMessages.find(m => m.dataset.msgId === prevSecondMsgId);\n"
+        "          const prevRefNode = prevWrapper.nextElementSibling;\n"
+        "          \n"
+        "          if (prevFirstMsg) {\n"
+        "            prevFirstMsg.classList.remove('merged');\n"
+        "            prevFirstMsg.style.display = '';\n"
+        "            parent.insertBefore(prevFirstMsg, prevRefNode);\n"
+        "          }\n"
+        "          if (prevSecondMsg) {\n"
+        "            prevSecondMsg.classList.remove('merged');\n"
+        "            prevSecondMsg.style.display = '';\n"
+        "            parent.insertBefore(prevSecondMsg, prevRefNode);\n"
+        "          }\n"
+        "          prevWrapper.remove();\n"
+        "        }\n"
+        "      }\n"
+        "      \n"
+        "      // Unmerge the main assistant messages\n"
+        "      if (firstMsg) {\n"
+        "        firstMsg.classList.remove('merged');\n"
+        "        firstMsg.style.display = '';\n"
+        "        parent.insertBefore(firstMsg, refNode);\n"
+        "      }\n"
+        "      if (secondMsg) {\n"
+        "        secondMsg.classList.remove('merged');\n"
+        "        secondMsg.style.display = '';\n"
+        "        parent.insertBefore(secondMsg, refNode);\n"
+        "      }\n"
+        "      wrapper.remove();\n"
+        "    }\n"
+        "  });\n"
+        "});\n"
+        "</script>",
+        "</head>",
+        "<body>",
+        '<div class="toolbar-wrap">',
+        '<div class="toolbar-hotzone"></div>',
+        '<div class="toolbar">',
+        '<button id="toggle-strong-hide"><span class="emoji-bw">🗜️</span> Strong Hide: <span id="strong-hide-state">Off</span></button>',
+        '<button id="toggle-hide-user-messages"><span class="emoji-bw">👁️</span> Hide Prompts: <span id="hide-user-state">Off</span></button>',
+        '<span id="chat-width-control" style="margin-left:8px;">',
+        '<label for="chat-width-slider"><span class="emoji-bw">↔️</span> Width:</label>',
+        '<input id="chat-width-slider" type="range" min="600" max="1600" step="50" value="900" style="width:120px; vertical-align:middle;" />',
+        '<span id="chat-width-value" style="margin-left:4px;">900px</span>',
+        "</span>",
+        '<span style="margin-left:12px;">',
+        '<label for="font-family-select"><span class="emoji-bw">🔤</span> Font:</label>',
+        '<select id="font-family-select" style="padding:2px 6px; border:1px solid var(--accent-muted); border-radius:var(--corner-radius); background:var(--bg);">',
+        "<option value=\"'Segoe UI', Tahoma, Geneva, Verdana, sans-serif\">Segoe UI</option>",
+        '<option value="Arial, sans-serif">Arial</option>',
+        "<option value=\"'Helvetica Neue', Helvetica, sans-serif\">Helvetica</option>",
+        "<option value=\"'Times New Roman', Times, serif\">Times New Roman</option>",
+        '<option value="Georgia, serif">Georgia</option>',
+        "<option value=\"'Courier New', Courier, monospace\">Courier New</option>",
+        "<option value=\"'Comic Sans MS', cursive\">Comic Sans</option>",
+        "<option value=\"'Trebuchet MS', sans-serif\">Trebuchet MS</option>",
+        '<option value="Verdana, sans-serif">Verdana</option>',
+        "<option value=\"'Palatino Linotype', 'Book Antiqua', Palatino, serif\">Palatino</option>",
+        "<option value=\"'Lucida Console', Monaco, monospace\">Lucida Console</option>",
+        "</select>",
+        "</span>",
+        '<span style="margin-left:8px;">',
+        '<label for="font-size-input"><span class="emoji-bw">📏</span> Size:</label>',
+        '<input id="font-size-input" type="number" min="8" max="24" step="1" value="14" style="width:50px;" />',
+        "<span>px</span>",
+        "</span>",
+        '<span style="margin-left:12px; display:flex; align-items:center; gap:8px;">',
+        '<label style="font-weight:600;">Agent Names:</label>',
+        f'<select id="agent0-emoji-input" style="width:65px; padding:2px 6px; border:1px solid var(--accent-muted); border-radius:var(--corner-radius); background:var(--bg);">',
+        '<option value="🤖">🤖 Robot</option>',
+        '<option value="👤">👤 Human</option>',
+        "</select>",
+        f'<input id="agent0-name-input" type="text" placeholder="{html.escape(unique_agent_ids[0]) if len(unique_agent_ids) > 0 else "Agent 0"}" style="width:80px; padding:2px 6px; border:1px solid var(--accent-muted); border-radius:var(--corner-radius); background:var(--bg);" />',
+        '<span style="margin:0 4px;">|</span>',
+        f'<select id="agent1-emoji-input" style="width:65px; padding:2px 6px; border:1px solid var(--accent-muted); border-radius:var(--corner-radius); background:var(--bg);">',
+        '<option value="🤖">🤖 Robot</option>',
+        '<option value="👤">👤 Human</option>',
+        "</select>",
+        f'<input id="agent1-name-input" type="text" placeholder="{html.escape(unique_agent_ids[1]) if len(unique_agent_ids) > 1 else "Agent 1"}" style="width:80px; padding:2px 6px; border:1px solid var(--accent-muted); border-radius:var(--corner-radius); background:var(--bg);" />',
+        '<button id="apply-agent-names" style="padding:4px 8px; border:1px solid var(--accent-muted); background:var(--panel-bg); border-radius:var(--corner-radius); cursor:pointer;">Apply</button>',
+        "</span>",
+        "</div>",
+        "</div>",
+    ]
+    # Add Chat View
+    import html as _html_mod
+    html_parts.append('<div id="flow-chat" class="messages-flow">')
+    # Helper function to add context annotation areas
+    def add_context_area(position: str, time_step: int):
+        context_key = f"round-context-{position}-{time_step}"
+        placeholder = f"Add context {position} round {time_step}..."
+        color_buttons = ""
+        # Add default/reset color button first
+        color_buttons += (
+            f'<div class="context-color-btn" data-color="default" '
+            f'style="background: linear-gradient(135deg, #000 25%, transparent 25%, transparent 75%, #000 75%), '
+            f"linear-gradient(135deg, #000 25%, transparent 25%, transparent 75%, #000 75%); "
+            f"background-size: 4px 4px; background-position: 0 0, 2px 2px; "
+            f'background-color: #fff;" title="Default color"></div>'
+        )
+        for color_name, color_value in [
+            ("red", "#d32f2f"),
+            ("orange", "#f57c00"),
+            ("yellow", "#f9a825"),
+            ("green", "#388e3c"),
+            ("blue", "#1976d2"),
+            ("purple", "#7b1fa2"),
+            ("gray", "#666666"),
+        ]:
+            color_buttons += (
+                f'<div class="context-color-btn" data-color="{color_value}" '
+                f'style="background-color: {color_value};" title="{color_name}"></div>'
+            )
+        html_parts.append(
+            f'<div class="round-context">'
+            f'<div class="round-context-edit" contenteditable="true" spellcheck="true" '
+            f'data-context-key="{context_key}" '
+            f'data-placeholder="{placeholder}"></div>'
+            f'<div class="round-context-controls">{color_buttons}</div>'
+            f"</div>"
+        )
+    # Helper function to add split agent context boxes
+    def add_split_agent_contexts(position: str, time_step: int):
+        color_buttons = ""
+        # Add default/reset color button first
+        color_buttons += (
+            f'<div class="context-color-btn" data-color="default" '
+            f'style="background: linear-gradient(135deg, #000 25%, transparent 25%, transparent 75%, #000 75%), '
+            f"linear-gradient(135deg, #000 25%, transparent 25%, transparent 75%, #000 75%); "
+            f"background-size: 4px 4px; background-position: 0 0, 2px 2px; "
+            f'background-color: #fff;" title="Default color"></div>'
+        )
+        for color_name, color_value in [
+            ("red", "#d32f2f"),
+            ("orange", "#f57c00"),
+            ("yellow", "#f9a825"),
+            ("green", "#388e3c"),
+            ("blue", "#1976d2"),
+            ("purple", "#7b1fa2"),
+            ("gray", "#666666"),
+        ]:
+            color_buttons += (
+                f'<div class="context-color-btn" data-color="{color_value}" '
+                f'style="background-color: {color_value};" title="{color_name}"></div>'
+            )
+        html_parts.append('<div class="split-agent-context">')
+        # Agent 0 box
+        agent0_key = f"agent-context-0-{position}-{time_step}"
+        agent0_placeholder = f"..."
+        html_parts.append(
+            f'<div class="agent-context-box agent-0">'
+            f'<div class="round-context-edit" contenteditable="true" spellcheck="true" '
+            f'data-context-key="{agent0_key}" '
+            f'data-placeholder="{agent0_placeholder}"></div>'
+            f'<div class="round-context-controls">{color_buttons}</div>'
+            f"</div>"
+        )
+        # Agent 1 box
+        agent1_key = f"agent-context-1-{position}-{time_step}"
+        agent1_placeholder = f"..."
+        html_parts.append(
+            f'<div class="agent-context-box agent-1">'
+            f'<div class="round-context-edit" contenteditable="true" spellcheck="true" '
+            f'data-context-key="{agent1_key}" '
+            f'data-placeholder="{agent1_placeholder}"></div>'
+            f'<div class="round-context-controls">{color_buttons}</div>'
+            f"</div>"
+        )
+        html_parts.append("</div>")  # split-agent-context
+    last_time_step_chat = None
+    for original_index, turn in indexed_turns:
+        # Use agent index for CSS class (agent-0 or agent-1) instead of agent ID
+        agent_index = agent_id_to_index.get(turn.agent_id, 0)
+        agent_class = f"agent-{agent_index}"
+        role_class = f"role-{turn.role}"
+        # Add time step divider and beginning context
+        if last_time_step_chat is None or turn.time_step != last_time_step_chat:
+            # Add end contexts for previous round (only regular context, not prompt summary)
+            if last_time_step_chat is not None:
+                add_context_area("end", last_time_step_chat)
+            html_parts.append(
+                f'<div class="chat-group-divider">'
+                f'<span class="chat-group-label">⏱ Round {turn.time_step + 1}</span>'
+                f"</div>"
+            )
+            # Add beginning contexts for new round (both context and prompt summary)
+            add_context_area("beginning", turn.time_step)
+            add_split_agent_contexts("beginning", turn.time_step)
+            last_time_step_chat = turn.time_step
+        # Build chat message with merge controls
+        html_parts.append(
+            f'<div class="chat-message {agent_class} {role_class}" data-msg-id="{original_index}">'
+        )
+        # Add merge control button
+        html_parts.append(
+            f'<button class="merge-btn" title="Merge with next message" data-msg-id="{original_index}">⇄</button>'
+        )
+        html_parts.append('<div class="chat-message-content">')
+        # Header with agent name and reward (always show reward)
+        if turn.role == "assistant":
+            name = _html_mod.escape(turn.agent_id)
+            raw_val = turn.reward
+            if isinstance(raw_val, (int, float)):
+                reward_val = f"{raw_val:.4f}".rstrip("0").rstrip(".")
+                if len(reward_val) > 8:
+                    reward_val = reward_val[:8] + "…"
+            else:
+                reward_val = str(raw_val)
+            header_html = (
+                f'<div class="chat-header">'
+                f'<span class="emoji-bw" data-agent-index="{agent_index}">🤖</span> <span class="agent-name" data-agent-index="{agent_index}">{name}</span>'
+                f'<span class="chat-reward">⚑ {reward_val}</span>'
+                f"</div>"
+            )
+        else:
+            name = _html_mod.escape(turn.agent_id)
+            header_html = f'<div class="chat-header">Prompt of <span class="agent-name" data-agent-index="{agent_index}">{name}</span></div>'
+        html_parts.append(header_html)
+        # Reasoning content if present
+        if turn.reasoning_content:
+            _raw_reasoning = turn.reasoning_content.replace("\r\n", "\n")
+            _raw_reasoning = _re.sub(r"^\s*\n+", "", _raw_reasoning)
+            esc_reasoning = _html_mod.escape(_raw_reasoning)
+            html_parts.append(
+                f'<div class="chat-reasoning collapsed">'
+                f'<span class="reasoning-icon">💭</span> '
+                f'<span class="reasoning-text">{esc_reasoning}</span>'
+                f"</div>"
+            )
+        # Message bubble
+        esc_content = _html_mod.escape(turn.content)
+        html_parts.append(f'<div class="chat-bubble">{esc_content}</div>')
+        html_parts.append("</div>")  # chat-message-content
+        html_parts.append("</div>")  # chat-message
+    # Add end contexts for the last round (only regular context, not prompt summary)
+    if last_time_step_chat is not None:
+        add_context_area("end", last_time_step_chat)
+    html_parts.append("</div>")  # flow-chat
+    html_parts.extend(["</body>", "</html>"])
+    return "\n".join(html_parts)
+def export_html_from_rollout_tree(path: Path, outdir: Path, main_only: bool = False):
+    """Process a rollout tree file and generate HTML files for each path.
+    Creates separate HTML files for the main path and each branch path.
+    The main path is saved in the root output directory, while branch paths
+    are saved in a per-tree branches subdirectory of it.
+    Args:
+        path: Path to the rollout tree file (read through load_rollout_tree)
+        outdir: Output directory for HTML files (created when missing)
+        main_only: If True, only export the main trajectory (default: False)
+    """
+    root = load_rollout_tree(path)
+    mgid = root.id
+    main_path, branch_paths = get_rollout_tree_paths(root)
+    outdir.mkdir(parents=True, exist_ok=True)
+    # Branch output is wanted only when not restricted to the main path and
+    # when there is at least one branch to write.
+    export_branches = not main_only and bool(branch_paths)
+    branches_dir = None
+    if export_branches:
+        branches_dir = outdir / f"mgid:{mgid}_branches_html_renders"
+        branches_dir.mkdir(parents=True, exist_ok=True)
+    # Generate HTML for the main path
+    chat_turns = gather_all_chat_turns_for_path(main_path)
+    html_content = html_from_chat_turns(chat_turns)
+    output_file = outdir / f"mgid:{mgid}_main_html_render.render.html"
+    with open(output_file, "w", encoding="utf-8") as f:
+        f.write(html_content)
+    # Stop here for main_only: looping over branches without a branches
+    # directory would reference an unbound name and raise NameError.
+    if not export_branches:
+        return
+    # Generate HTML for each branch path
+    for path_obj in branch_paths:
+        chat_turns = gather_all_chat_turns_for_path(path_obj)
+        html_content = html_from_chat_turns(chat_turns)
+        path_id: str = path_obj.id
+        output_file = branches_dir / f"{path_id}_html_render.render.html"
+        with open(output_file, "w", encoding="utf-8") as f:
+            f.write(html_content)

src_code_for_reproducibility/utils/rollout_tree_gather_utils.py ADDED Viewed

	@@ -0,0 +1,314 @@

+"""
+File: mllm/utils/rollout_tree_gather_utils.py
+Summary: Utilities for gathering rollout tree files and metadata.
+"""
+from __future__ import annotations
+import csv
+import os
+import pickle
+import re
+from collections import defaultdict
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Any, Callable, Dict, Iterator, List, Optional, Tuple
+from mllm.markov_games.rollout_tree import *
+def load_rollout_tree(path: Path) -> RolloutTreeRootNode:
+    """Unpickle the file at ``path`` and validate the payload into a rollout tree root.
+    NOTE(review): pickle.load can execute arbitrary code; only open trusted files.
+    """
+    with open(path, mode="rb") as handle:
+        payload = pickle.load(handle)
+    tree = RolloutTreeRootNode.model_validate(payload)
+    return tree
+@dataclass
+class RolloutNodeList:
+    """One path through a rollout tree: an identifier plus the nodes on that path."""
+    # Path identifier; get_rollout_tree_paths builds it as "mgid:<id>_type:main"
+    # or "mgid:<id>_type:branch_agent:<agent>_time_step:<t>".
+    id: str
+    # Nodes of the path, in the order they were collected from the tree.
+    nodes: List[RolloutTreeNode]
+def get_rollout_tree_paths(
+    root: RolloutTreeRootNode, mgid: Optional[str] = None
+) -> Tuple[RolloutNodeList, List[RolloutNodeList]]:
+    """
+    Split a rollout tree into its main path and its branch paths.
+    Args:
+        root: Root of the rollout tree.
+        mgid: Identifier embedded in the path IDs; str(root.id) is used when it is falsy.
+    Returns:
+        main_path: The nodes reached from the root by always following the main child.
+        branch_paths: One entry per (branch node, agent) pair that has branches. Each entry
+        starts with the main-path nodes seen before the branch node, followed by the nodes
+        collected from every branch head listed for that agent.
+    """
+    def follow_main_line(node) -> List[RolloutTreeNode]:
+        """Collect the nodes reachable from ``node`` by always taking the main child."""
+        if node is None:
+            return []
+        if isinstance(node, RolloutTreeNode):
+            return [node] + follow_main_line(node.child)
+        elif isinstance(node, RolloutTreeBranchNode):
+            # A branch node contributes only its main child to a path.
+            main_child = node.main_child
+            if not main_child:
+                return []
+            return [main_child] + follow_main_line(main_child.child)
+        # Any other node type falls through (implicit None).
+    # Collect the main path nodes
+    main_path_nodes = follow_main_line(root.child)
+    mgid_label = mgid or str(root.id)
+    # Walk the main line once, emitting a branch path at every branch node.
+    branch_paths: List[RolloutNodeList] = []
+    prefix: List[RolloutTreeNode] = []
+    time_step: Optional[int] = 0
+    cursor = root.child
+    while cursor is not None:
+        if isinstance(cursor, RolloutTreeNode):
+            step_node = cursor
+        elif isinstance(cursor, RolloutTreeBranchNode):
+            for agent_id, branch_heads in (cursor.branches or {}).items():
+                if not branch_heads:
+                    continue
+                # Copy the prefix so later appends do not leak into this path.
+                nodes = list(prefix)
+                for head in branch_heads:
+                    nodes.extend(follow_main_line(head))
+                # The time step in the ID is that of the last main-path node before the branch.
+                branch_paths.append(
+                    RolloutNodeList(
+                        id=f"mgid:{mgid_label}_type:branch_agent:{agent_id}_time_step:{time_step}",
+                        nodes=nodes,
+                    )
+                )
+            step_node = cursor.main_child
+            if not step_node:
+                break
+        else:
+            break
+        prefix.append(step_node)
+        time_step = step_node.time_step
+        cursor = step_node.child
+    # Create the main path with proper mgid format
+    main_path = RolloutNodeList(id=f"mgid:{mgid_label}_type:main", nodes=main_path_nodes)
+    return main_path, branch_paths
+class ChatTurnLog(BaseModel):
+    """Flat record of one chat turn, tagged with the node time step and agent it came from."""
+    # Time step of the rollout node the turn was read from.
+    time_step: int
+    # Agent whose action log contained the turn.
+    agent_id: str
+    # Chat role; the gatherers in this file treat "user" specially and pair it with the next turn.
+    role: str
+    content: str
+    # Optional reasoning text; the gatherers read it with getattr(..., None), so it may be missing.
+    reasoning_content: Optional[str] = None
+    is_state_end: bool
+    # Reward of the agent at the node's step; the gatherers default it to 0 when absent.
+    reward: float
+def gather_agent_chat_turns_for_path(
+    agent_id: str, path: RolloutNodeList
+) -> List[ChatTurnLog]:
+    """Collect the chat turns of one agent along a path, in path node order."""
+    collected: List[ChatTurnLog] = []
+    for node in path.nodes:
+        step_log = node.step_log
+        agent_log = step_log.action_logs.get(agent_id, [])
+        if not agent_log:
+            # No action log for this agent at this node.
+            continue
+        collected.extend(
+            ChatTurnLog(
+                time_step=node.time_step,
+                agent_id=agent_id,
+                role=chat_turn.role,
+                content=chat_turn.content,
+                reasoning_content=getattr(chat_turn, "reasoning_content", None),
+                is_state_end=chat_turn.is_state_end,
+                reward=step_log.simulation_step_log.rewards.get(agent_id, 0),
+            )
+            for chat_turn in agent_log.chat_turns or []
+        )
+    return collected
+def gather_all_chat_turns_for_path(path: RolloutNodeList) -> List[ChatTurnLog]:
+    """Collect the chat turns of every agent along a path, interleaved per node.
+    Within a node, each agent's turns are grouped into units: a "user" turn
+    together with the turn that directly follows it, or a single turn when no
+    such partner exists. Units are then emitted round-robin over the agents in
+    sorted agent-id order (A1, B1, A2, B2, ...).
+    """
+    ordered_turns: List[ChatTurnLog] = []
+    for node in path.nodes:
+        action_logs = node.step_log.action_logs
+        agent_ids = sorted(action_logs.keys())
+        units_by_agent: Dict[str, List[List[ChatTurnLog]]] = {}
+        for agent_id in agent_ids:
+            agent_log = action_logs.get(agent_id)
+            units: List[List[ChatTurnLog]] = []
+            # A user turn still waiting for the reply it will be paired with.
+            open_user: Optional[ChatTurnLog] = None
+            for chat_turn in (agent_log.chat_turns or []) if agent_log else []:
+                entry = ChatTurnLog(
+                    time_step=node.time_step,
+                    agent_id=agent_id,
+                    role=chat_turn.role,
+                    content=chat_turn.content,
+                    reasoning_content=getattr(chat_turn, "reasoning_content", None),
+                    is_state_end=chat_turn.is_state_end,
+                    reward=node.step_log.simulation_step_log.rewards.get(agent_id, 0),
+                )
+                if chat_turn.role == "user":
+                    # A new user turn closes any user turn that never got a reply.
+                    if open_user is not None:
+                        units.append([open_user])
+                    open_user = entry
+                elif open_user is not None:
+                    units.append([open_user, entry])
+                    open_user = None
+                else:
+                    # No preceding user or already paired; treat as its own unit
+                    units.append([entry])
+            if open_user is not None:
+                # Unpaired trailing user message
+                units.append([open_user])
+            units_by_agent[agent_id] = units
+        # Interleave units across agents: A1, B1, A2, B2, ...
+        round_count = max((len(units_by_agent[a]) for a in agent_ids), default=0)
+        for round_index in range(round_count):
+            for agent_id in agent_ids:
+                agent_units = units_by_agent[agent_id]
+                if round_index < len(agent_units):
+                    ordered_turns.extend(agent_units[round_index])
+    return ordered_turns
+def chat_turns_to_dict(chat_turns: Iterator[ChatTurnLog]) -> Iterator[Dict[str, Any]]:
+    """Lazily yield the ``model_dump()`` dict of each chat turn, in input order."""
+    yield from (turn.model_dump() for turn in chat_turns)
+def get_all_agents(root: RolloutTreeRootNode) -> List[str]:
+    """Return the sorted agent IDs found in the first node of the tree.
+    Only the first node is inspected (the main child when the tree starts with
+    a branch node); IDs are taken from its action logs and its rewards.
+    """
+    head = root.child
+    if head is None:
+        return []
+    if isinstance(head, RolloutTreeBranchNode):
+        head = head.main_child
+        if head is None:
+            return []
+    # All agents should be present in the first node
+    step_log = head.step_log
+    agent_ids = set(step_log.action_logs.keys()) | set(
+        step_log.simulation_step_log.rewards.keys()
+    )
+    return sorted(agent_ids)
+def gather_agent_main_rewards(agent_id: str, path: RolloutNodeList) -> List[float]:
+    """Return the reward of ``agent_id`` at every node of ``path``, in node order.
+    Raises KeyError when a node's rewards mapping has no entry for the agent.
+    """
+    return [
+        node.step_log.simulation_step_log.rewards[agent_id] for node in path.nodes
+    ]
+def gather_all_rewards(path: RolloutNodeList) -> List[Dict[AgentId, float]]:
+    """Return a shallow copy of the rewards mapping of every node in ``path``, in node order."""
+    return [node.step_log.simulation_step_log.rewards.copy() for node in path.nodes]
+def gather_simulation_stats(
+    path: RolloutNodeList,
+    filter: Callable[[SimulationStepLog], bool],
+    stat_func: Callable[[SimulationStepLog], Any],
+) -> List[Any]:
+    """Apply ``stat_func`` to each simulation step log in ``path`` accepted by ``filter``.
+    Args:
+        path: Path whose nodes are visited in order.
+        filter: Predicate on a simulation step log; logs it rejects are skipped.
+        stat_func: Maps an accepted simulation step log to the value that is collected.
+    Returns:
+        The collected values, in node order.
+    """
+    step_logs = (node.step_log.simulation_step_log for node in path.nodes)
+    return [stat_func(step_log) for step_log in step_logs if filter(step_log)]
+def gather_simulation_step_logs(path: RolloutNodeList) -> List[SimulationStepLog]:
+    """Return the simulation step log of every node in ``path``, in node order."""
+    return [node.step_log.simulation_step_log for node in path.nodes]
+def export_chat_logs(path: Path, outdir: Path):
+    """Dump the chat turns of every path of a pickled rollout tree to one JSONL file.
+    The main path is written first, then each branch path. Every line is a JSON
+    object holding the path's ``path_id`` and its ``chat_turns`` as dicts.
+    """
+    import json
+    root = load_rollout_tree(path)
+    main_path, branch_paths = get_rollout_tree_paths(root)
+    outdir.mkdir(parents=True, exist_ok=True)
+    target = outdir / f"mgid:{root.id}_plucked_chats.render.jsonl"
+    with open(target, "w", encoding="utf-8") as sink:
+        for path_obj in [main_path, *branch_paths]:
+            turns = gather_all_chat_turns_for_path(path_obj)
+            record = {
+                "path_id": str(path_obj.id),
+                "chat_turns": list(chat_turns_to_dict(iter(turns))),
+            }
+            # ensure_ascii=False keeps non-ASCII chat text readable in the file.
+            sink.write(json.dumps(record, ensure_ascii=False) + "\n")

src_code_for_reproducibility/utils/short_id_gen.py ADDED Viewed

	@@ -0,0 +1,16 @@

+"""
+File: mllm/utils/short_id_gen.py
+Summary: Generates short unique identifiers for experiment assets.
+"""
+import uuid
+def generate_short_id() -> int:
+    """
+    Generates a short random ID for tracking adapter versions.
+    The ID is derived from a random UUID4 and mapped evenly onto the 8-digit
+    range. It is random, not guaranteed unique: collisions remain possible.
+    Returns:
+        int: An 8-digit integer ID (10000000 to 99999999 inclusive).
+    """
+    # Slicing the first 8 decimal digits of the UUID integer favours leading
+    # digits 1-3 (the 128-bit range tops out near 3.4e38), which raises the
+    # collision rate. A modulo over the 8-digit span spreads IDs evenly and
+    # always yields exactly 8 digits.
+    return 10_000_000 + uuid.uuid4().int % 90_000_000

src_code_for_reproducibility/utils/stat_pack.py ADDED Viewed

	@@ -0,0 +1,117 @@

+"""
+File: mllm/utils/stat_pack.py
+Summary: Implements the StatPack container for incremental statistics.
+"""
+import csv
+import json
+import os
+import pickle
+from collections import Counter
+from copy import deepcopy
+from locale import strcoll
+from statistics import mean
+from typing import Any, Dict, Iterator, List, Optional, Tuple, TypedDict
+import matplotlib.pyplot as plt
+import numpy as np
+# Optional matplotlib style sheet: applied at import time when the
+# ADALIGN_MPLSTYLE environment variable is set to a non-empty value.
+style_path = os.environ.get("ADALIGN_MPLSTYLE")
+if style_path:
+    plt.style.use(style_path)
+import wandb
+from . import wandb_utils
+class StatPack:
+    """Dictionary-like container mapping a stat name to its recorded values.
+    ``add_stat`` appends to a per-key list; ``mean`` produces a new pack that
+    holds one scalar (or None) per key. The ``store_*`` methods write the
+    content of the pack into a folder in various formats.
+    """
+    def __init__(self):
+        self.data = {}
+    def add_stat(self, key: str, value: float | int | None):
+        """Append ``value`` to the list stored under ``key``, creating the list if needed."""
+        assert (
+            isinstance(value, float) or isinstance(value, int) or value is None
+        ), f"Value {value} is not a valid type"
+        if key not in self.data:
+            self.data[key] = []
+        self.data[key].append(value)
+    def add_stats(self, other: "StatPack"):
+        """Append each entry of ``other`` to this pack via ``add_stat``.
+        Every entry of ``other`` must itself be a scalar or None (for example
+        the result of ``mean()``); list entries fail the ``add_stat`` check.
+        """
+        for key in other.keys():
+            self.add_stat(key, other[key])
+    def __getitem__(self, key: str):
+        return self.data[key]
+    def __setitem__(self, key: str, value: Any):
+        self.data[key] = value
+    def __contains__(self, key: str):
+        return key in self.data
+    def __len__(self):
+        return len(self.data)
+    def __iter__(self):
+        return iter(self.data)
+    def keys(self):
+        return self.data.keys()
+    def values(self):
+        return self.data.values()
+    def items(self):
+        return self.data.items()
+    def mean(self):
+        """Return a new StatPack holding the mean of every list-valued key.
+        Keys whose value is not a list are left out of the result. A key whose
+        list holds only None values (or is empty) maps to None.
+        """
+        mean_st = StatPack()
+        for key in self.keys():
+            if isinstance(self[key], list):
+                # Ignore None entries so missing measurements do not bias the mean.
+                non_none_values = [v for v in self[key] if v is not None]
+                if non_none_values:
+                    mean_st[key] = np.mean(np.array(non_none_values))
+                else:
+                    mean_st[key] = None
+        return mean_st
+    @staticmethod
+    def _safe_filename(key: Any) -> str:
+        """Turn a stat key into a file-name component without separators or spaces."""
+        return str(key).replace(os.sep, "_").replace("/", "_").replace(" ", "_")
+    @staticmethod
+    def _as_list(values: Any) -> list:
+        """Return ``values`` unchanged when it is a list, else wrap the single entry in a list."""
+        return values if isinstance(values, list) else [values]
+    def store_plots(self, folder: str):
+        """Save one line plot per key as ``<key>.pdf`` inside ``folder``."""
+        os.makedirs(folder, exist_ok=True)
+        for key in self.keys():
+            plt.figure(figsize=(10, 5))
+            plt.plot(self[key])
+            plt.title(key)
+            # Sanitize like store_numpy: a key such as "a/b" would otherwise
+            # point into a subdirectory that does not exist.
+            plt.savefig(os.path.join(folder, f"{self._safe_filename(key)}.pdf"))
+            plt.close()
+    def store_numpy(self, folder: str):
+        """Save one float array per key as ``<key>.npy`` inside ``folder`` (None becomes NaN)."""
+        os.makedirs(folder, exist_ok=True)
+        for key in self.keys():
+            # Sanitize filename components (avoid slashes, spaces, etc.)
+            safe_key = self._safe_filename(key)
+            values = self._as_list(self[key])
+            # Convert None to NaN for numpy compatibility
+            arr = np.array(
+                [(np.nan if (v is None) else v) for v in values], dtype=float
+            )
+            np.save(os.path.join(folder, f"{safe_key}.npy"), arr)
+    def store_json(self, folder: str, filename: str = "stats.json"):
+        """Write the whole pack as one JSON object to ``folder/filename``."""
+        os.makedirs(folder, exist_ok=True)
+        with open(os.path.join(folder, filename), "w") as f:
+            json.dump(self.data, f, indent=4)
+    def store_csv(self, folder: str):
+        """Write ``folder/stats.csv`` with one row per key: the key, then its values."""
+        os.makedirs(folder, exist_ok=True)
+        # Open the file once: reopening it in "w" mode for every key truncates
+        # it each time and leaves only the last key on disk.
+        with open(os.path.join(folder, "stats.csv"), "w", newline="") as f:
+            writer = csv.writer(f)
+            for key in self.keys():
+                writer.writerow([key, *self._as_list(self[key])])
+    def store_pickle(self, folder: str):
+        """Pickle the whole key-to-values mapping into ``folder/stats.pkl``."""
+        os.makedirs(folder, exist_ok=True)
+        # Dump the full mapping in one go: one dump per key into the same
+        # freshly truncated file keeps only the last key's values.
+        with open(os.path.join(folder, "stats.pkl"), "wb") as f:
+            pickle.dump(self.data, f)

src_code_for_reproducibility/utils/wandb_utils.py ADDED Viewed

	@@ -0,0 +1,170 @@

+"""
+File: mllm/utils/wandb_utils.py
+Summary: Shared Weights & Biases helper functions.
+"""
+import os
+from typing import Any, Dict, Optional
+# Set to True by _try_import_wandb() once `import wandb` has succeeded.
+_WANDB_AVAILABLE = False
+# Holds the value returned by wandb.init() inside init(); None until then.
+_WANDB_RUN = None
+def _try_import_wandb():
+    """Report whether ``wandb`` can be imported, caching a success in ``_WANDB_AVAILABLE``.
+    A failed import is retried on the next call; any exception raised by the
+    import counts as "not available".
+    """
+    global _WANDB_AVAILABLE
+    if not _WANDB_AVAILABLE:
+        try:
+            import wandb  # type: ignore
+        except Exception:
+            _WANDB_AVAILABLE = False
+            return False
+        _WANDB_AVAILABLE = True
+    return True
+def _safe_get(cfg: Dict[str, Any], path: list[str], default: Any = None) -> Any:
+    """Walk ``cfg`` along the keys in ``path`` and return the value found there.
+    ``default`` is returned as soon as a level is not a ``dict`` or lacks the
+    next key. An empty ``path`` returns ``cfg`` itself.
+    NOTE(review): only plain ``dict`` levels are traversed; other mapping
+    types yield ``default`` - confirm callers pass plain dicts.
+    """
+    node: Any = cfg
+    for segment in path:
+        if isinstance(node, dict) and segment in node:
+            node = node[segment]
+            continue
+        return default
+    return node
+def is_enabled(cfg: Dict[str, Any]) -> bool:
+    """Return the truthiness of ``cfg["logging"]["wandb"]["enabled"]`` (False when absent)."""
+    flag = _safe_get(cfg, ["logging", "wandb", "enabled"], False)
+    return bool(flag)
+def init(cfg: Dict[str, Any], run_dir: str, run_name: Optional[str] = None) -> None:
+    """
+    Start a Weights & Biases run when the config enables it.
+    Does nothing when W&B is disabled in ``cfg`` or ``wandb`` cannot be imported.
+    Run settings are read from ``cfg["logging"]["wandb"]``; ``run_name`` is the
+    fallback for the run name. The resulting run is kept in ``_WANDB_RUN``.
+    """
+    global _WANDB_RUN
+    if not (is_enabled(cfg) and _try_import_wandb()):
+        return
+    import wandb  # type: ignore
+    def wandb_setting(name: str, fallback: Any) -> Any:
+        """Read one key of the ``logging.wandb`` section."""
+        return _safe_get(cfg, ["logging", "wandb", name], fallback)
+    init_kwargs: Dict[str, Any] = {
+        "project": wandb_setting("project", "llm-negotiation"),
+        "entity": wandb_setting("entity", None),
+        "mode": wandb_setting("mode", "online"),
+        "name": wandb_setting("name", run_name),
+        "group": wandb_setting("group", None),
+        # An explicit null in the config becomes an empty tag list.
+        "tags": wandb_setting("tags", []) or [],
+        "notes": wandb_setting("notes", None),
+    }
+    # Ensure files are written into the hydra run directory
+    os.makedirs(run_dir, exist_ok=True)
+    # setdefault: a WANDB_DIR already present in the environment is kept.
+    os.environ.setdefault("WANDB_DIR", run_dir)
+    # Convert cfg to plain types for W&B config; fall back to cfg as given
+    try:
+        from omegaconf import OmegaConf  # type: ignore
+        init_kwargs["config"] = OmegaConf.to_container(cfg, resolve=True)  # type: ignore
+    except Exception:
+        init_kwargs["config"] = cfg
+    _WANDB_RUN = wandb.init(dir=run_dir, reinit=True, **init_kwargs)
def log(metrics: Dict[str, Any], step: Optional[int] = None) -> None:
    """Send a flat metrics dictionary to W&B if a run is active.

    Returns without doing anything when ``wandb`` has not been imported
    successfully or no run has been started. Any exception raised while
    logging is swallowed.

    Args:
        metrics: Metric names mapped to values.
        step: When given, it is added to a copy of ``metrics`` under the key
            ``"step"`` (replacing any existing ``"step"`` entry); it is not
            passed as ``wandb.log``'s own ``step`` argument.
    """
    if _WANDB_RUN is None or not _WANDB_AVAILABLE:
        return
    try:
        import wandb  # type: ignore

        payload = metrics
        if step is not None:
            payload = dict(metrics, step=step)
        wandb.log(payload)
    except Exception:
        # Best-effort: a logging failure must not interrupt the caller.
        pass
def _flatten(prefix: str, data: Dict[str, Any], out: Dict[str, Any]) -> None:
    """Copy the leaves of a nested dict into ``out`` under dotted keys.

    Args:
        prefix: Dotted path of ``data`` itself; an empty string means the
            top-level keys are used as they are.
        data: Possibly nested dictionary to flatten.
        out: Dictionary that receives the flattened entries (mutated in
            place). Nested dicts contribute only their leaves, so an empty
            nested dict adds nothing.
    """
    for name, value in data.items():
        dotted = name if not prefix else f"{prefix}.{name}"
        if not isinstance(value, dict):
            out[dotted] = value
            continue
        _flatten(dotted, value, out)
def _summarize_value(value: Any) -> Dict[str, Any]:
    """Reduce one tally leaf to a small dictionary of summary entries.

    Returns:
        * ``{"none": 1}`` for ``None``;
        * ``{"value": float(value)}`` for ``int``/``float`` inputs;
        * ``{"size": 0}`` for anything numpy converts to an empty array;
        * ``mean``/``min``/``max`` (NaN-ignoring), ``last`` (final element
          of the flattened array) and ``size`` for other array-like input;
        * ``{"text": str(value)}`` if the numeric summary raises.
    """
    import numpy as np  # local import to avoid hard dependency during disabled mode

    if value is None:
        return {"none": 1}

    if isinstance(value, (int, float)):
        # Plain scalar: no statistics to compute.
        return {"value": float(value)}

    try:
        values = np.asarray(value)
        count = values.size
        if count == 0:
            return {"size": 0}
        stats: Dict[str, Any] = {}
        stats["mean"] = float(np.nanmean(values))
        stats["min"] = float(np.nanmin(values))
        stats["max"] = float(np.nanmax(values))
        stats["last"] = float(values.reshape(-1)[-1])
        stats["size"] = int(count)
        return stats
    except Exception:
        # Not numerically summarizable: fall back to the string form.
        return {"text": str(value)}
def log_tally(
    array_tally: Dict[str, Any], prefix: str = "", step: Optional[int] = None
) -> None:
    """Flatten and summarize ``Tally.array_tally`` and log it to W&B.

    Nested dictionaries are walked recursively. Every non-dict leaf is
    passed to ``_summarize_value`` and each entry of the resulting summary
    is logged under ``<prefix>.<dotted path>.<summary key>``. A leaf whose
    summary raises is recorded as ``<...>.error = 1`` instead.

    Does nothing when ``wandb`` is unavailable or no run is active.

    Args:
        array_tally: Nested dictionary whose leaves are values accumulated
            over time.
        prefix: Optional first component of every metric name.
        step: Forwarded to ``log``.
    """
    if not _WANDB_AVAILABLE or _WANDB_RUN is None:
        return
    summarized: Dict[str, Any] = {}
    root = [prefix] if prefix else []

    def walk(node: Any, path: list[Any]) -> None:
        if isinstance(node, dict):
            for k, v in node.items():
                walk(v, path + [k])
            return
        # node is a leaf accumulated over time. Coerce every path component
        # with str() so non-string keys (e.g. integer ids) produce a metric
        # name instead of a TypeError from str.join.
        key = ".".join(str(part) for part in root + path)
        try:
            summary = _summarize_value(node)
            for sk, sv in summary.items():
                summarized[f"{key}.{sk}"] = sv
        except Exception:
            summarized[f"{key}.error"] = 1

    walk(array_tally, [])
    if summarized:
        log(summarized, step=step)
def log_flat_stats(
    stats: Dict[str, Any], prefix: str = "", step: Optional[int] = None
) -> None:
    """Flatten a nested stats dictionary into dotted keys and log it to W&B.

    Does nothing when ``wandb`` is unavailable, no run is active, or the
    flattened result is empty.

    Args:
        stats: Possibly nested dictionary of statistics.
        prefix: Optional first component of every metric name.
        step: Forwarded to ``log``.
    """
    if _WANDB_RUN is None or not _WANDB_AVAILABLE:
        return
    flattened: Dict[str, Any] = {}
    _flatten(prefix, stats, flattened)
    if not flattened:
        return
    log(flattened, step=step)