Muqeeth commited on
Commit
9280c85
·
verified ·
1 Parent(s): d538038

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .hydra/config.yaml +178 -0
  2. .hydra/hydra.yaml +154 -0
  3. .hydra/overrides.yaml +1 -0
  4. seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/README.md +207 -0
  5. seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_config.json +42 -0
  6. seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_config.json +42 -0
  7. src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc +0 -0
  8. src_code_for_reproducibility/chat_utils/__pycache__/apply_template.cpython-312.pyc +0 -0
  9. src_code_for_reproducibility/chat_utils/__pycache__/chat_turn.cpython-312.pyc +0 -0
  10. src_code_for_reproducibility/chat_utils/__pycache__/template_specific.cpython-312.pyc +0 -0
  11. src_code_for_reproducibility/chat_utils/apply_template.py +84 -0
  12. src_code_for_reproducibility/chat_utils/chat_turn.py +27 -0
  13. src_code_for_reproducibility/chat_utils/template_specific.py +109 -0
  14. src_code_for_reproducibility/docs/Makefile +19 -0
  15. src_code_for_reproducibility/docs/generate_docs.py +249 -0
  16. src_code_for_reproducibility/docs/make.bat +35 -0
  17. src_code_for_reproducibility/markov_games/__init__.py +0 -0
  18. src_code_for_reproducibility/markov_games/__pycache__/__init__.cpython-312.pyc +0 -0
  19. src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-312.pyc +0 -0
  20. src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc +0 -0
  21. src_code_for_reproducibility/markov_games/__pycache__/gather_and_export_utils.cpython-312.pyc +0 -0
  22. src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-312.pyc +0 -0
  23. src_code_for_reproducibility/markov_games/__pycache__/linear_runner.cpython-312.pyc +0 -0
  24. src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-312.pyc +0 -0
  25. src_code_for_reproducibility/markov_games/__pycache__/mg_utils.cpython-312.pyc +0 -0
  26. src_code_for_reproducibility/markov_games/__pycache__/rollout_tree.cpython-312.pyc +0 -0
  27. src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-312.pyc +0 -0
  28. src_code_for_reproducibility/markov_games/agent.py +76 -0
  29. src_code_for_reproducibility/markov_games/alternative_actions_runner.py +138 -0
  30. src_code_for_reproducibility/markov_games/group_timesteps.py +150 -0
  31. src_code_for_reproducibility/markov_games/linear_runner.py +30 -0
  32. src_code_for_reproducibility/markov_games/markov_game.py +208 -0
  33. src_code_for_reproducibility/markov_games/mg_utils.py +89 -0
  34. src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py +153 -0
  35. src_code_for_reproducibility/markov_games/negotiation/nego_agent.py +242 -0
  36. src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py +168 -0
  37. src_code_for_reproducibility/markov_games/negotiation/tas_agent.py +108 -0
  38. src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py +118 -0
  39. src_code_for_reproducibility/markov_games/negotiation/tas_simple_simulation.py +169 -0
  40. src_code_for_reproducibility/markov_games/negotiation/tas_simulation.py +172 -0
  41. src_code_for_reproducibility/markov_games/rollout_tree.py +86 -0
  42. src_code_for_reproducibility/markov_games/run_markov_games.py +24 -0
  43. src_code_for_reproducibility/markov_games/simulation.py +87 -0
  44. src_code_for_reproducibility/markov_games/statistics_runner.py +405 -0
  45. src_code_for_reproducibility/markov_games/vine_ppo.py +10 -0
  46. src_code_for_reproducibility/models/__init__.py +0 -0
  47. src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc +0 -0
  48. src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc +0 -0
  49. src_code_for_reproducibility/models/__pycache__/inference_backend_dummy.cpython-312.pyc +0 -0
  50. src_code_for_reproducibility/models/adapter_training_wrapper.py +98 -0
.hydra/config.yaml ADDED
@@ -0,0 +1,178 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ experiment:
2
+ wandb_enabled: true
3
+ nb_epochs: 3000
4
+ nb_matches_per_iteration: 64
5
+ reinit_matches_each_it: true
6
+ checkpoint_every_n_iterations: 10
7
+ start_epoch: 0
8
+ resume_experiment: true
9
+ base_seed: 0
10
+ seed_group_size: 8
11
+ train: true
12
+ stat_methods_for_live_wandb: mllm.markov_games.negotiation.negotiation_statistics
13
+ name: no_press_10_1_ties_ad_align_nocurrtimestep
14
+ agent_buffer: true
15
+ keep_agent_buffer_count: ${lora_count}
16
+ agent_buffer_recent_k: -1
17
+ logging:
18
+ wandb:
19
+ enabled: false
20
+ project: llm-negotiation
21
+ entity: null
22
+ mode: online
23
+ name: null
24
+ group: null
25
+ tags: []
26
+ notes: null
27
+ temperature: 1.0
28
+ markov_games:
29
+ runner_method_name: LinearRunner
30
+ runner_kwargs: {}
31
+ group_by_round: true
32
+ simulation_class_name: NoPressSimulation
33
+ simulation_init_args:
34
+ nb_of_rounds: 10
35
+ quota_messages_per_agent_per_round: 0
36
+ game_type: 10-1-ties
37
+ atleast_one_conflict: true
38
+ item_types:
39
+ - hats
40
+ - books
41
+ - balls
42
+ agents:
43
+ 0:
44
+ agent_id: ${agent_0_id}
45
+ agent_name: Alice
46
+ agent_class_name: NoPressAgent
47
+ policy_id: base_llm/agent_adapter
48
+ init_kwargs:
49
+ goal: Maximize your total points over the whole game.
50
+ 1:
51
+ agent_id: ${agent_1_id}
52
+ agent_name: Bob
53
+ agent_class_name: NoPressAgent
54
+ policy_id: base_llm/agent_adapter
55
+ init_kwargs:
56
+ goal: Maximize your total points over the whole game.
57
+ models:
58
+ base_llm:
59
+ class: LeanLocalLLM
60
+ init_args:
61
+ llm_id: base_llm
62
+ model_name: Qwen/Qwen2.5-7B-Instruct
63
+ inference_backend: vllm
64
+ hf_kwargs:
65
+ device_map: auto
66
+ torch_dtype: bfloat16
67
+ max_memory:
68
+ 0: 20GiB
69
+ attn_implementation: flash_attention_2
70
+ inference_backend_init_kwargs:
71
+ enable_lora: true
72
+ seed: ${experiment.base_seed}
73
+ enable_prefix_caching: true
74
+ max_model_len: 10000.0
75
+ gpu_memory_utilization: 0.5
76
+ dtype: bfloat16
77
+ trust_remote_code: true
78
+ max_lora_rank: 32
79
+ enforce_eager: false
80
+ max_loras: ${lora_count}
81
+ max_cpu_loras: ${lora_count}
82
+ enable_sleep_mode: true
83
+ inference_backend_sampling_params:
84
+ temperature: ${temperature}
85
+ top_p: 1.0
86
+ max_tokens: 400
87
+ top_k: -1
88
+ logprobs: 0
89
+ adapter_configs:
90
+ agent_adapter:
91
+ task_type: CAUSAL_LM
92
+ r: 32
93
+ lora_alpha: 64
94
+ lora_dropout: 0.0
95
+ target_modules: all-linear
96
+ critic_adapter:
97
+ task_type: CAUSAL_LM
98
+ r: 32
99
+ lora_alpha: 64
100
+ lora_dropout: 0.0
101
+ target_modules: all-linear
102
+ enable_thinking: null
103
+ regex_max_attempts: 3
104
+ critics:
105
+ agent_critic:
106
+ module_pointer:
107
+ - base_llm
108
+ - critic_adapter
109
+ optimizers:
110
+ agent_optimizer:
111
+ module_pointer:
112
+ - base_llm
113
+ - agent_adapter
114
+ optimizer_class_name: torch.optim.Adam
115
+ init_args:
116
+ lr: 3.0e-06
117
+ weight_decay: 0.0
118
+ critic_optimizer:
119
+ module_pointer: agent_critic
120
+ optimizer_class_name: torch.optim.Adam
121
+ init_args:
122
+ lr: 3.0e-06
123
+ weight_decay: 0.0
124
+ trainers:
125
+ agent_trainer:
126
+ class: TrainerAdAlign
127
+ module_pointers:
128
+ policy:
129
+ - base_llm
130
+ - agent_adapter
131
+ policy_optimizer: agent_optimizer
132
+ critic: agent_critic
133
+ critic_optimizer: critic_optimizer
134
+ kwargs:
135
+ entropy_coeff: 0.0
136
+ entropy_topk: null
137
+ entropy_mask_regex: null
138
+ kl_coeff: 0.001
139
+ gradient_clipping: 1.0
140
+ restrict_tokens: null
141
+ mini_batch_size: 1
142
+ use_gradient_checkpointing: false
143
+ temperature: ${temperature}
144
+ device: cuda:0
145
+ use_gae: false
146
+ whiten_advantages: false
147
+ whiten_advantages_time_step_wise: false
148
+ skip_discounted_state_visitation: true
149
+ use_gae_lambda_annealing: false
150
+ gae_lambda_annealing_method: None
151
+ gae_lambda_annealing_method_params: None
152
+ gae_lambda_annealing_limit: 0.95
153
+ discount_factor: 0.9
154
+ use_rloo: true
155
+ enable_tokenwise_logging: false
156
+ pg_loss_normalization: nb_tokens
157
+ truncated_importance_sampling_ratio_cap: 2.0
158
+ reward_normalizing_constant: 100.0
159
+ ad_align_force_coop_first_step: false
160
+ ad_align_clipping: null
161
+ ad_align_gamma: 0.9
162
+ ad_align_exclude_k_equals_t: true
163
+ ad_align_use_sign: false
164
+ ad_align_beta: 1.0
165
+ use_old_ad_align: true
166
+ use_time_regularization: false
167
+ rloo_branch: false
168
+ reuse_baseline: false
169
+ train_on_which_data:
170
+ agent_trainer: ${agent_ids}
171
+ lora_count: 30
172
+ common_agent_kwargs:
173
+ goal: Maximize your total points over the whole game.
174
+ agent_0_id: Alice
175
+ agent_1_id: Bob
176
+ agent_ids:
177
+ - Alice
178
+ - Bob
.hydra/hydra.yaml ADDED
@@ -0,0 +1,154 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${oc.env:SCRATCH}/llm_negotiation/${now:%Y_%m}/${experiment.name}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task: []
115
+ job:
116
+ name: run
117
+ chdir: false
118
+ override_dirname: ''
119
+ id: ???
120
+ num: ???
121
+ config_name: no_press_10_1_ties_ad_align_nocurrtimestep.yaml
122
+ env_set: {}
123
+ env_copy: []
124
+ config:
125
+ override_dirname:
126
+ kv_sep: '='
127
+ item_sep: ','
128
+ exclude_keys: []
129
+ runtime:
130
+ version: 1.3.2
131
+ version_base: '1.1'
132
+ cwd: /scratch/muqeeth/llm_negotiation
133
+ config_sources:
134
+ - path: hydra.conf
135
+ schema: pkg
136
+ provider: hydra
137
+ - path: /scratch/muqeeth/llm_negotiation/configs
138
+ schema: file
139
+ provider: main
140
+ - path: ''
141
+ schema: structured
142
+ provider: schema
143
+ output_dir: /scratch/muqeeth/llm_negotiation/2025_11/no_press_10_1_ties_ad_align_nocurrtimestep
144
+ choices:
145
+ hydra/env: default
146
+ hydra/callbacks: null
147
+ hydra/job_logging: default
148
+ hydra/hydra_logging: default
149
+ hydra/hydra_help: default
150
+ hydra/help: default
151
+ hydra/sweeper: basic
152
+ hydra/launcher: basic
153
+ hydra/output: default
154
+ verbose: false
.hydra/overrides.yaml ADDED
@@ -0,0 +1 @@
 
 
1
+ []
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-7B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-7B-Instruct
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.17.1
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 32,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "gate_proj",
29
+ "v_proj",
30
+ "k_proj",
31
+ "down_proj",
32
+ "up_proj",
33
+ "o_proj",
34
+ "q_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_config.json ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "Qwen/Qwen2.5-7B-Instruct",
5
+ "bias": "none",
6
+ "corda_config": null,
7
+ "eva_config": null,
8
+ "exclude_modules": null,
9
+ "fan_in_fan_out": false,
10
+ "inference_mode": true,
11
+ "init_lora_weights": true,
12
+ "layer_replication": null,
13
+ "layers_pattern": null,
14
+ "layers_to_transform": null,
15
+ "loftq_config": {},
16
+ "lora_alpha": 64,
17
+ "lora_bias": false,
18
+ "lora_dropout": 0.0,
19
+ "megatron_config": null,
20
+ "megatron_core": "megatron.core",
21
+ "modules_to_save": null,
22
+ "peft_type": "LORA",
23
+ "qalora_group_size": 16,
24
+ "r": 32,
25
+ "rank_pattern": {},
26
+ "revision": null,
27
+ "target_modules": [
28
+ "gate_proj",
29
+ "v_proj",
30
+ "k_proj",
31
+ "down_proj",
32
+ "up_proj",
33
+ "o_proj",
34
+ "q_proj"
35
+ ],
36
+ "target_parameters": null,
37
+ "task_type": "CAUSAL_LM",
38
+ "trainable_token_indices": null,
39
+ "use_dora": false,
40
+ "use_qalora": false,
41
+ "use_rslora": false
42
+ }
src_code_for_reproducibility/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (146 Bytes). View file
 
src_code_for_reproducibility/chat_utils/__pycache__/apply_template.cpython-312.pyc ADDED
Binary file (3.92 kB). View file
 
src_code_for_reproducibility/chat_utils/__pycache__/chat_turn.cpython-312.pyc ADDED
Binary file (1.32 kB). View file
 
src_code_for_reproducibility/chat_utils/__pycache__/template_specific.cpython-312.pyc ADDED
Binary file (4.24 kB). View file
 
src_code_for_reproducibility/chat_utils/apply_template.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
+ from mllm.chat_utils.chat_turn import ChatTurn
4
+ from mllm.chat_utils.template_specific import (
5
+ custom_gemma3_template,
6
+ custom_llama3_template,
7
+ custom_qwen2_template,
8
+ custom_qwen3_template,
9
+ gemma3_assistant_postfix,
10
+ qwen2_assistant_postfix,
11
+ qwen3_assistant_postfix,
12
+ )
13
+
14
+
15
def get_custom_chat_template(tokenizer) -> str:
    """
    Return the custom chat-template string for the tokenizer's model family.

    The family is detected by substring match on ``tokenizer.name_or_path``.

    Raises:
        ValueError: if the tokenizer belongs to no supported family.
    """
    model_id = tokenizer.name_or_path.lower()
    # Guard-clause dispatch. "qwen2" is tested before "qwen3" so that e.g.
    # "Qwen2.5-7B-Instruct" resolves to the Qwen2 template.
    if "qwen2" in model_id:
        return custom_qwen2_template
    if "llama" in model_id:
        return custom_llama3_template
    if "qwen3" in model_id:
        return custom_qwen3_template
    if "gemma" in model_id:
        return custom_gemma3_template
    raise ValueError(f"Tokenizer {tokenizer.name_or_path} not supported")
29
+
30
+
31
def get_custom_assistant_postfix(tokenizer) -> torch.Tensor:
    """
    Return the token ids appended after an assistant turn for this
    tokenizer's model family.

    Families without a specific postfix (e.g. Llama) get an empty
    LongTensor, which is a no-op when concatenated.
    """
    model_id = tokenizer.name_or_path.lower()
    if "qwen2" in model_id:
        return qwen2_assistant_postfix
    if "qwen3" in model_id:
        return qwen3_assistant_postfix
    if "gemma" in model_id:
        return gemma3_assistant_postfix
    # No family-specific postfix: empty tensor so torch.cat is a no-op.
    return torch.tensor([], dtype=torch.long)
42
+
43
+
44
def tokenize_chats(chats: list[ChatTurn], tokenizer, enable_thinking) -> None:
    """
    Populate ``chat.chat_template_token_ids`` for every turn in *chats*.

    User turns are rendered through the model-family chat template; assistant
    turns reuse the token ids emitted by the inference engine
    (``chat.out_token_ids``). Turns that already carry template token ids are
    left untouched.

    # TODO: use engine tokens if available
    """
    custom_template = get_custom_chat_template(tokenizer)
    custom_assistant_postfix: torch.Tensor = get_custom_assistant_postfix(tokenizer)
    for i, chat in enumerate(chats):
        if chat.chat_template_token_ids is not None:
            continue  # already tokenized
        if chat.role == "user":
            # Only add the generation prompt when the assistant speaks next,
            # i.e. the following turn is not another user turn.
            next_chat = chats[i + 1] if i + 1 < len(chats) else None
            add_generation_prompt = not (next_chat and next_chat.role == "user")
            encoded_chat = tokenizer.apply_chat_template(
                [chat],
                return_tensors="pt",
                chat_template=custom_template,
                add_generation_prompt=add_generation_prompt,
                # The system prompt is emitted only before the first turn.
                add_system_prompt=(i == 0),
                enable_thinking=enable_thinking,
            ).flatten()
            # Close the preceding assistant turn with the family-specific
            # postfix tokens before this user turn begins.
            previous_chat = chats[i - 1] if i > 0 else None
            if previous_chat and previous_chat.role == "assistant":
                encoded_chat = torch.cat([custom_assistant_postfix, encoded_chat])
        elif chat.role == "assistant":
            # Reuse the exact ids produced by the inference engine.
            encoded_chat = chat.out_token_ids
        else:
            # ChatTurn restricts roles to user/assistant; fail loudly instead
            # of with an unbound-variable NameError if that is ever violated.
            raise ValueError(f"Unsupported chat role: {chat.role!r}")
        chat.chat_template_token_ids = encoded_chat
72
+
73
+
74
def chat_turns_to_token_ids(
    chats: list[ChatTurn], tokenizer, enable_thinking
) -> torch.Tensor:
    """
    Tokenize the chat turns and concatenate them into a single 1-D id tensor.

    Side effect: sets ``chat_template_token_ids`` on each turn via
    ``tokenize_chats``. The return annotation previously claimed
    ``list[int]``, but the function returns ``torch.cat`` of the per-turn
    tensors — a ``torch.Tensor``.
    """
    tokenize_chats(chats=chats, tokenizer=tokenizer, enable_thinking=enable_thinking)
    # Concatenate per-turn template ids in conversation order.
    return torch.cat([chat.chat_template_token_ids for chat in chats])
src_code_for_reproducibility/chat_utils/chat_turn.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import json
4
+ from dataclasses import dataclass
5
+ from pathlib import Path
6
+ from typing import Any, List, Literal, Optional, Tuple
7
+
8
+ import jsonschema
9
+ import torch
10
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
11
+
12
+ AgentId = str
13
+
14
+
15
class ChatTurn(BaseModel):
    """One message in a conversation, together with its tokenization state."""

    model_config = ConfigDict(arbitrary_types_allowed=True)  # needed for torch tensors

    role: str = Field(pattern="^(user|assistant)$")  # only these two roles are valid
    agent_id: AgentId  # ID of the agent with which the chat occurred
    content: str  # visible message text
    # Optional reasoning text kept separate from the visible content.
    reasoning_content: str | None = None
    chat_template_token_ids: torch.LongTensor | None = None  # Token ids of chat template format. For example, token ids of "<assistant>{content}</assistant>""
    out_token_ids: torch.LongTensor | None = (
        None  # tokens generated from inference engine
    )
    # presumably per-token log-probs from the inference engine — confirm with producer
    log_probs: torch.FloatTensor | None = None
    is_state_end: bool = False  # indicates whether this chat turn marks the end of a state in the trajectory
src_code_for_reproducibility/chat_utils/template_specific.py ADDED
@@ -0,0 +1,109 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import huggingface_hub
2
+ import torch
3
+ from transformers import AutoTokenizer
4
+
5
# Minimal Llama-3 chat template: optional fixed system header, then each
# message wrapped in <|start_header_id|>role<|end_header_id|> ... <|eot_id|>.
# `add_system_prompt` / `add_generation_prompt` are passed by the caller
# (see chat_utils/apply_template.py).
custom_llama3_template = """
{%- if add_system_prompt %}
{{- '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|>' }}
{%- endif %}
{%- for message in messages %}
{{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' }}
{%- endfor %}

{%- if add_generation_prompt %}
{{- '<|start_header_id|>' + 'assistant' + '<|end_header_id|>\n\n' }}
{%- endif %}
"""
17
+
18
# Token ids of a single "\n", used by apply_template.tokenize_chats to close
# a preceding assistant turn before the next user turn is appended.
# NOTE(review): these AutoTokenizer.from_pretrained calls run at import time
# and may download tokenizer files from the Hugging Face Hub — consider
# making them lazy if import-time network access becomes a problem.
qwen2_assistant_postfix = (
    AutoTokenizer.from_pretrained("Qwen/Qwen2.5-7B-Instruct")
    .encode("\n", return_tensors="pt")
    .flatten()
)
qwen3_assistant_postfix = (
    AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")
    .encode("\n", return_tensors="pt")
    .flatten()
)
gemma3_assistant_postfix = (
    AutoTokenizer.from_pretrained("google/gemma-3-4b-it")
    .encode("\n", return_tensors="pt")
    .flatten()
)
33
# Qwen2-family chat template (ChatML-style <|im_start|>/<|im_end|> markers).
# Assistant turns may carry reasoning either in `message.reasoning_content`
# or inline between <think>...</think>; only turns after the last user query
# re-emit the <think> block. `add_system_prompt` / `add_generation_prompt`
# are passed by the caller (see chat_utils/apply_template.py).
custom_qwen2_template = """
{%- if add_system_prompt %}
{{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{%- for message in messages %}
{%- if message.content is string %}
{%- set content = message.content %}
{%- else %}
{%- set content = '' %}
{%- endif %}
{%- if (message.role == "user") %}
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{%- set reasoning_content = '' %}
{%- if message.reasoning_content is string %}
{%- set reasoning_content = message.reasoning_content %}
{%- else %}
{%- if '</think>' in content %}
{%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
{%- set content = content.split('</think>')[-1].lstrip('\n') %}
{%- endif %}
{%- endif %}
{%- if loop.index0 > ns.last_query_index %}
{%- if reasoning_content %}
{{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{%- else %}
{{- '<|im_start|>' + message.role + '\n' + content }}
{%- endif %}
{{- '<|im_end|>\n' }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- endif %}
"""
72
+
73
# Qwen3-family chat template: plain ChatML rendering of user/assistant turns.
# When the caller disables thinking, an empty <think></think> block is
# emitted after the generation prompt so the model skips reasoning.
custom_qwen3_template = """
{%- for message in messages %}
{%- if message.content is string %}
{%- set content = message.content %}
{%- else %}
{%- set content = '' %}
{%- endif %}
{%- if (message.role == "user") %}
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
{%- elif message.role == "assistant" %}
{{- '<|im_start|>' + message.role + '\n' + content + '<|im_end|>' + '\n' }}
{%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
{{- '<|im_start|>assistant\n' }}
{%- if enable_thinking is defined and enable_thinking is false %}
{{- '<think>\n\n</think>\n\n' }}
{%- endif %}
{%- endif %}
"""
93
+
94
# Gemma-3 chat template: Gemma uses <start_of_turn>/<end_of_turn> markers and
# names the assistant role "model". `add_system_prompt` only controls whether
# the BOS token is emitted — Gemma has no separate system turn here.
custom_gemma3_template = """
{%- if add_system_prompt %}
{{- bos_token -}}
{%- endif %}
{%- for message in messages -%}
{%- if message['role'] == 'assistant' -%}
{%- set role = 'model' -%}
{%- else -%}
{%- set role = message['role'] -%}
{%- endif -%}
{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}
{%- endfor -%}
{%- if add_generation_prompt -%}
{{ '<start_of_turn>model\n' }}
{%- endif -%}
"""
src_code_for_reproducibility/docs/Makefile ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Minimal makefile for Sphinx documentation

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(SPHINXFLAGS)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(SPHINXFLAGS) can be used to pass extra
# per-invocation options to sphinx-build.
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(SPHINXFLAGS)
src_code_for_reproducibility/docs/generate_docs.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Script to automatically generate Sphinx documentation for all modules and build the HTML website.
4
+ """
5
+ import importlib.util
6
+ import os
7
+ import subprocess
8
+ import sys
9
+
10
+
11
def check_and_install_dependencies():
    """Check for required Sphinx dependencies and install any that are missing.

    A pip distribution name does not always match its importable module name:
    ``sphinxcontrib-napoleon`` is imported as ``sphinxcontrib.napoleon`` and
    ``sphinxcontrib-mermaid`` as ``sphinxcontrib.mermaid``. The original
    ``-`` -> ``_`` rewrite never found those modules, so they were reinstalled
    on every run. An explicit mapping fixes that.
    """
    # pip distribution name -> importable module name
    required_packages = {
        "sphinx": "sphinx",
        "sphinx-rtd-theme": "sphinx_rtd_theme",
        "sphinxcontrib-napoleon": "sphinxcontrib.napoleon",
        "sphinxcontrib-mermaid": "sphinxcontrib.mermaid",
        "sphinx-autodoc-typehints": "sphinx_autodoc_typehints",
    }

    missing_packages = []

    for package, module_name in required_packages.items():
        # find_spec raises ModuleNotFoundError when a parent package of a
        # dotted name (e.g. "sphinxcontrib") is itself absent.
        try:
            spec = importlib.util.find_spec(module_name)
        except ModuleNotFoundError:
            spec = None
        if spec is None:
            missing_packages.append(package)

    # Install missing packages in one pip invocation.
    if missing_packages:
        print(f"Installing missing dependencies: {', '.join(missing_packages)}")
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install"] + missing_packages
        )
        print("Dependencies installed successfully")
    else:
        print("All required dependencies are already installed")
40
+
41
+
42
def create_makefile(docs_dir):
    """Write a minimal Sphinx Makefile into *docs_dir*, unless one exists already."""
    makefile_path = os.path.join(docs_dir, "Makefile")

    # Never clobber a hand-edited Makefile.
    if os.path.exists(makefile_path):
        print(f"Makefile already exists at {makefile_path}")
        return

    print(f"Creating Makefile at {makefile_path}")

    makefile_content = """# Minimal makefile for Sphinx documentation

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(SPHINXFLAGS)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(SPHINXFLAGS)
"""

    with open(makefile_path, "w") as makefile:
        makefile.write(makefile_content)

    print("Makefile created successfully")
77
+
78
+
79
def create_make_bat(docs_dir):
    """Write a Windows ``make.bat`` for Sphinx into *docs_dir*, unless one exists."""
    make_bat_path = os.path.join(docs_dir, "make.bat")

    # Never clobber a hand-edited make.bat.
    if os.path.exists(make_bat_path):
        print(f"make.bat already exists at {make_bat_path}")
        return

    print(f"Creating make.bat at {make_bat_path}")

    make_bat_content = """@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
"""

    with open(make_bat_path, "w") as bat_file:
        bat_file.write(make_bat_content)

    print("make.bat created successfully")
130
+
131
+
132
def main():
    """Generate API docs with sphinx-apidoc, then build the HTML site.

    Steps:
      1. Ensure Sphinx and its extensions are installed.
      2. Run ``sphinx-apidoc`` over ``<project_root>/src`` to (re)generate
         the .rst files under ``docs/source``.
      3. Create Makefile / make.bat helpers if they are missing.
      4. Run ``make html`` (``make.bat html`` on Windows) and try to open
         the resulting index.html in the default browser.

    Exits with status 1 if the source directory is missing or if either
    sphinx-apidoc or the HTML build fails. Side effects: changes the process
    working directory to the docs directory before building.
    """
    # Check and install required dependencies
    print("=== Checking dependencies ===")
    check_and_install_dependencies()

    # Get the directory of this script
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Path to the project root
    project_root = os.path.dirname(script_dir)

    # Path to the source directory
    # NOTE(review): assumes the package lives under <project_root>/src — confirm layout.
    source_dir = os.path.join(project_root, "src")

    # Path to the docs source directory
    docs_source_dir = os.path.join(script_dir, "source")

    # Print paths for debugging
    print(f"Script directory: {script_dir}")
    print(f"Project root: {project_root}")
    print(f"Source directory: {source_dir}")
    print(f"Docs source directory: {docs_source_dir}")

    # Make sure the source directory exists
    if not os.path.exists(source_dir):
        print(f"Error: Source directory {source_dir} does not exist!")
        sys.exit(1)

    # Make sure the docs source directory exists
    if not os.path.exists(docs_source_dir):
        print(f"Creating docs source directory: {docs_source_dir}")
        os.makedirs(docs_source_dir)

    # Step 1: Run sphinx-apidoc to generate .rst files for all modules
    print("\n=== Generating API documentation ===")
    cmd = [
        "sphinx-apidoc",
        "-f",  # Force overwriting of existing files
        "-e",  # Put module documentation before submodule documentation
        "-M",  # Put module documentation before subpackage documentation
        "-o",
        docs_source_dir,  # Output directory
        source_dir,  # Source code directory
    ]

    print(f"Running command: {' '.join(cmd)}")
    result = subprocess.run(cmd, capture_output=True, text=True)

    # Print the output of the command
    print("STDOUT:")
    print(result.stdout)

    print("STDERR:")
    print(result.stderr)

    if result.returncode != 0:
        print(f"Error: sphinx-apidoc failed with return code {result.returncode}")
        sys.exit(1)

    # List the files in the docs source directory
    print("\nFiles in docs/source directory:")
    for file in sorted(os.listdir(docs_source_dir)):
        print(f"  {file}")

    print("\nDocumentation source files generated successfully!")

    # Step 2: Create Makefile and make.bat if they don't exist
    create_makefile(script_dir)
    create_make_bat(script_dir)

    # Step 3: Build the HTML documentation
    print("\n=== Building HTML documentation ===")

    # Determine the build command based on the platform
    if os.name == "nt":  # Windows
        build_cmd = ["make.bat", "html"]
    else:  # Unix/Linux/Mac
        build_cmd = ["make", "html"]

    # Change to the docs directory to run the build command
    # (make resolves SOURCEDIR/BUILDDIR relative to the cwd).
    os.chdir(script_dir)

    print(f"Running command: {' '.join(build_cmd)}")
    build_result = subprocess.run(build_cmd, capture_output=True, text=True)

    # Print the output of the build command
    print("STDOUT:")
    print(build_result.stdout)

    print("STDERR:")
    print(build_result.stderr)

    if build_result.returncode != 0:
        print(f"Error: HTML build failed with return code {build_result.returncode}")
        sys.exit(1)

    # Get the path to the built HTML documentation
    html_dir = os.path.join(script_dir, "build", "html")
    index_path = os.path.join(html_dir, "index.html")

    if os.path.exists(index_path):
        print(f"\nHTML documentation built successfully!")
        print(f"You can view it by opening: {index_path}")

        # Try to open the documentation in a browser; failure here is
        # non-fatal (e.g. headless environments).
        try:
            import webbrowser

            print("\nAttempting to open documentation in your default browser...")
            webbrowser.open(f"file://{index_path}")
        except Exception as e:
            print(f"Could not open browser automatically: {e}")
    else:
        print(f"\nWarning: HTML index file not found at {index_path}")
246
+
247
+
248
# Allow running this file directly as a script.
if __name__ == "__main__":
    main()
src_code_for_reproducibility/docs/make.bat ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

REM Fall back to the sphinx-build found on PATH when SPHINXBUILD is unset.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build

REM Probe that sphinx-build is runnable; errorlevel 9009 means "command not found".
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.https://www.sphinx-doc.org/
	exit /b 1
)

REM No target given: show Sphinx's help.
if "%1" == "" goto help

%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%

:end
popd
src_code_for_reproducibility/markov_games/__init__.py ADDED
File without changes
src_code_for_reproducibility/markov_games/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (159 Bytes). View file
 
src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-312.pyc ADDED
Binary file (3.2 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc ADDED
Binary file (4.95 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/gather_and_export_utils.cpython-312.pyc ADDED
Binary file (46.5 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-312.pyc ADDED
Binary file (6.17 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/linear_runner.cpython-312.pyc ADDED
Binary file (1.25 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-312.pyc ADDED
Binary file (9.72 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/mg_utils.cpython-312.pyc ADDED
Binary file (3.98 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/rollout_tree.cpython-312.pyc ADDED
Binary file (3.67 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-312.pyc ADDED
Binary file (3.9 kB). View file
 
src_code_for_reproducibility/markov_games/agent.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ In simple RL paradise, where the action dimensions are constant and well defined,
3
+ Agent classes are not necessary. But in MARL, with LLMs, there isn't always
4
+ a direct path from policy to action. For instance, from the observation of the environment,
5
+ a prompt must be created. Then, the outputs of the policy might be incorrect, so a second
6
+ request to the LLM must be sent before the action is well defined. This is why this Agent class exists.
7
+ It acts as a mini environment, bridging the gap between the core simulation and
8
+ the LLM policies.
9
+ """
10
+
11
+ from abc import ABC, abstractmethod
12
+ from collections.abc import Callable
13
+ from typing import Any, Tuple
14
+
15
+ from numpy.random import default_rng
16
+
17
+ from mllm.markov_games.rollout_tree import AgentActLog
18
+
19
+
20
class Agent(ABC):
    """Bridge between the core simulation and an LLM policy.

    An Agent turns an observation into a prompt, queries its policy (possibly
    several times, e.g. to retry on malformed outputs), and returns a
    well-defined action together with a log of the exchange.
    """

    @abstractmethod
    def __init__(
        self,
        seed: int,
        agent_id: str,
        agent_name: str,
        agent_policy: Callable[[list[dict]], str],
        *args,
        **kwargs,
    ):
        """
        Initialize the agent state.

        Args:
            seed: Seed for this agent's private RNG.
            agent_id: Unique identifier of the agent within the game.
            agent_name: Human-readable name.
            agent_policy: Callable mapping a chat (list of message dicts) to
                the policy's raw text output.
        """
        self.seed = seed
        self.agent_id = agent_id
        self.agent_name = agent_name
        # Bug fix: the original assigned the undefined name `policy`;
        # the constructor parameter is `agent_policy`.
        self.policy = agent_policy
        self.rng = default_rng(self.seed)
        raise NotImplementedError

    async def act(self, observation) -> Tuple[Any, AgentActLog]:
        """
        Query (possibly multiple times) a policy (or possibly a pool of policies) to
        obtain the action of the agent.

        Example:
            action = None
            prompt = self.observation_to_prompt(observation)
            while not self.valid(action):
                output = await self.policy.generate(prompt)
                action = self.policy_output_to_action(output)
            return action

        Returns:
            action
            step_info
        """
        raise NotImplementedError

    def get_safe_copy(self):
        """
        Return copy of the agent object that is decorrelated from the original object.
        """
        raise NotImplementedError

    def reset(self):
        """Reset the agent's internal state for a new trajectory."""
        raise NotImplementedError

    def render(self):
        """Render the agent's current state (implementation-defined)."""
        raise NotImplementedError

    def close(self):
        """Release any resources held by the agent."""
        raise NotImplementedError

    def get_agent_info(self):
        """Return implementation-defined metadata about the agent."""
        raise NotImplementedError
src_code_for_reproducibility/markov_games/alternative_actions_runner.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import copy
3
+ import json
4
+ import os.path
5
+ from typing import Any, Tuple
6
+
7
+ from mllm.markov_games.markov_game import AgentAndActionSafeCopy, MarkovGame
8
+ from mllm.markov_games.rollout_tree import (
9
+ AgentActLog,
10
+ RolloutTreeBranchNode,
11
+ RolloutTreeNode,
12
+ RolloutTreeRootNode,
13
+ StepLog,
14
+ )
15
+
16
+ AgentId = str
17
+
18
+
19
+
20
async def run_with_unilateral_alt_action(
    markov_game: MarkovGame,
    agent_id: AgentId,
    time_step: int,
    branch_node: RolloutTreeBranchNode,
    max_depth: int,
):
    """Generate one alternative branch where `agent_id` re-samples its action.

    The caller is expected to have pre-loaded the *other* agents' actions on
    `markov_game` (a safe copy); here only `agent_id`'s action is re-generated,
    the simulation advances one step, and the remainder of the branch is rolled
    out normally for up to `max_depth` steps or until termination. The
    resulting node chain is appended to ``branch_node.branches[agent_id]``.

    Args:
        markov_game: Safe copy of the game, pre-loaded with the other agents'
            actions for the first step.
        agent_id: Agent whose action is unilaterally re-sampled.
        time_step: Time step at which the branch starts.
        branch_node: Rollout-tree branch node the new branch attaches to.
        max_depth: Maximum number of steps to roll out along the branch.
    """
    # Re-sample this agent's action, then advance the simulation one step.
    await markov_game.set_action_of_agent(agent_id)
    terminated: bool = markov_game.take_simulation_step()
    step_log = markov_game.get_step_log()
    first_alternative_node = RolloutTreeNode(
        step_log=step_log,
        time_step=time_step,
    )

    # Roll out the rest of the branch up to max_depth (or termination).
    time_step += 1
    depth = 1
    previous_node = first_alternative_node
    while not terminated and depth <= max_depth:
        terminated, step_log = await markov_game.step()
        current_node = RolloutTreeNode(step_log=step_log, time_step=time_step)
        previous_node.child = current_node
        previous_node = current_node
        depth += 1
        time_step += 1

    # Attach the branch. `branches` may still be uninitialized; use `is None`
    # (identity) rather than `== None`.
    if branch_node.branches is None:
        branch_node.branches = {}
    branch_node.branches.setdefault(agent_id, []).append(first_alternative_node)
58
+
59
+
60
async def AlternativeActionsRunner(
    markov_game: MarkovGame,
    output_folder: str,
    nb_alternative_actions: int,
    max_depth: int,
    branch_only_on_new_round: bool = False,
):
    """
    Generate a trajectory with partially completed branches, where the
    branching comes from taking unilaterally different actions.
    The resulting data is used to estimate the updated advantage alignment
    policy gradient terms.
    Let k := nb_alternative_actions. Then the number of steps generated is
    O(Tk), where T is the maximum trajectory length.

    Args:
        markov_game: Game to roll out (mutated in place along the main branch).
        output_folder: Output location.  # NOTE(review): currently unused here — confirm intended.
        nb_alternative_actions: Number of alternative branches per agent per step.
        max_depth: Maximum rollout depth of each alternative branch.
        branch_only_on_new_round: Currently unused in this function body.

    Returns:
        RolloutTreeRootNode of the generated rollout tree.
    """

    tasks = []
    time_step = 0
    terminated = False
    root = RolloutTreeRootNode(
        id=markov_game.get_id(),
        crn_id=markov_game.get_crn_id()
    )
    previous_node = root

    while not terminated:
        # Snapshot the game BEFORE actions are applied, so each branch can
        # replay the step from the same state.
        mg_before_action = markov_game.get_safe_copy()

        # Get safe copies for main branch
        agent_action_safe_copies: dict[
            AgentId, AgentAndActionSafeCopy
        ] = await markov_game.get_actions_of_agents_without_side_effects()

        markov_game.set_actions_of_agents_manually(agent_action_safe_copies)
        terminated = markov_game.take_simulation_step()
        main_node = RolloutTreeNode(
            step_log=markov_game.get_step_log(), time_step=time_step
        )
        branch_node = RolloutTreeBranchNode(main_child=main_node)
        previous_node.child = branch_node
        previous_node = main_node

        # Get alternative branches by generating new unilateral actions
        for agent_id in markov_game.agent_ids:
            for _ in range(nb_alternative_actions):
                # Get safe copies for branches.
                # NOTE: the comprehension below reuses the name `agent_id`;
                # comprehension scope keeps the outer loop variable intact,
                # and the copies are built for ALL agents.
                branch_agent_action_safe_copies: dict[
                    AgentId, AgentAndActionSafeCopy
                ] = {
                    agent_id: AgentAndActionSafeCopy(
                        action=copy.deepcopy(agent_action_safe_copy.action),
                        action_info=copy.deepcopy(agent_action_safe_copy.action_info),
                        agent_after_action=agent_action_safe_copy.agent_after_action.get_safe_copy(),
                    )
                    for agent_id, agent_action_safe_copy in agent_action_safe_copies.items()
                }
                mg_branch: MarkovGame = mg_before_action.get_safe_copy()
                # Two-player assumption: pick the single agent that is not
                # `agent_id` and pin its action; `agent_id` will re-sample.
                other_agent_id = [id for id in mg_branch.agent_ids if id != agent_id][0]
                mg_branch.set_action_and_agent_after_action_manually(
                    agent_id=other_agent_id,
                    agent_action_safe_copy=branch_agent_action_safe_copies[
                        other_agent_id
                    ],
                )
                task = asyncio.create_task(
                    run_with_unilateral_alt_action(
                        markov_game=mg_branch,
                        time_step=time_step,
                        agent_id=agent_id,
                        branch_node=branch_node,
                        max_depth=max_depth,
                    )
                )
                tasks.append(task)
        time_step += 1

    # wait for all branches to complete
    await asyncio.gather(*tasks)

    return root
src_code_for_reproducibility/markov_games/group_timesteps.py ADDED
@@ -0,0 +1,150 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This module contains the logic for grouping time steps.
3
+ """
4
+ import copy
5
+ from typing import Callable
6
+
7
+ from mllm.markov_games.markov_game import MarkovGame
8
+ from mllm.markov_games.rollout_tree import (
9
+ AgentActLog,
10
+ RolloutTreeBranchNode,
11
+ RolloutTreeNode,
12
+ RolloutTreeRootNode,
13
+ StepLog,
14
+ )
15
+ from mllm.markov_games.simulation import SimulationStepLog
16
+
17
+ AgentId = str
18
+
19
+
20
def group_time_steps(
    rollout_tree: RolloutTreeRootNode,
    accumulation_stop_condition: Callable[[StepLog], bool],
) -> RolloutTreeRootNode:
    """
    During generation, we create rollout trees according to the real time steps.
    However, during training, we might want to treat groups of time steps as a single time step.
    As a concrete example, take Trust-and-Split. At each round, say we have X time steps of communication and then one time step for the split.
    Then the communication actions will not get any reward, and the split action will get the reward. During REINFORCE training, with discounting, this
    can cause training instability. We could instead treat every action in the round as being part of a single action, and give it the reward of the split action.
    This method helps to do this sort of grouping.
    It accumulates actions until the accumulation_stop_condition is met, and then creates a new node with the accumulated actions.
    It then recursively calls itself on the child node.
    Details:
        - The reward for the group is the reward of the last time step in the group.
        - The simulation log for the group is the simulation log of the last time step in the group.
        - The state end for the group becomes the first state end in the group.
        - The agent info for the group is the agent info of the last time step in the group.
        - Trailing steps that never satisfy the stop condition are silently
          dropped (the partial accumulation is never flushed).
        - Branching trajectories are not supported yet and raise an Exception.
    """

    def group_step_logs(step_logs: list[StepLog]) -> StepLog:
        """
        Concatenate per-agent chat turns across steps; keep only the first is_state_end.
        """
        # The grouped step inherits the LAST step's simulation log (rewards etc.).
        last_sim_log = step_logs[-1].simulation_step_log
        agent_ids = {aid for s in step_logs for aid in s.action_logs.keys()}
        grouped_logs: dict[AgentId, AgentActLog] = {}
        for aid in agent_ids:
            turns = []
            for s in step_logs:
                act = s.action_logs.get(aid)
                if act and act.chat_turns:
                    turns.extend(copy.deepcopy(act.chat_turns))
            disable_is_state_end = False
            # Only the first state_end should be True, the rest should be False
            for t in turns:
                if t.is_state_end:
                    if disable_is_state_end:
                        t.is_state_end = False
                    else:
                        disable_is_state_end = True
                    continue
            grouped_logs[aid] = AgentActLog(
                chat_turns=turns, info=step_logs[-1].action_logs[aid].info
            )
        return StepLog(action_logs=grouped_logs, simulation_step_log=last_sim_log)

    def group_time_steps_rec(
        current_node: RolloutTreeNode | RolloutTreeBranchNode,
        group_time_step: int,
        accumulation_step_logs: list[StepLog],
    ) -> RolloutTreeNode | RolloutTreeBranchNode:
        """
        Groups time steps. Recursion is used to handle branches.
        """
        assert isinstance(current_node, RolloutTreeNode) or isinstance(
            current_node, RolloutTreeBranchNode
        ), "Current node must be a tree node or a branch node. Is of type: " + str(
            type(current_node)
        )
        first_group_node = None
        current_group_node = None
        while current_node is not None:
            if isinstance(current_node, RolloutTreeBranchNode):
                raise Exception(
                    "Grouping timesteps by round is not supported for branching trajectories yet."
                )
            # Special recursive case for branches
            # (kept as a sketch for future branch support)
            # if isinstance(current_node, RolloutTreeBranchNode):
            #     branches = {}
            #     for agent_id, branch_nodes in current_node.branches.items():
            #         branch_group_nodes = []
            #         for branch_node in branch_nodes:
            #             branch_group_node = group_time_steps_rec(
            #                 current_node=branch_node,
            #                 group_time_step=group_time_step,
            #                 accumulation_step_logs=copy.deepcopy(accumulation_step_logs))
            #             branch_group_nodes.append(branch_group_node)
            #         branches[agent_id] = branch_group_nodes

            #     main_child_group_node = group_time_steps_rec(
            #         current_node=current_node.main_child,
            #         group_time_step=group_time_step,
            #         accumulation_step_logs=copy.deepcopy(accumulation_step_logs))

            #     return RolloutTreeBranchNode(main_child=main_child_group_node, branches=branches)

            # Accumulate
            accumulation_step_logs.append(current_node.step_log)
            if accumulation_stop_condition(current_node.step_log):
                # Flush the accumulated steps into one grouped node and
                # link it after the previous grouped node.
                grouped_step_logs = group_step_logs(accumulation_step_logs)
                accumulation_step_logs = []
                new_group_node = RolloutTreeNode(
                    step_log=grouped_step_logs, time_step=group_time_step, child=None
                )
                if first_group_node == None:
                    first_group_node = new_group_node
                group_time_step += 1
                if current_group_node is not None:
                    current_group_node.child = new_group_node
                current_group_node = new_group_node
            current_node = current_node.child
        return first_group_node

    node = group_time_steps_rec(
        current_node=rollout_tree.child, group_time_step=0, accumulation_step_logs=[]
    )
    return RolloutTreeRootNode(
        id=rollout_tree.id,
        crn_id=rollout_tree.crn_id,
        child=node,
        agent_ids=rollout_tree.agent_ids,
    )
+ )
133
+
134
+
135
def stop_when_round_ends(step_log: StepLog) -> bool:
    """Return True when `step_log` is the last time step of a round.

    Requires the simulation to record ``is_last_timestep_in_round`` in its
    per-step info dict; asserts otherwise.
    """
    info = step_log.simulation_step_log.info
    assert (
        "is_last_timestep_in_round" in info
    ), "To group by round, is_last_timestep_in_round must be set in the info of your simulation step log at each time step."
    return info["is_last_timestep_in_round"]
144
+
145
+
146
def group_by_round(rollout_tree: RolloutTreeRootNode) -> RolloutTreeRootNode:
    """Collapse each round of `rollout_tree` into a single grouped time step."""
    return group_time_steps(
        rollout_tree, accumulation_stop_condition=stop_when_round_ends
    )
src_code_for_reproducibility/markov_games/linear_runner.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import os.path
4
+
5
+ from mllm.markov_games.markov_game import MarkovGame
6
+ from mllm.markov_games.rollout_tree import RolloutTreeNode, RolloutTreeRootNode
7
+
8
+
9
async def LinearRunner(
    markov_game: MarkovGame, output_folder: str
) -> RolloutTreeRootNode:
    """Roll out a single branch-free trajectory and return its rollout tree."""
    root = RolloutTreeRootNode(
        id=markov_game.get_id(),
        crn_id=markov_game.get_crn_id(),
        agent_ids=markov_game.get_agent_ids(),
    )

    # Append one node per time step until the game reports termination.
    tail = root
    t = 0
    while True:
        done, step_log = await markov_game.step()
        node = RolloutTreeNode(step_log=step_log, time_step=t)
        tail.child = node
        tail = node
        t += 1
        if done:
            break

    return root
src_code_for_reproducibility/markov_games/markov_game.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This class unifies a simulation, and the agents acting in it (see `simulation.py` & `agent.py`).
3
+ In a MarkovGame step,
4
+ 1) each agent takes an action,
5
+ 2) the state transitions with respect to these actions,
6
+ 3) all relevant data of the step is appended to the historical data list
7
+
8
+ In order to perform 3), the agents and the simulation are expected, at each time step,
9
+ to return a log of the state transition (from their perspective).
10
+ For instance, the Simulation might send rewards and the agents might send prompting contexts to be used later to generate the training data.
11
+ A different approach would be to simply have the agents keep their data private and log it upon completion of a trajectory.
12
+ The approach we use here centralizes the data gathering aspect,
13
+ making it easy to create sub-trajectories (in the `runners` defined in `runners.py`) descriptions that
14
+ only log information for step transitions occurring after the branching out.
15
+ """
16
+ import asyncio
17
+ import copy
18
+ import json
19
+ import os
20
+ from dataclasses import dataclass
21
+ from typing import Any, List, Literal, Optional, Tuple
22
+
23
+ from transformers.models.idefics2 import Idefics2Config
24
+
25
+ from mllm.markov_games.agent import Agent
26
+ from mllm.markov_games.rollout_tree import AgentActLog, StepLog
27
+ from mllm.markov_games.simulation import Simulation
28
+
29
+ AgentId = str
30
+
31
+
32
@dataclass
class AgentAndActionSafeCopy:
    """Decorrelated snapshot of one agent's action and post-action state.

    Produced by ``MarkovGame.get_action_of_agent_without_side_effects`` so a
    branch can replay the action without mutating the original game.
    """

    # The action the agent chose.
    action: Any
    # Log of how the action was produced (chat turns, metadata).
    action_info: AgentActLog
    # Safe copy of the agent as it stood AFTER acting (an instance, not a class).
    agent_after_action: Agent
37
+
38
+
39
class MarkovGame(object):
    """Unifies a Simulation with the Agents acting in it.

    A step consists of: (1) every agent picks an action, (2) the simulation
    transitions on those actions, (3) the per-step logs are assembled into a
    StepLog for the rollout tree.
    """

    def __init__(
        self,
        id: int,
        agents: dict[AgentId, Agent],
        simulation: Simulation,
        crn_id: int,
    ):
        """
        Args:
            id: Identifier of this game instance.
            agents: Mapping from agent id to agent instance.
            simulation: Simulation object. Example: IPDSimulation
            crn_id: Common-random-numbers identifier of this game.
        """
        self.agents = agents
        # Live view over the agents dict's keys (stays in sync with it).
        self.agent_ids = self.agents.keys()
        self.simulation = simulation
        # Per-step scratch state, filled during step() and read by get_step_log().
        self.simulation_step_log = None
        self.agent_step_logs = {agent_id: None for agent_id in self.agent_ids}
        self.actions = {}
        self.id = id
        self.crn_id = crn_id

    def get_id(self) -> int:
        return self.id

    def get_crn_id(self) -> int:
        return self.crn_id

    def get_agent_ids(self) -> List[AgentId]:
        return list(self.agent_ids)

    async def get_action_of_agent_without_side_effects(
        self, agent_id: AgentId
    ) -> AgentAndActionSafeCopy:
        """
        Safe function to get an action of an agent without modifying the agent or the simulation.
        """
        agent = self.agents[agent_id]
        # Snapshot the pristine agent so it can be restored afterwards.
        agent_before_action = agent.get_safe_copy()
        obs = self.simulation.get_obs_agent(agent_id)
        action, action_info = await agent.act(observation=obs)
        # Restore the game's agent to its pre-action state; keep the mutated
        # agent only inside the returned safe copy.
        self.agents[agent_id] = agent_before_action
        agent_after_action = agent.get_safe_copy()
        return AgentAndActionSafeCopy(action, action_info, agent_after_action)

    async def get_actions_of_agents_without_side_effects(
        self,
    ) -> dict[AgentId, AgentAndActionSafeCopy]:
        """
        Concurrently gather a side-effect-free action from every agent.
        """
        tasks = []
        for agent_id in self.agent_ids:
            task = asyncio.create_task(
                self.get_action_of_agent_without_side_effects(agent_id)
            )
            tasks.append(task)
        # gather preserves task order, so zipping with agent_ids is safe.
        agent_and_action_safe_copies: list[
            AgentAndActionSafeCopy
        ] = await asyncio.gather(*tasks)
        return {
            agent_id: agent_and_action_safe_copy
            for agent_id, agent_and_action_safe_copy in zip(
                self.agent_ids, agent_and_action_safe_copies
            )
        }

    def set_action_and_agent_after_action_manually(
        self,
        agent_id: AgentId,
        agent_action_safe_copy: AgentAndActionSafeCopy,
    ):
        """
        Set the action and the agent after action manually.
        """
        self.actions[agent_id] = agent_action_safe_copy.action
        self.agent_step_logs[agent_id] = agent_action_safe_copy.action_info
        self.agents[agent_id] = agent_action_safe_copy.agent_after_action

    def set_actions_of_agents_manually(
        self, actions: dict[AgentId, AgentAndActionSafeCopy]
    ):
        """
        Set the actions of agents manually.
        """
        for agent_id, agent_action_safe_copy in actions.items():
            self.set_action_and_agent_after_action_manually(
                agent_id, agent_action_safe_copy
            )

    async def set_action_of_agent(self, agent_id: AgentId):
        """
        Query one agent for its action and record it (with its log),
        mutating the agent in place.
        """
        agent = self.agents[agent_id]
        obs = self.simulation.get_obs_agent(agent_id)
        action, action_info = await agent.act(observation=obs)
        self.actions[agent_id] = action
        self.agent_step_logs[agent_id] = action_info

    async def set_actions(self):
        """
        Concurrently query every agent for its action.
        """
        # background_tasks = set()
        tasks = []
        for agent_id in self.agent_ids:
            task = asyncio.create_task(self.set_action_of_agent(agent_id))
            tasks.append(task)
        await asyncio.gather(*tasks)

    def take_simulation_step(self):
        """
        Advance the simulation with the currently recorded actions.

        Returns:
            Whether the game has terminated.
        """
        terminated, self.simulation_step_log = self.simulation.step(self.actions)
        return terminated

    def get_step_log(self) -> StepLog:
        """
        Assemble the last step's simulation and agent logs into a StepLog.
        TODO: assert actions and simulation have taken step
        """
        step_log = StepLog(
            simulation_step_log=self.simulation_step_log,
            action_logs=self.agent_step_logs,
        )
        return step_log

    async def step(self) -> Tuple[bool, StepLog]:
        """
        Run one full step: gather actions, advance the simulation, and
        return (terminated, step_log).
        """
        await self.set_actions()
        terminated = self.take_simulation_step()
        step_log = self.get_step_log()
        return terminated, step_log

    def get_safe_copy(self):
        """
        Return a copy of the game decorrelated from the original: agents and
        simulation are safe-copied, step data is deep-copied.
        """

        new_markov_game = copy.copy(self)
        new_simulation = self.simulation.get_safe_copy()
        new_agents = {
            agent_id: agent.get_safe_copy() for agent_id, agent in self.agents.items()
        }

        # Reassign copied components
        new_markov_game.simulation = new_simulation
        new_markov_game.agents = new_agents

        # IMPORTANT: ensure agent_ids references the new agents dict, not the original
        new_markov_game.agent_ids = new_markov_game.agents.keys()

        # Deep-copy step data to avoid correlation
        new_markov_game.simulation_step_log = copy.deepcopy(self.simulation_step_log)
        new_markov_game.actions = copy.deepcopy(self.actions)
        # Rebuild logs to align exactly with new agent ids
        old_agent_step_logs = copy.deepcopy(self.agent_step_logs)
        new_markov_game.agent_step_logs = {
            agent_id: old_agent_step_logs.get(agent_id)
            for agent_id in new_markov_game.agent_ids
        }

        return new_markov_game
src_code_for_reproducibility/markov_games/mg_utils.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import copy
3
+ from collections.abc import Callable
4
+ from dataclasses import dataclass
5
+
6
+ from mllm.markov_games.ipd.ipd_agent import IPDAgent
7
+ from mllm.markov_games.ipd.ipd_simulation import IPD
8
+ from mllm.markov_games.markov_game import MarkovGame
9
+ from mllm.markov_games.negotiation.dond_agent import DealNoDealAgent
10
+ from mllm.markov_games.negotiation.dond_simulation import DealNoDealSimulation
11
+ from mllm.markov_games.negotiation.nego_hard_coded_policies import (
12
+ HardCodedNegoGreedyPolicy,
13
+ HardCodedNegoWelfareMaximizingPolicy,
14
+ )
15
+ from mllm.markov_games.ipd.Ipd_hard_coded_agents import AlwaysCooperateIPDAgent, AlwaysDefectIPDAgent
16
+ from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent
17
+ from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressSimulation
18
+ from mllm.markov_games.negotiation.tas_agent import TrustAndSplitAgent
19
+ from mllm.markov_games.negotiation.tas_rps_agent import TrustAndSplitRPSAgent
20
+ from mllm.markov_games.negotiation.tas_rps_simulation import TrustAndSplitRPSSimulation
21
+ from mllm.markov_games.negotiation.tas_simple_agent import TrustAndSplitSimpleAgent
22
+ from mllm.markov_games.negotiation.tas_simple_simulation import (
23
+ TrustAndSplitSimpleSimulation,
24
+ )
25
+ from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitSimulation
26
+ from mllm.markov_games.rollout_tree import (
27
+ AgentActLog,
28
+ RolloutTreeBranchNode,
29
+ RolloutTreeNode,
30
+ RolloutTreeRootNode,
31
+ StepLog,
32
+ )
33
+ from mllm.markov_games.simulation import SimulationStepLog
34
+
35
+ AgentId = str
36
+
37
+
38
@dataclass
class AgentConfig:
    """Declarative configuration for one agent of a Markov game."""

    # Unique id used to key agents, actions and logs.
    agent_id: str
    # Display name; substituted into agent prompts.
    agent_name: str
    # Name of an agent class imported in this module (resolved at init time).
    agent_class_name: str
    # Key into the ``policies`` mapping passed to init_markov_game_components.
    policy_id: str
    # Extra keyword arguments forwarded to the agent constructor.
    init_kwargs: dict
45
+
46
+
47
@dataclass
class MarkovGameConfig:
    """Declarative configuration for a full Markov game instance."""

    # Identifier of this game instance.
    id: int
    # Seed shared by simulation and agents (also used as the CRN id).
    seed: int
    # Name of a simulation class imported in this module.
    simulation_class_name: str
    # Keyword arguments forwarded to the simulation constructor.
    simulation_init_args: dict
    # One AgentConfig per participating agent.
    agent_configs: list[AgentConfig]
54
+
55
+
56
def init_markov_game_components(
    config: MarkovGameConfig, policies: dict[str, Callable[[list[dict]], str]]
):
    """
    Build a MarkovGame from a config: instantiate every agent, then the
    simulation, and wire them together.

    Args:
        config: Declarative description of the game (simulation class name,
            per-agent class names, seed, init kwargs).
        policies: Mapping from policy id to a policy callable; each agent
            receives the policy referenced by its ``policy_id``.

    Returns:
        MarkovGame: the fully initialized game.

    Raises:
        ValueError: if a configured class name does not match any class
            imported in this module.
    """

    def _resolve_class(class_name: str):
        # Look the class up among this module's imported names instead of
        # eval(): identical result for valid configs, but arbitrary
        # expressions in config files can no longer execute code.
        cls = globals().get(class_name)
        if cls is None:
            raise ValueError(f"Unknown class name in config: {class_name!r}")
        return cls

    agents = {}
    agent_names = []
    for agent_config in config.agent_configs:
        agent_id = agent_config.agent_id
        agent_name = agent_config.agent_name
        agent_class = _resolve_class(agent_config.agent_class_name)
        agent = agent_class(
            seed=config.seed,
            agent_id=agent_id,
            agent_name=agent_name,
            policy=policies[agent_config.policy_id],
            **agent_config.init_kwargs,
        )
        agents[agent_id] = agent
        agent_names.append(agent_name)
    simulation = _resolve_class(config.simulation_class_name)(
        seed=config.seed,
        agent_ids=list(agents.keys()),
        agent_names=agent_names,
        **config.simulation_init_args,
    )
    markov_game = MarkovGame(
        id=config.id,
        crn_id=config.seed,
        agents=agents,
        simulation=simulation,
    )
    return markov_game
src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from dataclasses import dataclass
3
+ from typing import Any, Dict, List, Tuple
4
+
5
+ from numpy.random import default_rng
6
+
7
+ from mllm.markov_games.rollout_tree import SimulationStepLog
8
+ from mllm.markov_games.negotiation.nego_simulation import Split, NegotiationState, NegotiationObs, NegotiationSimulation
9
+ from mllm.utils.get_coagent_id import get_coagent_id
10
+
11
+
12
+ AgentId = str
13
+
14
+
15
@dataclass
class DealNoDealState(NegotiationState):
    """Negotiation state specialized for the Deal-No-Deal variant."""

    # Item categories in play this episode.
    item_types: List[str]
    # Private integer per-item values, keyed by agent id (each sums to 10).
    values: Dict[AgentId, Dict[str, int]]
19
+
20
@dataclass
class DealNoDealObs(NegotiationObs):
    """Per-agent observation for the Deal-No-Deal variant."""

    # The observing agent's own private per-item values.
    my_values: Dict[str, int]
    # Item categories in play.
    item_types: List[str]
    # Co-agent values exposed to the observer, if any
    # (populated by DealNoDealSimulation.get_obs_agent).
    previous_values_coagent: Dict[str, int] | None
25
+
26
+
27
def random_partition_integer(rng, total: int, parts: int) -> List[int]:
    """
    Split ``total`` into ``parts`` non-negative integers that sum to ``total``.

    Draws ``parts - 1`` uniform cut points in [0, total] and returns the
    lengths of the resulting segments. Degenerate cases: ``parts <= 0``
    yields an empty list; ``total <= 0`` yields all zeros.
    """
    if parts <= 0:
        return []
    if total <= 0:
        return [0 for _ in range(parts)]
    # One uniform draw of parts-1 cut points, sorted into segment edges.
    cut_points = sorted(rng.integers(0, total + 1, size=parts - 1).tolist())
    edges = [0] + cut_points + [total]
    # Segment lengths are the consecutive differences of the edges.
    return [hi - lo for lo, hi in zip(edges[:-1], edges[1:])]
39
+
40
class DealNoDealSimulation(NegotiationSimulation):
    """
    Deal-No-Deal negotiation simulation.

    Two agents split a randomly sampled stock of items. Each agent holds
    private integer per-item values that sum to 10. Rewards are
    all-or-nothing: unless the two proposals exactly partition the stock,
    both agents score 0 (see ``get_rewards``).
    """

    def __init__(
        self,
        # NOTE(review): mutable default argument; only safe if neither this
        # class nor the base class mutates it — consider a tuple. TODO confirm.
        item_types: List[str] = ["books", "hats", "balls"],
        *args,
        **kwargs,
    ):
        super().__init__(item_types=item_types, *args, **kwargs)
        self.reset()

    def _other(self, agent_id: AgentId) -> AgentId:
        # Id of the single co-agent (two-player game).
        return get_coagent_id(self.agent_ids, agent_id)

    def _sample_stock(self) -> Dict[str, int]:
        """Sample the item quantities for a round."""
        # total items between 5 and 7
        total_items = int(self.rng.integers(5, 8))
        # nonnegative per-type counts summing to total_items
        parts = random_partition_integer(self.rng, total_items, len(self.item_types))
        # allow zeros per type
        return {t: int(c) for t, c in zip(self.item_types, parts)}

    def _sample_values_pair(self) -> Dict[AgentId, Dict[str, int]]:
        """Rejection-sample both agents' private per-item values."""
        # Each agent has integer non-negative values that sum to 10
        # Each item type valued by at least one agent
        # Some item type valued by both agents
        while True:
            vals_a = random_partition_integer(self.rng, 10, len(self.item_types))
            vals_b = random_partition_integer(self.rng, 10, len(self.item_types))
            a = {t: int(v) for t, v in zip(self.item_types, vals_a)}
            b = {t: int(v) for t, v in zip(self.item_types, vals_b)}
            # each item valued by at least one
            ok1 = all((a[t] > 0) or (b[t] > 0) for t in self.item_types)
            # some item valued by both
            ok2 = any((a[t] > 0) and (b[t] > 0) for t in self.item_types)
            if ok1 and ok2:
                return {self.agent_ids[0]: a, self.agent_ids[1]: b}

    def _is_valid_allocation(self, allocation: Dict[str, int], stock: Dict[str, int]) -> bool:
        """True iff ``allocation`` gives every item type an int in [0, stock]."""
        for t in self.item_types:
            v = allocation.get(t)
            if v is None:
                return False
            if not isinstance(v, int):
                return False
            if v < 0 or v > int(stock.get(t, 0)):
                return False
        return True

    def set_new_round_of_variant(self):
        # Keep same values, resample stock
        self.state.quantities = self._sample_stock()

    def get_info_of_variant(self, state: NegotiationState, actions: Dict[AgentId, Any]) -> Dict[str, Any]:
        # Deep copies so downstream logging cannot mutate simulation state.
        return {
            "quantities": copy.deepcopy(state.quantities),
            "values": copy.deepcopy(state.values),
            'splits': copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """
        Returns the rewards for each agent.

        All-or-nothing: if the two proposals do not exactly partition the
        stock for every item type, both agents get 0. Otherwise each agent
        earns (items kept) x (private per-item value), summed over types.
        """
        # NOTE(review): direct indexing split_a[t] assumes proposals were
        # validated to contain every item type (see _is_valid_allocation) —
        # confirm the caller enforces this.
        split_a = splits[self.agent_ids[0]].items_given_to_self
        split_b = splits[self.agent_ids[1]].items_given_to_self
        rewards = {self.agent_ids[0]: 0, self.agent_ids[1]: 0}
        for t in self.item_types:
            # If not complementary, return 0!
            if not split_a[t] + split_b[t] == self.state.quantities[t]:
                return {self.agent_ids[0]: 0, self.agent_ids[1]: 0}
            rewards[self.agent_ids[0]] += split_a[t] * self.state.values[self.agent_ids[0]][t]
            rewards[self.agent_ids[1]] += split_b[t] * self.state.values[self.agent_ids[1]][t]
        return rewards

    def get_obs(self):
        # One observation per agent, keyed by agent id.
        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """Build the private observation of ``agent_id``."""
        other_id = self._other(agent_id)
        obs = DealNoDealObs(
            round_nb=self.state.round_nb,
            last_message=self.state.last_message,
            current_agent=self.state.current_agent,
            quantities=copy.deepcopy(self.state.quantities),
            value=0.0,  # unused in DOND
            other_agent_split=None,  # not meaningful until split
            split_phase=self.state.split_phase,
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            my_values=copy.deepcopy(self.state.values[agent_id]),
            item_types=list(self.item_types),
            # NOTE(review): this exposes the co-agent's *current* values, not
            # a previous round's — confirm that is intended for this variant.
            previous_values_coagent=copy.deepcopy(self.state.values.get(other_id, {})),
        )
        return obs

    def reset(self):
        """Sample a fresh state (stock + values) and return initial observations."""
        start_agent = self.agent_ids[self._starting_agent_index]
        stock = self._sample_stock()
        values = self._sample_values_pair()
        self.state = DealNoDealState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities=stock,
            values=values,
            previous_values=None,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            split_phase=False,
            item_types=list(self.item_types),
        )
        return self.get_obs()
152
+
153
+
src_code_for_reproducibility/markov_games/negotiation/nego_agent.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from abc import abstractmethod
3
+ from collections.abc import Callable
4
+ from dataclasses import dataclass
5
+ from typing import Any, Dict, List, Tuple
6
+
7
+ import numpy as np
8
+
9
+ from mllm.markov_games.agent import Agent
10
+ from mllm.markov_games.negotiation.nego_simulation import Message, NegotiationObs, Split
11
+ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
12
+
13
+
14
@dataclass
class NegotiationAgentState:
    """Mutable per-episode state of a NegotiationAgent."""

    # Round number of the most recent observation processed.
    round_nb: int
    # Messages this agent has sent in the current round.
    nb_messages_sent_this_round: int
    # Length of chat_history already flushed into an AgentActLog.
    chat_counter: int
    # Full conversation so far (user prompts + assistant outputs).
    chat_history: List[ChatTurn]
20
+
21
+
22
class NegotiationAgent(Agent):
    """
    Base class for chat-driven negotiation agents.

    Variants set the prompt templates in their ``__init__`` and implement
    the message/split regex and split-parsing hooks. ``act`` assembles a
    user prompt from the observation, queries the policy under a
    format-constraining regex, and returns the parsed action plus a log of
    the chat turns added during the call.
    """

    def __init__(
        self,
        seed: int,
        agent_id: str,
        agent_name: str,
        policy: Callable[[List[Dict]], str],
        goal: str,
        # NOTE(review): mutable default arguments; the non-empty path
        # deep-copies before mutating, so only the empty defaults are shared.
        exploration_prompts: List[str] = [],
        exploration_prompt_probs: List[float] = [],
    ):
        self.seed = seed
        self.agent_id = agent_id
        self.agent_name = agent_name
        self.policy = policy
        self.goal = goal
        self.exploration_prompts_toggled = len(exploration_prompts) > 0
        if self.exploration_prompts_toggled:
            # Copy before appending so the caller's list is untouched; None is
            # the "no exploration prompt" sentinel carrying the leftover
            # probability mass.
            exploration_prompts = copy.deepcopy(exploration_prompts)
            exploration_prompts.append(None)
            self.exploration_prompts = exploration_prompts
            self.exploration_prompt_probs = np.array(exploration_prompt_probs)
            assert self.exploration_prompt_probs.sum() <= 1
            assert np.all(self.exploration_prompt_probs >= 0)
            self.exploration_prompt_probs = np.append(
                self.exploration_prompt_probs, 1 - self.exploration_prompt_probs.sum()
            )
        self.state = NegotiationAgentState(
            round_nb=0, nb_messages_sent_this_round=0, chat_counter=0, chat_history=[]
        )

        # Implemented in variants
        self.intro_prompt = ""
        self.new_round_prompt = ""
        self.last_round_prompt = ""
        self.send_split_prompt = ""
        self.wait_for_message_prompt = ""
        self.last_message_prompt = ""
        self.send_message_prompt = ""

    @abstractmethod
    def get_message_regex(self, observation: NegotiationObs) -> str:
        """Regex constraining the policy output in the message phase."""
        pass

    @abstractmethod
    def get_split_regex(self, observation: NegotiationObs) -> str:
        """Regex constraining the policy output in the split phase."""
        pass

    @abstractmethod
    def get_split_action(
        self, policy_output: str, observation: NegotiationObs
    ) -> Split:
        """Parse the policy output of the split phase into a Split action."""
        pass

    async def act(self, observation: NegotiationObs) -> Tuple[Any, AgentActLog]:
        """
        Build the user prompt for this observation, query the policy if it
        is this agent's turn to message or to split, and return the action
        (Message, Split, or None) with the new chat turns as an AgentActLog.
        """
        def dict_to_str(d: dict) -> str:
            # e.g. {"books": 3} -> "3 books"
            return ", ".join(f"{v} {k}" for k, v in d.items())

        def dict_to_eq_str(d: dict) -> str:
            # e.g. {"books": 3} -> "books=3"
            return ", ".join(f"{k}={v}" for k, v in d.items())

        is_our_turn = observation.current_agent == self.agent_id
        action: Any = None
        round_nb = observation.round_nb

        prompt_parts: List[str] = []
        # NOTE(review): vars() returns the observation's own __dict__;
        # the shallow copy below protects the dict, not nested values.
        obs_ctx = vars(observation)
        obs_ctx_formmated = obs_ctx.copy()
        # Stringify dict fields: "value" fields use k=v style, others "v k".
        for key in obs_ctx_formmated:
            if isinstance(obs_ctx_formmated[key], dict) and "value" not in key:
                obs_ctx_formmated[key] = dict_to_str(obs_ctx_formmated[key])
            elif isinstance(obs_ctx_formmated[key], dict) and "value" in key:
                obs_ctx_formmated[key] = dict_to_eq_str(obs_ctx_formmated[key])

        #######################################
        # build user prompt
        #######################################

        # First-ever call
        is_intro = round_nb == 0 and self.state.chat_counter == 0
        if is_intro:
            prompt_parts.append(
                self.intro_prompt.format(
                    goal=self.goal, agent=self.agent_name, **obs_ctx_formmated
                )
            )

        # New round
        is_new_round = round_nb > self.state.round_nb
        if is_new_round or is_intro:
            self.state.nb_messages_sent_this_round = 0
            if not is_intro:
                prompt_parts.append(self.last_round_prompt.format(**obs_ctx_formmated))
            prompt_parts.append(self.new_round_prompt.format(**obs_ctx_formmated))
            if self.exploration_prompts_toggled:
                # NOTE(review): uses global np.random, not a generator seeded
                # with self.seed — confirm reproducibility is handled elsewhere.
                exploration_prompt = self.exploration_prompts[
                    np.random.choice(
                        len(self.exploration_prompts), p=self.exploration_prompt_probs
                    )
                ]
                if exploration_prompt is not None:
                    prompt_parts.append(exploration_prompt)
            self.state.round_nb = round_nb

        # Wait for message
        if not is_our_turn and not observation.split_phase:
            prompt_parts.append(
                self.wait_for_message_prompt.format(**obs_ctx_formmated)
            )

        # Get last message
        if is_our_turn and not is_new_round and not is_intro:
            prompt_parts.append(self.last_message_prompt.format(**obs_ctx_formmated))

        # Prompt to send message
        must_send_message = not observation.split_phase and is_our_turn
        if must_send_message:
            prompt_parts.append(self.send_message_prompt.format(**obs_ctx_formmated))

        # Prompt to give split
        must_send_split = not must_send_message and observation.split_phase
        if must_send_split:
            # Describe the proposal format with one variable per item type.
            var_names = ["x", "y", "z", "w"]  # Extend as needed
            items_str = ", ".join(
                [
                    f"{var_names[i]} {item}"
                    for i, item in enumerate(obs_ctx["quantities"].keys())
                ]
            )
            ranges_str = ", ".join(
                [
                    f"{var_names[i]}: 0-{obs_ctx['quantities'][item]} (integer)"
                    for i, item in enumerate(obs_ctx["quantities"].keys())
                ]
            )
            proposal_style = f"Proposal: {items_str} where {ranges_str}."
            proposal_style2 = (
                f"<items_to_self> {items_str} </items_to_self> where {ranges_str}."
            )
            prompt_parts.append(
                self.send_split_prompt.format(
                    proposal_style=proposal_style,
                    proposal_style2=proposal_style2,
                    **obs_ctx_formmated,
                )
            )

        # Append one ChatTurn with is_state_end=True
        user_prompt = "\n".join(prompt_parts)
        self.state.chat_history.append(
            ChatTurn(
                agent_id=self.agent_id,
                role="user",
                content=user_prompt,
                is_state_end=True,
            )
        )

        #######################################
        # Get policy action
        #######################################

        # Query policy for the appropriate format
        if must_send_message:
            return_regex = self.get_message_regex(observation)
            policy_output = await self.policy(
                state=self.state.chat_history,
                agent_id=self.agent_id,
                regex=return_regex,
            )
            self.state.chat_history.append(
                ChatTurn(
                    agent_id=self.agent_id,
                    role="assistant",
                    content=policy_output.content,
                    reasoning_content=policy_output.reasoning_content,
                    log_probs=policy_output.log_probs,
                    out_token_ids=policy_output.out_token_ids,
                    is_state_end=False,
                )
            )
            action = Message(message=policy_output.content)
            self.state.nb_messages_sent_this_round += 1

        elif must_send_split:
            return_regex = self.get_split_regex(observation)
            policy_output = await self.policy(
                state=self.state.chat_history,
                agent_id=self.agent_id,
                regex=return_regex,
            )
            self.state.chat_history.append(
                ChatTurn(
                    agent_id=self.agent_id,
                    role="assistant",
                    content=policy_output.content,
                    reasoning_content=policy_output.reasoning_content,
                    log_probs=policy_output.log_probs,
                    out_token_ids=policy_output.out_token_ids,
                    is_state_end=False,
                )
            )
            action = self.get_split_action(policy_output.content, observation)
        else:
            # Not our turn to message or split: no action this step.
            action = None

        # Log only the chat turns added since the previous act() call.
        agent_step_log = AgentActLog(
            chat_turns=self.state.chat_history[self.state.chat_counter :], info=None
        )
        self.state.chat_counter = len(self.state.chat_history)
        return action, agent_step_log

    def get_safe_copy(self):
        """Shallow copy with an independent (deep-copied) mutable state."""
        agent_copy = copy.copy(self)
        agent_copy.state = copy.deepcopy(self.state)
        return agent_copy

    def reset(self):
        """Reset all per-episode state (round counter, chat history)."""
        self.state = NegotiationAgentState(
            round_nb=0, nb_messages_sent_this_round=0, chat_counter=0, chat_history=[]
        )
src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py ADDED
@@ -0,0 +1,168 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict, List, Literal, Tuple
5
+
6
+ from mllm.markov_games.negotiation.nego_simulation import (
7
+ NegotiationObs,
8
+ NegotiationSimulation,
9
+ NegotiationState,
10
+ Split,
11
+ compute_tas_style_rewards,
12
+ )
13
+
14
+ AgentId = str
15
+
16
+
17
@dataclass
class NoPressState(NegotiationState):
    """No-press (no-chat) negotiation state; adds nothing to the base state."""

    pass
20
+
21
+
22
@dataclass
class NoPressObs(NegotiationObs):
    """Observation for the no-press variant, where values are public."""

    # The co-agent's per-item values (public information in this variant).
    other_value: Dict[str, float]
25
+
26
+
27
class NoPressSimulation(NegotiationSimulation):
    """
    Negotiation without communication ("no press"): every round is a pure
    split phase and each agent can see the other's values (NoPressObs).

    ``game_type`` selects how per-item values are sampled; the
    ``same_round_value`` / ``atleast_one_conflict`` flags constrain the
    rejection sampling in ``_sample_values``.
    """

    def __init__(
        self,
        game_type: Literal["10-1-exclusive", "10-1-ties", "1-to-20"] = "1-to-20",
        same_round_value: bool = True,
        atleast_one_conflict: bool = False,
        *args,
        **kwargs,
    ):
        self.game_type = game_type
        self.same_round_value = same_round_value
        self.atleast_one_conflict = atleast_one_conflict
        super().__init__(*args, **kwargs)

    def _sample_values(self) -> Dict[AgentId, dict]:
        """
        Rejection-sample per-item values for both agents.

        Loops until (optionally) at least one item has differing values and
        (if same_round_value) the agents' value totals differ.
        NOTE(review): for restrictive game types ("10-1-exclusive") some flag
        combinations may make the loop long or unsatisfiable — confirm.
        """
        values = defaultdict(dict)
        # Before reset() the state does not exist yet; fall back to the
        # configured item types.
        if self.state is None:
            item_types = self.item_types
        else:
            item_types = list(self.state.quantities.keys())
        while True:
            for item in item_types:
                if self.game_type == "10-1-exclusive":
                    # One agent values the item 10, the other 1.
                    v = int(self.rng.choice([1, 10]))
                    values[self.agent_ids[0]][item] = v
                    values[self.agent_ids[1]][item] = 10 if v == 1 else 1
                elif self.game_type == "10-1-ties":
                    # Independent draws; ties allowed.
                    for aid in self.agent_ids:
                        values[aid][item] = int(self.rng.choice([1, 10]))
                elif self.game_type == "1-to-20":
                    # Uniform integer values in [1, 20].
                    for aid in self.agent_ids:
                        values[aid][item] = int(self.rng.integers(1, 21))
            if self.atleast_one_conflict:
                has_conflict = False
                for item in item_types:
                    agent_values_for_item = [
                        values[aid][item] for aid in self.agent_ids
                    ]
                    if len(set(agent_values_for_item)) > 1:
                        has_conflict = True
                        break
                if not has_conflict:
                    continue
            agent_values = [sum(v.values()) for v in values.values()]
            if len(set(agent_values)) == 1 or not self.same_round_value:
                break
        return values

    def _sample_quantities(self) -> Dict[str, int]:
        # Fixed stock: 10 of every item type.
        return {item.lower(): 10 for item in self.item_types}

    def set_new_round_of_variant(self):
        # Fresh stock and values; no-press rounds are split-only.
        self.state.quantities = self._sample_quantities()
        self.state.values = self._sample_values()
        self.state.split_phase = True

    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        # Deep copies so downstream logging cannot mutate simulation state.
        return {
            "quantities": copy.deepcopy(state.quantities),
            "values": copy.deepcopy(state.values),
            "splits": copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Proportional-allocation rewards, shared with the TAS variants."""
        return compute_tas_style_rewards(
            self.agent_ids, self.state.values, splits, self.state.quantities
        )

    def get_obs(self):
        # One observation per agent, keyed by agent id.
        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """Build ``agent_id``'s observation, including last-round outcomes."""
        other_id = self._other(agent_id)
        # All "last_*" fields are None on the very first round.
        last_value_coagent = (
            None
            if self.state.previous_values is None
            else self.state.previous_values.get(other_id)
        )
        last_points_coagent = (
            None
            if self.state.previous_points is None
            else round(self.state.previous_points.get(other_id), 1)
        )
        last_value_agent = (
            None
            if self.state.previous_values is None
            else self.state.previous_values.get(agent_id)
        )
        last_points_agent = (
            None
            if self.state.previous_points is None
            else round(self.state.previous_points.get(agent_id), 1)
        )
        last_split_coagent = None
        last_split_agent = None
        if self.state.previous_splits is not None:
            last_split_coagent = self.state.previous_splits[
                other_id
            ].items_given_to_self
            last_split_agent = self.state.previous_splits[agent_id].items_given_to_self
        obs = NoPressObs(
            round_nb=self.state.round_nb,
            last_message="",
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            current_agent=self.state.current_agent,
            other_agent=self.agent_id_to_name[other_id],
            quantities=self.state.quantities,
            item_types=self.item_types,
            value=self.state.values[agent_id],
            split_phase=self.state.split_phase,
            last_split_agent=last_split_agent,
            last_value_agent=last_value_agent,
            last_points_agent=last_points_agent,
            last_split_coagent=last_split_coagent,
            last_value_coagent=last_value_coagent,
            last_points_coagent=last_points_coagent,
            # Values are public in the no-press variant.
            other_value=self.state.values[other_id],
            last_quantities=self.state.previous_quantities,
        )
        return obs

    def reset(self):
        """Sample a fresh state and return the initial observations."""
        start_agent = self.agent_ids[self._starting_agent_index]
        quantities = self._sample_quantities()
        values = self._sample_values()
        self.state = NoPressState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities=quantities,
            values=values,
            previous_values=None,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            split_phase=True,
            previous_splits=None,
            previous_points=None,
            previous_quantities=None,
        )
        return self.get_obs()
src_code_for_reproducibility/markov_games/negotiation/tas_agent.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
2
+ from mllm.markov_games.negotiation.nego_simulation import Split
3
+ from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitObs
4
+
5
+
6
class TrustAndSplitAgent(NegotiationAgent):
    """
    Negotiation agent for the Trust-and-Split game: short chat rounds
    followed by simultaneous item-split proposals in
    ``<items_to_self> ... </items_to_self>`` format.
    """

    def __init__(self, num_message_chars, *args, **kwargs):
        # Max characters allowed per chat message (enforced via regex).
        self.num_message_chars = num_message_chars
        super().__init__(*args, **kwargs)
        self.intro_prompt = (
            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
            "Setup:\n"
            "1. The game has multiple independent rounds.\n"
            "2. In each round, there are multiple items to split between the two agents.\n"
            "3. Both agents are assigned a per-item value between 1 and 20 (inclusive) in each round.\n"
            "4. You can only observe your own per-item values.\n"
            "5. Because assignments are random, both agents are equally likely to have same expected per-item value.\n"
            "\n"
            "Protocol:\n"
            "1. At the start of the round, one agent begins the conversation. The starting role alternates each round.\n"
            "2. Agents exchange a short chat ({quota_messages_per_agent_per_round} messages per round per agent) to negotiate how to split the item.\n"
            "   - Use this chat to communicate your private per-item value to make informed proposals.\n"
            "3. After the chat, both agents simultaneously propose the amount of each item they will keep.\n"
            "4. If the total sum of proposals is less than or equal to the item quantity, both agents receive their proposed amounts.\n"
            "5. If the total sum of proposals exceeds the item quantity, they are allocated proportionally.\n"
            "6. Your points for the round = (amount you receive per item) x (your per-item value for that round), added across all items.\n"
            "7. Points are accumulated across rounds.\n"
            "Your goal: {goal}\n"
        )
        self.new_round_prompt = (
            "A New Round Begins\n"
            "The items to split are {quantities}.\n"
            "Your per-item values are {value}."
        )
        self.last_round_prompt = (
            "Last Round Summary:\n"
            " - Items to split: {last_quantities}\n"
            " - Your per-item values: {last_value_agent}\n"
            " - {other_agent}'s per-item values: {last_value_coagent}\n"
            " - You proposed: {last_split_agent}\n"
            " - You earned: {last_points_agent} points\n"
            " - {other_agent} proposed: {last_split_coagent}\n"
            " - {other_agent} earned: {last_points_coagent} points\n"
            " - Round Complete.\n"
        )
        self.send_split_prompt = (
            "Message quota is finished for this round.\n"
            "{other_agent} has finalized their proposal.\n"
            "Submit your finalization now\n"
            "Respond with {proposal_style2}"
        )
        # self.wait_for_message_prompt = "Wait for {other_agent} to send a message..."
        self.wait_for_message_prompt = ""
        self.last_message_prompt = "{other_agent} said: {last_message}"
        # self.send_message_prompt = (
        #     f"Send your message now (max {self.num_message_chars} chars)."
        # )
        self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."

    def get_message_regex(self, observation: TrustAndSplitObs) -> str:
        # Constrain policy output to a length-bounded <message> tag.
        return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"

    # def get_message_regex(self, observation: TrustAndSplitObs) -> str:
    #     return rf"(?s).{{0,{self.num_message_chars}}}"

    def get_split_regex(self, observation: TrustAndSplitObs) -> str:
        """Regex for the split proposal; item names accept singular/plural."""
        items = list(observation.quantities.keys())
        # Accept both singular and plural forms
        item_pattern = "|".join(
            [f"{item[:-1]}s?" if item.endswith("s") else f"{item}s?" for item in items]
        )
        regex = rf"(?i)<items_to_self> ?((?:\s*(?P<num>(10|[0-9]))\s*(?P<item>{item_pattern})\s*,?)+) ?</items_to_self>"
        return regex

    def get_split_action(
        self, policy_output: str, observation: TrustAndSplitObs
    ) -> Split:
        """
        Parse '<items_to_self> N item, ...' output into a Split; items not
        mentioned default to 0.
        """
        items = list(observation.quantities.keys())
        import re as _re

        split_regex = self.get_split_regex(observation)
        items_given_to_self = {item: 0 for item in items}
        m = _re.match(split_regex, policy_output.strip())
        if m:
            # Find all (number, item) pairs
            item_pattern = "|".join(
                [
                    f"{item[:-1]}s?" if item.endswith("s") else f"{item}s?"
                    for item in items
                ]
            )
            inner_regex = rf"(?i)(10|[0-9])\s*({item_pattern})"

            def normalize_item_name(item_str):
                # Map singular/plural/case variants back to the canonical name.
                # NOTE(review): falls through to None for unmatched strings,
                # which would add a None key below — confirm the outer regex
                # makes that unreachable.
                for orig in items:
                    if item_str.lower() == orig.lower():
                        return orig
                    if orig.endswith("s") and item_str.lower() == orig[:-1].lower():
                        return orig
                    if (
                        not orig.endswith("s")
                        and item_str.lower() == orig.lower() + "s"
                    ):
                        return orig

            for num, item in _re.findall(inner_regex, m.group(1)):
                items_given_to_self[normalize_item_name(item)] = int(num)
        return Split(items_given_to_self=items_given_to_self)
src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from collections.abc import Callable
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict, List, Tuple
5
+
6
+ from mllm.markov_games.agent import Agent
7
+ from mllm.markov_games.negotiation.nego_agent import (
8
+ Message,
9
+ NegotiationAgent,
10
+ NegotiationAgentState,
11
+ Split,
12
+ )
13
+ from mllm.markov_games.negotiation.tas_rps_simulation import TrustAndSplitRPSObs
14
+ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
15
+
16
+
17
class TrustAndSplitRPSAgent(NegotiationAgent):
    """
    Trust-and-Split agent with rock-paper-scissors value assignment:
    10 coins per round, per-coin value (10 vs 1) determined by which agent
    holds the upper RPS hand. Message/proposal tag formats are switchable
    via the *_start_end_format flags.
    """

    def __init__(
        self,
        num_message_chars: int,
        message_start_end_format: bool = False,
        proposal_start_end_format: bool = False,
        *args,
        **kwargs,
    ):
        # Max characters allowed per chat message (enforced via regex).
        self.num_message_chars = num_message_chars
        # Choose <<message_start>>/<<message_end>> vs <message></message> tags.
        self.message_start_end_format = message_start_end_format
        # Choose <<proposal_start>>/<<proposal_end>> vs <coins_to_self> tags.
        self.proposal_start_end_format = proposal_start_end_format
        super().__init__(*args, **kwargs)
        self.intro_prompt = (
            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
            "\n"
            "Setup:\n"
            "1. The game has multiple independent rounds.\n"
            "2. In each round, there are 10 coins to split between the two agents.\n"
            "3. Each agent's per-coin value for that round is determined as follows:\n"
            "   - Both agents are randomly assigned a rock, paper or scissors hands\n"
            "   - Rock has the upper hand over scissors, scissors has the upper hand over paper and paper has the upper hand over rock.\n"
            "   - The agent with the upper hand has a per-coin value of 10.\n"
            "   - The agent with the lower hand has a per-coin value of 1.\n"
            "4. You only see your own hand, but you may communicate it in messages and infer your value based on the other agent's hand.\n"
            "5. Over many rounds both agents are equally likely to have the upper and lower hand.\n"
            "\n"
            "Protocol:\n"
            "1. At the start of the round, one agent begins the conversation. The starting role alternates each round.\n"
            "2. Agents exchange a short chat ({quota_messages_per_agent_per_round} messages per round per agent) to negotiate how to split the 10 coins.\n"
            "   - Use this chat to communicate your hand so that both agents can determine their per-coin values.\n"
            "3. After the chat, both agents simultaneously propose how many coins they keep.\n"
            "4. If the total sum of proposals is less than or equal to 10, both agents receive their proposals.\n"
            "5. If the total sum of proposals exceeds 10, the coins are allocated proportionally.\n"
            "6. Your points for the round = (coins you receive) x (your per-coin value for that round). \n"
            "7. The points are accumulated across rounds.\n"
            "Your goal: {goal}\n"
        )
        self.new_round_prompt = (
            "A New Round Begins\n"
            "Your hand is {hand}. You don't know {other_agent}'s hand yet.\n"
        )
        # self.last_round_prompt = (
        #     "Last Round Summary:\n"
        #     " - Your hand: {last_hand_agent}\n"
        #     " - {other_agent}'s hand: {last_hand_coagent}\n"
        #     " - Your value per coin: {last_value_agent}\n"
        #     " - {other_agent}'s value per coin: {last_value_coagent}\n"
        #     " - You proposed: {last_split_agent} coins\n"
        #     " - You earned: {last_points_agent} points\n"
        #     " - {other_agent} proposed: {last_split_coagent} coins\n"
        #     " - {other_agent} earned: {last_points_coagent} points\n"
        #     " - Round Complete.\n"
        # )
        self.last_round_prompt = "In the previous round, {other_agent} had a {last_hand_value_coagent} hand and proposed {last_split_coagent} coins.\n"
        if self.proposal_start_end_format:
            self.send_split_prompt = (
                "Submit your proposal\n"
                "Respond with <<proposal_start>> x <<proposal_end>> where x is an integer in [0, 10]."
            )
        else:
            self.send_split_prompt = (
                "Submit your proposal\n"
                "Respond with <coins_to_self> x </coins_to_self> where x is an integer in [0, 10]."
            )
        self.wait_for_message_prompt = "Wait for {other_agent} to send a message..."
        # self.wait_for_message_prompt = ""
        self.last_message_prompt = "{other_agent} said: {last_message}"
        if self.message_start_end_format:
            self.send_message_prompt = f"Send your message now in <<message_start>>...<<message_end>> (<={self.num_message_chars} chars)."
        else:
            self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."

    def get_message_regex(self, observation: TrustAndSplitRPSObs) -> str:
        # Length-bounded message in whichever tag format is configured.
        if self.message_start_end_format:
            return (
                rf"<<message_start>>[\s\S]{{0,{self.num_message_chars}}}<<message_end>>"
            )
        else:
            return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"

    def get_split_regex(self, observation: TrustAndSplitRPSObs) -> str:
        # Proposal is a single integer 0-10 in the configured tag format.
        if self.proposal_start_end_format:
            return r"<<proposal_start>> ?(10|[0-9]) ?<<proposal_end>>"
        else:
            return r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>"

    def get_split_action(
        self, policy_output: str, observation: TrustAndSplitRPSObs
    ) -> Split:
        """Parse the proposed coin count into a Split action."""
        import re as _re

        if self.proposal_start_end_format:
            m = _re.search(
                r"<<proposal_start>> ?(10|[0-9]) ?<<proposal_end>>", policy_output
            )
        else:
            m = _re.search(
                r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>", policy_output
            )
        # NOTE(review): the int(policy_output) fallback raises ValueError for
        # non-numeric untagged output — confirm the policy regex guarantees a
        # match or that the caller handles this.
        coins_int = int(m.group(1)) if m else int(policy_output)
        return Split(items_given_to_self={"coins": coins_int})
src_code_for_reproducibility/markov_games/negotiation/tas_simple_simulation.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict, List, Literal
5
+
6
+ from numpy.random import default_rng
7
+
8
+ from mllm.markov_games.negotiation.nego_simulation import (
9
+ NegotiationObs,
10
+ NegotiationSimulation,
11
+ NegotiationState,
12
+ Split,
13
+ compute_tas_style_rewards,
14
+ )
15
+
16
+ AgentId = str
17
+
18
+
19
@dataclass
class TrustAndSplitSimpleState(NegotiationState):
    """State of the simple Trust-and-Split variant; no fields beyond the base."""
22
+
23
+
24
@dataclass
class TrustAndSplitSimpleObs(NegotiationObs):
    """Observation of the simple variant.

    Adds only a qualitative hint about the co-agent's previous value.
    """

    # "higher"/"lower" relative to the agent's own previous value, or None
    # before any round has completed.
    last_value_str_coagent: str | None
27
+
28
+
29
class TrustAndSplitSimpleSimulation(NegotiationSimulation):
    """Simplified Trust-and-Split game over a single "coins" item.

    Each round both agents receive a private scalar value for coins; draws
    where the two values tie are rejected and resampled.
    """

    def __init__(
        self,
        game_type: Literal["10-1-exclusive", "1-to-10"] = "1-to-10",
        dist_type: Literal["uniform", "bimodal"] = "uniform",
        beta_dist_alpha: float = 0.1,
        beta_dist_beta: float = 0.1,
        *args,
        **kwargs,
    ):
        self.game_type = game_type
        self.dist_type = dist_type
        self.beta_dist_alpha = beta_dist_alpha
        self.beta_dist_beta = beta_dist_beta
        super().__init__(*args, **kwargs)

    def _sample_values(self) -> Dict[AgentId, dict]:
        """Sample a private coin value per agent, resampling until they differ."""
        values = {}
        while True:
            if self.game_type == "10-1-exclusive":
                # Exactly one agent values coins at 10, the other at 1.
                drawn = int(self.rng.choice([1, 10]))
                values[self.agent_ids[0]] = drawn
                values[self.agent_ids[1]] = 1 if drawn == 10 else 10
            elif self.game_type == "1-to-10":
                for aid in self.agent_ids:
                    if self.dist_type == "uniform":
                        values[aid] = int(self.rng.integers(1, 11))
                    elif self.dist_type == "bimodal":
                        alpha, beta = self.beta_dist_alpha, self.beta_dist_beta
                        # Beta(alpha, beta) pushed onto the integer range [1, 10].
                        values[aid] = int(round(self.rng.beta(alpha, beta) * 9) + 1)
            # Reject draws where both agents share the same value.
            if len(set(values.values())) != 1:
                break
        return values

    def _sample_quantities(self) -> Dict[str, int]:
        """There is always a single pot of 10 coins."""
        return {"coins": 10}

    def set_new_round_of_variant(self):
        """Re-draw quantities and values and leave the split phase."""
        self.state.quantities = self._sample_quantities()
        self.state.values = self._sample_values()
        self.state.split_phase = False

    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        """Deep-copied snapshot of the variant-specific state for logging."""
        return {
            "quantities": copy.deepcopy(state.quantities),
            "values": copy.deepcopy(state.values),
            "splits": copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Score the submitted splits with the shared TAS reward rule."""
        return compute_tas_style_rewards(
            self.agent_ids, self.state.values, splits, self.state.quantities
        )

    def get_obs(self):
        """Observations for every agent, keyed by agent id."""
        return {aid: self.get_obs_agent(aid) for aid in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """Build the (partially observable) view for ``agent_id``."""
        other_id = self._other(agent_id)
        prev_values = self.state.previous_values
        prev_points = self.state.previous_points
        prev_splits = self.state.previous_splits

        last_value_agent = None if prev_values is None else prev_values.get(agent_id)
        last_value_coagent = None if prev_values is None else prev_values.get(other_id)
        last_points_agent = (
            None if prev_points is None else round(prev_points.get(agent_id), 1)
        )
        last_points_coagent = (
            None if prev_points is None else round(prev_points.get(other_id), 1)
        )

        last_split_agent = None
        last_split_coagent = None
        if prev_splits is not None:
            last_split_agent = prev_splits[agent_id].items_given_to_self["coins"]
            last_split_coagent = prev_splits[other_id].items_given_to_self["coins"]

        # The co-agent's previous value is only revealed qualitatively.
        if last_value_agent is None or last_value_coagent is None:
            last_value_str_coagent = None
        elif last_value_coagent > last_value_agent:
            last_value_str_coagent = "higher"
        elif last_value_coagent < last_value_agent:
            last_value_str_coagent = "lower"
        else:
            # _sample_values rejects ties, so equal values indicate a bug.
            raise ValueError("Should not be equal values")

        return TrustAndSplitSimpleObs(
            round_nb=self.state.round_nb,
            last_message=self.state.last_message,
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            current_agent=self.state.current_agent,
            other_agent=self.agent_id_to_name[other_id],
            quantities=self.state.quantities,
            item_types=self.item_types,
            value=self.state.values[agent_id],
            split_phase=self.state.split_phase,
            last_split_agent=last_split_agent,
            last_value_agent=last_value_agent,
            last_points_agent=last_points_agent,
            last_split_coagent=last_split_coagent,
            last_value_coagent=last_value_coagent,
            last_points_coagent=last_points_coagent,
            last_quantities=self.state.previous_quantities,
            last_value_str_coagent=last_value_str_coagent,
        )

    def reset(self):
        """Start a fresh game and return the initial observations."""
        self.state = TrustAndSplitSimpleState(
            round_nb=0,
            last_message="",
            current_agent=self.agent_ids[self._starting_agent_index],
            quantities=self._sample_quantities(),
            values=self._sample_values(),
            previous_values=None,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            split_phase=False,
            previous_splits=None,
            previous_points=None,
            previous_quantities=None,
        )
        return self.get_obs()
src_code_for_reproducibility/markov_games/negotiation/tas_simulation.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from collections import defaultdict
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict, List, Literal
5
+
6
+ from numpy.random import default_rng
7
+
8
+ from mllm.markov_games.negotiation.nego_simulation import (
9
+ NegotiationObs,
10
+ NegotiationSimulation,
11
+ NegotiationState,
12
+ Split,
13
+ compute_tas_style_rewards,
14
+ )
15
+
16
+ AgentId = str
17
+
18
+
19
@dataclass
class TrustAndSplitState(NegotiationState):
    """State of the multi-item Trust-and-Split game; no fields beyond the base."""
22
+
23
+
24
@dataclass
class TrustAndSplitObs(NegotiationObs):
    """Observation of the multi-item Trust-and-Split game; identical to the base."""
27
+
28
+
29
class TrustAndSplitSimulation(NegotiationSimulation):
    """Multi-item Trust-and-Split game (10 units of each item type).

    Values are sampled per agent and per item according to ``game_type``.
    Optional rejection constraints enforce an equal total value across agents
    (``same_round_value``) and/or at least one item the two agents value
    differently (``atleast_one_conflict``).
    """

    def __init__(
        self,
        game_type: Literal["10-1-exclusive", "10-1-ties", "1-to-20"] = "1-to-20",
        same_round_value: bool = True,
        atleast_one_conflict: bool = False,
        *args,
        **kwargs,
    ):
        self.game_type = game_type
        self.same_round_value = same_round_value
        self.atleast_one_conflict = atleast_one_conflict
        super().__init__(*args, **kwargs)

    def _sample_values(self) -> Dict[AgentId, dict]:
        """Sample per-agent, per-item values with rejection sampling."""
        values = defaultdict(dict)
        # Before the first reset there is no state yet; fall back to the
        # configured item types.
        if self.state is None:
            item_types = self.item_types
        else:
            item_types = list(self.state.quantities.keys())
        while True:
            for item in item_types:
                if self.game_type == "10-1-exclusive":
                    drawn = int(self.rng.choice([1, 10]))
                    values[self.agent_ids[0]][item] = drawn
                    values[self.agent_ids[1]][item] = 1 if drawn == 10 else 10
                elif self.game_type == "10-1-ties":
                    for aid in self.agent_ids:
                        values[aid][item] = int(self.rng.choice([1, 10]))
                elif self.game_type == "1-to-20":
                    for aid in self.agent_ids:
                        values[aid][item] = int(self.rng.integers(1, 21))
            totals = [sum(per_item.values()) for per_item in values.values()]
            if self.atleast_one_conflict:
                # Require at least one item the two agents value differently.
                has_conflict = any(
                    len({values[aid][item] for aid in self.agent_ids}) > 1
                    for item in item_types
                )
                if not has_conflict:
                    continue
            if len(set(totals)) == 1 or not self.same_round_value:
                break
        return values

    def _sample_quantities(self) -> Dict[str, int]:
        """Ten units of every item type, keyed by lower-cased item name."""
        return {item.lower(): 10 for item in self.item_types}

    def set_new_round_of_variant(self):
        """Re-draw quantities and values and leave the split phase."""
        self.state.quantities = self._sample_quantities()
        self.state.values = self._sample_values()
        self.state.split_phase = False

    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        """Deep-copied snapshot of the variant-specific state for logging."""
        return {
            "quantities": copy.deepcopy(state.quantities),
            "values": copy.deepcopy(state.values),
            "splits": copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Score the submitted splits with the shared TAS reward rule."""
        return compute_tas_style_rewards(
            self.agent_ids, self.state.values, splits, self.state.quantities
        )

    def get_obs(self):
        """Observations for every agent, keyed by agent id."""
        return {aid: self.get_obs_agent(aid) for aid in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """Build the (partially observable) view for ``agent_id``."""
        other_id = self._other(agent_id)
        prev_values = self.state.previous_values
        prev_points = self.state.previous_points
        prev_splits = self.state.previous_splits

        last_value_agent = None if prev_values is None else prev_values.get(agent_id)
        last_value_coagent = None if prev_values is None else prev_values.get(other_id)
        last_points_agent = (
            None if prev_points is None else round(prev_points.get(agent_id), 1)
        )
        last_points_coagent = (
            None if prev_points is None else round(prev_points.get(other_id), 1)
        )

        last_split_agent = None
        last_split_coagent = None
        if prev_splits is not None:
            last_split_agent = prev_splits[agent_id].items_given_to_self
            last_split_coagent = prev_splits[other_id].items_given_to_self

        return TrustAndSplitObs(
            round_nb=self.state.round_nb,
            last_message=self.state.last_message,
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            current_agent=self.state.current_agent,
            other_agent=self.agent_id_to_name[other_id],
            quantities=self.state.quantities,
            item_types=self.item_types,
            value=self.state.values[agent_id],
            split_phase=self.state.split_phase,
            last_split_agent=last_split_agent,
            last_value_agent=last_value_agent,
            last_points_agent=last_points_agent,
            last_split_coagent=last_split_coagent,
            last_value_coagent=last_value_coagent,
            last_points_coagent=last_points_coagent,
            last_quantities=self.state.previous_quantities,
        )

    def reset(self):
        """Start a fresh game and return the initial observations."""
        start_agent = self.agent_ids[self._starting_agent_index]
        quantities = self._sample_quantities()
        values = self._sample_values()
        self.state = TrustAndSplitState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities=quantities,
            values=values,
            previous_values=None,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            split_phase=False,
            previous_splits=None,
            previous_points=None,
            previous_quantities=None,
        )
        return self.get_obs()
src_code_for_reproducibility/markov_games/rollout_tree.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ TODO: add parent to nodes so that some verification can be done. For instance, to ensure that node reward keys match the parent node.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ import json
8
+ from dataclasses import dataclass
9
+ from pathlib import Path
10
+ from typing import Any, List, Literal, Optional, Tuple
11
+
12
+ import jsonschema
13
+ from pydantic import BaseModel, Field, model_validator
14
+
15
+ from mllm.chat_utils.chat_turn import ChatTurn
16
+
17
+ AgentId = str
18
+
19
+
20
class SimulationStepLog(BaseModel):
    """Per-step environment log: rewards keyed by agent plus free-form info."""

    rewards: dict[AgentId, float]
    info: Any = None
23
+
24
+
25
class AgentActLog(BaseModel):
    """Log of a single agent action: the chat turns it produced plus info."""

    chat_turns: list[ChatTurn] | None
    info: Any = None

    @model_validator(mode="after")
    def _exactly_one_state_end(self):
        """Enforce that a non-empty chat-turn list marks exactly one state end.

        Bug fix: the previous check was ``self.chat_turns != []``, which is
        True when ``chat_turns`` is None (the field allows None) and then
        crashed with a TypeError when iterating. A falsy check covers both
        None and the empty list.
        """
        if not self.chat_turns:
            return self
        n_state_ends = sum(1 for turn in self.chat_turns if turn.is_state_end)
        if n_state_ends != 1:
            raise ValueError(
                f"AgentActLog must have exactly one ChatTurn with is_state_end=True; got {self.chat_turns}."
            )
        return self
43
+
44
+
45
class StepLog(BaseModel):
    """Everything logged for one time step: per-agent action logs + env log."""

    action_logs: dict[AgentId, AgentActLog]
    simulation_step_log: SimulationStepLog
48
+
49
+
50
+ # BranchType = Literal["unilateral_deviation", "common_deviation"] # might not be necessary
51
+ # class BranchNodeInfo(BaseModel):
52
+ # branch_id: str
53
+ # branch_for: AgentId
54
+ # branch_type: BranchType
55
+
56
+
57
class RolloutTreeNode(BaseModel):
    """Linear node of the rollout tree; ``child`` continues the trajectory."""

    step_log: StepLog
    time_step: int
    child: RolloutTreeNode | RolloutTreeBranchNode | None = None
61
+
62
+
63
class RolloutTreeBranchNode(BaseModel):
    """
    Branch point in a rollout tree: the main trajectory continues through
    ``main_child``; ``branches`` maps the agent that "called" for an
    alternative branch to its alternative continuations.
    """

    main_child: RolloutTreeNode
    branches: dict[AgentId, list[RolloutTreeNode]] | None = None
70
+
71
+
72
class RolloutTreeRootNode(BaseModel):
    """Root of a rollout tree, identifying the rollout and its participants."""

    id: int
    # ID of the rng used to generate this rollout tree.
    crn_id: int
    child: RolloutTreeNode | RolloutTreeBranchNode | None = None
    agent_ids: List[AgentId] = Field(min_length=1)
77
+
78
+
79
+ # class RolloutTreeLeafNode(BaseModel):
80
+ # step_log: StepLog
81
+ # time_step: int
82
+
83
+
84
+ # Necessary for self-referential stuff in pydantic
85
+ RolloutTreeBranchNode.model_rebuild()
86
+ RolloutTreeNode.model_rebuild()
src_code_for_reproducibility/markov_games/run_markov_games.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ from collections.abc import Callable
3
+ from dataclasses import dataclass
4
+
5
+ from torch._C import ClassType
6
+
7
+ from mllm.markov_games.markov_game import MarkovGame
8
+ from mllm.markov_games.rollout_tree import RolloutTreeRootNode
9
+
10
+
11
+ async def run_markov_games(
12
+ runner: Callable[[MarkovGame], RolloutTreeRootNode],
13
+ runner_kwargs: dict,
14
+ output_folder: str,
15
+ markov_games: list[MarkovGame],
16
+ ) -> list[RolloutTreeRootNode]:
17
+ tasks = []
18
+ for mg in markov_games:
19
+ tasks.append(
20
+ asyncio.create_task(
21
+ runner(markov_game=mg, output_folder=output_folder, **runner_kwargs)
22
+ )
23
+ )
24
+ return await asyncio.gather(*tasks)
src_code_for_reproducibility/markov_games/simulation.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ A Simulation is the environment of a Markov Game.
3
+ The Simulation is not responsible for properly checking / formatting the responses of LLM's.
4
+ This is the job of the `Agent` class.
5
+ Simulations expect clean actions, and are defined similarly to `gymnasium` environments, except that they are adapted for the Multi-agent setting.
6
+ """
7
+
8
+ from abc import ABC, abstractmethod
9
+ from typing import Any, Tuple
10
+
11
+ from numpy.random import default_rng
12
+
13
+ from mllm.markov_games.rollout_tree import SimulationStepLog
14
+
15
+
16
class Simulation(ABC):
    """Abstract multi-agent environment; see the module docstring.

    Concrete simulations must implement ``__init__`` and ``step``; the
    remaining hooks are optional and raise NotImplementedError by default.
    """

    @abstractmethod
    def __init__(self, seed: int, *args, **kwargs):
        # Every simulation owns a seeded generator for reproducibility.
        self.seed = seed
        self.rng = default_rng(self.seed)

    @abstractmethod
    def step(self, actions: Any) -> Tuple[bool, SimulationStepLog]:
        """Advance one step. Returns (terminated, step log)."""
        raise NotImplementedError

    def get_obs(self):
        """Return all agent observations in a dict."""
        raise NotImplementedError

    def get_obs_agent(self, agent_id):
        """Return the observation for ``agent_id``."""
        raise NotImplementedError

    def get_obs_size(self):
        """Return the shape of the observation."""
        raise NotImplementedError

    def get_state(self):
        """Return the global state."""
        raise NotImplementedError

    def get_state_size(self):
        """Return the shape of the state."""
        raise NotImplementedError

    def get_avail_actions(self):
        """Return the available actions for all agents."""
        raise NotImplementedError

    def get_avail_agent_actions(self, agent_id):
        """Return the available actions for ``agent_id``."""
        raise NotImplementedError

    def get_total_actions(self):
        """Return the total number of actions an agent could ever take.

        NOTE: only suitable for a discrete one-dimensional action space
        per agent.
        """
        raise NotImplementedError

    def get_safe_copy(self):
        """Return a copy of this object decorrelated from the original."""
        raise NotImplementedError

    def reset(self):
        """Return initial observations and states."""
        raise NotImplementedError

    def render(self):
        raise NotImplementedError

    def close(self):
        raise NotImplementedError

    def save_replay(self):
        raise NotImplementedError

    def get_simulation_info(self):
        raise NotImplementedError
src_code_for_reproducibility/markov_games/statistics_runner.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import gc
4
+ import json
5
+ import pickle
6
+ from dataclasses import dataclass
7
+ from pathlib import Path
8
+ from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional
9
+
10
+ from basic_render import find_iteration_folders
11
+
12
+ from mllm.markov_games.rollout_tree import (
13
+ RolloutTreeBranchNode,
14
+ RolloutTreeNode,
15
+ RolloutTreeRootNode,
16
+ SimulationStepLog,
17
+ )
18
+
19
+
20
def _iterate_main_nodes(root: RolloutTreeRootNode) -> Iterator[RolloutTreeNode]:
    """Lazily walk the main-trajectory nodes without materializing a list."""
    node = root.child
    while node is not None:
        if isinstance(node, RolloutTreeBranchNode):
            # At a branch point, stay on the main trajectory.
            node = node.main_child
        elif isinstance(node, RolloutTreeNode):
            yield node
            node = node.child
        else:
            break
34
+
35
+
36
def iterate_main_simulation_logs(
    root: RolloutTreeRootNode,
) -> Iterator[SimulationStepLog]:
    """Yield the simulation step log of every node on the main trajectory."""
    return (node.step_log.simulation_step_log for node in _iterate_main_nodes(root))
41
+
42
+
43
def stream_rollout_files(iteration_folder: Path) -> Iterator[Path]:
    """Yield every ``*.rt.pkl`` regular file under ``iteration_folder``, recursively."""
    yield from (p for p in iteration_folder.rglob("*.rt.pkl") if p.is_file())
47
+
48
+
49
def load_root(path: Path) -> RolloutTreeRootNode:
    """Unpickle a rollout file and validate it into a ``RolloutTreeRootNode``."""
    # NOTE: pickle is only safe on trusted, locally produced rollout files.
    with open(path, "rb") as fh:
        payload = pickle.load(fh)
    return RolloutTreeRootNode.model_validate(payload)
53
+
54
+
55
@dataclass
class StatRecord:
    """One aggregated statistics record for a single rollout."""

    mgid: int  # Markov-game / rollout id
    crn_id: Optional[int]  # id of the common-random-numbers rng, if any
    iteration: str  # iteration folder name
    values: Dict[str, Any]  # metric name -> aggregated value
61
+
62
+
63
class StatComputer:
    """
    Stateful stat computer: consume SimulationStepLog instances via
    ``update`` and produce the final aggregated values for one rollout
    (mgid) via ``finalize``. Subclasses must override both methods.
    """

    def update(self, sl: SimulationStepLog) -> None:  # pragma: no cover - interface
        raise NotImplementedError

    def finalize(self) -> Dict[str, Any]:  # pragma: no cover - interface
        raise NotImplementedError
74
+
75
+
76
def run_stats(
    data_root: Path,
    game_name: str,
    make_computers: Callable[[], List[StatComputer]],
    output_filename: Optional[str] = None,
    output_format: str = "json",  # "json" (dict of lists) or "jsonl"
) -> Path:
    """
    Compute stats across all iteration_* folders under data_root.
    Writes to data_root/statistics/<output_filename or f"{game_name}.stats.json[l]">.

    Fix: the "jsonl" and "json" branches duplicated the whole per-rollout
    aggregation loop, which risked the two code paths drifting apart; the
    shared work is now factored into local helpers. Behavior is unchanged.

    Parameters
    ----------
    data_root: root folder containing iteration_* subfolders of *.rt.pkl files.
    game_name: used to derive the default output file name.
    make_computers: factory returning fresh StatComputer instances per rollout.
    output_filename: explicit file name inside data_root/statistics (optional).
    output_format: "jsonl" streams one record per line; anything else writes a
        single dict-of-lists JSON document (plot friendly).

    Returns the path of the written stats file.
    """
    data_root = Path(data_root)
    outdir = data_root / "statistics"
    outdir.mkdir(parents=True, exist_ok=True)
    # Choose extension by format
    default_name = (
        f"{game_name}.stats.json"
        if output_format == "json"
        else f"{game_name}.stats.jsonl"
    )
    outfile = outdir / (
        output_filename if output_filename is not None else default_name
    )

    # Rewrite file each run to keep it clean and small
    if outfile.exists():
        outfile.unlink()

    iteration_folders = find_iteration_folders(str(data_root))

    def compute_record(root: RolloutTreeRootNode, iteration_name: str) -> Dict[str, Any]:
        """Run all computers over one rollout's main trajectory, best-effort."""
        computers = make_computers()
        for sl in iterate_main_simulation_logs(root):
            for comp in computers:
                try:
                    comp.update(sl)
                except Exception:
                    # One failing computer must not poison the others.
                    continue
        values: Dict[str, Any] = {}
        for comp in computers:
            try:
                values.update(comp.finalize())
            except Exception:
                continue
        return {
            "mgid": getattr(root, "id", None),
            "crn_id": getattr(root, "crn_id", None),
            "iteration": iteration_name,
            "stats": values,
        }

    def iter_records() -> Iterator[Dict[str, Any]]:
        """Stream one record per rollout file, releasing memory eagerly."""
        for iteration_folder in iteration_folders:
            iteration_name = Path(iteration_folder).name
            for pkl_path in stream_rollout_files(Path(iteration_folder)):
                root = load_root(pkl_path)
                rec = compute_record(root, iteration_name)
                del root
                gc.collect()
                yield rec

    if output_format == "jsonl":
        with open(outfile, "w", encoding="utf-8") as w:
            for rec in iter_records():
                w.write(json.dumps(rec, ensure_ascii=False) + "\n")
    else:
        # Aggregate to dict-of-lists for easier plotting.
        records: List[Dict[str, Any]] = list(iter_records())

        mgids: List[Any] = [r.get("mgid") for r in records]
        crn_ids: List[Any] = [r.get("crn_id") for r in records]
        iterations_out: List[str] = [r.get("iteration") for r in records]

        # First pass: union of stat keys; dict-valued stats get nested
        # per-agent key sets.
        stat_keys: set[str] = set()
        nested_agent_keys: Dict[str, set[str]] = {}
        for r in records:
            for k, v in (r.get("stats", {}) or {}).items():
                stat_keys.add(k)
                if isinstance(v, dict):
                    nested_agent_keys.setdefault(k, set()).update(
                        str(ak) for ak in v.keys()
                    )

        # stats_out mirrors the stat keys but holds aligned lists.
        stats_out: Dict[str, Any] = {
            k: (
                {ak: [] for ak in sorted(nested_agent_keys[k])}
                if k in nested_agent_keys
                else []
            )
            for k in stat_keys
        }

        # Second pass: fill the aligned lists (None where a record lacks a key).
        for r in records:
            stats = r.get("stats", {}) or {}
            for k in stat_keys:
                val = stats.get(k)
                if isinstance(stats_out[k], dict):
                    agent_dict = val if isinstance(val, dict) else {}
                    for ak in stats_out[k].keys():
                        stats_out[k][ak].append(agent_dict.get(ak))
                else:
                    stats_out[k].append(val)

        with open(outfile, "w", encoding="utf-8") as w:
            json.dump(
                {
                    "mgid": mgids,
                    "crn_id": crn_ids,
                    "iteration": iterations_out,
                    "stats": stats_out,
                },
                w,
                ensure_ascii=False,
            )

    return outfile
233
+
234
+
235
def run_stats_functional(
    data_root: Path,
    game_name: str,
    metrics: Dict[str, Callable[[SimulationStepLog], Optional[Dict[str, float]]]],
    output_filename: Optional[str] = None,
    output_format: str = "json",
) -> Path:
    """
    Functional variant where metrics is a dict of name -> f(SimulationStepLog) -> {agent_id: value}.
    Aggregates per rollout by averaging over steps where a metric produced a value.
    Writes a single consolidated file in data_root/statistics/.

    Fix: the "jsonl" and "json" branches duplicated the per-rollout metric
    aggregation loop, risking divergence between the two code paths; the
    shared work is now factored into local helpers. Behavior is unchanged
    (an agent a metric never fired for is recorded as None).
    """
    data_root = Path(data_root)
    outdir = data_root / "statistics"
    outdir.mkdir(parents=True, exist_ok=True)
    default_name = (
        f"{game_name}.stats.json"
        if output_format == "json"
        else f"{game_name}.stats.jsonl"
    )
    outfile = outdir / (
        output_filename if output_filename is not None else default_name
    )

    if outfile.exists():
        outfile.unlink()

    iteration_folders = find_iteration_folders(str(data_root))

    def aggregate_rollout(root: RolloutTreeRootNode) -> Dict[str, Dict[str, Any]]:
        """Average every metric per agent over the rollout's main trajectory."""
        # aggregator structure: metric -> agent_id -> list of values
        agg: Dict[str, Dict[str, List[float]]] = {m: {} for m in metrics.keys()}
        for sl in iterate_main_simulation_logs(root):
            for mname, fn in metrics.items():
                try:
                    vals = fn(sl)
                except Exception:
                    vals = None
                if not vals:
                    continue
                for aid, v in vals.items():
                    if v is None:
                        continue
                    bucket = agg[mname].setdefault(str(aid), [])
                    try:
                        bucket.append(float(v))
                    except Exception:
                        continue
        # Average per metric per agent; None keeps alignment when a metric
        # never produced a value for that agent.
        return {
            mname: {
                aid: (sum(vals) / len(vals) if vals else None)
                for aid, vals in agent_values.items()
            }
            for mname, agent_values in agg.items()
        }

    def iter_records() -> Iterator[Dict[str, Any]]:
        """Stream one aggregated record per rollout file."""
        for iteration_folder in iteration_folders:
            iteration_name = Path(iteration_folder).name
            for pkl_path in stream_rollout_files(Path(iteration_folder)):
                root = load_root(pkl_path)
                rec = {
                    "mgid": getattr(root, "id", None),
                    "crn_id": getattr(root, "crn_id", None),
                    "iteration": iteration_name,
                    "stats": aggregate_rollout(root),
                }
                del root
                gc.collect()
                yield rec

    if output_format == "jsonl":
        with open(outfile, "w", encoding="utf-8") as w:
            for rec in iter_records():
                w.write(json.dumps(rec, ensure_ascii=False) + "\n")
    else:
        records: List[Dict[str, Any]] = list(iter_records())

        # Build dict-of-lists output
        mgids: List[Any] = [r.get("mgid") for r in records]
        crn_ids: List[Any] = [r.get("crn_id") for r in records]
        iterations_out: List[str] = [r.get("iteration") for r in records]

        stat_keys: set[str] = set()
        nested_agent_keys: Dict[str, set[str]] = {}
        for r in records:
            for k, v in (r.get("stats", {}) or {}).items():
                stat_keys.add(k)
                if isinstance(v, dict):
                    nested_agent_keys.setdefault(k, set()).update(
                        str(ak) for ak in v.keys()
                    )

        stats_out: Dict[str, Any] = {
            k: (
                {ak: [] for ak in sorted(nested_agent_keys[k])}
                if k in nested_agent_keys
                else []
            )
            for k in stat_keys
        }

        for r in records:
            stats = r.get("stats", {}) or {}
            for k in stat_keys:
                val = stats.get(k)
                if isinstance(stats_out[k], dict):
                    agent_dict = val if isinstance(val, dict) else {}
                    for ak in stats_out[k].keys():
                        stats_out[k][ak].append(agent_dict.get(ak))
                else:
                    stats_out[k].append(val)

        with open(outfile, "w", encoding="utf-8") as w:
            json.dump(
                {
                    "mgid": mgids,
                    "crn_id": crn_ids,
                    "iteration": iterations_out,
                    "stats": stats_out,
                },
                w,
                ensure_ascii=False,
            )

    return outfile
src_code_for_reproducibility/markov_games/vine_ppo.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from anytree import Node, RenderTree
2
+ from anytree.exporter import DotExporter
3
+ import os.path
4
+ import asyncio
5
+ from mllm.markov_games.markov_game import MarkovGame
6
+
7
async def VinePPORunner(markov_game: MarkovGame, **kwargs):
    """Placeholder runner for Vine-PPO style rollouts; not implemented yet."""
    pass
src_code_for_reproducibility/models/__init__.py ADDED
File without changes
src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (153 Bytes). View file
 
src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc ADDED
Binary file (2.24 kB). View file
 
src_code_for_reproducibility/models/__pycache__/inference_backend_dummy.cpython-312.pyc ADDED
Binary file (2.34 kB). View file
 
src_code_for_reproducibility/models/adapter_training_wrapper.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import logging
4
+ from typing import Union
5
+ from peft import (
6
+ LoraConfig,
7
+ get_peft_model,
8
+ )
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
class AdapterWrapper(nn.Module):
    """
    A thin façade that
    • keeps a reference to a *shared* PEFT-wrapped model,
    • ensures `set_adapter(adapter)` is called on every forward,
    • exposes only the parameters that should be trained for that adapter
      (plus whatever extra modules you name).
    """
    def __init__(
        self,
        shared_llm: nn.Module,
        adapter_id: str,
        lora_config: dict,
        path: Union[str, None] = None,
    ):
        """
        Wrap `shared_llm` with a LoRA adapter named `adapter_id`.

        Args:
            shared_llm: base model shared across adapters; modified in place
                by `get_peft_model`, which adds the LoRA adapter inside it.
            adapter_id: name under which the LoRA adapter is registered and
                later selected via `set_adapter`.
            lora_config: keyword arguments forwarded to `peft.LoraConfig`.
            path: optional local filesystem path or HF Hub repo ID holding
                initial adapter weights. Loading is best-effort: failures are
                logged as warnings and training starts from fresh weights.
        """
        super().__init__()
        self.adapter_id = adapter_id
        # Build the PEFT config from the raw dict under a distinct name so
        # the parameter is not shadowed by its own converted form.
        peft_config = LoraConfig(**lora_config)
        # get_peft_model modifies the shared llm in place, adding a lora
        # adapter inside; keep only the PEFT-wrapped handle (assigning the
        # raw model first would register a soon-discarded submodule).
        self.shared_llm = get_peft_model(
            model=shared_llm,
            peft_config=peft_config,
            adapter_name=adapter_id,
        )
        self.shared_llm.train()
        # Load external adapter weights if provided
        loaded_from: Union[str, None] = None
        if path:
            try:
                # Supports both local filesystem paths and HF Hub repo IDs
                self.shared_llm.load_adapter(
                    is_trainable=True,
                    model_id=path,
                    adapter_name=adapter_id,
                )
                loaded_from = path
            except Exception as exc:  # noqa: BLE001 - want to log any load failure context
                logger.warning(
                    f"Adapter '{adapter_id}': failed to load from '{path}': {exc}"
                )

        if loaded_from:
            logger.info(
                f"Adapter '{adapter_id}': loaded initial weights from '{loaded_from}'."
            )
        else:
            logger.info(
                f"Adapter '{adapter_id}': initialized with fresh weights (no initial weights found)."
            )

    def parameters(self, recurse: bool = True):
        """
        Return the trainable parameters of this wrapper's adapter.

        Activates this adapter on the shared model first, then returns only
        the parameters PEFT marks trainable. "recurse" is just for pytorch
        compatibility and is ignored.
        """
        self.shared_llm.set_adapter(self.adapter_id)
        params = [p for p in self.shared_llm.parameters() if p.requires_grad]

        return params

    def get_base_model_logits(self, contexts):
        """
        Run the base model (without adapter) in inference mode, without tracking gradients.
        This is useful to get reference logits for KL-divergence computation.
        """
        with torch.no_grad():
            with self.shared_llm.disable_adapter():
                # Index [0] selects the logits from the model output tuple.
                return self.shared_llm(input_ids=contexts)[0]

    def forward(self, *args, **kwargs):
        """Activate this wrapper's adapter, then forward to the shared model."""
        self.shared_llm.set_adapter(self.adapter_id)
        return self.shared_llm(*args, **kwargs)

    def save_pretrained(self, save_path):
        """Save adapter weights via the underlying PEFT model."""
        self.shared_llm.save_pretrained(save_path)

    def gradient_checkpointing_enable(self, *args, **kwargs):
        """Delegate gradient checkpointing enablement to the shared model."""
        self.shared_llm.gradient_checkpointing_enable(*args, **kwargs)

    @property
    def dtype(self):
        # Mirrors the shared model's dtype (HF models expose `.dtype`).
        return self.shared_llm.dtype

    @property
    def device(self):
        # Mirrors the shared model's device (HF models expose `.device`).
        return self.shared_llm.device