Add files using upload-large-folder tool
Browse files- .gitattributes +1 -0
- run.log +3 -0
- seed_1/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors +3 -0
- seed_1/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors +3 -0
- seed_1/Qwen/Qwen2.5-7B-Instruct/adapters/fixed_ad_align_adapter/adapter_model.safetensors +3 -0
- seed_1/agent_trainer/policy_optimizer_state.pt +3 -0
- seed_1/agent_trainer/trainer_annealing_state.pkl +3 -0
- seed_1/random_state.pkl +3 -0
- src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc +0 -0
- src_code_for_reproducibility/markov_games/negotiation/__pycache__/negotiation_statistics.cpython-312.pyc +0 -0
- src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py +176 -0
- src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py +249 -0
- src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py +128 -0
- src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc +0 -0
- src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc +0 -0
- src_code_for_reproducibility/training/__pycache__/tokenize_chats.cpython-312.pyc +0 -0
- src_code_for_reproducibility/training/__pycache__/trainer_sum_rewards.cpython-312.pyc +0 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
run.log filter=lfs diff=lfs merge=lfs -text
|
run.log
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c50803dd26fc6b9db435c1427981e4017e7e156753c5cf5e60710cf5afdd64d
|
| 3 |
+
size 10900718
|
seed_1/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ab958fd2facd005cae5571b0df06ed3be786697f4c31d6435ffcbc655d2920b
|
| 3 |
+
size 323014168
|
seed_1/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4c44c3464099d92dfebb2b132524339800fbf19760b378a02c3c527ac3380b88
|
| 3 |
+
size 323014168
|
seed_1/Qwen/Qwen2.5-7B-Instruct/adapters/fixed_ad_align_adapter/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf6bb8f8d702f23ed3c0797660ebbc16bdee9cbac5c984ffbad4a1dc3ba2215c
|
| 3 |
+
size 323014168
|
seed_1/agent_trainer/policy_optimizer_state.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1dca3a51476df532d5a63aa1f269f3467830fee1573883a3cb10d0857ddd4111
|
| 3 |
+
size 646269121
|
seed_1/agent_trainer/trainer_annealing_state.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:17f3ead2dac3c925aeb1b3176d071b434c765c0606d4e707e423de4498633e52
|
| 3 |
+
size 104
|
seed_1/random_state.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06d527c81a0ed8e596458de353799680ab01076dfc3d43cd3b1a2ebea4439ac5
|
| 3 |
+
size 12218
|
src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc
ADDED
|
Binary file (5.42 kB). View file
|
|
|
src_code_for_reproducibility/markov_games/negotiation/__pycache__/negotiation_statistics.cpython-312.pyc
ADDED
|
Binary file (14.2 kB). View file
|
|
|
src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/negotiation/dond_simulation.py
|
| 3 |
+
Summary: Simulates Deal-or-No-Deal negotiation games and logs rollouts.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import copy
|
| 7 |
+
from dataclasses import dataclass
|
| 8 |
+
from typing import Any, Dict, List, Tuple
|
| 9 |
+
|
| 10 |
+
from numpy.random import default_rng
|
| 11 |
+
|
| 12 |
+
from mllm.markov_games.negotiation.nego_simulation import (
|
| 13 |
+
NegotiationObs,
|
| 14 |
+
NegotiationSimulation,
|
| 15 |
+
NegotiationState,
|
| 16 |
+
Split,
|
| 17 |
+
)
|
| 18 |
+
from mllm.markov_games.rollout_tree import SimulationStepLog
|
| 19 |
+
from mllm.utils.get_coagent_id import get_coagent_id
|
| 20 |
+
|
| 21 |
+
AgentId = str
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@dataclass
class DealNoDealState(NegotiationState):
    """NegotiationState with per-agent value tables and item taxonomy."""

    # Names of the item categories being negotiated over.
    item_types: List[str]
    # Valuation table: agent id -> item type -> integer value of one unit.
    values: Dict[AgentId, Dict[str, int]]
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
@dataclass
class DealNoDealObs(NegotiationObs):
    """Observation that reveals own values and (lagged) opponent values."""

    # The observing agent's own valuations: item type -> value per unit.
    my_values: Dict[str, int]
    # Names of the item categories being negotiated over.
    item_types: List[str]
    # Co-agent valuations exposed to this agent.
    # NOTE(review): the name suggests previous-round values, but
    # DealNoDealSimulation.get_obs_agent fills this from the current state's
    # values -- confirm which is intended.
    previous_values_coagent: Dict[str, int] | None
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def random_partition_integer(rng, total: int, parts: int) -> List[int]:
    """Split ``total`` into ``parts`` random non-negative integer shares.

    Args:
        rng: Random generator exposing ``integers(low, high, size=...)``
            (e.g. a ``numpy.random.Generator``).
        total: The amount to distribute.
        parts: Number of buckets to distribute it over.

    Returns:
        A list of ``parts`` non-negative integers summing to ``total``.
        An empty list when ``parts <= 0``; all zeros when ``total <= 0``.
    """
    if parts <= 0:
        return []
    if total <= 0:
        return [0] * parts
    # Draw parts-1 cut points in [0, total]; consecutive gaps are the shares.
    boundaries = rng.integers(0, total + 1, size=parts - 1).tolist()
    boundaries.sort()
    boundaries.append(total)
    lower_edges = [0] + boundaries[:-1]
    return [upper - lower for lower, upper in zip(lower_edges, boundaries)]
|
| 54 |
+
|
| 55 |
+
|
| 56 |
+
class DealNoDealSimulation(NegotiationSimulation):
    """NegotiationSimulation variant implementing the Rubinstein-style Deal-or-No-Deal."""

    def __init__(
        self,
        item_types: List[str] | None = None,
        *args,
        **kwargs,
    ):
        """Build the simulation and immediately sample an initial state.

        Args:
            item_types: Item category names. Defaults to
                ``["books", "hats", "balls"]``; a fresh list is created per
                instance so instances never share a mutable default.
            *args: Forwarded to ``NegotiationSimulation.__init__``.
            **kwargs: Forwarded to ``NegotiationSimulation.__init__``.
        """
        if item_types is None:
            item_types = ["books", "hats", "balls"]
        super().__init__(*args, item_types=item_types, **kwargs)
        self.reset()

    def _other(self, agent_id: AgentId) -> AgentId:
        """Return the id of the co-agent of ``agent_id``."""
        return get_coagent_id(self.agent_ids, agent_id)

    def _sample_stock(self) -> Dict[str, int]:
        """Sample per-type item counts whose total is between 5 and 7."""
        # rng.integers has an exclusive upper bound, so this yields 5, 6 or 7.
        total_items = int(self.rng.integers(5, 8))
        # Non-negative per-type counts summing to total_items (zeros allowed).
        parts = random_partition_integer(self.rng, total_items, len(self.item_types))
        return {t: int(c) for t, c in zip(self.item_types, parts)}

    def _sample_values_pair(self) -> Dict[AgentId, Dict[str, int]]:
        """Rejection-sample a valuation table for the two agents.

        Each agent's non-negative integer values sum to 10. A draw is accepted
        only if every item type is valued by at least one agent and at least
        one item type is valued by both.
        """
        while True:
            vals_a = random_partition_integer(self.rng, 10, len(self.item_types))
            vals_b = random_partition_integer(self.rng, 10, len(self.item_types))
            a = {t: int(v) for t, v in zip(self.item_types, vals_a)}
            b = {t: int(v) for t, v in zip(self.item_types, vals_b)}
            # Each item valued by at least one agent.
            ok1 = all((a[t] > 0) or (b[t] > 0) for t in self.item_types)
            # Some item valued by both agents.
            ok2 = any((a[t] > 0) and (b[t] > 0) for t in self.item_types)
            if ok1 and ok2:
                return {self.agent_ids[0]: a, self.agent_ids[1]: b}

    def _is_valid_allocation(
        self, allocation: Dict[str, int], stock: Dict[str, int]
    ) -> bool:
        """Check that ``allocation`` gives an in-range int for every item type.

        Returns False if any item type is missing, is not an ``int``, is
        negative, or exceeds the available ``stock`` for that type.
        """
        for t in self.item_types:
            v = allocation.get(t)
            if v is None:
                return False
            if not isinstance(v, int):
                return False
            if v < 0 or v > int(stock.get(t, 0)):
                return False
        return True

    def set_new_round_of_variant(self):
        """Start a new round: keep the same values, resample the stock."""
        self.state.quantities = self._sample_stock()

    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        """Return deep copies of the quantities, values and splits in ``state``."""
        return {
            "quantities": copy.deepcopy(state.quantities),
            "values": copy.deepcopy(state.values),
            "splits": copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Return the reward of each agent for the submitted splits.

        An agent's reward is the sum over item types of (units it keeps) x
        (its per-unit value). If, for any item type, the two agents' claims
        do not add up exactly to the available quantity, both agents get 0.
        """
        split_a = splits[self.agent_ids[0]].items_given_to_self
        split_b = splits[self.agent_ids[1]].items_given_to_self
        rewards = {self.agent_ids[0]: 0, self.agent_ids[1]: 0}
        for t in self.item_types:
            # Claims must be exactly complementary, otherwise nobody is paid.
            if not split_a[t] + split_b[t] == self.state.quantities[t]:
                return {self.agent_ids[0]: 0, self.agent_ids[1]: 0}
            rewards[self.agent_ids[0]] += (
                split_a[t] * self.state.values[self.agent_ids[0]][t]
            )
            rewards[self.agent_ids[1]] += (
                split_b[t] * self.state.values[self.agent_ids[1]][t]
            )
        return rewards

    def get_obs(self):
        """Return a mapping agent id -> observation for every agent."""
        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """Build the ``DealNoDealObs`` seen by ``agent_id`` from the current state."""
        other_id = self._other(agent_id)
        obs = DealNoDealObs(
            round_nb=self.state.round_nb,
            last_message=self.state.last_message,
            current_agent=self.state.current_agent,
            quantities=copy.deepcopy(self.state.quantities),
            value=0.0,  # unused in DOND
            other_agent_split=None,  # not meaningful until split
            split_phase=self.state.split_phase,
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            my_values=copy.deepcopy(self.state.values[agent_id]),
            item_types=list(self.item_types),
            # NOTE(review): this exposes the co-agent's *current* values even
            # though the field is named "previous" -- confirm this is intended.
            previous_values_coagent=copy.deepcopy(self.state.values.get(other_id, {})),
        )
        return obs

    def reset(self):
        """Sample fresh stock and values, rebuild the state, return observations."""
        start_agent = self.agent_ids[self._starting_agent_index]
        stock = self._sample_stock()
        values = self._sample_values_pair()
        self.state = DealNoDealState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities=stock,
            values=values,
            previous_values=None,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            split_phase=False,
            item_types=list(self.item_types),
        )
        return self.get_obs()
|
src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py
ADDED
|
@@ -0,0 +1,249 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/negotiation/negotiation_statistics.py
|
| 3 |
+
Summary: Aggregates and reports statistics for negotiation experiments.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
from typing import Callable, Dict, List, Tuple
|
| 9 |
+
|
| 10 |
+
from mllm.markov_games.negotiation.nego_simulation import Split
|
| 11 |
+
from mllm.markov_games.rollout_tree import SimulationStepLog
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
def avg_reward(sl: SimulationStepLog) -> List[Tuple[str, float]] | None:
    """Report each agent's raw reward at the current simulation step.

    Rationale / motivation:
        Monitoring the reward stream at each step helps diagnose reward
        shaping issues and gives a baseline used alongside the higher-level
        summaries (efficiency, proposal metrics).

    Args:
        sl: Step log; only ``sl.rewards`` (agent id -> reward) is read.

    Returns:
        A list of ``("reward-<agent_id>", value)`` pairs, one per agent whose
        reward is not ``None`` (empty if there are no rewards). Returns
        ``None`` if any agent id contains "buffer" but not "live"; such steps
        are excluded so they do not pollute aggregates.
    """
    rewards = sl.rewards or {}
    for aid in rewards:
        if "buffer" in str(aid) and "live" not in str(aid):
            return None
    # Drop missing rewards *before* converting: float(None) would raise.
    return [
        (f"reward-{aid}", float(value))
        for aid, value in rewards.items()
        if value is not None
    ]
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def split_efficiency(sl: SimulationStepLog) -> List[Tuple[str, float]] | None:
    """Final-round allocation efficiency relative to a greedy upper bound.

    What it computes (only on the last timestep of a negotiation round):
        - Uses ``info['values']`` and ``info['quantities']`` to form an upper
          bound on total reward: every unit of an item is credited at the
          highest value any agent assigns to it. ``values`` may map each
          agent to a per-item dict, or to a single scalar applied to all
          items.
        - Divides the realized sum of rewards at that timestep by this bound.

    Motivation:
        Efficiency distinguishes coordination failures (low efficiency) from
        distributional disputes (high efficiency, uneven splits).

    Returns:
        ``[("split_efficiency", achieved / upper_bound)]``, or ``None`` when
        the step is not the last of its round, ``values`` / ``quantities`` /
        rewards are missing, an agent id contains "buffer" but not "live",
        or the upper bound is not positive (the ratio would be undefined).

    Note:
        The bound is optimistic (not necessarily reachable under the game's
        protocol) but is simple and comparable across runs.
    """
    info = sl.info or {}
    if not info.get("is_last_timestep_in_round"):
        return None
    quantities = info.get("quantities") or {}
    values = info.get("values") or {}
    rewards = sl.rewards or {}
    if not values or not quantities or not rewards:
        return None
    for aid in rewards:
        if "buffer" in str(aid) and "live" not in str(aid):
            return None

    sample_values = next(iter(values.values()))
    if isinstance(sample_values, dict):
        # Per-item valuations: each item is priced at its best valuation.
        max_reward = sum(
            quantities[item]
            * max(float(agent_vals[item]) for agent_vals in values.values())
            for item in sample_values
        )
    else:
        # One scalar value per agent: the best value applies to every item.
        best_value = max(float(v) for v in values.values())
        max_reward = sum(q * best_value for q in quantities.values())

    if max_reward <= 0:
        return None
    achieved = sum(float(v) for v in rewards.values() if v is not None)
    return [("split_efficiency", achieved / max_reward)]
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def _extract_items_from_split(raw_split: Dict) -> Dict[str, float] | None:
|
| 100 |
+
"""Return a mapping item->proposal amount from a split structure.
|
| 101 |
+
|
| 102 |
+
Supports both generic negotiation splits with nested structure
|
| 103 |
+
{ 'items_given_to_self': {item: qty, ...}}
|
| 104 |
+
and TAS coin-only variants which may already be a flat mapping {'coins': qty}.
|
| 105 |
+
"""
|
| 106 |
+
|
| 107 |
+
if raw_split is None:
|
| 108 |
+
return {}
|
| 109 |
+
elif isinstance(raw_split, Split):
|
| 110 |
+
return {k: float(v) for k, v in raw_split.items_given_to_self.items()}
|
| 111 |
+
elif isinstance(raw_split, dict):
|
| 112 |
+
if "items_given_to_self" in raw_split and isinstance(
|
| 113 |
+
raw_split["items_given_to_self"], dict
|
| 114 |
+
):
|
| 115 |
+
return {k: float(v) for k, v in raw_split["items_given_to_self"].items()}
|
| 116 |
+
# Fallback: assume already flat mapping of items
|
| 117 |
+
elif hasattr(raw_split, "items_given_to_self"):
|
| 118 |
+
return {k: float(v) for k, v in raw_split["items_given_to_self"].items()}
|
| 119 |
+
return {
|
| 120 |
+
k: float(v) for k, v in raw_split.items() if isinstance(v, (int, float))
|
| 121 |
+
}
|
| 122 |
+
return {}
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
def _average_proposal_relative_value(
|
| 126 |
+
sl: SimulationStepLog,
|
| 127 |
+
metric_name: str,
|
| 128 |
+
comparator: Callable[[float, float], bool],
|
| 129 |
+
opposite_comparator: Callable[[float, float], bool],
|
| 130 |
+
) -> Dict[str, float | None] | None:
|
| 131 |
+
"""Shared implementation for proposal size conditioned on relative value.
|
| 132 |
+
|
| 133 |
+
Parameters:
|
| 134 |
+
comparator: returns True when agent_0's value relation (e.g. < or >)
|
| 135 |
+
to agent_1 holds for an item and we should collect agent_0's
|
| 136 |
+
proposed quantity for that item.
|
| 137 |
+
opposite_comparator: inverse relation used to collect agent_1's items.
|
| 138 |
+
|
| 139 |
+
Behavior:
|
| 140 |
+
- Executes only on final timestep of a round (where the definitive
|
| 141 |
+
proposal / allocation is known via ``info['splits']``).
|
| 142 |
+
- For each item, classifies which agent's value satisfies the chosen
|
| 143 |
+
relation and records that agent's proposed quantity from the split.
|
| 144 |
+
- Averages (mean) across all qualifying items per agent; if no items
|
| 145 |
+
qualify for an agent returns ``None`` for that agent id.
|
| 146 |
+
- Adds ``all_agents`` mean across the numeric (non-None) agent values.
|
| 147 |
+
|
| 148 |
+
Why this matters:
|
| 149 |
+
Distinguishing how much an agent *asks for* when it subjectively
|
| 150 |
+
values items more (or less) than its counterpart reveals patterns of
|
| 151 |
+
opportunism vs. concession. This is especially useful when raw reward
|
| 152 |
+
differences are subtle but allocation *intent* differs.
|
| 153 |
+
"""
|
| 154 |
+
info = sl.info or {}
|
| 155 |
+
if not info or not info.get("is_last_timestep_in_round"):
|
| 156 |
+
return None
|
| 157 |
+
quantities = info.get("quantities") or {}
|
| 158 |
+
splits = info.get("splits") or {}
|
| 159 |
+
values = info.get("values") or {}
|
| 160 |
+
agent_ids: List[str] = list(sl.rewards.keys())
|
| 161 |
+
if len(agent_ids) != 2:
|
| 162 |
+
return None # Only defined for 2-agent case.
|
| 163 |
+
for aid in agent_ids:
|
| 164 |
+
if "buffer" in str(aid) and "live" not in str(aid):
|
| 165 |
+
return None
|
| 166 |
+
# Extract per-agent item proposals robustly
|
| 167 |
+
split_items = {aid: _extract_items_from_split(splits.get(aid)) for aid in agent_ids}
|
| 168 |
+
agent_0_vals: List[float] = []
|
| 169 |
+
agent_1_vals: List[float] = []
|
| 170 |
+
for item in quantities.keys():
|
| 171 |
+
# Values may be either a float (same for all items) or dict per item
|
| 172 |
+
v0_raw = values[agent_ids[0]]
|
| 173 |
+
v1_raw = values[agent_ids[1]]
|
| 174 |
+
v0 = float(v0_raw[item]) if isinstance(v0_raw, dict) else float(v0_raw)
|
| 175 |
+
v1 = float(v1_raw[item]) if isinstance(v1_raw, dict) else float(v1_raw)
|
| 176 |
+
if comparator(v0, v1):
|
| 177 |
+
agent_0_vals.append(split_items[agent_ids[0]].get(item, 0.0))
|
| 178 |
+
elif opposite_comparator(v0, v1):
|
| 179 |
+
agent_1_vals.append(split_items[agent_ids[1]].get(item, 0.0))
|
| 180 |
+
out: Dict[str, float | None] = {}
|
| 181 |
+
out[f"{metric_name}-{agent_ids[0]}"] = (
|
| 182 |
+
sum(agent_0_vals) / len(agent_0_vals) if agent_0_vals else None
|
| 183 |
+
)
|
| 184 |
+
out[f"{metric_name}-{agent_ids[1]}"] = (
|
| 185 |
+
sum(agent_1_vals) / len(agent_1_vals) if agent_1_vals else None
|
| 186 |
+
)
|
| 187 |
+
|
| 188 |
+
return [(key, value) for key, value in out.items() if value is not None]
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
def average_proposal_when_agent_values_item_lower(
    sl: SimulationStepLog,
) -> List[Tuple[str, float | None]] | None:
    """Mean quantity an agent proposes for items it values *less* than opponent.

    Interpretation:
        A higher value implies the agent still claims (or is allocated) a
        notable share of items where it has a comparative *disadvantage* in
        valuation, signaling either strategic over-claiming or protocol-driven
        egalitarian splits. Conversely, very low numbers can indicate
        efficient specialization or excessive concession.

    Returns:
        Whatever ``_average_proposal_relative_value`` returns for this
        metric: a list of
        ``("average_proposal_when_agent_values_item_lower-<agent_id>", mean)``
        pairs covering only agents with at least one qualifying item, or
        ``None`` when the step is skipped.
    """
    return _average_proposal_relative_value(
        sl,
        "average_proposal_when_agent_values_item_lower",
        # First agent values the item lower -> collect the first agent's proposal.
        lambda a, b: a < b,
        # Second agent values the item lower -> collect the second agent's proposal.
        lambda a, b: a > b,
    )
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def average_proposal_when_agent_values_item_higher(
    sl: SimulationStepLog,
) -> List[Tuple[str, float | None]] | None:
    """Mean quantity an agent proposes for items it values *more* than opponent.

    Interpretation:
        Captures how aggressively an agent claims items where it holds a
        comparative *advantage*. Elevated values can reflect rational
        specialization (efficient exploitation of comparative advantage) or
        potentially unfair grabs if paired with low concession in the lower
        valuation metric. Comparing this with the 'lower' counterpart helps
        profile negotiation style (cooperative vs. exploitative).

    Returns:
        Whatever ``_average_proposal_relative_value`` returns for this
        metric: a list of
        ``("average_proposal_when_agent_values_item_higher-<agent_id>", mean)``
        pairs covering only agents with at least one qualifying item, or
        ``None`` when the step is skipped.
    """
    return _average_proposal_relative_value(
        sl,
        "average_proposal_when_agent_values_item_higher",
        # First agent values the item higher -> collect the first agent's proposal.
        lambda a, b: a > b,
        # Second agent values the item higher -> collect the second agent's proposal.
        lambda a, b: a < b,
    )
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
# Explicit list of metric functions exported for rendering. Helper functions
# starting with '_' are intentionally excluded. Update this list when adding
# new public statistics so render.py can rely on it instead of introspecting
# every callable in the module.
# Each function takes one SimulationStepLog and returns a list of
# (metric_key, value) pairs, or None when the step should be skipped.
stat_functs: list[Callable[[SimulationStepLog], List[Tuple[str, float]] | None]] = [
    avg_reward,
    average_proposal_when_agent_values_item_lower,
    average_proposal_when_agent_values_item_higher,
    split_efficiency,
]
|
src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/negotiation/tas_rps_agent.py
|
| 3 |
+
Summary: Agent logic for TAS Rock-Paper-Scissors blended game.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import copy
|
| 7 |
+
from collections.abc import Callable
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from typing import Any, Dict, List, Tuple
|
| 10 |
+
|
| 11 |
+
from mllm.markov_games.agent import Agent
|
| 12 |
+
from mllm.markov_games.negotiation.nego_agent import (
|
| 13 |
+
Message,
|
| 14 |
+
NegotiationAgent,
|
| 15 |
+
NegotiationAgentState,
|
| 16 |
+
Split,
|
| 17 |
+
)
|
| 18 |
+
from mllm.markov_games.negotiation.tas_rps_simulation import TrustAndSplitRPSObs
|
| 19 |
+
from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
class TrustAndSplitRPSAgent(NegotiationAgent):
    """Negotiation agent for the Trust-and-Split game with rock-paper-scissors hands.

    Holds the prompt templates for the game and the regexes / parser used for
    messages and coin proposals, in either of two tag styles.
    """

    def __init__(
        self,
        num_message_chars: int,
        message_start_end_format: bool = False,
        proposal_start_end_format: bool = False,
        *args,
        **kwargs,
    ):
        """Store formatting options, initialise the base agent, set prompts.

        Args:
            num_message_chars: Maximum message length advertised in the
                prompt and enforced by the message regex.
            message_start_end_format: Use ``<<message_start>>`` /
                ``<<message_end>>`` instead of ``<message>`` tags.
            proposal_start_end_format: Use ``<<proposal_start>>`` /
                ``<<proposal_end>>`` instead of ``<coins_to_self>`` tags.
            *args: Forwarded to ``NegotiationAgent.__init__``.
            **kwargs: Forwarded to ``NegotiationAgent.__init__``.
        """
        # Set before the base initialiser runs, as in the original ordering.
        self.num_message_chars = num_message_chars
        self.message_start_end_format = message_start_end_format
        self.proposal_start_end_format = proposal_start_end_format
        super().__init__(*args, **kwargs)

        self.intro_prompt = (
            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
            "\n"
            "Setup:\n"
            "1. The game has multiple independent rounds.\n"
            "2. In each round, there are 10 coins to split between the two agents.\n"
            "3. Each agent's per-coin value for that round is determined as follows:\n"
            " - Both agents are randomly assigned a rock, paper or scissors hands\n"
            " - Rock has the upper hand over scissors, scissors has the upper hand over paper and paper has the upper hand over rock.\n"
            " - The agent with the upper hand has a per-coin value of 10.\n"
            " - The agent with the lower hand has a per-coin value of 1.\n"
            "4. You only see your own hand, but you may communicate it in messages and infer your value based on the other agent's hand.\n"
            "5. Over many rounds both agents are equally likely to have the upper and lower hand.\n"
            "\n"
            "Protocol:\n"
            "1. At the start of the round, one agent begins the conversation. The starting role alternates each round.\n"
            "2. Agents exchange a short chat ({quota_messages_per_agent_per_round} messages per round per agent) to negotiate how to split the 10 coins.\n"
            " - Use this chat to communicate your hand so that both agents can determine their per-coin values.\n"
            "3. After the chat, both agents simultaneously propose how many coins they keep.\n"
            "4. If the total sum of proposals is less than or equal to 10, both agents receive their proposals.\n"
            "5. If the total sum of proposals exceeds 10, the coins are allocated proportionally.\n"
            "6. Your points for the round = (coins you receive) x (your per-coin value for that round). \n"
            "7. The points are accumulated across rounds.\n"
            "Your goal: {goal}\n"
        )
        self.new_round_prompt = (
            "A New Round Begins\n"
            "Your hand is {hand}. You don't know {other_agent}'s hand yet.\n"
        )
        # One-line recap of the co-agent's previous hand and proposal.
        self.last_round_prompt = "In the previous round, {other_agent} had a {last_hand_value_coagent} hand and proposed {last_split_coagent} coins.\n"

        if self.proposal_start_end_format:
            proposal_open, proposal_close = "<<proposal_start>>", "<<proposal_end>>"
        else:
            proposal_open, proposal_close = "<coins_to_self>", "</coins_to_self>"
        self.send_split_prompt = (
            "Submit your proposal\n"
            f"Respond with {proposal_open} x {proposal_close} where x is an integer in [0, 10]."
        )

        self.wait_for_message_prompt = "Wait for {other_agent} to send a message..."
        self.last_message_prompt = "{other_agent} said: {last_message}"

        if self.message_start_end_format:
            message_open, message_close = "<<message_start>>", "<<message_end>>"
        else:
            message_open, message_close = "<message>", "</message>"
        self.send_message_prompt = f"Send your message now in {message_open}...{message_close} (<={self.num_message_chars} chars)."

    def get_message_regex(self, observation: TrustAndSplitRPSObs) -> str:
        """Return the regex for a message of at most ``num_message_chars`` characters."""
        payload = rf"[\s\S]{{0,{self.num_message_chars}}}"
        if self.message_start_end_format:
            return "<<message_start>>" + payload + "<<message_end>>"
        return "<message>" + payload + "</message>"

    def get_split_regex(self, observation: TrustAndSplitRPSObs) -> str:
        """Return the regex for a single integer proposal in [0, 10] in the configured tags."""
        number = r" ?(10|[0-9]) ?"
        if self.proposal_start_end_format:
            return "<<proposal_start>>" + number + "<<proposal_end>>"
        return "<coins_to_self>" + number + "</coins_to_self>"

    def get_split_action(
        self, policy_output: str, observation: TrustAndSplitRPSObs
    ) -> Split:
        """Parse the proposal out of ``policy_output`` into a ``Split``.

        Searches for the configured proposal tag; if no tag is found the whole
        output is parsed as an integer (``int`` raises ``ValueError`` if that
        fails).
        """
        import re as _re

        pattern = (
            r"<<proposal_start>> ?(10|[0-9]) ?<<proposal_end>>"
            if self.proposal_start_end_format
            else r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>"
        )
        found = _re.search(pattern, policy_output)
        if found is None:
            coins = int(policy_output)
        else:
            coins = int(found.group(1))
        return Split(items_given_to_self={"coins": coins})
|
src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc
ADDED
|
Binary file (260 Bytes). View file
|
|
|
src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc
ADDED
|
Binary file (2.37 kB). View file
|
|
|
src_code_for_reproducibility/training/__pycache__/tokenize_chats.cpython-312.pyc
ADDED
|
Binary file (5.97 kB). View file
|
|
|
src_code_for_reproducibility/training/__pycache__/trainer_sum_rewards.cpython-312.pyc
ADDED
|
Binary file (6.02 kB). View file
|
|
|