Muqeeth committed on
Commit
bebc3ff
·
verified ·
1 Parent(s): 86bc4e1

Add files using upload-large-folder tool

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ run.log filter=lfs diff=lfs merge=lfs -text
run.log ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c50803dd26fc6b9db435c1427981e4017e7e156753c5cf5e60710cf5afdd64d
3
+ size 10900718
seed_1/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ab958fd2facd005cae5571b0df06ed3be786697f4c31d6435ffcbc655d2920b
3
+ size 323014168
seed_1/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c44c3464099d92dfebb2b132524339800fbf19760b378a02c3c527ac3380b88
3
+ size 323014168
seed_1/Qwen/Qwen2.5-7B-Instruct/adapters/fixed_ad_align_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf6bb8f8d702f23ed3c0797660ebbc16bdee9cbac5c984ffbad4a1dc3ba2215c
3
+ size 323014168
seed_1/agent_trainer/policy_optimizer_state.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1dca3a51476df532d5a63aa1f269f3467830fee1573883a3cb10d0857ddd4111
3
+ size 646269121
seed_1/agent_trainer/trainer_annealing_state.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17f3ead2dac3c925aeb1b3176d071b434c765c0606d4e707e423de4498633e52
3
+ size 104
seed_1/random_state.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06d527c81a0ed8e596458de353799680ab01076dfc3d43cd3b1a2ebea4439ac5
3
+ size 12218
src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc ADDED
Binary file (5.42 kB). View file
 
src_code_for_reproducibility/markov_games/negotiation/__pycache__/negotiation_statistics.cpython-312.pyc ADDED
Binary file (14.2 kB). View file
 
src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: mllm/markov_games/negotiation/dond_simulation.py
3
+ Summary: Simulates Deal-or-No-Deal negotiation games and logs rollouts.
4
+ """
5
+
6
+ import copy
7
+ from dataclasses import dataclass
8
+ from typing import Any, Dict, List, Tuple
9
+
10
+ from numpy.random import default_rng
11
+
12
+ from mllm.markov_games.negotiation.nego_simulation import (
13
+ NegotiationObs,
14
+ NegotiationSimulation,
15
+ NegotiationState,
16
+ Split,
17
+ )
18
+ from mllm.markov_games.rollout_tree import SimulationStepLog
19
+ from mllm.utils.get_coagent_id import get_coagent_id
20
+
21
+ AgentId = str
22
+
23
+
24
@dataclass
class DealNoDealState(NegotiationState):
    """Deal-or-No-Deal game state: the base negotiation state plus valuations.

    Extends ``NegotiationState`` with the list of item types in play and the
    private value table of every agent.
    """

    # Names of the item types being negotiated over.
    item_types: List[str]
    # Per-agent valuation table: agent id -> {item type -> integer value}.
    values: Dict[AgentId, Dict[str, int]]
30
+
31
+
32
@dataclass
class DealNoDealObs(NegotiationObs):
    """Per-agent observation for Deal-or-No-Deal.

    Extends ``NegotiationObs`` with the observing agent's own values, the item
    types in play, and a (lagged) view of the co-agent's values.
    """

    # The observing agent's own valuation: item type -> integer value.
    my_values: Dict[str, int]
    # Names of the item types being negotiated over.
    item_types: List[str]
    # Co-agent valuation from an earlier point in the game, or None if absent.
    previous_values_coagent: Dict[str, int] | None
39
+
40
+
41
def random_partition_integer(rng, total: int, parts: int) -> List[int]:
    """Sample non-negative integers summing to ``total`` across ``parts`` buckets.

    ``parts - 1`` cut points are drawn from ``rng.integers(0, total + 1)``,
    sorted, and the gaps between consecutive boundaries
    ``0, cut_1, ..., cut_{parts-1}, total`` become the bucket sizes. Repeated
    cut points give empty buckets, so zeros are allowed.

    Args:
        rng: Random generator exposing ``integers(low, high, size=...)`` whose
            result supports ``.tolist()`` (e.g. ``numpy.random.default_rng()``).
        total: Amount to distribute; values ``<= 0`` produce all-zero buckets.
        parts: Number of buckets; values ``<= 0`` produce an empty list.

    Returns:
        A list of ``parts`` non-negative integers that sum to ``total`` (all
        zeros when ``total <= 0``).
    """
    if parts <= 0:
        return []
    if total <= 0:
        return [0] * parts
    cuts = sorted(rng.integers(0, total + 1, size=parts - 1).tolist())
    # Each bucket is the gap between a boundary and its predecessor.
    return [upper - lower for lower, upper in zip([0] + cuts, cuts + [total])]
54
+
55
+
56
class DealNoDealSimulation(NegotiationSimulation):
    """NegotiationSimulation variant implementing the Rubinstein-style Deal-or-No-Deal.

    Every round has a randomly sampled stock of 5-7 items spread over
    ``item_types``. Each agent holds a private integer value per item type,
    summing to 10. Values are sampled in ``reset``; the stock is resampled by
    ``set_new_round_of_variant`` while the values are kept.

    NOTE(review): ``self.rng``, ``self.agent_ids``, ``self.item_types``,
    ``self._starting_agent_index`` and
    ``self.quota_messages_per_agent_per_round`` are presumably provided by
    ``NegotiationSimulation.__init__`` -- confirm against the base class.
    """

    # Immutable default so that no list object is shared between instances.
    DEFAULT_ITEM_TYPES: Tuple[str, ...] = ("books", "hats", "balls")

    def __init__(
        self,
        item_types: List[str] | None = None,
        *args,
        **kwargs,
    ):
        """Initialise the base simulation and sample the first game state.

        Args:
            item_types: Names of the item types; defaults to
                ``["books", "hats", "balls"]`` when omitted or ``None``.
            *args: Forwarded to ``NegotiationSimulation.__init__``.
            **kwargs: Forwarded to ``NegotiationSimulation.__init__``.
        """
        if item_types is None:
            # Build a fresh list per instance instead of a shared mutable default.
            item_types = list(self.DEFAULT_ITEM_TYPES)
        super().__init__(*args, item_types=item_types, **kwargs)
        self.reset()

    def _other(self, agent_id: AgentId) -> AgentId:
        """Return the id of the co-agent of ``agent_id``."""
        return get_coagent_id(self.agent_ids, agent_id)

    def _sample_stock(self) -> Dict[str, int]:
        """Sample how many units of each item type are available this round.

        The total number of items is drawn from ``{5, 6, 7}`` and partitioned
        across the item types; an individual type may receive zero units.
        """
        total_items = int(self.rng.integers(5, 8))
        counts = random_partition_integer(
            self.rng, total_items, len(self.item_types)
        )
        return {item: int(count) for item, count in zip(self.item_types, counts)}

    def _sample_values_pair(self) -> Dict[AgentId, Dict[str, int]]:
        """Sample both agents' value tables by rejection sampling.

        Each agent's non-negative integer values sum to 10. A draw is accepted
        only if every item type is valued by at least one agent and at least
        one item type is valued by both.
        """
        n_types = len(self.item_types)
        while True:
            vals_a = random_partition_integer(self.rng, 10, n_types)
            vals_b = random_partition_integer(self.rng, 10, n_types)
            a = {t: int(v) for t, v in zip(self.item_types, vals_a)}
            b = {t: int(v) for t, v in zip(self.item_types, vals_b)}
            every_item_wanted = all(a[t] > 0 or b[t] > 0 for t in self.item_types)
            some_item_contested = any(a[t] > 0 and b[t] > 0 for t in self.item_types)
            if every_item_wanted and some_item_contested:
                return {self.agent_ids[0]: a, self.agent_ids[1]: b}

    def _is_valid_allocation(
        self, allocation: Dict[str, int], stock: Dict[str, int]
    ) -> bool:
        """Return True iff ``allocation`` gives an in-stock int for every item type.

        An allocation is rejected when an item type is missing (or ``None``),
        is not an ``int``, is negative, or exceeds the stock of that type
        (a type absent from ``stock`` counts as zero stock).
        """
        for item in self.item_types:
            amount = allocation.get(item)
            if amount is None or not isinstance(amount, int):
                return False
            if amount < 0 or amount > int(stock.get(item, 0)):
                return False
        return True

    def set_new_round_of_variant(self):
        """Start a new round: keep the agents' values and resample the stock."""
        self.state.quantities = self._sample_stock()

    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        """Return deep copies of the quantities, values and splits of ``state``.

        ``actions`` is accepted for interface compatibility and is not used.
        """
        return {
            "quantities": copy.deepcopy(state.quantities),
            "values": copy.deepcopy(state.values),
            "splits": copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Return the reward of each agent for the submitted ``splits``.

        An agent's reward is the sum over item types of the units it keeps
        times its own value for that type. If, for any item type, the two
        claims do not add up exactly to the available quantity, both agents
        receive 0.
        """
        first, second = self.agent_ids[0], self.agent_ids[1]
        kept_first = splits[first].items_given_to_self
        kept_second = splits[second].items_given_to_self
        rewards = {first: 0, second: 0}
        for item in self.item_types:
            # Non-complementary proposals void the whole deal.
            if kept_first[item] + kept_second[item] != self.state.quantities[item]:
                return {first: 0, second: 0}
            rewards[first] += kept_first[item] * self.state.values[first][item]
            rewards[second] += kept_second[item] * self.state.values[second][item]
        return rewards

    def get_obs(self):
        """Return the observation of every agent, keyed by agent id."""
        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """Build the ``DealNoDealObs`` seen by ``agent_id`` from the current state."""
        other_id = self._other(agent_id)
        # NOTE(review): the field is named "previous" values, but this copies
        # the co-agent's *current* values from the state -- confirm intended.
        coagent_values = copy.deepcopy(self.state.values.get(other_id, {}))
        return DealNoDealObs(
            round_nb=self.state.round_nb,
            last_message=self.state.last_message,
            current_agent=self.state.current_agent,
            quantities=copy.deepcopy(self.state.quantities),
            value=0.0,  # unused in DOND
            other_agent_split=None,  # not meaningful until split
            split_phase=self.state.split_phase,
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            my_values=copy.deepcopy(self.state.values[agent_id]),
            item_types=list(self.item_types),
            previous_values_coagent=coagent_values,
        )

    def reset(self):
        """Sample a fresh stock and value tables, rebuild the state, return observations."""
        start_agent = self.agent_ids[self._starting_agent_index]
        stock = self._sample_stock()
        values = self._sample_values_pair()
        self.state = DealNoDealState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities=stock,
            values=values,
            previous_values=None,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            split_phase=False,
            item_types=list(self.item_types),
        )
        return self.get_obs()
src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: mllm/markov_games/negotiation/negotiation_statistics.py
3
+ Summary: Aggregates and reports statistics for negotiation experiments.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Callable, Dict, List, Tuple
9
+
10
+ from mllm.markov_games.negotiation.nego_simulation import Split
11
+ from mllm.markov_games.rollout_tree import SimulationStepLog
12
+
13
+
14
def avg_reward(sl: SimulationStepLog) -> List[Tuple[str, float]] | None:
    """Report the raw per-agent reward of the current simulation step.

    Despite its name, this function performs no averaging itself: it emits one
    ``("reward-<agent_id>", value)`` pair per agent found in ``sl.rewards``.
    No aggregate entry across agents is produced.

    Rationale / motivation:
        Monitoring the reward stream at each step helps:
          * Diagnose reward shaping issues (e.g., unintended negative drift).
          * Provide a fairness snapshot (are rewards systematically skewed?).
          * Supply a baseline metric for higher-level summaries.

    Args:
        sl: Step log; only ``sl.rewards`` (agent id -> reward) is read.

    Returns:
        A list of ``("reward-<agent_id>", float)`` tuples in the iteration order
        of ``sl.rewards`` (empty when there are no rewards), or ``None`` when
        any agent id contains ``"buffer"`` without also containing ``"live"``;
        such steps are treated as artifacts and excluded.
    """
    # Guard once so a missing rewards mapping does not crash the buffer check.
    rewards = sl.rewards or {}
    if any("buffer" in str(aid) and "live" not in str(aid) for aid in rewards):
        return None
    return [(f"reward-{aid}", float(value)) for aid, value in rewards.items()]
42
+
43
+
44
def split_efficiency(sl: SimulationStepLog) -> List[Tuple[str, float]] | None:
    """Final-round allocation efficiency relative to an upper bound.

    What it computes (only on the last timestep of a negotiation round):
        - Uses ``info['values']`` (per-agent valuations) and
          ``info['quantities']`` (available item counts) to form an *upper
          bound* on the achievable total reward: every unit of an item goes to
          the agent who values that item most.
        - Divides the realised sum of rewards at that timestep by this bound.
        - Emits a single pair ``("split_efficiency", achieved / upper_bound)``.

    Values may be either a dict per agent (item -> value) or a scalar per agent
    (one value applying to every item). In the scalar case the bound is the
    best scalar value times the total quantity over all items.

    Motivation:
        Efficiency distinguishes coordination failures (low efficiency) from
        distributional disputes (high efficiency but uneven splits).

    Args:
        sl: Step log; ``sl.info`` and ``sl.rewards`` are read.

    Returns:
        ``[("split_efficiency", ratio)]``, or ``None`` when the step is not the
        last of a round, values/quantities/rewards are missing, a buffer agent
        (id containing ``"buffer"`` but not ``"live"``) is present, or the
        upper bound is not positive (the ratio would be undefined).
    """
    info = sl.info or {}
    if not info.get("is_last_timestep_in_round"):
        return None
    quantities = info.get("quantities") or {}
    values = info.get("values") or {}
    if not values or not quantities:
        return None
    rewards = sl.rewards or {}
    if not rewards:
        return None
    # Exclude artifact steps before touching any per-agent data.
    if any("buffer" in str(aid) and "live" not in str(aid) for aid in rewards):
        return None

    per_agent_values = list(values.values())
    if isinstance(per_agent_values[0], dict):
        # Per-item valuations: each item goes to whoever values it most.
        max_reward = sum(
            quantities[item]
            * max(float(agent_vals[item]) for agent_vals in per_agent_values)
            for item in per_agent_values[0]
        )
    else:
        # Scalar valuations: the best agent takes every unit of every item.
        best_value = max(float(v) for v in per_agent_values)
        max_reward = best_value * sum(quantities.values())
    if max_reward <= 0:
        # Nothing of value was available, so efficiency is undefined.
        return None

    achieved = sum(float(v) for v in rewards.values())
    return [("split_efficiency", achieved / max_reward)]
97
+
98
+
99
+ def _extract_items_from_split(raw_split: Dict) -> Dict[str, float] | None:
100
+ """Return a mapping item->proposal amount from a split structure.
101
+
102
+ Supports both generic negotiation splits with nested structure
103
+ { 'items_given_to_self': {item: qty, ...}}
104
+ and TAS coin-only variants which may already be a flat mapping {'coins': qty}.
105
+ """
106
+
107
+ if raw_split is None:
108
+ return {}
109
+ elif isinstance(raw_split, Split):
110
+ return {k: float(v) for k, v in raw_split.items_given_to_self.items()}
111
+ elif isinstance(raw_split, dict):
112
+ if "items_given_to_self" in raw_split and isinstance(
113
+ raw_split["items_given_to_self"], dict
114
+ ):
115
+ return {k: float(v) for k, v in raw_split["items_given_to_self"].items()}
116
+ # Fallback: assume already flat mapping of items
117
+ elif hasattr(raw_split, "items_given_to_self"):
118
+ return {k: float(v) for k, v in raw_split["items_given_to_self"].items()}
119
+ return {
120
+ k: float(v) for k, v in raw_split.items() if isinstance(v, (int, float))
121
+ }
122
+ return {}
123
+
124
+
125
+ def _average_proposal_relative_value(
126
+ sl: SimulationStepLog,
127
+ metric_name: str,
128
+ comparator: Callable[[float, float], bool],
129
+ opposite_comparator: Callable[[float, float], bool],
130
+ ) -> Dict[str, float | None] | None:
131
+ """Shared implementation for proposal size conditioned on relative value.
132
+
133
+ Parameters:
134
+ comparator: returns True when agent_0's value relation (e.g. < or >)
135
+ to agent_1 holds for an item and we should collect agent_0's
136
+ proposed quantity for that item.
137
+ opposite_comparator: inverse relation used to collect agent_1's items.
138
+
139
+ Behavior:
140
+ - Executes only on final timestep of a round (where the definitive
141
+ proposal / allocation is known via ``info['splits']``).
142
+ - For each item, classifies which agent's value satisfies the chosen
143
+ relation and records that agent's proposed quantity from the split.
144
+ - Averages (mean) across all qualifying items per agent; if no items
145
+ qualify for an agent returns ``None`` for that agent id.
146
+ - Adds ``all_agents`` mean across the numeric (non-None) agent values.
147
+
148
+ Why this matters:
149
+ Distinguishing how much an agent *asks for* when it subjectively
150
+ values items more (or less) than its counterpart reveals patterns of
151
+ opportunism vs. concession. This is especially useful when raw reward
152
+ differences are subtle but allocation *intent* differs.
153
+ """
154
+ info = sl.info or {}
155
+ if not info or not info.get("is_last_timestep_in_round"):
156
+ return None
157
+ quantities = info.get("quantities") or {}
158
+ splits = info.get("splits") or {}
159
+ values = info.get("values") or {}
160
+ agent_ids: List[str] = list(sl.rewards.keys())
161
+ if len(agent_ids) != 2:
162
+ return None # Only defined for 2-agent case.
163
+ for aid in agent_ids:
164
+ if "buffer" in str(aid) and "live" not in str(aid):
165
+ return None
166
+ # Extract per-agent item proposals robustly
167
+ split_items = {aid: _extract_items_from_split(splits.get(aid)) for aid in agent_ids}
168
+ agent_0_vals: List[float] = []
169
+ agent_1_vals: List[float] = []
170
+ for item in quantities.keys():
171
+ # Values may be either a float (same for all items) or dict per item
172
+ v0_raw = values[agent_ids[0]]
173
+ v1_raw = values[agent_ids[1]]
174
+ v0 = float(v0_raw[item]) if isinstance(v0_raw, dict) else float(v0_raw)
175
+ v1 = float(v1_raw[item]) if isinstance(v1_raw, dict) else float(v1_raw)
176
+ if comparator(v0, v1):
177
+ agent_0_vals.append(split_items[agent_ids[0]].get(item, 0.0))
178
+ elif opposite_comparator(v0, v1):
179
+ agent_1_vals.append(split_items[agent_ids[1]].get(item, 0.0))
180
+ out: Dict[str, float | None] = {}
181
+ out[f"{metric_name}-{agent_ids[0]}"] = (
182
+ sum(agent_0_vals) / len(agent_0_vals) if agent_0_vals else None
183
+ )
184
+ out[f"{metric_name}-{agent_ids[1]}"] = (
185
+ sum(agent_1_vals) / len(agent_1_vals) if agent_1_vals else None
186
+ )
187
+
188
+ return [(key, value) for key, value in out.items() if value is not None]
189
+
190
+
191
def average_proposal_when_agent_values_item_lower(
    sl: SimulationStepLog,
) -> List[Tuple[str, float]] | None:
    """Mean quantity an agent proposes for items it values *less* than opponent.

    Interpretation:
        A higher value implies the agent still claims (or is allocated) a
        notable share of items where it has a comparative *disadvantage* in
        valuation, signaling either strategic over-claiming or protocol-driven
        egalitarian splits. Conversely, very low numbers can indicate
        efficient specialization or excessive concession.

    Returns:
        Whatever ``_average_proposal_relative_value`` returns for the ``<``
        relation: a list of
        ``("average_proposal_when_agent_values_item_lower-<agent_id>", mean)``
        tuples in which agents without qualifying items are omitted, or
        ``None`` when the metric is not defined for this step.
    """
    return _average_proposal_relative_value(
        sl,
        metric_name="average_proposal_when_agent_values_item_lower",
        comparator=lambda a, b: a < b,
        opposite_comparator=lambda a, b: a > b,
    )
213
+
214
+
215
def average_proposal_when_agent_values_item_higher(
    sl: SimulationStepLog,
) -> List[Tuple[str, float]] | None:
    """Mean quantity an agent proposes for items it values *more* than opponent.

    Interpretation:
        Captures how aggressively an agent claims items where it holds a
        comparative *advantage*. Elevated values can reflect rational
        specialization (efficient exploitation of comparative advantage) or
        potentially unfair grabs if paired with low concession in the lower
        valuation metric. Comparing this with the 'lower' counterpart helps
        profile negotiation style (cooperative vs. exploitative).

    Returns:
        Whatever ``_average_proposal_relative_value`` returns for the ``>``
        relation: a list of
        ``("average_proposal_when_agent_values_item_higher-<agent_id>", mean)``
        tuples in which agents without qualifying items are omitted, or
        ``None`` when the metric is not defined for this step.
    """
    return _average_proposal_relative_value(
        sl,
        metric_name="average_proposal_when_agent_values_item_higher",
        comparator=lambda a, b: a > b,
        opposite_comparator=lambda a, b: a < b,
    )
238
+
239
+
240
# Explicit list of metric functions exported for rendering. Helper functions
# starting with '_' are intentionally excluded. Update this list when adding
# new public statistics so render.py can rely on it instead of introspecting
# every callable in the module.
# Each function maps a step log to a list of (metric key, value) pairs, or to
# None when the metric does not apply to that step.
stat_functs: list[Callable[[SimulationStepLog], List[Tuple[str, float]] | None]] = [
    avg_reward,
    average_proposal_when_agent_values_item_lower,
    average_proposal_when_agent_values_item_higher,
    split_efficiency,
]
src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: mllm/markov_games/negotiation/tas_rps_agent.py
3
+ Summary: Agent logic for TAS Rock-Paper-Scissors blended game.
4
+ """
5
+
6
+ import copy
7
+ from collections.abc import Callable
8
+ from dataclasses import dataclass
9
+ from typing import Any, Dict, List, Tuple
10
+
11
+ from mllm.markov_games.agent import Agent
12
+ from mllm.markov_games.negotiation.nego_agent import (
13
+ Message,
14
+ NegotiationAgent,
15
+ NegotiationAgentState,
16
+ Split,
17
+ )
18
+ from mllm.markov_games.negotiation.tas_rps_simulation import TrustAndSplitRPSObs
19
+ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
20
+
21
+
22
class TrustAndSplitRPSAgent(NegotiationAgent):
    """NegotiationAgent that reasons about hidden hands before submitting TAS splits.

    The constructor stores the message/proposal formatting options and defines
    the prompt templates; the remaining methods supply the regular expressions
    for messages and proposals and parse a proposal into a ``Split``.
    """

    def __init__(
        self,
        num_message_chars: int,
        message_start_end_format: bool = False,
        proposal_start_end_format: bool = False,
        *args,
        **kwargs,
    ):
        """Store formatting options and build the prompt templates.

        Args:
            num_message_chars: Maximum number of characters allowed in a chat
                message (used in the prompt and in the message regex).
            message_start_end_format: If true, messages use
                ``<<message_start>>...<<message_end>>`` instead of
                ``<message>...</message>``.
            proposal_start_end_format: If true, proposals use
                ``<<proposal_start>> x <<proposal_end>>`` instead of
                ``<coins_to_self> x </coins_to_self>``.
            *args: Forwarded to ``NegotiationAgent.__init__``.
            **kwargs: Forwarded to ``NegotiationAgent.__init__``.
        """
        # Options are set before the base initialiser runs, as in the original
        # ordering (the base class may read them -- not visible from here).
        self.num_message_chars = num_message_chars
        self.message_start_end_format = message_start_end_format
        self.proposal_start_end_format = proposal_start_end_format
        super().__init__(*args, **kwargs)
        self.intro_prompt = (
            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
            "\n"
            "Setup:\n"
            "1. The game has multiple independent rounds.\n"
            "2. In each round, there are 10 coins to split between the two agents.\n"
            "3. Each agent's per-coin value for that round is determined as follows:\n"
            " - Both agents are randomly assigned a rock, paper or scissors hands\n"
            " - Rock has the upper hand over scissors, scissors has the upper hand over paper and paper has the upper hand over rock.\n"
            " - The agent with the upper hand has a per-coin value of 10.\n"
            " - The agent with the lower hand has a per-coin value of 1.\n"
            "4. You only see your own hand, but you may communicate it in messages and infer your value based on the other agent's hand.\n"
            "5. Over many rounds both agents are equally likely to have the upper and lower hand.\n"
            "\n"
            "Protocol:\n"
            "1. At the start of the round, one agent begins the conversation. The starting role alternates each round.\n"
            "2. Agents exchange a short chat ({quota_messages_per_agent_per_round} messages per round per agent) to negotiate how to split the 10 coins.\n"
            " - Use this chat to communicate your hand so that both agents can determine their per-coin values.\n"
            "3. After the chat, both agents simultaneously propose how many coins they keep.\n"
            "4. If the total sum of proposals is less than or equal to 10, both agents receive their proposals.\n"
            "5. If the total sum of proposals exceeds 10, the coins are allocated proportionally.\n"
            "6. Your points for the round = (coins you receive) x (your per-coin value for that round). \n"
            "7. The points are accumulated across rounds.\n"
            "Your goal: {goal}\n"
        )
        self.new_round_prompt = (
            "A New Round Begins\n"
            "Your hand is {hand}. You don't know {other_agent}'s hand yet.\n"
        )
        # Longer alternative summary, kept for reference:
        # self.last_round_prompt = (
        #     "Last Round Summary:\n"
        #     " - Your hand: {last_hand_agent}\n"
        #     " - {other_agent}'s hand: {last_hand_coagent}\n"
        #     " - Your value per coin: {last_value_agent}\n"
        #     " - {other_agent}'s value per coin: {last_value_coagent}\n"
        #     " - You proposed: {last_split_agent} coins\n"
        #     " - You earned: {last_points_agent} points\n"
        #     " - {other_agent} proposed: {last_split_coagent} coins\n"
        #     " - {other_agent} earned: {last_points_coagent} points\n"
        #     " - Round Complete.\n"
        # )
        self.last_round_prompt = "In the previous round, {other_agent} had a {last_hand_value_coagent} hand and proposed {last_split_coagent} coins.\n"
        if self.proposal_start_end_format:
            self.send_split_prompt = (
                "Submit your proposal\n"
                "Respond with <<proposal_start>> x <<proposal_end>> where x is an integer in [0, 10]."
            )
        else:
            self.send_split_prompt = (
                "Submit your proposal\n"
                "Respond with <coins_to_self> x </coins_to_self> where x is an integer in [0, 10]."
            )
        self.wait_for_message_prompt = "Wait for {other_agent} to send a message..."
        # self.wait_for_message_prompt = ""
        self.last_message_prompt = "{other_agent} said: {last_message}"
        if self.message_start_end_format:
            self.send_message_prompt = f"Send your message now in <<message_start>>...<<message_end>> (<={self.num_message_chars} chars)."
        else:
            self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."

    def get_message_regex(self, observation: TrustAndSplitRPSObs) -> str:
        """Switch between <message>...</message> and <<message_start>> formats on demand.

        The body may be any characters (including newlines), up to
        ``num_message_chars`` of them. ``observation`` is not used.
        """
        if self.message_start_end_format:
            return (
                rf"<<message_start>>[\s\S]{{0,{self.num_message_chars}}}<<message_end>>"
            )
        return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"

    def get_split_regex(self, observation: TrustAndSplitRPSObs) -> str:
        """Force single-number proposals inside whichever tag style the config selected.

        The single capture group matches an integer from 0 to 10, optionally
        padded by one space on either side. ``observation`` is not used.
        """
        if self.proposal_start_end_format:
            return r"<<proposal_start>> ?(10|[0-9]) ?<<proposal_end>>"
        return r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>"

    def get_split_action(
        self, policy_output: str, observation: TrustAndSplitRPSObs
    ) -> Split:
        """Parse the proposal tag (or raw integer fallback) into a Split.

        The pattern comes from ``get_split_regex`` so that the constraint
        offered to the policy and the parser can never drift apart. When no
        tag is found, the whole output is parsed as an integer.

        Raises:
            ValueError: If no tag matches and ``policy_output`` is not an
                integer literal.
        """
        import re as _re

        match = _re.search(self.get_split_regex(observation), policy_output)
        coins_int = int(match.group(1)) if match else int(policy_output)
        return Split(items_given_to_self={"coins": coins_int})
src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (260 Bytes). View file
 
src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc ADDED
Binary file (2.37 kB). View file
 
src_code_for_reproducibility/training/__pycache__/tokenize_chats.cpython-312.pyc ADDED
Binary file (5.97 kB). View file
 
src_code_for_reproducibility/training/__pycache__/trainer_sum_rewards.cpython-312.pyc ADDED
Binary file (6.02 kB). View file