Muqeeth committed on
Commit
48ab31c
·
verified ·
1 Parent(s): 205759e

Add files using upload-large-folder tool

Browse files
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:26ad06cf2573ff7bf0587ba196024c62fed7fe859a2ed0a8ec5c03ce0db59d1c
3
+ size 323014168
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50cfa136e5499e5b1f83c90753b519572d60a378c94d09953a2738af6a8ae3c1
3
+ size 323014168
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/fixed_ad_align_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe1ae303292393790925eaeac3cba8b616a78ce4a3df22dafcc7fd0de06d66d8
3
+ size 323014168
seed_0/agent_trainer/critic_optimizer_state.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1574fdb90735a922b09c67d07f7abdbd51181f00dc7bed878cb80adb5f50c1d
3
+ size 2631
seed_0/agent_trainer/policy_optimizer_state.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:372d895c4bb9a90b6009ec941da2efdad875f1204eaf7499f67839222556bac8
3
+ size 646269121
seed_0/agent_trainer/trainer_annealing_state.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76acff6f1755878d1b098958dd60afbf112339e6b0ee2216d366f4ce8564ccec
3
+ size 104
seed_0/random_state.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7af5c6e16983563656a9b661cf2b84015d980b07816cd738110da2a886220c36
3
+ size 12176
src_code_for_reproducibility/markov_games/ipd/Ipd_hard_coded_agents.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Any, Tuple
3
+
4
+ from mllm.markov_games.ipd.ipd_agent import IPDAgent
5
+ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
6
+
7
+
8
@dataclass
class AlwaysCooperateIPDAgent(IPDAgent):
    """Hard-coded IPD agent that plays cooperate on every turn."""

    async def act(self, observation) -> Tuple[Any, AgentActLog]:
        """
        Unconditionally play the configured cooperate action.

        The observation content is ignored except for its round number, which
        is mirrored into the agent state. A minimal assistant chat turn is
        logged so rollouts stay structurally consistent with other agents.
        """
        move = self.cooperate_string

        # Record a single structured chat turn describing the fixed move.
        turn = ChatTurn(
            agent_id=self.agent_id,
            role="assistant",
            content=f"Playing cooperate: {move}",
            is_state_end=True,
        )
        self.state.chat_history.append(turn)

        # Keep counters in sync, mirroring the bookkeeping done by IPDAgent.
        self.state.chat_counter = len(self.state.chat_history)
        self.state.round_nb = observation.round_nb

        return move, AgentActLog(chat_turns=[turn], info=None)
39
+
40
+
41
@dataclass
class AlwaysDefectIPDAgent(IPDAgent):
    """Hard-coded IPD agent that plays defect on every turn."""

    async def act(self, observation) -> Tuple[Any, AgentActLog]:
        """
        Unconditionally play the configured defect action.

        The observation content is ignored except for its round number, which
        is mirrored into the agent state. A minimal assistant chat turn is
        logged so rollouts stay structurally consistent with other agents.
        """
        move = self.defect_string

        # Record a single structured chat turn describing the fixed move.
        turn = ChatTurn(
            agent_id=self.agent_id,
            role="assistant",
            content=f"Playing defect: {move}",
            is_state_end=True,
        )
        self.state.chat_history.append(turn)

        # Keep counters in sync, mirroring the bookkeeping done by IPDAgent.
        self.state.chat_counter = len(self.state.chat_history)
        self.state.round_nb = observation.round_nb

        return move, AgentActLog(chat_turns=[turn], info=None)
72
+
src_code_for_reproducibility/markov_games/negotiation/tas_rps_simulation.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Trust-and-Split simulation.
3
+
4
+ This environment models a simple bargaining game over 10 coins with messaging.
5
+ Agents are assigned rock/paper/scissors hands, with the winner getting value 10 per coin
6
+ and the loser getting value 1 per coin. Agents alternate sending messages for a fixed
7
+ number of turns per round and then each submits a split proposal indicating how many
8
+ coins they keep for themselves. Rewards are proportional if the proposed totals exceed 10.
9
+ """
10
+
11
+ import copy
12
+ from dataclasses import dataclass
13
+ from typing import Any, Dict, List, Literal, Tuple
14
+
15
+ from numpy.random import default_rng
16
+
17
+ from mllm.markov_games.negotiation.nego_simulation import (
18
+ Message,
19
+ NegotiationObs,
20
+ NegotiationSimulation,
21
+ NegotiationState,
22
+ Split,
23
+ compute_tas_style_rewards,
24
+ )
25
+ from mllm.markov_games.rollout_tree import SimulationStepLog
26
+
27
+ AgentId = str
28
+
29
+
30
+ def _get_rps_winner(
31
+ hand1: Literal["rock", "paper", "scissors"],
32
+ hand2: Literal["rock", "paper", "scissors"],
33
+ ) -> Literal["rock", "paper", "scissors"]:
34
+ """Determine winner of rock-paper-scissors between two hands."""
35
+ if hand1 == hand2:
36
+ raise ValueError("Hands should be different")
37
+ if (
38
+ (hand1 == "rock" and hand2 == "scissors")
39
+ or (hand1 == "paper" and hand2 == "rock")
40
+ or (hand1 == "scissors" and hand2 == "paper")
41
+ ):
42
+ return hand1
43
+ else:
44
+ return hand2
45
+
46
+
47
@dataclass
class TrustAndSplitRPSState(NegotiationState):
    """Negotiation state extended with the rock-paper-scissors hand assignment."""

    # Hand ("rock" | "paper" | "scissors") held by each agent this round.
    hands: Dict[AgentId, Literal["rock", "paper", "scissors"]]
    # Hands from the previous round; None before the first round completes.
    previous_hands: Dict[AgentId, Literal["rock", "paper", "scissors"]] | None
53
+
54
+
55
@dataclass
class TrustAndSplitRPSObs(NegotiationObs):
    """Per-agent observation with RPS-specific fields layered on NegotiationObs."""

    # The hand this agent holds in the current round.
    hand: Literal["rock", "paper", "scissors"]
    # This agent's own hand in the previous round; None on the first round.
    last_hand_agent: Literal["rock", "paper", "scissors"] | None
    # The co-player's hand in the previous round; None on the first round.
    last_hand_coagent: Literal["rock", "paper", "scissors"] | None
    # Whether the co-player held the winning ("upper") or losing ("lower")
    # hand in the previous round; None when a previous hand is unavailable.
    last_hand_value_coagent: Literal["upper", "lower"] | None
61
+
62
+
63
class TrustAndSplitRPSSimulation(NegotiationSimulation):
    """Trust-and-Split bargaining over 10 coins with RPS-determined values.

    Each round the two agents draw distinct rock-paper-scissors hands; the
    holder of the winning hand values each coin at 10.0 and the loser at 1.0
    (see _sample_hands_and_values). Rewards are delegated to
    compute_tas_style_rewards.
    """

    def __init__(
        self,
        alternating_hands: bool = False,
        alternating_mix_ratio: float | None = None,
        *args,
        **kwargs,
    ):
        # When True, each new round hands the previous winner the losing hand
        # (roles alternate deterministically round-to-round).
        self.alternating_hands = alternating_hands
        self.alternating_mix_ratio = alternating_mix_ratio
        super().__init__(*args, **kwargs)
        # If a mix ratio is supplied, decide once per simulation instance
        # (using the parent-provided RNG) whether hands alternate, overriding
        # the alternating_hands argument.
        if self.alternating_mix_ratio is not None:
            if self.rng.random() < self.alternating_mix_ratio:
                self.alternating_hands = True
            else:
                self.alternating_hands = False

    def _sample_hands_and_values(
        self,
        alternate_hands: bool = False,
    ) -> Tuple[Dict[AgentId, str], Dict[AgentId, float]]:
        """Sample distinct hands for both agents and the matching coin values.

        Returns:
            (agent_hands, values) where the agent holding the winning hand
            gets a per-coin value of 10.0 and the other agent 1.0.

        With alternate_hands=True the previous round's winner (read from
        self.state.previous_hands) is assigned this round's losing hand, so
        winner/loser roles swap every round.
        """
        hands = ["rock", "paper", "scissors"]
        if alternate_hands:
            previous_hands = list(self.state.previous_hands.values())
            hand1, hand2 = self.rng.choice(hands, size=2, replace=False)
            winner = _get_rps_winner(hand1, hand2)
            loser = hand1 if winner == hand2 else hand2
            previous_winner = _get_rps_winner(previous_hands[0], previous_hands[1])
            agent_hands, values = {}, {}
            for agent_id in self.agent_ids:
                if self.state.previous_hands[agent_id] == previous_winner:
                    # Last round's winner receives the losing hand this round.
                    agent_hands[agent_id] = loser
                    values[agent_id] = 1.0
                else:
                    agent_hands[agent_id] = winner
                    values[agent_id] = 10.0
            return agent_hands, values
        else:
            # Assign different hands to each agent
            hand1, hand2 = self.rng.choice(hands, size=2, replace=False)

            agent_hands = {self.agent_ids[0]: hand1, self.agent_ids[1]: hand2}

            # Determine winner and assign values
            winner = _get_rps_winner(hand1, hand2)
            values = {}
            for agent_id in self.agent_ids:
                if agent_hands[agent_id] == winner:
                    values[agent_id] = 10.0  # Winner gets value 10
                else:
                    values[agent_id] = 1.0  # Loser gets value 1

            return agent_hands, values

    def set_new_round_of_variant(self):
        """Archive the current hands and resample hands/values for a new round."""
        # Archive before resampling: _sample_hands_and_values reads
        # previous_hands when alternating.
        self.state.previous_hands = copy.deepcopy(self.state.hands)
        new_hands, new_values = self._sample_hands_and_values(
            alternate_hands=self.alternating_hands
        )
        self.state.hands = new_hands
        self.state.values = new_values
        # Quantities are constant in TAS
        self.state.quantities = {"coins": 10}
        self.state.split_phase = False

    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        """Return a deep-copied snapshot of variant-specific state for logging."""
        return {
            "quantities": copy.deepcopy(state.quantities),
            "hands": copy.deepcopy(state.hands),
            "values": copy.deepcopy(state.values),
            "previous_hands": copy.deepcopy(state.previous_hands),
            "previous_values": copy.deepcopy(state.previous_values),
            "splits": copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Compute per-agent rewards from the submitted splits (TAS rules)."""
        return compute_tas_style_rewards(
            self.agent_ids, self.state.values, splits, self.state.quantities
        )

    def get_obs_agent(self, agent_id):
        """Returns observation for agent_id"""
        # NOTE(review): self._other presumably maps an agent id to the other
        # agent's id — defined on the parent class, confirm there.
        other_id = self._other(agent_id)
        # Previous-round fields are None until a first round has completed.
        last_value_coagent = (
            None
            if self.state.previous_values is None
            else self.state.previous_values.get(other_id)
        )
        last_hand_coagent = (
            None
            if self.state.previous_hands is None
            else self.state.previous_hands.get(other_id)
        )
        last_points_coagent = (
            None
            if self.state.previous_points is None
            else round(self.state.previous_points.get(other_id), 1)
        )
        last_value_agent = (
            None
            if self.state.previous_values is None
            else self.state.previous_values.get(agent_id)
        )
        last_hand_agent = (
            None
            if self.state.previous_hands is None
            else self.state.previous_hands.get(agent_id)
        )
        last_points_agent = (
            None
            if self.state.previous_points is None
            else round(self.state.previous_points.get(agent_id), 1)
        )
        last_split_coagent = None
        last_split_agent = None
        if self.state.previous_splits is not None:
            # A split records how many coins an agent kept for itself.
            last_split_coagent = self.state.previous_splits[
                other_id
            ].items_given_to_self["coins"]
            last_split_agent = self.state.previous_splits[agent_id].items_given_to_self[
                "coins"
            ]
        # Reveal only whether the co-agent held the high-value ("upper") or
        # low-value ("lower") hand last round, not the raw value.
        if last_hand_agent is None or last_hand_coagent is None:
            last_hand_value_coagent = None
        else:
            winner = _get_rps_winner(last_hand_agent, last_hand_coagent)
            last_hand_value_coagent = (
                "upper" if winner == last_hand_coagent else "lower"
            )
        obs = TrustAndSplitRPSObs(
            round_nb=self.state.round_nb,
            last_message=self.state.last_message,
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            current_agent=self.state.current_agent,
            other_agent=self.agent_id_to_name[other_id],
            quantities={"coins": 10},
            item_types=self.item_types,
            value=self.state.values[agent_id],
            split_phase=self.state.split_phase,
            last_split_agent=last_split_agent,
            last_value_agent=last_value_agent,
            last_points_agent=last_points_agent,
            last_split_coagent=last_split_coagent,
            last_value_coagent=last_value_coagent,
            last_points_coagent=last_points_coagent,
            hand=self.state.hands[agent_id],
            last_hand_coagent=last_hand_coagent,
            last_hand_agent=last_hand_agent,
            last_quantities=self.state.previous_quantities,
            last_hand_value_coagent=last_hand_value_coagent,
        )
        return obs

    def get_state(self):
        """Return the live (not copied) simulation state."""
        return self.state

    def get_safe_copy(self):
        """Return a safe copy of the simulation."""
        # Shallow-copy the simulation but deep-copy the mutable state so the
        # copy can evolve independently.
        simulation_copy = copy.copy(self)
        simulation_copy.state = copy.deepcopy(self.state)
        return simulation_copy

    def reset(self):
        """Initialize and return initial observations"""
        # Decide starting agent alternating across resets for determinism
        # NOTE(review): _starting_agent_index appears to come from the parent
        # class — confirm it is initialized before reset() is called.
        start_agent = self.agent_ids[self._starting_agent_index]
        # First round never alternates (there is no previous round to flip).
        hands, values = self._sample_hands_and_values()
        self.state = TrustAndSplitRPSState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities={"coins": 10},
            values=values,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            previous_values=None,
            previous_splits=None,
            previous_points=None,
            split_phase=False,
            hands=hands,
            previous_hands=None,
            previous_quantities=None,
        )
        return self.get_obs()