Muqeeth committed on
Commit
1bba52f
·
verified ·
1 Parent(s): f300e98

Add files using upload-large-folder tool

Browse files
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9367e046f903df9758d68c6a7974aa895c7a0420593b57b19ecc2dbba2f0151
3
+ size 323014168
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50cfa136e5499e5b1f83c90753b519572d60a378c94d09953a2738af6a8ae3c1
3
+ size 323014168
seed_0/agent_trainer/critic_optimizer_state.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1574fdb90735a922b09c67d07f7abdbd51181f00dc7bed878cb80adb5f50c1d
3
+ size 2631
seed_0/agent_trainer/policy_optimizer_state.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:74a02f441a2fd0d9b186be5ff1f58d5f55d857a5090d965e3582013d506ab097
3
+ size 646269121
seed_0/agent_trainer/trainer_annealing_state.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3267ed357717fc3937f1937bcad74ad2892832d4bc8895042213f7e98312ee4
3
+ size 104
seed_0/random_state.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f75e413aca28c103f96a10ed73eaa7a25ad24ec074eba477ae9594dfc42ebd4
3
+ size 12176
src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc ADDED
Binary file (5.43 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-312.pyc ADDED
Binary file (6.23 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-312.pyc ADDED
Binary file (10.2 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-312.pyc ADDED
Binary file (4.25 kB). View file
 
src_code_for_reproducibility/markov_games/ipd/Ipd_hard_coded_agents.py ADDED
@@ -0,0 +1,76 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: mllm/markov_games/ipd/Ipd_hard_coded_agents.py
3
+ Summary: Contains hand-crafted IPD policies used as deterministic baselines.
4
+ """
5
+
6
+ from dataclasses import dataclass
7
+ from typing import Any, Tuple
8
+
9
+ from mllm.markov_games.ipd.ipd_agent import IPDAgent
10
+ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
11
+
12
+
13
+ @dataclass
14
+ class AlwaysCooperateIPDAgent(IPDAgent):
15
+ async def act(self, observation) -> Tuple[Any, AgentActLog]:
16
+ """
17
+ Always plays the cooperate action, ignoring observation.
18
+ Returns the configured cooperate_string so the simulation parses it as "C".
19
+ """
20
+
21
+ action = self.cooperate_string
22
+
23
+ # Log a minimal, structured chat turn for consistency with other agents
24
+ turn_text = f"Playing cooperate: {action}"
25
+ self.state.chat_history.append(
26
+ ChatTurn(
27
+ agent_id=self.agent_id,
28
+ role="assistant",
29
+ content=turn_text,
30
+ is_state_end=True,
31
+ )
32
+ )
33
+
34
+ act_log = AgentActLog(
35
+ chat_turns=[self.state.chat_history[-1]],
36
+ info=None,
37
+ )
38
+
39
+ # Advance internal counters similar to IPDAgent semantics
40
+ self.state.chat_counter = len(self.state.chat_history)
41
+ self.state.round_nb = observation.round_nb
42
+
43
+ return action, act_log
44
+
45
+
46
+ @dataclass
47
+ class AlwaysDefectIPDAgent(IPDAgent):
48
+ async def act(self, observation) -> Tuple[Any, AgentActLog]:
49
+ """
50
+ Always plays the defect action, ignoring observation.
51
+ Returns the configured defect_string so the simulation parses it as "D".
52
+ """
53
+
54
+ action = self.defect_string
55
+
56
+ # Log a minimal, structured chat turn for consistency with other agents
57
+ turn_text = f"Playing defect: {action}"
58
+ self.state.chat_history.append(
59
+ ChatTurn(
60
+ agent_id=self.agent_id,
61
+ role="assistant",
62
+ content=turn_text,
63
+ is_state_end=True,
64
+ )
65
+ )
66
+
67
+ act_log = AgentActLog(
68
+ chat_turns=[self.state.chat_history[-1]],
69
+ info=None,
70
+ )
71
+
72
+ # Advance internal counters similar to IPDAgent semantics
73
+ self.state.chat_counter = len(self.state.chat_history)
74
+ self.state.round_nb = observation.round_nb
75
+
76
+ return action, act_log
src_code_for_reproducibility/markov_games/ipd/ipd_simulation.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: mllm/markov_games/ipd/ipd_simulation.py
3
+ Summary: Runs Iterated Prisoner's Dilemma simulations under the Markov-game API.
4
+ """
5
+
6
+ import copy
7
+ import random
8
+ from dataclasses import dataclass
9
+ from typing import Any, Dict, List, Optional, Tuple
10
+
11
+ import numpy as np
12
+
13
+ from mllm.markov_games.markov_game import Simulation
14
+ from mllm.markov_games.rollout_tree import SimulationStepLog
15
+ from mllm.utils.get_coagent_id import get_coagent_id
16
+
17
+
18
+ @dataclass
19
+ class IPDState:
20
+ """
21
+ State of the Iterated Prisoner's Dilemma game.
22
+ """
23
+
24
+ round_nb: int = 0
25
+ done: bool = False
26
+ last_moves: Dict[str, str] | None = None
27
+
28
+
29
+ @dataclass
30
+ class IPDObs:
31
+ """
32
+ Observation in Iterated Prisoner's Dilemma game.
33
+ """
34
+
35
+ round_nb: int
36
+ last_coagent_move: str | None
37
+
38
+
39
+ class IPD(Simulation):
40
+ """
41
+ Iterated Prisoner's Dilemma simulation following the standard two-player formulation.
42
+
43
+ In each round of the game, two agents simultaneously choose to either cooperate (C) or defect (D).
44
+ The payoffs are as follows:
45
+ - If both cooperate: Both receive the "reward" (usually 3 points)
46
+ - If both defect: Both receive the "punishment" (usually 1 point)
47
+ - If one cooperates and one defects: The defector receives the "temptation" (usually 5 points)
48
+ and the cooperator receives the "sucker" payoff (usually 0 points)
49
+
50
+ The game is played for a specified number of rounds.
51
+ """
52
+
53
+ def __init__(
54
+ self,
55
+ agent_ids: List[str],
56
+ agent_names: List[str],
57
+ seed: int,
58
+ rounds_per_game: int,
59
+ reward: float, # Both cooperate
60
+ punishment: float, # Both defect
61
+ temptation: float, # Defector's reward when other cooperates
62
+ sucker: float, # Cooperator's reward when other defects
63
+ cooperate_actions: List[str],
64
+ defect_actions: List[str],
65
+ ):
66
+ self.agent_ids = agent_ids
67
+ self.agent_names = agent_names
68
+ self.seed = seed
69
+ self.rounds_per_game = rounds_per_game
70
+ self.reward = reward
71
+ self.punishment = punishment
72
+ self.temptation = temptation
73
+ self.sucker = sucker
74
+ self.cooperate_actions = cooperate_actions
75
+ self.defect_actions = defect_actions
76
+ self.state = IPDState()
77
+
78
+ def step(self, actions: Dict[str, str]) -> Tuple[bool, SimulationStepLog]:
79
+ """
80
+ Take a step in the environment using the provided actions.
81
+ Here, the observations are just the states of the game.
82
+
83
+ Args:
84
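+ actions (dict): A dictionary where keys are agent identifiers and values are raw action strings. Strings listed in cooperate_actions count as 'C'; all others, including unrecognized ones, count as 'D'.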
85
+
86
+ Returns:
87
+ done (bool): Whether the episode has ended (round_nb has reached rounds_per_game).
88
+ step_log (SimulationStepLog): Per-agent rewards for this round, plus the normalized actions under info["actions"].
89
+ Observations are not returned by this method; obtain them with get_obs().
90
+ """
91
+
92
+ # Calculate rewards using payoff matrix
93
+ agent0_action = actions[self.agent_ids[0]]
94
+ agent1_action = actions[self.agent_ids[1]]
95
+
96
+ # Normalize raw actions to "C"/"D"; unrecognized (gibberish) actions are treated as defection
97
+ def normalize_action(action):
98
+ if action in self.cooperate_actions:
99
+ return "C"
100
+ elif action in self.defect_actions:
101
+ return "D"
102
+ else:
103
+ return "D"
104
+
105
+ norm_action0 = normalize_action(agent0_action)
106
+ norm_action1 = normalize_action(agent1_action)
107
+
108
+ payoffs = {
109
+ ("C", "C"): [self.reward, self.reward],
110
+ ("C", "D"): [self.sucker, self.temptation],
111
+ ("D", "C"): [self.temptation, self.sucker],
112
+ ("D", "D"): [self.punishment, self.punishment],
113
+ }
114
+
115
+ round_rewards = {
116
+ self.agent_ids[0]: payoffs[(norm_action0, norm_action1)][0],
117
+ self.agent_ids[1]: payoffs[(norm_action0, norm_action1)][1],
118
+ }
119
+
120
+ # Update game state
121
+ self.state.round_nb += 1
122
+ self.state.last_moves = copy.deepcopy(actions)
123
+ done = self.state.round_nb >= self.rounds_per_game
124
+ step_log = SimulationStepLog(
125
+ rewards=round_rewards,
126
+ info={
127
+ "actions": {
128
+ self.agent_ids[0]: norm_action0,
129
+ self.agent_ids[1]: norm_action1,
130
+ }
131
+ },
132
+ )
133
+
134
+ return done, step_log
135
+
136
+ def get_obs(self):
137
+ """Returns all agent observations in dict
138
+ Returns:
139
+ observations
140
+ """
141
+ observations = {}
142
+ for agent_id in self.agent_ids:
143
+ observations[agent_id] = self.get_obs_agent(agent_id)
144
+ return observations
145
+
146
+ def get_obs_agent(self, agent_id):
147
+ """Returns observation for agent_id"""
148
+ if self.state.last_moves != None:
149
+ other_id = get_coagent_id(self.agent_ids, agent_id)
150
+ last_coagent_move = self.state.last_moves[other_id]
151
+ else:
152
+ last_coagent_move = None
153
+ obs = IPDObs(round_nb=self.state.round_nb, last_coagent_move=last_coagent_move)
154
+ return obs
155
+
156
+ def reset(self):
157
+ """Resets the game state and returns the initial observations of all agents"""
158
+ self.state = IPDState()
159
+ return self.get_obs()
160
+
161
+ def get_safe_copy(self):
162
+ """
163
+ Return a safe copy of the simulation.
164
+ """
165
+ simulation_copy = copy.copy(self)
166
+ simulation_copy.state = copy.deepcopy(self.state)
167
+ return simulation_copy
src_code_for_reproducibility/markov_games/ipd/ipd_statistics.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ File: mllm/markov_games/ipd/ipd_statistics.py
3
+ Summary: Computes statistics and summaries for IPD experiments.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from typing import Callable, Dict, List, Tuple
9
+
10
+ from mllm.markov_games.rollout_tree import SimulationStepLog
11
+
12
+
13
+ def avg_reward(sl: SimulationStepLog) -> List[Tuple[str, float]]:
14
+ for aid in sl.rewards.keys():
15
+ if "buffer" in str(aid) and "live" not in str(aid):
16
+ return None
17
+ # One value per agent at each step
18
+ rewards_dict = {f"reward-{aid}": float(v) for aid, v in (sl.rewards or {}).items()}
19
+ return [(key, value) for key, value in rewards_dict.items() if value is not None]
20
+
21
+
22
+ stat_functs: list[Callable[[SimulationStepLog], List[Tuple[str, float]]]] = [
23
+ avg_reward,
24
+ ]
src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_simulation.cpython-312.pyc ADDED
Binary file (10.7 kB). View file