Add files using upload-large-folder tool
Browse files- seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors +3 -0
- seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors +3 -0
- seed_0/agent_trainer/critic_optimizer_state.pt +3 -0
- seed_0/agent_trainer/policy_optimizer_state.pt +3 -0
- seed_0/agent_trainer/trainer_annealing_state.pkl +3 -0
- seed_0/random_state.pkl +3 -0
- src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc +0 -0
- src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-312.pyc +0 -0
- src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-312.pyc +0 -0
- src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-312.pyc +0 -0
- src_code_for_reproducibility/markov_games/ipd/Ipd_hard_coded_agents.py +76 -0
- src_code_for_reproducibility/markov_games/ipd/ipd_simulation.py +167 -0
- src_code_for_reproducibility/markov_games/ipd/ipd_statistics.py +24 -0
- src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_simulation.cpython-312.pyc +0 -0
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c9367e046f903df9758d68c6a7974aa895c7a0420593b57b19ecc2dbba2f0151
|
| 3 |
+
size 323014168
|
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50cfa136e5499e5b1f83c90753b519572d60a378c94d09953a2738af6a8ae3c1
|
| 3 |
+
size 323014168
|
seed_0/agent_trainer/critic_optimizer_state.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1574fdb90735a922b09c67d07f7abdbd51181f00dc7bed878cb80adb5f50c1d
|
| 3 |
+
size 2631
|
seed_0/agent_trainer/policy_optimizer_state.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74a02f441a2fd0d9b186be5ff1f58d5f55d857a5090d965e3582013d506ab097
|
| 3 |
+
size 646269121
|
seed_0/agent_trainer/trainer_annealing_state.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f3267ed357717fc3937f1937bcad74ad2892832d4bc8895042213f7e98312ee4
|
| 3 |
+
size 104
|
seed_0/random_state.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f75e413aca28c103f96a10ed73eaa7a25ad24ec074eba477ae9594dfc42ebd4
|
| 3 |
+
size 12176
|
src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc
ADDED
|
Binary file (5.43 kB). View file
|
|
|
src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-312.pyc
ADDED
|
Binary file (6.23 kB). View file
|
|
|
src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-312.pyc
ADDED
|
Binary file (10.2 kB). View file
|
|
|
src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-312.pyc
ADDED
|
Binary file (4.25 kB). View file
|
|
|
src_code_for_reproducibility/markov_games/ipd/Ipd_hard_coded_agents.py
ADDED
|
@@ -0,0 +1,76 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/ipd/Ipd_hard_coded_agents.py
|
| 3 |
+
Summary: Contains hand-crafted IPD policies used as deterministic baselines.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from dataclasses import dataclass
|
| 7 |
+
from typing import Any, Tuple
|
| 8 |
+
|
| 9 |
+
from mllm.markov_games.ipd.ipd_agent import IPDAgent
|
| 10 |
+
from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class AlwaysCooperateIPDAgent(IPDAgent):
    """Hard-coded IPD baseline that plays cooperate on every turn."""

    async def act(self, observation) -> Tuple[Any, AgentActLog]:
        """
        Play the cooperate action regardless of what was observed.

        The observation is only read for its ``round_nb``, which is mirrored
        into the agent state; it never influences the chosen action.

        Returns:
            A pair ``(action, act_log)`` where ``action`` is
            ``self.cooperate_string`` and ``act_log`` carries the single
            chat turn recorded for this move.
        """
        action = self.cooperate_string

        # Record one assistant turn so the log has the same shape as other agents'.
        history = self.state.chat_history
        history.append(
            ChatTurn(
                agent_id=self.agent_id,
                role="assistant",
                content=f"Playing cooperate: {action}",
                is_state_end=True,
            )
        )
        act_log = AgentActLog(chat_turns=[history[-1]], info=None)

        # Keep the bookkeeping counters in sync with the history and the round.
        self.state.chat_counter = len(self.state.chat_history)
        self.state.round_nb = observation.round_nb

        return action, act_log
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
@dataclass
class AlwaysDefectIPDAgent(IPDAgent):
    """Hard-coded IPD baseline that plays defect on every turn."""

    async def act(self, observation) -> Tuple[Any, AgentActLog]:
        """
        Play the defect action regardless of what was observed.

        The observation is only read for its ``round_nb``, which is mirrored
        into the agent state; it never influences the chosen action.

        Returns:
            A pair ``(action, act_log)`` where ``action`` is
            ``self.defect_string`` and ``act_log`` carries the single
            chat turn recorded for this move.
        """
        action = self.defect_string

        # Record one assistant turn so the log has the same shape as other agents'.
        history = self.state.chat_history
        history.append(
            ChatTurn(
                agent_id=self.agent_id,
                role="assistant",
                content=f"Playing defect: {action}",
                is_state_end=True,
            )
        )
        act_log = AgentActLog(chat_turns=[history[-1]], info=None)

        # Keep the bookkeeping counters in sync with the history and the round.
        self.state.chat_counter = len(self.state.chat_history)
        self.state.round_nb = observation.round_nb

        return action, act_log
|
src_code_for_reproducibility/markov_games/ipd/ipd_simulation.py
ADDED
|
@@ -0,0 +1,167 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/ipd/ipd_simulation.py
|
| 3 |
+
Summary: Runs Iterated Prisoner's Dilemma simulations under the Markov-game API.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import copy
|
| 7 |
+
import random
|
| 8 |
+
from dataclasses import dataclass
|
| 9 |
+
from typing import Any, Dict, List, Optional, Tuple
|
| 10 |
+
|
| 11 |
+
import numpy as np
|
| 12 |
+
|
| 13 |
+
from mllm.markov_games.markov_game import Simulation
|
| 14 |
+
from mllm.markov_games.rollout_tree import SimulationStepLog
|
| 15 |
+
from mllm.utils.get_coagent_id import get_coagent_id
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
@dataclass
class IPDState:
    """Mutable state of one Iterated Prisoner's Dilemma game."""

    # Number of rounds played so far.
    round_nb: int = 0
    # Episode-finished flag; starts as False.
    done: bool = False
    # Actions submitted in the most recent round, keyed by agent id
    # (None until the first round has been played).
    last_moves: Optional[Dict[str, str]] = None
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@dataclass
class IPDObs:
    """What a single agent observes in the Iterated Prisoner's Dilemma."""

    # Number of rounds played so far at the time of the observation.
    round_nb: int
    # The co-agent's action from the previous round (None before any round).
    last_coagent_move: Optional[str]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
class IPD(Simulation):
    """
    Two-agent Iterated Prisoner's Dilemma simulation.

    In each round of the game, two agents simultaneously choose to either cooperate (C) or defect (D).
    The payoffs are as follows:
    - If both cooperate: Both receive the "reward" (usually 3 points)
    - If both defect: Both receive the "punishment" (usually 1 point)
    - If one cooperates and one defects: The defector receives the "temptation" (usually 5 points)
      and the cooperator receives the "sucker" payoff (usually 0 points)

    The game is played for a specified number of rounds.
    """

    def __init__(
        self,
        agent_ids: List[str],
        agent_names: List[str],
        seed: int,
        rounds_per_game: int,
        reward: float,  # Both cooperate
        punishment: float,  # Both defect
        temptation: float,  # Defector's reward when other cooperates
        sucker: float,  # Cooperator's reward when other defects
        cooperate_actions: List[str],
        defect_actions: List[str],
    ):
        """
        Store the game configuration and create a fresh state.

        Args:
            agent_ids: Identifiers of the agents; only the first two are used by ``step``.
            agent_names: Display names of the agents (stored, not used in this class).
            seed: Seed value (stored, not used in this class).
            rounds_per_game: Number of rounds after which ``step`` reports the game as done.
            reward: Payoff to each agent when both cooperate.
            punishment: Payoff to each agent when both defect.
            temptation: Payoff to a defector whose co-agent cooperates.
            sucker: Payoff to a cooperator whose co-agent defects.
            cooperate_actions: Raw action strings interpreted as "C".
            defect_actions: Raw action strings interpreted as "D".
        """
        self.agent_ids = agent_ids
        self.agent_names = agent_names
        self.seed = seed
        self.rounds_per_game = rounds_per_game
        self.reward = reward
        self.punishment = punishment
        self.temptation = temptation
        self.sucker = sucker
        self.cooperate_actions = cooperate_actions
        self.defect_actions = defect_actions
        self.state = IPDState()

    def _normalize_action(self, action: str) -> str:
        """
        Map a raw action string to "C" or "D".

        Anything that is not a recognised cooperate action counts as a
        defection, including unrecognised (gibberish) input.
        """
        return "C" if action in self.cooperate_actions else "D"

    def step(self, actions: Dict[str, str]) -> Tuple[bool, SimulationStepLog]:
        """
        Play one round using the provided actions.

        Args:
            actions (dict): Maps each agent identifier to its raw action string.
                Strings listed in ``cooperate_actions`` count as "C"; every
                other string counts as "D".

        Returns:
            done (bool): Whether the configured number of rounds has been reached.
            step_log (SimulationStepLog): The per-agent rewards of this round,
                with the normalized actions under ``info["actions"]``.

        Raises:
            KeyError: If ``actions`` lacks an entry for one of the two agents.
        """
        first_id = self.agent_ids[0]
        second_id = self.agent_ids[1]

        move0 = self._normalize_action(actions[first_id])
        move1 = self._normalize_action(actions[second_id])

        # Payoff matrix indexed by (first agent's move, second agent's move).
        payoffs = {
            ("C", "C"): [self.reward, self.reward],
            ("C", "D"): [self.sucker, self.temptation],
            ("D", "C"): [self.temptation, self.sucker],
            ("D", "D"): [self.punishment, self.punishment],
        }
        payoff0, payoff1 = payoffs[(move0, move1)]
        round_rewards = {
            first_id: payoff0,
            second_id: payoff1,
        }

        # Update game state. Note that the raw (un-normalized) actions are what
        # gets stored and later exposed through the observations.
        self.state.round_nb += 1
        self.state.last_moves = copy.deepcopy(actions)
        done = self.state.round_nb >= self.rounds_per_game
        step_log = SimulationStepLog(
            rewards=round_rewards,
            info={
                "actions": {
                    first_id: move0,
                    second_id: move1,
                }
            },
        )

        return done, step_log

    def get_obs(self):
        """Return a dict mapping every agent id to its current observation."""
        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """
        Return the observation for ``agent_id``.

        The observation holds the current round number and the co-agent's
        raw action from the previous round (``None`` before the first round).
        """
        if self.state.last_moves is not None:
            other_id = get_coagent_id(self.agent_ids, agent_id)
            last_coagent_move = self.state.last_moves[other_id]
        else:
            last_coagent_move = None
        return IPDObs(round_nb=self.state.round_nb, last_coagent_move=last_coagent_move)

    def reset(self):
        """Reset the game to a fresh state and return the initial observations."""
        self.state = IPDState()
        return self.get_obs()

    def get_safe_copy(self):
        """
        Return a copy of the simulation whose state is independent.

        The simulation object is copied shallowly (configuration attributes
        are shared) while the state is deep-copied, so stepping the copy does
        not modify this instance's state.
        """
        simulation_copy = copy.copy(self)
        simulation_copy.state = copy.deepcopy(self.state)
        return simulation_copy
|
src_code_for_reproducibility/markov_games/ipd/ipd_statistics.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/ipd/ipd_statistics.py
|
| 3 |
+
Summary: Computes statistics and summaries for IPD experiments.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
from typing import Callable, Dict, List, Tuple
|
| 9 |
+
|
| 10 |
+
from mllm.markov_games.rollout_tree import SimulationStepLog
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def avg_reward(sl: SimulationStepLog) -> List[Tuple[str, float]] | None:
    """
    Extract one ``("reward-<agent_id>", value)`` pair per agent from a step log.

    Despite the name, no averaging is done here: the function reports the
    raw per-agent reward of a single step.

    Args:
        sl: Step log whose ``rewards`` maps agent ids to numeric rewards.
            A missing (``None``) or empty mapping yields an empty list.

    Returns:
        ``None`` when any agent id contains "buffer" but not "live" (such
        steps are excluded from the statistics); otherwise a list of
        ``(key, reward)`` tuples with rewards converted to ``float``.
        Agents whose reward is ``None`` are skipped.
    """
    # Guard once so the id scan and the conversion both tolerate rewards=None.
    rewards = sl.rewards or {}

    for aid in rewards:
        agent_name = str(aid)
        if "buffer" in agent_name and "live" not in agent_name:
            return None

    # One value per agent at each step; drop None before converting, since
    # float(None) would raise.
    rewards_dict = {
        f"reward-{aid}": float(value)
        for aid, value in rewards.items()
        if value is not None
    }
    return list(rewards_dict.items())
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
# Registry of per-step statistic functions; each takes a SimulationStepLog.
# Only avg_reward is registered here.
stat_functs: list[Callable[[SimulationStepLog], List[Tuple[str, float]]]] = [
    avg_reward,
]
|
src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_simulation.cpython-312.pyc
ADDED
|
Binary file (10.7 kB). View file
|
|
|