Add files using upload-large-folder tool

Browse files

Files changed (14) hide show

seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors +3 -0
seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors +3 -0
seed_0/agent_trainer/critic_optimizer_state.pt +3 -0
seed_0/agent_trainer/policy_optimizer_state.pt +3 -0
seed_0/agent_trainer/trainer_annealing_state.pkl +3 -0
seed_0/random_state.pkl +3 -0
src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/ipd/Ipd_hard_coded_agents.py +76 -0
src_code_for_reproducibility/markov_games/ipd/ipd_simulation.py +167 -0
src_code_for_reproducibility/markov_games/ipd/ipd_statistics.py +24 -0
src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_simulation.cpython-312.pyc +0 -0

seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:c9367e046f903df9758d68c6a7974aa895c7a0420593b57b19ecc2dbba2f0151
+size 323014168

seed_0/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:50cfa136e5499e5b1f83c90753b519572d60a378c94d09953a2738af6a8ae3c1
+size 323014168

seed_0/agent_trainer/critic_optimizer_state.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1574fdb90735a922b09c67d07f7abdbd51181f00dc7bed878cb80adb5f50c1d
+size 2631

seed_0/agent_trainer/policy_optimizer_state.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:74a02f441a2fd0d9b186be5ff1f58d5f55d857a5090d965e3582013d506ab097
+size 646269121

seed_0/agent_trainer/trainer_annealing_state.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f3267ed357717fc3937f1937bcad74ad2892832d4bc8895042213f7e98312ee4
+size 104

seed_0/random_state.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0f75e413aca28c103f96a10ed73eaa7a25ad24ec074eba477ae9594dfc42ebd4
+size 12176

src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc ADDED Viewed

Binary file (5.43 kB). View file

src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-312.pyc ADDED Viewed

Binary file (6.23 kB). View file

src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-312.pyc ADDED Viewed

Binary file (10.2 kB). View file

src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-312.pyc ADDED Viewed

Binary file (4.25 kB). View file

src_code_for_reproducibility/markov_games/ipd/Ipd_hard_coded_agents.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""
+File: mllm/markov_games/ipd/Ipd_hard_coded_agents.py
+Summary: Contains hand-crafted IPD policies used as deterministic baselines.
+"""
+from dataclasses import dataclass
+from typing import Any, Tuple
+from mllm.markov_games.ipd.ipd_agent import IPDAgent
+from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
+@dataclass
+class AlwaysCooperateIPDAgent(IPDAgent):
+    """Scripted IPD baseline whose every move is the cooperate action."""
+    async def act(self, observation) -> Tuple[Any, AgentActLog]:
+        """
+        Emit `self.cooperate_string` regardless of what the co-agent did.
+        Only `observation.round_nb` is read, to keep the stored round number in sync.
+        Returns:
+            The cooperate string, plus an AgentActLog wrapping the single chat turn
+            recorded for this move.
+        """
+        move = self.cooperate_string
+        history = self.state.chat_history
+        # Record the move as an assistant turn so the history has the same shape as other agents'.
+        history.append(
+            ChatTurn(
+                agent_id=self.agent_id,
+                role="assistant",
+                content=f"Playing cooperate: {move}",
+                is_state_end=True,
+            )
+        )
+        log = AgentActLog(chat_turns=[history[-1]], info=None)
+        # Bookkeeping: the chat counter follows the history length, the round mirrors the observation.
+        self.state.chat_counter = len(history)
+        self.state.round_nb = observation.round_nb
+        return move, log
+@dataclass
+class AlwaysDefectIPDAgent(IPDAgent):
+    """Scripted IPD baseline whose every move is the defect action."""
+    async def act(self, observation) -> Tuple[Any, AgentActLog]:
+        """
+        Emit `self.defect_string` regardless of what the co-agent did.
+        Only `observation.round_nb` is read, to keep the stored round number in sync.
+        Returns:
+            The defect string, plus an AgentActLog wrapping the single chat turn
+            recorded for this move.
+        """
+        move = self.defect_string
+        history = self.state.chat_history
+        # Record the move as an assistant turn so the history has the same shape as other agents'.
+        history.append(
+            ChatTurn(
+                agent_id=self.agent_id,
+                role="assistant",
+                content=f"Playing defect: {move}",
+                is_state_end=True,
+            )
+        )
+        log = AgentActLog(chat_turns=[history[-1]], info=None)
+        # Bookkeeping: the chat counter follows the history length, the round mirrors the observation.
+        self.state.chat_counter = len(history)
+        self.state.round_nb = observation.round_nb
+        return move, log

src_code_for_reproducibility/markov_games/ipd/ipd_simulation.py ADDED Viewed

	@@ -0,0 +1,167 @@

+"""
+File: mllm/markov_games/ipd/ipd_simulation.py
+Summary: Runs Iterated Prisoner's Dilemma simulations under the Markov-game API.
+"""
+import copy
+import random
+from dataclasses import dataclass
+from typing import Any, Dict, List, Optional, Tuple
+import numpy as np
+from mllm.markov_games.markov_game import Simulation
+from mllm.markov_games.rollout_tree import SimulationStepLog
+from mllm.utils.get_coagent_id import get_coagent_id
+@dataclass
+class IPDState:
+    """
+    Mutable bookkeeping for one Iterated Prisoner's Dilemma game.
+    """
+    # Number of rounds played so far; starts at zero.
+    round_nb: int = 0
+    # Episode-finished flag; defaults to False.
+    done: bool = False
+    # Actions submitted in the most recent round, keyed by agent id; None before any round is played.
+    last_moves: Optional[Dict[str, str]] = None
+@dataclass
+class IPDObs:
+    """
+    What a single agent observes in the Iterated Prisoner's Dilemma game.
+    """
+    # Round counter at the time the observation is built.
+    round_nb: int
+    # The other agent's action from the previous round, or None when there is no previous round.
+    last_coagent_move: Optional[str]
+class IPD(Simulation):
+    """
+    Two-player Iterated Prisoner's Dilemma simulation.
+    In each round of the game, the two agents simultaneously choose to either cooperate (C) or defect (D).
+    The payoffs are as follows:
+    - If both cooperate: both receive `reward` (usually 3 points)
+    - If both defect: both receive `punishment` (usually 1 point)
+    - If one cooperates and one defects: the defector receives `temptation` (usually 5 points)
+      and the cooperator receives `sucker` (usually 0 points)
+    An action that is not listed in `cooperate_actions` is scored as a defection.
+    The game is played for `rounds_per_game` rounds.
+    """
+    def __init__(
+        self,
+        agent_ids: List[str],
+        agent_names: List[str],
+        seed: int,
+        rounds_per_game: int,
+        reward: float,  # Both cooperate
+        punishment: float,  # Both defect
+        temptation: float,  # Defector's reward when other cooperates
+        sucker: float,  # Cooperator's reward when other defects
+        cooperate_actions: List[str],
+        defect_actions: List[str],
+    ):
+        """
+        Store the game configuration and start from a fresh IPDState.
+        Only the first two entries of `agent_ids` take part in `step`.
+        `agent_names` and `seed` are stored but not otherwise used by the code in this class.
+        """
+        self.agent_ids = agent_ids
+        self.agent_names = agent_names
+        self.seed = seed
+        self.rounds_per_game = rounds_per_game
+        self.reward = reward
+        self.punishment = punishment
+        self.temptation = temptation
+        self.sucker = sucker
+        self.cooperate_actions = cooperate_actions
+        self.defect_actions = defect_actions
+        self.state = IPDState()
+    def _normalize_action(self, action: str) -> str:
+        """Map a raw action to "C" or "D"."""
+        # Anything that is not a cooperate action is scored as a defection: members of
+        # `defect_actions` and unrecognized ("gibberish") actions are treated alike.
+        return "C" if action in self.cooperate_actions else "D"
+    def step(self, actions: Dict[str, str]) -> Tuple[bool, SimulationStepLog]:
+        """
+        Play one round using the provided actions.
+        Args:
+            actions (dict): Maps agent identifiers to raw action strings. Each action is
+                normalized to 'C' or 'D' before the payoff matrix is applied.
+        Returns:
+            done (bool): True once `rounds_per_game` rounds have been played.
+            step_log (SimulationStepLog): This round's reward per agent, with the normalized
+                actions stored under info["actions"].
+        Raises:
+            KeyError: If `actions` has no entry for one of the two agents.
+        """
+        first_id, second_id = self.agent_ids[0], self.agent_ids[1]
+        first_move = self._normalize_action(actions[first_id])
+        second_move = self._normalize_action(actions[second_id])
+        # Payoff matrix: (first agent's move, second agent's move) -> (first payoff, second payoff)
+        payoffs = {
+            ("C", "C"): (self.reward, self.reward),
+            ("C", "D"): (self.sucker, self.temptation),
+            ("D", "C"): (self.temptation, self.sucker),
+            ("D", "D"): (self.punishment, self.punishment),
+        }
+        first_payoff, second_payoff = payoffs[(first_move, second_move)]
+        round_rewards = {first_id: first_payoff, second_id: second_payoff}
+        # Update game state; the raw (un-normalized) actions are what agents observe next round
+        self.state.round_nb += 1
+        self.state.last_moves = copy.deepcopy(actions)
+        done = self.state.round_nb >= self.rounds_per_game
+        step_log = SimulationStepLog(
+            rewards=round_rewards,
+            info={"actions": {first_id: first_move, second_id: second_move}},
+        )
+        return done, step_log
+    def get_obs(self):
+        """Returns all agent observations in dict
+        Returns:
+            observations: maps every agent id in `agent_ids` to its IPDObs
+        """
+        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}
+    def get_obs_agent(self, agent_id):
+        """Returns observation for agent_id: the round counter and the co-agent's last raw action"""
+        last_coagent_move = None
+        # Before the first round there are no moves to report.
+        if self.state.last_moves is not None:
+            other_id = get_coagent_id(self.agent_ids, agent_id)
+            last_coagent_move = self.state.last_moves[other_id]
+        return IPDObs(round_nb=self.state.round_nb, last_coagent_move=last_coagent_move)
+    def reset(self):
+        """Replaces the state with a fresh IPDState and returns the initial observations"""
+        self.state = IPDState()
+        return self.get_obs()
+    def get_safe_copy(self):
+        """
+        Return a safe copy of the simulation.
+        The copy is shallow (configuration attributes are shared with the original),
+        except for `state`, which is deep-copied so the two simulations evolve independently.
+        """
+        simulation_copy = copy.copy(self)
+        simulation_copy.state = copy.deepcopy(self.state)
+        return simulation_copy

src_code_for_reproducibility/markov_games/ipd/ipd_statistics.py ADDED Viewed

	@@ -0,0 +1,24 @@

+"""
+File: mllm/markov_games/ipd/ipd_statistics.py
+Summary: Computes statistics and summaries for IPD experiments.
+"""
+from __future__ import annotations
+from typing import Callable, Dict, List, Tuple
+from mllm.markov_games.rollout_tree import SimulationStepLog
+def avg_reward(sl: SimulationStepLog) -> List[Tuple[str, float]] | None:
+    """
+    Extract the per-agent rewards of one simulation step as named statistics.
+    Args:
+        sl: Step log whose `rewards` maps agent ids to numeric rewards (may be None or empty).
+    Returns:
+        None when any agent id contains "buffer" but not "live"; the whole step is then
+        skipped (presumably such ids mark non-live/replayed agents -- confirm against the caller).
+        Otherwise a list of ("reward-<agent id>", reward as float) pairs, one per agent;
+        the list is empty when the step carries no rewards.
+    """
+    # Guard once so a missing rewards mapping yields an empty result instead of raising.
+    rewards = sl.rewards or {}
+    if any("buffer" in str(aid) and "live" not in str(aid) for aid in rewards):
+        return None
+    # One value per agent at each step
+    return [(f"reward-{aid}", float(value)) for aid, value in rewards.items()]
+# Registry of per-step statistic functions; each takes a SimulationStepLog and yields (name, value) pairs.
+stat_functs: list[Callable[[SimulationStepLog], List[Tuple[str, float]]]] = [avg_reward]

src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_simulation.cpython-312.pyc ADDED Viewed

Binary file (10.7 kB). View file