Add files using upload-large-folder tool

Browse files

Files changed (14) hide show

seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors +3 -0
seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors +3 -0
seed_42/agent_trainer/critic_optimizer_state.pt +3 -0
seed_42/agent_trainer/policy_optimizer_state.pt +3 -0
seed_42/agent_trainer/trainer_annealing_state.pkl +3 -0
seed_42/random_state.pkl +3 -0
src_code_for_reproducibility/markov_games/__pycache__/linear_runner.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py +176 -0
src_code_for_reproducibility/markov_games/negotiation/no_press_nego_agent.py +108 -0
src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py +182 -0
src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc +0 -0
src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc +0 -0
src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-312.pyc +0 -0
src_code_for_reproducibility/training/__pycache__/__init__.cpython-312.pyc +0 -0

seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e6e2c070d5b214399e5b5b2c59952896c83984ef5e9785cd825b34b193d318f
+size 323014168

seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a2810cb0ec24072033412e5ff181e51188612e95b9f1685f9177794aa66a8bc0
+size 323014168

seed_42/agent_trainer/critic_optimizer_state.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f1574fdb90735a922b09c67d07f7abdbd51181f00dc7bed878cb80adb5f50c1d
+size 2631

seed_42/agent_trainer/policy_optimizer_state.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e09a8f7a1cba58eee3b92c4c63eb09713d7bb2e9c1248bda1479bab99de86429
+size 646269121

seed_42/agent_trainer/trainer_annealing_state.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9e1ece3508808aa0372885bc9aafb57c945a1aa92d15785b25ba6ae0f7fe9860
+size 104

seed_42/random_state.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da5363953b9a73a6aef9243748650aa4dfb203e5208ae92c87b3735e89bfa42c
+size 12254

src_code_for_reproducibility/markov_games/__pycache__/linear_runner.cpython-312.pyc ADDED Viewed

Binary file (1.64 kB). View file

src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py ADDED Viewed

	@@ -0,0 +1,176 @@

+"""
+File: mllm/markov_games/negotiation/dond_simulation.py
+Summary: Simulates Deal-or-No-Deal negotiation games and logs rollouts.
+"""
+import copy
+from dataclasses import dataclass
+from typing import Any, Dict, List, Tuple
+from numpy.random import default_rng
+from mllm.markov_games.negotiation.nego_simulation import (
+    NegotiationObs,
+    NegotiationSimulation,
+    NegotiationState,
+    Split,
+)
+from mllm.markov_games.rollout_tree import SimulationStepLog
+from mllm.utils.get_coagent_id import get_coagent_id
+AgentId = str
@dataclass
class DealNoDealState(NegotiationState):
    """Negotiation state extended with the Deal-or-No-Deal item list and value tables."""

    # Names of the item types in play (the simulation defaults to books/hats/balls).
    item_types: List[str]
    # Value tables: agent id -> {item type -> integer value per unit}.
    values: Dict[AgentId, Dict[str, int]]
@dataclass
class DealNoDealObs(NegotiationObs):
    """Per-agent Deal-or-No-Deal observation: the base observation plus value tables."""

    # The observing agent's own value per unit of each item type.
    my_values: Dict[str, int]
    # Names of the item types in play.
    item_types: List[str]
    # The co-agent's value table, or None when not provided.
    # NOTE(review): DealNoDealSimulation.get_obs_agent fills this with a copy of
    # the co-agent's *current* values rather than lagged ones; confirm intent.
    previous_values_coagent: Dict[str, int] | None
def random_partition_integer(rng, total: int, parts: int) -> List[int]:
    """Split ``total`` into ``parts`` random non-negative integers that sum to it.

    Args:
        rng: Random generator exposing ``integers(low, high, size=...)`` whose
            result supports ``tolist()`` (e.g. a NumPy ``Generator``).
        total: The amount to distribute.
        parts: The number of buckets to distribute it over.

    Returns:
        A list of ``parts`` bucket sizes. Empty when ``parts <= 0``; all zeros
        when ``total <= 0`` (no random numbers are drawn in either case).
    """
    if parts <= 0:
        return []
    if total <= 0:
        return [0] * parts

    # Draw parts-1 cut points in [0, total]; consecutive gaps between the
    # sorted cut points (framed by 0 and total) are the bucket sizes.
    boundaries = rng.integers(0, total + 1, size=parts - 1).tolist()
    boundaries.sort()
    boundaries.append(total)
    lower_edges = [0] + boundaries[:-1]
    return [upper - lower for lower, upper in zip(lower_edges, boundaries)]
class DealNoDealSimulation(NegotiationSimulation):
    """NegotiationSimulation variant implementing the Rubinstein-style Deal-or-No-Deal.

    Every agent gets an integer value table over the item types that sums to
    10, and a stock of 5 to 7 items is drawn for each round. A round pays out
    only when the two agents' claimed splits add up exactly to the stock.
    """

    def __init__(
        self,
        item_types: List[str] | None = None,
        *args,
        **kwargs,
    ):
        """Create the simulation and immediately reset it.

        Args:
            item_types: Names of the item types; defaults to
                ``["books", "hats", "balls"]`` when omitted or ``None``.
            *args: Forwarded unchanged to ``NegotiationSimulation.__init__``.
            **kwargs: Forwarded unchanged to ``NegotiationSimulation.__init__``.
        """
        # Build the default per instance: a list literal in the signature is a
        # single object shared (and mutable) across every simulation.
        if item_types is None:
            item_types = ["books", "hats", "balls"]
        super().__init__(*args, item_types=item_types, **kwargs)
        self.reset()

    def _other(self, agent_id: AgentId) -> AgentId:
        """Return the id of the agent playing against ``agent_id``."""
        return get_coagent_id(self.agent_ids, agent_id)

    def _sample_stock(self) -> Dict[str, int]:
        """Draw a stock of 5 to 7 items spread randomly over the item types.

        Individual item types may end up with a count of zero.
        """
        # integers() excludes the upper bound, so this yields 5, 6 or 7.
        total_items = int(self.rng.integers(5, 8))
        counts = random_partition_integer(
            self.rng, total_items, len(self.item_types)
        )
        return {item: int(count) for item, count in zip(self.item_types, counts)}

    def _sample_values_pair(self) -> Dict[AgentId, Dict[str, int]]:
        """Rejection-sample value tables for the first two agents.

        Each table holds non-negative integers summing to 10. A draw is
        accepted only if every item type is valued by at least one agent and
        at least one item type is valued by both.
        """
        n_types = len(self.item_types)
        while True:
            # Draw order (first agent, then second) is part of the RNG stream.
            raw_first = random_partition_integer(self.rng, 10, n_types)
            raw_second = random_partition_integer(self.rng, 10, n_types)
            first = {t: int(v) for t, v in zip(self.item_types, raw_first)}
            second = {t: int(v) for t, v in zip(self.item_types, raw_second)}
            nobody_ignored = all(
                first[t] > 0 or second[t] > 0 for t in self.item_types
            )
            shared_interest = any(
                first[t] > 0 and second[t] > 0 for t in self.item_types
            )
            if nobody_ignored and shared_interest:
                return {self.agent_ids[0]: first, self.agent_ids[1]: second}

    def _is_valid_allocation(
        self, allocation: Dict[str, int], stock: Dict[str, int]
    ) -> bool:
        """Check that ``allocation`` gives an in-stock integer count for every item type.

        Returns:
            ``False`` if any item type is missing, maps to ``None`` or a
            non-``int``, is negative, or exceeds the stock (item types absent
            from ``stock`` count as zero); ``True`` otherwise.
        """
        for item in self.item_types:
            amount = allocation.get(item)
            if amount is None or not isinstance(amount, int):
                return False
            if amount < 0 or amount > int(stock.get(item, 0)):
                return False
        return True

    def set_new_round_of_variant(self):
        """Start a new round: keep the value tables, resample the stock."""
        self.state.quantities = self._sample_stock()

    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        """Return deep copies of the state's quantities, values and splits.

        ``actions`` is accepted for interface compatibility and not used.
        """
        return {
            key: copy.deepcopy(getattr(state, key))
            for key in ("quantities", "values", "splits")
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Score the two agents' splits against the current stock.

        For every item type the two ``items_given_to_self`` amounts must add
        up exactly to the stock; otherwise both agents receive 0. When all
        item types are complementary, each agent earns the sum over item
        types of (amount kept) x (own value).
        """
        first, second = self.agent_ids[0], self.agent_ids[1]
        kept_first = splits[first].items_given_to_self
        kept_second = splits[second].items_given_to_self
        rewards = {first: 0, second: 0}
        for item in self.item_types:
            # A single non-complementary item type voids the whole deal.
            if kept_first[item] + kept_second[item] != self.state.quantities[item]:
                return {first: 0, second: 0}
            rewards[first] += kept_first[item] * self.state.values[first][item]
            rewards[second] += kept_second[item] * self.state.values[second][item]
        return rewards

    def get_obs(self):
        """Return a mapping of agent id to that agent's observation."""
        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """Build the ``DealNoDealObs`` seen by ``agent_id`` from the current state."""
        other_id = self._other(agent_id)
        # NOTE(review): the co-agent's *current* values are exposed here under
        # the name ``previous_values_coagent``; confirm this is intended.
        coagent_values = copy.deepcopy(self.state.values.get(other_id, {}))
        return DealNoDealObs(
            round_nb=self.state.round_nb,
            last_message=self.state.last_message,
            current_agent=self.state.current_agent,
            quantities=copy.deepcopy(self.state.quantities),
            value=0.0,  # unused in DOND
            other_agent_split=None,  # not meaningful until split
            split_phase=self.state.split_phase,
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            my_values=copy.deepcopy(self.state.values[agent_id]),
            item_types=list(self.item_types),
            previous_values_coagent=coagent_values,
        )

    def reset(self):
        """Sample a fresh stock and value tables, rebuild the state, return observations."""
        start_agent = self.agent_ids[self._starting_agent_index]
        # Stock is drawn before values; the order is part of the RNG stream.
        stock = self._sample_stock()
        values = self._sample_values_pair()
        self.state = DealNoDealState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities=stock,
            values=values,
            previous_values=None,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            split_phase=False,
            item_types=list(self.item_types),
        )
        return self.get_obs()

src_code_for_reproducibility/markov_games/negotiation/no_press_nego_agent.py ADDED Viewed

	@@ -0,0 +1,108 @@

+"""
+File: mllm/markov_games/negotiation/no_press_nego_agent.py
+Summary: Agent variant for no-press negotiations without explicit messaging.
+"""
+from typing import Any, Dict, List, Tuple
+from mllm.markov_games.negotiation.nego_agent import (
+    NegotiationAgent,
+    NegotiationAgentState,
+)
+from mllm.markov_games.negotiation.nego_simulation import Split
+from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs
+from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
class NoPressAgent(NegotiationAgent):
    """Negotiation agent for the no-press variant: no chat, only split proposals."""

    def __init__(self, *args, **kwargs):
        """Initialise the base agent, then install the no-press prompt templates."""
        super().__init__(*args, **kwargs)
        # No communication in this variant
        self.intro_prompt = (
            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
            "Setup:\n"
            "1. The game consists of multiple independent rounds.\n"
            "2. In each round, there are multiple items to split between the two agents.\n"
            "3. Both agents are assigned a per-item value between 1 and 20 (inclusive) in each round.\n"
            "4. You can observe per-item values of both agents.\n"
            "5. Because assignments are random, both agents are equally likely to have same expected per-item value.\n"
            "\nProtocol:\n"
            "1. Both agents simultaneously propose the amount of each item they will keep.\n"
            "2. If the total sum of proposals is less than or equal to the item quantity, both agents receive their proposed amounts.\n"
            "3. If the total sum of proposals exceeds the item quantity, they are allocated proportionally.\n"
            "4. Your points for the round = (amount you receive per item) x (your per-item value for that round), added across all items.\n"
            "5. Points are accumulated across rounds.\n"
            "Your goal: {goal}\n"
        )
        self.new_round_prompt = (
            "A New Round Begins\n"
            "The items to split are {quantities}.\n"
            "Your per-item values are {value} and {other_agent}'s per-item values are  {other_value}."
        )
        self.last_round_prompt = (
            "Last Round Summary:\n"
            "   - Items to split: {last_quantities}\n"
            "   - Your per-item values: {last_value_agent}\n"
            "   - {other_agent}'s per-item values: {last_value_coagent}\n"
            "   - You proposed: {last_split_agent}\n"
            "   - You earned: {last_points_agent} points\n"
            "   - {other_agent} proposed: {last_split_coagent}\n"
            "   - {other_agent} earned: {last_points_coagent} points\n"
            "   - Round Complete.\n"
        )
        self.send_split_prompt = "Submit Your Proposal\nRespond as {proposal_style}"

    @staticmethod
    def _singular_or_plural_alternation(items) -> str:
        """Build a regex alternation accepting each item name with or without a final "s"."""
        stems = [name[:-1] if name.endswith("s") else name for name in items]
        return "|".join(f"{stem}s?" for stem in stems)

    def get_message_regex(self, observation: NoPressObs) -> str:
        """Return a pattern matching only the empty string: chat is disabled here."""
        return r"^$"  # No messages allowed

    def get_split_regex(self, observation: NoPressObs) -> str:
        """Return the case-insensitive pattern for ``Proposal: 4 coins, 6 apples``-style text.

        Item names come from ``observation.quantities``; each amount must be
        an integer from 0 to 10.
        """
        # Accept both singular and plural forms
        alternation = self._singular_or_plural_alternation(
            list(observation.quantities.keys())
        )
        return rf"(?i)Proposal:\s*((?:\s*(?P<num>(10|[0-9]))\s*(?P<item>{alternation})\s*,?)+)"

    def get_split_action(self, policy_output: str, observation: NoPressObs) -> Split:
        """Turn the model's proposal text into a ``Split``.

        Every item in ``observation.quantities`` starts at 0. If the stripped
        output matches the proposal pattern at its start, each
        ``<number> <item>`` pair found overrides that item's amount (the last
        mention wins). Singular and plural spellings map back to the
        configured item name. Non-matching output yields an all-zero split.
        """
        import re as _re

        items = list(observation.quantities.keys())
        items_given_to_self = dict.fromkeys(items, 0)

        matched = _re.match(self.get_split_regex(observation), policy_output.strip())
        if matched is None:
            return Split(items_given_to_self=items_given_to_self)

        # Lower-cased spelling -> configured item id. setdefault keeps the
        # earliest item in list order when two items share a spelling.
        canonical = {}
        for name in items:
            if name.endswith("s"):
                variant = name[:-1].lower()
            else:
                variant = name.lower() + "s"
            canonical.setdefault(name.lower(), name)
            canonical.setdefault(variant, name)

        alternation = self._singular_or_plural_alternation(items)
        inner_regex = rf"(?i)(10|[0-9])\s*({alternation})"
        # Find all (number, item) pairs inside the matched proposal list.
        for amount, mention in _re.findall(inner_regex, matched.group(1)):
            items_given_to_self[canonical.get(mention.lower())] = int(amount)
        return Split(items_given_to_self=items_given_to_self)

src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py ADDED Viewed

	@@ -0,0 +1,182 @@

+"""
+File: mllm/markov_games/negotiation/no_press_nego_simulation.py
+Summary: Simulation driver for no-press negotiation scenarios.
+"""
+import copy
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Any, Dict, List, Literal, Tuple
+from mllm.markov_games.negotiation.nego_simulation import (
+    NegotiationObs,
+    NegotiationSimulation,
+    NegotiationState,
+    Split,
+    compute_tas_style_rewards,
+)
+AgentId = str
@dataclass
class NoPressState(NegotiationState):
    """State type for no-press games; it adds no fields to ``NegotiationState``."""
@dataclass
class NoPressObs(NegotiationObs):
    """No-press observation: the base observation plus the co-agent's values."""

    # Co-agent's per-item values for the current round (item name -> value).
    other_value: Dict[str, float]
class NoPressSimulation(NegotiationSimulation):
    """Negotiation simulation without messaging: every round is a simultaneous split.

    Each round offers 10 units of every item type, and both agents' per-item
    values are resampled according to ``game_type``.
    """

    def __init__(
        self,
        game_type: Literal["10-1-exclusive", "10-1-ties", "1-to-20"] = "1-to-20",
        same_round_value: bool = True,
        atleast_one_conflict: bool = False,
        *args,
        **kwargs,
    ):
        """Store the sampling configuration, then initialise the base simulation.

        Args:
            game_type: Value template. ``"10-1-exclusive"`` gives one agent 10
                and the other 1 per item; ``"10-1-ties"`` draws 1 or 10
                independently per agent; ``"1-to-20"`` draws an integer in
                [1, 20] independently per agent.
            same_round_value: When true, resample until all agents' values
                sum to the same total.
            atleast_one_conflict: When true, resample until at least one item
                has values that differ between agents.
            *args: Forwarded to ``NegotiationSimulation.__init__``.
            **kwargs: Forwarded to ``NegotiationSimulation.__init__``.
        """
        # Set before super().__init__; presumably the base constructor may
        # already sample values (TODO confirm against NegotiationSimulation).
        self.game_type = game_type
        self.same_round_value = same_round_value
        self.atleast_one_conflict = atleast_one_conflict
        super().__init__(*args, **kwargs)

    def _check_sampling_is_satisfiable(self, item_types: List[str]) -> None:
        """Raise ``ValueError`` when no draw can ever satisfy the configured constraints."""
        n_items = len(set(item_types))
        if n_items == 0 and (self.same_round_value or self.atleast_one_conflict):
            raise ValueError(
                "Cannot sample values without items when same_round_value or "
                "atleast_one_conflict is enabled."
            )
        if (
            self.same_round_value
            and self.game_type == "10-1-exclusive"
            and n_items % 2 == 1
        ):
            # The two totals add up to 11 * n_items, which is odd here, so
            # they can never be equal.
            raise ValueError(
                "same_round_value cannot be satisfied for game_type "
                "'10-1-exclusive' with an odd number of items."
            )
        if self.same_round_value and self.atleast_one_conflict and n_items == 1:
            # With one item, equal totals force equal values: no conflict.
            raise ValueError(
                "same_round_value and atleast_one_conflict cannot both be "
                "satisfied with a single item."
            )

    def _sample_values(self) -> Dict[AgentId, dict]:
        """Sample per-item valuations according to the configured template.

        Draws are repeated until the ``atleast_one_conflict`` and
        ``same_round_value`` constraints (when enabled) both hold.

        Returns:
            A mapping of agent id to ``{item name: integer value}``.

        Raises:
            ValueError: If the configured constraints can never be satisfied
                for the current item list (the loop would otherwise not end).
        """
        state = getattr(self, "state", None)
        if state is None:
            # Lower-case exactly as _sample_quantities does, so value keys
            # agree with quantity keys on the very first sample as well.
            item_types = [item.lower() for item in self.item_types]
        else:
            item_types = list(state.quantities.keys())
        self._check_sampling_is_satisfiable(item_types)

        values = defaultdict(dict)
        while True:
            for item in item_types:
                if self.game_type == "10-1-exclusive":
                    v = int(self.rng.choice([1, 10]))
                    values[self.agent_ids[0]][item] = v
                    values[self.agent_ids[1]][item] = 10 if v == 1 else 1
                elif self.game_type == "10-1-ties":
                    for aid in self.agent_ids:
                        values[aid][item] = int(self.rng.choice([1, 10]))
                elif self.game_type == "1-to-20":
                    for aid in self.agent_ids:
                        # integers() excludes the upper bound: 1..20.
                        values[aid][item] = int(self.rng.integers(1, 21))
            if self.atleast_one_conflict:
                has_conflict = any(
                    len({values[aid][item] for aid in self.agent_ids}) > 1
                    for item in item_types
                )
                if not has_conflict:
                    continue
            totals = {sum(table.values()) for table in values.values()}
            if len(totals) == 1 or not self.same_round_value:
                break
        return values

    def _sample_quantities(self) -> Dict[str, int]:
        """No-press setups use symmetric 10-unit stocks for every item."""
        return {item.lower(): 10 for item in self.item_types}

    def set_new_round_of_variant(self):
        """Refresh quantities/values and jump directly into the simultaneous split."""
        self.state.quantities = self._sample_quantities()
        self.state.values = self._sample_values()
        self.state.split_phase = True

    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        """Return deep copies of the state's quantities, values and splits.

        ``actions`` is accepted for interface compatibility and not used.
        """
        return {
            "quantities": copy.deepcopy(state.quantities),
            "values": copy.deepcopy(state.values),
            "splits": copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Delegate scoring to ``compute_tas_style_rewards`` with the current values and stock."""
        return compute_tas_style_rewards(
            self.agent_ids, self.state.values, splits, self.state.quantities
        )

    def get_obs(self):
        """Return a mapping of agent id to that agent's observation."""
        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """Build the ``NoPressObs`` seen by ``agent_id``.

        The ``last_*`` fields describe the previous round and are ``None``
        while the corresponding ``previous_*`` state attribute is ``None``.
        Points are rounded to one decimal.
        """
        other_id = self._other(agent_id)
        state = self.state

        last_value_agent = None
        last_value_coagent = None
        if state.previous_values is not None:
            last_value_agent = state.previous_values.get(agent_id)
            last_value_coagent = state.previous_values.get(other_id)

        last_points_agent = None
        last_points_coagent = None
        if state.previous_points is not None:
            last_points_coagent = round(state.previous_points.get(other_id), 1)
            last_points_agent = round(state.previous_points.get(agent_id), 1)

        last_split_agent = None
        last_split_coagent = None
        if state.previous_splits is not None:
            last_split_coagent = state.previous_splits[other_id].items_given_to_self
            last_split_agent = state.previous_splits[agent_id].items_given_to_self

        return NoPressObs(
            round_nb=state.round_nb,
            last_message="",
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            current_agent=state.current_agent,
            other_agent=self.agent_id_to_name[other_id],
            quantities=state.quantities,
            item_types=self.item_types,
            value=state.values[agent_id],
            split_phase=state.split_phase,
            last_split_agent=last_split_agent,
            last_value_agent=last_value_agent,
            last_points_agent=last_points_agent,
            last_split_coagent=last_split_coagent,
            last_value_coagent=last_value_coagent,
            last_points_coagent=last_points_coagent,
            other_value=state.values[other_id],
            last_quantities=state.previous_quantities,
        )

    def reset(self):
        """Sample fresh quantities and values, rebuild the state in split phase, return observations."""
        start_agent = self.agent_ids[self._starting_agent_index]
        quantities = self._sample_quantities()
        values = self._sample_values()
        self.state = NoPressState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities=quantities,
            values=values,
            previous_values=None,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            split_phase=True,
            previous_splits=None,
            previous_points=None,
            previous_quantities=None,
        )
        return self.get_obs()

src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (269 Bytes). View file

src_code_for_reproducibility/models/__pycache__/inference_backend.cpython-312.pyc ADDED Viewed

Binary file (2.38 kB). View file

src_code_for_reproducibility/models/__pycache__/scalar_critic.cpython-312.pyc ADDED Viewed

Binary file (3.32 kB). View file

src_code_for_reproducibility/training/__pycache__/__init__.cpython-312.pyc ADDED Viewed

Binary file (277 Bytes). View file