Add files using upload-large-folder tool

Browse files

Files changed (13) hide show

seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors +1 -1
seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors +1 -1
seed_42/agent_trainer/policy_optimizer_state.pt +1 -1
seed_42/agent_trainer/trainer_annealing_state.pkl +1 -1
seed_42/random_state.pkl +1 -1
src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc +0 -0
src_code_for_reproducibility/markov_games/negotiation/README.md +3 -16
src_code_for_reproducibility/markov_games/negotiation/dond_agent.py +39 -25
src_code_for_reproducibility/markov_games/negotiation/nego_agent.py +19 -0
src_code_for_reproducibility/markov_games/negotiation/nego_hard_coded_policies.py +10 -4
src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py +14 -0
src_code_for_reproducibility/markov_games/negotiation/tas_agent.py +10 -0

seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1436ad64c7607b662e4cd395c9c37b0a0f5021552b3b73a095acc06bb533387f
 size 323014168

 version https://git-lfs.github.com/spec/v1
+oid sha256:95c2062df6f7a10adbd244c9b6e75a8153c54f776c3619732a3fa0c766be3166
 size 323014168

seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:15466bfa7faf33695c2e4470cb1a440f086f27f467fe02f2b09a20631914bab9
 size 323014168

 version https://git-lfs.github.com/spec/v1
+oid sha256:a2810cb0ec24072033412e5ff181e51188612e95b9f1685f9177794aa66a8bc0
 size 323014168

seed_42/agent_trainer/policy_optimizer_state.pt CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:4b0ce97dbf5119cad8778d4a88276b1f2a9857951d9286bafbc1889a0668f7f3
 size 646269121

 version https://git-lfs.github.com/spec/v1
+oid sha256:c9d017d2f98e71c9ed36613fc1d2e7e8daeef9aa62ee96e0e838ec293e469025
 size 646269121

seed_42/agent_trainer/trainer_annealing_state.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:79cfce2a5040c0939846d147a00d13a3f05afa3b73ce05b85fd5b5b13bf4ddcf
 size 104

 version https://git-lfs.github.com/spec/v1
+oid sha256:5468e667c6b74a7cb34fc016988230e631fc520b2df33e5a5c71068b59689f3e
 size 104

seed_42/random_state.pkl CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9bcfbe7f3d2c5bca58434c5809547ecb4c92e58ceebcb74196e41d6c6751e9ea
 size 12254

 version https://git-lfs.github.com/spec/v1
+oid sha256:03db597030fc1fe5f071eb41114416ad894895b73935ddcee0fc06e622471c8a
 size 12254

src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-312.pyc CHANGED Viewed

Binary files a/src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-312.pyc and b/src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-312.pyc differ

src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc CHANGED Viewed

Binary files a/src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc and b/src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc differ

src_code_for_reproducibility/markov_games/negotiation/README.md CHANGED Viewed

@@ -9,29 +9,16 @@ Proportional splitting is used when the two proposals exceed the available total
 ### Variants (in increasing difficulty)
 - No‑Press Split
-  - Single item type (coins)
-  - No communication; agents go straight to making split proposals, with the starting player alternating deterministically.
   - Motivation: mirrors no‑communication setups (e.g., Advantage Alignment) while keeping the split decision nontrivial.
-  - Deterministic Mode: values are fixed and public: one agent values coins at 10, the other at 1 (alternates each round).
-  - Stochastic Mode: values are random and uncorrelated.
 - Trust-and-Split RPS (TAS-RPS)
   - Single item type (coins)
   - Each round, a rock–paper–scissors hand draw creates a strong asymmetry: the winner’s per-coin value is 10, the loser’s is 1.
   - Each agent initially sees only their own hand and must communicate to coordinate an optimal split.
   - Motivation: enforce large value disparity so one’s own value reveals little about the other’s (avoiding ceiling effects) and incentivize meaningful communication.
-- Trust-and-Split (TAS)
-  - Single item type (coins); each round, each agent’s per-coin value is independently sampled in a broad range (e.g., 1–20).
-  - Each agent observes only their own value; they may use short messages to share and negotiate.
-  - Motivation: a simple blend that tests whether agents learn to exchange private information and coordinate proportional, value-aware splits.
-- Deal-or-No-Deal (DOND)
-  - Introduced in [Deal or No Deal? End-to-End Learning for Negotiation Dialogues](https://arxiv.org/pdf/1706.05125)
-  - Multiple item types (typically "books", "hats" and "balls") with limited stocks; each agent has its own per-type values.
-  - A deal pays out only if both proposals exactly agree and respect the stock; otherwise no deal (zero reward) that round.
-  - Motivation: a known benchmark closer to real-world bargaining, where both parties must explicitly agree.

 ### Variants (in increasing difficulty)
 - No‑Press Split
+  - Multiple item types (e.g., hats, balls, books)
+  - The item values for each agent are public.
+  - No communication; agents go straight to making split proposals.
   - Motivation: mirrors no‑communication setups (e.g., Advantage Alignment) while keeping the split decision nontrivial.
 - Trust-and-Split RPS (TAS-RPS)
   - Single item type (coins)
   - Each round, a rock–paper–scissors hand draw creates a strong asymmetry: the winner’s per-coin value is 10, the loser’s is 1.
   - Each agent initially sees only their own hand and must communicate to coordinate an optimal split.
   - Motivation: enforce large value disparity so one’s own value reveals little about the other’s (avoiding ceiling effects) and incentivize meaningful communication.

src_code_for_reproducibility/markov_games/negotiation/dond_agent.py CHANGED Viewed

@@ -1,3 +1,8 @@
 import copy
 import re
 from collections.abc import Callable
@@ -5,14 +10,18 @@ from dataclasses import dataclass
 from typing import Any, Dict, List, Tuple
 from mllm.markov_games.agent import Agent
-from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
-from mllm.markov_games.negotiation.dond_simulation import (
-    DealNoDealObs,
 )
 from mllm.markov_games.negotiation.nego_simulation import Split
-from mllm.markov_games.negotiation.nego_agent import NegotiationAgent, NegotiationAgentState
 class DealNoDealAgent(NegotiationAgent):
     def __init__(
         self,
         *args,
@@ -20,42 +29,47 @@ class DealNoDealAgent(NegotiationAgent):
     ):
         super().__init__(*args, **kwargs)
         self.intro_prompt = (
-                "You are {agent_id}. You are playing an iterated game. "
-                "At each round, you and other agent will try to distribute among yourselves items of types {item_types}. "
-                "You only know how much you value each item type, but not the other agent's values. "
-                "You can communicate with the other agent by sending up to {quota_messages_per_agent_per_round} short messages per round. "
-                "Each round, after exchanging messages, you and the other agent will submit a private proposal. "
-                "A deal is accepted only if both proposals match exactly and are within stock; otherwise no deal (0 points for both at that round). "
-                "The values of the items of the other agent at the previous round are revealed to you after each round. "
-                "Your goal is: {goal}."
-            )
-        self.new_round_prompt = ("New round {round_nb}. Items: {stock}. Your values: {values}. ")
-        self.last_round_prompt = ("Last round, other agent's values: {previous_values_coagent}. ")
-        self.send_split_prompt = ("Respond with <split>...</split> where you propose how many items of each type you want to keep.")
     def get_message_regex(self, observation: DealNoDealObs) -> str:
         return r"<message>[\s\S]{0,400}</message>"
     def get_split_regex(self, observation: DealNoDealObs) -> str:
         parts = []
         for t in observation.item_types:
             s = int(observation.quantities.get(t, 0))
             allowed = "|".join(str(k) for k in range(0, s + 1))
             rng = f"({allowed})"
-            parts.append(fr"<{t}>{rng}</{t}>")
         items_block = "".join(parts)
-        return fr"(<split>{items_block}</split>)"
     def get_split_action(self, policy_output: str, observation: DealNoDealObs) -> Split:
         import re as _re
         allocations: Dict[str, int] = {}
         for t in observation.item_types:
-            m = _re.search(fr"<{t}>([0-9]+)</{t}>", policy_output)
             if m:
                 allocations[t] = int(m.group(1))
             else:
                 allocations[t] = 0
         return Split(items_given_to_self=allocations)

+"""
+File: mllm/markov_games/negotiation/dond_agent.py
+Summary: Agent implementation for Deal-or-No-Deal style negotiations.
+"""
 import copy
 import re
 from collections.abc import Callable
 from typing import Any, Dict, List, Tuple
 from mllm.markov_games.agent import Agent
+from mllm.markov_games.negotiation.dond_simulation import DealNoDealObs
+from mllm.markov_games.negotiation.nego_agent import (
+    NegotiationAgent,
+    NegotiationAgentState,
 )
 from mllm.markov_games.negotiation.nego_simulation import Split
+from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
 class DealNoDealAgent(NegotiationAgent):
+    """NegotiationAgent tailored to the Deal-or-No-Deal stock/value revelation rules."""
     def __init__(
         self,
         *args,
     ):
         super().__init__(*args, **kwargs)
         self.intro_prompt = (
+            "You are {agent_id}. You are playing an iterated game. "
+            "At each round, you and other agent will try to distribute among yourselves items of types {item_types}. "
+            "You only know how much you value each item type, but not the other agent's values. "
+            "You can communicate with the other agent by sending up to {quota_messages_per_agent_per_round} short messages per round. "
+            "Each round, after exchanging messages, you and the other agent will submit a private proposal. "
+            "A deal is accepted only if both proposals match exactly and are within stock; otherwise no deal (0 points for both at that round). "
+            "The values of the items of the other agent at the previous round are revealed to you after each round. "
+            "Your goal is: {goal}."
+        )
+        self.new_round_prompt = (
+            "New round {round_nb}. Items: {stock}. Your values: {values}. "
+        )
+        self.last_round_prompt = (
+            "Last round, other agent's values: {previous_values_coagent}. "
+        )
+        self.send_split_prompt = "Respond with <split>...</split> where you propose how many items of each type you want to keep."
     def get_message_regex(self, observation: DealNoDealObs) -> str:
+        """Allow short XML-tagged messages (at most 400 characters) between proposal phases."""
         return r"<message>[\s\S]{0,400}</message>"
     def get_split_regex(self, observation: DealNoDealObs) -> str:
+        """Constrain split proposals to per-item XML tags bounded by the current stock."""
         parts = []
         for t in observation.item_types:
             s = int(observation.quantities.get(t, 0))
             allowed = "|".join(str(k) for k in range(0, s + 1))
             rng = f"({allowed})"
+            parts.append(rf"<{t}>{rng}</{t}>")
         items_block = "".join(parts)
+        return rf"(<split>{items_block}</split>)"
     def get_split_action(self, policy_output: str, observation: DealNoDealObs) -> Split:
+        """Convert the XML proposal into a Split dataclass understood by the simulator."""
         import re as _re
         allocations: Dict[str, int] = {}
         for t in observation.item_types:
+            m = _re.search(rf"<{t}>([0-9]+)</{t}>", policy_output)
             if m:
                 allocations[t] = int(m.group(1))
             else:
                 allocations[t] = 0
         return Split(items_given_to_self=allocations)

src_code_for_reproducibility/markov_games/negotiation/nego_agent.py CHANGED Viewed

@@ -1,3 +1,8 @@
 import copy
 from abc import abstractmethod
 from collections.abc import Callable
@@ -13,6 +18,8 @@ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
 @dataclass
 class NegotiationAgentState:
     round_nb: int
     nb_messages_sent_this_round: int
     chat_counter: int
@@ -20,6 +27,8 @@ class NegotiationAgentState:
 class NegotiationAgent(Agent):
     def __init__(
         self,
         seed: int,
@@ -61,19 +70,29 @@ class NegotiationAgent(Agent):
     @abstractmethod
     def get_message_regex(self, observation: NegotiationObs) -> str:
         pass
     @abstractmethod
     def get_split_regex(self, observation: NegotiationObs) -> str:
         pass
     @abstractmethod
     def get_split_action(
         self, policy_output: str, observation: NegotiationObs
     ) -> Split:
         pass
     async def act(self, observation: NegotiationObs) -> Tuple[Any, AgentActLog]:
         def dict_to_str(d: dict) -> str:
             return ", ".join(f"{v} {k}" for k, v in d.items())

+"""
+File: mllm/markov_games/negotiation/nego_agent.py
+Summary: General-purpose negotiation agent coordinating prompts and actions.
+"""
 import copy
 from abc import abstractmethod
 from collections.abc import Callable
 @dataclass
 class NegotiationAgentState:
+    """Lightweight container tracking round progression and message history."""
     round_nb: int
     nb_messages_sent_this_round: int
     chat_counter: int
 class NegotiationAgent(Agent):
+    """Base agent that manages prompt scaffolding and regex validation for variants."""
     def __init__(
         self,
         seed: int,
     @abstractmethod
     def get_message_regex(self, observation: NegotiationObs) -> str:
+        """Return the regex that outgoing chat messages must satisfy."""
         pass
     @abstractmethod
     def get_split_regex(self, observation: NegotiationObs) -> str:
+        """Return the regex that final split proposals must satisfy."""
         pass
     @abstractmethod
     def get_split_action(
         self, policy_output: str, observation: NegotiationObs
     ) -> Split:
+        """Convert raw LLM output into the ``Split`` structure required by simulations."""
         pass
     async def act(self, observation: NegotiationObs) -> Tuple[Any, AgentActLog]:
+        """
+        Assemble the appropriate prompt, query the policy, and return message or split.
+        This handles intro text, new-round reminders, quota tracking, and post-processing
+        (regex enforcement + ChatTurn logging) so subclasses only customize prompts/regexes.
+        """
         def dict_to_str(d: dict) -> str:
             return ", ".join(f"{v} {k}" for k, v in d.items())

src_code_for_reproducibility/markov_games/negotiation/nego_hard_coded_policies.py CHANGED Viewed

@@ -1,11 +1,17 @@
 import asyncio
-from typing import Optional
 from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
 from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent
 from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs
 from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
-from mllm.markov_games.negotiation.nego_simulation import Split
-from typing import Any, Tuple
 class HardCodedNegoWelfareMaximizingPolicy(NoPressAgent):
     async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
@@ -40,6 +46,7 @@ class HardCodedNegoWelfareMaximizingPolicy(NoPressAgent):
         )
         return action, act_log
 class HardCodedNegoGreedyPolicy(NoPressAgent):
     async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
         """
@@ -61,4 +68,3 @@ class HardCodedNegoGreedyPolicy(NoPressAgent):
             info=None,
         )
         return action, act_log

+"""
+File: mllm/markov_games/negotiation/nego_hard_coded_policies.py
+Summary: Provides deterministic negotiation policies for testing and baselines.
+"""
 import asyncio
+from typing import Any, Optional, Tuple
 from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
+from mllm.markov_games.negotiation.nego_simulation import Split
 from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent
 from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs
 from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
 class HardCodedNegoWelfareMaximizingPolicy(NoPressAgent):
     async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
         )
         return action, act_log
 class HardCodedNegoGreedyPolicy(NoPressAgent):
     async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
         """
             info=None,
         )
         return action, act_log

src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py CHANGED Viewed

@@ -1,3 +1,8 @@
 import copy
 from collections import defaultdict
 from dataclasses import dataclass
@@ -16,11 +21,15 @@ AgentId = str
 @dataclass
 class NoPressState(NegotiationState):
     pass
 @dataclass
 class NoPressObs(NegotiationObs):
     other_value: Dict[str, float]
@@ -39,6 +48,7 @@ class NoPressSimulation(NegotiationSimulation):
         super().__init__(*args, **kwargs)
     def _sample_values(self) -> Dict[AgentId, dict]:
         values = defaultdict(dict)
         if self.state is None:
             item_types = self.item_types
@@ -73,9 +83,11 @@ class NoPressSimulation(NegotiationSimulation):
         return values
     def _sample_quantities(self) -> Dict[str, int]:
         return {item.lower(): 10 for item in self.item_types}
     def set_new_round_of_variant(self):
         self.state.quantities = self._sample_quantities()
         self.state.values = self._sample_values()
         self.state.split_phase = True
@@ -83,6 +95,7 @@ class NoPressSimulation(NegotiationSimulation):
     def get_info_of_variant(
         self, state: NegotiationState, actions: Dict[AgentId, Any]
     ) -> Dict[str, Any]:
         return {
             "quantities": copy.deepcopy(state.quantities),
             "values": copy.deepcopy(state.values),
@@ -90,6 +103,7 @@ class NoPressSimulation(NegotiationSimulation):
         }
     def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
         return compute_tas_style_rewards(
             self.agent_ids, self.state.values, splits, self.state.quantities
         )

+"""
+File: mllm/markov_games/negotiation/no_press_nego_simulation.py
+Summary: Simulation driver for no-press negotiation scenarios.
+"""
 import copy
 from collections import defaultdict
 from dataclasses import dataclass
 @dataclass
 class NoPressState(NegotiationState):
+    """Empty NegotiationState subclass marking the no-press variant, which always runs in the split phase."""
     pass
 @dataclass
 class NoPressObs(NegotiationObs):
+    """Observation that includes both agents' values (since there is no messaging)."""
     other_value: Dict[str, float]
         super().__init__(*args, **kwargs)
     def _sample_values(self) -> Dict[AgentId, dict]:
+        """Sample per-item valuations according to the configured template."""
         values = defaultdict(dict)
         if self.state is None:
             item_types = self.item_types
         return values
     def _sample_quantities(self) -> Dict[str, int]:
+        """No-press setups use symmetric 10-unit stocks for every item."""
         return {item.lower(): 10 for item in self.item_types}
     def set_new_round_of_variant(self):
+        """Refresh quantities/values and jump directly into the simultaneous split."""
         self.state.quantities = self._sample_quantities()
         self.state.values = self._sample_values()
         self.state.split_phase = True
     def get_info_of_variant(
         self, state: NegotiationState, actions: Dict[AgentId, Any]
     ) -> Dict[str, Any]:
+        """Surface quantities/values/splits so statistics modules can read them."""
         return {
             "quantities": copy.deepcopy(state.quantities),
             "values": copy.deepcopy(state.values),
         }
     def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
+        """Reuse TAS reward logic because the split arbitration is identical."""
         return compute_tas_style_rewards(
             self.agent_ids, self.state.values, splits, self.state.quantities
         )

src_code_for_reproducibility/markov_games/negotiation/tas_agent.py CHANGED Viewed

@@ -1,9 +1,16 @@
 from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
 from mllm.markov_games.negotiation.nego_simulation import Split
 from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitObs
 class TrustAndSplitAgent(NegotiationAgent):
     def __init__(self, num_message_chars, *args, **kwargs):
         self.num_message_chars = num_message_chars
         super().__init__(*args, **kwargs)
@@ -58,12 +65,14 @@ class TrustAndSplitAgent(NegotiationAgent):
         self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
     def get_message_regex(self, observation: TrustAndSplitObs) -> str:
         return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
     # def get_message_regex(self, observation: TrustAndSplitObs) -> str:
     #     return rf"(?s).{{0,{self.num_message_chars}}}"
     def get_split_regex(self, observation: TrustAndSplitObs) -> str:
         items = list(observation.quantities.keys())
         # Accept both singular and plural forms
         item_pattern = "|".join(
@@ -75,6 +84,7 @@ class TrustAndSplitAgent(NegotiationAgent):
     def get_split_action(
         self, policy_output: str, observation: TrustAndSplitObs
     ) -> Split:
         items = list(observation.quantities.keys())
         import re as _re

+"""
+File: mllm/markov_games/negotiation/tas_agent.py
+Summary: Agent implementation for Trust-and-Split negotiations.
+"""
 from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
 from mllm.markov_games.negotiation.nego_simulation import Split
 from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitObs
 class TrustAndSplitAgent(NegotiationAgent):
+    """Prompt/template wrapper for the classic multi-item Trust-and-Split benchmark."""
     def __init__(self, num_message_chars, *args, **kwargs):
         self.num_message_chars = num_message_chars
         super().__init__(*args, **kwargs)
         self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
     def get_message_regex(self, observation: TrustAndSplitObs) -> str:
+        """Constrain chat to bounded XML tags for stable parsing."""
         return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
     # def get_message_regex(self, observation: TrustAndSplitObs) -> str:
     #     return rf"(?s).{{0,{self.num_message_chars}}}"
     def get_split_regex(self, observation: TrustAndSplitObs) -> str:
+        """Allow natural-language item names while still returning machine-parsable XML."""
         items = list(observation.quantities.keys())
         # Accept both singular and plural forms
         item_pattern = "|".join(
     def get_split_action(
         self, policy_output: str, observation: TrustAndSplitObs
     ) -> Split:
+        """Convert human-readable allocation text back into canonical item IDs."""
         items = list(observation.quantities.keys())
         import re as _re