dereckpichemila commited on
Commit
fa30e5a
·
verified ·
1 Parent(s): 9e1da36

Add files using upload-large-folder tool

Browse files
Files changed (50) hide show
  1. .hydra/hydra.yaml +155 -0
  2. src_code_for_reproducibility/markov_games/__pycache__/__init__.cpython-310.pyc +0 -0
  3. src_code_for_reproducibility/markov_games/__pycache__/__init__.cpython-311.pyc +0 -0
  4. src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-311.pyc +0 -0
  5. src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-311.pyc +0 -0
  6. src_code_for_reproducibility/markov_games/__pycache__/analysis_utils.cpython-310.pyc +0 -0
  7. src_code_for_reproducibility/markov_games/__pycache__/env_imports.cpython-310.pyc +0 -0
  8. src_code_for_reproducibility/markov_games/__pycache__/environment_imports.cpython-310.pyc +0 -0
  9. src_code_for_reproducibility/markov_games/__pycache__/export.cpython-310.pyc +0 -0
  10. src_code_for_reproducibility/markov_games/__pycache__/gather_and_export_utils.cpython-310.pyc +0 -0
  11. src_code_for_reproducibility/markov_games/__pycache__/gather_and_export_utils.cpython-311.pyc +0 -0
  12. src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-311.pyc +0 -0
  13. src_code_for_reproducibility/markov_games/__pycache__/linear_runner.cpython-311.pyc +0 -0
  14. src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-310.pyc +0 -0
  15. src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-311.pyc +0 -0
  16. src_code_for_reproducibility/markov_games/__pycache__/mg_schemas.cpython-310.pyc +0 -0
  17. src_code_for_reproducibility/markov_games/__pycache__/mg_utils.cpython-310.pyc +0 -0
  18. src_code_for_reproducibility/markov_games/__pycache__/mg_utils.cpython-311.pyc +0 -0
  19. src_code_for_reproducibility/markov_games/__pycache__/render_utils.cpython-311.pyc +0 -0
  20. src_code_for_reproducibility/markov_games/__pycache__/rollout_tree.cpython-310.pyc +0 -0
  21. src_code_for_reproducibility/markov_games/__pycache__/rollout_tree.cpython-311.pyc +0 -0
  22. src_code_for_reproducibility/markov_games/__pycache__/rollout_tree_extract_utils.cpython-310.pyc +0 -0
  23. src_code_for_reproducibility/markov_games/__pycache__/run_markov_games.cpython-310.pyc +0 -0
  24. src_code_for_reproducibility/markov_games/__pycache__/runners.cpython-310.pyc +0 -0
  25. src_code_for_reproducibility/markov_games/__pycache__/scores.cpython-310.pyc +0 -0
  26. src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-310.pyc +0 -0
  27. src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-311.pyc +0 -0
  28. src_code_for_reproducibility/markov_games/__pycache__/two_chats_to_html.cpython-310.pyc +0 -0
  29. src_code_for_reproducibility/markov_games/__pycache__/types.cpython-310.pyc +0 -0
  30. src_code_for_reproducibility/markov_games/deal_no_deal/__pycache__/__init__.cpython-311.pyc +0 -0
  31. src_code_for_reproducibility/markov_games/deal_no_deal/__pycache__/dond_agent.cpython-311.pyc +0 -0
  32. src_code_for_reproducibility/markov_games/deal_no_deal/__pycache__/dond_simulation.cpython-311.pyc +0 -0
  33. src_code_for_reproducibility/markov_games/ipd/__init__.py +0 -0
  34. src_code_for_reproducibility/markov_games/ipd/__pycache__/__init__.cpython-311.pyc +0 -0
  35. src_code_for_reproducibility/markov_games/ipd/ipd_statistics.py +10 -0
  36. src_code_for_reproducibility/markov_games/negotiation/README.md +40 -0
  37. src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_simulation.cpython-311.pyc +0 -0
  38. src_code_for_reproducibility/markov_games/negotiation/__pycache__/nego_agent.cpython-311.pyc +0 -0
  39. src_code_for_reproducibility/markov_games/negotiation/__pycache__/no_press_nego_simulation.cpython-311.pyc +0 -0
  40. src_code_for_reproducibility/markov_games/negotiation/__pycache__/tas_simulation.cpython-311.pyc +0 -0
  41. src_code_for_reproducibility/markov_games/negotiation/dond_agent.py +61 -0
  42. src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py +153 -0
  43. src_code_for_reproducibility/markov_games/negotiation/nego_agent.py +174 -0
  44. src_code_for_reproducibility/markov_games/negotiation/nego_simulation.py +229 -0
  45. src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py +44 -0
  46. src_code_for_reproducibility/markov_games/negotiation/no_press_nego_agent.py +48 -0
  47. src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py +141 -0
  48. src_code_for_reproducibility/markov_games/negotiation/tas_agent.py +61 -0
  49. src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py +85 -0
  50. src_code_for_reproducibility/markov_games/negotiation/tas_rps_simulation.py +208 -0
.hydra/hydra.yaml ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ hydra:
2
+ run:
3
+ dir: ${oc.env:SCRATCH}/llm_negotiation/${now:%Y_%m}/${experiment.name}
4
+ sweep:
5
+ dir: multirun/${now:%Y-%m-%d}/${now:%H-%M-%S}
6
+ subdir: ${hydra.job.num}
7
+ launcher:
8
+ _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
9
+ sweeper:
10
+ _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
11
+ max_batch_size: null
12
+ params: null
13
+ help:
14
+ app_name: ${hydra.job.name}
15
+ header: '${hydra.help.app_name} is powered by Hydra.
16
+
17
+ '
18
+ footer: 'Powered by Hydra (https://hydra.cc)
19
+
20
+ Use --hydra-help to view Hydra specific help
21
+
22
+ '
23
+ template: '${hydra.help.header}
24
+
25
+ == Configuration groups ==
26
+
27
+ Compose your configuration from those groups (group=option)
28
+
29
+
30
+ $APP_CONFIG_GROUPS
31
+
32
+
33
+ == Config ==
34
+
35
+ Override anything in the config (foo.bar=value)
36
+
37
+
38
+ $CONFIG
39
+
40
+
41
+ ${hydra.help.footer}
42
+
43
+ '
44
+ hydra_help:
45
+ template: 'Hydra (${hydra.runtime.version})
46
+
47
+ See https://hydra.cc for more info.
48
+
49
+
50
+ == Flags ==
51
+
52
+ $FLAGS_HELP
53
+
54
+
55
+ == Configuration groups ==
56
+
57
+ Compose your configuration from those groups (For example, append hydra/job_logging=disabled
58
+ to command line)
59
+
60
+
61
+ $HYDRA_CONFIG_GROUPS
62
+
63
+
64
+ Use ''--cfg hydra'' to Show the Hydra config.
65
+
66
+ '
67
+ hydra_help: ???
68
+ hydra_logging:
69
+ version: 1
70
+ formatters:
71
+ simple:
72
+ format: '[%(asctime)s][HYDRA] %(message)s'
73
+ handlers:
74
+ console:
75
+ class: logging.StreamHandler
76
+ formatter: simple
77
+ stream: ext://sys.stdout
78
+ root:
79
+ level: INFO
80
+ handlers:
81
+ - console
82
+ loggers:
83
+ logging_example:
84
+ level: DEBUG
85
+ disable_existing_loggers: false
86
+ job_logging:
87
+ version: 1
88
+ formatters:
89
+ simple:
90
+ format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
91
+ handlers:
92
+ console:
93
+ class: logging.StreamHandler
94
+ formatter: simple
95
+ stream: ext://sys.stdout
96
+ file:
97
+ class: logging.FileHandler
98
+ formatter: simple
99
+ filename: ${hydra.runtime.output_dir}/${hydra.job.name}.log
100
+ root:
101
+ level: INFO
102
+ handlers:
103
+ - console
104
+ - file
105
+ disable_existing_loggers: false
106
+ env: {}
107
+ mode: RUN
108
+ searchpath: []
109
+ callbacks: {}
110
+ output_subdir: .hydra
111
+ overrides:
112
+ hydra:
113
+ - hydra.mode=RUN
114
+ task:
115
+ - experiment.name=tas_rps_no_regex_prev_ad_align_buffer_gae
116
+ job:
117
+ name: run
118
+ chdir: false
119
+ override_dirname: experiment.name=tas_rps_no_regex_prev_ad_align_buffer_gae
120
+ id: ???
121
+ num: ???
122
+ config_name: tas_rps_no_regex_prev_ad_align_buffer_gae
123
+ env_set: {}
124
+ env_copy: []
125
+ config:
126
+ override_dirname:
127
+ kv_sep: '='
128
+ item_sep: ','
129
+ exclude_keys: []
130
+ runtime:
131
+ version: 1.3.2
132
+ version_base: '1.1'
133
+ cwd: /home/mila/d/dereck.piche/llm_negotiation
134
+ config_sources:
135
+ - path: hydra.conf
136
+ schema: pkg
137
+ provider: hydra
138
+ - path: /home/mila/d/dereck.piche/llm_negotiation/configs
139
+ schema: file
140
+ provider: main
141
+ - path: ''
142
+ schema: structured
143
+ provider: schema
144
+ output_dir: /network/scratch/d/dereck.piche/llm_negotiation/2025_09/tas_rps_no_regex_prev_ad_align_buffer_gae
145
+ choices:
146
+ hydra/env: default
147
+ hydra/callbacks: null
148
+ hydra/job_logging: default
149
+ hydra/hydra_logging: default
150
+ hydra/hydra_help: default
151
+ hydra/help: default
152
+ hydra/sweeper: basic
153
+ hydra/launcher: basic
154
+ hydra/output: default
155
+ verbose: false
src_code_for_reproducibility/markov_games/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (164 Bytes). View file
 
src_code_for_reproducibility/markov_games/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (180 Bytes). View file
 
src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-311.pyc ADDED
Binary file (3.47 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-311.pyc ADDED
Binary file (5.62 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/analysis_utils.cpython-310.pyc ADDED
Binary file (10.9 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/env_imports.cpython-310.pyc ADDED
Binary file (644 Bytes). View file
 
src_code_for_reproducibility/markov_games/__pycache__/environment_imports.cpython-310.pyc ADDED
Binary file (822 Bytes). View file
 
src_code_for_reproducibility/markov_games/__pycache__/export.cpython-310.pyc ADDED
Binary file (11.9 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/gather_and_export_utils.cpython-310.pyc ADDED
Binary file (30.6 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/gather_and_export_utils.cpython-311.pyc ADDED
Binary file (41 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/group_timesteps.cpython-311.pyc ADDED
Binary file (6.71 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/linear_runner.cpython-311.pyc ADDED
Binary file (1.4 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-310.pyc ADDED
Binary file (5.91 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-311.pyc ADDED
Binary file (10.7 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/mg_schemas.cpython-310.pyc ADDED
Binary file (2.82 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/mg_utils.cpython-310.pyc ADDED
Binary file (1.63 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/mg_utils.cpython-311.pyc ADDED
Binary file (3.94 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/render_utils.cpython-311.pyc ADDED
Binary file (18.9 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/rollout_tree.cpython-310.pyc ADDED
Binary file (3.26 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/rollout_tree.cpython-311.pyc ADDED
Binary file (4.69 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/rollout_tree_extract_utils.cpython-310.pyc ADDED
Binary file (667 Bytes). View file
 
src_code_for_reproducibility/markov_games/__pycache__/run_markov_games.cpython-310.pyc ADDED
Binary file (829 Bytes). View file
 
src_code_for_reproducibility/markov_games/__pycache__/runners.cpython-310.pyc ADDED
Binary file (2.74 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/scores.cpython-310.pyc ADDED
Binary file (8.02 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-310.pyc ADDED
Binary file (3.51 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/simulation.cpython-311.pyc ADDED
Binary file (4.28 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/two_chats_to_html.cpython-310.pyc ADDED
Binary file (9.29 kB). View file
 
src_code_for_reproducibility/markov_games/__pycache__/types.cpython-310.pyc ADDED
Binary file (404 Bytes). View file
 
src_code_for_reproducibility/markov_games/deal_no_deal/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (193 Bytes). View file
 
src_code_for_reproducibility/markov_games/deal_no_deal/__pycache__/dond_agent.cpython-311.pyc ADDED
Binary file (9.93 kB). View file
 
src_code_for_reproducibility/markov_games/deal_no_deal/__pycache__/dond_simulation.cpython-311.pyc ADDED
Binary file (18.7 kB). View file
 
src_code_for_reproducibility/markov_games/ipd/__init__.py ADDED
File without changes
src_code_for_reproducibility/markov_games/ipd/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (184 Bytes). View file
 
src_code_for_reproducibility/markov_games/ipd/ipd_statistics.py ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict
4
+
5
+ from mllm.markov_games.rollout_tree import SimulationStepLog
6
+
7
+
8
def avg_reward(sl: SimulationStepLog) -> Dict[str, float]:
    """Return one scalar reward per agent for a single simulation step.

    When the step carries no rewards (``sl.rewards`` is None), an empty
    mapping is returned.
    """
    per_agent: Dict[str, float] = {}
    for agent_id, value in (sl.rewards or {}).items():
        per_agent[agent_id] = float(value)
    return per_agent
src_code_for_reproducibility/markov_games/negotiation/README.md ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Negotiation Games: core mechanics and variants
2
+
3
+ This family of games features two agents who, in each round, may briefly communicate and then simultaneously propose how to split a fixed resource (most commonly 10 coins). Rewards are the amount kept multiplied by an agent’s per-unit value. The starting speaker alternates deterministically across rounds.
4
+
5
+ Communication is optional and variant-dependent: some settings encourage rich messaging to share private information, while others remove messaging entirely to focus on allocation behavior.
6
+
7
+ Proportional splitting is used when the two proposals exceed the available total: allocations are scaled proportionally rather than discarded. This preserves a useful learning signal even when agents over-claim.
8
+
9
+ ### Variants (in increasing difficulty)
10
+
11
+ - No‑Press Split
12
+ - Single item type (coins)
13
+ - No communication; agents go straight to making split proposals, with the starting player alternating deterministically.
14
+ - Motivation: mirrors no‑communication setups (e.g., Advantage Alignment) while keeping the split decision nontrivial.
15
+ - Deterministic Mode: values are fixed and public: one agent values coins at 10, the other at 1 (alternates each round).
16
+ - Stochastic Mode: values are random and uncorrelated.
17
+
18
+ - Trust-and-Split RPS (TAS-RPS)
19
+ - Single item type (coins)
20
+ - Each round, a rock–paper–scissors hand draw creates a strong asymmetry: the winner’s per-coin value is 10, the loser’s is 1.
21
+ - Each agent initially sees only their own hand and must communicate to coordinate an optimal split.
22
+ - Motivation: enforce large value disparity so one’s own value reveals little about the other’s (avoiding ceiling effects) and incentivize meaningful communication.
23
+
24
+ - Trust-and-Split (TAS)
25
+ - Single item type (coins); each round, each agent’s per-coin value is independently sampled in a broad range (e.g., 1–20).
26
+ - Each agent observes only their own value; they may use short messages to share and negotiate.
27
+ - Motivation: a simple blend that tests whether agents learn to exchange private information and coordinate proportional, value-aware splits.
28
+
29
+ - Deal-or-No-Deal (DOND)
30
+ - Introduced in [Deal or No Deal? End-to-End Learning for Negotiation Dialogues](https://arxiv.org/pdf/1706.05125)
31
+ - Multiple item types (typically "books", "hats" and "balls") with limited stocks; each agent has its own per-type values.
32
+ - A deal pays out only if both proposals exactly agree and respect the stock; otherwise no deal (zero reward) that round.
33
+ - Motivation: a known benchmark closer to real-world bargaining, where both parties must explicitly agree.
34
+
35
+
36
+
37
+
38
+
39
+
40
+
src_code_for_reproducibility/markov_games/negotiation/__pycache__/dond_simulation.cpython-311.pyc ADDED
Binary file (11.9 kB). View file
 
src_code_for_reproducibility/markov_games/negotiation/__pycache__/nego_agent.cpython-311.pyc ADDED
Binary file (7.72 kB). View file
 
src_code_for_reproducibility/markov_games/negotiation/__pycache__/no_press_nego_simulation.cpython-311.pyc ADDED
Binary file (8.1 kB). View file
 
src_code_for_reproducibility/markov_games/negotiation/__pycache__/tas_simulation.cpython-311.pyc ADDED
Binary file (7.23 kB). View file
 
src_code_for_reproducibility/markov_games/negotiation/dond_agent.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import re
3
+ from collections.abc import Callable
4
+ from dataclasses import dataclass
5
+ from typing import Any, Dict, List, Tuple
6
+
7
+ from mllm.markov_games.agent import Agent
8
+ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
9
+ from mllm.markov_games.negotiation.dond_simulation import (
10
+ DealNoDealObs,
11
+ )
12
+ from mllm.markov_games.negotiation.nego_simulation import Split
13
+ from mllm.markov_games.negotiation.nego_agent import NegotiationAgent, NegotiationAgentState
14
+
15
class DealNoDealAgent(NegotiationAgent):
    """Agent for the Deal-or-No-Deal (DOND) negotiation variant.

    Provides the DOND-specific prompt templates plus the regexes and parsing
    used to constrain and decode the policy's message/split outputs.
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        # Prompt templates; placeholders are filled from the goal and the
        # observation's fields when the prompt is built.
        self.intro_prompt = (
            "You are {agent_id}. You are playing an iterated game. "
            "At each round, you and other agent will try to distribute among yourselves items of types {item_types}. "
            "You only know how much you value each item type, but not the other agent's values. "
            "You can communicate with the other agent by sending up to {quota_messages_per_agent_per_round} short messages per round. "
            "Each round, after exchanging messages, you and the other agent will submit a private proposal. "
            "A deal is accepted only if both proposals match exactly and are within stock; otherwise no deal (0 points for both at that round). "
            "The values of the items of the other agent at the previous round are revealed to you after each round. "
            "Your goal is: {goal}."
        )
        self.new_round_prompt = ("New round {round_nb}. Items: {stock}. Your values: {values}. ")
        self.last_round_prompt = ("Last round, other agent's values: {previous_values_coagent}. ")
        self.send_split_prompt = ("Respond with <split>...</split> where you propose how many items of each type you want to keep.")

    def get_message_regex(self, observation: DealNoDealObs) -> str:
        """Regex constraining a chat message to a bounded <message> block."""
        return r"<message>[\s\S]{0,400}</message>"

    def get_split_regex(self, observation: DealNoDealObs) -> str:
        """Regex forcing a well-formed <split> block for the current stock.

        For every item type, the allowed counts 0..stock are enumerated as an
        explicit alternation so multi-digit counts are matched exactly.
        """
        parts = []
        for t in observation.item_types:
            s = int(observation.quantities.get(t, 0))
            allowed = "|".join(str(k) for k in range(0, s + 1))
            rng = f"({allowed})"
            parts.append(fr"<{t}>{rng}</{t}>")
        items_block = "".join(parts)
        return fr"(<split>{items_block}</split>)"

    def get_split_action(self, policy_output: str, observation: DealNoDealObs) -> Split:
        """Parse the policy's <split> output into a Split action.

        Item types whose tag is missing from the output default to 0.
        """
        # Uses the module-level `re`; the original re-imported it locally as
        # `_re`, shadowing the top-of-file import for no benefit.
        allocations: Dict[str, int] = {}
        for t in observation.item_types:
            m = re.search(fr"<{t}>([0-9]+)</{t}>", policy_output)
            allocations[t] = int(m.group(1)) if m else 0
        return Split(items_given_to_self=allocations)
src_code_for_reproducibility/markov_games/negotiation/dond_simulation.py ADDED
@@ -0,0 +1,153 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from dataclasses import dataclass
3
+ from typing import Any, Dict, List, Tuple
4
+
5
+ from numpy.random import default_rng
6
+
7
+ from mllm.markov_games.rollout_tree import SimulationStepLog
8
+ from mllm.markov_games.negotiation.nego_simulation import Split, NegotiationState, NegotiationObs, NegotiationSimulation
9
+ from mllm.utils.get_coagent_id import get_coagent_id
10
+
11
+
12
+ AgentId = str
13
+
14
+
15
@dataclass
class DealNoDealState(NegotiationState):
    """Episode state for the Deal-or-No-Deal variant.

    Extends the generic NegotiationState with the item catalogue and each
    agent's private per-type valuations.
    """

    # Item-type names in play this episode (e.g. "books", "hats", "balls").
    item_types: List[str]
    # Private valuations: agent id -> {item type -> integer value}.
    values: Dict[AgentId, Dict[str, int]]
19
+
20
@dataclass
class DealNoDealObs(NegotiationObs):
    """Per-agent (private) observation for the DOND variant."""

    # This agent's own valuations: {item type -> integer value}.
    my_values: Dict[str, int]
    # Item types in play this episode.
    item_types: List[str]
    # Co-agent's valuations revealed after each round, or None before the
    # first round completes.
    previous_values_coagent: Dict[str, int] | None
25
+
26
+
27
def random_partition_integer(rng, total: int, parts: int) -> List[int]:
    """Randomly split *total* into *parts* non-negative integers summing to *total*.

    Draws ``parts - 1`` cut points uniformly in ``[0, total]`` and returns the
    gaps between consecutive cut points. Degenerate inputs: ``parts <= 0``
    yields ``[]``; ``total <= 0`` yields all zeros.
    """
    if parts <= 0:
        return []
    if total <= 0:
        return [0] * parts
    cut_points = sorted(rng.integers(0, total + 1, size=parts - 1).tolist())
    edges = [0] + cut_points + [total]
    # Each part is the gap between two consecutive edges.
    return [hi - lo for lo, hi in zip(edges[:-1], edges[1:])]
39
+
40
class DealNoDealSimulation(NegotiationSimulation):
    """Deal-or-No-Deal (DOND) negotiation simulation.

    Each round the two agents negotiate over a stock of items of several
    types. A round pays out only when the two private proposals are exactly
    complementary (for every item type they sum to the stock); otherwise both
    agents receive 0.0 for that round.
    """

    def __init__(
        self,
        item_types: List[str] | None = None,
        *args,
        **kwargs,
    ):
        # The original signature used a mutable default list
        # (["books", "hats", "balls"]); a None sentinel keeps the same
        # default while avoiding a list object shared across instances.
        if item_types is None:
            item_types = ["books", "hats", "balls"]
        super().__init__(item_types=item_types, *args, **kwargs)
        self.reset()

    def _other(self, agent_id: AgentId) -> AgentId:
        """Return the id of the other agent."""
        return get_coagent_id(self.agent_ids, agent_id)

    def _sample_stock(self) -> Dict[str, int]:
        """Sample a stock of 5-7 items, randomly partitioned across types."""
        # total items between 5 and 7 (numpy's upper bound is exclusive)
        total_items = int(self.rng.integers(5, 8))
        # nonnegative per-type counts summing to total_items
        parts = random_partition_integer(self.rng, total_items, len(self.item_types))
        # allow zeros per type
        return {t: int(c) for t, c in zip(self.item_types, parts)}

    def _sample_values_pair(self) -> Dict[AgentId, Dict[str, int]]:
        """Sample both agents' valuations (each summing to 10) by rejection.

        Accepted only when every item type is valued by at least one agent
        and some item type is valued by both agents.
        """
        while True:
            vals_a = random_partition_integer(self.rng, 10, len(self.item_types))
            vals_b = random_partition_integer(self.rng, 10, len(self.item_types))
            a = {t: int(v) for t, v in zip(self.item_types, vals_a)}
            b = {t: int(v) for t, v in zip(self.item_types, vals_b)}
            # each item valued by at least one agent
            ok1 = all((a[t] > 0) or (b[t] > 0) for t in self.item_types)
            # some item valued by both agents
            ok2 = any((a[t] > 0) and (b[t] > 0) for t in self.item_types)
            if ok1 and ok2:
                return {self.agent_ids[0]: a, self.agent_ids[1]: b}

    def _is_valid_allocation(self, allocation: Dict[str, int], stock: Dict[str, int]) -> bool:
        """True iff *allocation* gives a legal integer count for every item type."""
        for t in self.item_types:
            v = allocation.get(t)
            if v is None:
                return False
            if not isinstance(v, int):
                return False
            if v < 0 or v > int(stock.get(t, 0)):
                return False
        return True

    def set_new_round_of_variant(self):
        """Per-round variant hook: keep the same values, resample the stock."""
        self.state.quantities = self._sample_stock()

    def get_info_of_variant(self, state: NegotiationState, actions: Dict[AgentId, Any]) -> Dict[str, Any]:
        """Return a deep-copied snapshot of the round's state for logging."""
        return {
            "quantities": copy.deepcopy(state.quantities),
            "values": copy.deepcopy(state.values),
            "splits": copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Return the rewards for each agent.

        A deal pays out only when, for every item type, the two proposals sum
        exactly to the stock; otherwise both agents get 0.0 (no deal).
        """
        split_a = splits[self.agent_ids[0]].items_given_to_self
        split_b = splits[self.agent_ids[1]].items_given_to_self
        # Float accumulators so the return type matches the annotation
        # (the original initialized with int 0).
        rewards = {self.agent_ids[0]: 0.0, self.agent_ids[1]: 0.0}
        for t in self.item_types:
            # If not complementary, no deal: zero reward for both.
            if not split_a[t] + split_b[t] == self.state.quantities[t]:
                return {self.agent_ids[0]: 0.0, self.agent_ids[1]: 0.0}
            rewards[self.agent_ids[0]] += split_a[t] * self.state.values[self.agent_ids[0]][t]
            rewards[self.agent_ids[1]] += split_b[t] * self.state.values[self.agent_ids[1]][t]
        return rewards

    def get_obs(self):
        """Return every agent's observation, keyed by agent id."""
        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """Build the private observation for *agent_id*.

        NOTE(review): the field set passed here must match the
        DealNoDealObs/NegotiationObs definitions (e.g. defaults for fields
        such as `other_agent`) — confirm against nego_simulation.py.
        """
        other_id = self._other(agent_id)
        obs = DealNoDealObs(
            round_nb=self.state.round_nb,
            last_message=self.state.last_message,
            current_agent=self.state.current_agent,
            quantities=copy.deepcopy(self.state.quantities),
            value=0.0,  # unused in DOND
            other_agent_split=None,  # not meaningful until split
            split_phase=self.state.split_phase,
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            my_values=copy.deepcopy(self.state.values[agent_id]),
            item_types=list(self.item_types),
            previous_values_coagent=copy.deepcopy(self.state.values.get(other_id, {})),
        )
        return obs

    def reset(self):
        """Start a fresh episode: new stock, new values, round counter at 0."""
        start_agent = self.agent_ids[self._starting_agent_index]
        stock = self._sample_stock()
        values = self._sample_values_pair()
        self.state = DealNoDealState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities=stock,
            values=values,
            previous_values=None,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            split_phase=False,
            item_types=list(self.item_types),
        )
        return self.get_obs()
src_code_for_reproducibility/markov_games/negotiation/nego_agent.py ADDED
@@ -0,0 +1,174 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from abc import abstractmethod
3
+ from collections.abc import Callable
4
+ from dataclasses import dataclass
5
+ from typing import Any, Dict, List, Tuple
6
+
7
+ from mllm.markov_games.agent import Agent
8
+ from mllm.markov_games.negotiation.nego_simulation import Message, NegotiationObs, Split
9
+ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
10
+
11
+
12
@dataclass
class NegotiationAgentState:
    """Mutable per-episode bookkeeping for a NegotiationAgent."""

    # Last round number this agent has processed (used to detect new rounds).
    round_nb: int
    # Messages sent by this agent in the current round.
    nb_messages_sent_this_round: int
    # Length of chat_history at the end of the previous act() call; act()
    # slices from here to log only the turns added in the current call.
    chat_counter: int
    # Full running conversation (user prompts and assistant outputs).
    chat_history: List[ChatTurn]
18
+
19
+
20
class NegotiationAgent(Agent):
    """Base agent for the negotiation games.

    Builds the user prompt incrementally from the observation, queries the
    policy under a format-constraining regex, and keeps the full chat history
    in ``self.state``. Variants subclass this and set the prompt template
    strings plus the three abstract regex/parsing hooks.
    """

    def __init__(
        self,
        seed: int,
        agent_id: str,
        agent_name: str,
        policy: Callable[[List[Dict]], str],
        goal: str,
    ):
        """Store identity, policy callable, and goal; start with fresh state.

        Args:
            seed: RNG seed kept for reproducibility bookkeeping.
            agent_id: Id used to match ``observation.current_agent``.
            agent_name: Human-readable name (interpolated into prompts).
            policy: Async callable producing the model output for a prompt.
            goal: Free-text goal interpolated into the intro prompt.
        """
        self.seed = seed
        self.agent_id = agent_id
        self.agent_name = agent_name
        self.policy = policy
        self.goal = goal
        self.state = NegotiationAgentState(
            round_nb=0, nb_messages_sent_this_round=0, chat_counter=0, chat_history=[]
        )

        # Implemented in variants
        self.intro_prompt = ""
        self.new_round_prompt = ""
        self.last_round_prompt = ""
        self.send_split_prompt = ""
        self.wait_for_message_prompt = ""
        self.last_message_prompt = ""
        self.send_message_prompt = ""

    @abstractmethod
    def get_message_regex(self, observation: NegotiationObs) -> str:
        """Return the regex constraining a chat-message policy output."""
        pass

    @abstractmethod
    def get_split_regex(self, observation: NegotiationObs) -> str:
        """Return the regex constraining a split-proposal policy output."""
        pass

    @abstractmethod
    def get_split_action(
        self, policy_output: str, observation: NegotiationObs
    ) -> Split:
        """Parse a split-phase policy output into a Split action."""
        pass

    async def act(self, observation: NegotiationObs) -> Tuple[Any, AgentActLog]:
        """Build the prompt for this step, query the policy, return the action.

        Returns a (action, AgentActLog) pair where the action is a Message
        during our messaging turn, a Split during the split phase, or None
        when it is not our turn to produce anything. The log contains only
        the chat turns appended during this call.
        """
        is_our_turn = observation.current_agent == self.agent_id
        action: Any = None
        round_nb = observation.round_nb

        prompt_parts: List[str] = []
        # All observation fields become format() keys for the templates.
        # NOTE(review): every placeholder used by a variant's templates must
        # be satisfied by goal/agent/these fields — confirm per variant.
        obs_ctx = vars(observation)

        #######################################
        # build user prompt
        #######################################

        # First-ever call
        is_intro = round_nb == 0 and self.state.chat_counter == 0
        if is_intro:
            prompt_parts.append(
                self.intro_prompt.format(
                    goal=self.goal, agent=self.agent_name, **obs_ctx
                )
            )

        # New round
        is_new_round = round_nb > self.state.round_nb
        if is_new_round or is_intro:
            self.state.nb_messages_sent_this_round = 0
            if not is_intro:
                # Recap of the previous round only after the first round.
                prompt_parts.append(self.last_round_prompt.format(**obs_ctx))
            prompt_parts.append(self.new_round_prompt.format(**obs_ctx))
            self.state.round_nb = round_nb

        # Wait for message
        if not is_our_turn and not observation.split_phase:
            prompt_parts.append(self.wait_for_message_prompt.format(**obs_ctx))

        # Get last message
        if is_our_turn and not is_new_round and not is_intro:
            prompt_parts.append(self.last_message_prompt.format(**obs_ctx))

        # Prompt to send message
        must_send_message = not observation.split_phase and is_our_turn
        if must_send_message:
            prompt_parts.append(self.send_message_prompt.format(**obs_ctx))

        # Prompt to give split (split phase applies to both agents, so this
        # does not require is_our_turn)
        must_send_split = not must_send_message and observation.split_phase
        if must_send_split:
            prompt_parts.append(self.send_split_prompt.format(**obs_ctx))

        # Append one ChatTurn with is_state_end=True
        user_prompt = "\n".join(prompt_parts)
        self.state.chat_history.append(
            ChatTurn(
                agent_id=self.agent_id,
                role="user",
                content=user_prompt,
                is_state_end=True,
            )
        )

        #######################################
        # Get policy action
        #######################################

        # Query policy for the appropriate format
        if must_send_message:
            return_regex = self.get_message_regex(observation)
            policy_output = await self.policy(
                prompt=[c.dict() for c in self.state.chat_history],
                regex=return_regex,
            )
            self.state.chat_history.append(
                ChatTurn(
                    agent_id=self.agent_id,
                    role="assistant",
                    content=policy_output,
                    is_state_end=False,
                )
            )
            action = Message(message=policy_output)
            self.state.nb_messages_sent_this_round += 1

        elif must_send_split:
            return_regex = self.get_split_regex(observation)
            policy_output = await self.policy(
                prompt=[c.dict() for c in self.state.chat_history],
                regex=return_regex,
            )
            self.state.chat_history.append(
                ChatTurn(
                    agent_id=self.agent_id,
                    role="assistant",
                    content=policy_output,
                    is_state_end=False,
                )
            )
            action = self.get_split_action(policy_output, observation)
        else:
            # Neither messaging nor splitting: no action this step.
            action = None

        # Log only the turns appended during this call, then advance the
        # counter so the next call starts from the new history length.
        agent_step_log = AgentActLog(
            chat_turns=self.state.chat_history[self.state.chat_counter :], info=None
        )
        self.state.chat_counter = len(self.state.chat_history)
        return action, agent_step_log

    def get_safe_copy(self):
        """Return a shallow copy of the agent with a deep-copied state.

        The policy and identity fields are shared; only the mutable
        NegotiationAgentState is duplicated.
        """
        agent_copy = copy.copy(self)
        agent_copy.state = copy.deepcopy(self.state)
        return agent_copy

    def reset(self):
        """Discard all per-episode state (round counter and chat history)."""
        self.state = NegotiationAgentState(
            round_nb=0, nb_messages_sent_this_round=0, chat_counter=0, chat_history=[]
        )
src_code_for_reproducibility/markov_games/negotiation/nego_simulation.py ADDED
@@ -0,0 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Negotiation simulation environment
3
+ other agent is set at the start of every round. Even though current agent changes over message turns in a round.
4
+ """
5
+ import copy
6
+ from abc import abstractmethod
7
+ from dataclasses import dataclass
8
+ from typing import Any, Dict, List, Tuple
9
+
10
+ from numpy.random import default_rng
11
+
12
+ from mllm.markov_games.rollout_tree import SimulationStepLog
13
+ from mllm.markov_games.simulation import Simulation
14
+ from mllm.utils.get_coagent_id import get_coagent_id
15
+
16
+ AgentId = str
17
+
18
+
19
@dataclass
class Split:
    """Split action: how much of each item the proposing agent keeps."""

    # Item name (e.g. "coins") -> quantity claimed for the proposer itself.
    items_given_to_self: Dict[str, int]
+
23
+
24
@dataclass
class Message:
    """Chat action: free-form text sent to the other agent."""

    message: str
+
28
+
29
@dataclass  # gets extended by variants
class NegotiationState:
    """Mutable simulation state shared by all negotiation variants."""

    # 0-based round index; incremented when a round's splits resolve.
    round_nb: int
    # Most recent chat message this round ("" when none has been sent).
    last_message: str
    # Agent whose turn it currently is.
    current_agent: AgentId
    # Item name -> amount available to split this round.
    quantities: Dict[str, int]
    # Per-coin value of each agent for the current round.
    values: Dict[AgentId, float]
    # Split proposal recorded per agent; None until the agent submits one.
    splits: Dict[AgentId, Split | None]
    # Messages each agent has sent in the current round.
    nb_messages_sent: Dict[AgentId, int]
    # Snapshots of the previous round (None before the first round resolves).
    previous_values: Dict[AgentId, float] | None
    previous_splits: Dict[AgentId, Split | None] | None
    previous_points: Dict[AgentId, float] | None
    # True once messaging is over and both agents must submit splits.
    split_phase: bool
    # NOTE(review): NegotiationSimulation.step also assigns an `other_agent`
    # attribute on instances dynamically; it is not a declared field here —
    # confirm whether it should be (adding a defaulted field would break
    # dataclass field ordering in subclasses that add required fields).
+
43
+
44
@dataclass  # gets extended by variants
class NegotiationObs:
    """Per-agent observation common to all negotiation variants."""

    round_nb: int
    # Last message sent by the other agent ("" when none).
    last_message: str
    quota_messages_per_agent_per_round: int
    current_agent: AgentId
    # Display name of the other agent.
    other_agent: str
    quantities: Dict[str, int]
    item_types: List[str]
    # This agent's own per-coin value for the current round.
    value: float
    split_phase: bool
    # Previous-round outcome fields; all None before the first round resolves.
    # NOTE(review): last_split_* hold the number of coins proposed (an int) —
    # confirm the `int | None` typing is intended rather than a Split.
    last_split_agent: int | None
    last_value_agent: float | None
    last_points_agent: float | None
    last_split_coagent: int | None
    last_value_coagent: float | None
    last_points_coagent: float | None
+
62
+
63
def compute_tas_style_rewards(
    agent_ids: List[AgentId],
    values: Dict[AgentId, float],
    splits: Dict[AgentId, Split | None],
    max_coins: int,
    item: str = "coins",
) -> Dict[AgentId, float]:
    """
    TAS-like reward computation: if the sum of proposed coins exceeds
    ``max_coins``, allocate proportionally. Otherwise, use proposed amounts
    directly. Rewards are quantity_kept * per-coin value for each agent.

    Args:
        agent_ids: exactly two agent ids; order defines the returned mapping.
        values: per-coin value of each agent for this round.
        splits: split proposal per agent; a missing (None) split claims 0.
        max_coins: size of the pot being divided.
        item: key inside ``items_given_to_self`` to read (default "coins");
            parameterized so variants with other item names can reuse this.

    Returns:
        Mapping agent id -> reward for the round.
    """
    a0, a1 = agent_ids[0], agent_ids[1]

    def _claim(agent_id: AgentId) -> int:
        # A missing split proposal counts as claiming nothing.
        split = splits[agent_id]
        return int(split.items_given_to_self.get(item, 0)) if split is not None else 0

    claim0 = _claim(a0)
    claim1 = _claim(a1)
    # If the combined claims exceed the pot, both are scaled down
    # proportionally; otherwise claims pass through unchanged.
    # The trailing `or 1` guards division by zero when the pot is empty and
    # nothing is claimed (previously a ZeroDivisionError).
    denom = max(int(max_coins), claim0 + claim1) or 1
    q0 = float(max_coins) * float(claim0) / float(denom)
    q1 = float(max_coins) * float(claim1) / float(denom)
    return {a0: q0 * float(values[a0]), a1: q1 * float(values[a1])}
91
+
92
+
93
class NegotiationSimulation(Simulation):
    """Two-agent negotiation base environment.

    A game is a sequence of independent rounds. Depending on the variant, a
    round starts with a message phase (agents alternate chat turns until both
    reach their quota) and ends with a split phase in which both agents submit
    a Split proposal in the same timestep. Variant subclasses implement value
    sampling, rewards, observations and reset via the abstract hooks below.
    """

    def __init__(
        self,
        agent_ids: List[AgentId],
        agent_names: List[str],
        seed: int,
        nb_of_rounds: int,
        quota_messages_per_agent_per_round: int,
        item_types: List[str] | None = None,
    ):
        self.seed = seed
        self.rng = default_rng(self.seed)
        self.agent_ids = list(agent_ids)
        self.agent_names = agent_names
        self.agent_id_to_name = {
            agent_id: agent_name for agent_id, agent_name in zip(agent_ids, agent_names)
        }
        self.nb_of_rounds = int(nb_of_rounds)
        self.quota_messages_per_agent_per_round = int(
            quota_messages_per_agent_per_round
        )
        self.item_types = item_types or ["coins"]
        self.state: NegotiationState | None = None
        # Which of the two agents opens the first round; alternated after
        # every round so starting advantage is balanced.
        self._starting_agent_index = self.rng.choice([0, 1])
        self.reset()

    def _other(self, agent_id: AgentId) -> AgentId:
        """Return the id of the other agent."""
        return get_coagent_id(self.agent_ids, agent_id)

    @abstractmethod
    def set_new_round_of_variant(self):
        """Variant hook: resample values/quantities when a new round begins."""
        pass

    @abstractmethod
    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        """Variant hook: extra info to log when a round resolves."""
        pass

    def step(self, actions: Any) -> Tuple[bool, SimulationStepLog]:
        """
        Advance the environment by one timestep.

        Returns terminated, step_log
        """
        assert self.state is not None
        current_agent = self.state.current_agent
        a0, a1 = self.agent_ids[0], self.agent_ids[1]
        action = actions.get(current_agent)

        # Split phase: require both splits in the same timestep
        if self.state.split_phase:
            action_a0 = actions.get(a0)
            action_a1 = actions.get(a1)
            have_both_splits = isinstance(action_a0, Split) and isinstance(
                action_a1, Split
            )
            if not have_both_splits:
                # Keep waiting (zero reward) until both proposals arrive.
                rewards = {agent_id: 0.0 for agent_id in self.agent_ids}
                return False, SimulationStepLog(
                    rewards=rewards, info={"type": "waiting_for_splits"}
                )

            # Record splits
            self.state.splits[a0] = action_a0
            self.state.splits[a1] = action_a1

            # Compute rewards and end round
            rewards = self.get_rewards(self.state.splits)

            # Variant-specific info, captured before round state is cleared.
            info = self.get_info_of_variant(self.state, actions)

            # Prepare next round: alternate the starting agent and clear
            # round-local state, keeping snapshots for next-round observations.
            self.state.round_nb += 1
            self._starting_agent_index = 1 - self._starting_agent_index
            self.state.current_agent = self.agent_ids[self._starting_agent_index]
            # NOTE(review): `other_agent` is attached dynamically here; it is
            # not a declared NegotiationState field.
            self.state.other_agent = self.agent_id_to_name[
                self._other(self.state.current_agent)
            ]
            self.set_new_round_of_variant()  # variant specific
            self.state.previous_splits = copy.deepcopy(self.state.splits)
            self.state.previous_points = copy.deepcopy(rewards)
            self.state.last_message = ""
            self.state.splits = {agent_id: None for agent_id in self.agent_ids}
            self.state.nb_messages_sent = {agent_id: 0 for agent_id in self.agent_ids}
            is_last_timestep_in_round = True
            done = self.state.round_nb >= self.nb_of_rounds

        # Message phase
        elif isinstance(action, Message):
            self.state.last_message = action.message
            self.state.nb_messages_sent[current_agent] += 1

            # Move turn to other agent
            self.state.current_agent = self._other(current_agent)

            # If both agents have reached their message quota, enter split phase
            if all(
                self.state.nb_messages_sent[agent_id]
                >= self.quota_messages_per_agent_per_round
                for agent_id in self.agent_ids
            ):
                self.state.split_phase = True
            is_last_timestep_in_round = False
            done = False
            rewards = {agent_id: 0.0 for agent_id in self.agent_ids}
            info = {"type": "message"}

        else:
            # Fix: previously this case fell through with rewards/info/done
            # unbound (UnboundLocalError) whenever the current agent produced
            # no Message outside the split phase. Treat it as a no-op timestep.
            is_last_timestep_in_round = False
            done = False
            rewards = {agent_id: 0.0 for agent_id in self.agent_ids}
            info = {"type": "noop"}

        info[
            "is_last_timestep_in_round"
        ] = is_last_timestep_in_round  # Used later to group round timesteps if needed
        return done, SimulationStepLog(rewards=rewards, info=info)

    def get_obs(self):
        """Returns all agent observations in dict"""
        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}

    @abstractmethod
    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Variant hook: map both agents' splits to per-agent rewards."""
        pass

    @abstractmethod
    def get_obs_agent(self, agent_id):
        """Variant hook: build the observation for a single agent."""
        pass

    def get_state(self):
        """Return the full (mutable) simulation state."""
        return self.state

    def get_safe_copy(self):
        """Return a safe copy of the simulation."""
        # Shallow-copy the simulation, deep-copy only the mutable state.
        simulation_copy = copy.copy(self)
        simulation_copy.state = copy.deepcopy(self.state)
        return simulation_copy

    @abstractmethod
    def reset(self) -> dict[AgentId, NegotiationObs]:
        """Variant hook: initialize state and return initial observations."""
        pass
src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict
4
+
5
+ from mllm.markov_games.rollout_tree import SimulationStepLog
6
+
7
+
8
def split_greed(sl: SimulationStepLog) -> Dict[str, float] | None:
    """Fraction of the coin pot each agent claimed for itself.

    Only meaningful on the final timestep of a round; returns None otherwise.
    Agents whose split entry cannot be read are silently skipped.
    """
    info = sl.info or {}
    if not info.get("is_last_timestep_in_round"):
        return None
    total_coins = float((info.get("quantities") or {}).get("coins", 1.0)) or 1.0
    greed: Dict[str, float] = {}
    for agent_id, split in (info.get("splits") or {}).items():
        try:
            claimed = float(split["items_given_to_self"]["coins"])
        except Exception:
            continue
        greed[str(agent_id)] = claimed / total_coins
    return greed
22
+
23
+
24
def split_efficiency(sl: SimulationStepLog) -> Dict[str, float] | None:
    """Achieved reward as a fraction of the best attainable reward.

    The optimum is the whole pot going to the highest-value agent. Only
    meaningful on a round's final timestep; returns None when the metric
    cannot be computed.
    """
    info = sl.info or {}
    if not info.get("is_last_timestep_in_round"):
        return None
    total_coins = float((info.get("quantities") or {}).get("coins", 1.0)) or 1.0
    per_coin_values = info.get("values") or {}
    if not per_coin_values:
        return None
    try:
        best_value = max(float(v) for v in per_coin_values.values())
    except Exception:
        return None
    if not total_coins or not best_value:
        return None
    realized = sum(float(r) for r in (sl.rewards or {}).values())
    attainable = total_coins * best_value
    if not attainable:
        return None
    # Efficiency is a global metric; emit same value for a special key "all"
    return {"all": realized / attainable}
src_code_for_reproducibility/markov_games/negotiation/no_press_nego_agent.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Any, Dict, List, Tuple
2
+
3
+ from mllm.markov_games.negotiation.nego_agent import (
4
+ NegotiationAgent,
5
+ NegotiationAgentState,
6
+ )
7
+ from mllm.markov_games.negotiation.nego_simulation import Split
8
+ from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs
9
+ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
10
+
11
+
12
class NoPressAgent(NegotiationAgent):
    """Agent for the no-communication ("no press") negotiation variant.

    Messaging is disabled: the message regex only matches the empty string,
    so the agent effectively only ever submits split proposals.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # No communication in this variant
        # NOTE(review): the Protocol list below numbers its items 1, 4, 5, 6, 7
        # — looks copied from the chat variant with the messaging items removed;
        # confirm the prompt text is intended as-is.
        self.intro_prompt = (
            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
            "Setup:\n"
            "1. The game consists of multiple independent rounds.\n"
            "2. In each round, there are 10 coins to split between the two agents.\n"
            "3. Each round, both agents are randomly assigned a value of either 1 or 10 per coin.\n"
            "4. You can observe values of both agents.\n"
            "5. Because assignments are random, both agents are equally likely to have same expected per-coin value.\n"
            "\n"
            "Protocol:\n"
            "1. Both agents simultaneously propose how many coins they keep.\n"
            "4. If the total sum of proposals is less than or equal to 10, both agents receive their proposals.\n"
            "5. If the total sum of proposals exceeds 10, the coins are allocated proportionally.\n"
            "6. Your points for the round = (coins you receive) x (your per-coin value for that round). \n"
            "7. The points are accumulated across rounds.\n"
            "Your goal: {goal}\n"
        )
        self.new_round_prompt = "In this round, your per-coin value is {value} and {other_agent}'s per-coin value is {other_value}."
        self.last_round_prompt = "In the last round, your per-coin value was {last_value_agent} and {other_agent}'s per-coin value was {last_value_coagent}.\nYou proposed {last_split_agent} coins and earned {last_points_agent} points, while {other_agent} proposed {last_split_coagent} coins and earned {last_points_coagent} points."
        self.send_split_prompt = "Respond with <coins_to_self> X </coins_to_self> where X is the number of coins you propose for yourself, between 0 and 10 inclusive."

    def get_message_regex(self, observation: NoPressObs) -> str:
        # Only the empty string matches: messaging is disabled in this variant.
        return r"^$"  # No messages allowed

    def get_split_regex(self, observation: NoPressObs) -> str:
        # Accepts <coins_to_self> N </coins_to_self> with N in 0..10.
        return r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>"

    def get_split_action(self, policy_output: str, observation: NoPressObs) -> Split:
        """Parse the constrained policy output into a Split action."""
        import re as _re

        m = _re.search(r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>", policy_output)
        # Fallback: interpret the raw output as an integer if the tag is absent.
        coins_int = int(m.group(1)) if m else int(policy_output)
        return Split(items_given_to_self={"coins": coins_int})
src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from dataclasses import dataclass
3
+ from typing import Any, Dict, List, Tuple
4
+
5
+ from mllm.markov_games.negotiation.nego_simulation import (
6
+ NegotiationObs,
7
+ NegotiationSimulation,
8
+ NegotiationState,
9
+ Split,
10
+ compute_tas_style_rewards,
11
+ )
12
+
13
+ AgentId = str
14
+
15
+
16
@dataclass
class NoPressState(NegotiationState):
    """State for the no-press variant; adds nothing to the base state."""

    pass
19
+
20
+
21
@dataclass
class NoPressObs(NegotiationObs):
    """Observation for the no-press variant, where values are public."""

    # The other agent's per-coin value (observable in this variant).
    other_value: float
24
+
25
+
26
class NoPressSimulation(NegotiationSimulation):
    """No-communication negotiation: every timestep is a split phase.

    Both agents see both per-coin values (1 vs 10). With ``deterministic=True``
    the round's opening agent always gets the low value; otherwise the low/high
    assignment is sampled each round.
    """

    def __init__(
        self,
        deterministic: bool,
        *args,
        **kwargs,
    ):
        # Must be set before super().__init__, which calls reset().
        self.deterministic = deterministic
        super().__init__(*args, **kwargs)

    def _sample_values(self) -> Dict[AgentId, float]:
        """Randomly give one agent value 1 and the other value 10."""
        v = float(int(self.rng.choice([1, 10])))
        return {self.agent_ids[0]: v, self.agent_ids[1]: 10.0 if v == 1.0 else 1.0}

    def set_new_round_of_variant(self):
        """Snapshot last round's values and assign fresh ones."""
        self.state.previous_values = copy.deepcopy(self.state.values)
        self.state.quantities = {"coins": 10.0}
        if self.deterministic:
            # The agent who opens the new round gets the low value.
            self.state.values = {
                aid: 1.0 if aid == self.state.current_agent else 10.0
                for aid in self.agent_ids
            }
        else:
            self.state.values = self._sample_values()
        # No messaging in this variant: go straight to the split phase.
        self.state.split_phase = True

    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        """Log quantities, values and splits when a round resolves."""
        return {
            "quantities": copy.deepcopy(state.quantities),
            "values": copy.deepcopy(state.values),
            "splits": copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Proportional TAS-style rewards over a pot of 10 coins."""
        return compute_tas_style_rewards(
            self.agent_ids, self.state.values, splits, 10.0
        )

    def get_obs(self):
        """Return all agent observations keyed by agent id."""
        return {agent_id: self.get_obs_agent(agent_id) for agent_id in self.agent_ids}

    def get_obs_agent(self, agent_id):
        """Build one agent's observation, including last-round outcomes."""
        other_id = self._other(agent_id)
        # All previous_* fields are None before the first round resolves.
        last_value_coagent = (
            None
            if self.state.previous_values is None
            else self.state.previous_values.get(other_id)
        )
        last_points_coagent = (
            None
            if self.state.previous_points is None
            else round(self.state.previous_points.get(other_id), 1)
        )
        last_value_agent = (
            None
            if self.state.previous_values is None
            else self.state.previous_values.get(agent_id)
        )
        last_points_agent = (
            None
            if self.state.previous_points is None
            else round(self.state.previous_points.get(agent_id), 1)
        )
        last_split_coagent = None
        last_split_agent = None
        if self.state.previous_splits is not None:
            last_split_coagent = self.state.previous_splits[
                other_id
            ].items_given_to_self["coins"]
            last_split_agent = self.state.previous_splits[agent_id].items_given_to_self[
                "coins"
            ]
        obs = NoPressObs(
            round_nb=self.state.round_nb,
            last_message="",
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            current_agent=self.state.current_agent,
            other_agent=other_id,
            quantities={"coins": 10},
            item_types=self.item_types,
            value=self.state.values[agent_id],
            split_phase=self.state.split_phase,
            last_split_agent=last_split_agent,
            last_value_agent=last_value_agent,
            last_points_agent=last_points_agent,
            last_split_coagent=last_split_coagent,
            last_value_coagent=last_value_coagent,
            last_points_coagent=last_points_coagent,
            other_value=self.state.values[other_id],
        )
        return obs

    def reset(self):
        """Initialize state and return the initial observations."""
        start_agent = self.agent_ids[self._starting_agent_index]
        if self.deterministic:
            # Opening agent gets the low value deterministically.
            values = {
                aid: 1.0 if aid == start_agent else 10.0 for aid in self.agent_ids
            }
        else:
            values = self._sample_values()
        self.state = NoPressState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities={"coins": 10.0},
            values=values,
            previous_values=None,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            split_phase=True,
            previous_splits=None,
            previous_points=None,
        )
        return self.get_obs()
src_code_for_reproducibility/markov_games/negotiation/tas_agent.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
2
+ from mllm.markov_games.negotiation.nego_simulation import Split
3
+ from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitObs
4
+
5
+
6
class TrustAndSplitAgent(NegotiationAgent):
    """Agent for the Trust-and-Split variant with private per-coin values.

    Agents chat (bounded message length) to communicate their private values,
    then simultaneously submit split proposals over 10 coins.
    """

    def __init__(self, num_message_chars, *args, **kwargs):
        # Maximum characters allowed per chat message; must be set before
        # super().__init__ so prompt construction below can use it.
        self.num_message_chars = num_message_chars
        super().__init__(*args, **kwargs)
        self.intro_prompt = (
            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
            "Setup:\n"
            "1. The game has multiple independent rounds.\n"
            "2. In each round, there are 10 coins to split between the two agents.\n"
            "3. Both agents are assigned a private per-coin value between 1 and 20 (inclusive) in each round.\n"
            "4. Because assignments are random, both agents are equally likely to have same expected per-coin value.\n"
            "\n"
            "Protocol:\n"
            "1. At the start of the round, one agent begins the conversation. The starting role alternates each round.\n"
            "2. Agents exchange a short chat ({quota_messages_per_agent_per_round} messages per round per agent) to negotiate how to split the 10 coins.\n"
            "   - Use this chat to communicate your private per-coin value to make informed proposals.\n"
            "3. After the chat, both agents simultaneously propose how many coins they keep.\n"
            "4. If the total sum of proposals is less than or equal to 10, both agents receive their proposals.\n"
            "5. If the total sum of proposals exceeds 10, the coins are allocated proportionally.\n"
            "6. Your points for the round = (coins you receive) x (your per-coin value for that round). \n"
            "7. The points are accumulated across rounds.\n"
            "Your goal: {goal}\n"
        )
        self.new_round_prompt = "A new round begins\n" "Your per-coin value is {value}."
        self.last_round_prompt = (
            "Round summary:\n"
            "  - Your value per coin: {last_value_agent}\n"
            "  - {other_agent}'s value per coin: {last_value_coagent}\n"
            "  - You proposed: {last_split_agent} coins\n"
            "  - You earned: {last_points_agent} points\n"
            "  - {other_agent} proposed: {last_split_coagent} coins\n"
            "  - {other_agent} earned: {last_points_coagent} points\n"
            "  - Round complete.\n"
        )
        self.send_split_prompt = (
            "Submit your proposal\n"
            "Respond with <coins_to_self> x </coins_to_self> where x is an integer in [0, 10]."
        )
        self.wait_for_message_prompt = "Wait for {other_agent} to send a message..."
        self.last_message_prompt = "{other_agent} said: {last_message}"
        self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."

    def get_message_regex(self, observation: TrustAndSplitObs) -> str:
        # Message body of up to num_message_chars characters inside the tag.
        return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"

    def get_split_regex(self, observation: TrustAndSplitObs) -> str:
        # Accepts <coins_to_self> N </coins_to_self> with N in 0..10.
        return r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>"

    def get_split_action(
        self, policy_output: str, observation: TrustAndSplitObs
    ) -> Split:
        """Parse the constrained policy output into a Split action."""
        import re as _re

        m = _re.search(r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>", policy_output)
        # Fallback: interpret the raw output as an integer if the tag is absent.
        coins_int = int(m.group(1)) if m else int(policy_output)
        return Split(items_given_to_self={"coins": coins_int})
src_code_for_reproducibility/markov_games/negotiation/tas_rps_agent.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ from collections.abc import Callable
3
+ from dataclasses import dataclass
4
+ from typing import Any, Dict, List, Tuple
5
+
6
+ from mllm.markov_games.agent import Agent
7
+ from mllm.markov_games.negotiation.nego_agent import (
8
+ Message,
9
+ NegotiationAgent,
10
+ NegotiationAgentState,
11
+ Split,
12
+ )
13
+ from mllm.markov_games.negotiation.tas_rps_simulation import TrustAndSplitRPSObs
14
+ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
15
+
16
+
17
class TrustAndSplitRPSAgent(NegotiationAgent):
    """Agent for the Trust-and-Split rock-paper-scissors variant.

    Each agent sees only its own RPS hand; the winning hand implies a per-coin
    value of 10 and the losing hand a value of 1. Agents chat to reveal hands,
    then simultaneously submit split proposals over 10 coins.
    """

    def __init__(
        self,
        num_message_chars: int,
        *args,
        **kwargs,
    ):
        # Maximum characters allowed per chat message; must be set before
        # super().__init__ so prompt construction below can use it.
        self.num_message_chars = num_message_chars
        super().__init__(*args, **kwargs)
        self.intro_prompt = (
            "Welcome to an iterated game. You are {agent}. The other agent is {other_agent}.\n"
            "\n"
            "Setup:\n"
            "1. The game has multiple independent rounds.\n"
            "2. In each round, there are 10 coins to split between the two agents.\n"
            "3. Each agent's per-coin value for that round is determined as follows:\n"
            "   - Both agents are randomly assigned a rock, paper or scissors hands\n"
            "   - Rock has the upper hand over scissors, scissors has the upper hand over paper and paper has the upper hand over rock.\n"
            "   - The agent with the upper hand has a per-coin value of 10.\n"
            "   - The agent with the lower hand has a per-coin value of 1.\n"
            "4. You only see your own hand, but you may communicate it in messages and infer your value based on the other agent's hand.\n"
            "5. Over many rounds both agents are equally likely to have the upper and lower hand.\n"
            "\n"
            "Protocol:\n"
            "1. At the start of the round, one agent begins the conversation. The starting role alternates each round.\n"
            "2. Agents exchange a short chat ({quota_messages_per_agent_per_round} messages per round per agent) to negotiate how to split the 10 coins.\n"
            "   - Use this chat to communicate your hand so that both agents can determine their per-coin values.\n"
            "3. After the chat, both agents simultaneously propose how many coins they keep.\n"
            "4. If the total sum of proposals is less than or equal to 10, both agents receive their proposals.\n"
            "5. If the total sum of proposals exceeds 10, the coins are allocated proportionally.\n"
            "6. Your points for the round = (coins you receive) x (your per-coin value for that round). \n"
            "7. The points are accumulated across rounds.\n"
            "Your goal: {goal}\n"
        )
        self.new_round_prompt = "A new round begins\n" "Your hand is {hand}."
        self.last_round_prompt = (
            "Round summary:\n"
            "  - Your hand: {last_hand_agent}\n"
            "  - {other_agent}'s hand: {last_hand_coagent}\n"
            "  - Your value per coin: {last_value_agent}\n"
            "  - {other_agent}'s value per coin: {last_value_coagent}\n"
            "  - You proposed: {last_split_agent} coins\n"
            "  - You earned: {last_points_agent} points\n"
            "  - {other_agent} proposed: {last_split_coagent} coins\n"
            "  - {other_agent} earned: {last_points_coagent} points\n"
            "  - Round complete.\n"
        )
        self.send_split_prompt = (
            "Submit your proposal\n"
            "Respond with <coins_to_self> x </coins_to_self> where x is an integer in [0, 10]."
        )
        self.wait_for_message_prompt = "Wait for {other_agent} to send a message..."
        self.last_message_prompt = "{other_agent} said: {last_message}"
        self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."

    def get_message_regex(self, observation: TrustAndSplitRPSObs) -> str:
        # Message body of up to num_message_chars characters inside the tag.
        return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"

    def get_split_regex(self, observation: TrustAndSplitRPSObs) -> str:
        # Accepts <coins_to_self> N </coins_to_self> with N in 0..10.
        return r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>"

    def get_split_action(
        self, policy_output: str, observation: TrustAndSplitRPSObs
    ) -> Split:
        """Parse the constrained policy output into a Split action."""
        import re as _re

        m = _re.search(r"<coins_to_self> ?(10|[0-9]) ?</coins_to_self>", policy_output)
        # Fallback: interpret the raw output as an integer if the tag is absent.
        coins_int = int(m.group(1)) if m else int(policy_output)
        return Split(items_given_to_self={"coins": coins_int})
src_code_for_reproducibility/markov_games/negotiation/tas_rps_simulation.py ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Trust-and-Split simulation.
3
+
4
+ This environment models a simple bargaining game over 10 coins with messaging.
5
+ Agents are assigned rock/paper/scissors hands, with the winner getting value 10 per coin
6
+ and the loser getting value 1 per coin. Agents alternate sending messages for a fixed
7
+ number of turns per round and then each submits a split proposal indicating how many
8
+ coins they keep for themselves. Rewards are proportional if the proposed totals exceed 10.
9
+ """
10
+
11
+ import copy
12
+ from dataclasses import dataclass
13
+ from typing import Any, Dict, List, Literal, Tuple
14
+
15
+ from numpy.random import default_rng
16
+
17
+ from mllm.markov_games.negotiation.nego_simulation import (
18
+ Message,
19
+ NegotiationObs,
20
+ NegotiationSimulation,
21
+ NegotiationState,
22
+ Split,
23
+ compute_tas_style_rewards,
24
+ )
25
+ from mllm.markov_games.rollout_tree import SimulationStepLog
26
+
27
+ AgentId = str
28
+
29
+
30
def _get_rps_winner(
    hand1: Literal["rock", "paper", "scissors"],
    hand2: Literal["rock", "paper", "scissors"],
) -> Literal["rock", "paper", "scissors"]:
    """Return the winning hand of a rock-paper-scissors match-up.

    Raises ValueError on a tie (ties never occur in this game, since the two
    hands are sampled without replacement).
    """
    if hand1 == hand2:
        raise ValueError("Hands should be different")
    # Map each hand to the hand it defeats.
    beats = {"rock": "scissors", "paper": "rock", "scissors": "paper"}
    return hand1 if beats[hand1] == hand2 else hand2
45
+
46
+
47
@dataclass
class TrustAndSplitRPSState(NegotiationState):
    """State for the RPS variant: adds current and previous hands."""

    hands: Dict[
        AgentId, Literal["rock", "paper", "scissors"]
    ]  # rock, paper, or scissors
    # Hands of the previous round (None before the first round resolves).
    previous_hands: Dict[AgentId, Literal["rock", "paper", "scissors"]] | None
53
+
54
+
55
@dataclass
class TrustAndSplitRPSObs(NegotiationObs):
    """Observation for the RPS variant: own hand plus last round's hands."""

    # This agent's hand for the current round (hands are private).
    hand: Literal["rock", "paper", "scissors"]
    # Hands from the previous round, revealed once it resolves.
    last_hand_agent: Literal["rock", "paper", "scissors"] | None
    last_hand_coagent: Literal["rock", "paper", "scissors"] | None
60
+
61
+
62
class TrustAndSplitRPSSimulation(NegotiationSimulation):
    """Trust-and-Split with rock-paper-scissors-determined values.

    Each round both agents get distinct RPS hands; the winning hand's owner
    values coins at 10, the loser at 1. Agents chat (message phase) and then
    simultaneously submit split proposals over 10 coins.
    """

    def __init__(
        self,
        *args,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)

    def _sample_hands_and_values(
        self,
    ) -> Tuple[Dict[AgentId, str], Dict[AgentId, float]]:
        """Sample two distinct hands and derive per-coin values from them."""
        # Assign different hands to each agent
        hands = ["rock", "paper", "scissors"]
        # replace=False guarantees distinct hands, so there is never a tie.
        hand1, hand2 = self.rng.choice(hands, size=2, replace=False)

        agent_hands = {self.agent_ids[0]: hand1, self.agent_ids[1]: hand2}

        # Determine winner and assign values
        winner = _get_rps_winner(hand1, hand2)
        values = {}
        for agent_id in self.agent_ids:
            if agent_hands[agent_id] == winner:
                values[agent_id] = 10.0  # Winner gets value 10
            else:
                values[agent_id] = 1.0  # Loser gets value 1

        return agent_hands, values

    def set_new_round_of_variant(self):
        """Snapshot last round's hands/values and sample fresh ones."""
        self.state.previous_values = copy.deepcopy(self.state.values)
        self.state.previous_hands = copy.deepcopy(self.state.hands)
        new_hands, new_values = self._sample_hands_and_values()
        self.state.hands = new_hands
        self.state.values = new_values
        # Quantities are constant in TAS
        self.state.quantities = {"coins": 10}
        # New rounds start in the message phase.
        self.state.split_phase = False

    def get_info_of_variant(
        self, state: NegotiationState, actions: Dict[AgentId, Any]
    ) -> Dict[str, Any]:
        """Log hands, values and splits (current and previous) at round end."""
        return {
            "quantities": copy.deepcopy(state.quantities),
            "hands": copy.deepcopy(state.hands),
            "values": copy.deepcopy(state.values),
            "previous_hands": copy.deepcopy(state.previous_hands),
            "previous_values": copy.deepcopy(state.previous_values),
            "splits": copy.deepcopy(state.splits),
        }

    def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
        """Proportional TAS-style rewards over a pot of 10 coins."""
        return compute_tas_style_rewards(self.agent_ids, self.state.values, splits, 10)

    def get_obs_agent(self, agent_id):
        """Returns observation for agent_id"""
        other_id = self._other(agent_id)
        # All previous_* fields are None before the first round resolves.
        last_value_coagent = (
            None
            if self.state.previous_values is None
            else self.state.previous_values.get(other_id)
        )
        last_hand_coagent = (
            None
            if self.state.previous_hands is None
            else self.state.previous_hands.get(other_id)
        )
        last_points_coagent = (
            None
            if self.state.previous_points is None
            else round(self.state.previous_points.get(other_id), 1)
        )
        last_value_agent = (
            None
            if self.state.previous_values is None
            else self.state.previous_values.get(agent_id)
        )
        last_hand_agent = (
            None
            if self.state.previous_hands is None
            else self.state.previous_hands.get(agent_id)
        )
        last_points_agent = (
            None
            if self.state.previous_points is None
            else round(self.state.previous_points.get(agent_id), 1)
        )
        last_split_coagent = None
        last_split_agent = None
        if self.state.previous_splits is not None:
            last_split_coagent = self.state.previous_splits[
                other_id
            ].items_given_to_self["coins"]
            last_split_agent = self.state.previous_splits[agent_id].items_given_to_self[
                "coins"
            ]
        obs = TrustAndSplitRPSObs(
            round_nb=self.state.round_nb,
            last_message=self.state.last_message,
            quota_messages_per_agent_per_round=self.quota_messages_per_agent_per_round,
            current_agent=self.state.current_agent,
            other_agent=other_id,
            quantities={"coins": 10},
            item_types=self.item_types,
            value=self.state.values[agent_id],
            split_phase=self.state.split_phase,
            last_split_agent=last_split_agent,
            last_value_agent=last_value_agent,
            last_points_agent=last_points_agent,
            last_split_coagent=last_split_coagent,
            last_value_coagent=last_value_coagent,
            last_points_coagent=last_points_coagent,
            hand=self.state.hands[agent_id],
            last_hand_coagent=last_hand_coagent,
            last_hand_agent=last_hand_agent,
        )
        return obs

    def get_state(self):
        """Return the full (mutable) simulation state."""
        return self.state

    def get_safe_copy(self):
        """Return a safe copy of the simulation."""
        # Shallow-copy the simulation, deep-copy only the mutable state.
        simulation_copy = copy.copy(self)
        simulation_copy.state = copy.deepcopy(self.state)
        return simulation_copy

    def reset(self):
        """Initialize and return initial observations"""
        # Decide starting agent alternating across resets for determinism
        start_agent = self.agent_ids[self._starting_agent_index]
        hands, values = self._sample_hands_and_values()
        self.state = TrustAndSplitRPSState(
            round_nb=0,
            last_message="",
            current_agent=start_agent,
            quantities={"coins": 10},
            values=values,
            splits={aid: None for aid in self.agent_ids},
            nb_messages_sent={aid: 0 for aid in self.agent_ids},
            previous_values=None,
            previous_splits=None,
            previous_points=None,
            split_phase=False,
            hands=hands,
            previous_hands=None,
        )
        return self.get_obs()