Add files using upload-large-folder tool
Browse files- seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors +1 -1
- seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors +1 -1
- seed_42/agent_trainer/policy_optimizer_state.pt +1 -1
- seed_42/agent_trainer/trainer_annealing_state.pkl +1 -1
- seed_42/random_state.pkl +1 -1
- src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-312.pyc +0 -0
- src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc +0 -0
- src_code_for_reproducibility/markov_games/negotiation/README.md +3 -16
- src_code_for_reproducibility/markov_games/negotiation/dond_agent.py +39 -25
- src_code_for_reproducibility/markov_games/negotiation/nego_agent.py +19 -0
- src_code_for_reproducibility/markov_games/negotiation/nego_hard_coded_policies.py +10 -4
- src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py +14 -0
- src_code_for_reproducibility/markov_games/negotiation/tas_agent.py +10 -0
seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 323014168
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:95c2062df6f7a10adbd244c9b6e75a8153c54f776c3619732a3fa0c766be3166
|
| 3 |
size 323014168
|
seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/critic_adapter/adapter_model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 323014168
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a2810cb0ec24072033412e5ff181e51188612e95b9f1685f9177794aa66a8bc0
|
| 3 |
size 323014168
|
seed_42/agent_trainer/policy_optimizer_state.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 646269121
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c9d017d2f98e71c9ed36613fc1d2e7e8daeef9aa62ee96e0e838ec293e469025
|
| 3 |
size 646269121
|
seed_42/agent_trainer/trainer_annealing_state.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 104
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5468e667c6b74a7cb34fc016988230e631fc520b2df33e5a5c71068b59689f3e
|
| 3 |
size 104
|
seed_42/random_state.pkl
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 12254
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:03db597030fc1fe5f071eb41114416ad894895b73935ddcee0fc06e622471c8a
|
| 3 |
size 12254
|
src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-312.pyc
CHANGED
|
Binary files a/src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-312.pyc and b/src_code_for_reproducibility/markov_games/__pycache__/agent.cpython-312.pyc differ
|
|
|
src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc
CHANGED
|
Binary files a/src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc and b/src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc differ
|
|
|
src_code_for_reproducibility/markov_games/negotiation/README.md
CHANGED
|
@@ -9,29 +9,16 @@ Proportional splitting is used when the two proposals exceed the available total
|
|
| 9 |
### Variants (in increasing difficulty)
|
| 10 |
|
| 11 |
- No‑Press Split
|
| 12 |
-
-
|
| 13 |
-
-
|
|
|
|
| 14 |
- Motivation: mirrors no‑communication setups (e.g., Advantage Alignment) while keeping the split decision nontrivial.
|
| 15 |
-
- Deterministic Mode: values are fixed and public: one agent values coins at 10, the other at 1 (alternates each round).
|
| 16 |
-
- Stochastic Mode: values are random and uncorrelated.
|
| 17 |
|
| 18 |
- Trust-and-Split RPS (TAS-RPS)
|
| 19 |
- Single item type (coins)
|
| 20 |
- Each round, a rock–paper–scissors hand draw creates a strong asymmetry: the winner’s per-coin value is 10, the loser’s is 1.
|
| 21 |
- Each agent initially sees only their own hand and must communicate to coordinate an optimal split.
|
| 22 |
- Motivation: enforce large value disparity so one’s own value reveals little about the other’s (avoiding ceiling effects) and incentivize meaningful communication.
|
| 23 |
-
|
| 24 |
-
- Trust-and-Split (TAS)
|
| 25 |
-
- Single item type (coins); each round, each agent’s per-coin value is independently sampled in a broad range (e.g., 1–20).
|
| 26 |
-
- Each agent observes only their own value; they may use short messages to share and negotiate.
|
| 27 |
-
- Motivation: a simple blend that tests whether agents learn to exchange private information and coordinate proportional, value-aware splits.
|
| 28 |
-
|
| 29 |
-
- Deal-or-No-Deal (DOND)
|
| 30 |
-
- Introduced in [Deal or No Deal? End-to-End Learning for Negotiation Dialogues](https://arxiv.org/pdf/1706.05125)
|
| 31 |
-
- Multiple item types (typically "books", "hats" and "balls") with limited stocks; each agent has its own per-type values.
|
| 32 |
-
- A deal pays out only if both proposals exactly agree and respect the stock; otherwise no deal (zero reward) that round.
|
| 33 |
-
- Motivation: a known benchmark closer to real-world bargaining, where both parties must explicitly agree.
|
| 34 |
-
|
| 35 |
|
| 36 |
|
| 37 |
|
|
|
|
| 9 |
### Variants (in increasing difficulty)
|
| 10 |
|
| 11 |
- No‑Press Split
|
| 12 |
+
- Multiple item types (e.g., hats, balls, books)
|
| 13 |
+
- The item values for each agent are public.
|
| 14 |
+
- No communication; agents go straight to making split proposals.
|
| 15 |
- Motivation: mirrors no‑communication setups (e.g., Advantage Alignment) while keeping the split decision nontrivial.
|
|
|
|
|
|
|
| 16 |
|
| 17 |
- Trust-and-Split RPS (TAS-RPS)
|
| 18 |
- Single item type (coins)
|
| 19 |
- Each round, a rock–paper–scissors hand draw creates a strong asymmetry: the winner’s per-coin value is 10, the loser’s is 1.
|
| 20 |
- Each agent initially sees only their own hand and must communicate to coordinate an optimal split.
|
| 21 |
- Motivation: enforce large value disparity so one’s own value reveals little about the other’s (avoiding ceiling effects) and incentivize meaningful communication.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
|
| 24 |
|
src_code_for_reproducibility/markov_games/negotiation/dond_agent.py
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import copy
|
| 2 |
import re
|
| 3 |
from collections.abc import Callable
|
|
@@ -5,14 +10,18 @@ from dataclasses import dataclass
|
|
| 5 |
from typing import Any, Dict, List, Tuple
|
| 6 |
|
| 7 |
from mllm.markov_games.agent import Agent
|
| 8 |
-
from mllm.markov_games.
|
| 9 |
-
from mllm.markov_games.negotiation.
|
| 10 |
-
|
|
|
|
| 11 |
)
|
| 12 |
from mllm.markov_games.negotiation.nego_simulation import Split
|
| 13 |
-
from mllm.markov_games.
|
|
|
|
| 14 |
|
| 15 |
class DealNoDealAgent(NegotiationAgent):
|
|
|
|
|
|
|
| 16 |
def __init__(
|
| 17 |
self,
|
| 18 |
*args,
|
|
@@ -20,42 +29,47 @@ class DealNoDealAgent(NegotiationAgent):
|
|
| 20 |
):
|
| 21 |
super().__init__(*args, **kwargs)
|
| 22 |
self.intro_prompt = (
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
self.new_round_prompt = (
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
def get_message_regex(self, observation: DealNoDealObs) -> str:
|
|
|
|
| 37 |
return r"<message>[\s\S]{0,400}</message>"
|
| 38 |
-
|
| 39 |
def get_split_regex(self, observation: DealNoDealObs) -> str:
|
|
|
|
| 40 |
parts = []
|
| 41 |
for t in observation.item_types:
|
| 42 |
s = int(observation.quantities.get(t, 0))
|
| 43 |
allowed = "|".join(str(k) for k in range(0, s + 1))
|
| 44 |
rng = f"({allowed})"
|
| 45 |
-
parts.append(
|
| 46 |
items_block = "".join(parts)
|
| 47 |
-
return
|
| 48 |
-
|
| 49 |
def get_split_action(self, policy_output: str, observation: DealNoDealObs) -> Split:
|
|
|
|
| 50 |
import re as _re
|
|
|
|
| 51 |
allocations: Dict[str, int] = {}
|
| 52 |
for t in observation.item_types:
|
| 53 |
-
m = _re.search(
|
| 54 |
if m:
|
| 55 |
allocations[t] = int(m.group(1))
|
| 56 |
else:
|
| 57 |
allocations[t] = 0
|
| 58 |
return Split(items_given_to_self=allocations)
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/negotiation/dond_agent.py
|
| 3 |
+
Summary: Agent implementation for Deal-or-No-Deal style negotiations.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
import copy
|
| 7 |
import re
|
| 8 |
from collections.abc import Callable
|
|
|
|
| 10 |
from typing import Any, Dict, List, Tuple
|
| 11 |
|
| 12 |
from mllm.markov_games.agent import Agent
|
| 13 |
+
from mllm.markov_games.negotiation.dond_simulation import DealNoDealObs
|
| 14 |
+
from mllm.markov_games.negotiation.nego_agent import (
|
| 15 |
+
NegotiationAgent,
|
| 16 |
+
NegotiationAgentState,
|
| 17 |
)
|
| 18 |
from mllm.markov_games.negotiation.nego_simulation import Split
|
| 19 |
+
from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
|
| 20 |
+
|
| 21 |
|
| 22 |
class DealNoDealAgent(NegotiationAgent):
|
| 23 |
+
"""NegotiationAgent tailored to the Deal-or-No-Deal stock/value revelation rules."""
|
| 24 |
+
|
| 25 |
def __init__(
|
| 26 |
self,
|
| 27 |
*args,
|
|
|
|
| 29 |
):
|
| 30 |
super().__init__(*args, **kwargs)
|
| 31 |
self.intro_prompt = (
|
| 32 |
+
"You are {agent_id}. You are playing an iterated game. "
|
| 33 |
+
"At each round, you and other agent will try to distribute among yourselves items of types {item_types}. "
|
| 34 |
+
"You only know how much you value each item type, but not the other agent's values. "
|
| 35 |
+
"You can communicate with the other agent by sending up to {quota_messages_per_agent_per_round} short messages per round. "
|
| 36 |
+
"Each round, after exchanging messages, you and the other agent will submit a private proposal. "
|
| 37 |
+
"A deal is accepted only if both proposals match exactly and are within stock; otherwise no deal (0 points for both at that round). "
|
| 38 |
+
"The values of the items of the other agent at the previous round are revealed to you after each round. "
|
| 39 |
+
"Your goal is: {goal}."
|
| 40 |
+
)
|
| 41 |
+
self.new_round_prompt = (
|
| 42 |
+
"New round {round_nb}. Items: {stock}. Your values: {values}. "
|
| 43 |
+
)
|
| 44 |
+
self.last_round_prompt = (
|
| 45 |
+
"Last round, other agent's values: {previous_values_coagent}. "
|
| 46 |
+
)
|
| 47 |
+
self.send_split_prompt = "Respond with <split>...</split> where you propose how many items of each type you want to keep."
|
| 48 |
+
|
| 49 |
def get_message_regex(self, observation: DealNoDealObs) -> str:
|
| 50 |
+
"""Allow short XML messages (at most 400 characters of content) between proposal phases."""
|
| 51 |
return r"<message>[\s\S]{0,400}</message>"
|
| 52 |
+
|
| 53 |
def get_split_regex(self, observation: DealNoDealObs) -> str:
|
| 54 |
+
"""Constrain split proposals to per-item XML tags bounded by the current stock."""
|
| 55 |
parts = []
|
| 56 |
for t in observation.item_types:
|
| 57 |
s = int(observation.quantities.get(t, 0))
|
| 58 |
allowed = "|".join(str(k) for k in range(0, s + 1))
|
| 59 |
rng = f"({allowed})"
|
| 60 |
+
parts.append(rf"<{t}>{rng}</{t}>")
|
| 61 |
items_block = "".join(parts)
|
| 62 |
+
return rf"(<split>{items_block}</split>)"
|
| 63 |
+
|
| 64 |
def get_split_action(self, policy_output: str, observation: DealNoDealObs) -> Split:
|
| 65 |
+
"""Convert the XML proposal into a Split dataclass understood by the simulator."""
|
| 66 |
import re as _re
|
| 67 |
+
|
| 68 |
allocations: Dict[str, int] = {}
|
| 69 |
for t in observation.item_types:
|
| 70 |
+
m = _re.search(rf"<{t}>([0-9]+)</{t}>", policy_output)
|
| 71 |
if m:
|
| 72 |
allocations[t] = int(m.group(1))
|
| 73 |
else:
|
| 74 |
allocations[t] = 0
|
| 75 |
return Split(items_given_to_self=allocations)
|
|
|
|
|
|
|
|
|
src_code_for_reproducibility/markov_games/negotiation/nego_agent.py
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import copy
|
| 2 |
from abc import abstractmethod
|
| 3 |
from collections.abc import Callable
|
|
@@ -13,6 +18,8 @@ from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
|
|
| 13 |
|
| 14 |
@dataclass
|
| 15 |
class NegotiationAgentState:
|
|
|
|
|
|
|
| 16 |
round_nb: int
|
| 17 |
nb_messages_sent_this_round: int
|
| 18 |
chat_counter: int
|
|
@@ -20,6 +27,8 @@ class NegotiationAgentState:
|
|
| 20 |
|
| 21 |
|
| 22 |
class NegotiationAgent(Agent):
|
|
|
|
|
|
|
| 23 |
def __init__(
|
| 24 |
self,
|
| 25 |
seed: int,
|
|
@@ -61,19 +70,29 @@ class NegotiationAgent(Agent):
|
|
| 61 |
|
| 62 |
@abstractmethod
|
| 63 |
def get_message_regex(self, observation: NegotiationObs) -> str:
|
|
|
|
| 64 |
pass
|
| 65 |
|
| 66 |
@abstractmethod
|
| 67 |
def get_split_regex(self, observation: NegotiationObs) -> str:
|
|
|
|
| 68 |
pass
|
| 69 |
|
| 70 |
@abstractmethod
|
| 71 |
def get_split_action(
|
| 72 |
self, policy_output: str, observation: NegotiationObs
|
| 73 |
) -> Split:
|
|
|
|
| 74 |
pass
|
| 75 |
|
| 76 |
async def act(self, observation: NegotiationObs) -> Tuple[Any, AgentActLog]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
def dict_to_str(d: dict) -> str:
|
| 78 |
return ", ".join(f"{v} {k}" for k, v in d.items())
|
| 79 |
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/negotiation/nego_agent.py
|
| 3 |
+
Summary: General-purpose negotiation agent coordinating prompts and actions.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
import copy
|
| 7 |
from abc import abstractmethod
|
| 8 |
from collections.abc import Callable
|
|
|
|
| 18 |
|
| 19 |
@dataclass
|
| 20 |
class NegotiationAgentState:
|
| 21 |
+
"""Lightweight container tracking round progression and message history."""
|
| 22 |
+
|
| 23 |
round_nb: int
|
| 24 |
nb_messages_sent_this_round: int
|
| 25 |
chat_counter: int
|
|
|
|
| 27 |
|
| 28 |
|
| 29 |
class NegotiationAgent(Agent):
|
| 30 |
+
"""Base agent that manages prompt scaffolding and regex validation for variants."""
|
| 31 |
+
|
| 32 |
def __init__(
|
| 33 |
self,
|
| 34 |
seed: int,
|
|
|
|
| 70 |
|
| 71 |
@abstractmethod
|
| 72 |
def get_message_regex(self, observation: NegotiationObs) -> str:
|
| 73 |
+
"""Return the regex that outgoing chat messages must satisfy."""
|
| 74 |
pass
|
| 75 |
|
| 76 |
@abstractmethod
|
| 77 |
def get_split_regex(self, observation: NegotiationObs) -> str:
|
| 78 |
+
"""Return the regex that final split proposals must satisfy."""
|
| 79 |
pass
|
| 80 |
|
| 81 |
@abstractmethod
|
| 82 |
def get_split_action(
|
| 83 |
self, policy_output: str, observation: NegotiationObs
|
| 84 |
) -> Split:
|
| 85 |
+
"""Convert raw LLM output into the ``Split`` structure required by simulations."""
|
| 86 |
pass
|
| 87 |
|
| 88 |
async def act(self, observation: NegotiationObs) -> Tuple[Any, AgentActLog]:
|
| 89 |
+
"""
|
| 90 |
+
Assemble the appropriate prompt, query the policy, and return message or split.
|
| 91 |
+
|
| 92 |
+
This handles intro text, new-round reminders, quota tracking, and post-processing
|
| 93 |
+
(regex enforcement + ChatTurn logging) so subclasses only customize prompts/regexes.
|
| 94 |
+
"""
|
| 95 |
+
|
| 96 |
def dict_to_str(d: dict) -> str:
|
| 97 |
return ", ".join(f"{v} {k}" for k, v in d.items())
|
| 98 |
|
src_code_for_reproducibility/markov_games/negotiation/nego_hard_coded_policies.py
CHANGED
|
@@ -1,11 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import asyncio
|
| 2 |
-
from typing import Optional
|
|
|
|
| 3 |
from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
|
|
|
|
| 4 |
from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent
|
| 5 |
from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs
|
| 6 |
from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
|
| 7 |
-
|
| 8 |
-
from typing import Any, Tuple
|
| 9 |
|
| 10 |
class HardCodedNegoWelfareMaximizingPolicy(NoPressAgent):
|
| 11 |
async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
|
|
@@ -40,6 +46,7 @@ class HardCodedNegoWelfareMaximizingPolicy(NoPressAgent):
|
|
| 40 |
)
|
| 41 |
return action, act_log
|
| 42 |
|
|
|
|
| 43 |
class HardCodedNegoGreedyPolicy(NoPressAgent):
|
| 44 |
async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
|
| 45 |
"""
|
|
@@ -61,4 +68,3 @@ class HardCodedNegoGreedyPolicy(NoPressAgent):
|
|
| 61 |
info=None,
|
| 62 |
)
|
| 63 |
return action, act_log
|
| 64 |
-
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/negotiation/nego_hard_coded_policies.py
|
| 3 |
+
Summary: Provides deterministic negotiation policies for testing and baselines.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
import asyncio
|
| 7 |
+
from typing import Any, Optional, Tuple
|
| 8 |
+
|
| 9 |
from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
|
| 10 |
+
from mllm.markov_games.negotiation.nego_simulation import Split
|
| 11 |
from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent
|
| 12 |
from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs
|
| 13 |
from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
|
| 14 |
+
|
|
|
|
| 15 |
|
| 16 |
class HardCodedNegoWelfareMaximizingPolicy(NoPressAgent):
|
| 17 |
async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
|
|
|
|
| 46 |
)
|
| 47 |
return action, act_log
|
| 48 |
|
| 49 |
+
|
| 50 |
class HardCodedNegoGreedyPolicy(NoPressAgent):
|
| 51 |
async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
|
| 52 |
"""
|
|
|
|
| 68 |
info=None,
|
| 69 |
)
|
| 70 |
return action, act_log
|
|
|
src_code_for_reproducibility/markov_games/negotiation/no_press_nego_simulation.py
CHANGED
|
@@ -1,3 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import copy
|
| 2 |
from collections import defaultdict
|
| 3 |
from dataclasses import dataclass
|
|
@@ -16,11 +21,15 @@ AgentId = str
|
|
| 16 |
|
| 17 |
@dataclass
|
| 18 |
class NoPressState(NegotiationState):
|
|
|
|
|
|
|
| 19 |
pass
|
| 20 |
|
| 21 |
|
| 22 |
@dataclass
|
| 23 |
class NoPressObs(NegotiationObs):
|
|
|
|
|
|
|
| 24 |
other_value: Dict[str, float]
|
| 25 |
|
| 26 |
|
|
@@ -39,6 +48,7 @@ class NoPressSimulation(NegotiationSimulation):
|
|
| 39 |
super().__init__(*args, **kwargs)
|
| 40 |
|
| 41 |
def _sample_values(self) -> Dict[AgentId, dict]:
|
|
|
|
| 42 |
values = defaultdict(dict)
|
| 43 |
if self.state is None:
|
| 44 |
item_types = self.item_types
|
|
@@ -73,9 +83,11 @@ class NoPressSimulation(NegotiationSimulation):
|
|
| 73 |
return values
|
| 74 |
|
| 75 |
def _sample_quantities(self) -> Dict[str, int]:
|
|
|
|
| 76 |
return {item.lower(): 10 for item in self.item_types}
|
| 77 |
|
| 78 |
def set_new_round_of_variant(self):
|
|
|
|
| 79 |
self.state.quantities = self._sample_quantities()
|
| 80 |
self.state.values = self._sample_values()
|
| 81 |
self.state.split_phase = True
|
|
@@ -83,6 +95,7 @@ class NoPressSimulation(NegotiationSimulation):
|
|
| 83 |
def get_info_of_variant(
|
| 84 |
self, state: NegotiationState, actions: Dict[AgentId, Any]
|
| 85 |
) -> Dict[str, Any]:
|
|
|
|
| 86 |
return {
|
| 87 |
"quantities": copy.deepcopy(state.quantities),
|
| 88 |
"values": copy.deepcopy(state.values),
|
|
@@ -90,6 +103,7 @@ class NoPressSimulation(NegotiationSimulation):
|
|
| 90 |
}
|
| 91 |
|
| 92 |
def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
|
|
|
|
| 93 |
return compute_tas_style_rewards(
|
| 94 |
self.agent_ids, self.state.values, splits, self.state.quantities
|
| 95 |
)
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/negotiation/no_press_nego_simulation.py
|
| 3 |
+
Summary: Simulation driver for no-press negotiation scenarios.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
import copy
|
| 7 |
from collections import defaultdict
|
| 8 |
from dataclasses import dataclass
|
|
|
|
| 21 |
|
| 22 |
@dataclass
|
| 23 |
class NoPressState(NegotiationState):
|
| 24 |
+
"""Empty NegotiationState subclass used to make clear that the no-press variant always runs in the split phase."""
|
| 25 |
+
|
| 26 |
pass
|
| 27 |
|
| 28 |
|
| 29 |
@dataclass
|
| 30 |
class NoPressObs(NegotiationObs):
|
| 31 |
+
"""Observation that includes both agents' values (since there is no messaging)."""
|
| 32 |
+
|
| 33 |
other_value: Dict[str, float]
|
| 34 |
|
| 35 |
|
|
|
|
| 48 |
super().__init__(*args, **kwargs)
|
| 49 |
|
| 50 |
def _sample_values(self) -> Dict[AgentId, dict]:
|
| 51 |
+
"""Sample per-item valuations according to the configured template."""
|
| 52 |
values = defaultdict(dict)
|
| 53 |
if self.state is None:
|
| 54 |
item_types = self.item_types
|
|
|
|
| 83 |
return values
|
| 84 |
|
| 85 |
def _sample_quantities(self) -> Dict[str, int]:
|
| 86 |
+
"""No-press setups use symmetric 10-unit stocks for every item."""
|
| 87 |
return {item.lower(): 10 for item in self.item_types}
|
| 88 |
|
| 89 |
def set_new_round_of_variant(self):
|
| 90 |
+
"""Refresh quantities/values and jump directly into the simultaneous split."""
|
| 91 |
self.state.quantities = self._sample_quantities()
|
| 92 |
self.state.values = self._sample_values()
|
| 93 |
self.state.split_phase = True
|
|
|
|
| 95 |
def get_info_of_variant(
|
| 96 |
self, state: NegotiationState, actions: Dict[AgentId, Any]
|
| 97 |
) -> Dict[str, Any]:
|
| 98 |
+
"""Surface quantities/values/splits so statistics modules can read them."""
|
| 99 |
return {
|
| 100 |
"quantities": copy.deepcopy(state.quantities),
|
| 101 |
"values": copy.deepcopy(state.values),
|
|
|
|
| 103 |
}
|
| 104 |
|
| 105 |
def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
|
| 106 |
+
"""Reuse TAS reward logic because the split arbitration is identical."""
|
| 107 |
return compute_tas_style_rewards(
|
| 108 |
self.agent_ids, self.state.values, splits, self.state.quantities
|
| 109 |
)
|
src_code_for_reproducibility/markov_games/negotiation/tas_agent.py
CHANGED
|
@@ -1,9 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
|
| 2 |
from mllm.markov_games.negotiation.nego_simulation import Split
|
| 3 |
from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitObs
|
| 4 |
|
| 5 |
|
| 6 |
class TrustAndSplitAgent(NegotiationAgent):
|
|
|
|
|
|
|
| 7 |
def __init__(self, num_message_chars, *args, **kwargs):
|
| 8 |
self.num_message_chars = num_message_chars
|
| 9 |
super().__init__(*args, **kwargs)
|
|
@@ -58,12 +65,14 @@ class TrustAndSplitAgent(NegotiationAgent):
|
|
| 58 |
self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
|
| 59 |
|
| 60 |
def get_message_regex(self, observation: TrustAndSplitObs) -> str:
|
|
|
|
| 61 |
return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
|
| 62 |
|
| 63 |
# def get_message_regex(self, observation: TrustAndSplitObs) -> str:
|
| 64 |
# return rf"(?s).{{0,{self.num_message_chars}}}"
|
| 65 |
|
| 66 |
def get_split_regex(self, observation: TrustAndSplitObs) -> str:
|
|
|
|
| 67 |
items = list(observation.quantities.keys())
|
| 68 |
# Accept both singular and plural forms
|
| 69 |
item_pattern = "|".join(
|
|
@@ -75,6 +84,7 @@ class TrustAndSplitAgent(NegotiationAgent):
|
|
| 75 |
def get_split_action(
|
| 76 |
self, policy_output: str, observation: TrustAndSplitObs
|
| 77 |
) -> Split:
|
|
|
|
| 78 |
items = list(observation.quantities.keys())
|
| 79 |
import re as _re
|
| 80 |
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
File: mllm/markov_games/negotiation/tas_agent.py
|
| 3 |
+
Summary: Agent implementation for Trust-and-Split negotiations.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
|
| 7 |
from mllm.markov_games.negotiation.nego_simulation import Split
|
| 8 |
from mllm.markov_games.negotiation.tas_simulation import TrustAndSplitObs
|
| 9 |
|
| 10 |
|
| 11 |
class TrustAndSplitAgent(NegotiationAgent):
|
| 12 |
+
"""Prompt/template wrapper for the classic multi-item Trust-and-Split benchmark."""
|
| 13 |
+
|
| 14 |
def __init__(self, num_message_chars, *args, **kwargs):
|
| 15 |
self.num_message_chars = num_message_chars
|
| 16 |
super().__init__(*args, **kwargs)
|
|
|
|
| 65 |
self.send_message_prompt = f"Send your message now in <message>...</message> (<={self.num_message_chars} chars)."
|
| 66 |
|
| 67 |
def get_message_regex(self, observation: TrustAndSplitObs) -> str:
|
| 68 |
+
"""Constrain chat to bounded XML tags for stable parsing."""
|
| 69 |
return rf"<message>[\s\S]{{0,{self.num_message_chars}}}</message>"
|
| 70 |
|
| 71 |
# def get_message_regex(self, observation: TrustAndSplitObs) -> str:
|
| 72 |
# return rf"(?s).{{0,{self.num_message_chars}}}"
|
| 73 |
|
| 74 |
def get_split_regex(self, observation: TrustAndSplitObs) -> str:
|
| 75 |
+
"""Allow natural-language item names while still returning machine-parsable XML."""
|
| 76 |
items = list(observation.quantities.keys())
|
| 77 |
# Accept both singular and plural forms
|
| 78 |
item_pattern = "|".join(
|
|
|
|
| 84 |
def get_split_action(
|
| 85 |
self, policy_output: str, observation: TrustAndSplitObs
|
| 86 |
) -> Split:
|
| 87 |
+
"""Convert human-readable allocation text back into canonical item IDs."""
|
| 88 |
items = list(observation.quantities.keys())
|
| 89 |
import re as _re
|
| 90 |
|