Muqeeth committed on
Commit
c8010a5
·
verified ·
1 Parent(s): 5fb294e

Add files using upload-large-folder tool

Browse files
seed_42/Qwen/Qwen2.5-7B-Instruct/adapters/agent_adapter/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f52affcd642fa67620c5f7c3155cb8a867b8f45e80119606c46cb2301660cde
3
  size 323014168
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cf7df3f718064f8b8bccd484ee71d607c706663cefa78a180e45d4dcf8fc0b7
3
  size 323014168
seed_42/agent_trainer/policy_optimizer_state.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:911335e20ef7b5f2e34bb166cb8f236807ed6874707956bad488dac1989ca6e9
3
  size 646269121
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ad344079b33d8b7633e4db957f7f603999d25b31d44becaf441bf8f8a6cb607
3
  size 646269121
seed_42/agent_trainer/trainer_annealing_state.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b6175536b701094d5172932b38c6ca6c17baa2f07ab83ebb00f80f9d1c96bc9
3
  size 104
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a41c35f20678f6c02b24d48db5127433d12a342565274268816a2f39fd757e8
3
  size 104
seed_42/random_state.pkl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2113fdb42ab3e7764f6a201a6b7edb00a002a4d9dead874859847cfcadac96f
3
  size 12254
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bed461beebec279976e9e4353eb3aff688ccd1bd5ff66516a692c1a0356b610
3
  size 12254
src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc and b/src_code_for_reproducibility/markov_games/__pycache__/alternative_actions_runner.cpython-312.pyc differ
 
src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-312.pyc and b/src_code_for_reproducibility/markov_games/__pycache__/markov_game.cpython-312.pyc differ
 
src_code_for_reproducibility/markov_games/negotiation/README.md CHANGED
@@ -9,29 +9,16 @@ Proportional splitting is used when the two proposals exceed the available total
9
  ### Variants (in increasing difficulty)
10
 
11
  - No‑Press Split
12
- - Single item type (coins)
13
- - No communication; agents go straight to making split proposals, with the starting player alternating deterministically.
 
14
  - Motivation: mirrors no‑communication setups (e.g., Advantage Alignment) while keeping the split decision nontrivial.
15
- - Deterministic Mode: values are fixed and public: one agent values coins at 10, the other at 1 (alternates each round).
16
- - Stochastic Mode: values are random and uncorrelated.
17
 
18
  - Trust-and-Split RPS (TAS-RPS)
19
  - Single item type (coins)
20
  - Each round, a rock–paper–scissors hand draw creates a strong asymmetry: the winner’s per-coin value is 10, the loser’s is 1.
21
  - Each agent initially sees only their own hand and must communicate to coordinate an optimal split.
22
  - Motivation: enforce large value disparity so one’s own value reveals little about the other’s (avoiding ceiling effects) and incentivize meaningful communication.
23
-
24
- - Trust-and-Split (TAS)
25
- - Single item type (coins); each round, each agent’s per-coin value is independently sampled in a broad range (e.g., 1–20).
26
- - Each agent observes only their own value; they may use short messages to share and negotiate.
27
- - Motivation: a simple blend that tests whether agents learn to exchange private information and coordinate proportional, value-aware splits.
28
-
29
- - Deal-or-No-Deal (DOND)
30
- - Introduced in [Deal or No Deal? End-to-End Learning for Negotiation Dialogues](https://arxiv.org/pdf/1706.05125)
31
- - Multiple item types (typically "books", "hats" and "balls") with limited stocks; each agent has its own per-type values.
32
- - A deal pays out only if both proposals exactly agree and respect the stock; otherwise no deal (zero reward) that round.
33
- - Motivation: a known benchmark closer to real-world bargaining, where both parties must explicitly agree.
34
-
35
 
36
 
37
 
 
9
  ### Variants (in increasing difficulty)
10
 
11
  - No‑Press Split
12
+ - Multiple item types (e.g., hats, balls, books)
13
+ - The item values for each agent are public.
14
+ - No communication; agents go straight to making split proposals.
15
  - Motivation: mirrors no‑communication setups (e.g., Advantage Alignment) while keeping the split decision nontrivial.
 
 
16
 
17
  - Trust-and-Split RPS (TAS-RPS)
18
  - Single item type (coins)
19
  - Each round, a rock–paper–scissors hand draw creates a strong asymmetry: the winner’s per-coin value is 10, the loser’s is 1.
20
  - Each agent initially sees only their own hand and must communicate to coordinate an optimal split.
21
  - Motivation: enforce large value disparity so one’s own value reveals little about the other’s (avoiding ceiling effects) and incentivize meaningful communication.
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
 
24
 
src_code_for_reproducibility/markov_games/negotiation/nego_hard_coded_policies.py CHANGED
@@ -1,11 +1,17 @@
 
 
 
 
 
1
  import asyncio
2
- from typing import Optional
 
3
  from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
 
4
  from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent
5
  from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs
6
  from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
7
- from mllm.markov_games.negotiation.nego_simulation import Split
8
- from typing import Any, Tuple
9
 
10
  class HardCodedNegoWelfareMaximizingPolicy(NoPressAgent):
11
  async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
@@ -40,6 +46,7 @@ class HardCodedNegoWelfareMaximizingPolicy(NoPressAgent):
40
  )
41
  return action, act_log
42
 
 
43
  class HardCodedNegoGreedyPolicy(NoPressAgent):
44
  async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
45
  """
@@ -61,4 +68,3 @@ class HardCodedNegoGreedyPolicy(NoPressAgent):
61
  info=None,
62
  )
63
  return action, act_log
64
-
 
1
+ """
2
+ File: mllm/markov_games/negotiation/nego_hard_coded_policies.py
3
+ Summary: Provides deterministic negotiation policies for testing and baselines.
4
+ """
5
+
6
  import asyncio
7
+ from typing import Any, Optional, Tuple
8
+
9
  from mllm.markov_games.negotiation.nego_agent import NegotiationAgent
10
+ from mllm.markov_games.negotiation.nego_simulation import Split
11
  from mllm.markov_games.negotiation.no_press_nego_agent import NoPressAgent
12
  from mllm.markov_games.negotiation.no_press_nego_simulation import NoPressObs
13
  from mllm.markov_games.rollout_tree import AgentActLog, ChatTurn
14
+
 
15
 
16
  class HardCodedNegoWelfareMaximizingPolicy(NoPressAgent):
17
  async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
 
46
  )
47
  return action, act_log
48
 
49
+
50
  class HardCodedNegoGreedyPolicy(NoPressAgent):
51
  async def act(self, observation: NoPressObs) -> Tuple[Any, AgentActLog]:
52
  """
 
68
  info=None,
69
  )
70
  return action, act_log
 
src_code_for_reproducibility/markov_games/negotiation/negotiation_statistics.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  from __future__ import annotations
2
 
3
  from typing import Callable, Dict, List, Tuple
 
1
+ """
2
+ File: mllm/markov_games/negotiation/negotiation_statistics.py
3
+ Summary: Aggregates and reports statistics for negotiation experiments.
4
+ """
5
+
6
  from __future__ import annotations
7
 
8
  from typing import Callable, Dict, List, Tuple
src_code_for_reproducibility/markov_games/negotiation/no_press_nego_agent.py CHANGED
@@ -1,3 +1,8 @@
 
 
 
 
 
1
  from typing import Any, Dict, List, Tuple
2
 
3
  from mllm.markov_games.negotiation.nego_agent import (
@@ -49,9 +54,11 @@ class NoPressAgent(NegotiationAgent):
49
  self.send_split_prompt = "Submit Your Proposal\n" "Respond as {proposal_style}"
50
 
51
  def get_message_regex(self, observation: NoPressObs) -> str:
 
52
  return r"^$" # No messages allowed
53
 
54
  def get_split_regex(self, observation: NoPressObs) -> str:
 
55
  items = list(observation.quantities.keys())
56
  # Accept both singular and plural forms
57
  item_pattern = "|".join(
@@ -61,6 +68,12 @@ class NoPressAgent(NegotiationAgent):
61
  return regex
62
 
63
  def get_split_action(self, policy_output: str, observation: NoPressObs) -> Split:
 
 
 
 
 
 
64
  items = list(observation.quantities.keys())
65
  import re as _re
66
 
@@ -78,6 +91,7 @@ class NoPressAgent(NegotiationAgent):
78
  inner_regex = rf"(?i)(10|[0-9])\s*({item_pattern})"
79
 
80
  def normalize_item_name(item_str):
 
81
  for orig in items:
82
  if item_str.lower() == orig.lower():
83
  return orig
 
1
+ """
2
+ File: mllm/markov_games/negotiation/no_press_nego_agent.py
3
+ Summary: Agent variant for no-press negotiations without explicit messaging.
4
+ """
5
+
6
  from typing import Any, Dict, List, Tuple
7
 
8
  from mllm.markov_games.negotiation.nego_agent import (
 
54
  self.send_split_prompt = "Submit Your Proposal\n" "Respond as {proposal_style}"
55
 
56
  def get_message_regex(self, observation: NoPressObs) -> str:
57
+ """Return an empty pattern because the no-press variant forbids chat."""
58
  return r"^$" # No messages allowed
59
 
60
  def get_split_regex(self, observation: NoPressObs) -> str:
61
+ """Match proposals like ``Proposal: 4 coins, 6 apples`` case-insensitively."""
62
  items = list(observation.quantities.keys())
63
  # Accept both singular and plural forms
64
  item_pattern = "|".join(
 
68
  return regex
69
 
70
  def get_split_action(self, policy_output: str, observation: NoPressObs) -> Split:
71
+ """
72
+ Parse the LLM proposal into a normalized ``Split`` structure.
73
+
74
+ The regex-based parser is lenient (accepts pluralization variants) so that
75
+ prompt tweaks do not require rewriting the extraction logic.
76
+ """
77
  items = list(observation.quantities.keys())
78
  import re as _re
79
 
 
91
  inner_regex = rf"(?i)(10|[0-9])\s*({item_pattern})"
92
 
93
  def normalize_item_name(item_str):
94
+ """Canonicalize plural/singular user text back to the config item id."""
95
  for orig in items:
96
  if item_str.lower() == orig.lower():
97
  return orig
src_code_for_reproducibility/markov_games/negotiation/tas_rps_simulation.py CHANGED
@@ -1,19 +1,12 @@
1
  """
2
- Trust-and-Split simulation.
3
-
4
- This environment models a simple bargaining game over 10 coins with messaging.
5
- Agents are assigned rock/paper/scissors hands, with the winner getting value 10 per coin
6
- and the loser getting value 1 per coin. Agents alternate sending messages for a fixed
7
- number of turns per round and then each submits a split proposal indicating how many
8
- coins they keep for themselves. Rewards are proportional if the proposed totals exceed 10.
9
  """
10
 
11
  import copy
12
  from dataclasses import dataclass
13
  from typing import Any, Dict, List, Literal, Tuple
14
 
15
- from numpy.random import default_rng
16
-
17
  from mllm.markov_games.negotiation.nego_simulation import (
18
  Message,
19
  NegotiationObs,
@@ -46,6 +39,8 @@ def _get_rps_winner(
46
 
47
  @dataclass
48
  class TrustAndSplitRPSState(NegotiationState):
 
 
49
  hands: Dict[
50
  AgentId, Literal["rock", "paper", "scissors"]
51
  ] # rock, paper, or scissors
@@ -54,6 +49,8 @@ class TrustAndSplitRPSState(NegotiationState):
54
 
55
  @dataclass
56
  class TrustAndSplitRPSObs(NegotiationObs):
 
 
57
  hand: Literal["rock", "paper", "scissors"]
58
  last_hand_agent: Literal["rock", "paper", "scissors"] | None
59
  last_hand_coagent: Literal["rock", "paper", "scissors"] | None
@@ -61,6 +58,8 @@ class TrustAndSplitRPSObs(NegotiationObs):
61
 
62
 
63
  class TrustAndSplitRPSSimulation(NegotiationSimulation):
 
 
64
  def __init__(
65
  self,
66
  alternating_hands: bool = False,
@@ -81,6 +80,13 @@ class TrustAndSplitRPSSimulation(NegotiationSimulation):
81
  self,
82
  alternate_hands: bool = False,
83
  ) -> Tuple[Dict[AgentId, str], Dict[AgentId, float]]:
 
 
 
 
 
 
 
84
  hands = ["rock", "paper", "scissors"]
85
  if alternate_hands:
86
  previous_hands = list(self.state.previous_hands.values())
@@ -115,6 +121,7 @@ class TrustAndSplitRPSSimulation(NegotiationSimulation):
115
  return agent_hands, values
116
 
117
  def set_new_round_of_variant(self):
 
118
  self.state.previous_hands = copy.deepcopy(self.state.hands)
119
  new_hands, new_values = self._sample_hands_and_values(
120
  alternate_hands=self.alternating_hands
@@ -128,6 +135,7 @@ class TrustAndSplitRPSSimulation(NegotiationSimulation):
128
  def get_info_of_variant(
129
  self, state: NegotiationState, actions: Dict[AgentId, Any]
130
  ) -> Dict[str, Any]:
 
131
  return {
132
  "quantities": copy.deepcopy(state.quantities),
133
  "hands": copy.deepcopy(state.hands),
@@ -138,12 +146,13 @@ class TrustAndSplitRPSSimulation(NegotiationSimulation):
138
  }
139
 
140
  def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
 
141
  return compute_tas_style_rewards(
142
  self.agent_ids, self.state.values, splits, self.state.quantities
143
  )
144
 
145
  def get_obs_agent(self, agent_id):
146
- """Returns observation for agent_id"""
147
  other_id = self._other(agent_id)
148
  last_value_coagent = (
149
  None
 
1
  """
2
+ File: mllm/markov_games/negotiation/tas_rps_simulation.py
3
+ Summary: Simulation for TAS Rock-Paper-Scissors blended scenarios.
 
 
 
 
 
4
  """
5
 
6
  import copy
7
  from dataclasses import dataclass
8
  from typing import Any, Dict, List, Literal, Tuple
9
 
 
 
10
  from mllm.markov_games.negotiation.nego_simulation import (
11
  Message,
12
  NegotiationObs,
 
39
 
40
  @dataclass
41
  class TrustAndSplitRPSState(NegotiationState):
42
+ """Negotiation state augmented with the current and previous RPS hands."""
43
+
44
  hands: Dict[
45
  AgentId, Literal["rock", "paper", "scissors"]
46
  ] # rock, paper, or scissors
 
49
 
50
  @dataclass
51
  class TrustAndSplitRPSObs(NegotiationObs):
52
+ """Agent-facing observation enriched with last-hand metadata."""
53
+
54
  hand: Literal["rock", "paper", "scissors"]
55
  last_hand_agent: Literal["rock", "paper", "scissors"] | None
56
  last_hand_coagent: Literal["rock", "paper", "scissors"] | None
 
58
 
59
 
60
  class TrustAndSplitRPSSimulation(NegotiationSimulation):
61
+ """Negotiation variant that splices TAS splitting with RPS-determined stakes."""
62
+
63
  def __init__(
64
  self,
65
  alternating_hands: bool = False,
 
80
  self,
81
  alternate_hands: bool = False,
82
  ) -> Tuple[Dict[AgentId, str], Dict[AgentId, float]]:
83
+ """
84
+ Sample a rock-paper-scissors hand for each agent plus the per-hand value.
85
+
86
+ When ``alternate_hands`` is True we deliberately flip the previous round's
87
+ winner/loser roles to create nonstationary payoffs; otherwise we draw
88
+ uniformly without replacement.
89
+ """
90
  hands = ["rock", "paper", "scissors"]
91
  if alternate_hands:
92
  previous_hands = list(self.state.previous_hands.values())
 
121
  return agent_hands, values
122
 
123
  def set_new_round_of_variant(self):
124
+ """Refresh hands/values and reset round-specific state."""
125
  self.state.previous_hands = copy.deepcopy(self.state.hands)
126
  new_hands, new_values = self._sample_hands_and_values(
127
  alternate_hands=self.alternating_hands
 
135
  def get_info_of_variant(
136
  self, state: NegotiationState, actions: Dict[AgentId, Any]
137
  ) -> Dict[str, Any]:
138
+ """Expose variant-specific state fields for downstream logging/analysis."""
139
  return {
140
  "quantities": copy.deepcopy(state.quantities),
141
  "hands": copy.deepcopy(state.hands),
 
146
  }
147
 
148
  def get_rewards(self, splits: Dict[AgentId, Split]) -> Dict[AgentId, float]:
149
+ """Delegates to TAS reward helper because the payout rule is identical."""
150
  return compute_tas_style_rewards(
151
  self.agent_ids, self.state.values, splits, self.state.quantities
152
  )
153
 
154
  def get_obs_agent(self, agent_id):
155
+ """Return a full Trust-and-Split observation for ``agent_id``."""
156
  other_id = self._other(agent_id)
157
  last_value_coagent = (
158
  None
src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc CHANGED
Binary files a/src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc and b/src_code_for_reproducibility/models/__pycache__/__init__.cpython-312.pyc differ