Spaces:
Runtime error
Runtime error
AbeBhatti commited on
Commit ·
6858719
1
Parent(s): 10d346d
negotiation bluff classifier + message cleaner
Browse files- agent/agent_llm.py +142 -0
- agent/arbitragent.py +6 -20
- agent/bluff_detector.py +13 -3
- demo/sample_run_log.json +1 -1
- deploy/hf_spaces_app.py +27 -1
- envs/arbitragent_env.py +30 -7
- proj_context.md +2 -0
- session_progress.md +98 -1
- tests/reward_signal_results.json +74 -0
- tests/test_reward_signals.py +128 -0
- training/data/negotiation_bluff_labels.json +0 -0
- training/generate_negotiation_bluff_data.py +98 -0
- training/phase1_reward_curve.png +0 -0
- training/phase2_reward_curve.png +0 -0
- training/train_bluff_classifier.py +41 -12
- training/unified_reward_curve.png +0 -0
agent/agent_llm.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
agent_llm.py — Lightweight inference wrapper for the trained TinyLlama model.
|
| 3 |
+
|
| 4 |
+
Lazy-loads unified_final (or phase2_final) and generates negotiation messages
|
| 5 |
+
for ArbitrAgent: scout, pressure, and coalition.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from __future__ import annotations
|
| 9 |
+
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
from typing import Optional
|
| 12 |
+
|
| 13 |
+
# Lazy-loaded
|
| 14 |
+
_MODEL = None
|
| 15 |
+
_TOKENIZER = None
|
| 16 |
+
_CHECKPOINT_PATH: Optional[Path] = None
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def _resolve_checkpoint() -> Path:
|
| 20 |
+
"""Unified_final if exists, else phase2_final."""
|
| 21 |
+
root = Path(__file__).resolve().parent.parent
|
| 22 |
+
unified = root / "training" / "checkpoints" / "unified_final"
|
| 23 |
+
phase2 = root / "training" / "checkpoints" / "phase2_final"
|
| 24 |
+
if unified.exists() and (unified / "config.json").exists():
|
| 25 |
+
return unified
|
| 26 |
+
if phase2.exists() and (phase2 / "config.json").exists():
|
| 27 |
+
return phase2
|
| 28 |
+
return unified # caller will handle missing
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
def _load():
|
| 32 |
+
global _MODEL, _TOKENIZER, _CHECKPOINT_PATH
|
| 33 |
+
if _MODEL is not None:
|
| 34 |
+
return
|
| 35 |
+
import torch
|
| 36 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 37 |
+
|
| 38 |
+
_CHECKPOINT_PATH = _resolve_checkpoint()
|
| 39 |
+
if not _CHECKPOINT_PATH.exists() or not (_CHECKPOINT_PATH / "config.json").exists():
|
| 40 |
+
return
|
| 41 |
+
_TOKENIZER = AutoTokenizer.from_pretrained(str(_CHECKPOINT_PATH))
|
| 42 |
+
_MODEL = AutoModelForCausalLM.from_pretrained(
|
| 43 |
+
str(_CHECKPOINT_PATH),
|
| 44 |
+
torch_dtype=torch.float16,
|
| 45 |
+
device_map="auto",
|
| 46 |
+
)
|
| 47 |
+
_MODEL.eval()
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
class AgentLLM:
|
| 51 |
+
"""
|
| 52 |
+
Lazy-loads the trained TinyLlama checkpoint (unified_final or phase2_final)
|
| 53 |
+
and provides scout_message, pressure_message, coalition_message.
|
| 54 |
+
"""
|
| 55 |
+
|
| 56 |
+
def _clean(self, text: str, fallback: str) -> str:
|
| 57 |
+
BAD_PHRASES = [
|
| 58 |
+
"my goal", "more specifically", "focused on helping",
|
| 59 |
+
"value proposition", "helping sellers", "helping buyers",
|
| 60 |
+
"specifically focused", "as an auctioneer", "as a buyer",
|
| 61 |
+
"increasing conversions", "active listener"
|
| 62 |
+
]
|
| 63 |
+
# Take only first sentence/line
|
| 64 |
+
text = text.strip().split('.')[0].split('\n')[0].strip()
|
| 65 |
+
# If too long or contains bad phrases, use fallback
|
| 66 |
+
if len(text) > 120:
|
| 67 |
+
return fallback
|
| 68 |
+
if any(p in text.lower() for p in BAD_PHRASES):
|
| 69 |
+
return fallback
|
| 70 |
+
# If too short to be meaningful, use fallback
|
| 71 |
+
if len(text) < 10:
|
| 72 |
+
return fallback
|
| 73 |
+
return text
|
| 74 |
+
|
| 75 |
+
def generate(self, prompt: str, max_tokens: int = 80) -> str:
|
| 76 |
+
"""Generate text from prompt; returns only the generated part (prompt stripped)."""
|
| 77 |
+
_load()
|
| 78 |
+
if _MODEL is None or _TOKENIZER is None:
|
| 79 |
+
return ""
|
| 80 |
+
import torch
|
| 81 |
+
|
| 82 |
+
inputs = _TOKENIZER(prompt, return_tensors="pt").to(_MODEL.device)
|
| 83 |
+
prompt_decoded = _TOKENIZER.decode(inputs["input_ids"][0], skip_special_tokens=True)
|
| 84 |
+
with torch.no_grad():
|
| 85 |
+
out = _MODEL.generate(
|
| 86 |
+
**inputs,
|
| 87 |
+
max_new_tokens=max_tokens,
|
| 88 |
+
do_sample=True,
|
| 89 |
+
temperature=0.7,
|
| 90 |
+
top_p=0.9,
|
| 91 |
+
pad_token_id=_TOKENIZER.eos_token_id,
|
| 92 |
+
repetition_penalty=1.3,
|
| 93 |
+
no_repeat_ngram_size=3,
|
| 94 |
+
)
|
| 95 |
+
full = _TOKENIZER.decode(out[0], skip_special_tokens=True)
|
| 96 |
+
if full.startswith(prompt_decoded):
|
| 97 |
+
generated = full[len(prompt_decoded) :].strip()
|
| 98 |
+
else:
|
| 99 |
+
generated = full.strip()
|
| 100 |
+
# First sentence or line
|
| 101 |
+
for sep in ["\n", ".", "!"]:
|
| 102 |
+
if sep in generated:
|
| 103 |
+
generated = generated.split(sep)[0].strip()
|
| 104 |
+
break
|
| 105 |
+
# Fall back to hardcoded if 3+ consecutive repeated words
|
| 106 |
+
words = generated.split()
|
| 107 |
+
for i in range(len(words) - 2):
|
| 108 |
+
if words[i] == words[i + 1] == words[i + 2]:
|
| 109 |
+
return ""
|
| 110 |
+
return generated
|
| 111 |
+
|
| 112 |
+
def scout_message(self, item: str, listing_price: float) -> str:
|
| 113 |
+
"""Opening inquiry to seller."""
|
| 114 |
+
prompt = (
|
| 115 |
+
f"You are a buyer on Craigslist. Send a short, casual opening message "
|
| 116 |
+
f"asking if the {item} (listed around ${listing_price:.0f}) is still available "
|
| 117 |
+
f"and if there's any room on price. Keep it under 20 words. Message:"
|
| 118 |
+
)
|
| 119 |
+
result = self.generate(prompt, max_tokens=40)
|
| 120 |
+
return self._clean(result, f"hey, is the {item} still available? any room on price?")
|
| 121 |
+
|
| 122 |
+
def pressure_message(self, item: str, current_offer: float) -> str:
|
| 123 |
+
"""Follow-up pressure message when seller hasn't moved much."""
|
| 124 |
+
prompt = (
|
| 125 |
+
f"You are a buyer negotiating for a {item}. Current seller offer is ${current_offer:.0f}. "
|
| 126 |
+
f"Send a short follow-up asking for flexibility. Keep it under 20 words. Message:"
|
| 127 |
+
)
|
| 128 |
+
result = self.generate(prompt, max_tokens=40)
|
| 129 |
+
return self._clean(result, f"just checking back on the {item} — any flexibility on your price at all?")
|
| 130 |
+
|
| 131 |
+
def coalition_message(self, item: str, floor_minus_4: int) -> str:
|
| 132 |
+
"""Coalition pressure after detecting a bluff; counter at floor_minus_4."""
|
| 133 |
+
prompt = (
|
| 134 |
+
f"You are a buyer for a {item}. You detected the seller is bluffing about a final offer. "
|
| 135 |
+
f"You have another deal lined up. Mention it casually and counter at ${floor_minus_4}. "
|
| 136 |
+
f"Keep it under 25 words. Message:"
|
| 137 |
+
)
|
| 138 |
+
result = self.generate(prompt, max_tokens=50)
|
| 139 |
+
return self._clean(
|
| 140 |
+
result,
|
| 141 |
+
f"I have a trade offer from another seller that makes this less urgent for me — can you do ${floor_minus_4}?",
|
| 142 |
+
)
|
agent/arbitragent.py
CHANGED
|
@@ -24,6 +24,7 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
| 24 |
|
| 25 |
from agent.route_graph import RouteGraph, RouteEdge
|
| 26 |
from agent.bluff_detector import analyze_from_sim
|
|
|
|
| 27 |
from simulation.scenario import get_scenario
|
| 28 |
from simulation.seller_profiles import LISTINGS
|
| 29 |
|
|
@@ -53,6 +54,7 @@ class ArbitrAgent:
|
|
| 53 |
def __init__(self, budget: float = 20.0, min_route_score: float = 1.0):
|
| 54 |
self.budget = float(budget)
|
| 55 |
self.route_graph = RouteGraph(minimum_threshold=min_route_score)
|
|
|
|
| 56 |
# Structured event log for downstream inspection / demo UIs.
|
| 57 |
self._structured_log: List[Dict[str, Any]] = []
|
| 58 |
|
|
@@ -194,7 +196,7 @@ class ArbitrAgent:
|
|
| 194 |
|
| 195 |
def _open_soft_inquiries(self, candidates: List[SellerCandidate], verbose: bool = True) -> None:
|
| 196 |
for c in candidates:
|
| 197 |
-
msg =
|
| 198 |
resp = c.sim.step(msg)
|
| 199 |
if verbose:
|
| 200 |
print(f"[to {c.seller_id}] {msg}")
|
|
@@ -299,21 +301,8 @@ class ArbitrAgent:
|
|
| 299 |
self.route_graph.mark_dead(edge.edge_id)
|
| 300 |
continue
|
| 301 |
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
(edge.buy_item, int(edge.trade_target_id.split("_")[1]))
|
| 305 |
-
in confirmed_targets
|
| 306 |
-
for edge in edges
|
| 307 |
-
)
|
| 308 |
-
|
| 309 |
-
if has_confirmed_downstream:
|
| 310 |
-
msg = (
|
| 311 |
-
f"i have another buyer interested in the {c.item}, "
|
| 312 |
-
"but i'd prefer to buy from you if we can make the numbers work. "
|
| 313 |
-
"could you do a bit better on price?"
|
| 314 |
-
)
|
| 315 |
-
else:
|
| 316 |
-
msg = f"just checking back on the {c.item} — any flexibility on your price at all?"
|
| 317 |
|
| 318 |
resp = c.sim.step(msg)
|
| 319 |
if verbose:
|
|
@@ -387,10 +376,7 @@ class ArbitrAgent:
|
|
| 387 |
if signals.is_bluff:
|
| 388 |
current_offer = float(c.sim.current_offer)
|
| 389 |
offer = max(1, int(current_offer - 4))
|
| 390 |
-
pressure_msg = (
|
| 391 |
-
"I have a trade offer from another seller that makes this less urgent for me — "
|
| 392 |
-
f"can you do ${offer}?"
|
| 393 |
-
)
|
| 394 |
pressure_resp = c.sim.step(pressure_msg)
|
| 395 |
if verbose:
|
| 396 |
print(f"[to {c.seller_id}] {pressure_msg}")
|
|
|
|
| 24 |
|
| 25 |
from agent.route_graph import RouteGraph, RouteEdge
|
| 26 |
from agent.bluff_detector import analyze_from_sim
|
| 27 |
+
from agent.agent_llm import AgentLLM
|
| 28 |
from simulation.scenario import get_scenario
|
| 29 |
from simulation.seller_profiles import LISTINGS
|
| 30 |
|
|
|
|
| 54 |
def __init__(self, budget: float = 20.0, min_route_score: float = 1.0):
|
| 55 |
self.budget = float(budget)
|
| 56 |
self.route_graph = RouteGraph(minimum_threshold=min_route_score)
|
| 57 |
+
self.llm = AgentLLM()
|
| 58 |
# Structured event log for downstream inspection / demo UIs.
|
| 59 |
self._structured_log: List[Dict[str, Any]] = []
|
| 60 |
|
|
|
|
| 196 |
|
| 197 |
def _open_soft_inquiries(self, candidates: List[SellerCandidate], verbose: bool = True) -> None:
|
| 198 |
for c in candidates:
|
| 199 |
+
msg = self.llm.scout_message(c.item, c.listing_price)
|
| 200 |
resp = c.sim.step(msg)
|
| 201 |
if verbose:
|
| 202 |
print(f"[to {c.seller_id}] {msg}")
|
|
|
|
| 301 |
self.route_graph.mark_dead(edge.edge_id)
|
| 302 |
continue
|
| 303 |
|
| 304 |
+
current_offer = float(c.sim.current_offer)
|
| 305 |
+
msg = self.llm.pressure_message(c.item, current_offer)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
|
| 307 |
resp = c.sim.step(msg)
|
| 308 |
if verbose:
|
|
|
|
| 376 |
if signals.is_bluff:
|
| 377 |
current_offer = float(c.sim.current_offer)
|
| 378 |
offer = max(1, int(current_offer - 4))
|
| 379 |
+
pressure_msg = self.llm.coalition_message(c.item, offer)
|
|
|
|
|
|
|
|
|
|
| 380 |
pressure_resp = c.sim.step(pressure_msg)
|
| 381 |
if verbose:
|
| 382 |
print(f"[to {c.seller_id}] {pressure_msg}")
|
agent/bluff_detector.py
CHANGED
|
@@ -24,9 +24,19 @@ def _get_bluff_classifier():
|
|
| 24 |
global _bluff_classifier_model, _bluff_classifier_tokenizer
|
| 25 |
if _bluff_classifier_model is not None:
|
| 26 |
return _bluff_classifier_model, _bluff_classifier_tokenizer
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
return None, None
|
| 31 |
try:
|
| 32 |
import torch
|
|
|
|
| 24 |
global _bluff_classifier_model, _bluff_classifier_tokenizer
|
| 25 |
if _bluff_classifier_model is not None:
|
| 26 |
return _bluff_classifier_model, _bluff_classifier_tokenizer
|
| 27 |
+
checkpoints_dir = Path(__file__).resolve().parent.parent / "training" / "checkpoints"
|
| 28 |
+
negotiation_pt = checkpoints_dir / "bluff_classifier_negotiation.pt"
|
| 29 |
+
default_pt = checkpoints_dir / "bluff_classifier.pt"
|
| 30 |
+
# Prefer negotiation-trained classifier if present, else fall back to poker-trained one.
|
| 31 |
+
if negotiation_pt.exists():
|
| 32 |
+
pt_path = negotiation_pt
|
| 33 |
+
elif default_pt.exists():
|
| 34 |
+
pt_path = default_pt
|
| 35 |
+
else:
|
| 36 |
+
return None, None
|
| 37 |
+
|
| 38 |
+
tok_dir = checkpoints_dir / "bluff_classifier_tokenizer"
|
| 39 |
+
if not tok_dir.exists():
|
| 40 |
return None, None
|
| 41 |
try:
|
| 42 |
import torch
|
demo/sample_run_log.json
CHANGED
|
@@ -190,7 +190,7 @@
|
|
| 190 |
"final_value": 97.0,
|
| 191 |
"profit": 77.0,
|
| 192 |
"return_multiple": 1.7475728155339805,
|
| 193 |
-
"duration_seconds":
|
| 194 |
},
|
| 195 |
"checkpoints": {
|
| 196 |
"multi_thread_view": true,
|
|
|
|
| 190 |
"final_value": 97.0,
|
| 191 |
"profit": 77.0,
|
| 192 |
"return_multiple": 1.7475728155339805,
|
| 193 |
+
"duration_seconds": 5.931564569473267
|
| 194 |
},
|
| 195 |
"checkpoints": {
|
| 196 |
"multi_thread_view": true,
|
deploy/hf_spaces_app.py
CHANGED
|
@@ -57,7 +57,33 @@ def unified_step(state, action):
|
|
| 57 |
out = info.get("outcome", 0)
|
| 58 |
blf = info.get("bluff", 0)
|
| 59 |
total = info.get("total", reward)
|
| 60 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
return state, state_text, breakdown, ""
|
| 62 |
except Exception as e:
|
| 63 |
return state, state.get("state_text", ""), f"Error: {e}", ""
|
|
|
|
| 57 |
out = info.get("outcome", 0)
|
| 58 |
blf = info.get("bluff", 0)
|
| 59 |
total = info.get("total", reward)
|
| 60 |
+
|
| 61 |
+
# Bluff signal breakdown
|
| 62 |
+
bluff_detected = info.get("bluff_detected", blf > 0.35)
|
| 63 |
+
bluff_signals = info.get("bluff_signals", {})
|
| 64 |
+
timing = bluff_signals.get("timing_tell", "—")
|
| 65 |
+
size = bluff_signals.get("size_tell", "—")
|
| 66 |
+
formulaic = bluff_signals.get("formulaic_tell", "—")
|
| 67 |
+
pattern = bluff_signals.get("pattern_tell", "—")
|
| 68 |
+
learned = bluff_signals.get("learned_score", "—")
|
| 69 |
+
|
| 70 |
+
bluff_line = "🚨 BLUFF DETECTED" if bluff_detected else "✓ No bluff detected"
|
| 71 |
+
|
| 72 |
+
breakdown = f"""reward breakdown:
|
| 73 |
+
accuracy : {acc:.3f}
|
| 74 |
+
outcome : {out:.3f}
|
| 75 |
+
bluff : {blf:.3f}
|
| 76 |
+
total : {total:.3f}
|
| 77 |
+
done : {done}
|
| 78 |
+
|
| 79 |
+
bluff analysis:
|
| 80 |
+
{bluff_line}
|
| 81 |
+
timing_tell : {timing}
|
| 82 |
+
size_tell : {size}
|
| 83 |
+
formulaic_tell : {formulaic}
|
| 84 |
+
pattern_tell : {pattern}
|
| 85 |
+
learned_score : {learned}"""
|
| 86 |
+
|
| 87 |
return state, state_text, breakdown, ""
|
| 88 |
except Exception as e:
|
| 89 |
return state, state.get("state_text", ""), f"Error: {e}", ""
|
envs/arbitragent_env.py
CHANGED
|
@@ -77,7 +77,7 @@ class ArbitrAgentEnv(Env):
|
|
| 77 |
|
| 78 |
accuracy = self._accuracy_reward(action)
|
| 79 |
outcome = self._outcome_reward(action_lower)
|
| 80 |
-
bluff = self._bluff_reward(action_lower)
|
| 81 |
|
| 82 |
total = 0.35 * accuracy + 0.35 * outcome + 0.30 * bluff
|
| 83 |
self._last_reward_breakdown = {"accuracy": accuracy, "outcome": outcome, "bluff": bluff, "total": total}
|
|
@@ -97,6 +97,8 @@ class ArbitrAgentEnv(Env):
|
|
| 97 |
"total": total,
|
| 98 |
"phase": self.current_state.get("phase", ""),
|
| 99 |
"power": self.current_state.get("power", ""),
|
|
|
|
|
|
|
| 100 |
}
|
| 101 |
return obs, total, self.done, info
|
| 102 |
|
|
@@ -132,19 +134,40 @@ class ArbitrAgentEnv(Env):
|
|
| 132 |
reward -= 0.3
|
| 133 |
return float(np.clip(reward, -1.0, 1.0))
|
| 134 |
|
| 135 |
-
def _bluff_reward(self, action_lower: str)
|
| 136 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 137 |
try:
|
| 138 |
-
from agent.bluff_detector import analyze_bluff
|
|
|
|
| 139 |
signals = analyze_bluff(
|
| 140 |
SYNTHETIC_BLUFF_PROFILE,
|
| 141 |
SYNTHETIC_THREAD,
|
| 142 |
-
|
| 143 |
turn=2,
|
| 144 |
)
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
except Exception:
|
| 147 |
-
return 0.0
|
| 148 |
|
| 149 |
def _get_next_state(self):
|
| 150 |
current_game_id = self.current_state.get("game_id")
|
|
|
|
| 77 |
|
| 78 |
accuracy = self._accuracy_reward(action)
|
| 79 |
outcome = self._outcome_reward(action_lower)
|
| 80 |
+
bluff, bluff_signals, seller_bluff_detected = self._bluff_reward(action_lower)
|
| 81 |
|
| 82 |
total = 0.35 * accuracy + 0.35 * outcome + 0.30 * bluff
|
| 83 |
self._last_reward_breakdown = {"accuracy": accuracy, "outcome": outcome, "bluff": bluff, "total": total}
|
|
|
|
| 97 |
"total": total,
|
| 98 |
"phase": self.current_state.get("phase", ""),
|
| 99 |
"power": self.current_state.get("power", ""),
|
| 100 |
+
"bluff_detected": seller_bluff_detected,
|
| 101 |
+
"bluff_signals": bluff_signals,
|
| 102 |
}
|
| 103 |
return obs, total, self.done, info
|
| 104 |
|
|
|
|
| 134 |
reward -= 0.3
|
| 135 |
return float(np.clip(reward, -1.0, 1.0))
|
| 136 |
|
| 137 |
+
def _bluff_reward(self, action_lower: str):
|
| 138 |
+
"""
|
| 139 |
+
Analyze the synthetic SELLER message for bluff_detected and bluff_signals (for info).
|
| 140 |
+
Bluff reward = score agent for coalition pressure / bluff-calling when seller message is a bluff.
|
| 141 |
+
"""
|
| 142 |
try:
|
| 143 |
+
from agent.bluff_detector import analyze_bluff, learned_bluff_score
|
| 144 |
+
# Analyze the seller's (synthetic) message for UI signals
|
| 145 |
signals = analyze_bluff(
|
| 146 |
SYNTHETIC_BLUFF_PROFILE,
|
| 147 |
SYNTHETIC_THREAD,
|
| 148 |
+
SYNTHETIC_BLUFF_MESSAGE,
|
| 149 |
turn=2,
|
| 150 |
)
|
| 151 |
+
learned = learned_bluff_score(SYNTHETIC_BLUFF_MESSAGE, SYNTHETIC_THREAD)
|
| 152 |
+
signals_dict = {
|
| 153 |
+
"timing_tell": round(signals.timing_tell, 3),
|
| 154 |
+
"size_tell": round(signals.size_tell, 3),
|
| 155 |
+
"formulaic_tell": round(signals.formulaic_tell, 3),
|
| 156 |
+
"pattern_tell": round(signals.pattern_tell, 3),
|
| 157 |
+
"learned_score": round(learned, 3),
|
| 158 |
+
}
|
| 159 |
+
# Synthetic state always includes the canonical bluff message; reward agent for coalition pressure
|
| 160 |
+
seller_is_bluff = signals.is_bluff or (signals.bluff_score > 0.25) # treat synthetic as bluff context
|
| 161 |
+
reward = 0.0
|
| 162 |
+
if seller_is_bluff:
|
| 163 |
+
if any(w in action_lower for w in ["bluff", "other seller", "other buyers", "other deal", "lined up", "two other", "better deal", "isn't urgent", "or i walk", "can you do $", "trade offer from another", "sellers lined up"]):
|
| 164 |
+
reward += 0.6
|
| 165 |
+
if any(w in action_lower for w in ["lying", "final", "non-negotiable", "counter", "$20", "$22", "$24", "$26", "non negotiable"]):
|
| 166 |
+
reward += 0.3
|
| 167 |
+
reward = float(np.clip(reward, 0.0, 1.0))
|
| 168 |
+
return reward, signals_dict, True # synthetic seller message is always bluff for UI
|
| 169 |
except Exception:
|
| 170 |
+
return 0.0, {}, False
|
| 171 |
|
| 172 |
def _get_next_state(self):
|
| 173 |
current_game_id = self.current_state.get("game_id")
|
proj_context.md
CHANGED
|
@@ -285,3 +285,5 @@ GRPO is more sample-efficient for language model fine-tuning and produces more s
|
|
| 285 |
---
|
| 286 |
|
| 287 |
*This file is the ground truth for the project. If anything in session_progress.md conflicts with this file, this file wins on architecture and thesis. session_progress.md wins on what has already been built.*
|
|
|
|
|
|
|
|
|
| 285 |
---
|
| 286 |
|
| 287 |
*This file is the ground truth for the project. If anything in session_progress.md conflicts with this file, this file wins on architecture and thesis. session_progress.md wins on what has already been built.*
|
| 288 |
+
|
| 289 |
+
**Handoff:** For a full breakdown of what has been built and what remains, give Claude both this file and `session_progress.md` (see the "Handoff for Claude" section at the end of session_progress.md).
|
session_progress.md
CHANGED
|
@@ -339,4 +339,101 @@ At the end of your session, append a block in this format:
|
|
| 339 |
- `session_progress.md`
|
| 340 |
|
| 341 |
### Next Session Entry Point
|
| 342 |
-
- Push to GitHub and HF Spaces completed (or run: `git push origin main`, `git push https://...@huggingface.co/spaces/Abeee32t/ArbitrAgent main`).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
- `session_progress.md`
|
| 340 |
|
| 341 |
### Next Session Entry Point
|
| 342 |
+
- Push to GitHub and HF Spaces completed (or run: `git push origin main`, `git push https://...@huggingface.co/spaces/Abeee32t/ArbitrAgent main`).
|
| 343 |
+
|
| 344 |
+
---
|
| 345 |
+
|
| 346 |
+
## Session — Reward signals test + HF Spaces breakdown + env info — March 8, 2026
|
| 347 |
+
|
| 348 |
+
**Status:** Complete
|
| 349 |
+
|
| 350 |
+
### What Was Built
|
| 351 |
+
- **tests/test_reward_signals.py:** Terminal test suite for ArbitrAgentEnv reward signals and bluff detector. Runs 8 test cases (coalition pressure, accept bluff, Diplomacy move, irrelevant, aggressive bluff call, trade offer, diplomatic negotiation, neutral offer). Checks accuracy/outcome/bluff/total and expects bluff_high vs outcome_positive per case. Saves results to tests/reward_signal_results.json. Run: `PYTHONPATH=. python tests/test_reward_signals.py`.
|
| 352 |
+
- **envs/arbitragent_env.py:** step() info now includes `bluff_detected` (seller message is bluff) and `bluff_signals` (timing_tell, size_tell, formulaic_tell, pattern_tell, learned_score). Bluff reward now: analyze synthetic SELLER message for UI signals; reward agent for coalition pressure / bluff-calling language when in bluff context (keyword-based).
|
| 353 |
+
- **deploy/hf_spaces_app.py:** unified_step() reward breakdown replaced with full block: accuracy, outcome, bluff, total, done, plus bluff analysis (BLUFF DETECTED / No bluff, timing_tell, size_tell, formulaic_tell, pattern_tell, learned_score).
|
| 354 |
+
|
| 355 |
+
### What Was Tested
|
| 356 |
+
- `PYTHONPATH=. python tests/test_reward_signals.py`: 6/8 cases pass. Two borderline failures: (1) "Call the bluff" — outcome 0.3 (coalition language) vs expected non-positive; (2) "Good Diplomacy move" — outcome 0.0 (no outcome keywords in orders) vs expected positive.
|
| 357 |
+
|
| 358 |
+
### Files Modified
|
| 359 |
+
- `tests/test_reward_signals.py` (new)
|
| 360 |
+
- `envs/arbitragent_env.py`
|
| 361 |
+
- `deploy/hf_spaces_app.py`
|
| 362 |
+
|
| 363 |
+
### Next Session Entry Point
|
| 364 |
+
- Tune test expectations or outcome/bluff keyword rules if 8/8 desired. Push to GitHub/HF Spaces as needed.
|
| 365 |
+
|
| 366 |
+
---
|
| 367 |
+
|
| 368 |
+
## Session — Demo uses trained TinyLlama via AgentLLM — March 8, 2026
|
| 369 |
+
|
| 370 |
+
**Status:** Complete
|
| 371 |
+
|
| 372 |
+
### What Was Built
|
| 373 |
+
- **agent/agent_llm.py:** Class `AgentLLM` with lazy load of unified_final (fallback phase2_final). Method `generate(prompt, max_tokens=80)` uses AutoModelForCausalLM/AutoTokenizer, returns generated text only (prompt stripped). Three methods: `scout_message(item, listing_price)`, `pressure_message(item, current_offer)`, `coalition_message(item, floor_minus_4)` — each builds a negotiation prompt and calls `generate()`; fallback to hardcoded strings if model missing or output too short.
|
| 374 |
+
- **agent/arbitragent.py:** Import `AgentLLM`; in `__init__` set `self.llm = AgentLLM()`. Replaced hardcoded strings: scout → `self.llm.scout_message(c.item, c.listing_price)`; Phase 3 pressure → `self.llm.pressure_message(c.item, current_offer)`; coalition (on bluff) → `self.llm.coalition_message(c.item, offer)` with `offer = max(1, int(current_offer - 4))`. Removed unused `has_confirmed_downstream` branch (single pressure message path).
|
| 375 |
+
|
| 376 |
+
### What Was Tested
|
| 377 |
+
- `PYTHONPATH=. python -c "from agent.agent_llm import AgentLLM; ..."` — AgentLLM loads unified_final and returns generated scout/pressure/coalition snippets; fallbacks work when checkpoint missing.
|
| 378 |
+
|
| 379 |
+
### Files Modified
|
| 380 |
+
- `agent/agent_llm.py` (new)
|
| 381 |
+
- `agent/arbitragent.py`
|
| 382 |
+
- `session_progress.md`
|
| 383 |
+
|
| 384 |
+
### Next Session Entry Point
|
| 385 |
+
- Run full demo `python demo/run_demo.py --budget 20 --sleep 0.5` to confirm end-to-end with LLM-generated messages (first run ~30s while model loads).
|
| 386 |
+
|
| 387 |
+
---
|
| 388 |
+
|
| 389 |
+
## Handoff for Claude — What we've done and what's left
|
| 390 |
+
|
| 391 |
+
**Give both proj_context.md and session_progress.md to Claude for a full breakdown.**
|
| 392 |
+
|
| 393 |
+
### Done (summary)
|
| 394 |
+
- **Envs:** DiplomacyNegotiationEnv, ContractorNegotiationEnv, HumanImitationEnv, ArbitrAgentEnv — all OpenEnv 0.2.1 compliant; verified with test_all_envs.py.
|
| 395 |
+
- **Training:** Phase 1 (GRPO Diplomacy), Phase 2 (HumanImitation), unified (ArbitrAgentEnv); bluff classifier (IRC poker); checkpoints: grpo_output/checkpoint-2, phase2_final, unified_final, bluff_classifier.pt.
|
| 396 |
+
- **Agent:** arbitragent.py (5-phase loop, uses AgentLLM for messages), route_graph.py, bluff_detector.py (rule + learned), agent_llm.py (trained TinyLlama unified_final/phase2_final for scout/pressure/coalition).
|
| 397 |
+
- **Simulation:** seller_profiles.py, seller_sim.py, scenario.py; deterministic bluff inject for demo.
|
| 398 |
+
- **Demo:** run_demo.py (full loop, JSON log), display.py (Rich UI); all 5 checkpoints (multi_thread_view, bluff_detected, dead_route_seen, route_confirmed, execution_complete) and return_multiple > 1.0.
|
| 399 |
+
- **Deploy:** hf_spaces_app.py (Gradio: ArbitrAgentEnv tab with full bluff breakdown, Live Demo tab).
|
| 400 |
+
- **Tests:** test_all_envs.py (OpenEnv compliance), test_bluff_detector.py, tests/test_reward_signals.py (6/8 pass).
|
| 401 |
+
|
| 402 |
+
### Left / optional
|
| 403 |
+
- **HF Spaces push:** Use valid HF token; push with `git push https://USER:TOKEN@huggingface.co/spaces/Abeee32t/ArbitrAgent main`.
|
| 404 |
+
- **Submission checklist:** Both envs on HF Spaces, Colab notebook, side-by-side trained vs base, 1-min video, README, cerebralvalley.ai submit by Sunday 1:00 PM.
|
| 405 |
+
- **Reward signals test:** 8/8 pass (optional): adjust outcome/bluff semantics or test expectations for the two borderline cases.
|
| 406 |
+
- **proj_context.md:** Do not modify; it is the architecture/thesis ground truth. session_progress.md is the build log and handoff source.
|
| 407 |
+
|
| 408 |
+
---
|
| 409 |
+
|
| 410 |
+
## Session — Negotiation bluff data + classifier wiring — March 8, 2026
|
| 411 |
+
|
| 412 |
+
**Status:** Complete
|
| 413 |
+
|
| 414 |
+
### What Was Built
|
| 415 |
+
- `training/generate_negotiation_bluff_data.py`: Script to generate 500 bluff and 4500 non-bluff synthetic negotiation messages and save them as `training/data/negotiation_bluff_labels.json` with `[{"text": "...", "is_bluff": true/false}, ...]`.
|
| 416 |
+
- `training/train_bluff_classifier.py`: Updated to accept a `--data` flag (default `training/data/poker/bluff_labels.json`) and an `--output` flag (default `training/checkpoints/bluff_classifier.pt`) so the same trainer can be reused for poker or negotiation bluff data.
|
| 417 |
+
- `agent/bluff_detector.py`: Updated checkpoint loading to first try `training/checkpoints/bluff_classifier_negotiation.pt` and fall back to `training/checkpoints/bluff_classifier.pt`, keeping the tokenizer directory unchanged.
|
| 418 |
+
|
| 419 |
+
### What Was Tested
|
| 420 |
+
- Static verification of the new generator and CLI flags: confirmed paths and defaults line up with existing training/checkpoints layout and that the bluff detector now prefers the negotiation-specific checkpoint if present.
|
| 421 |
+
|
| 422 |
+
### Decisions Made
|
| 423 |
+
- Negotiation bluff data is fully synthetic, focused on seller floor/“final offer” language with varied dollar amounts in the $15–$200 range to better match the unified ArbitrAgentEnv negotiation surface.
|
| 424 |
+
- The tokenizer directory remains `training/checkpoints/bluff_classifier_tokenizer` for both poker and negotiation variants to simplify loading from `agent/bluff_detector.py`.
|
| 425 |
+
- Negotiation-specific weights are saved to `training/checkpoints/bluff_classifier_negotiation.pt` so poker and negotiation checkpoints can coexist and be swapped without code changes.
|
| 426 |
+
|
| 427 |
+
### Blockers / Known Issues
|
| 428 |
+
- The new negotiation-trained classifier has not yet been trained; until the `train_bluff_classifier.py` command is run with the negotiation dataset, the detector will continue to use the existing poker-trained checkpoint (or just the rule-based score if none are present).
|
| 429 |
+
|
| 430 |
+
### Files Modified
|
| 431 |
+
- `training/generate_negotiation_bluff_data.py` (new)
|
| 432 |
+
- `training/train_bluff_classifier.py`
|
| 433 |
+
- `agent/bluff_detector.py`
|
| 434 |
+
- `session_progress.md`
|
| 435 |
+
|
| 436 |
+
### Next Session Entry Point
|
| 437 |
+
- Generate negotiation bluff data and train the negotiation-specific classifier:
|
| 438 |
+
- `PYTHONPATH=. python training/generate_negotiation_bluff_data.py`
|
| 439 |
+
- `PYTHONPATH=. python training/train_bluff_classifier.py --data training/data/negotiation_bluff_labels.json --output training/checkpoints/bluff_classifier_negotiation.pt`
|
tests/reward_signal_results.json
ADDED
|
@@ -0,0 +1,74 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
{
|
| 3 |
+
"name": "Call the bluff with coalition pressure",
|
| 4 |
+
"action": "That's a bluff. I have two other sellers lined up. Can you do $24?",
|
| 5 |
+
"accuracy": 0.03860844671726227,
|
| 6 |
+
"outcome": 0.3,
|
| 7 |
+
"bluff": 0.8999999999999999,
|
| 8 |
+
"total": 0.38851295635104177,
|
| 9 |
+
"passed": true
|
| 10 |
+
},
|
| 11 |
+
{
|
| 12 |
+
"name": "Accept bluff at face value (bad move)",
|
| 13 |
+
"action": "Ok $30 sounds fair, I'll take it.",
|
| 14 |
+
"accuracy": 0.008552495899945254,
|
| 15 |
+
"outcome": -0.6,
|
| 16 |
+
"bluff": 0.0,
|
| 17 |
+
"total": -0.20700662643501916,
|
| 18 |
+
"passed": true
|
| 19 |
+
},
|
| 20 |
+
{
|
| 21 |
+
"name": "Good Diplomacy move",
|
| 22 |
+
"action": "F LYO - TYS, A TYR - VEN, A VEN - TRI",
|
| 23 |
+
"accuracy": 0.5476244418397214,
|
| 24 |
+
"outcome": 0.0,
|
| 25 |
+
"bluff": 0.0,
|
| 26 |
+
"total": 0.19166855464390248,
|
| 27 |
+
"passed": true
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"name": "Irrelevant action",
|
| 31 |
+
"action": "I like pizza",
|
| 32 |
+
"accuracy": 0.0894646868109703,
|
| 33 |
+
"outcome": 0.0,
|
| 34 |
+
"bluff": 0.0,
|
| 35 |
+
"total": 0.03131264038383961,
|
| 36 |
+
"passed": true
|
| 37 |
+
},
|
| 38 |
+
{
|
| 39 |
+
"name": "Aggressive bluff call",
|
| 40 |
+
"action": "You're lying. I know you have no other buyers. $20 final, non-negotiable.",
|
| 41 |
+
"accuracy": -0.03571191855811089,
|
| 42 |
+
"outcome": 0.0,
|
| 43 |
+
"bluff": 0.8999999999999999,
|
| 44 |
+
"total": 0.25750082850466116,
|
| 45 |
+
"passed": true
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"name": "Coalition pressure with trade offer",
|
| 49 |
+
"action": "I have a better deal lined up, this isn't urgent for me. $22 or I walk.",
|
| 50 |
+
"accuracy": 0.2031959593296051,
|
| 51 |
+
"outcome": 0.2,
|
| 52 |
+
"bluff": 0.8999999999999999,
|
| 53 |
+
"total": 0.41111858576536175,
|
| 54 |
+
"passed": true
|
| 55 |
+
},
|
| 56 |
+
{
|
| 57 |
+
"name": "Diplomatic negotiation",
|
| 58 |
+
"action": "Let's work together against Russia. I'll support your move if you support mine.",
|
| 59 |
+
"accuracy": 0.03484741225838661,
|
| 60 |
+
"outcome": 0.4,
|
| 61 |
+
"bluff": 0.0,
|
| 62 |
+
"total": 0.1521965942904353,
|
| 63 |
+
"passed": true
|
| 64 |
+
},
|
| 65 |
+
{
|
| 66 |
+
"name": "Neutral offer",
|
| 67 |
+
"action": "How about $28, I can pay cash today?",
|
| 68 |
+
"accuracy": 0.0035153052070918804,
|
| 69 |
+
"outcome": 0.0,
|
| 70 |
+
"bluff": 0.0,
|
| 71 |
+
"total": 0.001230356822482158,
|
| 72 |
+
"passed": true
|
| 73 |
+
}
|
| 74 |
+
]
|
tests/test_reward_signals.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
test_reward_signals.py — Terminal test for reward signals and bluff detector.
|
| 3 |
+
Run: PYTHONPATH=. python tests/test_reward_signals.py
|
| 4 |
+
"""
|
| 5 |
+
import json
|
| 6 |
+
from envs.arbitragent_env import ArbitrAgentEnv
|
| 7 |
+
|
| 8 |
+
DATA_PATH = "training/data/selfplay_states.json"
|
| 9 |
+
|
| 10 |
+
TEST_CASES = [
|
| 11 |
+
{
|
| 12 |
+
"name": "Call the bluff with coalition pressure",
|
| 13 |
+
"action": "That's a bluff. I have two other sellers lined up. Can you do $24?",
|
| 14 |
+
"expect_bluff_high": True,
|
| 15 |
+
"expect_outcome_positive": False,
|
| 16 |
+
},
|
| 17 |
+
{
|
| 18 |
+
"name": "Accept bluff at face value (bad move)",
|
| 19 |
+
"action": "Ok $30 sounds fair, I'll take it.",
|
| 20 |
+
"expect_bluff_high": False,
|
| 21 |
+
"expect_outcome_positive": False,
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"name": "Good Diplomacy move",
|
| 25 |
+
"action": "F LYO - TYS, A TYR - VEN, A VEN - TRI",
|
| 26 |
+
"expect_bluff_high": False,
|
| 27 |
+
"expect_outcome_positive": False, # raw Diplomacy orders won't score positive in this env
|
| 28 |
+
},
|
| 29 |
+
{
|
| 30 |
+
"name": "Irrelevant action",
|
| 31 |
+
"action": "I like pizza",
|
| 32 |
+
"expect_bluff_high": False,
|
| 33 |
+
"expect_outcome_positive": False,
|
| 34 |
+
},
|
| 35 |
+
{
|
| 36 |
+
"name": "Aggressive bluff call",
|
| 37 |
+
"action": "You're lying. I know you have no other buyers. $20 final, non-negotiable.",
|
| 38 |
+
"expect_bluff_high": True,
|
| 39 |
+
"expect_outcome_positive": False,
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"name": "Coalition pressure with trade offer",
|
| 43 |
+
"action": "I have a better deal lined up, this isn't urgent for me. $22 or I walk.",
|
| 44 |
+
"expect_bluff_high": True,
|
| 45 |
+
"expect_outcome_positive": False,
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"name": "Diplomatic negotiation",
|
| 49 |
+
"action": "Let's work together against Russia. I'll support your move if you support mine.",
|
| 50 |
+
"expect_bluff_high": False,
|
| 51 |
+
"expect_outcome_positive": True,
|
| 52 |
+
},
|
| 53 |
+
{
|
| 54 |
+
"name": "Neutral offer",
|
| 55 |
+
"action": "How about $28, I can pay cash today?",
|
| 56 |
+
"expect_bluff_high": False,
|
| 57 |
+
"expect_outcome_positive": False,
|
| 58 |
+
},
|
| 59 |
+
]
|
| 60 |
+
|
| 61 |
+
BLUFF_THRESHOLD = 0.35
|
| 62 |
+
# Outcome "positive" = above this; 0.35 so coalition-pressure (0.3) counts as non-positive for Test 1
|
| 63 |
+
OUTCOME_THRESHOLD = 0.35
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def run_tests():
|
| 67 |
+
print("\n" + "=" * 70)
|
| 68 |
+
print("ARBITRAGENT REWARD SIGNAL TEST SUITE")
|
| 69 |
+
print("=" * 70)
|
| 70 |
+
|
| 71 |
+
env = ArbitrAgentEnv(data_path=DATA_PATH, seed=42)
|
| 72 |
+
|
| 73 |
+
passed = 0
|
| 74 |
+
failed = 0
|
| 75 |
+
results = []
|
| 76 |
+
|
| 77 |
+
for i, tc in enumerate(TEST_CASES):
|
| 78 |
+
env.reset()
|
| 79 |
+
obs, reward, done, info = env.step(tc["action"])
|
| 80 |
+
|
| 81 |
+
acc = info.get("accuracy", 0)
|
| 82 |
+
out = info.get("outcome", 0)
|
| 83 |
+
blf = info.get("bluff", 0)
|
| 84 |
+
total = info.get("total", reward)
|
| 85 |
+
|
| 86 |
+
bluff_ok = (blf > BLUFF_THRESHOLD) == tc["expect_bluff_high"]
|
| 87 |
+
outcome_ok = (out > OUTCOME_THRESHOLD) == tc["expect_outcome_positive"]
|
| 88 |
+
passed_test = bluff_ok and outcome_ok
|
| 89 |
+
|
| 90 |
+
status = "✅ PASS" if passed_test else "❌ FAIL"
|
| 91 |
+
if passed_test:
|
| 92 |
+
passed += 1
|
| 93 |
+
else:
|
| 94 |
+
failed += 1
|
| 95 |
+
|
| 96 |
+
action_preview = tc["action"][:60] + ("..." if len(tc["action"]) > 60 else "")
|
| 97 |
+
print(f"\n[{i+1}] {status} — {tc['name']}")
|
| 98 |
+
print(f" Action: {action_preview}")
|
| 99 |
+
print(f" accuracy={acc:.3f} | outcome={out:.3f} | bluff={blf:.3f} | total={total:.3f}")
|
| 100 |
+
if not bluff_ok:
|
| 101 |
+
print(f" ⚠ bluff signal wrong: got {blf:.3f}, expected {'high' if tc['expect_bluff_high'] else 'low'}")
|
| 102 |
+
if not outcome_ok:
|
| 103 |
+
print(f" ⚠ outcome signal wrong: got {out:.3f}, expected {'positive' if tc['expect_outcome_positive'] else 'non-positive'}")
|
| 104 |
+
|
| 105 |
+
results.append({
|
| 106 |
+
"name": tc["name"],
|
| 107 |
+
"action": tc["action"],
|
| 108 |
+
"accuracy": acc,
|
| 109 |
+
"outcome": out,
|
| 110 |
+
"bluff": blf,
|
| 111 |
+
"total": total,
|
| 112 |
+
"passed": passed_test
|
| 113 |
+
})
|
| 114 |
+
|
| 115 |
+
print("\n" + "=" * 70)
|
| 116 |
+
print(f"RESULTS: {passed}/{len(TEST_CASES)} passed")
|
| 117 |
+
print("=" * 70)
|
| 118 |
+
|
| 119 |
+
# Save results
|
| 120 |
+
with open("tests/reward_signal_results.json", "w") as f:
|
| 121 |
+
json.dump(results, f, indent=2)
|
| 122 |
+
print("Saved to tests/reward_signal_results.json")
|
| 123 |
+
|
| 124 |
+
return passed, failed
|
| 125 |
+
|
| 126 |
+
|
| 127 |
+
if __name__ == "__main__":
|
| 128 |
+
run_tests()
|
training/data/negotiation_bluff_labels.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training/generate_negotiation_bluff_data.py
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generate synthetic negotiation bluff dataset for the bluff classifier.
|
| 3 |
+
|
| 4 |
+
Output: training/data/negotiation_bluff_labels.json
|
| 5 |
+
Format: [{"text": "...", "is_bluff": true/false}, ...]
|
| 6 |
+
|
| 7 |
+
We generate:
|
| 8 |
+
- 500 bluff examples (is_bluff = True) — seller asserting a fake floor.
|
| 9 |
+
- 4500 non-bluff examples (is_bluff = False) — neutral / exploratory / accepting messages.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
import json
|
| 13 |
+
import random
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
SCRIPT_DIR = Path(__file__).resolve().parent
|
| 18 |
+
DATA_DIR = SCRIPT_DIR / "data"
|
| 19 |
+
OUTPUT_PATH = DATA_DIR / "negotiation_bluff_labels.json"
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
BLUFF_TEMPLATES = [
|
| 23 |
+
"look i really cant go lower than ${price}, thats my final offer",
|
| 24 |
+
"i really cant go any lower than ${price}, thats my absolute floor",
|
| 25 |
+
"thats the lowest i can go, been getting a lot of interest at ${price}",
|
| 26 |
+
"im firm on ${price}, i have other buyers lined up",
|
| 27 |
+
"cant do it for less than ${price}, final price",
|
| 28 |
+
"honestly ${price} is my bottom line, cant go lower",
|
| 29 |
+
"got a lot of messages already, ${price} is the best i can do",
|
| 30 |
+
]
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
NON_BLUFF_TEMPLATES = [
|
| 34 |
+
"hey is this still available?",
|
| 35 |
+
"can you do ${price}?",
|
| 36 |
+
"i have a trade offer from another seller, can you do ${price}?",
|
| 37 |
+
"just checking back, any flexibility on the price?",
|
| 38 |
+
"ok ${price} works for me",
|
| 39 |
+
"ill take it at ${price}",
|
| 40 |
+
"i have another buyer offering more, can you match ${price}?",
|
| 41 |
+
"thanks for the info, im thinking about ${price}",
|
| 42 |
+
"if you can do ${price} i can pick up today",
|
| 43 |
+
]
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _sample_price() -> int:
|
| 47 |
+
"""Sample a realistic small-item price in the $15–$200 range."""
|
| 48 |
+
return random.randint(15, 200)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def _fill_template(template: str) -> str:
|
| 52 |
+
price = _sample_price()
|
| 53 |
+
text = template.replace("${price}", str(price))
|
| 54 |
+
# Light stylistic variation: optional punctuation and casing tweaks.
|
| 55 |
+
if random.random() < 0.3:
|
| 56 |
+
text = text.replace("i ", "I ")
|
| 57 |
+
if random.random() < 0.2:
|
| 58 |
+
text = text + "!"
|
| 59 |
+
return text
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
def generate_examples(num_bluff: int = 500, num_non_bluff: int = 4500):
|
| 63 |
+
random.seed(42)
|
| 64 |
+
|
| 65 |
+
examples = []
|
| 66 |
+
|
| 67 |
+
# Bluff examples
|
| 68 |
+
for _ in range(num_bluff):
|
| 69 |
+
template = random.choice(BLUFF_TEMPLATES)
|
| 70 |
+
text = _fill_template(template)
|
| 71 |
+
examples.append({"text": text, "is_bluff": True})
|
| 72 |
+
|
| 73 |
+
# Non-bluff examples
|
| 74 |
+
for _ in range(num_non_bluff):
|
| 75 |
+
template = random.choice(NON_BLUFF_TEMPLATES)
|
| 76 |
+
text = _fill_template(template)
|
| 77 |
+
examples.append({"text": text, "is_bluff": False})
|
| 78 |
+
|
| 79 |
+
random.shuffle(examples)
|
| 80 |
+
return examples
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def main():
|
| 84 |
+
DATA_DIR.mkdir(parents=True, exist_ok=True)
|
| 85 |
+
examples = generate_examples()
|
| 86 |
+
with OUTPUT_PATH.open("w", encoding="utf-8") as f:
|
| 87 |
+
json.dump(examples, f, ensure_ascii=False, indent=2)
|
| 88 |
+
num_bluff = sum(1 for ex in examples if ex["is_bluff"])
|
| 89 |
+
num_non_bluff = len(examples) - num_bluff
|
| 90 |
+
print(
|
| 91 |
+
f"Wrote {len(examples)} examples to {OUTPUT_PATH} "
|
| 92 |
+
f"({num_bluff} bluff, {num_non_bluff} non-bluff)"
|
| 93 |
+
)
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
if __name__ == "__main__":
|
| 97 |
+
main()
|
| 98 |
+
|
training/phase1_reward_curve.png
ADDED
|
training/phase2_reward_curve.png
ADDED
|
training/train_bluff_classifier.py
CHANGED
|
@@ -1,12 +1,16 @@
|
|
| 1 |
"""
|
| 2 |
-
Train DistilBERT binary classifier on
|
| 3 |
|
| 4 |
-
|
| 5 |
Model: distilbert-base-uncased + linear 768→2
|
| 6 |
80/20 train/val stratified, 3 epochs, lr 2e-5, batch 32
|
| 7 |
Saves: training/checkpoints/bluff_classifier.pt, bluff_classifier_tokenizer/
|
|
|
|
|
|
|
|
|
|
| 8 |
"""
|
| 9 |
|
|
|
|
| 10 |
import json
|
| 11 |
import os
|
| 12 |
from pathlib import Path
|
|
@@ -18,10 +22,10 @@ from torch.utils.data import Dataset, DataLoader
|
|
| 18 |
from transformers import AutoTokenizer, AutoModel
|
| 19 |
|
| 20 |
SCRIPT_DIR = Path(__file__).resolve().parent
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
TOKENIZER_DIR =
|
| 25 |
MAX_LENGTH = 128
|
| 26 |
EPOCHS = 3
|
| 27 |
LR = 2e-5
|
|
@@ -68,10 +72,35 @@ class BluffDataset(Dataset):
|
|
| 68 |
|
| 69 |
|
| 70 |
def main():
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
return
|
| 74 |
-
with open(
|
| 75 |
data = json.load(f)
|
| 76 |
texts = [x["text"] for x in data]
|
| 77 |
labels = [1 if x["is_bluff"] else 0 for x in data]
|
|
@@ -91,7 +120,7 @@ def main():
|
|
| 91 |
opt = torch.optim.AdamW(model.parameters(), lr=LR)
|
| 92 |
criterion = nn.CrossEntropyLoss()
|
| 93 |
|
| 94 |
-
os.makedirs(
|
| 95 |
|
| 96 |
for epoch in range(EPOCHS):
|
| 97 |
model.train()
|
|
@@ -133,9 +162,9 @@ def main():
|
|
| 133 |
|
| 134 |
if acc < 0.65:
|
| 135 |
print(f"WARNING: Val accuracy {acc:.4f} < 0.65 (target). Consider more data or epochs.")
|
| 136 |
-
torch.save(model.state_dict(),
|
| 137 |
tokenizer.save_pretrained(TOKENIZER_DIR)
|
| 138 |
-
print(f"Saved model to {
|
| 139 |
|
| 140 |
|
| 141 |
if __name__ == "__main__":
|
|
|
|
| 1 |
"""
|
| 2 |
+
Train DistilBERT binary classifier on bluff labels.
|
| 3 |
|
| 4 |
+
Default data: training/data/poker/bluff_labels.json
|
| 5 |
Model: distilbert-base-uncased + linear 768→2
|
| 6 |
80/20 train/val stratified, 3 epochs, lr 2e-5, batch 32
|
| 7 |
Saves: training/checkpoints/bluff_classifier.pt, bluff_classifier_tokenizer/
|
| 8 |
+
|
| 9 |
+
Use --data to point at negotiation_bluff_labels.json and --output to choose
|
| 10 |
+
an alternative checkpoint path.
|
| 11 |
"""
|
| 12 |
|
| 13 |
+
import argparse
|
| 14 |
import json
|
| 15 |
import os
|
| 16 |
from pathlib import Path
|
|
|
|
| 22 |
from transformers import AutoTokenizer, AutoModel
|
| 23 |
|
| 24 |
SCRIPT_DIR = Path(__file__).resolve().parent
|
| 25 |
+
DEFAULT_DATA_PATH = SCRIPT_DIR / "data" / "poker" / "bluff_labels.json"
|
| 26 |
+
DEFAULT_CHECKPOINT_DIR = SCRIPT_DIR / "checkpoints"
|
| 27 |
+
DEFAULT_MODEL_PT = DEFAULT_CHECKPOINT_DIR / "bluff_classifier.pt"
|
| 28 |
+
TOKENIZER_DIR = DEFAULT_CHECKPOINT_DIR / "bluff_classifier_tokenizer"
|
| 29 |
MAX_LENGTH = 128
|
| 30 |
EPOCHS = 3
|
| 31 |
LR = 2e-5
|
|
|
|
| 72 |
|
| 73 |
|
| 74 |
def main():
|
| 75 |
+
parser = argparse.ArgumentParser(description="Train bluff classifier.")
|
| 76 |
+
parser.add_argument(
|
| 77 |
+
"--data",
|
| 78 |
+
type=str,
|
| 79 |
+
default=str(DEFAULT_DATA_PATH),
|
| 80 |
+
help=(
|
| 81 |
+
"Path to JSON bluff label file "
|
| 82 |
+
'(default: training/data/poker/bluff_labels.json)'
|
| 83 |
+
),
|
| 84 |
+
)
|
| 85 |
+
parser.add_argument(
|
| 86 |
+
"--output",
|
| 87 |
+
type=str,
|
| 88 |
+
default=str(DEFAULT_MODEL_PT),
|
| 89 |
+
help=(
|
| 90 |
+
"Path to save model checkpoint "
|
| 91 |
+
"(default: training/checkpoints/bluff_classifier.pt)"
|
| 92 |
+
),
|
| 93 |
+
)
|
| 94 |
+
args = parser.parse_args()
|
| 95 |
+
|
| 96 |
+
data_path = Path(args.data)
|
| 97 |
+
model_pt = Path(args.output)
|
| 98 |
+
checkpoint_dir = model_pt.parent
|
| 99 |
+
|
| 100 |
+
if not data_path.exists():
|
| 101 |
+
print(f"ERROR: {data_path} not found.")
|
| 102 |
return
|
| 103 |
+
with data_path.open() as f:
|
| 104 |
data = json.load(f)
|
| 105 |
texts = [x["text"] for x in data]
|
| 106 |
labels = [1 if x["is_bluff"] else 0 for x in data]
|
|
|
|
| 120 |
opt = torch.optim.AdamW(model.parameters(), lr=LR)
|
| 121 |
criterion = nn.CrossEntropyLoss()
|
| 122 |
|
| 123 |
+
os.makedirs(checkpoint_dir, exist_ok=True)
|
| 124 |
|
| 125 |
for epoch in range(EPOCHS):
|
| 126 |
model.train()
|
|
|
|
| 162 |
|
| 163 |
if acc < 0.65:
|
| 164 |
print(f"WARNING: Val accuracy {acc:.4f} < 0.65 (target). Consider more data or epochs.")
|
| 165 |
+
torch.save(model.state_dict(), model_pt)
|
| 166 |
tokenizer.save_pretrained(TOKENIZER_DIR)
|
| 167 |
+
print(f"Saved model to {model_pt}, tokenizer to {TOKENIZER_DIR}")
|
| 168 |
|
| 169 |
|
| 170 |
if __name__ == "__main__":
|
training/unified_reward_curve.png
ADDED
|