# Chaos-Economy / eval.py
# Uploaded by MananBansal
# Commit 32b1802: "feat: full Chaos Economy codebase (no binaries)"
"""Inference-only evaluation for the Chaos Economy multi-agent simulator.

Bypasses the GRPO trainer entirely. Loads a base model (optionally with a
saved LoRA adapter) and runs N episodes through the env, logging story/*
metrics to W&B for direct comparison against trained runs.

Usage:
    # Trained LoRA evaluation
    python eval.py --base_model unsloth/Llama-3.2-1B-Instruct \
        --load_lora_path ./checkpoints/unified_market_lora \
        --num_episodes 30 --episode_length 250 \
        --wandb_project "Chaos Economy" --run_name eval-trained-1b

    # Baseline (no adapter)
    python eval.py --base_model unsloth/Llama-3.2-1B-Instruct \
        --num_episodes 30 --episode_length 250 \
        --wandb_project "Chaos Economy" --run_name eval-baseline-1b

    # AWS Bedrock 70B (no local GPU needed)
    python eval.py --use_bedrock \
        --bedrock_model meta.llama3-1-70b-instruct-v1:0 \
        --aws_region us-east-1 \
        --num_episodes 5 --episode_length 50 \
        --wandb_project "Chaos Economy" --run_name eval-bedrock-70b
"""
import argparse
import os
from collections import Counter
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from multi_agent.environment import MultiAgentVSREnvironment
from train_multi_agent_pipeline import (
TRADER_CONFIGS,
format_trader_prompt,
format_mm_prompt,
format_oversight_prompt,
parse_json,
scripted_trader,
scripted_mm,
scripted_oversight,
detect_coordinated_pressure,
get_position_heatmap,
)
# Weights & Biases is an optional dependency: remember whether the import
# succeeded so that logging code can be skipped when it is not installed.
try:
    import wandb
    HAS_WANDB = True
except ImportError:
    HAS_WANDB = False
def generate_batch_bedrock(prompts, bedrock_client, model_id, max_new_tokens=120, temperature=0.7):
    """Send each prompt to the AWS Bedrock ``converse`` API, one call per prompt.

    Args:
        prompts: Iterable of prompt strings.
        bedrock_client: Object exposing a ``converse(modelId=, messages=,
            inferenceConfig=)`` method.
        model_id: Value forwarded as ``modelId``.
        max_new_tokens: Forwarded as ``inferenceConfig["maxTokens"]``.
        temperature: Forwarded as ``inferenceConfig["temperature"]``.

    Returns:
        A list with the text of the first content block of every reply, in
        the same order as ``prompts``.
    """

    def _ask(prompt):
        # Every prompt is sent as a single-turn user message.
        reply = bedrock_client.converse(
            modelId=model_id,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            inferenceConfig={"maxTokens": max_new_tokens, "temperature": temperature},
        )
        first_block = reply["output"]["message"]["content"][0]
        return first_block["text"]

    return [_ask(prompt) for prompt in prompts]
def generate_batch(prompts, model, tokenizer, device, max_new_tokens=120, temperature=0.7):
    """Generate one completion per prompt with a local causal language model.

    Args:
        prompts: List of prompt strings, tokenized together as one batch.
        model: Object exposing ``generate(**inputs, ...)``.
        tokenizer: Tokenizer used for encoding and decoding. Its
            ``padding_side`` is set to ``"left"`` and, when it has no pad
            token, ``pad_token`` is set to ``eos_token`` (both in place).
        device: Device the encoded batch is moved to.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``; sampling is enabled
            only when it is greater than zero.

    Returns:
        A list of decoded continuations (prompt tokens removed), one per
        prompt and in the same order.
    """
    # Left padding makes every prompt end at the same column, so the
    # continuation of each row starts at one shared offset (see below).
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    encoded = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1500,
    )
    encoded = encoded.to(device)

    with torch.no_grad():
        generated = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            do_sample=temperature > 0,
            pad_token_id=tokenizer.pad_token_id,
            repetition_penalty=1.1,
        )

    # Everything before this column is the (padded) prompt.
    prompt_width = encoded["input_ids"].shape[1]
    completions = []
    for row in range(len(prompts)):
        new_tokens = generated[row][prompt_width:]
        completions.append(tokenizer.decode(new_tokens, skip_special_tokens=True))
    return completions
def diversity_score(actions):
    """Return the Shannon entropy (natural log) of the traders' action mix.

    Each agent whose id starts with ``"trader"`` contributes one
    ``(direction, bucket)`` key: ``direction`` defaults to ``"hold"``, and
    ``bucket`` is ``"hold"`` for holds, otherwise the action's
    ``size_bucket`` (default ``"small"``). Non-trader agents are ignored.

    Args:
        actions: Mapping of agent id to an action dict.

    Returns:
        The entropy as a plain ``float``; ``0.0`` when there are no trader
        actions or when every trader chose the same key. The result is never
        negative.
    """
    keys = []
    for agent_id, action in actions.items():
        if not agent_id.startswith("trader"):
            continue
        direction = action.get("direction", "hold")
        bucket = "hold" if direction == "hold" else action.get("size_bucket", "small")
        keys.append((direction, bucket))
    if not keys:
        return 0.0

    total = len(keys)
    # Every count is >= 1, so each probability is strictly positive and the
    # log is finite; no epsilon is needed (adding one pushed the unanimous
    # case slightly below zero).
    entropy = -sum((count / total) * np.log(count / total) for count in Counter(keys).values())
    # Clamp so the unanimous case is exactly 0.0 rather than -0.0.
    return max(0.0, float(entropy))
def run_episode(model, tokenizer, device, episode_length, seed, use_model=True, verbose=False,
                bedrock_client=None, bedrock_model=None):
    """Run one episode of the multi-agent environment and return its metrics.

    A fresh ``MultiAgentVSREnvironment`` is created and reset with ``seed``.
    On every step an action is collected for each agent, either from the
    language model (``use_model=True``) or from the scripted policies, and
    passed to ``env.step``.

    Args:
        model: Local model forwarded to ``generate_batch`` (unused when a
            Bedrock client is given).
        tokenizer: Tokenizer forwarded to ``generate_batch``.
        device: Device forwarded to ``generate_batch``.
        episode_length: Maximum number of steps; also passed to the env.
        seed: Seed passed to ``env.reset``.
        use_model: When true, prompt the model for actions; otherwise use the
            scripted policies only.
        verbose: When true, print a progress line every 20 steps.
        bedrock_client: If truthy, generation goes through
            ``generate_batch_bedrock`` instead of the local model.
        bedrock_model: Model id forwarded to ``generate_batch_bedrock``.

    Returns:
        A dict with:
            ``rewards_total``: cumulative reward per agent
                (``trader_0``..``trader_3``, ``market_maker``, ``oversight``).
            ``format_rate``: share of model outputs that ``parse_json``
                reported as valid (0 when no output was requested).
            ``diversity_mean``: mean of the per-step ``diversity_score``.
            ``oversight_rate``: share of executed steps whose oversight action
                had an ``intervention_type`` other than ``"none"``.
            ``steps``: number of steps actually executed.
    """
    env = MultiAgentVSREnvironment(episode_length=episode_length)
    obs = env.reset(seed=seed)
    rewards_total = {f"trader_{i}": 0.0 for i in range(4)}
    rewards_total["market_maker"] = 0.0
    rewards_total["oversight"] = 0.0
    format_hits = 0
    format_attempts = 0
    diversity_per_step = []
    oversight_intervention_steps = 0
    for step in range(episode_length):
        actions = {}
        if use_model:
            # --- Phase 1: traders and market maker -------------------------
            # prompts[i] and meta[i] describe the same request; meta holds
            # (agent id, role, sampling temperature).
            prompts, meta = [], []
            for archetype, cfg in TRADER_CONFIGS.items():
                for tid in cfg["trader_ids"]:
                    aid = f"trader_{tid}"
                    # Only traders present in the observation are prompted.
                    if aid in obs:
                        prompts.append(format_trader_prompt(archetype, aid, obs[aid]))
                        meta.append((aid, "trader", cfg["temperature"]))
            coord = detect_coordinated_pressure(env.agent_states) if hasattr(env, "agent_states") else {}
            prompts.append(format_mm_prompt(obs["market_maker"], coord))
            meta.append(("market_maker", "market_maker", 0.3))
            # Each prompt is generated on its own because every agent has its
            # own temperature.
            outputs = []
            for i, (aid, role, temp) in enumerate(meta):
                if bedrock_client:
                    out = generate_batch_bedrock([prompts[i]], bedrock_client, bedrock_model,
                                                 max_new_tokens=120, temperature=temp)[0]
                else:
                    out = generate_batch([prompts[i]], model, tokenizer, device,
                                         max_new_tokens=120, temperature=temp)[0]
                outputs.append(out)
            agent_thoughts = {}
            for output, (aid, role, _) in zip(outputs, meta):
                parsed, info = parse_json(output, role=role)
                format_attempts += 1
                if info.get("valid"):
                    format_hits += 1
                    actions[aid] = parsed
                else:
                    # Invalid output: fall back to the scripted policy so the
                    # episode can continue.
                    if role == "trader":
                        actions[aid] = scripted_trader(int(aid.split("_")[1]), step)
                    else:
                        actions[aid] = scripted_mm(step)
                agent_thoughts[aid] = actions[aid].get("reasoning", "")
            # --- Phase 2: oversight, which sees the other agents' reasoning -
            heatmap = get_position_heatmap(env.agent_states) if hasattr(env, "agent_states") else {}
            ov_prompt = format_oversight_prompt(obs["oversight"], heatmap, coord, agent_thoughts)
            if bedrock_client:
                ov_out = generate_batch_bedrock([ov_prompt], bedrock_client, bedrock_model,
                                                max_new_tokens=140, temperature=0.5)[0]
            else:
                ov_out = generate_batch([ov_prompt], model, tokenizer, device,
                                        max_new_tokens=140, temperature=0.5)[0]
            ov_parsed, ov_info = parse_json(ov_out, role="oversight")
            format_attempts += 1
            if ov_info.get("valid"):
                format_hits += 1
                actions["oversight"] = ov_parsed
            else:
                actions["oversight"] = scripted_oversight()
        else:
            # Fully scripted baseline for traders 0-2, market maker, oversight.
            for i in range(3):
                actions[f"trader_{i}"] = scripted_trader(i, step)
            actions["market_maker"] = scripted_mm(step)
            actions["oversight"] = scripted_oversight()
        # NOTE(review): the indentation of this file was lost, so the nesting
        # of this line is reconstructed. It is placed at step level (trader_3
        # is scripted in both modes, overwriting any model action for it)
        # because the scripted branch above only covers traders 0-2. Confirm
        # against the upstream file; it may belong inside the `else` branch.
        actions["trader_3"] = scripted_trader(3, step)
        if actions["oversight"].get("intervention_type", "none") != "none":
            oversight_intervention_steps += 1
        diversity_per_step.append(diversity_score(actions))
        obs, rewards, done, _info = env.step(actions)
        for k in rewards_total:
            rewards_total[k] += float(rewards.get(k, 0.0))
        if verbose and step % 20 == 0:
            # The value labelled pnl_t0 is trader_0's reward for this step.
            print(f" step {step:3d} pnl_t0={rewards.get('trader_0',0):+.2f} "
                  f"div={diversity_per_step[-1]:.2f} "
                  f"fmt={format_hits}/{format_attempts}")
        if done:
            break
    # One diversity entry is appended per executed step, so its length is the
    # number of steps run (which can be less than episode_length).
    steps_run = len(diversity_per_step)
    return {
        "rewards_total": rewards_total,
        "format_rate": format_hits / max(1, format_attempts),
        # NOTE(review): with zero executed steps this is the mean of an empty
        # list (NaN with a NumPy warning) — callers should pass a length >= 1.
        "diversity_mean": float(np.mean(diversity_per_step)),
        "oversight_rate": oversight_intervention_steps / max(1, steps_run),
        "steps": steps_run,
    }
def main():
    """Command-line entry point: evaluate a model over several episodes.

    Parses the CLI flags, optionally starts a W&B run, prepares either an AWS
    Bedrock client (``--use_bedrock``) or a local Hugging Face model (with an
    optional LoRA adapter), runs ``--num_episodes`` episodes through
    ``run_episode`` with seeds ``seed, seed + 1, ...``, and prints per-episode
    and aggregate metrics. When a W&B run is active, per-episode ``story/*``
    metrics and final summary values are logged to it.

    Exits with an argparse usage error when ``--num_episodes`` or
    ``--episode_length`` is smaller than 1, since no metrics could be
    computed in that case.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--base_model", default="unsloth/Llama-3.2-1B-Instruct")
    p.add_argument("--load_lora_path", default=None)
    p.add_argument("--num_episodes", type=int, default=10)
    p.add_argument("--episode_length", type=int, default=250)
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--wandb_project", default=None)
    p.add_argument("--run_name", default=None)
    # AWS Bedrock flags — skips local model loading entirely
    p.add_argument("--use_bedrock", action="store_true")
    p.add_argument("--bedrock_model", default="meta.llama3-1-70b-instruct-v1:0")
    p.add_argument("--aws_region", default="us-east-1")
    args = p.parse_args()

    # Zero episodes or zero-length episodes would only yield NaN statistics.
    if args.num_episodes < 1:
        p.error("--num_episodes must be at least 1")
    if args.episode_length < 1:
        p.error("--episode_length must be at least 1")

    if args.wandb_project:
        if HAS_WANDB and os.environ.get("WANDB_API_KEY"):
            wandb.init(project=args.wandb_project, name=args.run_name or "eval",
                       config=vars(args))
        else:
            # Tell the user why nothing shows up in W&B instead of silently
            # ignoring the flag.
            reason = "WANDB_API_KEY is not set" if HAS_WANDB else "the wandb package is not installed"
            print(f"[WARNING] --wandb_project was given but {reason}; "
                  f"metrics will not be logged to W&B.")
    log_to_wandb = bool(HAS_WANDB and wandb.run)

    bedrock_client = None
    model = tokenizer = device = None
    if args.use_bedrock:
        # Imported lazily so boto3 is only required for Bedrock runs.
        import boto3
        print(f"[Bedrock] Region={args.aws_region} Model={args.bedrock_model}")
        bedrock_client = boto3.client("bedrock-runtime", region_name=args.aws_region)
        print("[Bedrock] Client ready — no local model loaded")
    else:
        device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
        dtype = torch.bfloat16 if device in ("cuda", "mps") else torch.float32
        if device == "cuda":
            # Separate the device from the GPU name (they used to be printed
            # run together, e.g. "cudaNVIDIA ...").
            print(f"[Device] {device} ({torch.cuda.get_device_name(0)}) dtype={dtype}")
        else:
            print(f"[Device] {device} dtype={dtype}")
        if device == "cpu":
            print("[WARNING] No GPU detected — running on CPU will be very slow.")
        print(f"[Model] Loading base: {args.base_model}")
        tokenizer = AutoTokenizer.from_pretrained(args.base_model)
        # device_map="cuda:0" pins the whole model to one GPU — avoids split-device
        # issues on ROCm where device_map="auto" can partially land on CPU.
        load_device_map = "cuda:0" if device == "cuda" else ("mps" if device == "mps" else "cpu")
        model = AutoModelForCausalLM.from_pretrained(args.base_model, device_map=load_device_map, torch_dtype=dtype)
        print(f"[Model] Loaded — first param device: {next(model.parameters()).device}")
        if args.load_lora_path:
            # peft is only needed when an adapter is requested.
            from peft import PeftModel
            print(f"[LoRA] Loading adapter: {args.load_lora_path}")
            model = PeftModel.from_pretrained(model, args.load_lora_path)
        else:
            print("[LoRA] No adapter (baseline mode)")
        model.eval()

    print(f"\n[Eval] {args.num_episodes} episodes x {args.episode_length} steps\n")
    agg = {"format_rate": [], "diversity_mean": [], "oversight_rate": [],
           "rewards_total": []}
    for ep in range(args.num_episodes):
        print(f"--- Episode {ep+1}/{args.num_episodes} ---")
        # Each episode gets its own seed; only the first one is verbose.
        result = run_episode(model, tokenizer, device, args.episode_length,
                             seed=args.seed + ep, use_model=True, verbose=(ep == 0),
                             bedrock_client=bedrock_client,
                             bedrock_model=args.bedrock_model if args.use_bedrock else None)
        agg["format_rate"].append(result["format_rate"])
        agg["diversity_mean"].append(result["diversity_mean"])
        agg["oversight_rate"].append(result["oversight_rate"])
        agg["rewards_total"].append(result["rewards_total"])
        # Mean cumulative reward over the four traders.
        ep_pnl = sum(result["rewards_total"][f"trader_{i}"] for i in range(4)) / 4
        print(f" ep_format={result['format_rate']:.2%} "
              f"ep_diversity={result['diversity_mean']:.3f} "
              f"ep_oversight={result['oversight_rate']:.2%} "
              f"ep_pnl_mean={ep_pnl:+.2f}\n")
        if log_to_wandb:
            wandb.log({
                "story/global_step": ep,
                "story/format_mean": result["format_rate"],
                "story/diversity_mean": result["diversity_mean"],
                "story/oversight_mean": result["oversight_rate"],
                "story/pnl_mean": ep_pnl,
            })

    print("=" * 60)
    print("FINAL EVALUATION SUMMARY")
    print("=" * 60)
    print(f"format_rate mean={np.mean(agg['format_rate']):.2%} std={np.std(agg['format_rate']):.2%}")
    print(f"diversity_mean mean={np.mean(agg['diversity_mean']):.3f} std={np.std(agg['diversity_mean']):.3f}")
    print(f"oversight_rate mean={np.mean(agg['oversight_rate']):.2%} std={np.std(agg['oversight_rate']):.2%}")
    for agent in ["trader_0", "trader_1", "trader_2", "trader_3", "market_maker", "oversight"]:
        vals = [r[agent] for r in agg["rewards_total"]]
        print(f"{agent:<14} cum_reward mean={np.mean(vals):+.2f} std={np.std(vals):.2f}")
    if log_to_wandb:
        wandb.summary["format_rate"] = float(np.mean(agg["format_rate"]))
        wandb.summary["diversity_mean"] = float(np.mean(agg["diversity_mean"]))
        wandb.summary["oversight_rate"] = float(np.mean(agg["oversight_rate"]))
        wandb.finish()
# Run the evaluation only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()