| """Inference-only evaluation for the Chaos Economy multi-agent simulator. |
| |
| Bypasses the GRPO trainer entirely. Loads a base model (optionally with a |
| saved LoRA adapter) and runs N episodes through the env, logging story/* |
| metrics to W&B for direct comparison against trained runs. |
| |
| Usage: |
| # Trained LoRA evaluation |
| python eval.py --base_model unsloth/Llama-3.2-1B-Instruct \ |
| --load_lora_path ./checkpoints/unified_market_lora \ |
| --num_episodes 30 --episode_length 250 \ |
| --wandb_project "Chaos Economy" --run_name eval-trained-1b |
| |
| # Baseline (no adapter) |
| python eval.py --base_model unsloth/Llama-3.2-1B-Instruct \ |
| --num_episodes 30 --episode_length 250 \ |
| --wandb_project "Chaos Economy" --run_name eval-baseline-1b |
| |
| # AWS Bedrock 70B (no local GPU needed) |
| python eval.py --use_bedrock \ |
| --bedrock_model meta.llama3-1-70b-instruct-v1:0 \ |
| --aws_region us-east-1 \ |
| --num_episodes 5 --episode_length 50 \ |
| --wandb_project "Chaos Economy" --run_name eval-bedrock-70b |
| """ |
|
|
| import argparse |
| import os |
| from collections import Counter |
|
|
| import numpy as np |
| import torch |
| from transformers import AutoModelForCausalLM, AutoTokenizer |
|
|
| from multi_agent.environment import MultiAgentVSREnvironment |
| from train_multi_agent_pipeline import ( |
| TRADER_CONFIGS, |
| format_trader_prompt, |
| format_mm_prompt, |
| format_oversight_prompt, |
| parse_json, |
| scripted_trader, |
| scripted_mm, |
| scripted_oversight, |
| detect_coordinated_pressure, |
| get_position_heatmap, |
| ) |
|
|
# Weights & Biases is optional: record whether it can be imported so the rest
# of the script can skip logging when it is absent.
try:
    import wandb
except ImportError:
    HAS_WANDB = False
else:
    HAS_WANDB = True
|
|
|
|
def generate_batch_bedrock(prompts, bedrock_client, model_id, max_new_tokens=120, temperature=0.7):
    """Query the AWS Bedrock ``converse`` API once per prompt.

    Each prompt is sent as a single user message. The returned list holds the
    text of the first content block of every reply, in prompt order.
    """

    def _ask(prompt):
        # One request per prompt; the inference config is rebuilt per call.
        reply = bedrock_client.converse(
            modelId=model_id,
            messages=[{"role": "user", "content": [{"text": prompt}]}],
            inferenceConfig={"maxTokens": max_new_tokens, "temperature": temperature},
        )
        return reply["output"]["message"]["content"][0]["text"]

    return [_ask(prompt) for prompt in prompts]
|
|
|
|
def generate_batch(prompts, model, tokenizer, device, max_new_tokens=120, temperature=0.7):
    """Generate one completion per prompt with a local causal LM.

    The tokenizer is switched to left padding (and given the EOS token as pad
    token if it has none), the prompts are encoded as one padded batch, and
    only the tokens produced after the prompt are decoded and returned.
    Sampling is enabled only when ``temperature`` is positive.
    """
    # Left padding puts every prompt flush against the generated tokens, so
    # the completion of every row starts at the same column.
    tokenizer.padding_side = "left"
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    encoded = tokenizer(
        prompts,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=1500,
    ).to(device)

    generation_kwargs = {
        "max_new_tokens": max_new_tokens,
        "temperature": temperature,
        "do_sample": temperature > 0,
        "pad_token_id": tokenizer.pad_token_id,
        "repetition_penalty": 1.1,
    }
    with torch.no_grad():
        generated = model.generate(**encoded, **generation_kwargs)

    # Everything before this column is the (padded) prompt.
    prompt_len = encoded["input_ids"].shape[1]
    completions = []
    for row in generated[: len(prompts)]:
        completions.append(tokenizer.decode(row[prompt_len:], skip_special_tokens=True))
    return completions
|
|
|
|
def diversity_score(actions):
    """Return the Shannon entropy (natural log) of the traders' action choices.

    Only entries whose agent id starts with ``"trader"`` are considered. Each
    trader contributes one ``(direction, bucket)`` key: ``direction`` defaults
    to ``"hold"``, and the bucket is ``"hold"`` for a hold, otherwise the
    action's ``size_bucket`` (default ``"small"``).

    Args:
        actions: mapping of agent id to an action mapping.

    Returns:
        A non-negative python ``float``; 0.0 when there are no trader actions
        or when every trader chose the same key.
    """
    keys = []
    for agent_id, action in actions.items():
        if not agent_id.startswith("trader"):
            continue
        direction = action.get("direction", "hold")
        bucket = "hold" if direction == "hold" else action.get("size_bucket", "small")
        keys.append((direction, bucket))
    if not keys:
        return 0.0

    total = len(keys)
    # Every probability comes from a count >= 1, so log(p) is always finite
    # and no smoothing epsilon is needed (an epsilon makes the entropy of a
    # unanimous choice slightly negative).
    entropy = -sum((n / total) * np.log(n / total) for n in Counter(keys).values())
    # max() also normalises -0.0 to 0.0 so formatted output never shows "-0.00".
    return max(0.0, float(entropy))
|
|
|
|
def _generate_one(prompt, temperature, max_new_tokens, model, tokenizer, device,
                  bedrock_client, bedrock_model):
    """Return one completion for ``prompt``: via Bedrock if a client is given, else the local model."""
    if bedrock_client is not None:
        return generate_batch_bedrock([prompt], bedrock_client, bedrock_model,
                                      max_new_tokens=max_new_tokens, temperature=temperature)[0]
    return generate_batch([prompt], model, tokenizer, device,
                          max_new_tokens=max_new_tokens, temperature=temperature)[0]


def run_episode(model, tokenizer, device, episode_length, seed, use_model=True, verbose=False,
                bedrock_client=None, bedrock_model=None):
    """Play one episode in ``MultiAgentVSREnvironment`` and collect statistics.

    Args:
        model, tokenizer, device: forwarded to ``generate_batch`` when no
            Bedrock client is supplied.
        episode_length: maximum number of steps; also passed to the env.
        seed: passed to ``env.reset``.
        use_model: when False, every agent acts through its scripted policy.
        verbose: print a progress line every 20 steps.
        bedrock_client: when given, completions come from
            ``generate_batch_bedrock`` instead of the local model.
        bedrock_model: Bedrock model id used with ``bedrock_client``.

    Returns:
        dict with ``rewards_total`` (cumulative reward per agent),
        ``format_rate`` (share of model outputs that parsed as valid),
        ``diversity_mean`` (mean per-step ``diversity_score``; 0.0 if no step
        ran), ``oversight_rate`` (share of steps with an oversight
        intervention) and ``steps`` (number of steps actually run).
    """
    env = MultiAgentVSREnvironment(episode_length=episode_length)
    obs = env.reset(seed=seed)

    rewards_total = {f"trader_{i}": 0.0 for i in range(4)}
    rewards_total["market_maker"] = 0.0
    rewards_total["oversight"] = 0.0

    format_hits = 0
    format_attempts = 0
    diversity_per_step = []
    oversight_intervention_steps = 0

    for step in range(episode_length):
        actions = {}

        if use_model:
            # Build one prompt per trader present in the observation, then the
            # market maker prompt. ``meta`` stays aligned with ``prompts``.
            prompts, meta = [], []
            for archetype, cfg in TRADER_CONFIGS.items():
                for tid in cfg["trader_ids"]:
                    aid = f"trader_{tid}"
                    if aid in obs:
                        prompts.append(format_trader_prompt(archetype, aid, obs[aid]))
                        meta.append((aid, "trader", cfg["temperature"]))
            coord = detect_coordinated_pressure(env.agent_states) if hasattr(env, "agent_states") else {}
            prompts.append(format_mm_prompt(obs["market_maker"], coord))
            meta.append(("market_maker", "market_maker", 0.3))

            # Prompts are generated one at a time because each agent has its
            # own sampling temperature.
            outputs = [
                _generate_one(prompt, temp, 120, model, tokenizer, device,
                              bedrock_client, bedrock_model)
                for prompt, (_aid, _role, temp) in zip(prompts, meta)
            ]

            agent_thoughts = {}
            for output, (aid, role, _) in zip(outputs, meta):
                parsed, info = parse_json(output, role=role)
                format_attempts += 1
                if info.get("valid"):
                    format_hits += 1
                    actions[aid] = parsed
                elif role == "trader":
                    # Unparseable output: fall back to the scripted policy.
                    actions[aid] = scripted_trader(int(aid.split("_")[1]), step)
                else:
                    actions[aid] = scripted_mm(step)
                agent_thoughts[aid] = actions[aid].get("reasoning", "")

            # Oversight acts last so its prompt can include the other agents'
            # stated reasoning for this step.
            heatmap = get_position_heatmap(env.agent_states) if hasattr(env, "agent_states") else {}
            ov_prompt = format_oversight_prompt(obs["oversight"], heatmap, coord, agent_thoughts)
            ov_out = _generate_one(ov_prompt, 0.5, 140, model, tokenizer, device,
                                   bedrock_client, bedrock_model)
            ov_parsed, ov_info = parse_json(ov_out, role="oversight")
            format_attempts += 1
            if ov_info.get("valid"):
                format_hits += 1
                actions["oversight"] = ov_parsed
            else:
                actions["oversight"] = scripted_oversight()
        else:
            for i in range(3):
                actions[f"trader_{i}"] = scripted_trader(i, step)
            actions["market_maker"] = scripted_mm(step)
            actions["oversight"] = scripted_oversight()

        # trader_3 always follows the scripted policy, overriding any model
        # action produced for it above.
        actions["trader_3"] = scripted_trader(3, step)

        if actions["oversight"].get("intervention_type", "none") != "none":
            oversight_intervention_steps += 1

        diversity_per_step.append(diversity_score(actions))

        obs, rewards, done, _info = env.step(actions)
        for k in rewards_total:
            rewards_total[k] += float(rewards.get(k, 0.0))

        if verbose and step % 20 == 0:
            print(f" step {step:3d} pnl_t0={rewards.get('trader_0',0):+.2f} "
                  f"div={diversity_per_step[-1]:.2f} "
                  f"fmt={format_hits}/{format_attempts}")

        if done:
            break

    steps_run = len(diversity_per_step)
    # np.mean([]) yields nan plus a RuntimeWarning; report 0.0 when no step ran.
    diversity_mean = float(np.mean(diversity_per_step)) if diversity_per_step else 0.0
    return {
        "rewards_total": rewards_total,
        "format_rate": format_hits / max(1, format_attempts),
        "diversity_mean": diversity_mean,
        "oversight_rate": oversight_intervention_steps / max(1, steps_run),
        "steps": steps_run,
    }
|
|
|
|
def main():
    """Command-line entry point: run evaluation episodes and report metrics.

    Parses the CLI arguments, optionally starts a W&B run, prepares either a
    Bedrock client or a local model (with an optional LoRA adapter), runs
    ``--num_episodes`` episodes with seeds ``seed, seed + 1, ...`` and prints
    per-episode and aggregate statistics. Aggregates are also written to the
    W&B run summary when a run is active.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--base_model", default="unsloth/Llama-3.2-1B-Instruct")
    p.add_argument("--load_lora_path", default=None)
    p.add_argument("--num_episodes", type=int, default=10)
    p.add_argument("--episode_length", type=int, default=250)
    p.add_argument("--seed", type=int, default=42)
    p.add_argument("--wandb_project", default=None)
    p.add_argument("--run_name", default=None)

    p.add_argument("--use_bedrock", action="store_true")
    p.add_argument("--bedrock_model", default="meta.llama3-1-70b-instruct-v1:0")
    p.add_argument("--aws_region", default="us-east-1")
    args = p.parse_args()

    # W&B logging needs the package and an API key in the environment. Say so
    # explicitly instead of silently running without logging.
    if args.wandb_project:
        if not HAS_WANDB:
            print("[W&B] --wandb_project given but wandb is not installed; logging disabled.")
        elif not os.environ.get("WANDB_API_KEY"):
            print("[W&B] --wandb_project given but WANDB_API_KEY is not set; logging disabled.")
        else:
            wandb.init(project=args.wandb_project, name=args.run_name or "eval",
                       config=vars(args))

    bedrock_client = None
    model = tokenizer = device = None

    if args.use_bedrock:
        # Imported lazily so boto3 is only required for Bedrock runs.
        import boto3
        print(f"[Bedrock] Region={args.aws_region} Model={args.bedrock_model}")
        bedrock_client = boto3.client("bedrock-runtime", region_name=args.aws_region)
        print("[Bedrock] Client ready — no local model loaded")
    else:
        device = "cuda" if torch.cuda.is_available() else ("mps" if torch.backends.mps.is_available() else "cpu")
        dtype = torch.bfloat16 if device in ("cuda", "mps") else torch.float32
        if device == "cuda":
            print(f"[Device] {device} — {torch.cuda.get_device_name(0)} dtype={dtype}")
        else:
            print(f"[Device] {device} dtype={dtype}")
        if device == "cpu":
            print("[WARNING] No GPU detected — running on CPU will be very slow.")

        print(f"[Model] Loading base: {args.base_model}")
        tokenizer = AutoTokenizer.from_pretrained(args.base_model)

        # Pin CUDA loads to the first GPU; mps/cpu use the device name as-is.
        load_device_map = "cuda:0" if device == "cuda" else device
        model = AutoModelForCausalLM.from_pretrained(args.base_model, device_map=load_device_map, torch_dtype=dtype)
        print(f"[Model] Loaded — first param device: {next(model.parameters()).device}")

        if args.load_lora_path:
            # Imported lazily so peft is only required when an adapter is used.
            from peft import PeftModel
            print(f"[LoRA] Loading adapter: {args.load_lora_path}")
            model = PeftModel.from_pretrained(model, args.load_lora_path)
        else:
            print("[LoRA] No adapter (baseline mode)")
        model.eval()

    print(f"\n[Eval] {args.num_episodes} episodes x {args.episode_length} steps\n")

    agg = {"format_rate": [], "diversity_mean": [], "oversight_rate": [],
           "rewards_total": []}

    for ep in range(args.num_episodes):
        print(f"--- Episode {ep+1}/{args.num_episodes} ---")
        # Each episode gets its own seed; only the first one prints step logs.
        result = run_episode(model, tokenizer, device, args.episode_length,
                             seed=args.seed + ep, use_model=True, verbose=(ep == 0),
                             bedrock_client=bedrock_client,
                             bedrock_model=args.bedrock_model if args.use_bedrock else None)
        agg["format_rate"].append(result["format_rate"])
        agg["diversity_mean"].append(result["diversity_mean"])
        agg["oversight_rate"].append(result["oversight_rate"])
        agg["rewards_total"].append(result["rewards_total"])

        # Mean cumulative reward over the four traders.
        ep_pnl = sum(result["rewards_total"][f"trader_{i}"] for i in range(4)) / 4
        print(f" ep_format={result['format_rate']:.2%} "
              f"ep_diversity={result['diversity_mean']:.3f} "
              f"ep_oversight={result['oversight_rate']:.2%} "
              f"ep_pnl_mean={ep_pnl:+.2f}\n")

        if HAS_WANDB and wandb.run:
            wandb.log({
                "story/global_step": ep,
                "story/format_mean": result["format_rate"],
                "story/diversity_mean": result["diversity_mean"],
                "story/oversight_mean": result["oversight_rate"],
                "story/pnl_mean": ep_pnl,
            })

    print("=" * 60)
    print("FINAL EVALUATION SUMMARY")
    print("=" * 60)
    print(f"format_rate mean={np.mean(agg['format_rate']):.2%} std={np.std(agg['format_rate']):.2%}")
    print(f"diversity_mean mean={np.mean(agg['diversity_mean']):.3f} std={np.std(agg['diversity_mean']):.3f}")
    print(f"oversight_rate mean={np.mean(agg['oversight_rate']):.2%} std={np.std(agg['oversight_rate']):.2%}")
    for agent in ["trader_0", "trader_1", "trader_2", "trader_3", "market_maker", "oversight"]:
        vals = [r[agent] for r in agg["rewards_total"]]
        print(f"{agent:<14} cum_reward mean={np.mean(vals):+.2f} std={np.std(vals):.2f}")

    if HAS_WANDB and wandb.run:
        wandb.summary["format_rate"] = float(np.mean(agg["format_rate"]))
        wandb.summary["diversity_mean"] = float(np.mean(agg["diversity_mean"]))
        wandb.summary["oversight_rate"] = float(np.mean(agg["oversight_rate"]))
        wandb.finish()
|
|
|
|
| if __name__ == "__main__": |
| main() |
|
|