Spaces:

iteratehack
/

deepbattler

Sleeping

File size: 22,783 Bytes

787c99c

#!/usr/bin/env python
# eval_battleground_rlaif.py
#
# Evaluation script for Battlegrounds RLAIF models: No FT, SFT, and SFT+GRPO.
# Measures action prediction accuracy against expert/labeled actions.

import argparse
import json
import os
import sys
from typing import Optional, Dict, Any, List
from tqdm import tqdm

import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import PeftModel

# Append this file's own directory to sys.path (only if it is not already
# there) so that modules located next to the script can be imported
# regardless of the current working directory.
_SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
if _SCRIPT_DIR not in sys.path:
    sys.path.append(_SCRIPT_DIR)

from battleground_nl_utils import (
    dataset_state_to_game_state,
    game_state_to_natural_language,
)

# ================== Constants ==================

# Relative directory that is probed with os.path.isdir() when the default
# base model is resolved; used as the model path if it exists.
LOCAL_INSTRUCT_PATH = "models/qwen3-4b-instruct-2507/Qwen/Qwen3-4B-Instruct-2507"
# Default value of the --data-file CLI option (a JSONL file).
DEFAULT_DATA_FILE = "RL/datasets/battleground_rlaif_multicandidate.jsonl"


def _resolve_default_model_id() -> str:
    """Return the model identifier used as the default for ``--base-model``.

    Resolution order:
      1. The ``QWEN_INSTRUCT_MODEL`` environment variable, when set and
         non-empty.
      2. ``LOCAL_INSTRUCT_PATH``, when that directory exists.
      3. The Hugging Face Hub id of the same checkpoint.

    Returns:
        A local path or a Hub model id.
    """
    env_override = os.environ.get("QWEN_INSTRUCT_MODEL")
    if env_override:
        return env_override
    if os.path.isdir(LOCAL_INSTRUCT_PATH):
        return LOCAL_INSTRUCT_PATH
    # The Hub fallback must name the same "-2507" release that
    # LOCAL_INSTRUCT_PATH mirrors, so that local and remote resolution load
    # the same checkpoint.
    return "Qwen/Qwen3-4B-Instruct-2507"


# Resolved once at import time; used as the default of the --base-model option.
DEFAULT_MODEL_ID = _resolve_default_model_id()


# ================== Data loading ==================

# Instruction text placed in front of the compact-JSON game state when the
# prompt is built in "json" input mode (see _build_prompt).
INSTRUCTION_PREFIX = """You are a Hearthstone Battlegrounds AI.
Given the current game state as a JSON object, choose exactly one best action and respond with a single JSON object in this exact format:
{"action":{"type":"<ACTION_TYPE>","tavern_index":<int-or-null>,"hand_index":<int-or-null>,"board_index":<int-or-null>,"card_name":<string-or-null>}}
Rules:
1. Respond with JSON only. Do not add explanations or any extra text.
2. The top-level object must have exactly one key: "action".
3. Use 0-based integers for indices or null when not used.
4. "type" must be one of: "BUY_FROM_TAVERN","PLAY_FROM_HAND","SELL_FROM_BOARD","HERO_POWER","ROLL","UPGRADE_TAVERN","FREEZE","END_TURN".
5. "card_name" must exactly match a card name from the game state when required, otherwise null.
Now here is the game state JSON:
"""

# Instruction text placed in front of the natural-language game state when the
# prompt is built in "nl" input mode (see _build_prompt). It requests the same
# JSON answer format as INSTRUCTION_PREFIX.
INSTRUCTION_PREFIX_NL = """You are a Hearthstone Battlegrounds AI.
Given the following natural language description of the current game state, choose exactly one best action and respond with a single JSON object in this exact format:
{"action":{"type":"<ACTION_TYPE>","tavern_index":<int-or-null>,"hand_index":<int-or-null>,"board_index":<int-or-null>,"card_name":<string-or-null>}}
Rules:
1. Respond with JSON only. Do not add explanations or any extra text.
2. The top-level object must have exactly one key: "action".
3. Use 0-based integers for indices or null when not used.
4. "type" must be one of: "BUY_FROM_TAVERN","PLAY_FROM_HAND","SELL_FROM_BOARD","HERO_POWER","ROLL","UPGRADE_TAVERN","FREEZE","END_TURN".
5. "card_name" must exactly match a card name from the game state when required, otherwise null.
Now here is the description of the game state:
"""


def _build_prompt(example: Dict[str, Any], input_mode: str = "json") -> str:
    """Assemble the model prompt for one dataset example.

    Args:
        example: Dataset row; the JSON mode reads its "phase", "turn" and
            "state" entries, the NL mode hands the whole row to
            ``dataset_state_to_game_state``.
        input_mode: "nl" for a natural-language state description; any other
            value produces the compact-JSON form.

    Returns:
        The instruction prefix, a newline, and the rendered game state.
    """
    if input_mode != "nl":
        payload = {
            "task": "battlegrounds_policy_v1",
            "phase": example["phase"],
            "turn": example["turn"],
            "state": example["state"],
        }
        # Compact separators keep the serialized state as short as possible.
        rendered = json.dumps(payload, separators=(",", ":"), ensure_ascii=False)
        return INSTRUCTION_PREFIX + "\n" + rendered

    description = game_state_to_natural_language(
        dataset_state_to_game_state(example)
    )
    return INSTRUCTION_PREFIX_NL + "\n" + description


def load_eval_dataset(
    data_file: str,
    test_size: float = 0.1,
    seed: int = 42,
    limit: Optional[int] = None,
    input_mode: str = "json",
):
    """Load the held-out evaluation split from a JSONL file.

    The file is loaded as a single split and divided with
    ``train_test_split(test_size, seed)``; only the "test" part is used. The
    defaults are meant to reproduce the split made at training time (per the
    original author's note — verify against the training script).

    Args:
        data_file: Path to the JSONL dataset.
        test_size: Fraction passed to ``train_test_split``.
        seed: Seed passed to ``train_test_split``.
        limit: If given, keep only the first ``limit`` test rows.
        input_mode: Prompt format forwarded to ``_build_prompt``.

    Returns:
        A dataset with the columns "prompt", "expert_action", "candidates",
        "game_id", "step_id", "turn" and "phase".

    Raises:
        ValueError: If an example has no candidates to pick an expert from.
    """
    raw = load_dataset("json", data_files={"train": data_file})["train"]

    # Same split as training
    split = raw.train_test_split(test_size=test_size, seed=seed)
    test_ds = split["test"]

    # Truncate before formatting so prompts are only built for the rows that
    # are actually evaluated; the selected rows are the same either way.
    if limit is not None:
        test_ds = test_ds.select(range(min(limit, len(test_ds))))

    def format_example(example):
        prompt = _build_prompt(example, input_mode=input_mode)
        candidates = example["candidates"]
        if not candidates:
            raise ValueError(
                f"Example has no candidates (game_id={example.get('game_id', '')!r}, "
                f"step_id={example.get('step_id', 0)!r})"
            )

        # Prefer the candidate explicitly tagged as the expert; otherwise fall
        # back to the highest-reward candidate.
        expert = next((c for c in candidates if c.get("role") == "expert"), None)
        if expert is None:
            expert = max(candidates, key=lambda x: float(x.get("reward", 0.0)))

        return {
            "prompt": prompt,
            "expert_action": expert["action"],
            "candidates": candidates,
            "game_id": example.get("game_id", ""),
            "step_id": example.get("step_id", 0),
            "turn": example["turn"],
            "phase": example["phase"],
        }

    return test_ds.map(format_example, remove_columns=raw.column_names)


# ================== Action parsing & comparison ==================

def _unclosed_brace_depth(fragment: str) -> int:
    """Count how many '{' in *fragment* are left unclosed, ignoring braces inside JSON strings."""
    depth = 0
    in_string = False
    escaped = False
    for ch in fragment:
        if in_string:
            if escaped:
                escaped = False
            elif ch == "\\":
                escaped = True
            elif ch == '"':
                in_string = False
        elif ch == '"':
            in_string = True
        elif ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
    return max(depth, 0)


def _decode_leading_json_object(fragment: str) -> Optional[Any]:
    """Decode the JSON object at the start of *fragment*, repairing missing closing braces."""
    try:
        # raw_decode parses one JSON value and ignores whatever follows it, so
        # trailing chatter after the object does not matter.
        obj, _ = json.JSONDecoder().raw_decode(fragment)
    except (ValueError, RecursionError):
        pass
    else:
        return obj

    # Repair truncated output by appending closing braces. Try the text up to
    # the last '}' first (drops trailing non-JSON text), then the whole
    # fragment (covers output that was cut off before any '}').
    heads = []
    last_close = fragment.rfind("}")
    if last_close != -1:
        heads.append(fragment[: last_close + 1])
    if not heads or heads[0] != fragment:
        heads.append(fragment)

    max_missing = max(2, _unclosed_brace_depth(fragment))
    for head in heads:
        for missing in range(1, max_missing + 1):
            try:
                return json.loads(head + "}" * missing)
            except (ValueError, RecursionError):
                continue
    return None


def parse_action_from_completion(text: str) -> Optional[Dict[str, Any]]:
    """Extract the action dict from a model completion.

    The expected completion is ``{"action": {...}}``, possibly surrounded by
    extra text. A bare action object (a dict with a "type" key) is accepted as
    well. Completions that were cut off before their closing braces are
    repaired by appending the missing ``}`` characters.

    Args:
        text: Raw generated text.

    Returns:
        The action dict, or ``None`` when no usable JSON object starts at the
        first ``{`` of the text.
    """
    text = text.strip()

    # The model may emit extra text before the JSON; start at the first brace.
    start_idx = text.find("{")
    if start_idx == -1:
        return None

    obj = _decode_leading_json_object(text[start_idx:])
    if not isinstance(obj, dict):
        return None

    # Format from training: {"action": {...}}
    action = obj.get("action")
    if isinstance(action, dict):
        return action
    # Directly an action dict (has a "type" field)
    if "type" in obj:
        return obj
    return None


def actions_match(pred: Dict[str, Any], gold: Dict[str, Any], strict: bool = True) -> bool:
    """Decide whether a predicted action equals the reference action.

    Args:
        pred: Predicted action dict.
        gold: Gold/expert action dict.
        strict: When True the two dicts must be equal as a whole. When False
            only the essential fields are compared, and a missing field is
            treated the same as an explicit ``None``.

    Returns:
        True if the actions match under the chosen mode.
    """
    if strict:
        return pred == gold

    essential = ("type", "tavern_index", "hand_index", "board_index", "card_name")
    # dict.get() yields None for absent keys, which makes "missing" and
    # "null" compare equal.
    return not any(pred.get(name) != gold.get(name) for name in essential)


def get_action_reward(pred: Dict[str, Any], candidates: List[Dict[str, Any]]) -> float:
    """Look up the reward of the first candidate whose action matches *pred*.

    Matching uses the relaxed (key-fields-only) comparison. A candidate without
    a "reward" entry counts as 0.0, and 0.0 is also returned when no candidate
    matches.
    """
    matching_rewards = (
        float(candidate.get("reward", 0.0))
        for candidate in candidates
        if actions_match(pred, candidate.get("action", {}), strict=False)
    )
    return next(matching_rewards, 0.0)


# ================== Model loading ==================

def load_base_model(model_path: str, bf16: bool = True):
    """Load the base causal LM and its tokenizer, without any adapter.

    Args:
        model_path: Local path or Hub id passed to ``from_pretrained``.
        bf16: On CUDA, load in bfloat16 when True and float16 when False.
            Ignored on CPU, where float32 is used.

    Returns:
        ``(model, tokenizer)``. The tokenizer pads on the left and falls back
        to the EOS token when no pad token is defined.
    """
    cuda_available = torch.cuda.is_available()
    if not cuda_available:
        # Half precision on CPU is slow and, depending on the PyTorch
        # version, not implemented for every op used during generation.
        dtype = torch.float32
    elif bf16:
        dtype = torch.bfloat16
    else:
        dtype = torch.float16

    model_kwargs = {
        "torch_dtype": dtype,
        "trust_remote_code": True,
    }
    if cuda_available:
        model_kwargs["device_map"] = "auto"

    model = AutoModelForCausalLM.from_pretrained(model_path, **model_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(
        model_path, use_fast=True, trust_remote_code=True
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    # Left padding keeps every prompt flush against its generated tokens,
    # which batched generation with a decoder-only model relies on.
    tokenizer.padding_side = "left"

    return model, tokenizer


def load_peft_model(base_model_path: str, adapter_path: str, bf16: bool = True):
    """Load the base model, apply a PEFT adapter and merge it into the weights.

    The base model and tokenizer come from ``load_base_model`` so that dtype,
    device placement and tokenizer settings are identical for adapted and
    non-adapted evaluations.

    Args:
        base_model_path: Local path or Hub id of the base checkpoint.
        adapter_path: Directory containing the PEFT adapter.
        bf16: Forwarded to ``load_base_model``.

    Returns:
        ``(model, tokenizer)`` where ``model`` has the adapter merged in.
    """
    base_model, tokenizer = load_base_model(base_model_path, bf16=bf16)
    model = PeftModel.from_pretrained(base_model, adapter_path)
    model = model.merge_and_unload()  # Merge for faster inference
    return model, tokenizer


# ================== Evaluation ==================

@torch.no_grad()
def evaluate_model(
    model,
    tokenizer,
    test_ds,
    max_new_tokens: int = 128,
    batch_size: int = 8,
    verbose: bool = False,
):
    """Greedy-decode every test prompt and score the parsed actions.

    Args:
        model: Causal LM exposing ``generate``.
        tokenizer: Matching tokenizer. Slicing the generated text off at the
            padded input length relies on left padding, which the loaders in
            this file configure.
        test_ds: Dataset with the columns produced by ``load_eval_dataset``.
        max_new_tokens: Generation budget per sample.
        batch_size: Number of prompts generated together.
        verbose: Print every sample that fails the relaxed match.

    Returns:
        A dict with:
        - exact_match_acc: fraction of samples matching the expert exactly
        - relaxed_match_acc: fraction matching on the key fields only
        - avg_reward: mean reward of the predicted actions
        - parse_failure_rate: fraction of completions with no parsable action
        - total_samples: number of evaluated samples
        - results: list of per-sample records
    """

    def _column(batch, key):
        # A batch slice normally yields lists; wrap scalars defensively.
        values = batch[key]
        return values if isinstance(values, list) else [values]

    def _cell(batch, key, row):
        values = batch[key]
        return values[row] if isinstance(values, list) else values

    model.eval()
    device = next(model.parameters()).device

    reward_sum = 0.0
    results = []

    for offset in tqdm(range(0, len(test_ds), batch_size), desc="Evaluating"):
        batch = test_ds[offset : offset + batch_size]
        prompts = _column(batch, "prompt")
        expert_actions = _column(batch, "expert_action")
        candidates_list = _column(batch, "candidates")

        encoded = tokenizer(
            prompts,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024,
        ).to(device)

        sequences = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        rows = zip(sequences, prompts, expert_actions, candidates_list)
        for row, (sequence, _prompt, expert_action, candidates) in enumerate(rows):
            # Everything past the (padded) prompt length is newly generated.
            prompt_len = encoded["input_ids"][row].shape[0]
            generated = tokenizer.decode(sequence[prompt_len:], skip_special_tokens=True)
            pred_action = parse_action_from_completion(generated)

            if pred_action is None:
                # Unparsable completions count as wrong with zero reward.
                is_exact_match = False
                is_relaxed_match = False
                reward = 0.0
            else:
                is_exact_match = actions_match(pred_action, expert_action, strict=True)
                is_relaxed_match = actions_match(pred_action, expert_action, strict=False)
                reward = get_action_reward(pred_action, candidates)

            reward_sum += reward

            record = {
                "game_id": _cell(batch, "game_id", row),
                "step_id": _cell(batch, "step_id", row),
                "turn": _cell(batch, "turn", row),
                "phase": _cell(batch, "phase", row),
                "expert_action": expert_action,
                "predicted_action": pred_action,
                "generated_text": generated.strip()[:200],  # Truncate for readability
                "exact_match": is_exact_match,
                "relaxed_match": is_relaxed_match,
                "reward": reward,
            }
            results.append(record)

            if verbose and not is_relaxed_match:
                print(f"\n[WRONG] Game: {record['game_id']}, Step: {record['step_id']}")
                print(f"  Expert: {expert_action}")
                print(f"  Pred:   {pred_action}")
                print(f"  Gen:    {generated[:150]}")

    # Integer tallies are derived from the per-sample records.
    total = len(results)
    exact_correct = sum(1 for r in results if r["exact_match"])
    relaxed_correct = sum(1 for r in results if r["relaxed_match"])
    parse_failures = sum(1 for r in results if r["predicted_action"] is None)

    def _ratio(numerator):
        return numerator / total if total > 0 else 0.0

    return {
        "exact_match_acc": _ratio(exact_correct),
        "relaxed_match_acc": _ratio(relaxed_correct),
        "avg_reward": _ratio(reward_sum),
        "parse_failure_rate": _ratio(parse_failures),
        "total_samples": total,
        "results": results,
    }


# ================== Main ==================

def _checkpoint_sort_key(name: str):
    """Sort key that orders ``checkpoint-<step>`` directory names by numeric step."""
    suffix = name.rpartition("-")[2]
    if suffix.isdecimal():
        return (0, int(suffix), name)
    # Names without a numeric suffix go last, in alphabetical order.
    return (1, 0, name)


def _evaluate_and_report(tag: str, model, tokenizer, test_ds, args):
    """Run ``evaluate_model`` with the CLI settings and print the headline metrics under *tag*."""
    metrics = evaluate_model(
        model,
        tokenizer,
        test_ds,
        max_new_tokens=args.max_new_tokens,
        batch_size=args.batch_size,
        verbose=args.verbose,
    )
    print(f"[{tag}] Exact Match: {metrics['exact_match_acc']:.4f}")
    print(f"[{tag}] Relaxed Match: {metrics['relaxed_match_acc']:.4f}")
    print(f"[{tag}] Avg Reward: {metrics['avg_reward']:.4f}")
    print(f"[{tag}] Parse Failures: {metrics['parse_failure_rate']:.2%}")
    return metrics


def main():
    """Command-line entry point.

    Evaluates up to three model variants on the held-out test split: the base
    model, the SFT adapter, and the GRPO adapter(s). Prints per-model metrics
    and a summary table, and optionally writes the metrics plus the first ten
    per-sample predictions of each model to a JSON file.
    """
    parser = argparse.ArgumentParser(description="Evaluate Battlegrounds RLAIF models: No FT, SFT, SFT+GRPO")
    parser.add_argument(
        "--base-model",
        default=DEFAULT_MODEL_ID,
        help="Base model path (Qwen instruct checkpoint).",
    )
    parser.add_argument(
        "--output-dir",
        default="./battleground_rlaif_qwen",
        help="Directory containing SFT and GRPO checkpoints.",
    )
    parser.add_argument(
        "--data-file",
        default=DEFAULT_DATA_FILE,
        help="Path to JSONL file with multi-candidate Battlegrounds data.",
    )
    parser.add_argument(
        "--sft-adapter",
        default=None,
        help="Path to SFT adapter (default: <output-dir>/sft_model).",
    )
    parser.add_argument(
        "--grpo-adapter",
        default=None,
        help="Path to GRPO adapter (default: <output-dir>/grpo_model).",
    )
    parser.add_argument(
        "--eval-samples",
        type=int,
        default=50,
        help="Number of test samples to evaluate (default: 50 for quick testing, use -1 for full set).",
    )
    parser.add_argument("--batch-size", type=int, default=8, help="Batch size for inference (default: 8 for A800).")
    parser.add_argument("--max-new-tokens", type=int, default=128, help="Max tokens to generate.")
    parser.add_argument("--disable-bf16", action="store_true", help="Use fp16 instead of bf16.")
    parser.add_argument("--verbose", action="store_true", help="Print wrong predictions.")
    parser.add_argument(
        "--eval-no-ft", action="store_true", help="Evaluate base model (no fine-tuning)."
    )
    parser.add_argument("--eval-sft", action="store_true", help="Evaluate SFT model.")
    parser.add_argument("--eval-grpo", action="store_true", help="Evaluate SFT+GRPO model.")
    parser.add_argument(
        "--save-results",
        default=None,
        help="Path to save detailed results as JSON.",
    )
    parser.add_argument(
        "--input-mode",
        choices=["json", "nl"],
        default="json",
        help="Input format for game state: 'json' uses raw JSON, 'nl' uses natural language description.",
    )

    args = parser.parse_args()
    bf16 = not args.disable_bf16

    # Default: evaluate all if none specified
    if not (args.eval_no_ft or args.eval_sft or args.eval_grpo):
        args.eval_no_ft = True
        args.eval_sft = True
        args.eval_grpo = True

    # Resolve adapter paths
    sft_adapter = args.sft_adapter or os.path.join(args.output_dir, "sft_model")
    grpo_adapter = args.grpo_adapter or os.path.join(args.output_dir, "grpo_model")

    # -1 is the documented "full set" value; treat every negative value the
    # same way instead of silently producing an empty test set.
    eval_samples = None if args.eval_samples < 0 else args.eval_samples

    # Load test data
    print("Loading Battlegrounds test set...")
    if not os.path.exists(args.data_file):
        print(f"ERROR: Data file not found: {args.data_file}")
        return

    test_ds = load_eval_dataset(
        args.data_file,
        limit=eval_samples,
        input_mode=args.input_mode,
    )
    print(f"Test samples: {len(test_ds)}")

    all_results = {}

    # ===== Evaluate No FT (base model) =====
    if args.eval_no_ft:
        print("\n" + "=" * 60)
        print("Evaluating: No Fine-Tuning (Base Model)")
        print("=" * 60)
        model, tokenizer = load_base_model(args.base_model, bf16=bf16)
        all_results["no_ft"] = _evaluate_and_report("No FT", model, tokenizer, test_ds, args)
        # Drop the model before the next one is loaded to keep peak memory down.
        del model
        torch.cuda.empty_cache()

    # ===== Evaluate SFT =====
    if args.eval_sft:
        print("\n" + "=" * 60)
        print("Evaluating: SFT Fine-Tuned Model")
        print("=" * 60)
        if not os.path.exists(sft_adapter):
            print(f"[SKIP] SFT adapter not found at: {sft_adapter}")
        else:
            model, tokenizer = load_peft_model(args.base_model, sft_adapter, bf16=bf16)
            all_results["sft"] = _evaluate_and_report("SFT", model, tokenizer, test_ds, args)
            del model
            torch.cuda.empty_cache()

    # ===== Evaluate SFT + GRPO =====
    if args.eval_grpo:
        print("\n" + "=" * 60)
        print("Evaluating: SFT + GRPO Fine-Tuned Model")
        print("=" * 60)
        grpo_epoch_dir = os.path.join(args.output_dir, "grpo")

        adapters_to_eval: List[tuple[str, str]] = []

        # If user did not override --grpo-adapter and epoch checkpoints exist,
        # evaluate all checkpoint-* directories under output_dir/grpo plus final grpo_model.
        default_grpo_adapter = os.path.join(args.output_dir, "grpo_model")
        using_default_adapter = (args.grpo_adapter is None) or (
            grpo_adapter == default_grpo_adapter
        )

        if using_default_adapter and os.path.isdir(grpo_epoch_dir):
            # Order by numeric step so that e.g. checkpoint-200 is evaluated
            # before checkpoint-1000 (a plain string sort would reverse them).
            checkpoint_names = sorted(
                (
                    d
                    for d in os.listdir(grpo_epoch_dir)
                    if d.startswith("checkpoint")
                    and os.path.isdir(os.path.join(grpo_epoch_dir, d))
                ),
                key=_checkpoint_sort_key,
            )
            adapters_to_eval.extend(
                (f"sft_grpo_{name}", os.path.join(grpo_epoch_dir, name))
                for name in checkpoint_names
            )

            if os.path.exists(grpo_adapter):
                adapters_to_eval.append(("sft_grpo_final", grpo_adapter))
        elif os.path.exists(grpo_adapter):
            adapters_to_eval.append(("sft_grpo", grpo_adapter))

        if not adapters_to_eval:
            print(f"[SKIP] No GRPO adapters found. Expected at: {grpo_adapter} or under {grpo_epoch_dir}")
        else:
            for label, adapter_path in adapters_to_eval:
                print("\n" + "-" * 60)
                print(f"Evaluating GRPO adapter: {label}")
                print(f"Path: {adapter_path}")
                model, tokenizer = load_peft_model(
                    args.base_model, adapter_path, bf16=bf16
                )
                all_results[label] = _evaluate_and_report(label, model, tokenizer, test_ds, args)
                del model
                torch.cuda.empty_cache()

    # ===== Summary =====
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)
    # Size the name column to the longest label (checkpoint labels exceed the
    # former fixed width of 12) so the table stays aligned.
    name_width = max([12] + [len(name) for name in all_results])
    print(f"{'Model':<{name_width}} {'Exact':<10} {'Relaxed':<10} {'Reward':<10} {'Parse Fail':<10}")
    print("-" * (name_width + 40))
    for name, data in all_results.items():
        if "results" in data:  # Has actual results
            print(
                f"{name:<{name_width}} {data['exact_match_acc']:<10.4f} "
                f"{data['relaxed_match_acc']:<10.4f} {data['avg_reward']:<10.4f} "
                f"{data['parse_failure_rate']:<10.2%}"
            )

    # Save results
    if args.save_results:
        save_data = {
            name: {
                "exact_match_acc": data["exact_match_acc"],
                "relaxed_match_acc": data["relaxed_match_acc"],
                "avg_reward": data["avg_reward"],
                "parse_failure_rate": data["parse_failure_rate"],
                "total_samples": data["total_samples"],
                "sample_predictions": data["results"][:10],  # First 10 for inspection
            }
            for name, data in all_results.items()
            if "results" in data
        }
        parent_dir = os.path.dirname(args.save_results)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        # ensure_ascii=False emits non-ASCII text verbatim, so the file must
        # be opened as UTF-8 rather than with the platform default encoding.
        with open(args.save_results, "w", encoding="utf-8") as f:
            json.dump(save_data, f, indent=2, ensure_ascii=False)
        print(f"\nResults saved to: {args.save_results}")


# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()