Spaces:

bpHigh
/

carrom_rl_env

Sleeping

File size: 13,202 Bytes

13b4881

"""GRPO training for the Carrom agent on Modal — Unsloth + TRL.

Trains an instruction-tuned LLM to play Carrom under ICF rules using
Group Relative Policy Optimization (GRPO) via TRL's GRPOTrainer, with
Unsloth's 4-bit quantization and gradient-checkpointing for memory efficiency.

Cost estimates (Modal pricing, A10G @ ~$1.10/hr):
  --steps  200  (quick smoke-test)  ≈  15 min  ≈ $0.30
  --steps  500  (light training)    ≈  35 min  ≈ $0.65
  --steps 2000  (blog-quality run)  ≈  2.5 hr  ≈ $2.75
  A100 40GB @ ~$3.72/hr: --steps 2000 ≈ 1.5 hr ≈ $5.60

All budgets are well under $25.

Usage
-----
# Quickstart (200 steps, Gemma-3-4B, A10G)
modal run examples/train_modal.py

# Blog-quality run, push to HF Hub
modal run examples/train_modal.py --steps 2000 --push --repo your-username/carrom-grpo-gemma

# Larger model on A100
modal run examples/train_modal.py --model qwen2.5-3b --gpu a100 --steps 2000

# Just preview cost without running
modal run examples/train_modal.py --dry-run

Prerequisites
-------------
1. Install Modal: pip install modal && modal setup
2. Create an HF token secret in Modal dashboard:
       modal secret create hf-token HF_TOKEN=hf_...
3. (Optional) Create a W&B secret for experiment tracking:
       modal secret create wandb WANDB_API_KEY=...
"""

from __future__ import annotations

import sys
import modal

# ---------------------------------------------------------------------------
# Model registry
# ---------------------------------------------------------------------------
# Key → (unsloth_model_id, min_vram_gb, recommended_gpu)
MODELS: dict[str, tuple[str, int, str]] = {
    "gemma-3-1b":   ("unsloth/gemma-3-1b-it",               8,  "a10g"),
    "gemma-3-4b":   ("unsloth/gemma-3-4b-it",               16, "a10g"),  # ← default
    "qwen2.5-1.5b": ("unsloth/Qwen2.5-1.5B-Instruct",       8,  "a10g"),
    "qwen2.5-3b":   ("unsloth/Qwen2.5-3B-Instruct",         12, "a10g"),
    "qwen2.5-7b":   ("unsloth/Qwen2.5-7B-Instruct",         24, "a100"),
}

GPU_MAP = {
    "a10g": modal.gpu.A10G(),
    "a100": modal.gpu.A100(),
}

DEFAULT_MODEL = "gemma-3-4b"
HF_REPO_DEFAULT = "your-username/carrom-grpo-agent"

# ---------------------------------------------------------------------------
# Modal app & image
# ---------------------------------------------------------------------------

app = modal.App("carrom-grpo")

training_image = (
    modal.Image.from_registry(
        "pytorch/pytorch:2.4.0-cuda12.1-cudnn8-runtime",
        add_python="3.11",
    )
    .apt_install("git", "build-essential")
    .pip_install(
        # Core training stack
        "unsloth",
        "trl>=0.15",
        "transformers>=4.45",
        "datasets>=3.0",
        "accelerate>=0.35",
        "peft>=0.12",
        "bitsandbytes>=0.43",
        "huggingface_hub>=0.24",
        # Optional experiment tracking
        "wandb",
        # Carrom env dependencies
        "pymunk>=6.5",
        "numpy>=1.24",
        "pydantic>=2.0",
    )
)

# Mount local carrom code so the training function can import it
carrom_mount   = modal.Mount.from_local_dir(
    local_path="carrom_env",
    remote_path="/root/carrom_env",
    condition=lambda p: not any(s in p for s in ["__pycache__", ".pyc", ".pyo"]),
)
examples_mount = modal.Mount.from_local_dir(
    local_path="examples",
    remote_path="/root/examples",
    condition=lambda p: not any(s in p for s in ["__pycache__", ".pyc", ".pyo"]),
)


# ---------------------------------------------------------------------------
# Training function
# ---------------------------------------------------------------------------

@app.function(
    image=training_image,
    gpu=modal.gpu.A10G(),        # overridden at call time via .with_options()
    timeout=7_200,               # 2-hour cap
    secrets=[
        modal.Secret.from_name("hf-token", required=False),
        modal.Secret.from_name("wandb",    required=False),
    ],
    mounts=[carrom_mount, examples_mount],
)
def _train_remote(
    model_key:          str   = DEFAULT_MODEL,
    max_steps:          int   = 200,
    num_train_samples:  int   = 400,
    num_generations:    int   = 4,
    learning_rate:      float = 5e-6,
    push_to_hub:        bool  = False,
    hf_repo:            str   = HF_REPO_DEFAULT,
    wandb_project:      str   = "carrom-grpo",
) -> dict:
    """Core training logic executed remotely on Modal."""
    import os, random, math, json, time
    import sys
    sys.path.insert(0, "/root")

    import torch
    from datasets import Dataset
    from huggingface_hub import login
    from unsloth import FastLanguageModel
    from trl import GRPOConfig, GRPOTrainer

    from carrom_env.env import CarromEnv
    from carrom_env.models import Action
    from examples.grpo_utils import (
        format_chat_prompt,
        parse_response,
        CARROM_SYSTEM_PROMPT,
    )

    # Auth
    hf_token = os.environ.get("HF_TOKEN", "")
    if hf_token:
        login(token=hf_token, add_to_git_credential=False)

    # W&B
    use_wandb = bool(os.environ.get("WANDB_API_KEY"))
    if use_wandb:
        import wandb
        wandb.login(key=os.environ["WANDB_API_KEY"])

    model_id, _min_vram, _rec_gpu = MODELS[model_key]
    print(f"\n{'='*60}")
    print(f"Model   : {model_id}")
    print(f"Steps   : {max_steps}")
    print(f"Samples : {num_train_samples}")
    print(f"GPU     : {torch.cuda.get_device_name(0)}")
    print(f"VRAM    : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"{'='*60}\n")

    # ------------------------------------------------------------------
    # 1. Load model with Unsloth
    # ------------------------------------------------------------------
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_id,
        max_seq_length=1024,
        load_in_4bit=True,
        dtype=torch.bfloat16,
    )
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                        "gate_proj", "up_proj", "down_proj"],
        lora_alpha=8,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=42,
    )
    print(f"Trainable params: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

    # ------------------------------------------------------------------
    # 2. Generate diverse training board states
    # ------------------------------------------------------------------
    def build_dataset(n: int) -> Dataset:
        samples = []
        for seed in range(n * 3):            # oversample; filter finished games
            if len(samples) >= n:
                break
            env = CarromEnv(seed=seed)
            obs = env.reset()
            for _ in range(random.randint(0, 6)):
                a = Action(
                    placement_x=random.uniform(-0.3, 0.3),
                    angle=random.uniform(-0.9, 0.9),
                    force=random.uniform(0.25, 0.85),
                )
                obs, _, done, trunc, _ = env.step(a)
                if done or trunc:
                    break
            if obs.remaining_coins > 2:
                samples.append({"prompt": format_chat_prompt(obs)})
        print(f"Dataset: {len(samples)} board states")
        return Dataset.from_list(samples)

    train_ds = build_dataset(num_train_samples)

    # ------------------------------------------------------------------
    # 3. Reward function (ICF-aware)
    # ------------------------------------------------------------------
    def carrom_reward(completions, **kwargs):
        rewards = []
        for completion in completions:
            text   = completion[-1]["content"] if isinstance(completion, list) else str(completion)
            reward = 0.0
            action = parse_response(text)
            if action is not None:
                reward += 0.3
                if -0.4 <= action.placement_x <= 0.4:              reward += 0.1
                if 0.15 <= action.force        <= 0.9:             reward += 0.1
                if -math.pi / 2 <= action.angle <= math.pi / 2:   reward += 0.1
                try:
                    env = CarromEnv(seed=abs(hash(text)) % 100_000)
                    env.reset()
                    _, env_reward, _, _, info = env.step(action)
                    reward += env_reward
                    reward += 0.5  * int(info.get("coin_potted", 0) > 0)
                    reward -= 0.3  * info.get("due_coins", 0)   # penalise ICF dues
                    reward -= 0.75 * info.get("foul", 0)        # penalise fouls
                except Exception:
                    pass
            else:
                reward -= 0.5
            rewards.append(reward)
        return rewards

    # ------------------------------------------------------------------
    # 4. GRPO training config
    # ------------------------------------------------------------------
    config = GRPOConfig(
        output_dir="/root/carrom-grpo-output",
        num_train_epochs=1,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        num_generations=num_generations,
        max_prompt_length=512,
        max_completion_length=256,
        max_steps=max_steps,
        learning_rate=learning_rate,
        warmup_ratio=0.05,
        lr_scheduler_type="cosine",
        optim="adamw_8bit",
        bf16=True,
        logging_steps=5,
        save_steps=max(max_steps // 4, 50),
        report_to="wandb" if use_wandb else "none",
        run_name=f"carrom-grpo-{model_key}",
    )
    if use_wandb:
        config.wandb_project = wandb_project

    trainer = GRPOTrainer(
        model=model,
        processing_class=tokenizer,
        reward_funcs=carrom_reward,
        args=config,
        train_dataset=train_ds,
    )

    # ------------------------------------------------------------------
    # 5. Train
    # ------------------------------------------------------------------
    t0 = time.time()
    trainer.train()
    elapsed = time.time() - t0
    print(f"\nTraining complete in {elapsed/60:.1f} min")

    # ------------------------------------------------------------------
    # 6. Save / push
    # ------------------------------------------------------------------
    save_path = "/root/carrom-grpo-output/final"
    trainer.save_model(save_path)
    tokenizer.save_pretrained(save_path)

    result = {
        "model_id":    model_id,
        "steps":       max_steps,
        "elapsed_min": round(elapsed / 60, 1),
        "saved_to":    save_path,
    }

    if push_to_hub and hf_repo and hf_token:
        trainer.model.push_to_hub(hf_repo)
        tokenizer.push_to_hub(hf_repo)
        print(f"Pushed to  : https://huggingface.co/{hf_repo}")
        result["hf_repo"] = hf_repo
    else:
        print("Skipping HF push (pass --push --repo <username/repo> to enable)")

    return result


# ---------------------------------------------------------------------------
# Local entry point
# ---------------------------------------------------------------------------

@app.local_entrypoint()
def main(
    model: str  = DEFAULT_MODEL,
    steps: int  = 200,
    samples: int = 400,
    generations: int = 4,
    lr: float   = 5e-6,
    gpu: str    = "a10g",
    push: bool  = False,
    repo: str   = HF_REPO_DEFAULT,
    wandb_project: str = "carrom-grpo",
    dry_run: bool = False,
):
    """
    modal run examples/train_modal.py [OPTIONS]

    --model     gemma-3-4b | gemma-3-1b | qwen2.5-1.5b | qwen2.5-3b | qwen2.5-7b
    --steps     200 (smoke) | 500 (light) | 2000 (blog quality)
    --gpu       a10g (~$1.10/hr) | a100 (~$3.72/hr)
    --push      Push trained model to HF Hub
    --repo      HF Hub repo id, e.g. myuser/carrom-grpo-gemma
    --dry-run   Print config and estimated cost, then exit
    """
    if model not in MODELS:
        print(f"Unknown model '{model}'. Choose from: {', '.join(MODELS)}")
        sys.exit(1)
    if gpu not in GPU_MAP:
        print(f"Unknown GPU '{gpu}'. Choose from: {', '.join(GPU_MAP)}")
        sys.exit(1)

    # Cost estimate (approximate)
    gpu_rate = {"a10g": 1.10, "a100": 3.72}[gpu]
    min_per_step = {"gemma-3-1b": 0.04, "gemma-3-4b": 0.08,
                    "qwen2.5-1.5b": 0.04, "qwen2.5-3b": 0.06, "qwen2.5-7b": 0.12}.get(model, 0.08)
    est_minutes = steps * min_per_step
    est_cost    = est_minutes / 60 * gpu_rate

    print(f"\n{'='*55}")
    print(f"  Model      : {MODELS[model][0]}")
    print(f"  Steps      : {steps}")
    print(f"  GPU        : {gpu.upper()} @ ~${gpu_rate:.2f}/hr")
    print(f"  Est. time  : ~{est_minutes:.0f} min")
    print(f"  Est. cost  : ~${est_cost:.2f}  (well under $25)")
    print(f"{'='*55}")

    if dry_run:
        print("\n--dry-run: exiting without launching training.")
        return

    result = _train_remote.with_options(gpu=GPU_MAP[gpu]).remote(
        model_key=model,
        max_steps=steps,
        num_train_samples=samples,
        num_generations=generations,
        learning_rate=lr,
        push_to_hub=push,
        hf_repo=repo,
        wandb_project=wandb_project,
    )
    print("\nResult:", result)