#!/usr/bin/env python3
"""Export a training run to HuggingFace repo format.

Converts .pt checkpoints to safetensors and structures files for HF upload:
  - Root: best checkpoint (model.safetensors, config.json, metrics.jsonl, README.md)
  - checkpoints/step_NNNN/: other checkpoints with truncated metrics

Usage:
    python scripts/export_hf_repo.py \
        --run-dir logs/run_20260322_182707 \
        --output-dir export/pawn-base \
        --repo-name pawn-base \
        --github-url https://github.com/thomas-schweich/PAWN
"""

from __future__ import annotations

import argparse
import json
import shutil
from pathlib import Path

import torch
from safetensors.torch import save_file


def find_best_step(metrics_path: Path) -> int | None:
    """Find the step with the lowest validation loss in a metrics JSONL file.

    Only records whose ``"type"`` is ``"val"`` are considered. Blank lines,
    val records without a ``"step"``, and val records whose ``"val/loss"`` is
    missing or not a number are skipped rather than aborting the scan.

    Args:
        metrics_path: Path to a JSON-lines file, one record per line.

    Returns:
        The ``"step"`` of the val record with the smallest ``"val/loss"``
        (first one wins on ties), or ``None`` if no usable val record exists.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    best_loss = float("inf")
    best_step = None
    with open(metrics_path, encoding="utf-8") as f:
        for line in f:
            # Trailing/blank lines are common in JSONL and are not records.
            if not line.strip():
                continue
            record = json.loads(line)
            if record.get("type") != "val":
                continue
            loss = record.get("val/loss")
            step = record.get("step")
            # A null or non-numeric loss cannot be ordered; ignore the record.
            if step is None or not isinstance(loss, (int, float)):
                continue
            # Strict '<' keeps the earliest step on ties; NaN never compares
            # as smaller, so NaN losses are ignored as well.
            if loss < best_loss:
                best_loss = loss
                best_step = step
    return best_step


def truncate_metrics(metrics_path: Path, up_to_step: int) -> list[str]:
    """Return metrics lines up to and including the given step.

    Lines are read in file order. Reading stops at the first ``"train"`` or
    ``"val"`` record whose ``"step"`` is greater than ``up_to_step``; that
    record and everything after it are excluded. Records of other types, and
    train/val records without a step, are kept. Blank lines are dropped.

    Args:
        metrics_path: Path to a JSON-lines metrics file.
        up_to_step: Last step (inclusive) to keep.

    Returns:
        The retained lines exactly as read, including their line endings.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    lines = []
    with open(metrics_path) as f:
        for line in f:
            if not line.strip():
                continue
            record = json.loads(line)
            step = record.get("step")
            past_cutoff = (
                record.get("type") in ("train", "val")
                and step is not None
                and step > up_to_step
            )
            # Check BEFORE appending so the first record beyond the cutoff
            # does not leak into the truncated output.
            if past_cutoff:
                break
            lines.append(line)
    return lines


def convert_pt_to_safetensors(pt_path: Path, output_dir: Path):
    """Unpack a ``.pt`` checkpoint into safetensors and JSON files.

    Files written into ``output_dir`` (created if missing):
      - ``model.safetensors``: tensors from ``ckpt["model_state_dict"]``
      - ``config.json``: format version, checkpoint type, model/training config
      - ``optimizer.safetensors`` and ``training_state.json``: only when the
        checkpoint contains ``"optimizer_state_dict"``

    Args:
        pt_path: Path to the source ``.pt`` checkpoint.
        output_dir: Destination directory.

    Raises:
        KeyError: If the checkpoint has no ``"model_state_dict"`` entry.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    # NOTE: weights_only=False unpickles arbitrary objects; only use on
    # checkpoints from a trusted source.
    checkpoint = torch.load(str(pt_path), map_location="cpu", weights_only=False)

    # --- model weights -------------------------------------------------
    weights = {}
    for name, tensor in checkpoint["model_state_dict"].items():
        weights[name] = tensor.cpu().contiguous()
    save_file(weights, output_dir / "model.safetensors")

    # --- config --------------------------------------------------------
    config_payload = {
        "format_version": 1,
        "checkpoint_type": "pretrain",
        "model_config": checkpoint.get("model_config", {}),
        "training_config": checkpoint.get("training_config", {}),
    }
    with open(output_dir / "config.json", "w") as config_file:
        json.dump(config_payload, config_file, indent=2, default=str)

    # Everything below concerns optimizer/training state; nothing to do
    # for checkpoints that were saved without it.
    if "optimizer_state_dict" not in checkpoint:
        return

    # Imported lazily so weight-only checkpoints don't need the pawn package.
    from pawn.checkpoint import _flatten_optimizer_state, _rng_to_json, _json_default

    # --- optimizer tensors ---------------------------------------------
    flat_tensors, optimizer_meta = _flatten_optimizer_state(
        checkpoint["optimizer_state_dict"]
    )
    if flat_tensors:
        save_file(flat_tensors, output_dir / "optimizer.safetensors")

    # --- training state ------------------------------------------------
    state_payload = {
        "format_version": 1,
        "global_step": checkpoint.get("global_step", 0),
        "scheduler_state_dict": checkpoint.get("scheduler_state_dict"),
        "scaler_state_dict": checkpoint.get("scaler_state_dict"),
        "optimizer_meta": optimizer_meta,
    }
    # RNG entries are merged in only when a torch RNG state was saved.
    if checkpoint.get("torch_rng_state") is not None:
        state_payload.update(
            _rng_to_json(checkpoint["torch_rng_state"], checkpoint.get("cuda_rng_state"))
        )

    with open(output_dir / "training_state.json", "w") as state_file:
        json.dump(state_payload, state_file, indent=2, default=_json_default)


def generate_readme(
    repo_name: str, model_config: dict, training_config: dict,
    best_step: int, val_loss: float, val_acc: float,
    github_url: str, extra_desc: str = "",
) -> str:
    """Generate a HuggingFace model card README.

    Args:
        repo_name: Name used in the model-index metadata and (upper-cased)
            as the page title.
        model_config: Model hyperparameters; ``d_model``, ``n_layers`` and
            ``n_heads`` are read, each falling back to ``"?"``. ``None`` is
            treated as an empty config.
        training_config: Currently unused; accepted so callers keep a stable
            signature.
        best_step: Step at which the best validation loss was recorded.
        val_loss: Best validation loss.
        val_acc: Validation accuracy as a fraction (rendered as a percent).
        github_url: Link target for the repository reference.
        extra_desc: Optional text inserted below the one-line description.

    Returns:
        The complete README text, starting with the YAML front matter.
    """
    # Checkpoints may carry an explicit None for the config; treat as empty.
    model_config = model_config or {}
    d_model = model_config.get("d_model", "?")
    n_layers = model_config.get("n_layers", "?")
    n_heads = model_config.get("n_heads", "?")

    # Infer variant name from the model width.
    # NOTE(review): any width other than 256/640 (including an unknown "?")
    # is reported as "base" with the base parameter count — confirm this is
    # the intended fallback.
    variant = "base"
    if d_model == 256:
        variant = "small"
    elif d_model == 640:
        variant = "large"

    params = {"small": "9.5M", "base": "35.8M", "large": "68.4M"}[variant]

    return f"""---
license: apache-2.0
library_name: pytorch
tags:
  - chess
  - transformer
  - causal-lm
  - world-model
datasets:
  - random-self-play
model-index:
  - name: {repo_name}
    results:
      - task:
          type: next-move-prediction
        metrics:
          - name: Val Loss
            type: loss
            value: {val_loss}
          - name: Val Accuracy
            type: accuracy
            value: {val_acc}
---

# {repo_name.upper()}

A causal transformer trained on random chess games, designed as a testbed for finetuning and augmentation methods at small scales.
{extra_desc}

## Model Details

| | |
|---|---|
| **Parameters** | {params} |
| **Architecture** | Decoder-only transformer (RMSNorm, SwiGLU, RoPE) |
| **d_model** | {d_model} |
| **Layers** | {n_layers} |
| **Heads** | {n_heads} |
| **Vocabulary** | 4,278 tokens (4,096 grid + 176 promotions + 5 outcomes + 1 PAD) |
| **Sequence length** | 256 |
| **Best val loss** | {val_loss:.4f} (step {best_step:,}) |
| **Best val accuracy** | {val_acc:.1%} |

## Usage

```python
import torch
from safetensors.torch import load_file
from pawn.config import CLMConfig
from pawn.model import PAWNCLM

cfg = CLMConfig.{variant}()
model = PAWNCLM(cfg)
model.load_state_dict(load_file("model.safetensors"))
model.eval()
```

## Training

Trained from scratch on random self-play games generated by a Rust chess engine (shakmaty).
See the [PAWN repository]({github_url}) for training code, data pipeline, and evaluation suite.

## License

Apache 2.0
"""


def _checkpoint_step(path: Path) -> int | None:
    """Return the step number encoded in a ``step_<N>.pt`` filename, or None."""
    try:
        return int(path.stem.replace("step_", "", 1))
    except ValueError:
        return None


def main():
    """Command-line entry point: export a training run as an HF-style repo.

    Reads ``metrics.jsonl`` and ``checkpoints/step_*.pt`` from ``--run-dir``,
    writes the checkpoint nearest to the best validation step (plus the full
    metrics file and a generated README) to the root of ``--output-dir``, and,
    unless ``--best-only`` is given, writes every other checkpoint to
    ``checkpoints/<step_name>/`` with metrics truncated at that step.

    Raises:
        SystemExit: With a non-zero status and an ``ERROR: ...`` message when
            the metrics file is missing, contains no val records, or no
            checkpoints with a numeric step are found.
    """
    parser = argparse.ArgumentParser(description="Export training run to HF repo format")
    parser.add_argument("--run-dir", required=True, help="Training run directory")
    parser.add_argument("--output-dir", required=True, help="Output directory for HF repo")
    parser.add_argument("--repo-name", required=True, help="Repository name for README")
    parser.add_argument("--github-url", default="https://github.com/thomas-schweich/PAWN")
    parser.add_argument("--best-only", action="store_true", help="Only export best checkpoint")
    parser.add_argument("--extra-desc", default="", help="Extra description for README")
    args = parser.parse_args()

    run_dir = Path(args.run_dir)
    output_dir = Path(args.output_dir)
    metrics_path = run_dir / "metrics.jsonl"

    # Failures exit non-zero so shell pipelines / CI notice them.
    if not metrics_path.exists():
        raise SystemExit(f"ERROR: {metrics_path} not found")

    # Find best step
    best_step = find_best_step(metrics_path)
    if best_step is None:
        raise SystemExit("ERROR: No val records found in metrics.jsonl")
    print(f"Best val step: {best_step}")

    # Find best val metrics (first val record at the best step)
    best_val_loss, best_val_acc = float("inf"), 0.0
    with open(metrics_path, encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                continue
            r = json.loads(line)
            if r.get("type") == "val" and r.get("step") == best_step:
                best_val_loss = r.get("val/loss", float("inf"))
                best_val_acc = r.get("val/accuracy", 0.0)
                break

    # Find all .pt checkpoints, keeping only those with a numeric step so a
    # stray file such as step_final.pt cannot crash the export.
    ckpt_dir = run_dir / "checkpoints"
    checkpoints = []
    if ckpt_dir.exists():
        for path in sorted(ckpt_dir.glob("step_*.pt")):
            step = _checkpoint_step(path)
            if step is None:
                print(f"  Skipping {path.name}: no numeric step in filename")
                continue
            checkpoints.append((step, path))
    if not checkpoints:
        raise SystemExit("ERROR: No checkpoints found")

    # Find nearest checkpoint to best step (first in sorted order on ties)
    best_ckpt_step, best_ckpt = min(checkpoints, key=lambda item: abs(item[0] - best_step))
    print(f"Best checkpoint: {best_ckpt}")
    if best_ckpt_step != best_step:
        # The README reports metrics for best_step; make the mismatch visible.
        print(
            f"WARNING: no checkpoint saved at step {best_step}; "
            f"using nearest checkpoint (step {best_ckpt_step})"
        )

    # Read config from checkpoint.
    # NOTE: weights_only=False unpickles arbitrary objects; only run this on
    # checkpoints from a trusted source.
    ckpt_data = torch.load(str(best_ckpt), map_location="cpu", weights_only=False)
    model_config = ckpt_data.get("model_config", {})
    training_config = ckpt_data.get("training_config", {})
    del ckpt_data  # release the loaded checkpoint before it is loaded again below

    # Export best checkpoint to root
    output_dir.mkdir(parents=True, exist_ok=True)
    print(f"\nExporting best checkpoint to {output_dir}/")
    convert_pt_to_safetensors(best_ckpt, output_dir)

    # Copy full metrics.jsonl
    shutil.copy2(metrics_path, output_dir / "metrics.jsonl")

    # Generate README (UTF-8 regardless of the platform's locale encoding)
    readme = generate_readme(
        args.repo_name, model_config, training_config,
        best_step, best_val_loss, best_val_acc,
        args.github_url, args.extra_desc,
    )
    with open(output_dir / "README.md", "w", encoding="utf-8") as f:
        f.write(readme)

    # Export other checkpoints
    if not args.best_only:
        for step_num, ckpt in checkpoints:
            if ckpt == best_ckpt:
                continue
            step_name = ckpt.stem  # e.g. "step_00005000"
            step_dir = output_dir / "checkpoints" / step_name
            print(f"  Exporting {step_name}...")
            convert_pt_to_safetensors(ckpt, step_dir)

            # Truncated metrics
            truncated = truncate_metrics(metrics_path, step_num)
            with open(step_dir / "metrics.jsonl", "w") as f:
                f.writelines(truncated)

    print(f"\nExport complete: {output_dir}")
    print("  Best: model.safetensors, config.json, metrics.jsonl, README.md")
    if not args.best_only:
        print(f"  Checkpoints: {len(checkpoints) - 1} in checkpoints/")


# Run the exporter only when executed as a script, not when imported.
if __name__ == "__main__":
    main()