"""Convert colab_submission_script.py into a clean Colab .ipynb notebook.

Splits the script on the `# === ... STEP N ...` banner blocks and emits one
code cell per step, with a markdown intro cell at the top.

Usage:
    python scripts/build_notebook.py
"""

from __future__ import annotations

import json
import re
from pathlib import Path

REPO_ROOT = Path(__file__).resolve().parents[1]
SOURCE = REPO_ROOT / "colab_submission_script.py"
OUTPUT = REPO_ROOT / "notebooks" / "cdn_cache_optimizer_training.ipynb"

INTRO_MD = """\
# CDN Cache Optimizer — Training Notebook

OpenEnv-compliant reinforcement-learning agent for **edge CDN cache admission and eviction**.
Run **Runtime → Run all** in Colab to reproduce training, evaluation, schema-drift verification, and result charts in a single pass.

**Project links**
- Hugging Face Space: https://huggingface.co/spaces/umar-sharif821/cdn-cache-env-improvedone
- GitHub repo: https://github.com/umar-sharif821/cdn-cache-env-improvedone

**What this notebook does**
1. Bootstraps Colab (installs `gymnasium`, `torch`, `matplotlib`, `numpy`; mounts Drive if available).
2. Defines a `SchemaDriftGuard` that normalizes heterogeneous CDN log formats.
3. Builds an OpenEnv-compliant `CDNCacheEnv` (gymnasium 5-tuple, multi-component reward).
4. Trains a REINFORCE policy network.
5. Evaluates LRU baseline vs. the fine-tuned agent.
6. Saves `policy.pt`, `training_results.png`, `drift_report.json`, `metrics.json`.

**Reward function**
`R = w1 * Perf - w2 * Cost`, where `Perf` is edge-vs-origin latency savings and `Cost` is eviction churn + admitted bytes / capacity.
"""

STEP_TITLES = {
    0: "Step 0 — Colab bootstrap (deps + Drive)",
    1: "Step 1 — Imports & deterministic seeding",
    2: "Step 2 — Schema Drift Guard",
    3: "Step 3 — OpenEnv-compliant CDN cache environment",
    4: "Step 4 — Policy network + REINFORCE training loop",
    5: "Step 5 — Evaluation: LRU baseline vs fine-tuned agent",
    6: "Step 6 — Comparison charts",
    7: "Step 7 — Persist artifacts to Drive",
    8: "Step 8 — Submission summary",
}


def make_code_cell(source: str) -> dict:
    return {
        "cell_type": "code",
        "metadata": {},
        "source": source,
        "outputs": [],
        "execution_count": None,
    }


def make_md_cell(source: str) -> dict:
    return {
        "cell_type": "markdown",
        "metadata": {},
        "source": source,
    }


def split_into_steps(text: str) -> list[tuple[int, str]]:
    """Return (step_index, body_without_banner) tuples in order."""
    banner = re.compile(r"# ={5,}\n# STEP (\d+)[^\n]*\n# ={5,}\n")
    matches = list(banner.finditer(text))
    if not matches:
        raise RuntimeError("No STEP banners found in source script.")

    steps: list[tuple[int, str]] = []
    for i, m in enumerate(matches):
        step_idx = int(m.group(1))
        start = m.end()
        end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        body = text[start:end].strip("\n")
        steps.append((step_idx, body))
    return steps


def build_notebook() -> dict:
    raw = SOURCE.read_text(encoding="utf-8")
    docstring_match = re.match(r'"""(.*?)"""', raw, flags=re.DOTALL)
    if docstring_match:
        body = raw[docstring_match.end():].lstrip("\n")
    else:
        body = raw

    steps = split_into_steps(body)

    cells: list[dict] = [make_md_cell(INTRO_MD)]
    for step_idx, code in steps:
        title = STEP_TITLES.get(step_idx, f"Step {step_idx}")
        cells.append(make_md_cell(f"## {title}"))
        cells.append(make_code_cell(code))

    return {
        "nbformat": 4,
        "nbformat_minor": 5,
        "metadata": {
            "kernelspec": {
                "display_name": "Python 3",
                "language": "python",
                "name": "python3",
            },
            "language_info": {
                "name": "python",
                "version": "3.11",
            },
            "colab": {"provenance": []},
        },
        "cells": cells,
    }


def main() -> int:
    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    nb = build_notebook()
    OUTPUT.write_text(json.dumps(nb, indent=2), encoding="utf-8")
    print(f"Wrote {OUTPUT.relative_to(REPO_ROOT)} ({len(nb['cells'])} cells)")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())