File size: 4,371 Bytes
7511eae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""Convert colab_submission_script.py into a clean Colab .ipynb notebook.

Splits the script on the `# === ... STEP N ...` banner blocks and emits one
code cell per step, with a markdown intro cell at the top.

Usage:
    python scripts/build_notebook.py
"""

from __future__ import annotations

import json
import re
import uuid
from pathlib import Path

# Repository root: parent of the scripts/ directory containing this file.
REPO_ROOT = Path(__file__).resolve().parents[1]
# Flat training script that gets sliced into one notebook cell per STEP banner.
SOURCE = REPO_ROOT / "colab_submission_script.py"
# Destination notebook; the notebooks/ directory is created by main() if absent.
OUTPUT = REPO_ROOT / "notebooks" / "cdn_cache_optimizer_training.ipynb"

# Markdown body of the notebook's leading cell (project overview + run notes).
INTRO_MD = """\
# CDN Cache Optimizer β€” Training Notebook

OpenEnv-compliant reinforcement-learning agent for **edge CDN cache admission and eviction**.
Run **Runtime β†’ Run all** in Colab to reproduce training, evaluation, schema-drift verification, and result charts in a single pass.

**Project links**
- Hugging Face Space: https://huggingface.co/spaces/umar-sharif821/cdn-cache-env-improvedone
- GitHub repo: https://github.com/umar-sharif821/cdn-cache-env-improvedone

**What this notebook does**
1. Bootstraps Colab (installs `gymnasium`, `torch`, `matplotlib`, `numpy`; mounts Drive if available).
2. Defines a `SchemaDriftGuard` that normalizes heterogeneous CDN log formats.
3. Builds an OpenEnv-compliant `CDNCacheEnv` (gymnasium 5-tuple, multi-component reward).
4. Trains a REINFORCE policy network.
5. Evaluates LRU baseline vs. the fine-tuned agent.
6. Saves `policy.pt`, `training_results.png`, `drift_report.json`, `metrics.json`.

**Reward function**
`R = w1 * Perf - w2 * Cost`, where `Perf` is edge-vs-origin latency savings and `Cost` is eviction churn + admitted bytes / capacity.
"""

# Markdown headings keyed by the integer parsed from each `# STEP N` banner;
# split_into_steps() produces the key, build_notebook() performs the lookup
# (falling back to a generic "Step N" for unknown indices).
STEP_TITLES = {
    0: "Step 0 β€” Colab bootstrap (deps + Drive)",
    1: "Step 1 β€” Imports & deterministic seeding",
    2: "Step 2 β€” Schema Drift Guard",
    3: "Step 3 β€” OpenEnv-compliant CDN cache environment",
    4: "Step 4 β€” Policy network + REINFORCE training loop",
    5: "Step 5 β€” Evaluation: LRU baseline vs fine-tuned agent",
    6: "Step 6 β€” Comparison charts",
    7: "Step 7 β€” Persist artifacts to Drive",
    8: "Step 8 β€” Submission summary",
}


def make_code_cell(source: str) -> dict:
    """Build an nbformat code cell whose source is *source*.

    The ``id`` field is required by nbformat >= 4.5 (this builder emits
    ``nbformat_minor: 5``); without it Jupyter warns about missing cell ids
    and rewrites the notebook on first save, dirtying the diff.
    """
    return {
        "cell_type": "code",
        # Random 8-char id; only uniqueness within the notebook matters.
        "id": uuid.uuid4().hex[:8],
        "metadata": {},
        "source": source,
        "outputs": [],
        "execution_count": None,
    }


def make_md_cell(source: str) -> dict:
    """Build an nbformat markdown cell whose source is *source*.

    Includes the ``id`` field required by nbformat >= 4.5 (the notebook is
    emitted with ``nbformat_minor: 5``); omitting it triggers Jupyter's
    missing-cell-id warning and an automatic rewrite on save.
    """
    return {
        "cell_type": "markdown",
        # Random 8-char id; only uniqueness within the notebook matters.
        "id": uuid.uuid4().hex[:8],
        "metadata": {},
        "source": source,
    }


def split_into_steps(text: str) -> list[tuple[int, str]]:
    """Return (step_index, body_without_banner) tuples in order.

    A banner is three comment lines: a row of ``=``, a ``# STEP N ...``
    line, and another row of ``=``. Each step's body runs from the end of
    its banner to the start of the next banner (or end of text).
    """
    banner_re = re.compile(r"# ={5,}\n# STEP (\d+)[^\n]*\n# ={5,}\n")
    hits = list(banner_re.finditer(text))
    if not hits:
        raise RuntimeError("No STEP banners found in source script.")

    # Each body ends where the following banner begins; the last runs to EOF.
    stops = [m.start() for m in hits[1:]] + [len(text)]
    return [
        (int(m.group(1)), text[m.end():stop].strip("\n"))
        for m, stop in zip(hits, stops)
    ]


def build_notebook() -> dict:
    """Assemble and return the notebook as a JSON-serializable dict.

    Strips the source script's module docstring, splits the remainder on
    STEP banners, and emits an intro markdown cell followed by a heading +
    code cell pair per step.
    """
    text = SOURCE.read_text(encoding="utf-8")

    # Drop a leading triple-quoted module docstring, if present.
    doc = re.match(r'"""(.*?)"""', text, flags=re.DOTALL)
    body = text if doc is None else text[doc.end():].lstrip("\n")

    cells: list[dict] = [make_md_cell(INTRO_MD)]
    for idx, code in split_into_steps(body):
        heading = STEP_TITLES.get(idx, f"Step {idx}")
        cells.append(make_md_cell(f"## {heading}"))
        cells.append(make_code_cell(code))

    metadata = {
        "kernelspec": {
            "display_name": "Python 3",
            "language": "python",
            "name": "python3",
        },
        "language_info": {
            "name": "python",
            "version": "3.11",
        },
        "colab": {"provenance": []},
    }
    return {
        "nbformat": 4,
        "nbformat_minor": 5,
        "metadata": metadata,
        "cells": cells,
    }


def main() -> int:
    """Build the notebook and write it to OUTPUT.

    Returns:
        Process exit code (0 on success).
    """
    OUTPUT.parent.mkdir(parents=True, exist_ok=True)
    nb = build_notebook()
    # ensure_ascii=False keeps the intro cell's non-ASCII characters (em
    # dashes, arrows) readable in the JSON instead of \uXXXX escapes — the
    # file is written as UTF-8 anyway. Trailing newline keeps the file
    # POSIX-friendly for diffs.
    OUTPUT.write_text(
        json.dumps(nb, indent=2, ensure_ascii=False) + "\n",
        encoding="utf-8",
    )
    print(f"Wrote {OUTPUT.relative_to(REPO_ROOT)} ({len(nb['cells'])} cells)")
    return 0


if __name__ == "__main__":
    # SystemExit with an int argument sets the process exit code (same
    # effect as sys.exit(main()) without importing sys).
    raise SystemExit(main())