Spaces:

umar-sharif821
/

cdn-cache-env-improvedone

Sleeping

App Files Files Community

cdn-cache-env-improvedone / scripts /build_notebook.py

umar-sharif821

feat: add Colab training notebook + build script for hackathon submission

7511eae 21 days ago

raw

history blame contribute delete

4.37 kB

	"""Convert colab_submission_script.py into a clean Colab .ipynb notebook.

	Splits the script on the `# === ... STEP N ...` banner blocks and emits one
	code cell per step, with a markdown intro cell at the top.

	Usage:
	python scripts/build_notebook.py
	"""

	from __future__ import annotations

	import json
	import re
	from pathlib import Path

	# Repository root: two levels up from this file (the script sits in scripts/).
	REPO_ROOT = Path(__file__).resolve().parents[1]
	# Script that is converted into a notebook.
	SOURCE = REPO_ROOT.joinpath("colab_submission_script.py")
	# Destination of the generated notebook file.
	OUTPUT = REPO_ROOT.joinpath("notebooks", "cdn_cache_optimizer_training.ipynb")

	# Markdown text of the notebook's first cell; build_notebook() wraps it in a
	# markdown cell ahead of the per-step cells. The content is emitted verbatim.
	INTRO_MD = """\
	# CDN Cache Optimizer — Training Notebook

	OpenEnv-compliant reinforcement-learning agent for edge CDN cache admission and eviction.
	Run Runtime → Run all in Colab to reproduce training, evaluation, schema-drift verification, and result charts in a single pass.

	Project links
	- Hugging Face Space: https://huggingface.co/spaces/umar-sharif821/cdn-cache-env-improvedone
	- GitHub repo: https://github.com/umar-sharif821/cdn-cache-env-improvedone

	What this notebook does
	1. Bootstraps Colab (installs `gymnasium`, `torch`, `matplotlib`, `numpy`; mounts Drive if available).
	2. Defines a `SchemaDriftGuard` that normalizes heterogeneous CDN log formats.
	3. Builds an OpenEnv-compliant `CDNCacheEnv` (gymnasium 5-tuple, multi-component reward).
	4. Trains a REINFORCE policy network.
	5. Evaluates LRU baseline vs. the fine-tuned agent.
	6. Saves `policy.pt`, `training_results.png`, `drift_report.json`, `metrics.json`.

	Reward function
	`R = w1 * Perf - w2 * Cost`, where `Perf` is edge-vs-origin latency savings and `Cost` is eviction churn + admitted bytes / capacity.
	"""

	# Heading text for each step number; the position in the tuple is the step
	# index, so the resulting dict maps 0..8 to the matching title.
	STEP_TITLES = dict(
	    enumerate(
	        (
	            "Step 0 — Colab bootstrap (deps + Drive)",
	            "Step 1 — Imports & deterministic seeding",
	            "Step 2 — Schema Drift Guard",
	            "Step 3 — OpenEnv-compliant CDN cache environment",
	            "Step 4 — Policy network + REINFORCE training loop",
	            "Step 5 — Evaluation: LRU baseline vs fine-tuned agent",
	            "Step 6 — Comparison charts",
	            "Step 7 — Persist artifacts to Drive",
	            "Step 8 — Submission summary",
	        )
	    )
	)


	def make_code_cell(source: str) -> dict:
	    """Wrap *source* in a notebook code-cell dict with no outputs or execution count."""
	    cell = dict(cell_type="code", metadata={})
	    cell["source"] = source
	    cell["outputs"] = []
	    cell["execution_count"] = None
	    return cell


	def make_md_cell(source: str) -> dict:
	    """Wrap *source* in a notebook markdown-cell dict."""
	    cell = dict(cell_type="markdown", metadata={})
	    cell["source"] = source
	    return cell


	def split_into_steps(text: str) -> list[tuple[int, str]]:
	    """Split *text* on its three-line STEP banners.

	    A banner is a ``# =====`` rule, a ``# STEP <n> ...`` line and a second
	    rule, each terminated by a newline. Text before the first banner is not
	    part of any step.

	    Returns:
	        ``(step_number, body)`` pairs in file order; each body is the text
	        between one banner and the next (or the end of *text*), with leading
	        and trailing newlines removed.

	    Raises:
	        RuntimeError: if *text* contains no banner.
	    """
	    banner_re = re.compile(r"# ={5,}\n# STEP (\d+)[^\n]*\n# ={5,}\n")
	    found = list(banner_re.finditer(text))
	    if len(found) == 0:
	        raise RuntimeError("No STEP banners found in source script.")

	    # A step's body stops where the following banner starts; the last one
	    # runs to the end of the text.
	    stops = [nxt.start() for nxt in found[1:]]
	    stops.append(len(text))
	    return [
	        (int(hit.group(1)), text[hit.end():stop].strip("\n"))
	        for hit, stop in zip(found, stops)
	    ]


	def build_notebook() -> dict:
	    """Assemble the notebook as a JSON-serializable nbformat 4.5 dict.

	    Reads SOURCE, drops a leading triple-quoted module docstring when the
	    file starts with one, splits the remainder into steps, and emits the
	    intro markdown cell followed by a heading cell and a code cell per step.

	    Returns:
	        A dict with the ``nbformat``, ``nbformat_minor``, ``metadata`` and
	        ``cells`` keys.

	    Raises:
	        RuntimeError: propagated from split_into_steps when the script
	            contains no STEP banner.
	    """
	    raw = SOURCE.read_text(encoding="utf-8")
	    # re.match anchors at position 0, so only a docstring that opens the
	    # file is removed.
	    docstring_match = re.match(r'"""(.*?)"""', raw, flags=re.DOTALL)
	    if docstring_match:
	        body = raw[docstring_match.end():].lstrip("\n")
	    else:
	        body = raw

	    cells: list[dict] = [make_md_cell(INTRO_MD)]
	    for step_idx, code in split_into_steps(body):
	        # Steps without a configured title fall back to a plain "Step N".
	        title = STEP_TITLES.get(step_idx, f"Step {step_idx}")
	        cells.append(make_md_cell(f"## {title}"))
	        cells.append(make_code_cell(code))

	    # The nbformat 4.5 schema (declared below via nbformat_minor=5) requires
	    # a unique "id" on every cell. Position-based ids are unique even if a
	    # step number repeats, and stay stable across rebuilds.
	    for position, cell in enumerate(cells):
	        cell.setdefault("id", f"cell-{position:03d}")

	    return {
	        "nbformat": 4,
	        "nbformat_minor": 5,
	        "metadata": {
	            "kernelspec": {
	                "display_name": "Python 3",
	                "language": "python",
	                "name": "python3",
	            },
	            "language_info": {
	                "name": "python",
	                "version": "3.11",
	            },
	            "colab": {"provenance": []},
	        },
	        "cells": cells,
	    }


	def main() -> int:
	    """Build the notebook, write it to OUTPUT, print a summary line, return 0."""
	    # Create the output directory before building, matching the write target.
	    target_dir = OUTPUT.parent
	    target_dir.mkdir(parents=True, exist_ok=True)

	    notebook = build_notebook()
	    payload = json.dumps(notebook, indent=2)
	    OUTPUT.write_text(payload, encoding="utf-8")

	    shown_path = OUTPUT.relative_to(REPO_ROOT)
	    cell_count = len(notebook["cells"])
	    print(f"Wrote {shown_path} ({cell_count} cells)")
	    return 0


	if __name__ == "__main__":
	    # Use main()'s return value as the process exit status.
	    exit_status = main()
	    raise SystemExit(exit_status)