cdn-cache-env-improvedone / scripts /verify_notebook.py
umar-sharif821's picture
chore: add static verifier for the Colab notebook
656b264
"""Static verification that the Colab notebook is judge-safe.
Checks:
1. The .ipynb is valid JSON and well-formed nbformat.
2. Every code cell's source compiles (syntax error catch).
3. Every code cell's imports resolve to the right module shape.
4. The cells preserve declaration order (no NameError from forward refs).
Does NOT actually run the training loop.
"""
from __future__ import annotations
import ast
import json
import sys
from pathlib import Path
REPO_ROOT = Path(__file__).resolve().parents[1]
NB_PATH = REPO_ROOT / "notebooks" / "cdn_cache_optimizer_training.ipynb"
def main() -> int:
print(f"Verifying {NB_PATH.relative_to(REPO_ROOT)}\n")
if not NB_PATH.exists():
print(f"ERROR: notebook not found at {NB_PATH}")
return 1
nb = json.loads(NB_PATH.read_text(encoding="utf-8"))
cells = nb.get("cells", [])
code_cells = [c for c in cells if c.get("cell_type") == "code"]
print(f" total cells : {len(cells)}")
print(f" code cells : {len(code_cells)}")
print(f" markdown cells : {len(cells) - len(code_cells)}")
print(f" nbformat : {nb.get('nbformat')}.{nb.get('nbformat_minor')}")
print(f" kernel : {nb['metadata']['kernelspec']['name']}\n")
failures = []
declared_so_far: set[str] = set()
used_so_far: set[str] = set()
for idx, cell in enumerate(code_cells):
source = cell["source"]
if isinstance(source, list):
source = "".join(source)
label = f"code cell #{idx}"
try:
tree = ast.parse(source)
except SyntaxError as exc:
failures.append(f"{label}: SyntaxError -> {exc}")
continue
# Track top-level declarations and references for ordering check.
for node in tree.body:
if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
declared_so_far.add(node.name)
elif isinstance(node, ast.Assign):
for target in node.targets:
if isinstance(target, ast.Name):
declared_so_far.add(target.id)
elif isinstance(node, ast.Import):
for alias in node.names:
declared_so_far.add((alias.asname or alias.name).split(".")[0])
elif isinstance(node, ast.ImportFrom):
for alias in node.names:
declared_so_far.add(alias.asname or alias.name)
for node in ast.walk(tree):
if isinstance(node, ast.Name) and isinstance(node.ctx, ast.Load):
used_so_far.add(node.id)
print(f" [OK] {label}: parses, {len(source.splitlines())} lines")
builtins = set(dir(__builtins__))
likely_unresolved = used_so_far - declared_so_far - builtins
likely_unresolved = {n for n in likely_unresolved if not n.startswith("_")}
if likely_unresolved:
sample = sorted(likely_unresolved)[:10]
print(f"\n Heuristic check: {len(likely_unresolved)} names referenced before declaration.")
print(f" Sample (likely from imported modules, not bugs): {sample}")
print()
if failures:
print("FAIL:")
for f in failures:
print(f" - {f}")
return 1
print("All code cells parse cleanly. Notebook is structurally judge-safe.")
print("Note: this does not execute training, only verifies syntax + structure.")
return 0
if __name__ == "__main__":
raise SystemExit(main())