#!/usr/bin/env python3
"""
Build the curated, TA-runnable submission ZIP for CS3319 Project 2.

The repo is 3.2 GB; this script selects the minimal set needed to:
  (a) read the paper (PDF + LaTeX source + figures),
  (b) run the load-weights smoke test (code/run_inference.ipynb),
  (c) inspect the final result + final submission CSV,
and leaves the multi-GB intermediate artifacts in the Hugging Face backup.

Usage:
    python scripts/build_submission_zip.py              # -> cs3319_final_deliverable.zip
    python scripts/build_submission_zip.py --out X.zip
    python scripts/build_submission_zip.py --root /path/to/repo
"""
from __future__ import annotations

import argparse
import os  # noqa: F401  (unused here; kept from the original import line)
import sys
import zipfile
from pathlib import Path

# Repository root: this script is expected to live in <root>/scripts/.
ROOT = Path(__file__).resolve().parents[1]
TOP = "cs3319_final_deliverable"  # top-level folder inside the zip
DEFAULT_ZIP_NAME = "cs3319_final_deliverable.zip"

# ---- directories included in full ----
FULL_DIRS = [
    "code",
    "checkpoints/final_ens6",
    "checkpoints/extra_models",
    "data_and_docs",
    "figures_v2",
    "figures_paper",
    "reports",
    "docs/diagrams",
    "docs_first_principles",
    "env",
    "notes",
]

# ---- individual files (cherry-picked from larger trees) ----
EXTRA_FILES = [
    "README.md",
    "SUBMISSION_README.md",
    "AI_USAGE.md",
    "scripts/render_diagrams.sh",
    "scripts/build_submission_zip.py",
    "validation_runs/dynamic_summary.csv",
    "validation_runs/stack_ratio_analysis.csv",
    "validation_runs/stack_threshold_summary.csv",
    "validation_runs/dynamic_seed202/val_labels_seed202.npy",
    "validation_runs/dynamic_seed202/val_pairs_seed202.npy",
    "cached_scores/test_known_mask.npy",
    "cached_scores/test_lgb_scores.npy",
    "cached_scores/test_lgb_v2_scores.npy",
    "cached_scores/test_bpr_cos.npy",
    "cached_scores/test_bpr_dot.npy",
    "cached_scores/lgb_model.pkl",
    "cached_scores/lgb_v2_model.pkl",
    "ACM_Conference_Proceedings_Primary_Article_Template/cs3319_final_paper_cn.tex",
    "ACM_Conference_Proceedings_Primary_Article_Template/cs3319_final_paper_cn.pdf",
    "ACM_Conference_Proceedings_Primary_Article_Template/cs3319_final_paper_cn.bbl",
    "ACM_Conference_Proceedings_Primary_Article_Template/cs3319_references.bib",
    "ACM_Conference_Proceedings_Primary_Article_Template/acmart.cls",
    "ACM_Conference_Proceedings_Primary_Article_Template/ACM-Reference-Format.bst",
]

# whole subdirectory under validation_runs/dynamic_seed202 to include (the result)
RESULT_DIRS = [
    "validation_runs/dynamic_seed202/high_order_graph_stack",
]

SKIP_NAMES = {"__pycache__", ".ipynb_checkpoints", ".git", ".cache"}
SKIP_SUFFIXES = (".pyc", ".pyo")


def should_skip(p: Path, root: Path = ROOT) -> bool:
    """Return True if *p* must be left out of the archive.

    A path is skipped when its suffix is in ``SKIP_SUFFIXES`` or when one of
    its components is in ``SKIP_NAMES``.

    Args:
        p: Candidate path, absolute or relative.
        root: Repository root. Components are tested relative to it so that
            the location of the checkout itself (for example a clone living
            under ``~/.cache``) cannot cause every file to be skipped.
    """
    try:
        parts = p.relative_to(root).parts
    except ValueError:
        # p is not under root (e.g. a relative path): test it as given.
        parts = p.parts
    return p.suffix in SKIP_SUFFIXES or any(part in SKIP_NAMES for part in parts)


def collect(root: Path = ROOT) -> list[Path]:
    """Gather every file that belongs in the submission archive.

    Walks ``FULL_DIRS`` and ``RESULT_DIRS`` recursively, then adds each entry
    of ``EXTRA_FILES``. Directories and files that do not exist are reported
    on stderr and skipped rather than treated as errors.

    Args:
        root: Repository root that the configured relative paths hang off.

    Returns:
        The selected files as paths under *root*, de-duplicated and sorted.
    """
    files: list[Path] = []
    seen: set[Path] = set()

    def add(p: Path) -> None:
        # A file may be reachable both through a directory and EXTRA_FILES.
        if p not in seen:
            seen.add(p)
            files.append(p)

    for d in FULL_DIRS + RESULT_DIRS:
        base = root / d
        if not base.exists():
            print(f" ! missing dir (skipped): {d}", file=sys.stderr)
            continue
        for p in base.rglob("*"):
            if p.is_file() and not should_skip(p, root):
                add(p)

    for f in EXTRA_FILES:
        p = root / f
        if not p.is_file():
            print(f" ! missing file (skipped): {f}", file=sys.stderr)
        elif should_skip(p, root):
            # The file exists; say why it is left out instead of "missing".
            print(f" ! excluded by skip rules: {f}", file=sys.stderr)
        else:
            add(p)

    return sorted(files)


def main(argv: list[str] | None = None) -> None:
    """Build the submission ZIP and print a short size report.

    Args:
        argv: Command-line arguments without the program name. ``None``
            (the default) makes argparse read ``sys.argv[1:]``.
    """
    ap = argparse.ArgumentParser(
        description="Build the curated CS3319 Project 2 submission ZIP."
    )
    ap.add_argument(
        "--out",
        default=None,
        help=f"output archive path (default: <root>/{DEFAULT_ZIP_NAME})",
    )
    ap.add_argument("--top", default=TOP, help="top-level folder inside the zip")
    ap.add_argument("--root", default=str(ROOT), help="repository root to package")
    args = ap.parse_args(argv)

    root = Path(args.root).resolve()
    out = Path(args.out) if args.out else root / DEFAULT_ZIP_NAME
    out_resolved = out.resolve()

    # Never package the archive itself: when --out points inside an included
    # directory, a previous build would otherwise be swept into the new one.
    files = [p for p in collect(root) if p.resolve() != out_resolved]
    total = sum(p.stat().st_size for p in files)
    print(f"selected {len(files)} files, {total/1e6:.1f} MB uncompressed")

    out.parent.mkdir(parents=True, exist_ok=True)

    # Every member is deflate-compressed; allowZip64 permits archives and
    # members larger than 4 GiB.
    with zipfile.ZipFile(
        out, "w", zipfile.ZIP_DEFLATED, compresslevel=6, allowZip64=True
    ) as z:
        # MANIFEST first, at top level
        manifest_lines = [
            "CS3319 Project 2 — curated submission package",
            f"Built from repo: {root}",
            f"Files: {len(files)} Uncompressed: {total/1e6:.1f} MB",
            "Public LB F1 = 0.96626 | validation F1 = 0.966874",
            "",
            "TA verification: see SUBMISSION_README.md (3 steps, CPU, seconds).",
            "Run: jupyter nbconvert --to notebook --execute code/run_inference.ipynb",
            "",
            "=== File list (relative paths) ===",
            "",
        ]
        manifest_lines += sorted(p.relative_to(root).as_posix() for p in files)
        z.writestr(f"{args.top}/MANIFEST.txt", "\n".join(manifest_lines) + "\n")

        for p in files:
            z.write(p, f"{args.top}/{p.relative_to(root).as_posix()}")

    zsize = out.stat().st_size
    print(f"\nwrote {out}")
    print(f"zip size: {zsize/1e6:.1f} MB ({zsize/1073741824:.2f} GB)")
    print(f"ratio: {zsize/max(total,1)*100:.1f}% of uncompressed")


if __name__ == "__main__":
    main()