| |
| """ |
| Build the curated, TA-runnable submission ZIP for CS3319 Project 2. |
| |
| The repo is 3.2 GB; this script selects the minimal set needed to: |
| (a) read the paper (PDF + LaTeX source + figures), |
| (b) run the load-weights smoke test (code/run_inference.ipynb), |
| (c) inspect the final result + final submission CSV, |
| and leaves the multi-GB intermediate artifacts in the Hugging Face backup. |
| |
| Usage: |
| python scripts/build_submission_zip.py # -> cs3319_final_deliverable.zip |
| python scripts/build_submission_zip.py --out X.zip |
| """ |
| from __future__ import annotations |
| import argparse, zipfile, os, sys |
| from pathlib import Path |
|
|
# Absolute location of this script; it lives one directory below the repo root
# (it is invoked as ``python scripts/build_submission_zip.py``).
_SCRIPT_PATH = Path(__file__).resolve()

# Repository root: every entry in the path lists of this module is relative to it.
ROOT = _SCRIPT_PATH.parents[1]

# Default name of the single top-level folder inside the archive (see ``--top``).
TOP = "cs3319_final_deliverable"
|
|
| |
# Directories (relative to ROOT) that are walked recursively and shipped in
# full, minus anything rejected by should_skip().
FULL_DIRS = [
    "code",
    "checkpoints/final_ens6",
    "checkpoints/extra_models",
    "data_and_docs",
    "figures_v2",
    "figures_paper",
    "reports",
    "docs/diagrams",
    "docs_first_principles",
    "env",
    "notes",
]
|
|
| |
def _under(folder: str, *names: str) -> list[str]:
    """Return each of *names* prefixed with ``folder/`` (POSIX separators)."""
    return [f"{folder}/{name}" for name in names]


# Individual files (relative to ROOT) that are shipped even though their parent
# directory is not included wholesale.
EXTRA_FILES = [
    "README.md",
    "SUBMISSION_README.md",
    "AI_USAGE.md",
    *_under(
        "scripts",
        "render_diagrams.sh",
        "build_submission_zip.py",
    ),
    *_under(
        "validation_runs",
        "dynamic_summary.csv",
        "stack_ratio_analysis.csv",
        "stack_threshold_summary.csv",
        "dynamic_seed202/val_labels_seed202.npy",
        "dynamic_seed202/val_pairs_seed202.npy",
    ),
    *_under(
        "cached_scores",
        "test_known_mask.npy",
        "test_lgb_scores.npy",
        "test_lgb_v2_scores.npy",
        "test_bpr_cos.npy",
        "test_bpr_dot.npy",
        "lgb_model.pkl",
        "lgb_v2_model.pkl",
    ),
    *_under(
        "ACM_Conference_Proceedings_Primary_Article_Template",
        "cs3319_final_paper_cn.tex",
        "cs3319_final_paper_cn.pdf",
        "cs3319_final_paper_cn.bbl",
        "cs3319_references.bib",
        "acmart.cls",
        "ACM-Reference-Format.bst",
    ),
]
|
|
| |
# Result directories (relative to ROOT) that are walked recursively, exactly
# like FULL_DIRS; kept as a separate list so they can be told apart.
RESULT_DIRS = [
    "validation_runs/dynamic_seed202/high_order_graph_stack",
]
|
|
# A path is excluded when any one of its components equals one of these names.
SKIP_NAMES = {
    "__pycache__",
    ".ipynb_checkpoints",
    ".git",
    ".cache",
}

# A path is excluded when its final suffix is one of these.
SKIP_SUFFIXES = (
    ".pyc",
    ".pyo",
)
|
|
|
|
def should_skip(p: Path) -> bool:
    """Tell whether *p* must be left out of the archive.

    Args:
        p: Path to test; only its components and suffix are inspected, the
            filesystem is not touched.

    Returns:
        True if any component of *p* is listed in ``SKIP_NAMES`` or if the
        final suffix of *p* is one of ``SKIP_SUFFIXES``; False otherwise.
    """
    has_skipped_component = not SKIP_NAMES.isdisjoint(p.parts)
    return has_skipped_component or p.suffix in SKIP_SUFFIXES
|
|
|
|
def collect() -> list[Path]:
    """Gather every file that belongs in the submission archive.

    Walks each directory in ``FULL_DIRS`` and ``RESULT_DIRS`` recursively and
    then adds the individual ``EXTRA_FILES``. Entries that do not exist are
    reported on stderr and skipped rather than aborting the build.

    The skip rules are evaluated on the path *relative to ROOT*, so a
    checkout that happens to live below a directory named e.g. ``.cache`` or
    ``.git`` does not cause every file to be filtered out.

    Returns:
        The selected files as absolute paths, de-duplicated and sorted.
    """
    files: list[Path] = []
    seen: set[Path] = set()

    def add(p: Path) -> None:
        # The same file may be reachable through more than one list entry;
        # keep it once so the archive never gets duplicate members.
        if p not in seen:
            seen.add(p)
            files.append(p)

    def excluded(p: Path) -> bool:
        # Only components inside the repo may trigger a skip; ancestors of
        # ROOT are irrelevant to what gets packaged.
        return should_skip(p.relative_to(ROOT))

    for d in FULL_DIRS + RESULT_DIRS:
        base = ROOT / d
        if not base.exists():
            print(f" ! missing dir (skipped): {d}", file=sys.stderr)
            continue
        for p in base.rglob("*"):
            if p.is_file() and not excluded(p):
                add(p)

    for f in EXTRA_FILES:
        p = ROOT / f
        if not p.is_file():
            print(f" ! missing file (skipped): {f}", file=sys.stderr)
        elif excluded(p):
            # The file exists but matches a skip rule; say so instead of
            # misreporting it as missing.
            print(f" ! excluded by skip rules: {f}", file=sys.stderr)
        else:
            add(p)

    return sorted(files)
|
|
|
|
def main() -> None:
    """Command-line entry point: collect the curated files and write the ZIP.

    Options:
        --out  Path of the archive to create (default:
               ``cs3319_final_deliverable.zip`` in the repo root). Missing
               parent directories are created.
        --top  Name of the single top-level folder inside the archive
               (default: ``TOP``).

    The archive contains a generated ``<top>/MANIFEST.txt`` followed by every
    collected file stored as ``<top>/<path relative to ROOT>``. A summary of
    the file count and sizes is printed to stdout.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default=str(ROOT / "cs3319_final_deliverable.zip"))
    ap.add_argument("--top", default=TOP)
    args = ap.parse_args()

    out = Path(args.out)
    out_resolved = out.resolve()

    # Never feed the archive to itself: if --out points inside a collected
    # directory, a stale archive from a previous run would be picked up,
    # truncated when opened for writing, and then read back while being written.
    collected = collect()
    files = [p for p in collected if p.resolve() != out_resolved]
    if len(files) != len(collected):
        print(f" ! output archive excluded from its own contents: {out}", file=sys.stderr)

    total = sum(p.stat().st_size for p in files)
    print(f"selected {len(files)} files, {total/1e6:.1f} MB uncompressed")

    out.parent.mkdir(parents=True, exist_ok=True)

    with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED, compresslevel=6, allowZip64=True) as z:
        # The manifest goes in first so it is the first member of the archive.
        manifest_lines = [
            "CS3319 Project 2 — curated submission package",
            f"Built from repo: {ROOT}",
            f"Files: {len(files)} Uncompressed: {total/1e6:.1f} MB",
            "Public LB F1 = 0.96626 | validation F1 = 0.966874",
            "",
            "TA verification: see SUBMISSION_README.md (3 steps, CPU, seconds).",
            "Run: jupyter nbconvert --to notebook --execute code/run_inference.ipynb",
            "",
            "=== File list (relative paths) ===",
            "",
        ]
        manifest_lines += sorted(p.relative_to(ROOT).as_posix() for p in files)
        z.writestr(f"{args.top}/MANIFEST.txt", "\n".join(manifest_lines) + "\n")

        for p in files:
            # Archive member names always use forward slashes, whatever the OS.
            arc = f"{args.top}/{p.relative_to(ROOT).as_posix()}"
            z.write(p, arc)

    zsize = out.stat().st_size
    print(f"\nwrote {out}")
    # Both figures use decimal units (1 MB = 1e6 B, 1 GB = 1e9 B) so they agree.
    print(f"zip size: {zsize/1e6:.1f} MB ({zsize/1e9:.2f} GB)")
    # max(..., 1) guards the division when nothing was selected.
    print(f"ratio: {zsize/max(total,1)*100:.1f}% of uncompressed")
|
|
|
|
# Run the packaging CLI only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()
|
|