cs3319-project2 / scripts /build_submission_zip.py
NLP-beginner's picture
Add curated submission zip build script
d7687c2
Raw
History Blame Contribute Delete
5.24 kB
#!/usr/bin/env python3
"""
Build the curated, TA-runnable submission ZIP for CS3319 Project 2.

The repo is 3.2 GB; this script selects the minimal set needed to:
  (a) read the paper (PDF + LaTeX source + figures),
  (b) run the load-weights smoke test (code/run_inference.ipynb),
  (c) inspect the final result + final submission CSV,
and leaves the multi-GB intermediate artifacts in the Hugging Face backup.

Usage:
    python scripts/build_submission_zip.py              # -> cs3319_final_deliverable.zip
    python scripts/build_submission_zip.py --out X.zip
"""
from __future__ import annotations
import argparse, zipfile, os, sys
from pathlib import Path
# Repository root: the parent of the directory that contains this script.
ROOT = Path(__file__).resolve().parents[1]

# Name of the single top-level folder every archive member is placed under.
TOP = "cs3319_final_deliverable"

# Shared path prefixes, factored out so the lists below stay short.
_VALIDATION = "validation_runs"
_SEED_RUN = f"{_VALIDATION}/dynamic_seed202"
_CACHED = "cached_scores"
_PAPER = "ACM_Conference_Proceedings_Primary_Article_Template"

# ---- directories included in full (walked recursively) ----
FULL_DIRS = [
    "code",
    "checkpoints/final_ens6",
    "checkpoints/extra_models",
    "data_and_docs",
    "figures_v2",
    "figures_paper",
    "reports",
    "docs/diagrams",
    "docs_first_principles",
    "env",
    "notes",
]

# ---- individual files (cherry-picked from larger trees) ----
EXTRA_FILES = [
    # top-level documents and helper scripts
    "README.md",
    "SUBMISSION_README.md",
    "AI_USAGE.md",
    "scripts/render_diagrams.sh",
    "scripts/build_submission_zip.py",
    # selected files under validation_runs/
    *(
        f"{_VALIDATION}/{name}"
        for name in (
            "dynamic_summary.csv",
            "stack_ratio_analysis.csv",
            "stack_threshold_summary.csv",
        )
    ),
    *(
        f"{_SEED_RUN}/{name}"
        for name in (
            "val_labels_seed202.npy",
            "val_pairs_seed202.npy",
        )
    ),
    # selected files under cached_scores/
    *(
        f"{_CACHED}/{name}"
        for name in (
            "test_known_mask.npy",
            "test_lgb_scores.npy",
            "test_lgb_v2_scores.npy",
            "test_bpr_cos.npy",
            "test_bpr_dot.npy",
            "lgb_model.pkl",
            "lgb_v2_model.pkl",
        )
    ),
    # paper sources and the LaTeX class/style files listed alongside them
    *(
        f"{_PAPER}/{name}"
        for name in (
            "cs3319_final_paper_cn.tex",
            "cs3319_final_paper_cn.pdf",
            "cs3319_final_paper_cn.bbl",
            "cs3319_references.bib",
            "acmart.cls",
            "ACM-Reference-Format.bst",
        )
    ),
]

# whole subdirectory under validation_runs/dynamic_seed202 to include (the result)
RESULT_DIRS = [
    f"{_SEED_RUN}/high_order_graph_stack",
]
# Path components that exclude a file wherever they appear in its path.
SKIP_NAMES = {
    "__pycache__",
    ".ipynb_checkpoints",
    ".git",
    ".cache",
}
# File extensions (as reported by ``Path.suffix``) that are never packaged.
SKIP_SUFFIXES = (".pyc", ".pyo")


def should_skip(p: Path) -> bool:
    """Return True when *p* must be left out of the archive.

    A path is rejected if its final suffix is listed in ``SKIP_SUFFIXES`` or
    if any of its components (directories or the file name itself) is listed
    in ``SKIP_NAMES``. Both comparisons are exact and case-sensitive.
    """
    return p.suffix in SKIP_SUFFIXES or not SKIP_NAMES.isdisjoint(p.parts)
def collect() -> list[Path]:
    """Gather every file that belongs in the submission archive.

    Walks each directory in ``FULL_DIRS`` and ``RESULT_DIRS`` recursively and
    then adds the individual files in ``EXTRA_FILES``; all entries are
    interpreted relative to ``ROOT``. Paths rejected by ``should_skip`` are
    left out, and a file reachable through more than one entry is returned
    only once.

    Entries that cannot be used are reported on stderr and otherwise ignored,
    so a partially populated checkout still produces an archive.

    Returns:
        The selected files as paths under ``ROOT``, in sorted order.
    """
    files: list[Path] = []
    seen: set[Path] = set()

    def excluded(p: Path) -> bool:
        # Apply the skip rules to the repo-relative path only. Checking the
        # absolute path would reject every file whenever the checkout itself
        # sits below a directory such as ``.cache`` or ``.git``.
        return should_skip(p.relative_to(ROOT))

    def add(p: Path) -> None:
        if p not in seen:
            seen.add(p)
            files.append(p)

    for d in FULL_DIRS + RESULT_DIRS:
        base = ROOT / d
        if not base.is_dir():
            print(f" ! missing dir (skipped): {d}", file=sys.stderr)
            continue
        for p in base.rglob("*"):
            if p.is_file() and not excluded(p):
                add(p)

    for f in EXTRA_FILES:
        p = ROOT / f
        if not p.is_file():
            print(f" ! missing file (skipped): {f}", file=sys.stderr)
        elif excluded(p):
            # The file exists but is filtered out; do not call it "missing".
            print(f" ! excluded by skip rules (skipped): {f}", file=sys.stderr)
        else:
            add(p)

    return sorted(files)
def main(argv: list[str] | None = None) -> None:
    """Build the submission ZIP and print a short size report.

    Steps:
      1. Parse ``--out`` (archive path, default ``ROOT/cs3319_final_deliverable.zip``)
         and ``--top`` (top-level folder name inside the archive, default ``TOP``).
      2. Ask ``collect()`` for the files to package.
      3. Write ``<top>/MANIFEST.txt`` first, then every file under
         ``<top>/<path relative to ROOT>``.
      4. Print the archive path, its size and the compression ratio.

    Args:
        argv: Command-line arguments to parse; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--out", default=str(ROOT / "cs3319_final_deliverable.zip"))
    ap.add_argument("--top", default=TOP)
    args = ap.parse_args(argv)

    out = Path(args.out)
    out_resolved = out.resolve()

    # If --out points inside one of the collected directories and an archive
    # from a previous run is still there, it would be picked up and then read
    # while it is being overwritten. Never package the output file itself.
    files = []
    for p in collect():
        if p.resolve() == out_resolved:
            print(f" ! output archive excluded from its own contents: {p}", file=sys.stderr)
            continue
        files.append(p)

    total = sum(p.stat().st_size for p in files)
    print(f"selected {len(files)} files, {total/1e6:.1f} MB uncompressed")

    # Allow --out to name a location whose parent folder does not exist yet.
    out.parent.mkdir(parents=True, exist_ok=True)

    # Every member is DEFLATE-compressed at level 6; ZIP64 extensions are
    # enabled so the archive or a single member may exceed 4 GiB.
    with zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED, compresslevel=6, allowZip64=True) as z:
        # MANIFEST first, at top level
        manifest_lines = [
            "CS3319 Project 2 — curated submission package",
            f"Built from repo: {ROOT}",
            f"Files: {len(files)} Uncompressed: {total/1e6:.1f} MB",
            "Public LB F1 = 0.96626 | validation F1 = 0.966874",
            "",
            "TA verification: see SUBMISSION_README.md (3 steps, CPU, seconds).",
            "Run: jupyter nbconvert --to notebook --execute code/run_inference.ipynb",
            "",
            "=== File list (relative paths) ===",
            "",
        ]
        rels = [p.relative_to(ROOT).as_posix() for p in files]
        manifest_lines += sorted(rels)
        z.writestr(f"{args.top}/MANIFEST.txt", "\n".join(manifest_lines) + "\n")
        for p, rel in zip(files, rels):
            z.write(p, f"{args.top}/{rel}")

    zsize = out.stat().st_size
    print(f"\nwrote {out}")
    print(f"zip size: {zsize/1e6:.1f} MB ({zsize/1073741824:.2f} GB)")
    # max(total, 1) guards the division when nothing was selected.
    print(f"ratio: {zsize/max(total,1)*100:.1f}% of uncompressed")


if __name__ == "__main__":
    main()