Wildfire-FM / scripts /audit_release.py
yx21e's picture
Publish compiled paper PDF without manuscript source
0847cd0 verified
#!/usr/bin/env python3
"""Audit the public WildFIRE-FM release before upload."""
from __future__ import annotations
import json
import re
from pathlib import Path
# Release root: the directory one level above the folder holding this script.
ROOT = Path(__file__).resolve().parents[1]

# Paths (relative to ROOT) that must exist for the audit to pass.
REQUIRED = [
    # Top-level release files.
    "README.md",
    "LICENSE",
    "requirements.txt",
    "data_sources/DATA_SOURCES.md",
    # Model code and checkpoint manifest.
    "models/wildfire_fm/README.md",
    "models/wildfire_fm/modeling_unet.py",
    "models/wildfire_fm/checkpoint_manifest.json",
    # Compiled paper and its figures.
    "paper/wildfire_fm_evaluation_contracts.pdf",
    "paper_outputs/figures/overview_wildfire.pdf",
    "paper_outputs/figures/matching.pdf",
    "paper_outputs/figures/fig_task_contract_tiles.pdf",
    "paper_outputs/figures/fig_primary_rank_change_map.pdf",
    "paper_outputs/figures/fig_selection_regret_scatter.pdf",
    "paper_outputs/figures/fig_rank_heatmap1.pdf",
    # Images under assets/.
    "assets/wildfire_fm_model_card.svg",
    "assets/release_contents.svg",
    "assets/selection_regret_final.png",
    "assets/supporting_rank_map_final.png",
    "assets/primary_rank_change_final.png",
    # Checksum manifest and the script that verifies it.
    "artifacts/manifests/paper_outputs.sha256",
    "scripts/check_paper_output_hashes.py",
]

# Any file with one of these suffixes, or one of these exact names, fails the
# audit as a manuscript/source artifact.
FORBIDDEN_FILE_SUFFIXES = {
    ".tex",
    ".bib",
    ".tikz",
}
FORBIDDEN_FILE_NAMES = {
    "manuscript_final.pdf",
}

# Case-sensitive substrings that fail the audit when found anywhere in a
# scanned text file.
FORBIDDEN_TEXT = [
    "/home/yx21e",
    "/blue/",
    "/orange/",
    "fsu-compsci",
    "TBD",
    "N/A",
    "Pangu24",
]

# Only files with these suffixes are scanned for forbidden substrings.
TEXT_SUFFIXES = {
    ".md",
    ".py",
    ".sh",
    ".tex",
    ".csv",
    ".json",
    ".yml",
    ".yaml",
    ".txt",
}

# File names excluded from the forbidden-substring scan.
SKIP_FOR_FORBIDDEN = {
    "audit_release.py",
    "build_selection_regret_rq2_figure.py",
}
def iter_text_files() -> list[Path]:
    """Collect the text files under ``ROOT`` that the token scan should read.

    An entry is kept when all of the following hold:

    * none of its path components is ``.git`` or ``__pycache__``;
    * its file name is not listed in ``SKIP_FOR_FORBIDDEN``;
    * it is a regular file whose suffix is in ``TEXT_SUFFIXES``.

    Returns:
        The matching paths, sorted.
    """
    ignored_dirs = (".git", "__pycache__")
    selected = [
        entry
        for entry in ROOT.rglob("*")
        # Cheap string checks first; the filesystem stat (is_file) only runs
        # for entries that survive them.
        if not any(name in entry.parts for name in ignored_dirs)
        and entry.name not in SKIP_FOR_FORBIDDEN
        and entry.is_file()
        and entry.suffix in TEXT_SUFFIXES
    ]
    selected.sort()
    return selected
def main() -> None:
    """Run every release check and report all findings together.

    Checks, in order:

    1. every path in ``REQUIRED`` exists under ``ROOT``;
    2. no file carries a forbidden suffix or name (manuscript sources);
    3. no scanned text file contains a ``FORBIDDEN_TEXT`` substring;
    4. ``README.md`` contains the expected model-card phrases;
    5. the checkpoint manifest lists five entries, each with a
       ``models/wildfire_fm/checkpoints/seed_`` file name, a 64-hex-digit
       ``sha256`` and no ``source_path`` key;
    6. no ``.tex`` table under ``paper_outputs/tables`` has an ``ms`` macro
       cell whose second argument is ``0.0000``;
    7. the checksum manifest lists exactly the files found under ``paper``,
       ``paper_outputs`` and ``assets``.

    A missing or malformed input is recorded as an issue instead of raising,
    so one broken file never hides the remaining findings.

    Raises:
        SystemExit: with status 1 when at least one issue was recorded.
    """
    issues: list[str] = []

    # 1. Required files.
    for rel in REQUIRED:
        if not (ROOT / rel).exists():
            issues.append(f"missing required file: {rel}")

    # 2. Manuscript/source artifacts that must not ship.
    for path in ROOT.rglob("*"):
        if ".git" in path.parts or "__pycache__" in path.parts:
            continue
        is_forbidden = (
            path.suffix in FORBIDDEN_FILE_SUFFIXES
            or path.name in FORBIDDEN_FILE_NAMES
        )
        if path.is_file() and is_forbidden:
            issues.append(f"forbidden manuscript/source artifact present: {path.relative_to(ROOT)}")

    # 3. Forbidden substrings. Decoding is pinned to UTF-8 so the result does
    # not depend on the locale; undecodable bytes are dropped.
    for path in iter_text_files():
        text = path.read_text(encoding="utf-8", errors="ignore")
        for token in FORBIDDEN_TEXT:
            if token in text:
                issues.append(f"{path.relative_to(ROOT)} contains forbidden token {token!r}")

    # 4. README phrases. A missing README is reported by the required-file
    # check above (README.md is listed in REQUIRED); reading it
    # unconditionally would abort the audit before anything is printed.
    readme_path = ROOT / "README.md"
    if readme_path.is_file():
        readme = readme_path.read_text(encoding="utf-8", errors="ignore")
        for phrase in ["WildFIRE-FM", "Quick Load", "Data Sources", "Evaluation Snapshot"]:
            if phrase not in readme:
                issues.append(f"README missing expected model-card phrase: {phrase}")

    # 5. Checkpoint manifest.
    manifest_path = ROOT / "models/wildfire_fm/checkpoint_manifest.json"
    if manifest_path.exists():
        try:
            data = json.loads(manifest_path.read_text(encoding="utf-8"))
        except (OSError, ValueError) as exc:
            # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
            issues.append(f"checkpoint manifest is not readable JSON: {exc}")
        else:
            checkpoints = data.get("checkpoints", []) if isinstance(data, dict) else []
            if not isinstance(checkpoints, list):
                checkpoints = []
            if len(checkpoints) != 5:
                issues.append("checkpoint manifest should list five seeded checkpoints")
            for item in checkpoints:
                if not isinstance(item, dict):
                    issues.append(f"bad checkpoint entry in manifest: {item!r}")
                    continue
                # str() keeps a non-string filename from raising on startswith.
                rel = str(item.get("filename", ""))
                if not rel.startswith("models/wildfire_fm/checkpoints/seed_"):
                    issues.append(f"unexpected checkpoint filename in manifest: {rel}")
                if "source_path" in item:
                    issues.append("checkpoint manifest exposes source_path")
                if not re.fullmatch(r"[0-9a-f]{64}", str(item.get("sha256", ""))):
                    issues.append(f"bad sha256 in checkpoint manifest: {item}")

    # 6. LaTeX tables. Any .tex file is also reported by check 2 while ".tex"
    # is in FORBIDDEN_FILE_SUFFIXES; this adds the more specific finding.
    for path in (ROOT / "paper_outputs/tables").glob("*.tex"):
        text = path.read_text(encoding="utf-8", errors="ignore")
        if re.search(r"\\ms\{[^}]*\}\{0\.0000\}", text):
            issues.append(f"{path.relative_to(ROOT)} displays zero std in an \\ms cell")

    # 7. Checksum manifest: "<digest> <relative path>" per non-blank line.
    checksum_manifest = ROOT / "artifacts/manifests/paper_outputs.sha256"
    if checksum_manifest.exists():
        listed: list[str] = []
        manifest_text = checksum_manifest.read_text(encoding="utf-8", errors="ignore")
        for line in manifest_text.splitlines():
            if not line.strip():
                continue
            parts = line.split(None, 1)
            if len(parts) != 2:
                issues.append(f"bad checksum manifest line: {line!r}")
                continue
            rel = parts[1].strip()
            listed.append(rel)
            if not (ROOT / rel).exists():
                issues.append(f"checksum manifest lists missing output: {rel}")

        expected: set[str] = set()
        for rel_root in ["paper", "paper_outputs", "assets"]:
            root_dir = ROOT / rel_root
            if root_dir.exists():
                expected.update(
                    str(p.relative_to(ROOT)) for p in root_dir.rglob("*") if p.is_file()
                )

        # Both differences are empty exactly when the manifest matches.
        listed_set = set(listed)
        missing = sorted(expected - listed_set)
        extra = sorted(listed_set - expected)
        if missing:
            issues.append(f"checksum manifest missing outputs: {missing}")
        if extra:
            issues.append(f"checksum manifest has extra outputs: {extra}")

    if issues:
        print("Release audit failed:")
        for issue in issues:
            print(f"- {issue}")
        raise SystemExit(1)
    print("Release audit passed.")


if __name__ == "__main__":
    main()