#!/usr/bin/env python3
"""Audit the public WildFIRE-FM release before upload."""
from __future__ import annotations
import json
import re
from pathlib import Path
# Release root: the parent of the directory that contains this script.
ROOT: Path = Path(__file__).resolve().parents[1]

# Paths (relative to ROOT) that must exist for the release to be complete.
REQUIRED: list[str] = [
    # Top-level project files.
    "README.md",
    "LICENSE",
    "requirements.txt",
    "data_sources/DATA_SOURCES.md",
    # Model code, model card and checkpoint manifest.
    "models/wildfire_fm/README.md",
    "models/wildfire_fm/modeling_unet.py",
    "models/wildfire_fm/checkpoint_manifest.json",
    # Paper PDF and its figures.
    "paper/wildfire_fm_evaluation_contracts.pdf",
    "paper_outputs/figures/overview_wildfire.pdf",
    "paper_outputs/figures/matching.pdf",
    "paper_outputs/figures/fig_task_contract_tiles.pdf",
    "paper_outputs/figures/fig_primary_rank_change_map.pdf",
    "paper_outputs/figures/fig_selection_regret_scatter.pdf",
    "paper_outputs/figures/fig_rank_heatmap1.pdf",
    # Images shipped under assets/.
    "assets/wildfire_fm_model_card.svg",
    "assets/release_contents.svg",
    "assets/selection_regret_final.png",
    "assets/supporting_rank_map_final.png",
    "assets/primary_rank_change_final.png",
    # Hash manifest and the script that checks it.
    "artifacts/manifests/paper_outputs.sha256",
    "scripts/check_paper_output_hashes.py",
]

# Files that must NOT be present: matched by exact name or by suffix.
FORBIDDEN_FILE_NAMES: set[str] = {"manuscript_final.pdf"}
FORBIDDEN_FILE_SUFFIXES: set[str] = {".bib", ".tex", ".tikz"}

# Substrings that must not occur in any scanned text file.
FORBIDDEN_TEXT: list[str] = [
    "/home/yx21e",
    "/blue/",
    "/orange/",
    "fsu-compsci",
    "TBD",
    "N/A",
    "Pangu24",
]

# Suffixes of the files that are scanned for FORBIDDEN_TEXT.
TEXT_SUFFIXES: set[str] = {
    ".csv",
    ".json",
    ".md",
    ".py",
    ".sh",
    ".tex",
    ".txt",
    ".yaml",
    ".yml",
}

# File names exempt from the FORBIDDEN_TEXT scan.
# NOTE(review): presumably exempt because they legitimately contain the
# tokens (e.g. the audit script lists them) -- confirm.
SKIP_FOR_FORBIDDEN: set[str] = {
    "audit_release.py",
    "build_selection_regret_rq2_figure.py",
}
def iter_text_files() -> list[Path]:
out: list[Path] = []
for path in ROOT.rglob("*"):
if ".git" in path.parts or "__pycache__" in path.parts:
continue
if path.name in SKIP_FOR_FORBIDDEN:
continue
if path.is_file() and path.suffix in TEXT_SUFFIXES:
out.append(path)
return sorted(out)
# Matches an ``\ms{<mean>}{0.0000}`` table cell, i.e. a displayed std of zero.
_ZERO_STD_CELL = re.compile(r"\\ms\{[^}]*\}\{0\.0000\}")


def _in_ignored_dir(path: Path) -> bool:
    """Return True when *path* lies inside a ``.git``/``__pycache__`` dir of ROOT."""
    # Relative parts only: an ancestor of ROOT with one of these names must
    # not cause the whole tree to be skipped.
    parts = path.relative_to(ROOT).parts
    return ".git" in parts or "__pycache__" in parts


def _check_required_files() -> list[str]:
    """Report every entry of REQUIRED that does not exist under ROOT."""
    return [
        f"missing required file: {rel}"
        for rel in REQUIRED
        if not (ROOT / rel).exists()
    ]


def _check_forbidden_files() -> list[str]:
    """Report files whose suffix or name marks them as forbidden artifacts."""
    found: list[str] = []
    # Sorted so that the report order does not depend on the filesystem.
    for path in sorted(ROOT.rglob("*")):
        if _in_ignored_dir(path) or not path.is_file():
            continue
        if path.suffix in FORBIDDEN_FILE_SUFFIXES or path.name in FORBIDDEN_FILE_NAMES:
            found.append(
                f"forbidden manuscript/source artifact present: {path.relative_to(ROOT)}"
            )
    return found


def _check_forbidden_text() -> list[str]:
    """Report each (file, token) pair where a FORBIDDEN_TEXT token occurs."""
    found: list[str] = []
    for path in iter_text_files():
        # Explicit UTF-8 so the result does not depend on the locale;
        # undecodable bytes are dropped rather than aborting the audit.
        text = path.read_text(encoding="utf-8", errors="ignore")
        for token in FORBIDDEN_TEXT:
            if token in text:
                found.append(
                    f"{path.relative_to(ROOT)} contains forbidden token {token!r}"
                )
    return found


def _check_readme() -> list[str]:
    """Report expected model-card phrases that are absent from README.md."""
    readme_path = ROOT / "README.md"
    if not readme_path.is_file():
        # README.md is listed in REQUIRED, so its absence is already
        # reported there; reading it here would abort the whole audit.
        return []
    readme = readme_path.read_text(encoding="utf-8", errors="ignore")
    return [
        f"README missing expected model-card phrase: {phrase}"
        for phrase in ["WildFIRE-FM", "Quick Load", "Data Sources", "Evaluation Snapshot"]
        if phrase not in readme
    ]


def _check_checkpoint_manifest() -> list[str]:
    """Validate the checkpoint manifest: entry count, filenames, sha256, no source_path."""
    manifest_path = ROOT / "models/wildfire_fm/checkpoint_manifest.json"
    if not manifest_path.exists():
        return []
    # A broken manifest is an audit finding, not a reason to crash.
    try:
        data = json.loads(manifest_path.read_text(encoding="utf-8"))
    except (json.JSONDecodeError, UnicodeDecodeError) as err:
        return [f"checkpoint manifest is not valid JSON: {err}"]
    checkpoints = data.get("checkpoints", []) if isinstance(data, dict) else None
    if not isinstance(checkpoints, list):
        return ["checkpoint manifest should be a JSON object with a 'checkpoints' list"]

    found: list[str] = []
    if len(checkpoints) != 5:
        found.append("checkpoint manifest should list five seeded checkpoints")
    for item in checkpoints:
        if not isinstance(item, dict):
            found.append(f"bad checkpoint entry in manifest: {item!r}")
            continue
        # str() guards against a non-string filename value (e.g. null).
        rel = str(item.get("filename", ""))
        if not rel.startswith("models/wildfire_fm/checkpoints/seed_"):
            found.append(f"unexpected checkpoint filename in manifest: {rel}")
        if "source_path" in item:
            found.append("checkpoint manifest exposes source_path")
        if not re.fullmatch(r"[0-9a-f]{64}", str(item.get("sha256", ""))):
            found.append(f"bad sha256 in checkpoint manifest: {item}")
    return found


def _check_table_std() -> list[str]:
    """Report ``paper_outputs/tables/*.tex`` files that show a ``0.0000`` std cell."""
    found: list[str] = []
    for path in sorted((ROOT / "paper_outputs/tables").glob("*.tex")):
        text = path.read_text(encoding="utf-8", errors="ignore")
        if _ZERO_STD_CELL.search(text):
            found.append(f"{path.relative_to(ROOT)} displays zero std in an \\ms cell")
    return found


def _check_checksum_manifest() -> list[str]:
    """Cross-check the sha256 manifest against the files in paper/, paper_outputs/, assets/."""
    checksum_manifest = ROOT / "artifacts/manifests/paper_outputs.sha256"
    if not checksum_manifest.exists():
        return []

    found: list[str] = []
    listed: set[str] = set()
    for line in checksum_manifest.read_text(encoding="utf-8", errors="ignore").splitlines():
        if not line.strip():
            continue
        # Each line is "<digest> <path>"; the path itself may contain spaces.
        parts = line.split(None, 1)
        if len(parts) != 2:
            found.append(f"bad checksum manifest line: {line!r}")
            continue
        rel = parts[1].strip()
        # Normalise to a forward-slash form (this also drops a leading "./")
        # so the entry compares equal to the paths collected below.
        listed.add(Path(rel).as_posix())
        if not (ROOT / rel).exists():
            found.append(f"checksum manifest lists missing output: {rel}")

    expected: set[str] = set()
    for rel_root in ["paper", "paper_outputs", "assets"]:
        root_dir = ROOT / rel_root
        if root_dir.exists():
            # as_posix(): str(Path) uses backslashes on Windows and would
            # never match a manifest written with forward slashes.
            expected.update(
                p.relative_to(ROOT).as_posix() for p in root_dir.rglob("*") if p.is_file()
            )

    missing = sorted(expected - listed)
    extra = sorted(listed - expected)
    if missing:
        found.append(f"checksum manifest missing outputs: {missing}")
    if extra:
        found.append(f"checksum manifest has extra outputs: {extra}")
    return found


def main() -> None:
    """Run every release check and print the outcome.

    Prints ``Release audit passed.`` when no check reports a problem.
    Otherwise prints one ``- <issue>`` line per finding.

    Raises:
        SystemExit: With exit code 1 when at least one issue was found.
    """
    issues: list[str] = [
        *_check_required_files(),
        *_check_forbidden_files(),
        *_check_forbidden_text(),
        *_check_readme(),
        *_check_checkpoint_manifest(),
        *_check_table_std(),
        *_check_checksum_manifest(),
    ]
    if issues:
        print("Release audit failed:")
        for issue in issues:
            print(f"- {issue}")
        raise SystemExit(1)
    print("Release audit passed.")


if __name__ == "__main__":
    main()