SignalMod / scripts /materialize_finetuned_weights.py
Mirae Kang
fix: debug model selection, #22
0f0ce9b
raw
history blame
1.93 kB
#!/usr/bin/env python3
"""Download real HF weights into models/finetuned_hf (no Git LFS required).

The repo may contain only a Git LFS pointer for model.safetensors (~134 bytes).
This script saves a compatible DistilBERT toxic-comment classifier from the
Hugging Face Hub so that "Fine-tuned (local HF)" can load offline after one
download.

Run from the repo root:

    uv sync --extra hf
    uv run python scripts/materialize_finetuned_weights.py
"""
from __future__ import annotations
import sys
from pathlib import Path
# Repository root: the directory above the one that contains this script.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Destination directory for the materialized model and tokenizer files.
OUT_DIR = PROJECT_ROOT.joinpath("models", "finetuned_hf")
# Hub checkpoint to download; same architecture family as notebook 08
# (DistilBERT sequence classification).
HUB_ID = "martin-ha/toxic-comment-model"
def _weights_size(path: Path) -> int:
    """Return the size of *path* in bytes, or 0 when it is not a regular file."""
    return path.stat().st_size if path.is_file() else 0


def main() -> int:
    """Materialize real model weights into ``OUT_DIR``.

    Skips the download when ``model.safetensors`` already exists and is
    larger than 1 MB. Otherwise loads ``HUB_ID`` through ``transformers``,
    saves the model and tokenizer into ``OUT_DIR``, and writes a placeholder
    ``model_metadata.json`` if none exists yet.

    Returns:
        0 when usable weights are present (already, or after the download);
        1 when ``transformers`` cannot be imported or when saving did not
        produce a ``model.safetensors`` above the size threshold.
    """
    # A Git LFS pointer file is only ~134 bytes, so anything above this
    # threshold is treated as real weights.
    min_real_bytes = 1_000_000
    weights = OUT_DIR / "model.safetensors"

    existing_bytes = _weights_size(weights)
    if existing_bytes > min_real_bytes:
        print(f"OK: {weights} already exists ({existing_bytes // 1_000_000} MB)")
        return 0

    # Imported lazily so the script can print an install hint instead of a
    # traceback when the optional HF dependencies are missing.
    try:
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
    except ImportError:
        print("Install HF deps first: uv sync --extra hf", file=sys.stderr)
        return 1

    print(f"Downloading {HUB_ID} into {OUT_DIR} …")
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    model = AutoModelForSequenceClassification.from_pretrained(HUB_ID)
    tokenizer = AutoTokenizer.from_pretrained(HUB_ID)
    model.save_pretrained(OUT_DIR)
    tokenizer.save_pretrained(OUT_DIR)

    # Verify the post-condition instead of reporting success unconditionally:
    # if the save step did not leave a real model.safetensors behind, the
    # local loader would still find nothing usable.
    saved_bytes = _weights_size(weights)
    if saved_bytes <= min_real_bytes:
        print(
            f"ERROR: {weights} is missing or too small ({saved_bytes} bytes) after saving; "
            "the installed transformers may have written a different weights format.",
            file=sys.stderr,
        )
        return 1

    # Only add placeholder metadata once the weights are known to be in
    # place; an existing metadata file is never overwritten.
    meta = OUT_DIR / "model_metadata.json"
    if not meta.exists():
        meta.write_text(
            '{"model_name":"DistilBERT (materialized from Hub)","note":"Run notebook 08 to replace with team weights"}\n',
            encoding="utf-8",
        )

    print(f"Done. {weights} ({saved_bytes // 1_000_000} MB)")
    return 0
# Script entry point: use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())