File size: 1,925 Bytes
0f0ce9b | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 | #!/usr/bin/env python3
"""Download real HF weights into models/finetuned_hf (no Git LFS required).
The repo may only contain a Git LFS pointer for model.safetensors (~134 bytes).
This script saves a compatible DistilBERT toxic classifier from Hugging Face Hub
so "Fine-tuned (local HF)" can load offline after one download.
Run from repo root:
uv sync --extra hf
uv run python scripts/materialize_finetuned_weights.py
"""
from __future__ import annotations
import sys
from pathlib import Path
# This script lives one directory below the repo root (scripts/<file>.py),
# so the second ancestor of the resolved file path is the project root.
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# Directory where the model weights and tokenizer files are written.
OUT_DIR = PROJECT_ROOT.joinpath("models", "finetuned_hf")

# Hub checkpoint to fetch: same architecture family as notebook 08
# (DistilBERT sequence classification).
HUB_ID = "martin-ha/toxic-comment-model"
def main() -> int:
    """Ensure real model weights exist in ``OUT_DIR``.

    If ``OUT_DIR / "model.safetensors"`` is already larger than 1 MB (i.e. it
    is not just a tiny Git LFS pointer), nothing is downloaded. Otherwise the
    model and tokenizer identified by ``HUB_ID`` are loaded through
    ``transformers`` and saved into ``OUT_DIR``; a placeholder
    ``model_metadata.json`` is written if none exists yet.

    Returns:
        0 when usable weights are present (already there, or just saved);
        1 when ``transformers`` cannot be imported, or when the weights file
        is still missing or too small after saving.

    Errors raised by ``transformers`` while loading or saving are not caught
    here and propagate to the caller.
    """
    weights = OUT_DIR / "model.safetensors"
    # Files at or below this size are treated as a Git LFS pointer stub,
    # not as real weights.
    min_weight_bytes = 1_000_000

    def weights_size() -> int:
        """Return the weights file size in bytes, or 0 if it is absent."""
        return weights.stat().st_size if weights.is_file() else 0

    existing_bytes = weights_size()
    if existing_bytes > min_weight_bytes:
        print(f"OK: {weights} already exists ({existing_bytes // 1_000_000} MB)")
        return 0

    # Imported lazily so a missing optional dependency yields a friendly hint
    # instead of a traceback at module import time.
    try:
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
    except ImportError:
        print("Install HF deps first: uv sync --extra hf", file=sys.stderr)
        return 1

    print(f"Downloading {HUB_ID} into {OUT_DIR} …")
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    model = AutoModelForSequenceClassification.from_pretrained(HUB_ID)
    tokenizer = AutoTokenizer.from_pretrained(HUB_ID)
    model.save_pretrained(OUT_DIR)
    tokenizer.save_pretrained(OUT_DIR)

    # Verify the expected file really landed on disk. Previously a missing
    # file was reported as "Done ... (0 MB)" with exit code 0.
    saved_bytes = weights_size()
    if saved_bytes <= min_weight_bytes:
        print(
            f"ERROR: {weights} is missing or too small after save "
            f"({saved_bytes} bytes). The installed transformers version may "
            "have written a different weights format.",
            file=sys.stderr,
        )
        return 1

    # Only write placeholder metadata once the weights are known to be good;
    # never overwrite metadata that is already there.
    meta = OUT_DIR / "model_metadata.json"
    if not meta.exists():
        meta.write_text(
            '{"model_name":"DistilBERT (materialized from Hub)","note":"Run notebook 08 to replace with team weights"}\n',
            encoding="utf-8",
        )

    print(f"Done. {weights} ({saved_bytes // 1_000_000} MB)")
    return 0
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == "__main__":
    sys.exit(main())
|