| |
| """Download real HF weights into models/finetuned_hf (no Git LFS required). |
| |
| The repo may only contain a Git LFS pointer for model.safetensors (~134 bytes). |
| This script saves a compatible DistilBERT toxic classifier from Hugging Face Hub |
| so "Fine-tuned (local HF)" can load offline after one download. |
| |
| Run from repo root: |
| uv sync --extra hf |
| uv run python scripts/materialize_finetuned_weights.py |
| """ |
|
|
| from __future__ import annotations |
|
|
| import sys |
| from pathlib import Path |
|
|
# Repository root: two levels up from this file's resolved path
# (the script is expected to live one directory below the root).
PROJECT_ROOT = Path(__file__).resolve().parents[1]
# Destination directory for the saved model and tokenizer files.
OUT_DIR = PROJECT_ROOT.joinpath("models", "finetuned_hf")

# Hugging Face Hub repository id the weights are fetched from.
HUB_ID = "martin-ha/toxic-comment-model"
|
|
|
|
def main() -> int:
    """Make sure real model weights exist in ``OUT_DIR``, downloading if needed.

    If ``OUT_DIR / "model.safetensors"`` is already larger than 1 MB, nothing
    is downloaded. Otherwise the model and tokenizer identified by ``HUB_ID``
    are loaded with ``transformers`` and saved into ``OUT_DIR``. A stub
    ``model_metadata.json`` is written only when that file does not exist yet.

    Returns:
        0 when a usable weights file is present (already there or freshly
        saved); 1 when ``transformers`` cannot be imported or when no usable
        ``model.safetensors`` exists after saving.
    """
    weights = OUT_DIR / "model.safetensors"
    # A Git LFS pointer file is only ~134 bytes, so anything at or below this
    # size is treated as "not real weights".
    min_real_bytes = 1_000_000

    def _weights_size() -> int:
        # Size in bytes of the weights file, or 0 when it is absent.
        return weights.stat().st_size if weights.is_file() else 0

    size = _weights_size()
    if size > min_real_bytes:
        print(f"OK: {weights} already exists ({size // 1_000_000} MB)")
        return 0

    # Imported lazily so a missing optional dependency yields a friendly hint
    # instead of a traceback at module import time.
    try:
        from transformers import AutoModelForSequenceClassification, AutoTokenizer
    except ImportError:
        print("Install HF deps first: uv sync --extra hf", file=sys.stderr)
        return 1

    print(f"Downloading {HUB_ID} into {OUT_DIR} …")
    OUT_DIR.mkdir(parents=True, exist_ok=True)
    model = AutoModelForSequenceClassification.from_pretrained(HUB_ID)
    tokenizer = AutoTokenizer.from_pretrained(HUB_ID)
    model.save_pretrained(OUT_DIR)
    tokenizer.save_pretrained(OUT_DIR)

    # Never overwrite existing metadata; only provide a placeholder.
    meta = OUT_DIR / "model_metadata.json"
    if not meta.exists():
        meta.write_text(
            '{"model_name":"DistilBERT (materialized from Hub)","note":"Run notebook 08 to replace with team weights"}\n',
            encoding="utf-8",
        )

    # Verify the save actually produced real safetensors weights. Without this
    # check the script would report success with "0 MB" when the file is
    # missing (NOTE(review): presumably possible if the installed transformers
    # version serializes to a different file name -- confirm).
    size = _weights_size()
    if size <= min_real_bytes:
        print(
            f"ERROR: {weights} is missing or too small ({size} bytes) "
            f"after saving {HUB_ID}",
            file=sys.stderr,
        )
        return 1

    print(f"Done. {weights} ({size // 1_000_000} MB)")
    return 0
|
|
|
|
# Script entry point: propagate main()'s return value as the process exit code.
if __name__ == "__main__":
    sys.exit(main())
|
|