"""
Prepare local clean model snapshots and experiment copies.
Workflow:
1) Download/save a clean Hugging Face model to a stable local path once.
2) Create a copy of that clean local model for each experiment run.
This prevents accidental overwrites of your base model and keeps
fine-tuning runs isolated.
Examples:
python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala
python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala --run-name exp-lr5e5
"""
from __future__ import annotations
import argparse
import shutil
import sys
from datetime import datetime
from pathlib import Path
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Repository root: two levels up from this file (seq2seq/<script> -> repo root).
ROOT = Path(__file__).parent.parent
# Make the repo root importable so `core.constants` resolves when this script
# is run directly rather than as an installed package module.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))
from core.constants import DEFAULT_MBART_MODEL
# Cache of pristine model snapshots, one folder per (slugified) model ID.
CLEAN_ROOT = ROOT / "seq2seq" / "clean_models"
# Root for per-run experiment copies of a clean snapshot.
EXPERIMENT_ROOT = ROOT / "seq2seq" / "experiments"
def parse_args() -> argparse.Namespace:
    """Parse command-line options for preparing clean and experiment models."""
    p = argparse.ArgumentParser(
        description="Download a clean model once and create an isolated experiment copy (GPU required)."
    )
    p.add_argument("--model-id", default=DEFAULT_MBART_MODEL,
                   help="Hugging Face model ID to prepare.")
    p.add_argument("--clean-dir", type=Path, default=None,
                   help="Optional custom clean-model directory.")
    p.add_argument("--run-name", default=None,
                   help="Optional experiment run folder name. Defaults to timestamp.")
    p.add_argument("--force-refresh-clean", action="store_true",
                   help="Re-download and overwrite the local clean model snapshot.")
    p.add_argument("--allow-cpu", action="store_true",
                   help="Allow running without CUDA. Default is GPU-only to avoid workstation slowdown.")
    return p.parse_args()
def safe_name(model_id: str) -> str:
    """Turn a Hugging Face model ID into a filesystem-safe folder name.

    Every "/" separator becomes "--", e.g. "org/model" -> "org--model".
    """
    return "--".join(model_id.split("/"))
def _ensure_clean_snapshot(model_id: str, clean_dir: Path, force_refresh: bool) -> None:
    """Ensure a complete clean snapshot of *model_id* exists at *clean_dir*.

    Downloads into a temporary sibling directory and renames it into place
    only after both tokenizer and weights are fully saved. The original code
    saved directly into ``clean_dir``, so an interrupted download left a
    half-written folder that every later run mistook for a complete snapshot.
    """
    if clean_dir.exists() and force_refresh:
        print(f"Removing existing clean model at: {clean_dir}")
        shutil.rmtree(clean_dir)
    if clean_dir.exists():
        print(f"Using existing clean model: {clean_dir}")
        return
    print(f"Downloading clean model: {model_id}")
    tmp_dir = clean_dir.with_name(clean_dir.name + ".tmp")
    if tmp_dir.exists():
        # Leftover from a previous interrupted run; discard it.
        shutil.rmtree(tmp_dir)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
    tmp_dir.mkdir(parents=True, exist_ok=True)
    tokenizer.save_pretrained(tmp_dir)
    model.save_pretrained(tmp_dir)
    # Publish the finished snapshot in a single rename so clean_dir is
    # either absent or complete, never partial.
    tmp_dir.rename(clean_dir)
    print(f"Saved clean model to: {clean_dir}")


def main() -> None:
    """Prepare a clean model snapshot and an isolated experiment copy.

    Raises:
        RuntimeError: if no CUDA device is available and --allow-cpu was
            not passed (guards against accidental CPU runs).
        FileExistsError: if the chosen experiment run directory already
            holds a model copy.
    """
    args = parse_args()
    if not torch.cuda.is_available() and not args.allow_cpu:
        raise RuntimeError(
            "CUDA GPU is required by default. "
            "No GPU detected. Use --allow-cpu only if you intentionally want CPU mode."
        )
    model_slug = safe_name(args.model_id)
    clean_dir = args.clean_dir or (CLEAN_ROOT / model_slug)
    _ensure_clean_snapshot(args.model_id, clean_dir, args.force_refresh_clean)

    # One experiment folder per run; timestamp keeps unnamed runs unique.
    run_name = args.run_name or datetime.now().strftime("run-%Y%m%d-%H%M%S")
    exp_dir = EXPERIMENT_ROOT / model_slug / run_name
    exp_model_dir = exp_dir / "model"
    if exp_model_dir.exists():
        raise FileExistsError(
            f"Experiment model directory already exists: {exp_model_dir}. "
            "Use a different --run-name."
        )
    exp_dir.mkdir(parents=True, exist_ok=True)
    # copytree requires the destination not to exist, which the check above
    # guarantees; the copy keeps the clean snapshot untouched.
    shutil.copytree(clean_dir, exp_model_dir)
    print("\nExperiment ready")
    print(f"  clean_model : {clean_dir}")
    print(f"  experiment  : {exp_dir}")
    print(f"  model_copy  : {exp_model_dir}")
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()