# Hugging Face Spaces: Kalana001 / SinCode (Sleeping)
# File size: 3,765 Bytes
# 1fed70a

"""
Prepare local clean model snapshots and experiment copies.

Workflow:
1) Download/save a clean Hugging Face model to a stable local path once.
2) Create a copy of that clean local model for each experiment run.

This prevents accidental overwrites of your base model and keeps
fine-tuning runs isolated.

Examples:
    python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala
    python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala --run-name exp-lr5e5
"""

from __future__ import annotations

import argparse
import shutil
import sys
from datetime import datetime
from pathlib import Path

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Directory two levels above this file (presumably the repository root, given
# the `seq2seq/prepare_experiment_model.py` path used in the module examples).
ROOT = Path(__file__).parent.parent
# Put ROOT on the import path before importing from `core`, so the import
# below can resolve when this file is launched directly as a script.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from core.constants import DEFAULT_MBART_MODEL

# Base directory for clean model snapshots (one sub-folder per model slug).
CLEAN_ROOT = ROOT / "seq2seq" / "clean_models"
# Base directory for per-run experiment folders (model slug / run name).
EXPERIMENT_ROOT = ROOT / "seq2seq" / "experiments"


def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse ``sys.argv``.

    Returns:
        Namespace with ``model_id``, ``clean_dir``, ``run_name``,
        ``force_refresh_clean`` and ``allow_cpu`` attributes.
    """
    cli = argparse.ArgumentParser(
        description="Download a clean model once and create an isolated experiment copy (GPU required)."
    )

    # (flag, add_argument keyword arguments), registered in display order.
    option_specs = (
        (
            "--model-id",
            {
                "default": DEFAULT_MBART_MODEL,
                "help": "Hugging Face model ID to prepare.",
            },
        ),
        (
            "--clean-dir",
            {
                "type": Path,
                "default": None,
                "help": "Optional custom clean-model directory.",
            },
        ),
        (
            "--run-name",
            {
                "default": None,
                "help": "Optional experiment run folder name. Defaults to timestamp.",
            },
        ),
        (
            "--force-refresh-clean",
            {
                "action": "store_true",
                "help": "Re-download and overwrite the local clean model snapshot.",
            },
        ),
        (
            "--allow-cpu",
            {
                "action": "store_true",
                "help": "Allow running without CUDA. Default is GPU-only to avoid workstation slowdown.",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    return cli.parse_args()


def safe_name(model_id: str) -> str:
    """Return *model_id* with every ``/`` replaced by ``--``.

    The result is used as a single folder name for the model.
    """
    segments = model_id.split("/")
    return "--".join(segments)


def _save_clean_snapshot(model_id: str, clean_dir: Path) -> None:
    """Download *model_id* and publish it at *clean_dir* all-or-nothing.

    The tokenizer and model are first written to a sibling ``.partial``
    staging directory and only renamed to *clean_dir* once both saves have
    finished. An interrupted download therefore never leaves a half-written
    directory at *clean_dir* that a later run would mistake for a valid
    snapshot, and an existing snapshot is only replaced after its successor
    is fully on disk.

    Args:
        model_id: Hugging Face model ID to download.
        clean_dir: Final location of the clean snapshot.

    Raises:
        Whatever the download, save, or filesystem calls raise; the staging
        directory is removed before the exception propagates.
    """
    staging_dir = clean_dir.parent / f"{clean_dir.name}.partial"
    # A staging directory can only be a leftover from an earlier aborted run.
    if staging_dir.exists():
        shutil.rmtree(staging_dir)

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

    staging_dir.mkdir(parents=True, exist_ok=True)
    try:
        tokenizer.save_pretrained(staging_dir)
        model.save_pretrained(staging_dir)

        # Drop the previous snapshot only now that its replacement is complete.
        if clean_dir.exists():
            print(f"Removing existing clean model at: {clean_dir}")
            shutil.rmtree(clean_dir)
        # Same parent directory, so this is a rename on one filesystem.
        staging_dir.rename(clean_dir)
    except BaseException:
        # Includes KeyboardInterrupt: never keep a partial snapshot around.
        shutil.rmtree(staging_dir, ignore_errors=True)
        raise


def main() -> None:
    """Ensure a clean local snapshot exists, then copy it for a new run.

    Steps:
        1. Refuse to run without CUDA unless ``--allow-cpu`` is given.
        2. Download the clean snapshot if it is missing or a refresh was
           requested; otherwise reuse the one on disk.
        3. Copy the clean snapshot into a fresh
           ``EXPERIMENT_ROOT/<model slug>/<run name>/model`` directory.

    Raises:
        RuntimeError: No CUDA device is available and ``--allow-cpu`` was
            not passed.
        FileExistsError: The experiment model directory already exists.
    """
    args = parse_args()

    if not torch.cuda.is_available() and not args.allow_cpu:
        raise RuntimeError(
            "CUDA GPU is required by default. "
            "No GPU detected. Use --allow-cpu only if you intentionally want CPU mode."
        )

    model_slug = safe_name(args.model_id)
    clean_dir = args.clean_dir or (CLEAN_ROOT / model_slug)

    if args.force_refresh_clean or not clean_dir.exists():
        print(f"Downloading clean model: {args.model_id}")
        _save_clean_snapshot(args.model_id, clean_dir)
        print(f"Saved clean model to: {clean_dir}")
    else:
        print(f"Using existing clean model: {clean_dir}")

    run_name = args.run_name or datetime.now().strftime("run-%Y%m%d-%H%M%S")
    exp_dir = EXPERIMENT_ROOT / model_slug / run_name
    exp_model_dir = exp_dir / "model"

    if exp_model_dir.exists():
        raise FileExistsError(
            f"Experiment model directory already exists: {exp_model_dir}. "
            "Use a different --run-name."
        )

    exp_dir.mkdir(parents=True, exist_ok=True)
    try:
        shutil.copytree(clean_dir, exp_model_dir)
    except BaseException:
        # The target did not exist a moment ago, so anything there now is our
        # own incomplete copy; remove it so the run name can be reused.
        shutil.rmtree(exp_model_dir, ignore_errors=True)
        raise

    print("\nExperiment ready")
    print(f"  clean_model : {clean_dir}")
    print(f"  experiment  : {exp_dir}")
    print(f"  model_copy  : {exp_model_dir}")


# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()