File size: 3,765 Bytes
1fed70a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Prepare local clean model snapshots and experiment copies.

Workflow:
1) Download/save a clean Hugging Face model to a stable local path once.
2) Create a copy of that clean local model for each experiment run.

This prevents accidental overwrites of your base model and keeps
fine-tuning runs isolated.

Examples:
    python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala
    python seq2seq/prepare_experiment_model.py --model-id Kalana001/mbart50-large-singlish-sinhala --run-name exp-lr5e5
"""

from __future__ import annotations

import argparse
import shutil
import sys
from datetime import datetime
from pathlib import Path

import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Directory two levels above this file (the parent of the folder that contains
# this script).
ROOT = Path(__file__).parent.parent
# Make ROOT importable before the project import below; presumably the `core`
# package lives directly under ROOT -- confirm against the repository layout.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# NOTE: this import must stay after the sys.path adjustment above.
from core.constants import DEFAULT_MBART_MODEL

# Default parent directory for clean model snapshots (main() appends the model slug).
CLEAN_ROOT = ROOT / "seq2seq" / "clean_models"
# Parent directory for experiment copies (main() appends the model slug and run name).
EXPERIMENT_ROOT = ROOT / "seq2seq" / "experiments"


def parse_args() -> argparse.Namespace:
    """Build the command-line interface and parse the process arguments.

    Options:
        --model-id: Hugging Face model ID; defaults to ``DEFAULT_MBART_MODEL``.
        --clean-dir: optional ``Path`` overriding the clean-model directory.
        --run-name: optional experiment run folder name.
        --force-refresh-clean: flag requesting a fresh clean snapshot.
        --allow-cpu: flag permitting execution without CUDA.

    Returns:
        The populated ``argparse.Namespace``.
    """
    cli = argparse.ArgumentParser(
        description="Download a clean model once and create an isolated experiment copy (GPU required)."
    )

    # (flag, add_argument keyword arguments), registered in declaration order.
    option_specs = (
        (
            "--model-id",
            {
                "default": DEFAULT_MBART_MODEL,
                "help": "Hugging Face model ID to prepare.",
            },
        ),
        (
            "--clean-dir",
            {
                "type": Path,
                "default": None,
                "help": "Optional custom clean-model directory.",
            },
        ),
        (
            "--run-name",
            {
                "default": None,
                "help": "Optional experiment run folder name. Defaults to timestamp.",
            },
        ),
        (
            "--force-refresh-clean",
            {
                "action": "store_true",
                "help": "Re-download and overwrite the local clean model snapshot.",
            },
        ),
        (
            "--allow-cpu",
            {
                "action": "store_true",
                "help": "Allow running without CUDA. Default is GPU-only to avoid workstation slowdown.",
            },
        ),
    )
    for flag, options in option_specs:
        cli.add_argument(flag, **options)

    return cli.parse_args()


def safe_name(model_id: str) -> str:
    """Return *model_id* with every ``/`` replaced by ``--``.

    The result is used as a single directory name for the model.
    """
    segments = model_id.split("/")
    return "--".join(segments)


def _download_clean_snapshot(model_id: str, clean_dir: Path) -> None:
    """Download *model_id* and publish it at *clean_dir* only once fully saved.

    The tokenizer and model are first written to a sibling staging directory
    and then renamed into place, so an interrupted or failed save never leaves
    a half-written directory at *clean_dir*. An existing snapshot at
    *clean_dir* is removed only after the replacement has been saved
    successfully.

    Args:
        model_id: Hugging Face model ID to download.
        clean_dir: Final location of the clean snapshot.

    Raises:
        Whatever the download/save calls raise; the staging directory is
        removed before the error propagates.
    """
    # Sibling of clean_dir so the final rename stays on the same filesystem.
    staging_dir = clean_dir.parent / f"{clean_dir.name}.partial"
    if staging_dir.exists():
        # Left over from an earlier interrupted run.
        shutil.rmtree(staging_dir)

    print(f"Downloading clean model: {model_id}")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

    try:
        staging_dir.mkdir(parents=True, exist_ok=True)
        tokenizer.save_pretrained(staging_dir)
        model.save_pretrained(staging_dir)
    except BaseException:
        # Also covers Ctrl-C: never leave a partial snapshot behind.
        shutil.rmtree(staging_dir, ignore_errors=True)
        raise

    if clean_dir.exists():
        print(f"Removing existing clean model at: {clean_dir}")
        shutil.rmtree(clean_dir)
    staging_dir.rename(clean_dir)
    print(f"Saved clean model to: {clean_dir}")


def main() -> None:
    """Ensure a clean local snapshot exists, then copy it into a new run folder.

    Steps:
        1. Refuse to run without CUDA unless ``--allow-cpu`` was given.
        2. Download the model into the clean directory when it is missing or
           when ``--force-refresh-clean`` was given; otherwise reuse it.
        3. Copy the clean snapshot to
           ``EXPERIMENT_ROOT / <model slug> / <run name> / "model"``.

    Raises:
        RuntimeError: No CUDA device is available and ``--allow-cpu`` is unset.
        FileExistsError: The experiment model directory already exists.
    """
    args = parse_args()

    if not torch.cuda.is_available() and not args.allow_cpu:
        raise RuntimeError(
            "CUDA GPU is required by default. "
            "No GPU detected. Use --allow-cpu only if you intentionally want CPU mode."
        )

    model_slug = safe_name(args.model_id)
    clean_dir = args.clean_dir or (CLEAN_ROOT / model_slug)

    if args.force_refresh_clean or not clean_dir.exists():
        _download_clean_snapshot(args.model_id, clean_dir)
    else:
        print(f"Using existing clean model: {clean_dir}")

    run_name = args.run_name or datetime.now().strftime("run-%Y%m%d-%H%M%S")
    exp_dir = EXPERIMENT_ROOT / model_slug / run_name
    exp_model_dir = exp_dir / "model"

    if exp_model_dir.exists():
        raise FileExistsError(
            f"Experiment model directory already exists: {exp_model_dir}. "
            "Use a different --run-name."
        )

    exp_dir.mkdir(parents=True, exist_ok=True)
    try:
        shutil.copytree(clean_dir, exp_model_dir)
    except BaseException:
        # A partial copy would block re-using this run name and could be
        # mistaken for a complete model, so remove it before re-raising.
        shutil.rmtree(exp_model_dir, ignore_errors=True)
        raise

    print("\nExperiment ready")
    print(f"  clean_model : {clean_dir}")
    print(f"  experiment  : {exp_dir}")
    print(f"  model_copy  : {exp_model_dir}")


# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()