mailSort / src /mailsort /prepare_model.py
enzofrnt's picture
feat(training): pipeline minimal train/test + artefacts HF
8153a62 unverified
from __future__ import annotations
import argparse
import shutil
from dataclasses import dataclass
from pathlib import Path
@dataclass(frozen=True)
class Config:
    """Immutable pair of directories the preparation step works with."""

    # Directory whose root-level files are read and copied from.
    outputs_dir: Path
    # Directory that is emptied and then filled with those files.
    model_dir: Path
def _parse_args() -> Config:
p = argparse.ArgumentParser(description="Prepare model/ folder from an outputs/ training directory.")
p.add_argument("--outputs-dir", default="outputs", help="Training output directory (from mailsort.train).")
p.add_argument("--model-dir", default="model", help="Target folder to commit/push to Hugging Face.")
a = p.parse_args()
return Config(outputs_dir=Path(a.outputs_dir), model_dir=Path(a.model_dir))
def main() -> int:
    """Rebuild the model folder from the root files of a training output folder.

    Everything is validated *before* the target folder is touched, so a bad
    invocation never destroys a previously prepared model folder.

    Steps:
      1. Check that outputs-dir is an existing directory.
      2. Refuse to run when model-dir is, or contains, outputs-dir (cleaning
         the target would delete the source).
      3. Check that the expected minimum files are present in outputs-dir.
      4. Create model-dir if needed and remove everything inside it.
      5. Copy every non-directory entry at the root of outputs-dir into
         model-dir (sub-directories such as trainer checkpoints are skipped).

    Returns:
        ``0`` on success.

    Raises:
        SystemExit: If outputs-dir is missing or not a directory, if the two
            directories overlap dangerously, or if an expected file is absent.
    """
    cfg = _parse_args()

    # is_dir() rather than exists(): a regular file would otherwise pass the
    # check and blow up later in iterdir().
    if not cfg.outputs_dir.is_dir():
        raise SystemExit(f"outputs-dir not found or not a directory: {cfg.outputs_dir}")

    # Cleaning the target removes everything below it; if the source lives
    # there (or is the very same folder) the training outputs would be lost.
    source_root = cfg.outputs_dir.resolve()
    target_root = cfg.model_dir.resolve()
    if source_root == target_root or target_root in source_root.parents:
        raise SystemExit(
            f"model-dir {cfg.model_dir} must not be or contain outputs-dir "
            f"{cfg.outputs_dir}: cleaning it would delete the training outputs"
        )

    # sanity: expected minimum files (all of them are required). Checked on
    # the source so a failure leaves the existing target intact.
    expected = [
        "config.json",
        "tokenizer.json",
        "tokenizer_config.json",
    ]
    missing = [name for name in expected if not (cfg.outputs_dir / name).is_file()]
    if missing:
        raise SystemExit(f"Missing expected files in {cfg.outputs_dir}: {missing}")

    # Copy only final artifacts (root files), ignore trainer checkpoints.
    artifacts = [entry for entry in cfg.outputs_dir.iterdir() if not entry.is_dir()]

    cfg.model_dir.mkdir(parents=True, exist_ok=True)

    # clean target (keep it explicit and predictable)
    # NOTE(review): this removes every entry, dot-entries included (e.g. a
    # .git folder or model card if the target is a repository checkout) —
    # confirm that is intended.
    for entry in cfg.model_dir.iterdir():
        # shutil.rmtree refuses symlinks to directories; unlink those instead
        # so only the link is removed, never the directory it points to.
        if entry.is_dir() and not entry.is_symlink():
            shutil.rmtree(entry)
        else:
            entry.unlink()

    for artifact in artifacts:
        shutil.copy2(artifact, cfg.model_dir / artifact.name)

    print(f"Prepared {cfg.model_dir} from {cfg.outputs_dir}")
    return 0
if __name__ == "__main__":
    # Run the CLI and hand its integer result to the interpreter as the
    # process exit status.
    exit_status = main()
    raise SystemExit(exit_status)