# File size: 2,651 Bytes
from __future__ import annotations

import argparse
import sys
from pathlib import Path

# Directory two levels above this file. It is pushed to the front of sys.path
# before the `config` import that follows, which is why that import carries
# `noqa: E402`. Presumably this is the project root that holds `config` -- confirm.
BASE_DIR = Path(__file__).resolve().parent.parent
if str(BASE_DIR) not in sys.path:
    sys.path.insert(0, str(BASE_DIR))

from config import (  # noqa: E402
    HEAD_CONFIGS,
    IAB_CLASSIFIER_MODEL_DIR,
    MULTITASK_INTENT_MODEL_DIR,
    _looks_like_local_hf_model_dir,
)


def verify_production_artifacts() -> tuple[bool, list[tuple[str, bool, str]]]:
    """Check that the artifacts used for production inference are present.

    Returns:
        A pair ``(all_ok, rows)``. ``rows`` holds one ``(label, ok, path_str)``
        entry per check, in the order the checks run; ``all_ok`` is the
        conjunction of every entry's ``ok`` flag.
    """
    multitask_dir = MULTITASK_INTENT_MODEL_DIR

    # Core multitask files: each of these must exist on its own.
    required_files = {
        "multitask weights": multitask_dir / "multitask_model.pt",
        "multitask metadata": multitask_dir / "metadata.json",
    }
    report: list[tuple[str, bool, str]] = [
        (label, target.exists(), str(target))
        for label, target in required_files.items()
    ]

    # A saved tokenizer usually leaves tokenizer.json / vocab.txt /
    # tokenizer_config.json behind, but not necessarily a top-level
    # `config.json` (that would be the model config), so finding any one of
    # the candidates below counts as "tokenizer present".
    tokenizer_candidates = (
        "tokenizer.json",
        "vocab.txt",
        "tokenizer_config.json",
        "special_tokens_map.json",
    )
    has_tokenizer = False
    for candidate in tokenizer_candidates:
        if (multitask_dir / candidate).exists():
            has_tokenizer = True
            break
    report.append(("multitask tokenizer files", has_tokenizer, str(multitask_dir)))

    # The IAB classifier directory is judged by the config helper rather than
    # by probing individual files here.
    iab_location = IAB_CLASSIFIER_MODEL_DIR
    report.append(
        (
            "IAB classifier (HF layout)",
            _looks_like_local_hf_model_dir(iab_location),
            str(iab_location),
        )
    )

    # One calibration file per configured head.
    for slug, head_cfg in HEAD_CONFIGS.items():
        calibration_file = head_cfg.calibration_path
        report.append(
            (f"calibration {slug}", calibration_file.exists(), str(calibration_file))
        )

    # Fold the per-row flags into the overall verdict, in row order.
    all_present = True
    for _label, passed, _location in report:
        all_present = all_present and passed

    return all_present, report


def main(argv: list[str] | None = None) -> int:
    """Print a presence report for the production artifacts.

    Args:
        argv: Command-line arguments to parse. ``None`` (the default) lets
            argparse read them from ``sys.argv``, which is what the script
            entry point relies on.

    Returns:
        ``0`` when every check passed, ``1`` when at least one failed.
    """
    parser = argparse.ArgumentParser(
        description="Verify multitask, IAB, and calibration artifacts exist for production inference."
    )
    # No options are defined, so the parsed namespace is not needed; parsing is
    # still done so that -h/--help works and unexpected arguments are rejected.
    parser.parse_args(argv)

    all_ok, rows = verify_production_artifacts()
    for label, row_ok, path in rows:
        status = "OK " if row_ok else "MISS"
        print(f"[{status}] {label}: {path}")

    if not all_ok:
        print(
            "\nFix: run training/run_full_training_pipeline.py (or train_multitask_intent, train_iab, "
            "calibrate_confidence for each head).",
            flush=True,
        )
        return 1

    print("\nAll production artifacts present.")
    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    sys.exit(main())