#!/usr/bin/env python3 """ Upload trained artifacts to Hugging Face Hub. This repo uses local-path inference. The upload is intended so you can later download these directories into the same folder layout and run inference. """ from __future__ import annotations import argparse import os import shutil import sys import tempfile import time from datetime import datetime, timezone from pathlib import Path LARGE_FILE_UPLOAD_THRESHOLD_BYTES = 100 * 1024 * 1024 def _parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser(description="Upload trained intent/IAB artifacts to Hugging Face Hub.") parser.add_argument( "--repo-id", required=True, help="HF repo id, e.g. 'yourname/admesh-intent-iab-v1'.", ) parser.add_argument( "--token", default=os.environ.get("HF_TOKEN"), help="HF token. If omitted, uses env HF_TOKEN.", ) parser.add_argument( "--private", action="store_true", help="Create the repo as private.", ) parser.add_argument( "--include-multitask", action="store_true", help="Upload multitask intent model output directory.", ) parser.add_argument( "--include-iab", action="store_true", help="Upload IAB classifier model output directory.", ) parser.add_argument( "--include-calibration", action="store_true", help="Upload artifacts/calibration directory.", ) parser.add_argument( "--include-hf-readme", action="store_true", help="Upload a Hugging Face model card file as README.md in the Hub repo root.", ) parser.add_argument( "--include-serving-code", action="store_true", help="Upload core runtime Python/code files required for Hub trust_remote_code inference.", ) parser.add_argument( "--include-root-checkpoint", action="store_true", help="Upload root-level compatibility checkpoint/tokenizer files used by transformers.pipeline loader.", ) parser.add_argument( "--include-all", action="store_true", help=( "Upload everything needed for end-to-end Hub usage: multitask + iab + calibration + " "HF README + serving code + root checkpoint/tokenizer files." ), ) parser.add_argument( "--hf-readme-path", default="HF_MODEL_CARD.md", help="Local path to the HF model card markdown to upload as README.md (relative to repo root).", ) parser.add_argument( "--multitask-dir", default="multitask_intent_model_output", help="Path to multitask intent output directory (relative to this script's base).", ) parser.add_argument( "--iab-dir", default="iab_classifier_model_output", help="Path to IAB classifier model output directory (relative to this script's base).", ) parser.add_argument( "--calibration-dir", default="artifacts/calibration", help="Path to calibration artifacts directory (relative to this script's base).", ) parser.add_argument( "--dry-run", action="store_true", help="Print what would be uploaded without actually uploading.", ) return parser.parse_args() def _iter_local_files(path: Path) -> list[Path]: if path.is_file(): return [path] return sorted(p for p in path.rglob("*") if p.is_file()) def _remote_file_paths(path_in_repo: str, local_path: Path) -> list[str]: if local_path.is_file(): return [path_in_repo] return [ f"{path_in_repo}/{file_path.relative_to(local_path).as_posix()}" for file_path in _iter_local_files(local_path) ] def _requires_large_upload(local_path: Path) -> bool: return any(file_path.stat().st_size >= LARGE_FILE_UPLOAD_THRESHOLD_BYTES for file_path in _iter_local_files(local_path)) def _upload_via_large_folder(api, repo_id: str, repo_path: str, local_path: Path) -> None: with tempfile.TemporaryDirectory(prefix="hf_large_upload_") as tmp_dir: staging_root = Path(tmp_dir) staged_target = staging_root / repo_path staged_target.parent.mkdir(parents=True, exist_ok=True) if local_path.is_file(): shutil.copy2(local_path, staged_target) else: shutil.copytree( local_path, staged_target, ignore=shutil.ignore_patterns(".cache", "__pycache__"), ) # Ensure resumable-upload metadata from previous local attempts does not # get carried into the fresh staging directory. shutil.rmtree(staged_target / ".cache", ignore_errors=True) api.upload_large_folder( repo_id=repo_id, repo_type="model", folder_path=str(staging_root), print_report=False, ) def _verify_remote_upload(api, repo_id: str, repo_path: str, local_path: Path) -> None: expected = set(_remote_file_paths(repo_path, local_path)) for attempt in range(4): files = set(api.list_repo_files(repo_id=repo_id, repo_type="model")) missing = sorted(expected - files) if not missing: return if attempt == 3: raise RuntimeError( "Upload completed but the following remote files are still missing: " + ", ".join(missing[:20]) ) time.sleep(2 * (attempt + 1)) def main() -> int: started_at = time.perf_counter() started_wall = datetime.now(timezone.utc).isoformat() args = _parse_args() if not args.token: print("Missing HF token. Provide --token or set env HF_TOKEN.", file=sys.stderr) return 2 repo_root = Path(__file__).resolve().parent.parent multitask_dir = (repo_root / args.multitask_dir).resolve() iab_dir = (repo_root / args.iab_dir).resolve() calibration_dir = (repo_root / args.calibration_dir).resolve() hf_readme_path = (repo_root / args.hf_readme_path).resolve() if args.include_all: args.include_multitask = True args.include_iab = True args.include_calibration = True args.include_hf_readme = True args.include_serving_code = True args.include_root_checkpoint = True to_upload: list[tuple[str, Path]] = [] if args.include_multitask: to_upload.append(("multitask_intent_model_output", multitask_dir)) if args.include_iab: to_upload.append(("iab_classifier_model_output", iab_dir)) if args.include_calibration: to_upload.append(("artifacts/calibration", calibration_dir)) if args.include_hf_readme: to_upload.append(("README.md", hf_readme_path)) if args.include_serving_code: # Files needed by trust_remote_code execution path. for rel in [ "pipeline.py", "config.py", "config.json", "combined_inference.py", "model_runtime.py", "multitask_runtime.py", "multitask_model.py", "schemas.py", "inference_intent_type.py", "inference_subtype.py", "inference_decision_phase.py", "inference_iab_classifier.py", "iab_classifier.py", "iab_taxonomy.py", ]: to_upload.append((rel, (repo_root / rel).resolve())) if args.include_root_checkpoint: for rel in [ "model.safetensors", "tokenizer.json", "tokenizer_config.json", "special_tokens_map.json", "vocab.txt", ]: to_upload.append((rel, (repo_root / rel).resolve())) if not to_upload: print( "Nothing to upload. Pass include flags (e.g. --include-all), or one/more of: " "--include-multitask --include-iab --include-calibration --include-hf-readme " "--include-serving-code --include-root-checkpoint.", file=sys.stderr, ) return 2 # Import lazily so `--dry-run` works without extra deps. try: from huggingface_hub import HfApi except ModuleNotFoundError: print("Missing dependency: huggingface_hub. Install with: pip install huggingface_hub", file=sys.stderr) return 2 api = HfApi(token=args.token) api.create_repo(repo_id=args.repo_id, repo_type="model", private=args.private, exist_ok=True) for repo_path, local_dir in to_upload: if not local_dir.exists(): print(f"[SKIP] {repo_path}: local path does not exist: {local_dir}", file=sys.stderr) continue if args.dry_run: print(f"[DRY] Would upload {local_dir} -> {args.repo_id}:{repo_path}") continue step_start = time.perf_counter() mode = "large-folder" if _requires_large_upload(local_dir) else "standard" print(f"[UPLOAD] {local_dir} -> {args.repo_id}:{repo_path} ({mode})") if mode == "large-folder": _upload_via_large_folder(api, args.repo_id, repo_path, local_dir) elif local_dir.is_file(): api.upload_file( repo_id=args.repo_id, repo_type="model", path_or_fileobj=str(local_dir), path_in_repo=repo_path, ) else: api.upload_folder( repo_id=args.repo_id, repo_type="model", folder_path=str(local_dir), path_in_repo=repo_path, ) _verify_remote_upload(api, args.repo_id, repo_path, local_dir) print(f"[DONE ] {repo_path} took {(time.perf_counter() - step_start):.2f}s") ended_wall = datetime.now(timezone.utc).isoformat() elapsed_s = time.perf_counter() - started_at print(f"Upload complete.\nstart: {started_wall}\nend: {ended_wall}\ntotal: {elapsed_s:.2f}s") return 0 if __name__ == "__main__": raise SystemExit(main())