"""
Upload trained artifacts to Hugging Face Hub.

This repo uses local-path inference. The upload is intended so you can later
download these directories into the same folder layout and run inference.
"""
|
|
| from __future__ import annotations |
|
|
| import argparse |
| import os |
| import shutil |
| import sys |
| import tempfile |
| import time |
| from datetime import datetime, timezone |
| from pathlib import Path |
|
|
# Files at or above this size (100 MiB) are routed through the more robust
# upload_large_folder path instead of a plain upload_file/upload_folder call.
LARGE_FILE_UPLOAD_THRESHOLD_BYTES = 100 * 1024 * 1024
|
|
|
|
| def _parse_args() -> argparse.Namespace: |
| parser = argparse.ArgumentParser(description="Upload trained intent/IAB artifacts to Hugging Face Hub.") |
| parser.add_argument( |
| "--repo-id", |
| required=True, |
| help="HF repo id, e.g. 'yourname/admesh-intent-iab-v1'.", |
| ) |
| parser.add_argument( |
| "--token", |
| default=os.environ.get("HF_TOKEN"), |
| help="HF token. If omitted, uses env HF_TOKEN.", |
| ) |
| parser.add_argument( |
| "--private", |
| action="store_true", |
| help="Create the repo as private.", |
| ) |
| parser.add_argument( |
| "--include-multitask", |
| action="store_true", |
| help="Upload multitask intent model output directory.", |
| ) |
| parser.add_argument( |
| "--include-iab", |
| action="store_true", |
| help="Upload IAB classifier model output directory.", |
| ) |
| parser.add_argument( |
| "--include-calibration", |
| action="store_true", |
| help="Upload artifacts/calibration directory.", |
| ) |
| parser.add_argument( |
| "--include-hf-readme", |
| action="store_true", |
| help="Upload a Hugging Face model card file as README.md in the Hub repo root.", |
| ) |
| parser.add_argument( |
| "--include-serving-code", |
| action="store_true", |
| help="Upload core runtime Python/code files required for Hub trust_remote_code inference.", |
| ) |
| parser.add_argument( |
| "--include-root-checkpoint", |
| action="store_true", |
| help="Upload root-level compatibility checkpoint/tokenizer files used by transformers.pipeline loader.", |
| ) |
| parser.add_argument( |
| "--include-all", |
| action="store_true", |
| help=( |
| "Upload everything needed for end-to-end Hub usage: multitask + iab + calibration + " |
| "HF README + serving code + root checkpoint/tokenizer files." |
| ), |
| ) |
| parser.add_argument( |
| "--hf-readme-path", |
| default="HF_MODEL_CARD.md", |
| help="Local path to the HF model card markdown to upload as README.md (relative to repo root).", |
| ) |
| parser.add_argument( |
| "--multitask-dir", |
| default="multitask_intent_model_output", |
| help="Path to multitask intent output directory (relative to this script's base).", |
| ) |
| parser.add_argument( |
| "--iab-dir", |
| default="iab_classifier_model_output", |
| help="Path to IAB classifier model output directory (relative to this script's base).", |
| ) |
| parser.add_argument( |
| "--calibration-dir", |
| default="artifacts/calibration", |
| help="Path to calibration artifacts directory (relative to this script's base).", |
| ) |
| parser.add_argument( |
| "--dry-run", |
| action="store_true", |
| help="Print what would be uploaded without actually uploading.", |
| ) |
| return parser.parse_args() |
|
|
|
|
| def _iter_local_files(path: Path) -> list[Path]: |
| if path.is_file(): |
| return [path] |
| return sorted(p for p in path.rglob("*") if p.is_file()) |
|
|
|
|
def _remote_file_paths(path_in_repo: str, local_path: Path) -> list[str]:
    """Map *local_path* onto the Hub paths it will occupy under *path_in_repo*."""
    if local_path.is_file():
        return [path_in_repo]
    # Directory: mirror the relative layout beneath the repo prefix.
    relative_names = (
        file_path.relative_to(local_path).as_posix()
        for file_path in _iter_local_files(local_path)
    )
    return [f"{path_in_repo}/{name}" for name in relative_names]
|
|
|
|
def _requires_large_upload(local_path: Path) -> bool:
    """Return True when any file at/under *local_path* meets the large-file threshold."""
    for file_path in _iter_local_files(local_path):
        if file_path.stat().st_size >= LARGE_FILE_UPLOAD_THRESHOLD_BYTES:
            return True
    return False
|
|
|
|
| def _upload_via_large_folder(api, repo_id: str, repo_path: str, local_path: Path) -> None: |
| with tempfile.TemporaryDirectory(prefix="hf_large_upload_") as tmp_dir: |
| staging_root = Path(tmp_dir) |
| staged_target = staging_root / repo_path |
| staged_target.parent.mkdir(parents=True, exist_ok=True) |
| if local_path.is_file(): |
| shutil.copy2(local_path, staged_target) |
| else: |
| shutil.copytree( |
| local_path, |
| staged_target, |
| ignore=shutil.ignore_patterns(".cache", "__pycache__"), |
| ) |
| |
| |
| shutil.rmtree(staged_target / ".cache", ignore_errors=True) |
| api.upload_large_folder( |
| repo_id=repo_id, |
| repo_type="model", |
| folder_path=str(staging_root), |
| print_report=False, |
| ) |
|
|
|
|
def _verify_remote_upload(api, repo_id: str, repo_path: str, local_path: Path) -> None:
    """Poll the Hub file listing until every expected remote path appears.

    Raises RuntimeError after the fourth unsuccessful attempt.
    """
    expected = set(_remote_file_paths(repo_path, local_path))
    max_attempts = 4
    for attempt in range(1, max_attempts + 1):
        remote_files = set(api.list_repo_files(repo_id=repo_id, repo_type="model"))
        missing = sorted(expected - remote_files)
        if not missing:
            return
        if attempt == max_attempts:
            raise RuntimeError(
                "Upload completed but the following remote files are still missing: "
                + ", ".join(missing[:20])
            )
        # Linear backoff: 2s, 4s, 6s while the Hub index catches up.
        time.sleep(2 * attempt)
|
|
|
|
# Runtime source files required for trust_remote_code inference from the Hub.
_SERVING_CODE_FILES = [
    "pipeline.py",
    "config.py",
    "config.json",
    "combined_inference.py",
    "model_runtime.py",
    "multitask_runtime.py",
    "multitask_model.py",
    "schemas.py",
    "inference_intent_type.py",
    "inference_subtype.py",
    "inference_decision_phase.py",
    "inference_iab_classifier.py",
    "iab_classifier.py",
    "iab_taxonomy.py",
]

# Root-level checkpoint/tokenizer files consumed by the transformers.pipeline loader.
_ROOT_CHECKPOINT_FILES = [
    "model.safetensors",
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json",
    "vocab.txt",
]


def _collect_targets(args: argparse.Namespace, repo_root: Path) -> list[tuple[str, Path]]:
    """Translate the include flags into (path_in_repo, local_path) upload pairs.

    Mutates *args* in place when --include-all is set so the individual
    include_* attributes reflect the effective selection.
    """
    if args.include_all:
        args.include_multitask = True
        args.include_iab = True
        args.include_calibration = True
        args.include_hf_readme = True
        args.include_serving_code = True
        args.include_root_checkpoint = True

    to_upload: list[tuple[str, Path]] = []
    # Note: the repo-side directory names are fixed even when the local dirs
    # are overridden, so downstream local-path inference keeps the same layout.
    if args.include_multitask:
        to_upload.append(("multitask_intent_model_output", (repo_root / args.multitask_dir).resolve()))
    if args.include_iab:
        to_upload.append(("iab_classifier_model_output", (repo_root / args.iab_dir).resolve()))
    if args.include_calibration:
        to_upload.append(("artifacts/calibration", (repo_root / args.calibration_dir).resolve()))
    if args.include_hf_readme:
        # The model card is renamed to README.md in the Hub repo root.
        to_upload.append(("README.md", (repo_root / args.hf_readme_path).resolve()))
    if args.include_serving_code:
        to_upload.extend((rel, (repo_root / rel).resolve()) for rel in _SERVING_CODE_FILES)
    if args.include_root_checkpoint:
        to_upload.extend((rel, (repo_root / rel).resolve()) for rel in _ROOT_CHECKPOINT_FILES)
    return to_upload


def _upload_target(api, args: argparse.Namespace, repo_path: str, local_dir: Path) -> None:
    """Upload one (repo_path, local_dir) pair, picking the transfer mode by file size.

    Missing local paths are skipped with a warning; in --dry-run mode only the
    intended action is printed. After a real upload the remote listing is verified.
    """
    if not local_dir.exists():
        print(f"[SKIP] {repo_path}: local path does not exist: {local_dir}", file=sys.stderr)
        return
    if args.dry_run:
        print(f"[DRY] Would upload {local_dir} -> {args.repo_id}:{repo_path}")
        return
    step_start = time.perf_counter()
    mode = "large-folder" if _requires_large_upload(local_dir) else "standard"
    print(f"[UPLOAD] {local_dir} -> {args.repo_id}:{repo_path} ({mode})")
    if mode == "large-folder":
        _upload_via_large_folder(api, args.repo_id, repo_path, local_dir)
    elif local_dir.is_file():
        api.upload_file(
            repo_id=args.repo_id,
            repo_type="model",
            path_or_fileobj=str(local_dir),
            path_in_repo=repo_path,
        )
    else:
        api.upload_folder(
            repo_id=args.repo_id,
            repo_type="model",
            folder_path=str(local_dir),
            path_in_repo=repo_path,
        )
    _verify_remote_upload(api, args.repo_id, repo_path, local_dir)
    print(f"[DONE ] {repo_path} took {(time.perf_counter() - step_start):.2f}s")


def main() -> int:
    """Entry point: parse flags, resolve upload targets, and push them to the Hub.

    Returns a process exit code: 0 on success, 2 on usage/setup errors
    (missing token, nothing selected, missing huggingface_hub dependency).
    """
    started_at = time.perf_counter()
    started_wall = datetime.now(timezone.utc).isoformat()
    args = _parse_args()
    if not args.token:
        print("Missing HF token. Provide --token or set env HF_TOKEN.", file=sys.stderr)
        return 2

    # All configurable paths are resolved relative to the repository root
    # (the parent of the directory containing this script).
    repo_root = Path(__file__).resolve().parent.parent
    to_upload = _collect_targets(args, repo_root)
    if not to_upload:
        print(
            "Nothing to upload. Pass include flags (e.g. --include-all), or one/more of: "
            "--include-multitask --include-iab --include-calibration --include-hf-readme "
            "--include-serving-code --include-root-checkpoint.",
            file=sys.stderr,
        )
        return 2

    # Imported lazily so argument parsing and usage errors work even when the
    # optional huggingface_hub dependency is not installed.
    try:
        from huggingface_hub import HfApi
    except ModuleNotFoundError:
        print("Missing dependency: huggingface_hub. Install with: pip install huggingface_hub", file=sys.stderr)
        return 2

    api = HfApi(token=args.token)
    api.create_repo(repo_id=args.repo_id, repo_type="model", private=args.private, exist_ok=True)

    for repo_path, local_dir in to_upload:
        _upload_target(api, args, repo_path, local_dir)

    ended_wall = datetime.now(timezone.utc).isoformat()
    elapsed_s = time.perf_counter() - started_at
    print(f"Upload complete.\nstart: {started_wall}\nend: {ended_wall}\ntotal: {elapsed_s:.2f}s")
    return 0
|
|
|
|
if __name__ == "__main__":
    # sys.exit raises SystemExit with main()'s return value, same as before.
    sys.exit(main())
|
|
|
|