#!/usr/bin/env python3
"""
Upload trained artifacts to Hugging Face Hub.
This repo uses local-path inference. The upload is intended so you can later
download these directories into the same folder layout and run inference.
"""
from __future__ import annotations
import argparse
import os
import shutil
import sys
import tempfile
import time
from datetime import datetime, timezone
from pathlib import Path
# Files at or above this size (100 MiB) route the whole upload through the
# staged `upload_large_folder` path instead of the standard upload APIs.
LARGE_FILE_UPLOAD_THRESHOLD_BYTES = 100 * 1024 * 1024
def _parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(description="Upload trained intent/IAB artifacts to Hugging Face Hub.")
parser.add_argument(
"--repo-id",
required=True,
help="HF repo id, e.g. 'yourname/admesh-intent-iab-v1'.",
)
parser.add_argument(
"--token",
default=os.environ.get("HF_TOKEN"),
help="HF token. If omitted, uses env HF_TOKEN.",
)
parser.add_argument(
"--private",
action="store_true",
help="Create the repo as private.",
)
parser.add_argument(
"--include-multitask",
action="store_true",
help="Upload multitask intent model output directory.",
)
parser.add_argument(
"--include-iab",
action="store_true",
help="Upload IAB classifier model output directory.",
)
parser.add_argument(
"--include-calibration",
action="store_true",
help="Upload artifacts/calibration directory.",
)
parser.add_argument(
"--include-hf-readme",
action="store_true",
help="Upload a Hugging Face model card file as README.md in the Hub repo root.",
)
parser.add_argument(
"--include-serving-code",
action="store_true",
help="Upload core runtime Python/code files required for Hub trust_remote_code inference.",
)
parser.add_argument(
"--include-root-checkpoint",
action="store_true",
help="Upload root-level compatibility checkpoint/tokenizer files used by transformers.pipeline loader.",
)
parser.add_argument(
"--include-all",
action="store_true",
help=(
"Upload everything needed for end-to-end Hub usage: multitask + iab + calibration + "
"HF README + serving code + root checkpoint/tokenizer files."
),
)
parser.add_argument(
"--hf-readme-path",
default="HF_MODEL_CARD.md",
help="Local path to the HF model card markdown to upload as README.md (relative to repo root).",
)
parser.add_argument(
"--multitask-dir",
default="multitask_intent_model_output",
help="Path to multitask intent output directory (relative to this script's base).",
)
parser.add_argument(
"--iab-dir",
default="iab_classifier_model_output",
help="Path to IAB classifier model output directory (relative to this script's base).",
)
parser.add_argument(
"--calibration-dir",
default="artifacts/calibration",
help="Path to calibration artifacts directory (relative to this script's base).",
)
parser.add_argument(
"--dry-run",
action="store_true",
help="Print what would be uploaded without actually uploading.",
)
return parser.parse_args()
def _iter_local_files(path: Path) -> list[Path]:
if path.is_file():
return [path]
return sorted(p for p in path.rglob("*") if p.is_file())
def _remote_file_paths(path_in_repo: str, local_path: Path) -> list[str]:
if local_path.is_file():
return [path_in_repo]
return [
f"{path_in_repo}/{file_path.relative_to(local_path).as_posix()}"
for file_path in _iter_local_files(local_path)
]
def _requires_large_upload(local_path: Path) -> bool:
    """Report whether any file under *local_path* reaches the large-file threshold."""
    for file_path in _iter_local_files(local_path):
        if file_path.stat().st_size >= LARGE_FILE_UPLOAD_THRESHOLD_BYTES:
            return True
    return False
def _upload_via_large_folder(api, repo_id: str, repo_path: str, local_path: Path) -> None:
with tempfile.TemporaryDirectory(prefix="hf_large_upload_") as tmp_dir:
staging_root = Path(tmp_dir)
staged_target = staging_root / repo_path
staged_target.parent.mkdir(parents=True, exist_ok=True)
if local_path.is_file():
shutil.copy2(local_path, staged_target)
else:
shutil.copytree(
local_path,
staged_target,
ignore=shutil.ignore_patterns(".cache", "__pycache__"),
)
# Ensure resumable-upload metadata from previous local attempts does not
# get carried into the fresh staging directory.
shutil.rmtree(staged_target / ".cache", ignore_errors=True)
api.upload_large_folder(
repo_id=repo_id,
repo_type="model",
folder_path=str(staging_root),
print_report=False,
)
def _verify_remote_upload(api, repo_id: str, repo_path: str, local_path: Path) -> None:
    """Poll the Hub listing until every expected file is visible, or fail.

    The listing is re-checked with a growing backoff (2s, 4s, 6s between
    attempts) before giving up.

    Raises:
        RuntimeError: if expected files are still absent after the last attempt.
    """
    expected = set(_remote_file_paths(repo_path, local_path))
    max_attempts = 4
    for attempt in range(max_attempts):
        present = set(api.list_repo_files(repo_id=repo_id, repo_type="model"))
        still_missing = sorted(expected - present)
        if not still_missing:
            return
        if attempt + 1 == max_attempts:
            raise RuntimeError(
                "Upload completed but the following remote files are still missing: "
                + ", ".join(still_missing[:20])
            )
        time.sleep(2 * (attempt + 1))
def main() -> int:
    """Resolve artifact paths from CLI flags and upload them to the HF Hub.

    Returns:
        Process exit status: 0 on success, 2 on usage/configuration errors.
    """
    # Monotonic + wall-clock timers for the summary printed at the end.
    started_at = time.perf_counter()
    started_wall = datetime.now(timezone.utc).isoformat()
    args = _parse_args()
    if not args.token:
        print("Missing HF token. Provide --token or set env HF_TOKEN.", file=sys.stderr)
        return 2
    # All user-supplied paths are interpreted relative to the repo root,
    # i.e. the parent of the directory containing this script.
    repo_root = Path(__file__).resolve().parent.parent
    multitask_dir = (repo_root / args.multitask_dir).resolve()
    iab_dir = (repo_root / args.iab_dir).resolve()
    calibration_dir = (repo_root / args.calibration_dir).resolve()
    hf_readme_path = (repo_root / args.hf_readme_path).resolve()
    # --include-all is shorthand that switches on every individual include flag.
    if args.include_all:
        args.include_multitask = True
        args.include_iab = True
        args.include_calibration = True
        args.include_hf_readme = True
        args.include_serving_code = True
        args.include_root_checkpoint = True
    # Each entry maps a repo-relative destination to a local file or directory.
    to_upload: list[tuple[str, Path]] = []
    if args.include_multitask:
        to_upload.append(("multitask_intent_model_output", multitask_dir))
    if args.include_iab:
        to_upload.append(("iab_classifier_model_output", iab_dir))
    if args.include_calibration:
        to_upload.append(("artifacts/calibration", calibration_dir))
    if args.include_hf_readme:
        to_upload.append(("README.md", hf_readme_path))
    if args.include_serving_code:
        # Files needed by trust_remote_code execution path.
        for rel in [
            "pipeline.py",
            "config.py",
            "config.json",
            "combined_inference.py",
            "model_runtime.py",
            "multitask_runtime.py",
            "multitask_model.py",
            "schemas.py",
            "inference_intent_type.py",
            "inference_subtype.py",
            "inference_decision_phase.py",
            "inference_iab_classifier.py",
            "iab_classifier.py",
            "iab_taxonomy.py",
        ]:
            to_upload.append((rel, (repo_root / rel).resolve()))
    if args.include_root_checkpoint:
        # Root-level checkpoint/tokenizer files uploaded to the repo root.
        for rel in [
            "model.safetensors",
            "tokenizer.json",
            "tokenizer_config.json",
            "special_tokens_map.json",
            "vocab.txt",
        ]:
            to_upload.append((rel, (repo_root / rel).resolve()))
    if not to_upload:
        print(
            "Nothing to upload. Pass include flags (e.g. --include-all), or one/more of: "
            "--include-multitask --include-iab --include-calibration --include-hf-readme "
            "--include-serving-code --include-root-checkpoint.",
            file=sys.stderr,
        )
        return 2
    # Import lazily so `--dry-run` works without extra deps.
    try:
        from huggingface_hub import HfApi
    except ModuleNotFoundError:
        print("Missing dependency: huggingface_hub. Install with: pip install huggingface_hub", file=sys.stderr)
        return 2
    api = HfApi(token=args.token)
    # exist_ok=True makes repeated runs against the same repo idempotent.
    api.create_repo(repo_id=args.repo_id, repo_type="model", private=args.private, exist_ok=True)
    for repo_path, local_dir in to_upload:
        # Missing local paths are warned about and skipped rather than
        # aborting the whole run.
        if not local_dir.exists():
            print(f"[SKIP] {repo_path}: local path does not exist: {local_dir}", file=sys.stderr)
            continue
        if args.dry_run:
            print(f"[DRY] Would upload {local_dir} -> {args.repo_id}:{repo_path}")
            continue
        step_start = time.perf_counter()
        # Anything containing a file at/over the 100 MiB threshold goes
        # through the staged large-folder path; everything else uses the
        # standard upload_file/upload_folder APIs.
        mode = "large-folder" if _requires_large_upload(local_dir) else "standard"
        print(f"[UPLOAD] {local_dir} -> {args.repo_id}:{repo_path} ({mode})")
        if mode == "large-folder":
            _upload_via_large_folder(api, args.repo_id, repo_path, local_dir)
        elif local_dir.is_file():
            api.upload_file(
                repo_id=args.repo_id,
                repo_type="model",
                path_or_fileobj=str(local_dir),
                path_in_repo=repo_path,
            )
        else:
            api.upload_folder(
                repo_id=args.repo_id,
                repo_type="model",
                folder_path=str(local_dir),
                path_in_repo=repo_path,
            )
        # Confirm the files actually appear in the remote repo listing.
        _verify_remote_upload(api, args.repo_id, repo_path, local_dir)
        print(f"[DONE ] {repo_path} took {(time.perf_counter() - step_start):.2f}s")
    ended_wall = datetime.now(timezone.utc).isoformat()
    elapsed_s = time.perf_counter() - started_at
    print(f"Upload complete.\nstart: {started_wall}\nend: {ended_wall}\ntotal: {elapsed_s:.2f}s")
    return 0
if __name__ == "__main__":
    # Propagate main()'s integer return value as the process exit status.
    raise SystemExit(main())