guide / scripts /upload_models_to_hub.py
sangram kumar yerra
fix: use as_posix() for HF upload paths to avoid Windows backslashes
9be2796
Raw
History Blame Contribute Delete
4.57 kB
"""
Upload G.U.I.D.E. inference model weights to Hugging Face Model Hub.

Uploads only the files needed for inference — skips checkpoint-* subdirectories
(intermediate training snapshots) and .DS_Store files.

Usage:
    python scripts/upload_models_to_hub.py --repo-id <username>/guide-models

Prerequisites:
    huggingface-cli login  # run once before uploading
"""
from __future__ import annotations
import argparse
import logging
import sys
from pathlib import Path
# Root logger prints INFO and above as "<LEVEL> <message>".
logging.basicConfig(
    format="%(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# The repository root is two directory levels above this file; the model
# weights are read from its models/ sub-directory.
_REPO_ROOT = Path(__file__).parent.parent
_MODELS_ROOT = _REPO_ROOT / "models"

# Sub-directories of models/ that upload_models() visits, in this order.
_MODEL_SUBDIRS = ["domain_classifier", "evidence_ner", "next_action"]
def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """
    Parse the command-line arguments of the upload script.

    argparse prints a usage message and exits with status 2 if the required
    --repo-id option is missing.

    Args:
        argv: Argument strings to parse, without the program name. When None
            (the default) argparse reads sys.argv[1:], which is what the
            script entry point relies on; passing a list lets other code and
            tests drive the parser directly.

    Returns:
        Namespace whose ``repo_id`` attribute holds the value of --repo-id.
    """
    parser = argparse.ArgumentParser(
        description="Upload G.U.I.D.E. model weights to Hugging Face Model Hub.",
        # Raw formatter keeps the line breaks of the epilog example intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=(
            "examples:\n"
            " python scripts/upload_models_to_hub.py --repo-id myuser/guide-models\n"
        ),
    )
    parser.add_argument(
        "--repo-id",
        required=True,
        metavar="USER/REPO",
        help="HF Hub repo ID to upload to (e.g. myuser/guide-models)",
    )
    return parser.parse_args(argv)
def filter_inference_files(model_dir: Path) -> list[Path]:
    """
    Return the inference-needed files found anywhere under model_dir.

    The directory is searched recursively. Skips:
    - Any file inside a checkpoint-* subdirectory (training artifacts)
    - Any .DS_Store file

    Only directory components are compared against the checkpoint-* prefix,
    so a regular file whose own name starts with "checkpoint-" is kept.

    Args:
        model_dir: Path to a model directory (e.g. models/evidence_ner).

    Returns:
        Sorted list of Path objects for files that should be uploaded; empty
        when nothing qualifies.
    """
    results: list[Path] = []
    for path in model_dir.rglob("*"):
        if not path.is_file() or path.name == ".DS_Store":
            continue
        # Look at the parent directories only: the last component is the
        # file's own name and must not trigger the checkpoint-* rule.
        parent_dirs = path.relative_to(model_dir).parts[:-1]
        if any(part.startswith("checkpoint-") for part in parent_dirs):
            continue
        results.append(path)
    # rglob() yields entries in filesystem order; sort for a stable result.
    results.sort()
    return results
def upload_models(repo_id: str) -> None:
    """
    Upload inference files for all three G.U.I.D.E. models to HF Hub.

    Iterates over domain_classifier/, evidence_ner/, and next_action/ under
    models/, filters to inference-only files via filter_inference_files(), and
    uploads each file preserving its path relative to models/ as the Hub path.
    Logs a warning and continues if a model directory does not exist locally
    or contains no inference files.

    Exits the process with status 1 (after logging an error) when
    huggingface_hub cannot be imported or when the authentication pre-check
    fails. Errors raised while creating the repo or uploading a file are not
    caught here and propagate to the caller.

    Args:
        repo_id: HF Hub repository ID, e.g. "myuser/guide-models".
    """
    # Imported lazily so a missing dependency yields a readable error message
    # instead of an import-time traceback.
    try:
        from huggingface_hub import HfApi
        from huggingface_hub.utils import HfHubHTTPError
    except ImportError:
        logger.error("huggingface_hub is not installed. Run: pip install huggingface_hub")
        sys.exit(1)

    api = HfApi()

    # Verify auth before doing any work. Besides HfHubHTTPError, catch OSError:
    # huggingface_hub signals a missing local token with
    # LocalTokenNotFoundError (an OSError subclass), and requests-based
    # releases raise a plain requests HTTPError (also an OSError) for a
    # rejected token. Neither would be caught by HfHubHTTPError alone.
    try:
        api.whoami()
    except (HfHubHTTPError, OSError) as exc:
        logger.error(
            "Not authenticated with Hugging Face. Run: huggingface-cli login"
        )
        # OSError also covers connection failures, so surface the real cause.
        logger.error("Details: %s", exc)
        sys.exit(1)

    # Ensure repo exists (creates it if not)
    api.create_repo(repo_id=repo_id, repo_type="model", exist_ok=True)
    logger.info("Uploading to: https://huggingface.co/%s", repo_id)

    total_uploaded = 0
    for subdir_name in _MODEL_SUBDIRS:
        model_dir = _MODELS_ROOT / subdir_name
        if not model_dir.exists():
            logger.warning("Model directory not found, skipping: %s", model_dir)
            continue

        files = filter_inference_files(model_dir)
        if not files:
            logger.warning("No inference files found in %s, skipping.", model_dir)
            continue

        logger.info("Uploading %d file(s) from %s …", len(files), subdir_name)
        for local_path in files:
            # Hub path mirrors local structure: domain_classifier/config.json etc.
            # as_posix() keeps forward slashes even when run on Windows.
            path_in_repo = local_path.relative_to(_MODELS_ROOT).as_posix()
            api.upload_file(
                path_or_fileobj=str(local_path),
                path_in_repo=path_in_repo,
                repo_id=repo_id,
                repo_type="model",
            )
            logger.info(" uploaded: %s", path_in_repo)
            total_uploaded += 1

    logger.info("Done. %d file(s) uploaded to %s", total_uploaded, repo_id)
if __name__ == "__main__":
    # Script entry point: read --repo-id from the command line, then upload.
    upload_models(parse_args().repo_id)