Thoracic-Radiology-RAG-System / scripts /publish_index_to_hf.py
ZhangNy's picture
Add Space app files
75db650
"""
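Publish a built index folder to a Hugging Face Hub dataset repository.

Example:
    python scripts/publish_index_to_hf.py \
        --repo ZhangNy/radiology-index-qwen3-embedding-0.6b \
        --folder ./index_out \
        --token $HF_TOKEN

The token can also be supplied through the HF_TOKEN environment variable
instead of --token.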
"""
from __future__ import annotations
import argparse
import sys
from pathlib import Path
# Allow running as `python scripts/*.py` without installing the package.
# The directory one level above this script's own directory (parents[1] of the
# resolved file path; presumably the repository root) is appended to sys.path.
sys.path.append(str(Path(__file__).resolve().parents[1]))
def main(argv: list[str] | None = None) -> int:
    """Upload a locally built index folder to a Hugging Face dataset repo.

    Parses the command line, resolves the access token, validates the local
    folder, creates the dataset repo if it does not exist yet and uploads the
    folder contents to the root of that repo.

    Args:
        argv: Argument list to parse instead of ``sys.argv[1:]``. ``None``
            (the default) keeps the normal command-line behaviour.

    Returns:
        ``0`` once the upload call has returned.

    Raises:
        SystemExit: If no token was given via ``--token``, ``HF_TOKEN`` or
            ``HUGGINGFACE_TOKEN``; if ``--folder`` does not exist; or if it
            exists but is not a directory. ``argparse`` also exits on
            invalid arguments.
    """
    import os

    parser = argparse.ArgumentParser(description="Upload index artifacts to HF datasets repo")
    parser.add_argument("--repo", type=str, required=True, help="HF dataset repo id, e.g. user/my-index")
    parser.add_argument("--folder", type=str, required=True, help="Local folder containing chroma_db/ + doc_store.db")
    parser.add_argument("--token", type=str, default=None, help="HF token (or set HF_TOKEN env)")
    parser.add_argument("--private", action="store_true", help="Create repo as private")
    parser.add_argument("--revision", type=str, default="main", help="Target revision/branch")
    parser.add_argument(
        "--ignore",
        type=str,
        default="",
        help="Comma-separated ignore patterns for upload_folder (e.g. 'images/**,**/images/**')",
    )
    args = parser.parse_args(argv)

    # Validate everything that can be checked locally before touching the
    # Hub, so a bad invocation never leaves a freshly created empty repo.
    # An empty --token value falls through to the environment variables.
    token = args.token or os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
    if not token:
        raise SystemExit("Missing token. Provide --token or set HF_TOKEN.")

    folder = Path(args.folder)
    if not folder.exists():
        raise SystemExit(f"Folder not found: {folder}")
    # upload_folder expects a directory; reject a plain file here with a
    # clear message instead of failing later inside the upload call.
    if not folder.is_dir():
        raise SystemExit(f"Not a directory: {folder}")

    # Blank entries (e.g. from a trailing comma) are dropped; an empty list
    # is passed as None, i.e. no ignore patterns at all.
    ignore_patterns = [p.strip() for p in (args.ignore or "").split(",") if p.strip()]

    # Imported late so argument/token/folder errors are reported first.
    from huggingface_hub import HfApi

    api = HfApi()
    # exist_ok=True makes re-publishing to an existing repo a no-op here.
    api.create_repo(
        repo_id=args.repo,
        repo_type="dataset",
        private=bool(args.private),
        exist_ok=True,
        token=token,
    )
    api.upload_folder(
        repo_id=args.repo,
        repo_type="dataset",
        folder_path=str(folder),
        path_in_repo="",
        token=token,
        revision=args.revision,
        commit_message="Upload prebuilt radiology RAG index",
        ignore_patterns=ignore_patterns or None,
    )
    print(f"✓ Uploaded index folder to HF dataset repo: {args.repo}")
    return 0
# Script entry point: run the uploader and hand its return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    sys.exit(main())