# Last commit: "also sync the source file" (520cc96, by tarekziade, HF Staff)
"""
Sync the local index directory with the team-0/ragstudio HF bucket.
Usage:
python sync.py # push index_data/ -> hf://buckets/team-0/ragstudio
python sync.py --pull # pull hf://buckets/team-0/ragstudio -> index_data/
Extra flags (e.g. --delete, --dry-run) are forwarded to `hf sync`.
"""
import argparse
import subprocess
import sys
from pathlib import Path
from indexers import INDEX_DIR
BUCKET = "hf://buckets/team-0/ragstudio"
# Resolve `hf` next to the active interpreter so we don't accidentally pick
# up an older hf from PATH that lacks the `sync` subcommand.
HF_BIN = str(Path(sys.executable).parent / "hf")
def _run(cmd: list[str]) -> None:
    """Echo a command line to stdout, then execute it.

    Args:
        cmd: The program followed by its arguments. It is executed directly
            (no shell is involved).

    Raises:
        subprocess.CalledProcessError: If the command exits with a non-zero
            status.
    """
    import shlex  # local import keeps this helper self-contained

    # Quote each argument so the echoed line stays unambiguous (and can be
    # pasted into a shell) when a path contains spaces. Flush explicitly:
    # when stdout is a pipe or a file, Python buffers the echo and it would
    # otherwise show up after the child's own output.
    print(shlex.join(cmd), flush=True)
    subprocess.run(cmd, check=True)
def sync(pull: bool = False, extra: list[str] | None = None) -> None:
    """Synchronise INDEX_DIR with BUCKET by invoking `hf sync`.

    Args:
        pull: When true, copy BUCKET -> INDEX_DIR. Otherwise (the default)
            push INDEX_DIR -> BUCKET and, if a source folder was recorded in
            ``_source.txt``, push that folder to ``<BUCKET>/source`` as well.
        extra: Additional arguments appended verbatim to every `hf sync`
            invocation. ``None`` means no extra arguments.

    Raises:
        subprocess.CalledProcessError: If an `hf sync` invocation exits with
            a non-zero status.
    """
    # parents=True so a nested INDEX_DIR can be created on a fresh checkout.
    INDEX_DIR.mkdir(parents=True, exist_ok=True)
    local = str(INDEX_DIR)
    extra = extra or []

    src, dst = (BUCKET, local) if pull else (local, BUCKET)
    _run([HF_BIN, "sync", src, dst, *extra])

    # On push, also upload the source files that were indexed. The folder is
    # recorded in _source.txt by build_index(). Pull doesn't need a second
    # call: files pushed to <bucket>/source/ come back under
    # <INDEX_DIR>/source/ as part of the main bucket->INDEX_DIR sync above.
    if pull:
        return

    manifest = INDEX_DIR / "_source.txt"
    if not manifest.exists():
        return

    recorded = manifest.read_text().strip()
    if not recorded:
        # Path("") denotes the current directory and passes is_dir(), so an
        # empty manifest would otherwise push the whole working directory.
        print(f"skipping source push: {manifest} is empty")
        return

    source = Path(recorded)
    if not source.is_dir():
        print(f"skipping source push: {source} not found")
        return

    _run([HF_BIN, "sync", str(source), f"{BUCKET}/source", *extra])
def main() -> None:
    """Command-line entry point: parse the arguments and run `sync`.

    Only ``--pull`` is interpreted here. Every argument the parser does not
    recognise is collected and passed unchanged to `sync` as ``extra``.
    """
    parser = argparse.ArgumentParser(
        # __doc__ is None when docstrings are stripped (`python -OO`).
        description=(__doc__ or "").strip(),
        # Keep the docstring's line breaks so the usage examples stay
        # readable in --help instead of being re-wrapped into one paragraph.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--pull",
        action="store_true",
        help="pull from bucket to local (default is push)",
    )

    # parse_known_args leaves unrecognised flags in `extra` rather than
    # rejecting them, which is what allows them to be forwarded.
    args, extra = parser.parse_known_args()
    sync(pull=args.pull, extra=extra)
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()