"""Upload an initial slice of a local dataset directory to a Hugging Face dataset repo.

All configuration comes from environment variables (see ``main``).  When the
repository id or the token is missing, the script performs a dry run: it
prints a JSON summary of the files it would have uploaded and exits.
"""

import os
import sys
import json
import pathlib
from typing import List
from contextlib import suppress

# httpx is only used to recognise read timeouts in create_commit(); keep the
# dry-run path usable when it is not installed (huggingface_hub is likewise
# imported lazily further down).
httpx = None
with suppress(ImportError):
    import httpx


def _is_ascii(s: str) -> bool:
    """Return True when *s* contains only ASCII characters."""
    return s.isascii()


def _parse_count(raw: str, default: int) -> int:
    """Parse *raw* as a non-negative integer, returning *default* otherwise.

    ``str.isdecimal`` is used instead of ``str.isdigit`` because the latter
    also accepts characters such as "²" that ``int()`` refuses to parse.
    """
    return int(raw) if raw.isdecimal() else default


def _env_flag(name: str) -> bool:
    """Read a boolean switch from the environment; it is on unless set to 0/false.

    The comparison ignores case and surrounding whitespace, so "FALSE" and
    "False" both disable the flag.  An unset or empty variable enables it.
    """
    return os.getenv(name, "1").strip().lower() not in ("0", "false")


def _is_timeout(exc: Exception) -> bool:
    """Return True when *exc* is an httpx read timeout or mentions "Timeout"."""
    if httpx is not None and isinstance(exc, httpx.ReadTimeout):
        return True
    # Fallback for timeouts raised by other layers: match on the message.
    return "Timeout" in str(exc)


def list_initial_files(root: pathlib.Path, start: int, limit: int, include_license: bool, include_readme: bool) -> List[pathlib.Path]:
    """Collect the files that make up one upload slice.

    The result always starts with ``metadata/documents/index.json`` and
    ``metadata/knowledge_bases.json``.  ``LICENSE`` and ``README.md`` are
    appended when requested and present under *root*.  Finally the entries
    ``parts[start:start + limit]`` from the index file's ``"parts"`` list are
    appended, resolved relative to ``metadata/documents``.

    Raises whatever reading or JSON-decoding ``index.json`` raises.
    """
    meta = root / "metadata"
    docs_dir = meta / "documents"
    index_path = docs_dir / "index.json"

    files = [index_path, meta / "knowledge_bases.json"]
    for wanted, name in ((include_license, "LICENSE"), (include_readme, "README.md")):
        candidate = root / name
        if wanted and candidate.exists():
            files.append(candidate)

    index = json.loads(index_path.read_text(encoding="utf-8"))
    parts = index.get("parts", [])
    files.extend(docs_dir / part for part in parts[start:start + limit])
    return files


def create_commit(repo_id: str, token: str, root: pathlib.Path, paths: List[pathlib.Path], message: str):
    """Commit *paths* to the dataset repo *repo_id*, creating it (private) if needed.

    Each path is stored in the repo under its location relative to *root*.
    When ``HF_COMMIT_CHUNK_SIZE`` is a positive integer, the files are
    committed in batches of that size; otherwise a single commit is made.
    A commit that fails with a timeout-like error is retried as two halves,
    recursively, until a single-file commit fails; any other error propagates.
    """
    from huggingface_hub import HfApi, CommitOperationAdd

    api = HfApi(token=token)
    api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)

    ops = [
        CommitOperationAdd(
            # Repo paths use "/" separators; str() would emit "\\" on Windows.
            path_in_repo=p.relative_to(root).as_posix(),
            path_or_fileobj=str(p),
        )
        for p in paths
    ]

    def _commit(ops_slice: List[CommitOperationAdd], msg: str):
        try:
            api.create_commit(repo_id=repo_id, repo_type="dataset", operations=ops_slice, commit_message=msg)
        except Exception as e:
            # Only timeouts on multi-file commits are worth splitting.
            if len(ops_slice) <= 1 or not _is_timeout(e):
                raise
            mid = len(ops_slice) // 2
            _commit(ops_slice[:mid], msg + " [chunk A]")
            _commit(ops_slice[mid:], msg + " [chunk B]")

    chunk_size = _parse_count(os.getenv("HF_COMMIT_CHUNK_SIZE", ""), 0)
    if chunk_size > 0:
        for batch_no, begin in enumerate(range(0, len(ops), chunk_size)):
            _commit(ops[begin:begin + chunk_size], message + f" [batch {batch_no}]")
    else:
        _commit(ops, message)


def dry_run_summary(paths: List[pathlib.Path]):
    """Print a JSON report with each path, its size in bytes, and the total size.

    Raises ``OSError`` (from ``Path.stat``) if one of the paths cannot be read.
    """
    entries = [{"path": str(p), "size": p.stat().st_size} for p in paths]
    total = sum(entry["size"] for entry in entries)
    print(json.dumps({"files": entries, "total_bytes": total}, ensure_ascii=False, indent=2))


def main():
    """Entry point: read configuration from the environment and upload or dry-run.

    Environment variables:
      HF_DATASET_ROOT      dataset directory; defaults to ``hf_dataset_rag``
                           two levels above this file's directory.
      HF_REPO_ID           target dataset repository id.
      HF_TOKEN / HUGGINGFACE_TOKEN   access token.
      HF_PARTS_OFFSET      index of the first part to upload (default 0).
      HF_PARTS_LIMIT / HF_INITIAL_PARTS   number of parts to upload (default 2).
      HF_INCLUDE_LICENSE, HF_INCLUDE_README   set to 0/false to skip the file.

    Exits with status 0 after a dry run (repo id or token missing) and with
    status 2 when the token or repo id contains non-ASCII characters.
    """
    root_env = os.getenv("HF_DATASET_ROOT", "")
    repo_id = os.getenv("HF_REPO_ID", "")
    token = os.getenv("HF_TOKEN", "") or os.getenv("HUGGINGFACE_TOKEN", "")

    offset = _parse_count(os.getenv("HF_PARTS_OFFSET", ""), 0)
    limit = _parse_count(os.getenv("HF_PARTS_LIMIT", "") or os.getenv("HF_INITIAL_PARTS", ""), 2)
    include_license = _env_flag("HF_INCLUDE_LICENSE")
    include_readme = _env_flag("HF_INCLUDE_README")

    if root_env:
        root = pathlib.Path(root_env)
    else:
        root = pathlib.Path(__file__).resolve().parents[2] / "hf_dataset_rag"

    paths = list_initial_files(root, offset, limit, include_license, include_readme)

    if not repo_id or not token:
        print("Missing HF_REPO_ID or HF_TOKEN; performing dry-run")
        dry_run_summary(paths)
        sys.exit(0)

    if not _is_ascii(token) or not _is_ascii(repo_id):
        print("HF_TOKEN or HF_REPO_ID contains non-ASCII characters")
        sys.exit(2)

    os.environ["HF_HUB_USER_AGENT"] = "rag-kb-uploader"
    os.environ.setdefault("HF_HUB_TIMEOUT", "60")
    os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60")

    # Best-effort token check: a failure here is reported but not fatal, the
    # commit below will surface any hard authentication error.
    try:
        from huggingface_hub import HfApi
        HfApi(token=token).whoami()
    except Exception as exc:
        print(f"Warning: token check (whoami) failed: {type(exc).__name__}: {exc}", file=sys.stderr)

    create_commit(repo_id, token, root, paths, f"Upload: metadata + parts [{offset}, {offset + limit})")
    print("Commit completed")


if __name__ == "__main__":
    main()