Spaces:
Build error
Build error
| import os | |
| import sys | |
| import json | |
| import pathlib | |
| from typing import List | |
| import httpx | |
| from contextlib import suppress | |
def _is_ascii(s: str) -> bool:
    """Report whether ``s`` can be encoded as pure ASCII.

    Any failure while encoding (including ``s`` not being a string at all)
    is treated as "not ASCII" and yields ``False``.
    """
    try:
        s.encode("ascii")
    except Exception:
        # Deliberately broad: every encoding failure means "not usable".
        return False
    else:
        return True
def list_initial_files(root: pathlib.Path, start: int, limit: int, include_license: bool, include_readme: bool) -> List[pathlib.Path]:
    """Build the ordered list of files that make up one upload.

    The list always starts with ``metadata/documents/index.json`` and
    ``metadata/knowledge_bases.json`` (neither is checked for existence
    here). ``LICENSE`` and ``README.md`` follow when the matching flag is
    set and the file exists under ``root``. Finally come the entries
    ``parts[start:start + limit]`` from the index file's ``"parts"`` list,
    each resolved relative to the documents directory.

    Raises whatever reading or parsing ``index.json`` raises (for example
    ``FileNotFoundError`` or ``json.JSONDecodeError``).
    """
    metadata_dir = root / "metadata"
    documents_dir = metadata_dir / "documents"
    index_path = documents_dir / "index.json"

    selected = [index_path, metadata_dir / "knowledge_bases.json"]

    # Optional top-level files: only added when requested AND present.
    optional = ((include_license, "LICENSE"), (include_readme, "README.md"))
    for wanted, filename in optional:
        candidate = root / filename
        if wanted and candidate.exists():
            selected.append(candidate)

    manifest = json.loads(index_path.read_text("utf8"))
    window = manifest.get("parts", [])[start:start + limit]
    selected.extend(documents_dir / part for part in window)
    return selected
def create_commit(repo_id: str, token: str, root: pathlib.Path, paths: List[pathlib.Path], message: str):
    """Upload ``paths`` to the dataset repository ``repo_id``.

    The repository is created first as a private dataset if it does not
    exist yet (``exist_ok=True``). Each file is stored in the repo under its
    path relative to ``root``.

    Batching: when the ``HF_COMMIT_CHUNK_SIZE`` environment variable holds a
    positive integer, the files are committed in batches of that size, each
    commit message suffixed with ``[batch N]``. Otherwise a single commit is
    attempted.

    Timeout recovery: a commit that fails with a timeout-looking error and
    contains more than one operation is split in half, and each half is
    retried recursively with ``[chunk A]`` / ``[chunk B]`` appended to the
    message. Any other error, or a timeout on a single-file commit, is
    re-raised unchanged.

    Args:
        repo_id: Target repository id.
        token: Access token passed to ``HfApi``.
        root: Directory that every entry of ``paths`` must live under.
        paths: Files to upload.
        message: Base commit message.

    Raises:
        ValueError: If a path is not located under ``root``.
    """
    from huggingface_hub import HfApi, CommitOperationAdd

    api = HfApi(token=token)
    api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)

    # Repo paths must use forward slashes; str() of a relative path would
    # produce backslashes on Windows, so use as_posix() explicitly.
    ops = [
        CommitOperationAdd(path_in_repo=p.relative_to(root).as_posix(), path_or_fileobj=str(p))
        for p in paths
    ]

    def _commit(ops_slice: List[CommitOperationAdd], msg: str):
        """Commit ``ops_slice``; on a timeout, bisect the slice and retry each half."""
        try:
            api.create_commit(repo_id=repo_id, repo_type="dataset", operations=ops_slice, commit_message=msg)
        except Exception as e:
            # "Timeout" also matches "ReadTimeout" in the error text; the
            # isinstance check covers exceptions whose text lacks the word.
            timeout_like = isinstance(e, httpx.ReadTimeout) or "Timeout" in str(e)
            if not (timeout_like and len(ops_slice) > 1):
                raise
            mid = len(ops_slice) // 2
            _commit(ops_slice[:mid], msg + " [chunk A]")
            _commit(ops_slice[mid:], msg + " [chunk B]")

    chunk_size_env = os.getenv("HF_COMMIT_CHUNK_SIZE", "")
    # isdecimal() rather than isdigit(): isdigit() accepts characters such as
    # superscript digits that int() cannot parse, which would raise here.
    chunk_size = int(chunk_size_env) if chunk_size_env.isdecimal() else 0

    if chunk_size > 0:
        for i in range(0, len(ops), chunk_size):
            _commit(ops[i:i + chunk_size], message + f" [batch {i // chunk_size}]")
    else:
        _commit(ops, message)
def dry_run_summary(paths: List[pathlib.Path]):
    """Print a JSON report of the given files and their sizes.

    The report is an object with a ``"files"`` list (one ``{"path", "size"}``
    entry per input path, in order) and ``"total_bytes"``, the sum of all
    sizes. Sizes come from ``Path.stat()``, so a missing file raises
    ``FileNotFoundError``.
    """
    entries = [{"path": str(path), "size": path.stat().st_size} for path in paths]
    report = {
        "files": entries,
        "total_bytes": sum(entry["size"] for entry in entries),
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
def main():
    """Script entry point: select the files to upload and commit them.

    Configuration is read from environment variables:

    * ``HF_DATASET_ROOT`` - dataset directory; defaults to ``hf_dataset_rag``
      under the third parent directory of this file.
    * ``HF_REPO_ID`` - target repository id.
    * ``HF_TOKEN`` (or ``HUGGINGFACE_TOKEN``) - access token.
    * ``HF_PARTS_OFFSET`` - index of the first part to upload (default 0).
    * ``HF_PARTS_LIMIT`` (or ``HF_INITIAL_PARTS``) - number of parts to
      upload (default 2).
    * ``HF_INCLUDE_LICENSE`` / ``HF_INCLUDE_README`` - set to ``0``,
      ``false`` or ``False`` to leave the respective file out.

    Without a repo id or token the function prints a dry-run summary of the
    selected files and exits with status 0. If either credential contains
    non-ASCII characters it exits with status 2.
    """
    falsey = ("0", "false", "False")

    root_env = os.getenv("HF_DATASET_ROOT", "")
    repo_id = os.getenv("HF_REPO_ID", "")
    token = os.getenv("HF_TOKEN", "") or os.getenv("HUGGINGFACE_TOKEN", "")
    offset_env = os.getenv("HF_PARTS_OFFSET", "")
    limit_env = os.getenv("HF_PARTS_LIMIT", "") or os.getenv("HF_INITIAL_PARTS", "")
    include_license = os.getenv("HF_INCLUDE_LICENSE", "1") not in falsey
    include_readme = os.getenv("HF_INCLUDE_README", "1") not in falsey

    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() rejects with ValueError.
    offset = int(offset_env) if offset_env.isdecimal() else 0
    limit = int(limit_env) if limit_env.isdecimal() else 2

    root = pathlib.Path(root_env or pathlib.Path(__file__).resolve().parents[2] / "hf_dataset_rag")
    paths = list_initial_files(root, offset, limit, include_license, include_readme)

    if not repo_id or not token:
        print("Missing HF_REPO_ID or HF_TOKEN; performing dry-run")
        dry_run_summary(paths)
        sys.exit(0)

    if not _is_ascii(token) or not _is_ascii(repo_id):
        print("HF_TOKEN or HF_REPO_ID contains non-ASCII characters")
        sys.exit(2)

    # NOTE(review): presumably these variables are read by huggingface_hub or
    # its HTTP layer - confirm they are actually honoured by the installed version.
    os.environ["HF_HUB_USER_AGENT"] = "rag-kb-uploader"
    os.environ.setdefault("HF_HUB_TIMEOUT", "60")
    os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60")

    # Best-effort identity call: its result is unused and any failure
    # (including a failed import) is ignored, so the upload proceeds regardless.
    with suppress(Exception):
        from huggingface_hub import HfApi
        HfApi(token=token).whoami()

    create_commit(repo_id, token, root, paths, f"Upload: metadata + parts [{offset}, {offset + limit})")
    print("Commit completed")
# Run the uploader only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()