# rag-kb-system/scripts/push_hf_dataset.py
# Sync HF dataset (commit 9ed89c8, by duqing2026)
import os
import sys
import json
import pathlib
from typing import List
import httpx
from contextlib import suppress
def _is_ascii(s: str) -> bool:
try:
s.encode("ascii")
return True
except Exception:
return False
def list_initial_files(root: pathlib.Path, start: int, limit: int, include_license: bool, include_readme: bool) -> List[pathlib.Path]:
    """Collect the metadata files plus a slice of document part files.

    Always includes metadata/documents/index.json and
    metadata/knowledge_bases.json. LICENSE and README.md are appended only
    when requested *and* present on disk. Document parts come from the
    "parts" list inside index.json, sliced as [start, start + limit).
    """
    meta_dir = root / "metadata"
    documents = meta_dir / "documents"
    selected: List[pathlib.Path] = [
        documents / "index.json",
        meta_dir / "knowledge_bases.json",
    ]
    # Optional top-level files, gated by flag and existence.
    for candidate, wanted in ((root / "LICENSE", include_license), (root / "README.md", include_readme)):
        if wanted and candidate.exists():
            selected.append(candidate)
    index_data = json.loads((documents / "index.json").read_text("utf8"))
    part_names = index_data.get("parts", [])
    selected.extend(documents / name for name in part_names[start:start + limit])
    return selected
def create_commit(repo_id: str, token: str, root: pathlib.Path, paths: List[pathlib.Path], message: str):
    """Upload *paths* (made relative to *root*) to a private HF dataset repo.

    Creates the repo if it does not exist, then commits the files. When a
    commit fails with a read-timeout-like error, the operation list is
    bisected recursively so each half is retried as a smaller commit.
    HF_COMMIT_CHUNK_SIZE (env var) optionally splits the upload into
    fixed-size batches up front.
    """
    from huggingface_hub import HfApi, CommitOperationAdd

    api = HfApi(token=token)
    api.create_repo(repo_id=repo_id, repo_type="dataset", private=True, exist_ok=True)

    operations = [
        CommitOperationAdd(path_in_repo=str(path.relative_to(root)), path_or_fileobj=str(path))
        for path in paths
    ]

    def _commit(batch: List[CommitOperationAdd], msg: str):
        # Bisect-and-retry on timeout-like errors; anything else propagates.
        try:
            api.create_commit(repo_id=repo_id, repo_type="dataset", operations=batch, commit_message=msg)
        except Exception as exc:
            text = str(exc)
            looks_like_timeout = isinstance(exc, httpx.ReadTimeout) or "ReadTimeout" in text or "Timeout" in text
            if not (looks_like_timeout and len(batch) > 1):
                raise
            half = len(batch) // 2
            _commit(batch[:half], msg + " [chunk A]")
            _commit(batch[half:], msg + " [chunk B]")

    raw_chunk = os.getenv("HF_COMMIT_CHUNK_SIZE", "")
    # isdigit() guarantees a non-negative int, so a single > 0 check suffices.
    batch_size = int(raw_chunk) if raw_chunk.isdigit() else 0
    if batch_size > 0:
        for offset in range(0, len(operations), batch_size):
            _commit(operations[offset:offset + batch_size], message + f" [batch {offset // batch_size}]")
    else:
        _commit(operations, message)
def dry_run_summary(paths: List[pathlib.Path]):
    """Print a JSON report listing each file's size and the combined byte count."""
    sizes = [(str(path), path.stat().st_size) for path in paths]
    report = {
        "files": [{"path": name, "size": nbytes} for name, nbytes in sizes],
        "total_bytes": sum(nbytes for _, nbytes in sizes),
    }
    print(json.dumps(report, ensure_ascii=False, indent=2))
def main():
    """Entry point: read configuration from the environment and upload.

    Without HF_REPO_ID and a token this degrades to a dry-run summary
    (exit 0); non-ASCII credentials abort with exit code 2.
    """
    root_setting = os.getenv("HF_DATASET_ROOT", "")
    repo_id = os.getenv("HF_REPO_ID", "")
    token = os.getenv("HF_TOKEN", "") or os.getenv("HUGGINGFACE_TOKEN", "")
    raw_offset = os.getenv("HF_PARTS_OFFSET", "")
    raw_limit = os.getenv("HF_PARTS_LIMIT", "") or os.getenv("HF_INITIAL_PARTS", "")
    include_license = os.getenv("HF_INCLUDE_LICENSE", "1") not in ("0", "false", "False")
    include_readme = os.getenv("HF_INCLUDE_README", "1") not in ("0", "false", "False")

    offset = int(raw_offset) if raw_offset.isdigit() else 0
    limit = int(raw_limit) if raw_limit.isdigit() else 2
    # Default root: <repo>/hf_dataset_rag, two levels above this script.
    dataset_root = pathlib.Path(root_setting or pathlib.Path(__file__).resolve().parents[2] / "hf_dataset_rag")
    paths = list_initial_files(dataset_root, offset, limit, include_license, include_readme)

    if not repo_id or not token:
        print("Missing HF_REPO_ID or HF_TOKEN; performing dry-run")
        dry_run_summary(paths)
        sys.exit(0)
    if not _is_ascii(token) or not _is_ascii(repo_id):
        print("HF_TOKEN or HF_REPO_ID contains non-ASCII characters")
        sys.exit(2)

    os.environ["HF_HUB_USER_AGENT"] = "rag-kb-uploader"
    os.environ.setdefault("HF_HUB_TIMEOUT", "60")
    os.environ.setdefault("HF_HUB_READ_TIMEOUT", "60")
    with suppress(Exception):
        # Best-effort token sanity check; a failure here must not abort the upload.
        from huggingface_hub import HfApi
        HfApi(token=token).whoami()
    create_commit(repo_id, token, dataset_root, paths, f"Upload: metadata + parts [{offset}, {offset + limit})")
    print("Commit completed")


if __name__ == "__main__":
    main()