Rabbinic-Embedding-Bench / upload_dataset.py
Lev Israel
Upload dataset to HF
a9dad42
#!/usr/bin/env python3
"""
Upload the Rabbinic benchmark dataset to Hugging Face Hub.
Usage:
python upload_dataset.py --repo-id YOUR_USERNAME/rabbinic-benchmark
"""
import argparse
import json
import shutil
import tempfile
from pathlib import Path
def main(argv=None):
    """Upload the Rabbinic benchmark dataset to the Hugging Face Hub.

    Validates the benchmark JSON file, checks the Hugging Face login (calling
    ``login()`` when ``whoami()`` fails), stages the benchmark as
    ``data/benchmark.json`` together with ``dataset/README.md`` (when that
    file exists) in a temporary folder, and uploads the folder to the dataset
    repo as a pull request.

    Args:
        argv: Command-line arguments to parse, without the program name.
            ``None`` (the default) makes argparse read ``sys.argv[1:]``.

    Returns:
        0 on success; 1 when the benchmark file cannot be read or parsed, or
        when ``huggingface_hub`` is not installed.

    Raises:
        SystemExit: Raised by argparse on invalid or missing arguments.

    Errors raised by the upload call itself are not caught and propagate.
    """
    parser = argparse.ArgumentParser(
        description="Upload Rabbinic benchmark dataset to Hugging Face Hub"
    )
    parser.add_argument(
        "--repo-id",
        type=str,
        required=True,
        help="HuggingFace repo ID (e.g., 'username/rabbinic-benchmark')",
    )
    parser.add_argument(
        "--benchmark-path",
        type=str,
        default="benchmark_data/benchmark.json",
        help="Path to benchmark JSON file",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Make the dataset private",
    )
    args = parser.parse_args(argv)

    # Validate the local benchmark file first so that a bad path or malformed
    # JSON fails fast, before any login prompt or network call is made.
    print(f"Loading benchmark from {args.benchmark_path}...")
    try:
        with open(args.benchmark_path, "r", encoding="utf-8") as f:
            data = json.load(f)
    except OSError as e:
        print(f"Could not read benchmark file: {e}")
        return 1
    except ValueError as e:
        # Covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Benchmark file is not valid UTF-8 JSON: {e}")
        return 1
    print(f"Loaded {len(data)} pairs")

    # Imported lazily so a missing package yields install instructions
    # instead of an ImportError traceback.
    try:
        from huggingface_hub import HfApi, upload_folder, login, whoami
    except ImportError:
        print("Required packages not installed. Run:")
        print(" pip install huggingface_hub")
        return 1

    # Check current auth status; any failure here falls back to login().
    try:
        user_info = whoami()
        print(f"Logged in as: {user_info['name']}")
        if "orgs" in user_info:
            orgs = [org["name"] for org in user_info["orgs"]]
            print(f"Organizations: {orgs}")
    except Exception as e:
        print(f"Not logged in or token issue: {e}")
        print("Running login...")
        login()

    # Stage the files to upload in a temp folder that is removed on exit.
    with tempfile.TemporaryDirectory() as tmpdir:
        staging_dir = Path(tmpdir)

        # Copy benchmark data
        data_dir = staging_dir / "data"
        data_dir.mkdir()
        shutil.copy(args.benchmark_path, data_dir / "benchmark.json")
        print("Prepared data/benchmark.json")

        # Copy README (optional; resolved relative to the working directory)
        readme_src = Path("dataset/README.md")
        if readme_src.exists():
            shutil.copy(readme_src, staging_dir / "README.md")
            print("Prepared README.md")

        # Create repo if needed. Failure is deliberately non-fatal: the
        # upload below is still attempted and will raise if it cannot proceed.
        api = HfApi()
        try:
            api.create_repo(
                repo_id=args.repo_id,
                repo_type="dataset",
                private=args.private,
                exist_ok=True,
            )
            print(f"Repository verified: {args.repo_id}")
        except Exception as e:
            print(f"Note: {e}")

        # Upload the folder (create PR if we don't have direct write access)
        print(f"\nUploading to HuggingFace Hub: {args.repo_id}...")
        commit_info = upload_folder(
            folder_path=str(staging_dir),
            repo_id=args.repo_id,
            repo_type="dataset",
            create_pr=True,  # Create a PR instead of direct commit
            commit_message="Add Rabbinic Hebrew-English parallel corpus",
        )

    if commit_info.pr_url:
        print(f"\n📝 Pull Request created: {commit_info.pr_url}")
        print(" Ask an org admin to merge it.")
    print("\n✅ Dataset uploaded successfully!")
    print(f" View at: https://huggingface.co/datasets/{args.repo_id}")
    return 0
if __name__ == "__main__":
    # Propagate main()'s return value as the process exit status.
    # SystemExit is raised directly because the bare exit() helper is
    # injected by the site module and is not guaranteed to exist
    # (e.g. under `python -S`).
    raise SystemExit(main())