#!/usr/bin/env python3
"""Upload the Rabbinic benchmark dataset to Hugging Face Hub.

Usage:
    python upload_dataset.py --repo-id YOUR_USERNAME/rabbinic-benchmark
"""

import argparse
import json
import shutil
import tempfile
from pathlib import Path

# Optional dataset card; resolved relative to the current working directory.
README_SOURCE = Path("dataset/README.md")


def _parse_args(argv=None):
    """Parse command-line options (``argv=None`` means ``sys.argv[1:]``)."""
    parser = argparse.ArgumentParser(
        description="Upload Rabbinic benchmark dataset to Hugging Face Hub"
    )
    parser.add_argument(
        "--repo-id",
        type=str,
        required=True,
        help="HuggingFace repo ID (e.g., 'username/rabbinic-benchmark')",
    )
    parser.add_argument(
        "--benchmark-path",
        type=str,
        default="benchmark_data/benchmark.json",
        help="Path to benchmark JSON file",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Make the dataset private",
    )
    return parser.parse_args(argv)


def _load_benchmark(path):
    """Return the parsed contents of the UTF-8 JSON file at *path*.

    Raises:
        OSError: the file is missing or unreadable.
        ValueError: the file is not valid UTF-8 or not valid JSON
            (``json.JSONDecodeError`` and ``UnicodeDecodeError`` are both
            ``ValueError`` subclasses).
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def _stage_files(benchmark_path, staging_dir):
    """Copy the benchmark (and the README, if present) into *staging_dir*."""
    data_dir = staging_dir / "data"
    data_dir.mkdir()
    shutil.copy(benchmark_path, data_dir / "benchmark.json")
    print("Prepared data/benchmark.json")

    if README_SOURCE.exists():
        shutil.copy(README_SOURCE, staging_dir / "README.md")
        print("Prepared README.md")
    else:
        # Not fatal: the upload simply goes out without a dataset card.
        print(f"Note: {README_SOURCE} not found; uploading without README.md")


def main(argv=None):
    """Validate the benchmark file and upload it to the Hugging Face Hub.

    Args:
        argv: Optional list of command-line arguments. Defaults to
            ``sys.argv[1:]`` when ``None``.

    Returns:
        0 on success; 1 when the benchmark file cannot be read or parsed, or
        when ``huggingface_hub`` is not installed.
    """
    args = _parse_args(argv)

    # Validate the local input first so that a bad path or malformed file is
    # reported before any login prompt or network traffic.
    print(f"Loading benchmark from {args.benchmark_path}...")
    try:
        data = _load_benchmark(args.benchmark_path)
    except OSError as e:
        print(f"Could not read benchmark file {args.benchmark_path}: {e}")
        return 1
    except ValueError as e:
        print(f"Benchmark file {args.benchmark_path} is not valid UTF-8 JSON: {e}")
        return 1
    print(f"Loaded {len(data)} pairs")

    # Imported lazily so a missing dependency yields install instructions
    # rather than a traceback at module import time.
    try:
        from huggingface_hub import HfApi, upload_folder, login, whoami
    except ImportError:
        print("Required packages not installed. Run:")
        print("  pip install huggingface_hub")
        return 1

    # Check current auth status; fall back to an interactive login on any
    # failure (missing token, expired token, network error, ...).
    try:
        user_info = whoami()
        print(f"Logged in as: {user_info['name']}")
        if "orgs" in user_info:
            orgs = [org["name"] for org in user_info["orgs"] or []]
            print(f"Organizations: {orgs}")
    except Exception as e:
        print(f"Not logged in or token issue: {e}")
        print("Running login...")
        login()

    # Stage the files in a temp folder that is removed when the upload ends.
    with tempfile.TemporaryDirectory() as tmp:
        staging_dir = Path(tmp)
        _stage_files(args.benchmark_path, staging_dir)

        # Create the repo if needed. Best-effort on purpose: without rights to
        # create or verify the repo we may still be able to open a PR below.
        api = HfApi()
        try:
            api.create_repo(
                repo_id=args.repo_id,
                repo_type="dataset",
                private=args.private,
                exist_ok=True,
            )
            print(f"Repository verified: {args.repo_id}")
        except Exception as e:
            print(f"Note: {e}")

        print(f"\nUploading to HuggingFace Hub: {args.repo_id}...")
        commit_info = upload_folder(
            folder_path=str(staging_dir),
            repo_id=args.repo_id,
            repo_type="dataset",
            create_pr=True,  # Open a PR instead of committing directly
            commit_message="Add Rabbinic Hebrew-English parallel corpus",
        )

    if commit_info.pr_url:
        print(f"\n📝 Pull Request created: {commit_info.pr_url}")
        print("   Ask an org admin to merge it.")

    print("\n✅ Dataset uploaded successfully!")
    print(f"   View at: https://huggingface.co/datasets/{args.repo_id}")
    return 0


if __name__ == "__main__":
    # SystemExit works even where the site-provided exit() helper is absent.
    raise SystemExit(main())