Spaces:

Sefaria
/

Rabbinic-Embedding-Bench

Running

Rabbinic-Embedding-Bench / upload_dataset.py

Lev Israel

Upload dataset to HF

a9dad42 11 days ago

3.55 kB

	#!/usr/bin/env python3
	"""
	Upload the Rabbinic benchmark dataset to Hugging Face Hub.

	Usage:
	python upload_dataset.py --repo-id YOUR_USERNAME/rabbinic-benchmark
	"""

	import argparse
	import json
	import shutil
	import tempfile
	from pathlib import Path


	def main():
	    """Upload the benchmark JSON (plus an optional README) to a Hugging Face dataset repo.

	    Command-line options (read from ``sys.argv`` via argparse):
	        --repo-id         Target dataset repository (required).
	        --benchmark-path  JSON file to upload; defaults to
	                          ``benchmark_data/benchmark.json``.
	        --private         Forwarded to ``create_repo`` as ``private=True``.

	    The benchmark file is staged as ``data/benchmark.json`` in a temporary
	    folder, together with ``dataset/README.md`` when that file exists, and the
	    folder is uploaded with ``create_pr=True``.

	    Returns:
	        0 once the upload call has completed; 1 if the benchmark file cannot
	        be read or parsed, or if ``huggingface_hub`` is not installed.

	    Raises:
	        SystemExit: raised by argparse on invalid or missing arguments.
	    """
	    parser = argparse.ArgumentParser(
	        description="Upload Rabbinic benchmark dataset to Hugging Face Hub"
	    )
	    parser.add_argument(
	        "--repo-id",
	        type=str,
	        required=True,
	        help="HuggingFace repo ID (e.g., 'username/rabbinic-benchmark')",
	    )
	    parser.add_argument(
	        "--benchmark-path",
	        type=str,
	        default="benchmark_data/benchmark.json",
	        help="Path to benchmark JSON file",
	    )
	    parser.add_argument(
	        "--private",
	        action="store_true",
	        help="Make the dataset private",
	    )

	    args = parser.parse_args()

	    # Validate the local input first: a wrong path or malformed file should
	    # fail fast with a clear message, before any login prompt or network call.
	    print(f"Loading benchmark from {args.benchmark_path}...")
	    try:
	        with open(args.benchmark_path, "r", encoding="utf-8") as f:
	            data = json.load(f)
	    except OSError as e:
	        print(f"Cannot read benchmark file {args.benchmark_path}: {e}")
	        return 1
	    except ValueError as e:
	        # json.JSONDecodeError and UnicodeDecodeError both derive from ValueError.
	        print(f"Benchmark file {args.benchmark_path} is not valid UTF-8 JSON: {e}")
	        return 1
	    print(f"Loaded {len(data)} pairs")

	    # Imported lazily so a missing package yields install instructions
	    # instead of an import-time traceback.
	    try:
	        from huggingface_hub import HfApi, upload_folder, login, whoami
	    except ImportError:
	        print("Required packages not installed. Run:")
	        print(" pip install huggingface_hub")
	        return 1

	    # Check current auth status; any failure here falls back to login().
	    try:
	        user_info = whoami()
	        print(f"Logged in as: {user_info['name']}")
	        if 'orgs' in user_info:
	            orgs = [org['name'] for org in user_info['orgs']]
	            print(f"Organizations: {orgs}")
	    except Exception as e:
	        print(f"Not logged in or token issue: {e}")
	        print("Running login...")
	        login()

	    # Stage the files in a throwaway folder laid out as the repo should look.
	    with tempfile.TemporaryDirectory() as tmpdir:
	        staging_dir = Path(tmpdir)

	        # Copy benchmark data
	        data_dir = staging_dir / "data"
	        data_dir.mkdir()
	        shutil.copy(args.benchmark_path, data_dir / "benchmark.json")
	        print("Prepared data/benchmark.json")

	        # Copy README (optional: skipped when the file is absent)
	        readme_src = Path("dataset/README.md")
	        if readme_src.exists():
	            shutil.copy(readme_src, staging_dir / "README.md")
	            print("Prepared README.md")

	        # Create repo if needed. Best effort: a failure is only reported,
	        # and the upload below is still attempted.
	        api = HfApi()
	        try:
	            api.create_repo(
	                repo_id=args.repo_id,
	                repo_type="dataset",
	                private=args.private,
	                exist_ok=True,
	            )
	            print(f"Repository verified: {args.repo_id}")
	        except Exception as e:
	            print(f"Note: {e}")

	        # Upload the folder (create PR if we don't have direct write access)
	        print(f"\nUploading to HuggingFace Hub: {args.repo_id}...")
	        commit_info = upload_folder(
	            folder_path=str(staging_dir),
	            repo_id=args.repo_id,
	            repo_type="dataset",
	            create_pr=True,  # Create a PR instead of direct commit
	            commit_message="Add Rabbinic Hebrew-English parallel corpus",
	        )

	        if commit_info.pr_url:
	            print(f"\n📝 Pull Request created: {commit_info.pr_url}")
	            print(" Ask an org admin to merge it.")

	        print("\n✅ Dataset uploaded successfully!")
	        print(f" View at: https://huggingface.co/datasets/{args.repo_id}")

	    return 0


	if __name__ == "__main__":
	    # Propagate main()'s return value as the process exit status.
	    # SystemExit is raised directly because the bare exit() helper is
	    # installed by the `site` module for interactive use and is not
	    # guaranteed to exist (e.g. under `python -S`).
	    raise SystemExit(main())