|
|
|
|
|
""" |
|
|
Upload the Rabbinic benchmark dataset to Hugging Face Hub. |
|
|
|
|
|
Usage: |
|
|
python upload_dataset.py --repo-id YOUR_USERNAME/rabbinic-benchmark |
|
|
""" |
|
|
|
|
|
import argparse |
|
|
import json |
|
|
import shutil |
|
|
import tempfile |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
def main():
    """Upload the Rabbinic benchmark dataset to the Hugging Face Hub.

    Parses CLI arguments, verifies Hugging Face authentication (falling back
    to an interactive login), stages the benchmark JSON (and the dataset
    README, if present) in a temporary directory, then uploads the staged
    folder to a dataset repo as a pull request.

    Returns:
        int: 0 on success; 1 if huggingface_hub is not installed or the
        benchmark file cannot be found.
    """
    parser = argparse.ArgumentParser(
        description="Upload Rabbinic benchmark dataset to Hugging Face Hub"
    )
    parser.add_argument(
        "--repo-id",
        type=str,
        required=True,
        help="HuggingFace repo ID (e.g., 'username/rabbinic-benchmark')",
    )
    parser.add_argument(
        "--benchmark-path",
        type=str,
        default="benchmark_data/benchmark.json",
        help="Path to benchmark JSON file",
    )
    parser.add_argument(
        "--private",
        action="store_true",
        help="Make the dataset private",
    )

    args = parser.parse_args()

    # Import lazily so a missing dependency produces a helpful install hint
    # instead of a bare ImportError traceback.
    try:
        from huggingface_hub import HfApi, upload_folder, login, whoami
    except ImportError:
        print("Required packages not installed. Run:")
        print("  pip install huggingface_hub")
        return 1

    # Verify authentication; on any token problem fall back to an
    # interactive login prompt.
    try:
        user_info = whoami()
        print(f"Logged in as: {user_info['name']}")
        if 'orgs' in user_info:
            orgs = [org['name'] for org in user_info['orgs']]
            print(f"Organizations: {orgs}")
    except Exception as e:
        print(f"Not logged in or token issue: {e}")
        print("Running login...")
        login()

    # Fail early with a clear message rather than a raw FileNotFoundError.
    if not Path(args.benchmark_path).is_file():
        print(f"Benchmark file not found: {args.benchmark_path}")
        return 1

    # Load only to report the pair count; the file itself is uploaded as-is.
    print(f"Loading benchmark from {args.benchmark_path}...")
    with open(args.benchmark_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    print(f"Loaded {len(data)} pairs")

    with tempfile.TemporaryDirectory() as tmpdir:
        tmp_path = Path(tmpdir)

        # Stage the benchmark under data/ as the repo's canonical layout.
        data_dir = tmp_path / "data"
        data_dir.mkdir()
        shutil.copy(args.benchmark_path, data_dir / "benchmark.json")
        print("Prepared data/benchmark.json")

        # Include the dataset card if one exists locally.
        readme_src = Path("dataset/README.md")
        if readme_src.exists():
            shutil.copy(readme_src, tmp_path / "README.md")
            print("Prepared README.md")

        # Create (or confirm) the target repo before uploading.
        api = HfApi()
        try:
            api.create_repo(
                repo_id=args.repo_id,
                repo_type="dataset",
                private=args.private,
                exist_ok=True,
            )
            print(f"Repository verified: {args.repo_id}")
        except Exception as e:
            # Non-fatal (e.g. no create rights on an existing org repo);
            # upload_folder below will surface any real failure.
            print(f"Note: {e}")

        print(f"\nUploading to HuggingFace Hub: {args.repo_id}...")
        commit_info = upload_folder(
            folder_path=str(tmp_path),
            repo_id=args.repo_id,
            repo_type="dataset",
            create_pr=True,
            commit_message="Add Rabbinic Hebrew-English parallel corpus",
        )

    if commit_info.pr_url:
        print(f"\n📝 Pull Request created: {commit_info.pr_url}")
        print("   Ask an org admin to merge it.")

    print("\n✅ Dataset uploaded successfully!")
    print(f"   View at: https://huggingface.co/datasets/{args.repo_id}")

    return 0
|
|
|
|
|
|
|
|
if __name__ == "__main__": |
|
|
exit(main()) |
|
|
|