#!/usr/bin/env python3
# EASI-Leaderboard / scripts / sync_dataset.py
# Author: yangzhitao (commit c842956)
# feat: add dataset synchronization script to download and upload datasets
# between Hugging Face repositories
# /// script
# dependencies = [
# "rich",
# "python-dotenv",
# "huggingface-hub",
# ]
# ///
"""
Download a dataset from source and upload to target dataset.
This script:
1. Downloads a source dataset to eval-results (replacing existing content)
2. Uploads the downloaded content to a target dataset
Usage:
```bash
# Using environment variables
uv run scripts/sync_dataset.py
# Or with command line arguments
uv run scripts/sync_dataset.py --source-dataset owner/source-dataset --target-dataset owner/target-dataset
```
"""
import argparse
import os
import shutil
from pathlib import Path
from dotenv import load_dotenv
from huggingface_hub import HfApi, snapshot_download
from rich import print
# Load environment variables from a local .env file (if present) so that
# HF_TOKEN, SOURCE_DATASET, and TARGET_DATASET can be configured without
# exporting them in the shell.
load_dotenv()
# Configuration
LOCAL_FOLDER = Path("eval-results")  # local staging directory for the sync
REPO_TYPE = "dataset"  # both source and target repos are Hub datasets
DEFAULT_SOURCE_DATASET = "lmms-lab-si/EASI-Leaderboard-Results"  # fallback source repo ID
DEFAULT_TARGET_DATASET = "y-playground/results"  # fallback target repo ID
def arg_parser():
    """Build the command-line parser and return the parsed arguments.

    Each option falls back to an environment variable and then to a
    hard-coded default, so the script is runnable with no flags at all.
    """
    cli = argparse.ArgumentParser(description="Download a dataset and upload to another dataset")
    cli.add_argument(
        "--source-dataset",
        type=str,
        default=os.getenv("SOURCE_DATASET") or DEFAULT_SOURCE_DATASET,
        help="Source dataset ID (e.g., owner/source-dataset). Can also be set via SOURCE_DATASET env var.",
    )
    cli.add_argument(
        "--target-dataset",
        type=str,
        default=os.getenv("TARGET_DATASET") or DEFAULT_TARGET_DATASET,
        help="Target dataset ID (e.g., owner/target-dataset). Can also be set via TARGET_DATASET env var.",
    )
    # Return the namespace directly; no post-processing is needed.
    return cli.parse_args()
def download_dataset(
    source_repo_id: str,
    local_dir: Path,
    hf_token: str,
) -> None:
    """Download the latest revision (main branch) of a Hub dataset into *local_dir*.

    Any existing content at *local_dir* is removed first, so files deleted in
    the source repo do not linger locally.

    Args:
        source_repo_id: Dataset repo ID on the Hub (e.g. ``owner/name``).
        local_dir: Local destination directory; recreated from scratch.
        hf_token: Hugging Face access token used for the download.
    """
    print(f"Downloading dataset (latest version only): [magenta]{source_repo_id}[/magenta]")
    print(f"Destination: [magenta]{local_dir}[/magenta]")
    print()
    # Remove existing directory if it exists, so stale files don't survive the sync.
    if local_dir.exists():
        print(f"Removing existing directory: [magenta]{local_dir}[/magenta]")
        shutil.rmtree(local_dir)
        print(f"[green]βœ“[/green] Directory removed [magenta]{local_dir}[/magenta]")
    # Create parent directory if needed
    local_dir.parent.mkdir(parents=True, exist_ok=True)
    # NOTE: `local_dir_use_symlinks` and `resume_download` were dropped: both are
    # deprecated in huggingface_hub (local_dir downloads stopped using symlinks
    # in v0.23, and `resume_download` is ignored/removed in recent releases), so
    # passing them only produced warnings or errors on current versions.
    snapshot_download(
        repo_id=source_repo_id,
        local_dir=str(local_dir),
        repo_type=REPO_TYPE,
        token=hf_token,
        revision="main",  # explicitly download only the latest version (main branch)
        force_download=True,  # force re-download to avoid cached deleted files
    )
    print(
        f"[green]βœ“[/green] Successfully downloaded latest version of [magenta]{source_repo_id}[/magenta] to [magenta]{local_dir}[/magenta]"
    )
def upload_dataset(
local_dir: Path,
target_repo_id: str,
hf_token: str,
path_in_repo: str | None = None,
) -> None:
"""Upload local directory to HuggingFace Hub dataset."""
# Validate local folder exists
if not local_dir.exists():
raise FileNotFoundError(f"Local folder not found: {local_dir}")
if not local_dir.is_dir():
raise ValueError(f"Path is not a directory: {local_dir}")
# Initialize Hugging Face API
api = HfApi(token=hf_token)
upload_path = path_in_repo if path_in_repo else "."
print(f"Uploading folder: [magenta]{local_dir}[/magenta]")
print(f"Destination: [magenta]{target_repo_id}/{upload_path}[/magenta] (type: [magenta]{REPO_TYPE}[/magenta])")
print()
# Upload folder
api.upload_folder(
folder_path=str(local_dir),
repo_id=target_repo_id,
repo_type=REPO_TYPE,
path_in_repo=upload_path,
commit_message=f"Sync dataset from {local_dir}",
)
print(
f"[green]βœ“[/green] Successfully uploaded [magenta]{local_dir}[/magenta] to [magenta]{target_repo_id}/{upload_path}[/magenta]"
)
print(f" View at: [magenta]https://huggingface.co/datasets/{target_repo_id}[/magenta]")
def main():
    """Entry point: download the source dataset, then push it to the target."""
    opts = arg_parser()
    # A Hub token is mandatory for both the download and the upload.
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError(
            "HF_TOKEN environment variable is not set. "
            "Please set it in your .env file or export it as an environment variable."
        )
    # Guard against explicitly-empty dataset IDs.
    if not opts.source_dataset:
        raise ValueError(
            "Source dataset ID is required. "
            "Set it via --source-dataset argument or SOURCE_DATASET environment variable."
        )
    if not opts.target_dataset:
        raise ValueError(
            "Target dataset ID is required. "
            "Set it via --target-dataset argument or TARGET_DATASET environment variable."
        )
    rule = "=" * 60
    print(rule)
    print("Dataset Sync Script")
    print(rule)
    print(f"Source: [magenta]{opts.source_dataset}[/magenta]")
    print(f"Target: [magenta]{opts.target_dataset}[/magenta]")
    print(f"Local folder: [magenta]{LOCAL_FOLDER}[/magenta]")
    print(rule)
    print()
    # Step 1: Download from source
    print("Step 1: Downloading from source dataset...")
    print("-" * 60)
    download_dataset(
        source_repo_id=opts.source_dataset,
        local_dir=LOCAL_FOLDER,
        hf_token=hf_token,
    )
    print()
    # Step 2: Upload to target
    print("Step 2: Uploading to target dataset...")
    print("-" * 60)
    upload_dataset(
        local_dir=LOCAL_FOLDER,
        target_repo_id=opts.target_dataset,
        hf_token=hf_token,
    )
    print()
    print(rule)
    print("[green]βœ“[/green] Dataset sync completed successfully!")
    print(rule)
if __name__ == "__main__":
    main()