#!/usr/bin/env python3
# /// script
# dependencies = [
#     "rich",
#     "python-dotenv",
#     "huggingface-hub",
# ]
# ///
| """ | |
| Download a dataset from source and upload to target dataset. | |
| This script: | |
| 1. Downloads a source dataset to eval-results (replacing existing content) | |
| 2. Uploads the downloaded content to a target dataset | |
| Usage: | |
| ```bash | |
| # Using environment variables | |
| uv run scripts/sync_dataset.py | |
| # Or with command line arguments | |
| uv run scripts/sync_dataset.py --source-dataset owner/source-dataset --target-dataset owner/target-dataset | |
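
# Environment variables can also be passed inline (hypothetical values):
SOURCE_DATASET=owner/source-dataset TARGET_DATASET=owner/target-dataset uv run scripts/sync_dataset.py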
```
"""

import argparse
import os
import shutil
from pathlib import Path

from dotenv import load_dotenv
from huggingface_hub import HfApi, snapshot_download
from rich import print

# Load environment variables
load_dotenv()
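
# A .env file picked up by load_dotenv() might look like this (hypothetical values):
#   HF_TOKEN=hf_xxxxxxxxxxxxxxxx
#   SOURCE_DATASET=owner/source-dataset
#   TARGET_DATASET=owner/target-dataset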

# Configuration
LOCAL_FOLDER = Path("eval-results")
REPO_TYPE = "dataset"
DEFAULT_SOURCE_DATASET = "lmms-lab-si/EASI-Leaderboard-Results"
DEFAULT_TARGET_DATASET = "y-playground/results"
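# Note: LOCAL_FOLDER is deleted and re-created on every run (see download_dataset),
# so it should not be used to store anything else.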

def arg_parser():
    parser = argparse.ArgumentParser(description="Download a dataset and upload it to another dataset")
    parser.add_argument(
        "--source-dataset",
        type=str,
        default=os.getenv("SOURCE_DATASET") or DEFAULT_SOURCE_DATASET,
        help="Source dataset ID (e.g., owner/source-dataset). Can also be set via the SOURCE_DATASET env var.",
    )
    parser.add_argument(
        "--target-dataset",
        type=str,
        default=os.getenv("TARGET_DATASET") or DEFAULT_TARGET_DATASET,
        help="Target dataset ID (e.g., owner/target-dataset). Can also be set via the TARGET_DATASET env var.",
    )
    args = parser.parse_args()
    return args

def download_dataset(
    source_repo_id: str,
    local_dir: Path,
    hf_token: str,
) -> None:
    """Download the latest version of a dataset from the Hugging Face Hub to a local directory.

    Only the latest revision (main branch) is downloaded; historical versions are ignored.
    """
    print(f"Downloading dataset (latest version only): [magenta]{source_repo_id}[/magenta]")
    print(f"Destination: [magenta]{local_dir}[/magenta]")
    print()

    # Remove the existing directory, if any, so files deleted upstream don't linger
    if local_dir.exists():
        print(f"Removing existing directory: [magenta]{local_dir}[/magenta]")
        shutil.rmtree(local_dir)
        print(f"[green]✓[/green] Directory removed: [magenta]{local_dir}[/magenta]")

    # Create the parent directory if needed
    local_dir.parent.mkdir(parents=True, exist_ok=True)

    # Download only the latest version of the dataset
    snapshot_download(
        repo_id=source_repo_id,
        local_dir=str(local_dir),
        repo_type=REPO_TYPE,
        token=hf_token,
        revision="main",  # explicitly download only the latest version (main branch)
        force_download=True,  # force re-download to avoid serving stale cached files
    )
    print(
        f"[green]✓[/green] Successfully downloaded latest version of [magenta]{source_repo_id}[/magenta] to [magenta]{local_dir}[/magenta]"
    )
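
# Note: snapshot_download also accepts allow_patterns / ignore_patterns
# (e.g., allow_patterns=["*.json"]) to sync only a subset of files; this
# script deliberately mirrors the full snapshot.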

def upload_dataset(
    local_dir: Path,
    target_repo_id: str,
    hf_token: str,
    path_in_repo: str | None = None,
) -> None:
    """Upload a local directory to a Hugging Face Hub dataset."""
    # Validate that the local folder exists
    if not local_dir.exists():
        raise FileNotFoundError(f"Local folder not found: {local_dir}")
    if not local_dir.is_dir():
        raise ValueError(f"Path is not a directory: {local_dir}")

    # Initialize the Hugging Face API client
    api = HfApi(token=hf_token)
    upload_path = path_in_repo if path_in_repo else "."

    print(f"Uploading folder: [magenta]{local_dir}[/magenta]")
    print(f"Destination: [magenta]{target_repo_id}/{upload_path}[/magenta] (type: [magenta]{REPO_TYPE}[/magenta])")
    print()

    # Upload the folder as a single commit
    api.upload_folder(
        folder_path=str(local_dir),
        repo_id=target_repo_id,
        repo_type=REPO_TYPE,
        path_in_repo=upload_path,
        commit_message=f"Sync dataset from {local_dir}",
    )
    print(
        f"[green]✓[/green] Successfully uploaded [magenta]{local_dir}[/magenta] to [magenta]{target_repo_id}/{upload_path}[/magenta]"
    )
    print(f"  View at: [magenta]https://huggingface.co/datasets/{target_repo_id}[/magenta]")

def main():
    args = arg_parser()

    # Get the Hugging Face token from the environment
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError(
            "HF_TOKEN environment variable is not set. "
            "Please set it in your .env file or export it as an environment variable."
        )
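    # The token must have write access to the target dataset repo;
    # read access to the source is sufficient for the download step.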
    # Validate required arguments
    if not args.source_dataset:
        raise ValueError(
            "Source dataset ID is required. "
            "Set it via the --source-dataset argument or the SOURCE_DATASET environment variable."
        )
    if not args.target_dataset:
        raise ValueError(
            "Target dataset ID is required. "
            "Set it via the --target-dataset argument or the TARGET_DATASET environment variable."
        )

    print("=" * 60)
    print("Dataset Sync Script")
    print("=" * 60)
    print(f"Source: [magenta]{args.source_dataset}[/magenta]")
    print(f"Target: [magenta]{args.target_dataset}[/magenta]")
    print(f"Local folder: [magenta]{LOCAL_FOLDER}[/magenta]")
    print("=" * 60)
    print()
    # Step 1: Download from the source dataset
    print("Step 1: Downloading from source dataset...")
    print("-" * 60)
    download_dataset(
        source_repo_id=args.source_dataset,
        local_dir=LOCAL_FOLDER,
        hf_token=hf_token,
    )
    print()

    # Step 2: Upload to the target dataset
    print("Step 2: Uploading to target dataset...")
    print("-" * 60)
    upload_dataset(
        local_dir=LOCAL_FOLDER,
        target_repo_id=args.target_dataset,
        hf_token=hf_token,
    )
    print()

    print("=" * 60)
    print("[green]✓[/green] Dataset sync completed successfully!")
    print("=" * 60)


if __name__ == "__main__":
    main()