Spaces:
Running
Running
File size: 6,214 Bytes
c842956 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
#!/usr/bin/env python3
# /// script
# dependencies = [
# "rich",
# "python-dotenv",
# "huggingface-hub",
# ]
# ///
"""
Download a dataset from source and upload to target dataset.
This script:
1. Downloads a source dataset to eval-results (replacing existing content)
2. Uploads the downloaded content to a target dataset
Usage:
```bash
# Using environment variables
uv run scripts/sync_dataset.py
# Or with command line arguments
uv run scripts/sync_dataset.py --source-dataset owner/source-dataset --target-dataset owner/target-dataset
```
"""
import argparse
import os
import shutil
from pathlib import Path
from dotenv import load_dotenv
from huggingface_hub import HfApi, snapshot_download
from rich import print
# Load environment variables (python-dotenv; by default this reads a `.env` file).
# This has to run before main() / arg_parser() read HF_TOKEN, SOURCE_DATASET and
# TARGET_DATASET through os.getenv.
load_dotenv()
# Configuration
# Local directory the source dataset is downloaded into and then uploaded from.
LOCAL_FOLDER = Path("eval-results")
# Passed as `repo_type` to both snapshot_download() and HfApi.upload_folder().
REPO_TYPE = "dataset"
# Fallback dataset IDs, used when neither the command-line flag nor the matching
# environment variable supplies a non-empty value.
DEFAULT_SOURCE_DATASET = "lmms-lab-si/EASI-Leaderboard-Results"
DEFAULT_TARGET_DATASET = "y-playground/results"
def arg_parser():
    """Parse the command line and return the resulting namespace.

    Each option falls back to its environment variable (``SOURCE_DATASET`` /
    ``TARGET_DATASET``) and, when that is unset or empty, to the module-level
    default dataset ID.
    """
    # (flag, environment variable, fallback value, help text)
    option_specs = (
        (
            "--source-dataset",
            "SOURCE_DATASET",
            DEFAULT_SOURCE_DATASET,
            "Source dataset ID (e.g., owner/source-dataset). Can also be set via SOURCE_DATASET env var.",
        ),
        (
            "--target-dataset",
            "TARGET_DATASET",
            DEFAULT_TARGET_DATASET,
            "Target dataset ID (e.g., owner/target-dataset). Can also be set via TARGET_DATASET env var.",
        ),
    )

    cli = argparse.ArgumentParser(description="Download a dataset and upload to another dataset")
    for flag, env_name, fallback, help_text in option_specs:
        cli.add_argument(
            flag,
            type=str,
            default=os.getenv(env_name) or fallback,
            help=help_text,
        )
    return cli.parse_args()
def download_dataset(
    source_repo_id: str,
    local_dir: Path,
    hf_token: str,
) -> None:
    """Download the current ``main`` revision of a dataset into ``local_dir``.

    Whatever already exists at ``local_dir`` is deleted first, so files that
    were removed upstream do not linger in the local copy.

    Args:
        source_repo_id: Hub ID of the dataset to download (``owner/name``).
        local_dir: Destination path; any existing content there is removed.
        hf_token: Hugging Face access token passed to ``snapshot_download``.
    """
    print(f"Downloading dataset (latest version only): [magenta]{source_repo_id}[/magenta]")
    print(f"Destination: [magenta]{local_dir}[/magenta]")
    print()

    # Remove existing directory if it exists
    if local_dir.exists():
        print(f"Removing existing directory: [magenta]{local_dir}[/magenta]")
        if local_dir.is_dir() and not local_dir.is_symlink():
            shutil.rmtree(local_dir)
        else:
            # shutil.rmtree() refuses regular files and symlinks; a stray file
            # or link at this path would otherwise abort the whole sync.
            local_dir.unlink()
        print(f"[green]✓[/green] Directory removed [magenta]{local_dir}[/magenta]")

    # Create parent directory if needed
    local_dir.parent.mkdir(parents=True, exist_ok=True)

    # Download only the latest version of the dataset.
    # `local_dir_use_symlinks` and `resume_download` are intentionally not
    # passed: recent huggingface_hub releases deprecate and ignore both (files
    # are always written directly into `local_dir`, and `force_download`
    # already bypasses cached files).
    snapshot_download(
        repo_id=source_repo_id,
        local_dir=str(local_dir),
        repo_type=REPO_TYPE,
        token=hf_token,
        revision="main",  # Explicitly download only the latest version (main branch)
        force_download=True,  # Force re-download to avoid cached deleted files
    )
    print(
        f"[green]✓[/green] Successfully downloaded latest version of [magenta]{source_repo_id}[/magenta] to [magenta]{local_dir}[/magenta]"
    )
def upload_dataset(
local_dir: Path,
target_repo_id: str,
hf_token: str,
path_in_repo: str | None = None,
) -> None:
"""Upload local directory to HuggingFace Hub dataset."""
# Validate local folder exists
if not local_dir.exists():
raise FileNotFoundError(f"Local folder not found: {local_dir}")
if not local_dir.is_dir():
raise ValueError(f"Path is not a directory: {local_dir}")
# Initialize Hugging Face API
api = HfApi(token=hf_token)
upload_path = path_in_repo if path_in_repo else "."
print(f"Uploading folder: [magenta]{local_dir}[/magenta]")
print(f"Destination: [magenta]{target_repo_id}/{upload_path}[/magenta] (type: [magenta]{REPO_TYPE}[/magenta])")
print()
# Upload folder
api.upload_folder(
folder_path=str(local_dir),
repo_id=target_repo_id,
repo_type=REPO_TYPE,
path_in_repo=upload_path,
commit_message=f"Sync dataset from {local_dir}",
)
print(
f"[green]β[/green] Successfully uploaded [magenta]{local_dir}[/magenta] to [magenta]{target_repo_id}/{upload_path}[/magenta]"
)
print(f" View at: [magenta]https://huggingface.co/datasets/{target_repo_id}[/magenta]")
def main():
    """Run the two-step sync: download the source dataset, then upload it to the target.

    The dataset IDs come from ``arg_parser()``; the access token is read from
    the ``HF_TOKEN`` environment variable. The download lands in
    ``LOCAL_FOLDER`` and the same folder is then uploaded.

    Raises:
        ValueError: If ``HF_TOKEN`` is not set, or if the source or target
            dataset ID is empty.
    """
    args = arg_parser()

    # Get Hugging Face token from environment
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError(
            "HF_TOKEN environment variable is not set. "
            "Please set it in your .env file or export it as an environment variable."
        )

    # Validate required arguments (only reachable when an ID is explicitly empty,
    # because arg_parser() already falls back to built-in defaults)
    if not args.source_dataset:
        raise ValueError(
            "Source dataset ID is required. "
            "Set it via --source-dataset argument or SOURCE_DATASET environment variable."
        )
    if not args.target_dataset:
        raise ValueError(
            "Target dataset ID is required. "
            "Set it via --target-dataset argument or TARGET_DATASET environment variable."
        )

    print("=" * 60)
    print("Dataset Sync Script")
    print("=" * 60)
    print(f"Source: [magenta]{args.source_dataset}[/magenta]")
    print(f"Target: [magenta]{args.target_dataset}[/magenta]")
    print(f"Local folder: [magenta]{LOCAL_FOLDER}[/magenta]")
    print("=" * 60)
    print()

    # Step 1: Download from source
    print("Step 1: Downloading from source dataset...")
    print("-" * 60)
    download_dataset(
        source_repo_id=args.source_dataset,
        local_dir=LOCAL_FOLDER,
        hf_token=hf_token,
    )
    print()

    # Step 2: Upload to target
    print("Step 2: Uploading to target dataset...")
    print("-" * 60)
    upload_dataset(
        local_dir=LOCAL_FOLDER,
        target_repo_id=args.target_dataset,
        hf_token=hf_token,
    )
    print()

    print("=" * 60)
    print("[green]✓[/green] Dataset sync completed successfully!")
    print("=" * 60)
# Run the sync only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|