File size: 6,214 Bytes
c842956
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#!/usr/bin/env python3
# /// script
# dependencies = [
#   "rich",
#   "python-dotenv",
#   "huggingface-hub",
# ]
# ///
"""
Download a dataset from source and upload to target dataset.

This script:
1. Downloads a source dataset to eval-results (replacing existing content)
2. Uploads the downloaded content to a target dataset

Usage:

```bash
# Using environment variables
uv run scripts/sync_dataset.py

# Or with command line arguments
uv run scripts/sync_dataset.py --source-dataset owner/source-dataset --target-dataset owner/target-dataset
```
"""

import argparse
import os
import shutil
from pathlib import Path

from dotenv import load_dotenv
from huggingface_hub import HfApi, snapshot_download
from rich import print

# Load environment variables from a local .env file, if present (HF_TOKEN, etc.).
load_dotenv()

# Configuration
LOCAL_FOLDER = Path("eval-results")  # local staging directory for the synced files
REPO_TYPE = "dataset"  # both source and target repos are HF datasets

# Fallbacks used when neither CLI flags nor SOURCE_DATASET/TARGET_DATASET env vars are set.
DEFAULT_SOURCE_DATASET = "lmms-lab-si/EASI-Leaderboard-Results"
DEFAULT_TARGET_DATASET = "y-playground/results"


def arg_parser():
    """Build and evaluate the CLI, with env-var fallbacks for both dataset IDs."""
    # Environment variables win over hard-coded defaults; CLI flags win over both.
    source_default = os.getenv("SOURCE_DATASET") or DEFAULT_SOURCE_DATASET
    target_default = os.getenv("TARGET_DATASET") or DEFAULT_TARGET_DATASET

    parser = argparse.ArgumentParser(description="Download a dataset and upload to another dataset")
    parser.add_argument(
        "--source-dataset",
        type=str,
        default=source_default,
        help="Source dataset ID (e.g., owner/source-dataset). Can also be set via SOURCE_DATASET env var.",
    )
    parser.add_argument(
        "--target-dataset",
        type=str,
        default=target_default,
        help="Target dataset ID (e.g., owner/target-dataset). Can also be set via TARGET_DATASET env var.",
    )
    return parser.parse_args()


def download_dataset(
    source_repo_id: str,
    local_dir: Path,
    hf_token: str,
) -> None:
    """Download only the latest version of a dataset from HuggingFace Hub to local directory.

    This function downloads only the latest version (main branch) and ignores historical versions.

    Args:
        source_repo_id: Hub dataset ID to download (e.g. "owner/name").
        local_dir: Local destination directory; removed first if it already exists.
        hf_token: Hugging Face access token used for authentication.
    """
    print(f"Downloading dataset (latest version only): [magenta]{source_repo_id}[/magenta]")
    print(f"Destination: [magenta]{local_dir}[/magenta]")
    print()

    # Remove existing directory so files deleted upstream don't linger locally.
    if local_dir.exists():
        print(f"Removing existing directory: [magenta]{local_dir}[/magenta]")
        shutil.rmtree(local_dir)
        print(f"[green]βœ“[/green] Directory removed [magenta]{local_dir}[/magenta]")

    # Create parent directory if needed
    local_dir.parent.mkdir(parents=True, exist_ok=True)

    # Download only the latest version of the dataset.
    # NOTE: the previously-passed `local_dir_use_symlinks` and `resume_download`
    # kwargs are deprecated in huggingface_hub (they are ignored or emit
    # warnings, and `resume_download` conflicts with `force_download=True`),
    # so they are omitted; force_download=True already guarantees a clean,
    # non-cached copy into `local_dir`.
    snapshot_download(
        repo_id=source_repo_id,
        local_dir=str(local_dir),
        repo_type=REPO_TYPE,
        token=hf_token,
        revision="main",  # Explicitly download only the latest version (main branch)
        force_download=True,  # Force re-download to avoid cached deleted files
    )

    print(
        f"[green]βœ“[/green] Successfully downloaded latest version of [magenta]{source_repo_id}[/magenta] to [magenta]{local_dir}[/magenta]"
    )


def upload_dataset(
    local_dir: Path,
    target_repo_id: str,
    hf_token: str,
    path_in_repo: str | None = None,
) -> None:
    """Upload local directory to HuggingFace Hub dataset."""
    # Fail fast with a precise error before touching the network.
    if not local_dir.exists():
        raise FileNotFoundError(f"Local folder not found: {local_dir}")
    if not local_dir.is_dir():
        raise ValueError(f"Path is not a directory: {local_dir}")

    # Default to the repository root when no sub-path was requested.
    upload_path = path_in_repo or "."

    print(f"Uploading folder: [magenta]{local_dir}[/magenta]")
    print(f"Destination: [magenta]{target_repo_id}/{upload_path}[/magenta] (type: [magenta]{REPO_TYPE}[/magenta])")
    print()

    # Authenticated client for the Hub, then push the whole folder in one commit.
    hub_client = HfApi(token=hf_token)
    hub_client.upload_folder(
        folder_path=str(local_dir),
        repo_id=target_repo_id,
        repo_type=REPO_TYPE,
        path_in_repo=upload_path,
        commit_message=f"Sync dataset from {local_dir}",
    )

    print(
        f"[green]βœ“[/green] Successfully uploaded [magenta]{local_dir}[/magenta] to [magenta]{target_repo_id}/{upload_path}[/magenta]"
    )
    print(f"  View at: [magenta]https://huggingface.co/datasets/{target_repo_id}[/magenta]")


def main():
    """Entry point: validate configuration, download the source, upload to the target."""
    args = arg_parser()

    banner = "=" * 60
    rule = "-" * 60

    # The Hub token is mandatory for both the download and the upload step.
    hf_token = os.getenv("HF_TOKEN")
    if not hf_token:
        raise ValueError(
            "HF_TOKEN environment variable is not set. "
            "Please set it in your .env file or export it as an environment variable."
        )

    # Guard against empty overrides (e.g. SOURCE_DATASET="" in the environment).
    if not args.source_dataset:
        raise ValueError(
            "Source dataset ID is required. "
            "Set it via --source-dataset argument or SOURCE_DATASET environment variable."
        )
    if not args.target_dataset:
        raise ValueError(
            "Target dataset ID is required. "
            "Set it via --target-dataset argument or TARGET_DATASET environment variable."
        )

    # Summary header so the run's configuration is visible in the log.
    print(banner)
    print("Dataset Sync Script")
    print(banner)
    print(f"Source:       [magenta]{args.source_dataset}[/magenta]")
    print(f"Target:       [magenta]{args.target_dataset}[/magenta]")
    print(f"Local folder: [magenta]{LOCAL_FOLDER}[/magenta]")
    print(banner)
    print()

    # Step 1: pull the latest revision of the source into the local staging folder.
    print("Step 1: Downloading from source dataset...")
    print(rule)
    download_dataset(
        source_repo_id=args.source_dataset,
        local_dir=LOCAL_FOLDER,
        hf_token=hf_token,
    )
    print()

    # Step 2: push the staged folder to the target repository.
    print("Step 2: Uploading to target dataset...")
    print(rule)
    upload_dataset(
        local_dir=LOCAL_FOLDER,
        target_repo_id=args.target_dataset,
        hf_token=hf_token,
    )
    print()

    print(banner)
    print("[green]βœ“[/green] Dataset sync completed successfully!")
    print(banner)


# Run the sync only when executed as a script, not when imported.
if __name__ == "__main__":
    main()