"""
Download script for CSI-4CAST datasets.

This script downloads all available datasets from the CSI-4CAST Hugging Face organization
by checking for all possible combinations of channel models, delay spreads, and speeds.

Usage:
    python3 download.py [--output-dir OUTPUT_DIR] [--dry-run]

If no arguments are provided, datasets are downloaded to a 'datasets' folder.
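
Requires the huggingface_hub and tqdm packages (pip install huggingface_hub tqdm).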
"""

import argparse
from pathlib import Path

from huggingface_hub import HfApi, snapshot_download
from tqdm import tqdm

# Configuration constants
ORG = "CSI-4CAST"

# Regular dataset parameters
LIST_CHANNEL_MODEL = ["A", "C", "D"]
LIST_DELAY_SPREAD = [30e-9, 100e-9, 300e-9]
LIST_MIN_SPEED = [1, 10, 30]

# Generalization dataset parameters
LIST_CHANNEL_MODEL_GEN = ["A", "B", "C", "D", "E"]
LIST_DELAY_SPREAD_GEN = [30e-9, 50e-9, 100e-9, 200e-9, 300e-9, 400e-9]
LIST_MIN_SPEED_GEN = sorted([*range(3, 46, 3), 1, 10])
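# expands to [1, 3, 6, 9, 10, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45] (17 speeds)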

def make_folder_name(cm: str, ds: float, ms: int, **kwargs) -> str:
    """Generate a standardized folder name based on channel model, delay spread, and minimum speed.

    Args:
        cm (str): Channel model identifier (e.g., 'A', 'B', 'C', 'D', 'E')
        ds (float): Delay spread in seconds (e.g., 30e-9, 100e-9, 300e-9)
        ms (int): Minimum speed in km/h (e.g., 1, 10, 30)
        **kwargs: Additional keyword arguments (unused)

    Returns:
        str: Formatted folder name in the format 'cm_{cm}_ds_{ds}_ms_{ms}'
             where ds is converted to nanoseconds and zero-padded to 3 digits,
             and ms is zero-padded to 3 digits

    Example:
        >>> make_folder_name('A', 30e-9, 10)
        'cm_A_ds_030_ms_010'
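        >>> make_folder_name('E', 400e-9, 45)
        'cm_E_ds_400_ms_045'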
    """
    # convert the delay spread to integer nanoseconds, zero-padded to 3 digits
    ds_ns = round(ds * 1e9)
    ds_str = str(ds_ns).zfill(3)

    # zero-pad the minimum speed (km/h) to 3 digits
    ms_str = str(ms).zfill(3)

    return f"cm_{cm}_ds_{ds_str}_ms_{ms_str}"

def check_repo_exists(api: HfApi, repo_id: str) -> bool:
    """Check if a repository exists in the organization."""
    try:
        api.repo_info(repo_id, repo_type="dataset")
        return True
    except Exception:
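        # repo_info raises RepositoryNotFoundError for missing repos; treat
        # any failure (including network errors) here as "not available"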
        return False

def generate_dataset_combinations():
    """Generate all candidate dataset repository names.

    Produces 1 stats repo, 27 train_regular and 27 test_regular names
    (3 channel models x 3 delay spreads x 3 speeds), and 510
    test_generalization names (5 x 6 x 17), i.e. 565 candidates in total.
    """
    combinations = ["stats"]

    # Train and test regular datasets share the same parameter grid
    for prefix in ("train_regular", "test_regular"):
        for cm in LIST_CHANNEL_MODEL:
            for ds in LIST_DELAY_SPREAD:
                for ms in LIST_MIN_SPEED:
                    folder_name = make_folder_name(cm, ds, ms)
                    combinations.append(f"{prefix}_{folder_name}")

    # Test generalization datasets use a wider parameter grid
    for cm in LIST_CHANNEL_MODEL_GEN:
        for ds in LIST_DELAY_SPREAD_GEN:
            for ms in LIST_MIN_SPEED_GEN:
                folder_name = make_folder_name(cm, ds, ms)
                combinations.append(f"test_generalization_{folder_name}")

    return combinations

def download_dataset(api: HfApi, org: str, repo_name: str, output_dir: Path, dry_run: bool = False) -> bool:
    """Download a single dataset if it exists."""
    repo_id = f"{org}/{repo_name}"
    
    if not check_repo_exists(api, repo_id):
        return False
    
    try:
        # Create target directory
        target_dir = output_dir / repo_name
        target_dir.mkdir(parents=True, exist_ok=True)
        
        if dry_run:
            # Create empty placeholder file
            placeholder_file = target_dir / "placeholder.txt"
            placeholder_file.write_text("")
            print(f"βœ… Dry run - Created placeholder: {repo_name}")
        else:
            # Download the dataset snapshot into the target directory
            # (local_dir_use_symlinks is deprecated in recent huggingface_hub
            # releases; files are written directly into local_dir by default)
            snapshot_download(
                repo_id=repo_id,
                repo_type="dataset",
                local_dir=target_dir,
            )
            print(f"βœ… Downloaded: {repo_name}")
        
        return True
        
    except Exception as e:
        print(f"❌ Error downloading {repo_name}: {e}")
        return False

def main():
    parser = argparse.ArgumentParser(description="Download all CSI-4CAST datasets from Hugging Face")
    parser.add_argument("--output-dir", "-o", default="datasets",
                       help="Output directory for downloaded datasets (default: 'datasets')")
    parser.add_argument("--dry-run", action="store_true",
                       help="Dry run mode: create empty placeholder files instead of downloading")
    
    args = parser.parse_args()
    
    output_dir = Path(args.output_dir).resolve()
    org = ORG
    
    mode = "Dry run" if args.dry_run else "Downloading"
    print(f"{mode} datasets from organization: {org}")
    print(f"Output directory: {output_dir}")
    print()
    
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Initialize Hugging Face API
    api = HfApi()
    
    # Generate all possible combinations
    print("Generating dataset combinations...")
    combinations = generate_dataset_combinations()
    print(f"Total possible combinations: {len(combinations)}")
    print()
    
    # Download datasets
    action = "Checking and creating placeholders for" if args.dry_run else "Checking and downloading"
    print(f"{action} existing datasets...")
    downloaded_count = 0
    skipped_count = 0
    
    for repo_name in tqdm(combinations, desc="Processing datasets"):
        if download_dataset(api, org, repo_name, output_dir, args.dry_run):
            downloaded_count += 1
        else:
            skipped_count += 1
    
    print()
    if args.dry_run:
        print("πŸŽ‰ Dry run complete!")
        print(f"βœ… Created placeholders: {downloaded_count} datasets")
        print(f"⏭️  Skipped: {skipped_count} datasets (not found)")
        print(f"πŸ“ Placeholders saved to: {output_dir}")
    else:
        print("πŸŽ‰ Download complete!")
        print(f"βœ… Downloaded: {downloaded_count} datasets")
        print(f"⏭️  Skipped: {skipped_count} datasets (not found)")
        print(f"πŸ“ Datasets saved to: {output_dir}")
    print()
    print("To reconstruct the original folder structure, run:")
    print(f"python3 reconstruction.py --input-dir {output_dir}")

if __name__ == "__main__":
    main()