"""
Download script for CSI-4CAST datasets.
This script downloads all available datasets from the CSI-4CAST Hugging Face organization
by checking for all possible combinations of channel models, delay spreads, and speeds.
Usage:
python3 download.py [--output-dir OUTPUT_DIR]
If no arguments provided, it will download datasets to a 'datasets' folder.
"""
import argparse
from pathlib import Path
from huggingface_hub import HfApi, snapshot_download
from tqdm import tqdm
# Configuration constants
ORG = "CSI-4CAST"  # Hugging Face organization hosting the dataset repos

# Regular dataset parameters (train_* and test_regular_* repos)
LIST_CHANNEL_MODEL = ["A", "C", "D"]  # channel model identifiers
LIST_DELAY_SPREAD = [30e-9, 100e-9, 300e-9]  # delay spreads in seconds
LIST_MIN_SPEED = [1, 10, 30]  # minimum speeds in km/h
# Generalization dataset parameters (test_generalization_* repos) — a superset
# of the regular grid, used to probe out-of-distribution conditions
LIST_CHANNEL_MODEL_GEN = ["A", "B", "C", "D", "E"]
LIST_DELAY_SPREAD_GEN = [30e-9, 50e-9, 100e-9, 200e-9, 300e-9, 400e-9]
# Multiples of 3 from 3 to 45 km/h, plus the regular 1 and 10 km/h points
LIST_MIN_SPEED_GEN = sorted([*range(3, 46, 3), 1, 10])
def make_folder_name(cm: str, ds: float, ms: int, **kwargs) -> str:
    """Build the canonical folder name for one (channel model, delay spread, speed) triple.

    Args:
        cm (str): Channel model identifier (e.g., 'A', 'B', 'C', 'D', 'E')
        ds (float): Delay spread in seconds (e.g., 30e-9, 100e-9, 300e-9)
        ms (int): Minimum speed in km/h (e.g., 1, 10, 30)
        **kwargs: Ignored; accepted for call-site flexibility

    Returns:
        str: Name of the form 'cm_{cm}_ds_{DDD}_ms_{MMM}', where the delay
            spread (converted to whole nanoseconds) and the minimum speed
            are both zero-padded to 3 digits

    Example:
        >>> make_folder_name('A', 30e-9, 10)
        'cm_A_ds_030_ms_010'
    """
    # Delay spread is expressed in integer nanoseconds for the folder name.
    ds_ns = str(round(ds * 1e9)).zfill(3)
    ms_pad = str(ms).zfill(3)
    return f"cm_{cm}_ds_{ds_ns}_ms_{ms_pad}"
def check_repo_exists(api: HfApi, repo_id: str) -> bool:
    """Return True if *repo_id* exists as a dataset repo in the organization."""
    try:
        api.repo_info(repo_id, repo_type="dataset")
    except Exception:
        # Best-effort probe: any failure (missing repo, auth, network)
        # is treated as "not available".
        return False
    return True
def generate_dataset_combinations():
    """Generate all possible dataset repo names.

    Covers the 'stats' repo plus every (channel model, delay spread,
    minimum speed) combination for the three dataset families.

    Returns:
        list[str]: Candidate repo names (not all necessarily exist upstream).
    """
    # Always include the stats dataset first.
    combinations = ["stats"]
    # (prefix, channel models, delay spreads, min speeds) for each family;
    # train and test regular share the same parameter grid.
    families = [
        ("train_regular", LIST_CHANNEL_MODEL, LIST_DELAY_SPREAD, LIST_MIN_SPEED),
        ("test_regular", LIST_CHANNEL_MODEL, LIST_DELAY_SPREAD, LIST_MIN_SPEED),
        ("test_generalization", LIST_CHANNEL_MODEL_GEN, LIST_DELAY_SPREAD_GEN, LIST_MIN_SPEED_GEN),
    ]
    for prefix, channel_models, delay_spreads, min_speeds in families:
        for cm in channel_models:
            for ds in delay_spreads:
                for ms in min_speeds:
                    combinations.append(f"{prefix}_{make_folder_name(cm, ds, ms)}")
    return combinations
def download_dataset(api: HfApi, org: str, repo_name: str, output_dir: Path, dry_run: bool = False) -> bool:
    """Download a single dataset repo if it exists upstream.

    Args:
        api: Hugging Face API client used to probe repo existence.
        org: Organization name the repo belongs to.
        repo_name: Repo name (without the organization prefix).
        output_dir: Local directory under which the repo folder is created.
        dry_run: When True, only create an empty placeholder file instead
            of downloading.

    Returns:
        bool: True if the repo exists and was downloaded (or placeholder
        created); False if it does not exist or the download failed.
    """
    repo_id = f"{org}/{repo_name}"
    if not check_repo_exists(api, repo_id):
        return False
    try:
        # Create target directory
        target_dir = output_dir / repo_name
        target_dir.mkdir(parents=True, exist_ok=True)
        if dry_run:
            # Create empty placeholder file so the folder layout can be inspected
            placeholder_file = target_dir / "placeholder.txt"
            placeholder_file.write_text("")
            print(f"✅ Dry run - Created placeholder: {repo_name}")
        else:
            # Download the full dataset snapshot into the target directory
            snapshot_download(
                repo_id=repo_id,
                repo_type="dataset",
                local_dir=target_dir,
                local_dir_use_symlinks=False  # real files, not cache symlinks
            )
            print(f"✅ Downloaded: {repo_name}")
        return True
    except Exception as e:
        # Report and continue so one bad repo doesn't abort the whole run.
        print(f"❌ Error downloading {repo_name}: {e}")
        return False
def main():
    """CLI entry point: probe every candidate repo and download the ones that exist."""
    parser = argparse.ArgumentParser(description="Download all CSI-4CAST datasets from Hugging Face")
    parser.add_argument("--output-dir", "-o", default="datasets",
                        help="Output directory for downloaded datasets (default: 'datasets')")
    parser.add_argument("--dry-run", action="store_true",
                        help="Dry run mode: create empty placeholder files instead of downloading")
    args = parser.parse_args()

    output_dir = Path(args.output_dir).resolve()
    org = ORG

    mode = "Dry run" if args.dry_run else "Downloading"
    print(f"{mode} datasets from organization: {org}")
    print(f"Output directory: {output_dir}")
    print()

    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)

    # Initialize Hugging Face API (anonymous unless a token is configured)
    api = HfApi()

    # Generate all possible combinations
    print("Generating dataset combinations...")
    combinations = generate_dataset_combinations()
    print(f"Total possible combinations: {len(combinations)}")
    print()

    # Download datasets (or create placeholders in dry-run mode)
    action = "Checking and creating placeholders for" if args.dry_run else "Checking and downloading"
    print(f"{action} existing datasets...")

    downloaded_count = 0
    skipped_count = 0
    for repo_name in tqdm(combinations, desc="Processing datasets"):
        if download_dataset(api, org, repo_name, output_dir, args.dry_run):
            downloaded_count += 1
        else:
            skipped_count += 1

    print()
    if args.dry_run:
        print("🎉 Dry run complete!")
        print(f"✅ Created placeholders: {downloaded_count} datasets")
        print(f"⏭️ Skipped: {skipped_count} datasets (not found)")
        print(f"📁 Placeholders saved to: {output_dir}")
    else:
        print("🎉 Download complete!")
        print(f"✅ Downloaded: {downloaded_count} datasets")
        print(f"⏭️ Skipped: {skipped_count} datasets (not found)")
        print(f"📁 Datasets saved to: {output_dir}")
        print()
        print("To reconstruct the original folder structure, run:")
        print(f"python3 reconstruction.py --input-dir {output_dir}")


if __name__ == "__main__":
    main()