Spaces:

CSI-4CAST
/

README

Running

App Files Files Community

SIKAI-C committed on Oct 13, 2025

Commit

078d201

verified ·

1 Parent(s): bb483aa

Create download.py

Browse files

Files changed (1) hide show

download.py +192 -0

download.py ADDED Viewed

	@@ -0,0 +1,192 @@

+"""
+Download script for CSI-4CAST datasets.
+This script downloads all available datasets from the CSI-4CAST Hugging Face organization
+by checking for all possible combinations of channel models, delay spreads, and speeds.
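+Usage:
+    python3 download.py [--output-dir OUTPUT_DIR] [--dry-run]
+If no arguments are provided, the datasets are downloaded to a 'datasets' folder.
+With --dry-run, an empty placeholder file is created for each existing dataset instead of downloading it.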
+"""
+import argparse
+from pathlib import Path
+from huggingface_hub import HfApi, snapshot_download
+from tqdm import tqdm
+# Hugging Face organization that hosts all of the dataset repositories
+ORG = "CSI-4CAST"
+# Parameter grid of the regular (train/test) datasets:
+# channel model identifiers, delay spreads in seconds, minimum speeds in km/h
+LIST_CHANNEL_MODEL = ["A", "C", "D"]
+LIST_DELAY_SPREAD = [30e-9, 100e-9, 300e-9]
+LIST_MIN_SPEED = [1, 10, 30]
+# Parameter grid of the generalization test datasets (same units as above)
+LIST_CHANNEL_MODEL_GEN = ["A", "B", "C", "D", "E"]
+LIST_DELAY_SPREAD_GEN = [30e-9, 50e-9, 100e-9, 200e-9, 300e-9, 400e-9]
+# 1 and 10 plus every multiple of 3 from 3 to 45, in ascending order
+LIST_MIN_SPEED_GEN = sorted([1, 10, *range(3, 46, 3)])
+def make_folder_name(cm: str, ds: float, ms: int, **kwargs) -> str:
+    """Build the canonical folder name for one (channel model, delay spread, speed) setting.
+    Args:
+        cm (str): Channel model identifier (e.g., 'A', 'B', 'C', 'D', 'E')
+        ds (float): Delay spread in seconds (e.g., 30e-9, 100e-9, 300e-9)
+        ms (int): Minimum speed in km/h (e.g., 1, 10, 30)
+        **kwargs: Accepted and ignored
+    Returns:
+        str: 'cm_{cm}_ds_{ds}_ms_{ms}', where ds is expressed in whole nanoseconds
+             and both ds and ms are left-padded with zeros to at least 3 characters
+    Example:
+        >>> make_folder_name('A', 30e-9, 10)
+        'cm_A_ds_030_ms_010'
+    """
+    # seconds -> whole nanoseconds; round() absorbs float noise such as 30.000000000000004
+    delay_ns = round(ds * 1e9)
+    delay_part = str(delay_ns).zfill(3)
+    # the speed is used as given, only stringified and zero-padded
+    speed_part = str(ms).zfill(3)
+    return "_".join(("cm", cm, "ds", delay_part, "ms", speed_part))
+def check_repo_exists(api: HfApi, repo_id: str) -> bool:
+    """Check whether the dataset repository ``repo_id`` can be found on the Hub.
+    Args:
+        api (HfApi): Client used to query the Hub
+        repo_id (str): Full repository id in the form '{org}/{repo_name}'
+    Returns:
+        bool: True if ``api.repo_info`` succeeds. False if the repository is not
+              found, or if the lookup fails for any other reason; in the latter
+              case a warning is printed so the failure is not mistaken for a
+              missing repository.
+    """
+    # Local import: the exception class is only needed by this helper
+    from huggingface_hub.utils import RepositoryNotFoundError
+    try:
+        api.repo_info(repo_id, repo_type="dataset")
+    except RepositoryNotFoundError:
+        # Expected outcome for parameter combinations that were never published
+        return False
+    except Exception as e:
+        # Connectivity, authentication or rate-limit problems: stay best-effort
+        # (never raise), but surface the cause instead of hiding it
+        print(f"⚠️  Could not check {repo_id}: {e}")
+        return False
+    return True
+def generate_dataset_combinations():
+    """Enumerate every candidate repository name, whether or not it exists on the Hub.
+    Returns:
+        list: 'stats' first, then the 'train_regular_*', 'test_regular_*' and
+              'test_generalization_*' names; within each group the names vary by
+              channel model, then delay spread, then minimum speed.
+    """
+    # (repo name prefix, channel models, delay spreads, minimum speeds) for each group
+    grids = (
+        ("train_regular", LIST_CHANNEL_MODEL, LIST_DELAY_SPREAD, LIST_MIN_SPEED),
+        ("test_regular", LIST_CHANNEL_MODEL, LIST_DELAY_SPREAD, LIST_MIN_SPEED),
+        ("test_generalization", LIST_CHANNEL_MODEL_GEN, LIST_DELAY_SPREAD_GEN, LIST_MIN_SPEED_GEN),
+    )
+    # The stats dataset has no parameters and always comes first
+    repo_names = ["stats"]
+    for prefix, channel_models, delay_spreads, min_speeds in grids:
+        repo_names.extend(
+            f"{prefix}_{make_folder_name(model, spread, speed)}"
+            for model in channel_models
+            for spread in delay_spreads
+            for speed in min_speeds
+        )
+    return repo_names
+def download_dataset(api: HfApi, org: str, repo_name: str, output_dir: Path, dry_run: bool = False) -> bool:
+    """Fetch one dataset repository into ``output_dir / repo_name`` if it exists.
+    Args:
+        api (HfApi): Client used for the existence check
+        org (str): Organization that owns the repository
+        repo_name (str): Repository name without the organization prefix
+        output_dir (Path): Directory under which the per-dataset folder is created
+        dry_run (bool): If True, write an empty 'placeholder.txt' instead of downloading
+    Returns:
+        bool: True if the repository exists and the download (or placeholder) succeeded;
+              False if it does not exist or if any step raised an exception
+    """
+    repo_id = f"{org}/{repo_name}"
+    if check_repo_exists(api, repo_id):
+        try:
+            # Every dataset gets its own sub-folder named after the repository
+            destination = output_dir / repo_name
+            destination.mkdir(parents=True, exist_ok=True)
+            if not dry_run:
+                snapshot_download(
+                    repo_id=repo_id,
+                    repo_type="dataset",
+                    local_dir=destination,
+                    local_dir_use_symlinks=False
+                )
+                print(f"✅ Downloaded: {repo_name}")
+            else:
+                # Dry run: only mark the dataset as present with an empty file
+                (destination / "placeholder.txt").write_text("")
+                print(f"✅ Dry run - Created placeholder: {repo_name}")
+            return True
+        except Exception as err:
+            # Best-effort: report the failure and let the caller count it as skipped
+            print(f"❌ Error downloading {repo_name}: {err}")
+    return False
+def main():
+    """Command-line entry point: probe every candidate dataset and fetch the ones that exist.
+    Parses --output-dir/-o and --dry-run, creates the output directory, runs
+    download_dataset() on each name from generate_dataset_combinations(), then
+    prints a summary and a hint for the follow-up reconstruction step.
+    """
+    parser = argparse.ArgumentParser(description="Download all CSI-4CAST datasets from Hugging Face")
+    parser.add_argument(
+        "--output-dir", "-o",
+        default="datasets",
+        help="Output directory for downloaded datasets (default: 'datasets')",
+    )
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Dry run mode: create empty placeholder files instead of downloading",
+    )
+    args = parser.parse_args()
+    dry_run = args.dry_run
+    org = ORG
+    output_dir = Path(args.output_dir).resolve()
+    print(f"{'Dry run' if dry_run else 'Downloading'} datasets from organization: {org}")
+    print(f"Output directory: {output_dir}")
+    print()
+    # Make sure the destination exists before anything is written into it
+    output_dir.mkdir(parents=True, exist_ok=True)
+    api = HfApi()
+    print("Generating dataset combinations...")
+    combinations = generate_dataset_combinations()
+    print(f"Total possible combinations: {len(combinations)}")
+    print()
+    if dry_run:
+        action = "Checking and creating placeholders for"
+    else:
+        action = "Checking and downloading"
+    print(f"{action} existing datasets...")
+    # One boolean per candidate: True = fetched (or placeholder written), False = skipped
+    outcomes = [
+        download_dataset(api, org, repo_name, output_dir, dry_run)
+        for repo_name in tqdm(combinations, desc="Processing datasets")
+    ]
+    downloaded_count = sum(outcomes)
+    skipped_count = len(outcomes) - downloaded_count
+    print()
+    # Only the wording of the summary differs between the two modes
+    if dry_run:
+        headline = "🎉 Dry run complete!"
+        done_label = "✅ Created placeholders"
+        saved_label = "📁 Placeholders saved to"
+    else:
+        headline = "🎉 Download complete!"
+        done_label = "✅ Downloaded"
+        saved_label = "📁 Datasets saved to"
+    print(headline)
+    print(f"{done_label}: {downloaded_count} datasets")
+    print(f"⏭️  Skipped: {skipped_count} datasets (not found)")
+    print(f"{saved_label}: {output_dir}")
+    print()
+    print("To reconstruct the original folder structure, run:")
+    print(f"python3 reconstruction.py --input-dir {output_dir}")
+if __name__ == "__main__":
+    main()