"""
Download script for CSI-4CAST datasets.

This script downloads all available datasets from the CSI-4CAST Hugging Face organization
by checking for all possible combinations of channel models, delay spreads, and speeds.

Usage:
    python3 download.py [--output-dir OUTPUT_DIR] [--dry-run]

If no arguments are provided, datasets are downloaded to a 'datasets' folder.
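
Requires the huggingface_hub and tqdm packages (pip install huggingface_hub tqdm).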
"""

import argparse
from pathlib import Path

from huggingface_hub import HfApi, snapshot_download
from tqdm import tqdm

# Configuration constants
ORG = "CSI-4CAST"

# Regular dataset parameters
LIST_CHANNEL_MODEL = ["A", "C", "D"]
LIST_DELAY_SPREAD = [30e-9, 100e-9, 300e-9]
LIST_MIN_SPEED = [1, 10, 30]

# Generalization dataset parameters
LIST_CHANNEL_MODEL_GEN = ["A", "B", "C", "D", "E"]
LIST_DELAY_SPREAD_GEN = [30e-9, 50e-9, 100e-9, 200e-9, 300e-9, 400e-9]
LIST_MIN_SPEED_GEN = sorted([*range(3, 46, 3), 1, 10])
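# expands to [1, 3, 6, 9, 10, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45] (17 speeds)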

def make_folder_name(cm: str, ds: float, ms: int, **kwargs) -> str:
    """Generate a standardized folder name based on channel model, delay spread, and minimum speed.

    Args:
        cm (str): Channel model identifier (e.g., 'A', 'B', 'C', 'D', 'E')
        ds (float): Delay spread in seconds (e.g., 30e-9, 100e-9, 300e-9)
        ms (int): Minimum speed in km/h (e.g., 1, 10, 30)
        **kwargs: Additional keyword arguments (unused)

    Returns:
        str: Formatted folder name in the format 'cm_{cm}_ds_{ds}_ms_{ms}'
             where ds is converted to nanoseconds and zero-padded to 3 digits,
             and ms is zero-padded to 3 digits

    Example:
        >>> make_folder_name('A', 30e-9, 10)
        'cm_A_ds_030_ms_010'
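        >>> make_folder_name('E', 400e-9, 45)
        'cm_E_ds_400_ms_045'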
    """
    # convert the delay spread to integer nanoseconds, zero-padded to 3 digits
    ds_ns = round(ds * 1e9)
    ds_str = str(ds_ns).zfill(3)

    # zero-pad the minimum speed (km/h) to 3 digits
    ms_str = str(ms).zfill(3)

    return f"cm_{cm}_ds_{ds_str}_ms_{ms_str}"

def check_repo_exists(api: HfApi, repo_id: str) -> bool:
    """Check if a repository exists in the organization."""
    try:
        api.repo_info(repo_id, repo_type="dataset")
        return True
    except Exception:
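        # repo_info raises RepositoryNotFoundError for missing repos; treat
        # any failure (including network errors) here as "not available"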
        return False

def generate_dataset_combinations():
    """Generate all candidate dataset repository names.

    Produces 1 stats repo, 27 train_regular and 27 test_regular names
    (3 channel models x 3 delay spreads x 3 speeds), and 510
    test_generalization names (5 x 6 x 17), i.e. 565 candidates in total.
    """
    combinations = ["stats"]

    # Train and test regular datasets share the same parameter grid
    for prefix in ("train_regular", "test_regular"):
        for cm in LIST_CHANNEL_MODEL:
            for ds in LIST_DELAY_SPREAD:
                for ms in LIST_MIN_SPEED:
                    folder_name = make_folder_name(cm, ds, ms)
                    combinations.append(f"{prefix}_{folder_name}")

    # Test generalization datasets use a wider parameter grid
    for cm in LIST_CHANNEL_MODEL_GEN:
        for ds in LIST_DELAY_SPREAD_GEN:
            for ms in LIST_MIN_SPEED_GEN:
                folder_name = make_folder_name(cm, ds, ms)
                combinations.append(f"test_generalization_{folder_name}")

    return combinations

def download_dataset(api: HfApi, org: str, repo_name: str, output_dir: Path, dry_run: bool = False) -> bool:
    """Download a single dataset if it exists."""
    repo_id = f"{org}/{repo_name}"
    
    if not check_repo_exists(api, repo_id):
        return False
    
    try:
        # Create target directory
        target_dir = output_dir / repo_name
        target_dir.mkdir(parents=True, exist_ok=True)
        
        if dry_run:
            # Create empty placeholder file
            placeholder_file = target_dir / "placeholder.txt"
            placeholder_file.write_text("")
            print(f"βœ… Dry run - Created placeholder: {repo_name}")
        else:
            # Download the dataset snapshot into the target directory
            # (local_dir_use_symlinks is deprecated in recent huggingface_hub
            # releases; files are written directly into local_dir by default)
            snapshot_download(
                repo_id=repo_id,
                repo_type="dataset",
                local_dir=target_dir,
            )
            print(f"βœ… Downloaded: {repo_name}")
        
        return True
        
    except Exception as e:
        print(f"❌ Error downloading {repo_name}: {e}")
        return False

def main():
    parser = argparse.ArgumentParser(description="Download all CSI-4CAST datasets from Hugging Face")
    parser.add_argument("--output-dir", "-o", default="datasets",
                       help="Output directory for downloaded datasets (default: 'datasets')")
    parser.add_argument("--dry-run", action="store_true",
                       help="Dry run mode: create empty placeholder files instead of downloading")
    
    args = parser.parse_args()
    
    output_dir = Path(args.output_dir).resolve()
    org = ORG
    
    mode = "Dry run" if args.dry_run else "Downloading"
    print(f"{mode} datasets from organization: {org}")
    print(f"Output directory: {output_dir}")
    print()
    
    # Create output directory
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Initialize Hugging Face API
    api = HfApi()
    
    # Generate all possible combinations
    print("Generating dataset combinations...")
    combinations = generate_dataset_combinations()
    print(f"Total possible combinations: {len(combinations)}")
    print()
    
    # Download datasets
    action = "Checking and creating placeholders for" if args.dry_run else "Checking and downloading"
    print(f"{action} existing datasets...")
    downloaded_count = 0
    skipped_count = 0
    
    for repo_name in tqdm(combinations, desc="Processing datasets"):
        if download_dataset(api, org, repo_name, output_dir, args.dry_run):
            downloaded_count += 1
        else:
            skipped_count += 1
    
    print()
    if args.dry_run:
        print("πŸŽ‰ Dry run complete!")
        print(f"βœ… Created placeholders: {downloaded_count} datasets")
        print(f"⏭️  Skipped: {skipped_count} datasets (not found)")
        print(f"πŸ“ Placeholders saved to: {output_dir}")
    else:
        print("πŸŽ‰ Download complete!")
        print(f"βœ… Downloaded: {downloaded_count} datasets")
        print(f"⏭️  Skipped: {skipped_count} datasets (not found)")
        print(f"πŸ“ Datasets saved to: {output_dir}")
    print()
    print("To reconstruct the original folder structure, run:")
    print(f"python3 reconstruction.py --input-dir {output_dir}")

if __name__ == "__main__":
    main()