# Source metadata (from extraction): 5,191 bytes, revision aa28c84
#!/usr/bin/env python3
"""
Reconstruction script for CSI-4CAST datasets.
This script helps users reconstruct the original folder structure after downloading
datasets from the CSI-4CAST Hugging Face organization.
Usage:
python reconstruction.py [--input-dir INPUT_DIR] [--output-dir OUTPUT_DIR]
If no arguments provided, it will look for downloaded datasets in the current directory
and reconstruct the structure in a 'data' folder.
"""
import argparse
import shutil
from pathlib import Path
def create_directory_structure(base_path: Path):
    """Create the original CSI-4CAST directory layout under *base_path*.

    Creates (idempotently) the four leaf directories the reconstruction
    expects: stats/, test/regular/, test/generalization/, train/regular/.
    """
    for subdir in ("stats", "test/regular", "test/generalization", "train/regular"):
        target = base_path / subdir
        # parents=True builds intermediate dirs; exist_ok makes reruns safe.
        target.mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {target}")
def find_downloaded_datasets(input_dir: Path):
    """Scan *input_dir* for downloaded dataset folders, grouped by category.

    Returns a dict with keys 'stats', 'test_regular', 'test_generalization'
    and 'train_regular', each mapping to a list of matching folder names
    (names only, not paths). Non-directories and unrecognized names are
    ignored.
    """
    found = {key: [] for key in ("stats", "test_regular", "test_generalization", "train_regular")}
    # Prefix -> bucket mapping for the prefixed dataset folders.
    prefixes = (
        ("test_regular_", "test_regular"),
        ("test_generalization_", "test_generalization"),
        ("train_regular_", "train_regular"),
    )
    for entry in input_dir.iterdir():
        if not entry.is_dir():
            continue
        if entry.name == "stats":
            found["stats"].append(entry.name)
            continue
        for prefix, bucket in prefixes:
            if entry.name.startswith(prefix):
                found[bucket].append(entry.name)
                break
    return found
def reconstruct_dataset(dataset_name: str, source_path: Path, target_path: Path, prefix_to_remove: str) -> bool:
    """Copy one downloaded dataset into its original location.

    Strips *prefix_to_remove* (if non-empty) from *dataset_name* to recover
    the original folder name, then copies *source_path* into
    *target_path*/<original name>. Returns True on success; returns False
    (with a printed warning/error) when the destination already exists or
    the copy fails.
    """
    restored_name = dataset_name[len(prefix_to_remove):] if prefix_to_remove else dataset_name
    destination = target_path / restored_name
    if destination.exists():
        # Never clobber a previously reconstructed dataset.
        print(f"Warning: {destination} already exists, skipping...")
        return False
    try:
        shutil.copytree(str(source_path), str(destination))
        print(f"Reconstructed: {dataset_name} -> {destination}")
        return True
    except Exception as exc:
        # Best-effort: report and keep processing the remaining datasets.
        print(f"Error reconstructing {dataset_name}: {exc}")
        return False
def main():
    """Entry point: parse CLI args and rebuild the original data layout.

    Reads --input-dir (where the downloaded dataset folders live) and
    --output-dir (where the original structure is recreated), creates the
    directory skeleton, locates every downloaded dataset, copies each one
    into place, and prints a summary.
    """
    parser = argparse.ArgumentParser(description="Reconstruct CSI-4CAST dataset folder structure")
    parser.add_argument("--input-dir", "-i", default=".",
                        help="Directory containing downloaded datasets (default: current directory)")
    parser.add_argument("--output-dir", "-o", default="data",
                        help="Output directory for reconstructed structure (default: 'data')")
    args = parser.parse_args()

    input_dir = Path(args.input_dir).resolve()
    output_dir = Path(args.output_dir).resolve()
    print(f"Looking for datasets in: {input_dir}")
    print(f"Reconstructing structure in: {output_dir}")
    print()

    # Create the directory skeleton, then find all downloaded datasets.
    create_directory_structure(output_dir)
    datasets = find_downloaded_datasets(input_dir)

    # category key -> (target subdirectory parts, prefix to strip).
    plan = [
        ("stats", ("stats",), ""),
        ("test_regular", ("test", "regular"), "test_regular_"),
        ("test_generalization", ("test", "generalization"), "test_generalization_"),
        ("train_regular", ("train", "regular"), "train_regular_"),
    ]
    total_reconstructed = 0
    for category, subdirs, prefix in plan:
        target_path = output_dir.joinpath(*subdirs)
        for dataset in datasets[category]:
            if reconstruct_dataset(dataset, input_dir / dataset, target_path, prefix):
                total_reconstructed += 1

    print()
    # FIX: the original source had a mojibake character here that split the
    # string literal across two lines (a syntax error); restored the intended
    # success marker and the box-drawing characters in the tree below.
    print("✅ Reconstruction complete!")
    print(f"Total datasets reconstructed: {total_reconstructed}")
    print(f"Reconstructed structure available at: {output_dir}")
    print()
    print("Final structure:")
    print("data/")
    print("├── stats/")
    print("├── test/")
    print("│   ├── regular/")
    print("│   └── generalization/")
    print("└── train/")
    print("    └── regular/")


if __name__ == "__main__":
    main()
|