File size: 5,191 Bytes
aa28c84
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#!/usr/bin/env python3
"""
Reconstruction script for CSI-4CAST datasets.

This script helps users reconstruct the original folder structure after downloading
datasets from the CSI-4CAST Hugging Face organization.

Usage:
    python reconstruction.py [--input-dir INPUT_DIR] [--output-dir OUTPUT_DIR]

If no arguments provided, it will look for downloaded datasets in the current directory
and reconstruct the structure in a 'data' folder.
"""

import argparse
import shutil
from pathlib import Path


def create_directory_structure(base_path: Path):
    """Create the original directory structure"""
    # Relative paths of every folder in the canonical CSI-4CAST layout.
    for subdir in ("stats", "test/regular", "test/generalization", "train/regular"):
        target = base_path / subdir
        # parents/exist_ok make this idempotent: re-running is harmless.
        target.mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {target}")

def find_downloaded_datasets(input_dir: Path):
    """Find all downloaded dataset folders"""
    categories = ('stats', 'test_regular', 'test_generalization', 'train_regular')
    found = {category: [] for category in categories}

    # Folder-name prefix -> bucket key; checked in this order, first match wins.
    prefix_table = (
        ("test_regular_", 'test_regular'),
        ("test_generalization_", 'test_generalization'),
        ("train_regular_", 'train_regular'),
    )

    for entry in input_dir.iterdir():
        if not entry.is_dir():
            continue
        if entry.name == "stats":
            found['stats'].append(entry.name)
            continue
        for prefix, bucket in prefix_table:
            if entry.name.startswith(prefix):
                found[bucket].append(entry.name)
                break

    return found

def reconstruct_dataset(dataset_name: str, source_path: Path, target_path: Path, prefix_to_remove: str) -> bool:
    """Reconstruct a single dataset by removing prefix and moving to target location"""
    # Strip the upload prefix (if any) to recover the original folder name.
    restored_name = dataset_name[len(prefix_to_remove):] if prefix_to_remove else dataset_name
    destination = target_path / restored_name

    # Never clobber an existing reconstruction.
    if destination.exists():
        print(f"Warning: {destination} already exists, skipping...")
        return False

    try:
        shutil.copytree(str(source_path), str(destination))
    except Exception as e:
        # Report and keep going so one bad dataset doesn't abort the run.
        print(f"Error reconstructing {dataset_name}: {e}")
        return False

    print(f"Reconstructed: {dataset_name} -> {destination}")
    return True

def main():
    """Parse CLI arguments, rebuild the folder tree, and copy each downloaded dataset into place."""
    parser = argparse.ArgumentParser(description="Reconstruct CSI-4CAST dataset folder structure")
    parser.add_argument("--input-dir", "-i", default=".",
                       help="Directory containing downloaded datasets (default: current directory)")
    parser.add_argument("--output-dir", "-o", default="data",
                       help="Output directory for reconstructed structure (default: 'data')")
    args = parser.parse_args()

    input_dir = Path(args.input_dir).resolve()
    output_dir = Path(args.output_dir).resolve()

    print(f"Looking for datasets in: {input_dir}")
    print(f"Reconstructing structure in: {output_dir}")
    print()

    # Build the empty skeleton first, then scan for downloaded folders.
    create_directory_structure(output_dir)
    datasets = find_downloaded_datasets(input_dir)

    # (bucket key, destination directory, prefix stripped from folder names).
    # NOTE(review): the 'stats' bucket uses an empty prefix, so its copy lands at
    # <output>/stats/stats — confirm this nesting is intended.
    jobs = (
        ('stats', output_dir / "stats", ""),
        ('test_regular', output_dir / "test" / "regular", "test_regular_"),
        ('test_generalization', output_dir / "test" / "generalization", "test_generalization_"),
        ('train_regular', output_dir / "train" / "regular", "train_regular_"),
    )

    total_reconstructed = 0
    for bucket, destination, prefix in jobs:
        for name in datasets[bucket]:
            if reconstruct_dataset(name, input_dir / name, destination, prefix):
                total_reconstructed += 1

    print()
    print("βœ… Reconstruction complete!")
    print(f"Total datasets reconstructed: {total_reconstructed}")
    print(f"Reconstructed structure available at: {output_dir}")
    print()
    print("Final structure:")
    print("data/")
    print("β”œβ”€β”€ stats/")
    print("β”œβ”€β”€ test/")
    print("β”‚   β”œβ”€β”€ regular/")
    print("β”‚   └── generalization/")
    print("└── train/")
    print("    └── regular/")

# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()