|
|
|
|
|
""" |
|
|
Reconstruction script for CSI-4CAST datasets. |
|
|
|
|
|
This script helps users reconstruct the original folder structure after downloading |
|
|
datasets from the CSI-4CAST Hugging Face organization. |
|
|
|
|
|
Usage: |
|
|
python reconstruction.py [--input-dir INPUT_DIR] [--output-dir OUTPUT_DIR] |
|
|
|
|
|
If no arguments provided, it will look for downloaded datasets in the current directory |
|
|
and reconstruct the structure in a 'data' folder. |
|
|
""" |
|
|
|
|
|
import argparse |
|
|
import shutil |
|
|
from pathlib import Path |
|
|
|
|
|
|
|
|
def create_directory_structure(base_path: Path):
    """Create the original directory structure"""
    # The four leaf folders of the canonical CSI-4CAST layout.
    for subdir in ("stats", "test/regular", "test/generalization", "train/regular"):
        target = base_path / subdir
        # parents=True builds intermediate folders; exist_ok makes reruns safe.
        target.mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {target}")
|
|
|
|
|
def find_downloaded_datasets(input_dir: Path):
    """Find all downloaded dataset folders"""
    # Buckets keyed by dataset category; each holds matching folder names.
    found = {
        'stats': [],
        'test_regular': [],
        'test_generalization': [],
        'train_regular': [],
    }

    for entry in input_dir.iterdir():
        # Only directories can be downloaded dataset folders.
        if not entry.is_dir():
            continue
        name = entry.name
        if name == "stats":
            found['stats'].append(name)
        elif name.startswith("test_regular_"):
            found['test_regular'].append(name)
        elif name.startswith("test_generalization_"):
            found['test_generalization'].append(name)
        elif name.startswith("train_regular_"):
            found['train_regular'].append(name)

    return found
|
|
|
|
|
def reconstruct_dataset(dataset_name: str, source_path: Path, target_path: Path, prefix_to_remove: str) -> bool:
    """Reconstruct a single dataset by removing prefix and moving to target location.

    Args:
        dataset_name: Name of the downloaded dataset folder.
        source_path: Path to the downloaded dataset folder.
        target_path: Directory under which the reconstructed folder is placed.
        prefix_to_remove: Prefix stripped from ``dataset_name`` to recover the
            original folder name. Empty string means "keep the name as-is".

    Returns:
        True if the dataset was copied, False if it was skipped or copying failed.
    """
    # Only strip the prefix when it is actually present; the previous blind
    # slice would mangle names that did not start with the prefix.
    if prefix_to_remove and dataset_name.startswith(prefix_to_remove):
        original_name = dataset_name[len(prefix_to_remove):]
    else:
        original_name = dataset_name

    target_folder = target_path / original_name

    # Never overwrite an existing reconstruction; report and skip instead.
    if target_folder.exists():
        print(f"Warning: {target_folder} already exists, skipping...")
        return False

    try:
        # copytree accepts path-like objects directly (no str() needed).
        shutil.copytree(source_path, target_folder)
        print(f"Reconstructed: {dataset_name} -> {target_folder}")
        return True
    except OSError as e:
        # Narrowed from bare Exception: copy failures surface as OSError
        # (shutil.Error is an OSError subclass). Report and continue.
        print(f"Error reconstructing {dataset_name}: {e}")
        return False
|
|
|
|
|
def main():
    """Parse CLI arguments and reconstruct the CSI-4CAST dataset layout.

    Reads downloaded dataset folders from ``--input-dir``, recreates the
    canonical directory tree under ``--output-dir``, and copies each dataset
    into its proper location with the category prefix stripped.
    """
    parser = argparse.ArgumentParser(description="Reconstruct CSI-4CAST dataset folder structure")
    parser.add_argument("--input-dir", "-i", default=".",
                        help="Directory containing downloaded datasets (default: current directory)")
    parser.add_argument("--output-dir", "-o", default="data",
                        help="Output directory for reconstructed structure (default: 'data')")

    args = parser.parse_args()

    input_dir = Path(args.input_dir).resolve()
    output_dir = Path(args.output_dir).resolve()

    print(f"Looking for datasets in: {input_dir}")
    print(f"Reconstructing structure in: {output_dir}")
    print()

    create_directory_structure(output_dir)

    datasets = find_downloaded_datasets(input_dir)

    total_reconstructed = 0

    # Each category maps to (target folder, prefix to strip). Driving the
    # copies from this table replaces four copy-pasted loops.
    category_specs = (
        ('stats', output_dir / "stats", ""),
        ('test_regular', output_dir / "test" / "regular", "test_regular_"),
        ('test_generalization', output_dir / "test" / "generalization", "test_generalization_"),
        ('train_regular', output_dir / "train" / "regular", "train_regular_"),
    )
    for category, target_path, prefix in category_specs:
        for dataset in datasets[category]:
            source_path = input_dir / dataset
            if reconstruct_dataset(dataset, source_path, target_path, prefix):
                total_reconstructed += 1

    print()
    # NOTE(review): the original completion message contained a mis-encoded
    # character that split the string literal; replaced with plain text.
    print("Reconstruction complete!")
    print(f"Total datasets reconstructed: {total_reconstructed}")
    print(f"Reconstructed structure available at: {output_dir}")
    print()
    print("Final structure:")
    print("data/")
    print("├── stats/")
    print("├── test/")
    print("│   ├── regular/")
    print("│   └── generalization/")
    print("└── train/")
    print("    └── regular/")
|
|
|
|
|
# Script entry point: run the reconstruction only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
|
|
|