README / reconstruction.py
SIKAI-C's picture
Create reconstruction.py
aa28c84 verified
#!/usr/bin/env python3
"""
Reconstruction script for CSI-4CAST datasets.
This script helps users reconstruct the original folder structure after downloading
datasets from the CSI-4CAST Hugging Face organization.
Usage:
    python reconstruction.py [--input-dir INPUT_DIR] [--output-dir OUTPUT_DIR]

If no arguments are provided, the script looks for downloaded datasets in the
current directory and reconstructs the structure in a 'data' folder.
"""
import argparse
import shutil
from pathlib import Path
def create_directory_structure(base_path: Path):
    """Make sure the expected output layout exists beneath ``base_path``.

    The sub-directories ``stats``, ``test/regular``, ``test/generalization``
    and ``train/regular`` are created together with any missing parents.
    One "Created directory" line is printed per entry, whether or not the
    directory was already present.
    """
    layout = (
        "stats",
        "test/regular",
        "test/generalization",
        "train/regular",
    )
    for relative in layout:
        destination = base_path / relative
        # exist_ok=True keeps repeated runs from failing on existing folders.
        destination.mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {destination}")
def find_downloaded_datasets(input_dir: Path):
    """Group the downloaded dataset folders in ``input_dir`` by category.

    Only immediate sub-directories are considered; regular files and
    directories with unrelated names are ignored.

    Args:
        input_dir: Directory that holds the downloaded dataset folders.

    Returns:
        A dict with the keys ``'stats'``, ``'test_regular'``,
        ``'test_generalization'`` and ``'train_regular'``. Each value is a
        list of matching folder *names* (not paths), sorted alphabetically.
        A folder matches ``'stats'`` only when it is named exactly ``stats``;
        the other categories match on the ``<category>_`` name prefix.

    Raises:
        OSError: If ``input_dir`` cannot be listed (for example, it does not
            exist or is not a directory).
    """
    prefix_to_category = {
        "test_regular_": "test_regular",
        "test_generalization_": "test_generalization",
        "train_regular_": "train_regular",
    }
    datasets = {
        'stats': [],
        'test_regular': [],
        'test_generalization': [],
        'train_regular': []
    }

    # iterdir() yields entries in arbitrary, filesystem-dependent order;
    # sorting by name makes the result (and downstream logs) reproducible.
    for item in sorted(input_dir.iterdir(), key=lambda entry: entry.name):
        if not item.is_dir():
            continue
        if item.name == "stats":
            datasets['stats'].append(item.name)
            continue
        for prefix, category in prefix_to_category.items():
            if item.name.startswith(prefix):
                datasets[category].append(item.name)
                break
    return datasets
def reconstruct_dataset(dataset_name: str, source_path: Path, target_path: Path, prefix_to_remove: str) -> bool:
    """Copy one downloaded dataset folder to its place in the output tree.

    The destination folder name is ``dataset_name`` with ``prefix_to_remove``
    stripped from the front. The source is copied, not moved, so the
    downloaded folder is left intact.

    Args:
        dataset_name: Name of the downloaded folder.
        source_path: Path of the downloaded folder to copy from.
        target_path: Directory under which the restored folder is created.
        prefix_to_remove: Prefix to strip from ``dataset_name``; an empty
            string keeps the name unchanged.

    Returns:
        True if the folder was copied. False if it was skipped (destination
        already exists, or nothing is left of the name after stripping the
        prefix) or if the copy failed; a message is printed in each case.
    """
    # Strip the prefix only when it is really present. Slicing by length alone
    # would chop unrelated leading characters off a name that lacks the prefix.
    if prefix_to_remove and dataset_name.startswith(prefix_to_remove):
        original_name = dataset_name[len(prefix_to_remove):]
    else:
        original_name = dataset_name

    # An empty name would make the destination equal to target_path itself.
    if not original_name:
        print(f"Warning: {dataset_name} has no name left after removing prefix, skipping...")
        return False

    target_folder = target_path / original_name
    if target_folder.exists():
        print(f"Warning: {target_folder} already exists, skipping...")
        return False

    try:
        shutil.copytree(source_path, target_folder)
    except OSError as e:
        # shutil.Error, raised by copytree for per-file failures, is an
        # OSError subclass and is therefore handled here as well.
        print(f"Error reconstructing {dataset_name}: {e}")
        # The destination did not exist before this call, so anything there
        # now is a partial copy. Remove it; otherwise the next run would see
        # the folder, report "already exists" and never finish the dataset.
        shutil.rmtree(target_folder, ignore_errors=True)
        return False

    print(f"Reconstructed: {dataset_name} -> {target_folder}")
    return True
def main(argv=None):
    """Command-line entry point: rebuild the CSI-4CAST folder layout.

    Parses ``--input-dir``/``-i`` (default ``.``) and ``--output-dir``/``-o``
    (default ``data``), creates the output directory skeleton, copies every
    recognised dataset folder found in the input directory into it, and
    prints a summary.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the arguments from ``sys.argv``.

    Raises:
        SystemExit: Raised by argparse on invalid arguments, or when the
            input directory does not exist or is not a directory.
    """
    parser = argparse.ArgumentParser(description="Reconstruct CSI-4CAST dataset folder structure")
    parser.add_argument("--input-dir", "-i", default=".",
                        help="Directory containing downloaded datasets (default: current directory)")
    parser.add_argument("--output-dir", "-o", default="data",
                        help="Output directory for reconstructed structure (default: 'data')")
    args = parser.parse_args(argv)

    input_dir = Path(args.input_dir).resolve()
    output_dir = Path(args.output_dir).resolve()

    # Validate before touching the filesystem, so a mistyped input path does
    # not leave an empty output skeleton behind and end in a traceback.
    if not input_dir.is_dir():
        parser.error(f"Input directory does not exist or is not a directory: {input_dir}")

    print(f"Looking for datasets in: {input_dir}")
    print(f"Reconstructing structure in: {output_dir}")
    print()

    # Create the directory structure
    create_directory_structure(output_dir)

    # Find all downloaded datasets
    datasets = find_downloaded_datasets(input_dir)

    # One row per category:
    # (key in `datasets`, location under output_dir, prefix to strip).
    # NOTE(review): with an empty prefix the "stats" folder keeps its name and
    # is therefore copied to <output>/stats/stats - confirm this nesting is
    # the intended layout.
    plan = (
        ("stats", ("stats",), ""),
        ("test_regular", ("test", "regular"), "test_regular_"),
        ("test_generalization", ("test", "generalization"), "test_generalization_"),
        ("train_regular", ("train", "regular"), "train_regular_"),
    )

    total_reconstructed = 0
    for category, subdirs, prefix in plan:
        target_path = output_dir.joinpath(*subdirs)
        for dataset in datasets[category]:
            source_path = input_dir / dataset
            if reconstruct_dataset(dataset, source_path, target_path, prefix):
                total_reconstructed += 1

    print()
    print("✅ Reconstruction complete!")
    print(f"Total datasets reconstructed: {total_reconstructed}")
    print(f"Reconstructed structure available at: {output_dir}")
    print()
    print("Final structure:")
    # Show the real output folder name; it is "data" only by default.
    print(f"{output_dir.name}/")
    print("├── stats/")
    print("├── test/")
    print("│   ├── regular/")
    print("│   └── generalization/")
    print("└── train/")
    print("    └── regular/")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()