# Source metadata (from extraction): 5,191 bytes, revision aa28c84
#!/usr/bin/env python3
"""
Reconstruction script for CSI-4CAST datasets.
This script helps users reconstruct the original folder structure after downloading
datasets from the CSI-4CAST Hugging Face organization.
Usage:
python reconstruction.py [--input-dir INPUT_DIR] [--output-dir OUTPUT_DIR]
If no arguments provided, it will look for downloaded datasets in the current directory
and reconstruct the structure in a 'data' folder.
"""
import argparse
import shutil
from pathlib import Path
def create_directory_structure(base_path: Path):
    """Create the original CSI-4CAST directory layout under *base_path*.

    Creates (idempotently) the four leaf directories the reconstruction
    expects: stats/, test/regular/, test/generalization/, train/regular/.
    """
    for subdir in ("stats", "test/regular", "test/generalization", "train/regular"):
        target = base_path / subdir
        # parents=True builds intermediate dirs; exist_ok makes reruns safe.
        target.mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {target}")
def find_downloaded_datasets(input_dir: Path):
    """Scan *input_dir* for downloaded dataset folders, grouped by category.

    Returns a dict with keys 'stats', 'test_regular', 'test_generalization'
    and 'train_regular', each mapping to a list of matching folder names
    (names only, not paths). Non-directories and unrecognized names are
    ignored.
    """
    found = {key: [] for key in ("stats", "test_regular", "test_generalization", "train_regular")}
    # Prefix -> bucket mapping for the prefixed dataset folders.
    prefixes = (
        ("test_regular_", "test_regular"),
        ("test_generalization_", "test_generalization"),
        ("train_regular_", "train_regular"),
    )
    for entry in input_dir.iterdir():
        if not entry.is_dir():
            continue
        if entry.name == "stats":
            found["stats"].append(entry.name)
            continue
        for prefix, bucket in prefixes:
            if entry.name.startswith(prefix):
                found[bucket].append(entry.name)
                break
    return found
def reconstruct_dataset(dataset_name: str, source_path: Path, target_path: Path, prefix_to_remove: str) -> bool:
    """Copy one downloaded dataset into its original location.

    Strips *prefix_to_remove* (if non-empty) from *dataset_name* to recover
    the original folder name, then copies *source_path* into
    *target_path*/<original name>. Returns True on success; returns False
    (with a printed warning/error) when the destination already exists or
    the copy fails.
    """
    restored_name = dataset_name[len(prefix_to_remove):] if prefix_to_remove else dataset_name
    destination = target_path / restored_name
    if destination.exists():
        # Never clobber a previously reconstructed dataset.
        print(f"Warning: {destination} already exists, skipping...")
        return False
    try:
        shutil.copytree(str(source_path), str(destination))
        print(f"Reconstructed: {dataset_name} -> {destination}")
        return True
    except Exception as exc:
        # Best-effort: report and keep processing the remaining datasets.
        print(f"Error reconstructing {dataset_name}: {exc}")
        return False
def main():
    """Entry point: parse CLI args and rebuild the original data layout.

    Reads --input-dir (where the downloaded dataset folders live) and
    --output-dir (where the original structure is recreated), creates the
    directory skeleton, locates every downloaded dataset, copies each one
    into place, and prints a summary.
    """
    parser = argparse.ArgumentParser(description="Reconstruct CSI-4CAST dataset folder structure")
    parser.add_argument("--input-dir", "-i", default=".",
                        help="Directory containing downloaded datasets (default: current directory)")
    parser.add_argument("--output-dir", "-o", default="data",
                        help="Output directory for reconstructed structure (default: 'data')")
    args = parser.parse_args()

    input_dir = Path(args.input_dir).resolve()
    output_dir = Path(args.output_dir).resolve()
    print(f"Looking for datasets in: {input_dir}")
    print(f"Reconstructing structure in: {output_dir}")
    print()

    # Create the directory skeleton, then find all downloaded datasets.
    create_directory_structure(output_dir)
    datasets = find_downloaded_datasets(input_dir)

    # category key -> (target subdirectory parts, prefix to strip).
    plan = [
        ("stats", ("stats",), ""),
        ("test_regular", ("test", "regular"), "test_regular_"),
        ("test_generalization", ("test", "generalization"), "test_generalization_"),
        ("train_regular", ("train", "regular"), "train_regular_"),
    ]
    total_reconstructed = 0
    for category, subdirs, prefix in plan:
        target_path = output_dir.joinpath(*subdirs)
        for dataset in datasets[category]:
            if reconstruct_dataset(dataset, input_dir / dataset, target_path, prefix):
                total_reconstructed += 1

    print()
    # FIX: the original source had a mojibake character here that split the
    # string literal across two lines (a syntax error); restored the intended
    # success marker and the box-drawing characters in the tree below.
    print("✅ Reconstruction complete!")
    print(f"Total datasets reconstructed: {total_reconstructed}")
    print(f"Reconstructed structure available at: {output_dir}")
    print()
    print("Final structure:")
    print("data/")
    print("├── stats/")
    print("├── test/")
    print("│   ├── regular/")
    print("│   └── generalization/")
    print("└── train/")
    print("    └── regular/")


if __name__ == "__main__":
    main()
|