Spaces:

CSI-4CAST
/

README

Running

App Files Files Community

README / reconstruction.py

SIKAI-C

Create reconstruction.py

aa28c84 verified 4 months ago

raw

history blame contribute delete

5.19 kB

	#!/usr/bin/env python3
	"""
	Reconstruction script for CSI-4CAST datasets.

	This script helps users reconstruct the original folder structure after downloading
	datasets from the CSI-4CAST Hugging Face organization.

	Usage:
	python reconstruction.py [--input-dir INPUT_DIR] [--output-dir OUTPUT_DIR]

	If no arguments are provided, the script looks for downloaded datasets in the
	current directory and reconstructs the structure in a 'data' folder.
	"""

	import argparse
	import shutil
	from pathlib import Path


	def create_directory_structure(base_path: Path):
	    """Build the empty CSI-4CAST folder skeleton beneath ``base_path``.

	    Makes sure ``stats``, ``test/regular``, ``test/generalization`` and
	    ``train/regular`` exist under ``base_path``. Missing parents are created
	    and folders that are already present are left as they are. One line is
	    printed per folder.
	    """
	    skeleton = (
	        "stats",
	        "test/regular",
	        "test/generalization",
	        "train/regular",
	    )

	    for relative in skeleton:
	        folder = base_path / relative
	        folder.mkdir(parents=True, exist_ok=True)
	        print(f"Created directory: {folder}")

	def find_downloaded_datasets(input_dir: Path):
	    """Group the downloaded dataset folders found directly inside ``input_dir``.

	    Only immediate sub-directories are considered; plain files and folders
	    whose names match none of the patterns are ignored.

	    Args:
	        input_dir: Directory that holds the downloaded dataset folders.

	    Returns:
	        A dict with the keys ``'stats'``, ``'test_regular'``,
	        ``'test_generalization'`` and ``'train_regular'``. Each value is a
	        list of matching folder names (names only, not full paths), sorted
	        by name.

	    Raises:
	        OSError: Propagated from ``Path.iterdir`` when ``input_dir`` cannot
	            be listed (for example, because it does not exist).
	    """
	    datasets = {
	        'stats': [],
	        'test_regular': [],
	        'test_generalization': [],
	        'train_regular': []
	    }
	    # Categories whose folders are named "<category>_<original name>".
	    prefixed_categories = ('test_regular', 'test_generalization', 'train_regular')

	    # iterdir() yields entries in arbitrary, platform-dependent order; sort by
	    # name so the result (and the log output driven by it) is reproducible.
	    for item in sorted(input_dir.iterdir(), key=lambda entry: entry.name):
	        if not item.is_dir():
	            continue

	        if item.name == "stats":
	            datasets['stats'].append(item.name)
	            continue

	        for category in prefixed_categories:
	            if item.name.startswith(category + "_"):
	                datasets[category].append(item.name)
	                break

	    return datasets

	def reconstruct_dataset(dataset_name: str, source_path: Path, target_path: Path, prefix_to_remove: str) -> bool:
	    """Copy one downloaded dataset folder to its place in the reconstructed tree.

	    The destination folder is ``target_path / <name>``, where ``<name>`` is
	    ``dataset_name`` with ``prefix_to_remove`` stripped from its start. The
	    source folder is copied, not moved, so the download stays intact.

	    Args:
	        dataset_name: Name of the downloaded folder (used for naming and logs).
	        source_path: Folder to copy from.
	        target_path: Parent folder to copy into.
	        prefix_to_remove: Prefix to strip from ``dataset_name``; an empty
	            string keeps the name unchanged.

	    Returns:
	        True when the folder was copied. False when it was skipped (the
	        destination already exists, or nothing remains of the name after
	        stripping the prefix) or when the copy failed.
	    """
	    # Strip the prefix only when it is really there; slicing blindly would
	    # chop arbitrary leading characters off a name that does not carry it.
	    if prefix_to_remove and dataset_name.startswith(prefix_to_remove):
	        original_name = dataset_name[len(prefix_to_remove):]
	    else:
	        original_name = dataset_name

	    if not original_name:
	        # With an empty name the destination would be target_path itself.
	        print(f"Warning: {dataset_name} has no name left after removing '{prefix_to_remove}', skipping...")
	        return False

	    target_folder = target_path / original_name

	    if target_folder.exists():
	        print(f"Warning: {target_folder} already exists, skipping...")
	        return False

	    try:
	        shutil.copytree(str(source_path), str(target_folder))
	    except (OSError, shutil.Error) as e:
	        print(f"Error reconstructing {dataset_name}: {e}")
	        # target_folder did not exist before this call, so whatever is there
	        # now is a partial copy. Remove it so a re-run retries the dataset
	        # instead of skipping it as "already exists".
	        shutil.rmtree(str(target_folder), ignore_errors=True)
	        return False

	    print(f"Reconstructed: {dataset_name} -> {target_folder}")
	    return True

	def main():
	    """Command-line entry point: parse arguments and rebuild the dataset tree.

	    Reads ``--input-dir`` (default: current directory) and ``--output-dir``
	    (default: ``data``), scans the input directory for downloaded dataset
	    folders, creates the output folder skeleton, copies every dataset to its
	    place and prints a summary.

	    Exits through ``parser.error`` when the input directory does not exist
	    or is not a directory.
	    """
	    parser = argparse.ArgumentParser(description="Reconstruct CSI-4CAST dataset folder structure")
	    parser.add_argument("--input-dir", "-i", default=".",
	                        help="Directory containing downloaded datasets (default: current directory)")
	    parser.add_argument("--output-dir", "-o", default="data",
	                        help="Output directory for reconstructed structure (default: 'data')")

	    args = parser.parse_args()

	    input_dir = Path(args.input_dir).resolve()
	    output_dir = Path(args.output_dir).resolve()

	    # Fail early with a usage message instead of a traceback from iterdir(),
	    # and before any output folders have been created.
	    if not input_dir.is_dir():
	        parser.error(f"Input directory does not exist or is not a directory: {input_dir}")

	    print(f"Looking for datasets in: {input_dir}")
	    print(f"Reconstructing structure in: {output_dir}")
	    print()

	    # Scan BEFORE creating the output skeleton: when the output directory is
	    # the input directory, a freshly created "stats" folder would otherwise
	    # be mistaken for a downloaded dataset.
	    datasets = find_downloaded_datasets(input_dir)

	    # Create the directory structure
	    create_directory_structure(output_dir)

	    # (category key, target folder relative to output_dir, prefix to strip)
	    # NOTE(review): with an empty prefix the "stats" download is copied to
	    # <output>/stats/stats -- confirm that this nesting is intended.
	    layout = [
	        ('stats', ("stats",), ""),
	        ('test_regular', ("test", "regular"), "test_regular_"),
	        ('test_generalization', ("test", "generalization"), "test_generalization_"),
	        ('train_regular', ("train", "regular"), "train_regular_"),
	    ]

	    total_found = 0
	    total_reconstructed = 0

	    for category, target_parts, prefix in layout:
	        target_path = output_dir.joinpath(*target_parts)
	        for dataset in datasets[category]:
	            total_found += 1
	            source_path = input_dir / dataset
	            if reconstruct_dataset(dataset, source_path, target_path, prefix):
	                total_reconstructed += 1

	    print()
	    if total_found == 0:
	        print(f"Warning: no dataset folders were found in {input_dir}")
	    print("✅ Reconstruction complete!")
	    print(f"Total datasets reconstructed: {total_reconstructed}")
	    print(f"Reconstructed structure available at: {output_dir}")
	    print()
	    print("Final structure:")
	    # Show the actual output folder name rather than a hard-coded "data/".
	    print(f"{output_dir.name}/")
	    print("├── stats/")
	    print("├── test/")
	    print("│   ├── regular/")
	    print("│   └── generalization/")
	    print("└── train/")
	    print("    └── regular/")

	# Run the CLI only when executed as a script, not when imported as a module.
	if __name__ == "__main__":
	    main()