Spaces:

dev-jas
/

polymer-aging-with-ml

Sleeping

polymer-aging-with-ml / backend /utils /prepare_data.py

devjas1

Initial Release: Polymer Aging With ML [Standalone Appliance]

4a0e21d about 1 month ago

2.95 kB

	"""
	Data Preparation Script

	This script takes a raw dataset, performs a stratified split into
	training, validation, and test sets, and saves them to a processed
	data directory. This ensures a consistent and reproducible data
	splitting strategy.

	Usage:
	python backend/utils/prepare_data.py --data-path /path/to/raw/data.csv --output-path data/processed
	"""

	from pathlib import Path
	import argparse
	import pandas as pd
	from sklearn.model_selection import train_test_split


	def prepare_data(
	    data_path: Path,
	    output_path: Path,
	    test_size: float = 0.2,
	    val_size: float = 0.15,
	    random_state: int = 42,
	) -> None:
	    """
	    Load a CSV dataset, split it into train/validation/test sets, and save them.

	    The test set is split off first; the validation set is then taken from
	    the rows that remain. Both splits are stratified on the 'label' column.
	    The results are written to ``train.csv``, ``validation.csv`` and
	    ``test.csv`` inside ``output_path``.

	    Args:
	        data_path (Path): Path to the raw data file (CSV expected).
	        output_path (Path): Directory to save the processed data splits.
	            Created (including parents) if it does not exist.
	        test_size (float): Proportion of the dataset to include in the test split.
	        val_size (float): Proportion of the remaining (non-test) rows to use
	            for validation.
	        random_state (int): Seed passed to both splits. Defaults to 42, the
	            value that was previously hard-coded.

	    Raises:
	        FileNotFoundError: If ``data_path`` does not exist.
	        ValueError: If the data has no 'label' column, or if that column
	            contains missing values.
	    """
	    # Accept plain strings as well as Path objects.
	    data_path = Path(data_path)
	    output_path = Path(output_path)

	    if not data_path.exists():
	        raise FileNotFoundError(f"Raw data not found at {data_path}")

	    print(f"Loading data from {data_path}...")
	    # Only the 'label' column is required here; every other column is
	    # carried through to the output files unchanged.
	    df = pd.read_csv(data_path)

	    # Ensure the 'label' column exists in the dataset
	    if 'label' not in df.columns:
	        raise ValueError(
	            "The input data must contain a 'label' column for stratified splitting.")

	    # Stratification needs a class for every row, so reject unlabeled rows
	    # here with a clear message instead of failing inside the splitter.
	    missing_labels = int(df['label'].isna().sum())
	    if missing_labels:
	        raise ValueError(
	            f"The 'label' column contains {missing_labels} missing value(s); "
	            "every row needs a label for stratified splitting.")

	    print("Performing stratified train-test split...")
	    # Split off the test set first
	    train_val_df, test_df = train_test_split(
	        df, test_size=test_size, stratify=df['label'], random_state=random_state
	    )

	    # Split the remaining data into training and validation sets
	    train_df, val_df = train_test_split(
	        train_val_df,
	        test_size=val_size,
	        stratify=train_val_df['label'],
	        random_state=random_state,
	    )

	    print(f"Train set size: {len(train_df)}")
	    print(f"Validation set size: {len(val_df)}")
	    print(f"Test set size: {len(test_df)}")

	    # Create the output directory only once both splits have succeeded, so a
	    # failed run does not leave an empty directory behind.
	    output_path.mkdir(parents=True, exist_ok=True)

	    # Save the splits
	    train_df.to_csv(output_path / "train.csv", index=False)
	    val_df.to_csv(output_path / "validation.csv", index=False)
	    test_df.to_csv(output_path / "test.csv", index=False)

	    print(f"✅ Data splits saved to {output_path}")


	def build_parser() -> argparse.ArgumentParser:
	    """
	    Build the command-line parser for the data preparation script.

	    Returns:
	        argparse.ArgumentParser: Parser exposing ``--data-path`` (required),
	        ``--output-path``, ``--test-size`` and ``--val-size``.
	    """
	    parser = argparse.ArgumentParser(
	        description="Prepare and split spectral data.")
	    parser.add_argument("--data-path", type=Path, required=True,
	                        help="Path to the raw data CSV file.")
	    parser.add_argument("--output-path", type=Path, default=Path(
	        "data/processed"), help="Directory to save data splits.")
	    # The two defaults below mirror the prepare_data() signature defaults;
	    # keep them in sync if either changes.
	    parser.add_argument("--test-size", type=float, default=0.2,
	                        help="Proportion of the dataset to hold out as the test split.")
	    parser.add_argument("--val-size", type=float, default=0.15,
	                        help="Proportion of the remaining (non-test) data to use for validation.")
	    return parser


	if __name__ == "__main__":
	    args = build_parser().parse_args()
	    prepare_data(
	        args.data_path,
	        args.output_path,
	        test_size=args.test_size,
	        val_size=args.val_size,
	    )