Spaces:

SharleyK
/

PredictiveMaintenance

Sleeping

Upload folder using huggingface_hub

9d8621a verified about 1 month ago

1.21 kB

	#!/usr/bin/env python3
	"""Split data into train and test sets"""
	import os
	import logging
	import pandas as pd
	from sklearn.model_selection import train_test_split
	from sklearn.preprocessing import StandardScaler

	# Script body: load the cleaned dataset, make a stratified train/test split,
	# standardize the feature columns, and write both partitions to CSV.

	# Configure logging at INFO level and use a module-scoped logger.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	logger.info("Splitting data...")

	# Read the cleaned dataset, then pull the label column apart from the features.
	df = pd.read_csv("data/cleaned_data.csv")
	y = df['engine_condition']
	X = df.drop(columns=['engine_condition'])

	# Hold out 20% of the rows for testing; the split is stratified on the
	# label and seeded so repeated runs produce the same partitions.
	X_train, X_test, y_train, y_test = train_test_split(
	    X,
	    y,
	    test_size=0.2,
	    random_state=42,
	    stratify=y,
	)

	logger.info(f"Train shape: {X_train.shape}")
	logger.info(f"Test shape: {X_test.shape}")

	# Fit the scaler on the training features only, then apply that same
	# fitted transform to both partitions.
	scaler = StandardScaler().fit(X_train)
	X_train_scaled = scaler.transform(X_train)
	X_test_scaled = scaler.transform(X_test)


	def _with_target(scaled_features, target):
	    """Wrap scaled features in a DataFrame and append the label column."""
	    frame = pd.DataFrame(scaled_features, columns=X.columns)
	    # .values drops the Series index so labels are attached by position;
	    # the new frame has a fresh default index, not the shuffled one.
	    frame['engine_condition'] = target.values
	    return frame


	train_df = _with_target(X_train_scaled, y_train)
	test_df = _with_target(X_test_scaled, y_test)

	# Write the training partition first, then the test partition.
	for frame, destination in (
	    (train_df, 'data/train_scaled.csv'),
	    (test_df, 'data/test_scaled.csv'),
	):
	    frame.to_csv(destination, index=False)

	logger.info("✓ Train-test split completed!")