Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""Split the cleaned dataset into scaled train and test CSV files.

Reads ``data/cleaned_data.csv``, holds out 20% of the rows as a test set
(stratified on ``engine_condition``, fixed seed 42), standardizes the feature
columns with a ``StandardScaler`` fitted on the training rows only, and writes
``data/train_scaled.csv`` and ``data/test_scaled.csv``.
"""
import os
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Splitting data...")

# Read the cleaned dataset from disk.
df = pd.read_csv("data/cleaned_data.csv")

# Keep the label apart from the features so that only the features are scaled.
y = df['engine_condition']
X = df.drop(columns='engine_condition')

# Hold out 20% of the rows; stratifying on the label keeps the class
# proportions the same in both partitions, and the seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

logger.info(f"Train shape: {X_train.shape}")
logger.info(f"Test shape: {X_test.shape}")

# Fit the scaler on the training features only, then reuse the fitted
# statistics for the test features so nothing leaks from the held-out rows.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# The scaler returns plain arrays: restore the feature column names and append
# the label positionally (``.values`` drops the shuffled index of the split).
train_df = pd.DataFrame(X_train_scaled, columns=X.columns).assign(
    engine_condition=y_train.values
)
test_df = pd.DataFrame(X_test_scaled, columns=X.columns).assign(
    engine_condition=y_test.values
)

# Persist both partitions without the positional index column.
train_df.to_csv('data/train_scaled.csv', index=False)
test_df.to_csv('data/test_scaled.csv', index=False)

logger.info("✓ Train-test split completed!")