SharleyK's picture
Upload folder using huggingface_hub
9d8621a verified
#!/usr/bin/env python3
"""Split data into train and test sets"""
import os
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# ---------------------------------------------------------------------------
# Logging setup
# ---------------------------------------------------------------------------
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.info("Splitting data...")

# ---------------------------------------------------------------------------
# Load the cleaned dataset and separate the label from the features
# ---------------------------------------------------------------------------
df = pd.read_csv("data/cleaned_data.csv")

# 'engine_condition' is the target; every other column is used as a feature.
y = df['engine_condition']
X = df.drop(columns=['engine_condition'])

# ---------------------------------------------------------------------------
# Hold out 20% of the rows for testing
# ---------------------------------------------------------------------------
# stratify=y keeps the class proportions of the target the same in both
# splits; the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

logger.info(f"Train shape: {X_train.shape}")
logger.info(f"Test shape: {X_test.shape}")

# ---------------------------------------------------------------------------
# Standardize the features
# ---------------------------------------------------------------------------
# The scaler learns its statistics from the training rows only and is then
# applied unchanged to the test rows, so no test information leaks into it.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------------------------------------------------------
# Rebuild labelled DataFrames and write them to disk
# ---------------------------------------------------------------------------
# The scaled arrays carry no column labels, so the original feature names
# are restored. The target is attached via `.values` (positional), which
# avoids aligning on the shuffled index left over from the split.
train_df = pd.DataFrame(X_train_scaled, columns=X.columns).assign(
    engine_condition=y_train.values
)
test_df = pd.DataFrame(X_test_scaled, columns=X.columns).assign(
    engine_condition=y_test.values
)

train_df.to_csv('data/train_scaled.csv', index=False)
test_df.to_csv('data/test_scaled.csv', index=False)

logger.info("✓ Train-test split completed!")