File size: 1,213 Bytes
9d8621a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python3
"""Split data into train and test sets"""
import os
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Splitting data...")

# Read the cleaned dataset, then pull the label column apart from the features.
df = pd.read_csv("data/cleaned_data.csv")
y = df['engine_condition']
X = df.drop(columns=['engine_condition'])

# Hold out 20% of the rows for testing; the split is seeded and stratified
# on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

logger.info(f"Train shape: {X_train.shape}")
logger.info(f"Test shape: {X_test.shape}")

# Fit the scaler on the training features only, then apply that same fitted
# transform to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Rebuild labelled DataFrames from the scaled arrays. `.values` discards the
# shuffled index so the target is attached by position, matching the fresh
# row numbering of the rebuilt frames.
feature_names = X.columns
train_df = pd.DataFrame(X_train_scaled, columns=feature_names).assign(
    engine_condition=y_train.values
)
test_df = pd.DataFrame(X_test_scaled, columns=feature_names).assign(
    engine_condition=y_test.values
)

# Write both splits to disk without the index column.
for frame, destination in (
    (train_df, 'data/train_scaled.csv'),
    (test_df, 'data/test_scaled.csv'),
):
    frame.to_csv(destination, index=False)

logger.info("✓ Train-test split completed!")