import os
import sys

import numpy as np
import pandas as pd

# Add the project root to sys.path to import path_utils
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import path_utils  # noqa: E402  (must follow the sys.path tweak above)


def _add_engineered_features(df):
    """Derive the engineered columns and drop leakage/identifier columns.

    The frame passed in is modified in place for the added/encoded columns;
    the returned frame is the result of dropping the unwanted columns.

    Args:
        df: DataFrame holding the raw columns read below
            ('Process temperature [K]', 'Air temperature [K]', 'Torque [Nm]',
            'Rotational speed [rpm]', 'Tool wear [min]', 'Type', plus the
            columns listed in ``cols_to_drop``).

    Returns:
        The DataFrame with 'temp_diff', 'power' and 'tool_wear_torque' added,
        'Type' ordinally encoded, and the drop-list columns removed.

    Raises:
        KeyError: if any of the referenced columns is missing.
    """
    # 1. Temperature Difference (Process - Air)
    df['temp_diff'] = df['Process temperature [K]'] - df['Air temperature [K]']

    # 2. Power (Torque * angular speed in rad/s)
    #    Angular speed = RPM * 2 * PI / 60
    angular_speed = df['Rotational speed [rpm]'] * 2 * np.pi / 60
    df['power'] = df['Torque [Nm]'] * angular_speed

    # 3. Tool Wear * Torque (Mechanical stress indicator)
    df['tool_wear_torque'] = df['Tool wear [min]'] * df['Torque [Nm]']

    # 4. Ordinal Encoding for Type (L < M < H quality)
    type_map = {'L': 0, 'M': 1, 'H': 2}
    encoded_type = df['Type'].map(type_map)
    # Series.map yields NaN for keys absent from the mapping; surface that
    # instead of letting unexpected categories vanish silently.
    unmapped = df['Type'].notna() & encoded_type.isna()
    if unmapped.any():
        unexpected = df.loc[unmapped, 'Type'].unique().tolist()
        print(
            f"Warning: {int(unmapped.sum())} row(s) have an unrecognised "
            f"'Type' value {unexpected}; they are encoded as NaN."
        )
    df['Type'] = encoded_type

    # 5. Drop Data Leakage and Unnecessary columns
    #    Sub-labels (TWF, HDF, PWF, OSF, RNF) indicate the cause of failure,
    #    which is leakage for binary classification
    cols_to_drop = ['UDI', 'Product ID', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
    return df.drop(columns=cols_to_drop)


def perform_feature_engineering():
    """Read the raw CSV, build the feature table, and write it to disk.

    The raw file location comes from ``path_utils.get_raw_data_path`` and the
    output location from ``path_utils.get_processed_data_path``. If the raw
    file is missing, an error is printed and the function returns without
    writing anything.

    Returns:
        None
    """
    # Load raw data
    raw_path = path_utils.get_raw_data_path('ai4i2020.csv')
    if not os.path.exists(raw_path):
        print(f"Error: Raw dataset not found at {raw_path}")
        return

    df = pd.read_csv(raw_path)
    print("Raw data loaded.")

    df = _add_engineered_features(df)

    # Save processed features
    processed_path = path_utils.get_processed_data_path('features.csv')
    # Make sure the destination directory exists so to_csv cannot fail on a
    # fresh checkout; skip when the path has no directory component.
    output_dir = os.path.dirname(processed_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    df.to_csv(processed_path, index=False)

    print(f"Feature engineering complete. File saved to {processed_path}")
    print(f"Columns in processed data: {df.columns.tolist()}")


if __name__ == "__main__":
    perform_feature_engineering()