Spaces:

Divya499
/

ReliabilityPulse

Sleeping

ReliabilityPulse / pipeline /02_feature_engineering.py

DIVYANSHI SINGH

Final Precision Deployment: Stable UI + Git LFS

27a3018 2 months ago

1.71 kB

	import pandas as pd
	import numpy as np
	import os
	import sys

	# Add the project root to sys.path to import path_utils
	sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
	import path_utils

	def _add_engineered_features(df):
	    """Return a copy of ``df`` with derived features added and unused columns dropped.

	    The input frame is expected to contain the raw columns referenced below
	    (temperatures, rotational speed, torque, tool wear, ``Type`` and the
	    identifier / failure sub-label columns); a missing column raises ``KeyError``.
	    """
	    df = df.copy()

	    # 1. Temperature Difference (Process - Air)
	    df['temp_diff'] = df['Process temperature [K]'] - df['Air temperature [K]']

	    # 2. Power (Torque * angular speed in rad/s)
	    # Angular speed = RPM * 2 * PI / 60
	    df['power'] = df['Torque [Nm]'] * (df['Rotational speed [rpm]'] * 2 * np.pi / 60)

	    # 3. Tool Wear * Torque (Mechanical stress indicator)
	    df['tool_wear_torque'] = df['Tool wear [min]'] * df['Torque [Nm]']

	    # 4. Ordinal Encoding for Type (L < M < H quality)
	    type_map = {'L': 0, 'M': 1, 'H': 2}
	    encoded_type = df['Type'].map(type_map)
	    # Series.map yields NaN for labels missing from the mapping; surface that
	    # instead of letting unexpected labels vanish silently into NaN.
	    unmapped = encoded_type.isna() & df['Type'].notna()
	    if unmapped.any():
	        unknown = sorted(df.loc[unmapped, 'Type'].astype(str).unique())
	        print(f"Warning: unmapped 'Type' values encoded as NaN: {unknown}")
	    df['Type'] = encoded_type

	    # 5. Drop Data Leakage and Unnecessary columns
	    # Sub-labels (TWF, HDF, PWF, OSF, RNF) indicate the cause of failure,
	    # which is leakage for binary classification
	    cols_to_drop = ['UDI', 'Product ID', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
	    return df.drop(columns=cols_to_drop)


	def perform_feature_engineering(raw_path=None, processed_path=None):
	    """Build the processed feature table from the raw dataset and save it as CSV.

	    Args:
	        raw_path: Location of the raw CSV. When ``None`` (the default) it is
	            resolved with ``path_utils.get_raw_data_path('ai4i2020.csv')``.
	        processed_path: Destination of the processed CSV. When ``None`` (the
	            default) it is resolved with
	            ``path_utils.get_processed_data_path('features.csv')``.

	    Returns:
	        None. If the raw file does not exist an error message is printed and
	        nothing is written.
	    """
	    # Load raw data
	    if raw_path is None:
	        raw_path = path_utils.get_raw_data_path('ai4i2020.csv')
	    if not os.path.exists(raw_path):
	        print(f"Error: Raw dataset not found at {raw_path}")
	        return

	    df = pd.read_csv(raw_path)
	    print("Raw data loaded.")

	    df = _add_engineered_features(df)

	    # Save processed features
	    if processed_path is None:
	        processed_path = path_utils.get_processed_data_path('features.csv')
	    # Make sure the destination directory exists so to_csv cannot fail on a
	    # fresh checkout; a bare filename has no directory component to create.
	    out_dir = os.path.dirname(processed_path)
	    if out_dir:
	        os.makedirs(out_dir, exist_ok=True)
	    df.to_csv(processed_path, index=False)
	    print(f"Feature engineering complete. File saved to {processed_path}")
	    print(f"Columns in processed data: {df.columns.tolist()}")

	# Script entry point: run the feature-engineering step only when this file is
	# executed directly, not when it is imported by another module.
	if __name__ == "__main__":
	    perform_feature_engineering()