Spaces:

ashandilgith
/

predictive_maintenance

Sleeping

predictive_maintenance / prepare_data.py

GitHub Actions Bot

feat: Deploy latest version of Gradio app

41d029b 8 months ago

2.5 kB

	import pandas as pd
	import numpy as np


	def prepare_data(
	    input_path: str = 'data/train_FD001.txt',
	    output_path: str = 'data/processed_data.csv',
	) -> None:
	    """Load the raw training file, add a Remaining Useful Life column, save as CSV.

	    The raw file is read as headerless, single-space-separated text. The
	    26 data columns are named, an ``RUL`` column is appended (each unit's
	    last recorded cycle minus the row's current cycle), and the result is
	    written to ``output_path`` without the index.

	    Args:
	        input_path: Path of the raw space-separated training file.
	        output_path: Path of the CSV file to write. Its parent directory
	            is created if it does not exist yet.

	    Raises:
	        ValueError: If the parsed file does not have exactly 26 data
	            columns once the trailing empty columns are removed.
	    """
	    # Function-scope import so this block does not depend on the file's
	    # top-level import list.
	    from pathlib import Path

	    train_df = pd.read_csv(input_path, sep=' ', header=None)

	    # The raw file has trailing spaces, which pandas reads as empty columns
	    # (usually numbered 26 and 27). Drop them before naming the columns;
	    # errors='ignore' keeps this a no-op when they are absent.
	    train_df.drop(columns=[26, 27], inplace=True, errors='ignore')

	    # NOTE(review): the third setting column is named 'setting' rather than
	    # 'setting_3'. Kept unchanged because other code may read the processed
	    # CSV by this name -- confirm before renaming.
	    column_names = (
	        ['unit_number', 'time_in_cycles', 'setting_1', 'setting_2', 'setting']
	        + [f's_{i}' for i in range(1, 22)]
	    )

	    # Fail with an explicit message instead of pandas' generic
	    # "Length mismatch" error when the file layout is not the expected one.
	    if train_df.shape[1] != len(column_names):
	        raise ValueError(
	            f"Expected {len(column_names)} data columns in {input_path!r}, "
	            f"found {train_df.shape[1]}"
	        )
	    train_df.columns = column_names

	    # Remaining Useful Life: for every row, the highest cycle recorded for
	    # its unit minus the row's own cycle. transform('max') broadcasts the
	    # per-unit maximum back onto each row, so no temporary column or merge
	    # is needed.
	    last_cycle = train_df.groupby('unit_number')['time_in_cycles'].transform('max')
	    train_df['RUL'] = last_cycle - train_df['time_in_cycles']

	    # Save the processed data.
	    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
	    train_df.to_csv(output_path, index=False)


	# Run the preparation step only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    prepare_data()