Spaces:
Sleeping
Sleeping
import numpy as np
import pandas as pd
def prepare_data(
    input_path="data/train_FD001.txt",
    output_path="data/processed_data.csv",
):
    """Label a run-to-failure table with Remaining Useful Life and save it as CSV.

    Reads a headerless, whitespace-separated table with 26 fields per row,
    names the columns, appends an ``RUL`` column and writes the result to
    ``output_path`` without the index.

    ``RUL`` for a row is the highest ``time_in_cycles`` recorded for that
    row's ``unit_number`` minus the row's own ``time_in_cycles``, so the last
    recorded cycle of every unit gets ``RUL == 0``.

    Args:
        input_path: Path of the raw text file to read.
        output_path: Path of the CSV file to write.

    Raises:
        ValueError: If the parsed table does not have exactly 26 columns.
    """
    # NOTE(review): 'setting' looks like a typo for 'setting_3'. It is kept
    # as-is because it becomes a header in the output CSV and readers of that
    # file may rely on the current name -- confirm before renaming.
    columns = [
        "unit_number",
        "time_in_cycles",
        "setting_1",
        "setting_2",
        "setting",
    ] + [f"s_{i}" for i in range(1, 22)]

    # Rows end with trailing spaces. Splitting on a single ' ' turns those
    # into empty phantom columns that then have to be dropped by position;
    # splitting on runs of whitespace never creates them in the first place.
    train_df = pd.read_csv(input_path, sep=r"\s+", header=None)

    # Fail with a clear message instead of silently mislabelling columns when
    # the file does not have the expected layout.
    if train_df.shape[1] != len(columns):
        raise ValueError(
            f"Expected {len(columns)} columns in {input_path!r}, "
            f"found {train_df.shape[1]}"
        )
    train_df.columns = columns

    # transform('max') broadcasts each unit's final cycle back onto every one
    # of its rows, so no temporary merge column is needed.
    final_cycle = train_df.groupby("unit_number")["time_in_cycles"].transform("max")
    train_df["RUL"] = final_cycle - train_df["time_in_cycles"]

    train_df.to_csv(output_path, index=False)
if __name__ == "__main__":
    # Run the preprocessing step only when executed as a script, not on import.
    prepare_data()