import pandas as pd 
import numpy as np 


def prepare_data(input_path: str = 'data/train_FD001.txt',
                 output_path: str = 'data/processed_data.csv') -> pd.DataFrame:
    """Load raw run-to-failure records, add an RUL column, and save them as CSV.

    The raw file is read as space-separated values without a header row.
    Each row is labelled with a unit number, a cycle counter, three setting
    columns and 21 sensor columns (``s_1`` .. ``s_21``). A ``RUL`` (Remaining
    Useful Life) column is then appended: for every row it is the largest
    ``time_in_cycles`` seen for that row's ``unit_number`` minus the row's own
    ``time_in_cycles``, so the last recorded cycle of each unit gets RUL 0.

    Args:
        input_path: Path of the raw space-separated text file.
        output_path: Path the processed CSV is written to (no index column).

    Returns:
        The processed DataFrame, identical to what is written to
        ``output_path``.

    Raises:
        ValueError: If the file does not contain exactly 26 data columns
            after the empty trailing columns have been removed.
    """
    train_df = pd.read_csv(input_path, sep=' ', header=None)

    # Each line of the raw file ends with trailing spaces, which a single-space
    # separator turns into empty extra columns (labelled 26 and 27). Drop them
    # before naming the columns; errors='ignore' keeps this a no-op for files
    # that do not have them.
    train_df = train_df.drop(columns=[26, 27], errors='ignore')

    # NOTE(review): the third setting column is called 'setting' rather than
    # 'setting_3'. It looks like a typo, but the name ends up in the output
    # CSV header, so it is kept until downstream readers are confirmed.
    columns = (
        ['unit_number', 'time_in_cycles', 'setting_1', 'setting_2', 'setting']
        + [f's_{i}' for i in range(1, 22)]
    )

    # Fail with an explicit message instead of pandas' generic length-mismatch
    # error when the input does not have the expected layout.
    if train_df.shape[1] != len(columns):
        raise ValueError(
            f'Expected {len(columns)} data columns in {input_path!r}, '
            f'found {train_df.shape[1]}'
        )
    train_df.columns = columns

    # transform('max') broadcasts each unit's final cycle back onto every one
    # of its rows, so no temporary merge column is needed.
    last_cycle = train_df.groupby('unit_number')['time_in_cycles'].transform('max')
    train_df['RUL'] = last_cycle - train_df['time_in_cycles']

    train_df.to_csv(output_path, index=False)
    return train_df


# Run the preprocessing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__": 
    prepare_data()