Spaces:

mininato
/

EmotionClassificationPipeline

Sleeping

File size: 4,826 Bytes
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from _config import config

class CreateCombinedDataFrame(BaseEstimator, TransformerMixin):
    def __init__(self, time_window, label_columns=None):
        self.time_window = time_window
        self.label_columns = label_columns

    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        df_reports, df_accel = X

        print(f"PreprocesssingCombined initialized with label_columns: {self.label_columns}")
        # Ensure the chosen label columns exist in the dataset
        valid_conditions = (df_reports['timeOfEngagement'] != 0) 
        for label in self.label_columns:
            valid_conditions &= (df_reports[label] != "NONE")
        
        df_reports = df_reports[valid_conditions].copy()

        # No datetime conversion needed; timestamps remain as integers
        df_accel.rename(columns={'timestamp': 'timeOfNotification'}, inplace=True)

        print(f"ExtractAccelData initialized with time_window: {self.time_window}")
        df_reports['accel_data'] = df_reports.apply(lambda row: self._extract_accel_data(row, df_accel), axis=1)
    
        print(f"Combining called with label_columns: {self.label_columns}")
        combined_data = []

        for _, row in df_reports.iterrows():
            accel_data = row['accel_data']
            for _, accel_row in accel_data.iterrows():
                combined_row = {
                    'participantId': row['participantId'],  # Participant ID
                    'selfreport_time': row['timeOfNotification'],  # Self-report time
                    'accel_time': accel_row['timeOfNotification'],  # Accelerometer data time
                    'x': accel_row['x'],  # x-axis accelerometer data
                    'y': accel_row['y'],  # y-axis accelerometer data
                    'z': accel_row['z']   # z-axis accelerometer data
                }

                # Dynamically add emotion labels to the combined row
                for label in self.label_columns:
                    combined_row[label] = row[label]

                combined_data.append(combined_row)

        combined_df = pd.DataFrame(combined_data)

        # Convert integer timestamps back to datetime format for the CSV
        combined_df['selfreport_time'] = pd.to_datetime(combined_df['selfreport_time'], unit='ms')
        combined_df['accel_time'] = pd.to_datetime(combined_df['accel_time'], unit='ms')

        # Create groupid column (unique identifier based on participantId and selfreport_time)
        combined_df['groupid'] = combined_df.groupby(['participantId', 'selfreport_time']).ngroup() + 1
        col = combined_df.pop("groupid")  # Move groupid to the first column
        combined_df.insert(0, col.name, col)

        # Export the combined dataframe to CSV
        time_window_str = str(self.time_window)
        label_columns_str = "_".join(self.label_columns)
        file_name = f"combined_data_timewindow_{time_window_str}min_labels_{label_columns_str}.csv"
        combined_df.to_csv(file_name, index=False)
        print(f"Combined dataframe exported successfully to {file_name}.")

        return combined_df
    
    def _extract_accel_data(self, row, accel_data):
        time_delta = self.time_window * 60 * 1000  # Convert minutes to milliseconds
        start_time = row['timeOfNotification'] - time_delta  # Keep as integer
        end_time = row['timeOfNotification'] + time_delta  # Keep as integer
        participant_id = row['participantId']

        # Ensure accel_data['timeOfNotification'] is also an integer
        accel_data['timeOfNotification'] = accel_data['timeOfNotification'].astype(np.int64)  # Ensure integer format

        # Log a warning if the desired time range exceeds the available data range
        if start_time < accel_data['timeOfNotification'].min() or end_time > accel_data['timeOfNotification'].max():
            print(
                f"Warning: Data does not cover the full {self.time_window}-minute window for participant {participant_id}. "
                f"Available range: {accel_data['timeOfNotification'].min()} to {accel_data['timeOfNotification'].max()}. "
                f"Requested range: {start_time} to {end_time}."
            )

        # Apply the filtering mask
        mask = (
            (accel_data['participantId'] == participant_id) &
            (accel_data['timeOfNotification'] >= max(start_time, accel_data['timeOfNotification'].min())) &
            (accel_data['timeOfNotification'] <= min(end_time, accel_data['timeOfNotification'].max()))
        )

        

        print("Start Time (ms):", start_time)
        print("End Time (ms):", end_time)
        print("Filtered Rows:\n", accel_data[mask])

        return accel_data[mask]