|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from sklearn.base import BaseEstimator, TransformerMixin |
|
|
from _config import config |
|
|
|
|
|
class CreateCombinedDataFrame(BaseEstimator, TransformerMixin):
    """Pair each valid self-report with the accelerometer samples around it.

    For every retained self-report row, the accelerometer samples of the same
    participant whose timestamp lies within ``time_window`` minutes before or
    after the report's ``timeOfNotification`` are selected, and one output row
    is produced per (report, sample) pair. Timestamps are handled as epoch
    milliseconds (they are converted with ``unit='ms'``).

    Parameters
    ----------
    time_window : int or float
        Half-width of the selection window, in minutes.
    label_columns : sequence of str, optional
        Report columns copied onto every output row. Reports whose value in
        any of these columns equals ``"NONE"`` are discarded. ``None`` is
        treated as "no label columns".
    """

    # Columns that every output row carries, before the label columns.
    _BASE_COLUMNS = ('participantId', 'selfreport_time', 'accel_time', 'x', 'y', 'z')

    def __init__(self, time_window, label_columns=None):
        # Parameters are stored untouched (scikit-learn convention); the
        # ``None`` default is resolved lazily in ``transform``.
        self.time_window = time_window
        self.label_columns = label_columns

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer is stateless."""
        return self

    def transform(self, X):
        """Build the combined report/accelerometer frame and export it as CSV.

        Parameters
        ----------
        X : tuple of (DataFrame, DataFrame)
            ``(df_reports, df_accel)``. ``df_reports`` is read through the
            columns ``timeOfEngagement``, ``timeOfNotification``,
            ``participantId`` and every label column. ``df_accel`` is read
            through ``participantId``, ``x``, ``y``, ``z`` and a time column
            named ``timestamp`` (or already ``timeOfNotification``).
            Neither input frame is modified.

        Returns
        -------
        DataFrame
            Columns ``groupid``, ``participantId``, ``selfreport_time``,
            ``accel_time``, ``x``, ``y``, ``z`` followed by the label columns.
            ``groupid`` is a 1-based id per (participantId, selfreport_time)
            pair. The frame is empty (but has these columns) when no
            accelerometer sample matches any report.

        Side effects
        ------------
        Writes the result to a CSV file in the current working directory and
        prints progress messages to stdout.
        """
        df_reports, df_accel = X
        # The constructor default is None; treat it as "no labels" instead of
        # failing when iterating / joining below.
        label_columns = [] if self.label_columns is None else list(self.label_columns)

        print(f"PreprocesssingCombined initialized with label_columns: {self.label_columns}")

        # Keep reports that were engaged with and have a real value for every label.
        valid_conditions = df_reports['timeOfEngagement'] != 0
        for label in label_columns:
            valid_conditions &= df_reports[label] != "NONE"
        df_reports = df_reports[valid_conditions]

        # rename() without inplace returns a new frame, so the caller's
        # accelerometer data is left untouched. The integer cast is done once
        # here rather than once per report.
        df_accel = df_accel.rename(columns={'timestamp': 'timeOfNotification'})
        df_accel['timeOfNotification'] = df_accel['timeOfNotification'].astype(np.int64)

        print(f"ExtractAccelData initialized with time_window: {self.time_window}")
        windows = [
            (report, self._extract_accel_data(report, df_accel))
            for _, report in df_reports.iterrows()
        ]

        print(f"Combining called with label_columns: {self.label_columns}")
        frames = []
        for report, window in windows:
            if window.empty:
                continue
            # Scalars taken from the report are broadcast over the window's samples.
            data = {
                'participantId': report['participantId'],
                'selfreport_time': report['timeOfNotification'],
                'accel_time': window['timeOfNotification'].to_numpy(),
                'x': window['x'].to_numpy(),
                'y': window['y'].to_numpy(),
                'z': window['z'].to_numpy(),
            }
            for label in label_columns:
                data[label] = report[label]
            frames.append(pd.DataFrame(data))

        if frames:
            combined_df = pd.concat(frames, ignore_index=True)
        else:
            # No (report, sample) pair at all: still return the expected
            # schema so the column accesses below do not raise KeyError.
            columns = list(self._BASE_COLUMNS)
            columns += [label for label in label_columns if label not in columns]
            combined_df = pd.DataFrame(columns=columns)

        combined_df['selfreport_time'] = pd.to_datetime(combined_df['selfreport_time'], unit='ms')
        combined_df['accel_time'] = pd.to_datetime(combined_df['accel_time'], unit='ms')

        # One id per (participant, report time) pair, placed as the first column.
        combined_df['groupid'] = combined_df.groupby(['participantId', 'selfreport_time']).ngroup() + 1
        combined_df.insert(0, 'groupid', combined_df.pop('groupid'))

        time_window_str = str(self.time_window)
        label_columns_str = "_".join(label_columns)
        file_name = f"combined_data_timewindow_{time_window_str}min_labels_{label_columns_str}.csv"
        combined_df.to_csv(file_name, index=False)
        print(f"Combined dataframe exported successfully to {file_name}.")

        return combined_df

    def _extract_accel_data(self, row, accel_data):
        """Return the accelerometer rows inside the window around one report.

        Parameters
        ----------
        row : Series
            A report row providing ``timeOfNotification`` (epoch ms) and
            ``participantId``.
        accel_data : DataFrame
            Accelerometer samples with ``participantId`` and
            ``timeOfNotification`` columns. It is not modified.

        Returns
        -------
        DataFrame
            Rows of ``accel_data`` for the same participant whose time lies
            in ``[t - window, t + window]`` (inclusive), with
            ``timeOfNotification`` cast to ``int64``.
        """
        time_delta = self.time_window * 60 * 1000  # minutes -> milliseconds
        start_time = row['timeOfNotification'] - time_delta
        end_time = row['timeOfNotification'] + time_delta
        participant_id = row['participantId']

        # Cast into a local Series instead of overwriting the caller's column.
        times = accel_data['timeOfNotification'].astype(np.int64)
        earliest, latest = times.min(), times.max()

        # NOTE(review): the available range spans all participants in
        # accel_data, not only `participant_id` -- confirm this is intended.
        if start_time < earliest or end_time > latest:
            print(
                f"Warning: Data does not cover the full {self.time_window}-minute window for participant {participant_id}. "
                f"Available range: {earliest} to {latest}. "
                f"Requested range: {start_time} to {end_time}."
            )

        # Clamping the bounds to [earliest, latest] would not change which
        # rows match, so the requested bounds are compared directly.
        mask = (
            (accel_data['participantId'] == participant_id)
            & (times >= start_time)
            & (times <= end_time)
        )
        selected = accel_data[mask].assign(timeOfNotification=times[mask].to_numpy())

        print("Start Time (ms):", start_time)
        print("End Time (ms):", end_time)
        print("Filtered Rows:\n", selected)

        return selected
|
|
|
|
|
|