Upload 34 files
- 01_combining_dataframes_pipeline.py +26 -0
- 02_feature_extraction_pipeline.py +27 -0
- 03_training_model_pipeline.py +21 -0
- 04_analyzing_data_pipeline.py +24 -0
- 05_complete_trainmodel_pipeline.py +30 -0
- LICENSE +21 -0
- _config.py +57 -0
- app.py +342 -0
- pipeline_classes/__init__.py +10 -0
- pipeline_classes/__pycache__/__init__.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/classify_movementdata.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/classify_movementdata.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/create_combineddataframe.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/create_combineddataframe.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/extract_features.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/extract_features.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/import_data.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/lowpassfilter.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/lowpassfilter.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/pcahandler.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/pcahandler.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/scale_xyzdata.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/scale_xyzdata.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/train_model.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/train_model.cpython-313.pyc +0 -0
- pipeline_classes/classify_movementdata.py +40 -0
- pipeline_classes/create_combineddataframe.py +103 -0
- pipeline_classes/extract_features.py +272 -0
- pipeline_classes/lowpassfilter.py +55 -0
- pipeline_classes/pcahandler.py +24 -0
- pipeline_classes/scale_xyzdata.py +28 -0
- pipeline_classes/train_model.py +195 -0
- requirements.txt +9 -0
01_combining_dataframes_pipeline.py
ADDED (+26 lines)

from sklearn.pipeline import Pipeline
from pipeline_classes import CreateCombinedDataFrame
from _config import config
import time
import pandas as pd

accel_data = pd.read_csv(config["accel_path"])
reports_data = pd.read_csv(config["reports_path"])

X = (reports_data, accel_data)

# This pipeline combines the self-report and accelerometer dataframes within a given time window
# into a single dataframe and exports it as a CSV file
combining_dataframes_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),  # input path to self-reports data
    ('create_combined_dataframe', CreateCombinedDataFrame(time_window=config["time_window"], label_columns=config["label_columns"])),
])

# Measure the time taken to run the pipeline
start_time = time.time()

# Run the pipeline; returns the combined dataframe
output_df = combining_dataframes_pipeline.fit_transform(X)

end_time = time.time()
print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
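Each of the runner scripts below repeats this same start/end timing pattern. A small shared helper could factor it out; this is a hypothetical sketch (the `timed` helper is not part of the upload):

import time
from contextlib import contextmanager

@contextmanager
def timed(label="Pipeline"):
    # Prints elapsed minutes and seconds when the with-block exits
    start = time.time()
    yield
    elapsed = time.time() - start
    print(f"{label} took {int(elapsed // 60)} minutes and {elapsed % 60:.2f} seconds")

# Usage:
#   with timed("Combining dataframes"):
#       output_df = combining_dataframes_pipeline.fit_transform(X)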
02_feature_extraction_pipeline.py
ADDED (+27 lines)

from sklearn.pipeline import Pipeline
from pipeline_classes import ImportData, LowPassFilter, ScaleXYZData, ExtractFeatures
from _config import config
import time

# This pipeline extracts features from the combined dataframe and exports them to a CSV file
feature_extraction_pipeline = Pipeline([
    ('import_data', ImportData(use_accel=False, use_reports=False, use_combined=True, use_features=False)),  # input path to combined data
    ('low_pass_filter', LowPassFilter(cutoff_frequency=config["cutoff_frequency"], sampling_rate=config["data_frequency"], order=config["order"])),
    ('scale_xyz_data', ScaleXYZData(scaler_type=config["scaler_type"])),
    ('extract_features', ExtractFeatures(window_length=config["window_length"],
                                         window_step_size=config["window_step_size"],
                                         data_frequency=config["data_frequency"],
                                         selected_domains=config["selected_domains"],
                                         include_magnitude=config["include_magnitude"],
                                         features_label_columns=config["label_columns"])),  # keyword name per ExtractFeatures.__init__
])

# Measure the time taken to run the pipeline
start_time = time.time()

# Run the pipeline; returns the feature dataframe
output_df = feature_extraction_pipeline.fit_transform(None)

end_time = time.time()
print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
03_training_model_pipeline.py
ADDED (+21 lines)

from sklearn.pipeline import Pipeline
from pipeline_classes import ImportData, PCAHandler, TrainModel
from _config import config
import time

# This pipeline trains a model on the feature dataframe, exports the model to a pickle file, and writes general information to a JSON file
training_model_pipeline = Pipeline([
    ('import_data', ImportData(use_accel=False, use_reports=False, use_combined=False, use_features=True)),
    ('pca_handler', PCAHandler(apply_pca=config["apply_pca"], variance=config["pca_variance"])),
    ('train_model', TrainModel(config=config)),
])

# Measure the time taken to run the pipeline
start_time = time.time()

# Run the pipeline; returns the model and a report
output_df = training_model_pipeline.fit_transform(None)

end_time = time.time()
print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
04_analyzing_data_pipeline.py
ADDED (+24 lines)

from sklearn.pipeline import Pipeline
from pipeline_classes import ImportData, LowPassFilter, ScaleXYZData, ExtractFeatures, ClassifyMovementData
from _config import config
import time

# This pipeline analyzes data that hasn't been classified yet and exports the classified dataframe as a CSV file
analyzing_data_pipeline = Pipeline([
    ('import_data', ImportData(use_accel=True, use_reports=False, use_combined=False, use_features=False)),  # input path to accelerometer data
    ('low_pass_filter', LowPassFilter(cutoff_frequency=config["cutoff_frequency"], sampling_rate=config["data_frequency"], order=config["order"])),
    ('scale_xyz_data', ScaleXYZData(scaler_type=config["scaler_type"])),
    ('extract_features', ExtractFeatures(window_length=config["window_length"],
                                         window_step_size=config["window_step_size"],
                                         data_frequency=config["data_frequency"],
                                         selected_domains=config["selected_domains"],
                                         include_magnitude=config["include_magnitude"])),
    ('classify_movement_data', ClassifyMovementData()),  # a model_file must be supplied (e.g. via set_params) before transform, or it raises ValueError
])

# Measure the time taken to run the pipeline
start_time = time.time()

# Run the pipeline; returns the classified dataframe
output_df = analyzing_data_pipeline.fit_transform(None)

end_time = time.time()
print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
05_complete_trainmodel_pipeline.py
ADDED (+30 lines)

from pipeline_classes import ImportData, LowPassFilter, ScaleXYZData, ExtractFeatures, CreateCombinedDataFrame, TrainModel, PCAHandler
from _config import config
from sklearn.pipeline import Pipeline
import time

# This is the complete pipeline: it trains a model on the combined dataframe, exports the model to a pickle file, and writes general information to a JSON file
complete_training_model_pipeline = Pipeline([
    ('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),
    ('create_combined_dataframe', CreateCombinedDataFrame(time_window=config["time_window"], label_columns=config["label_columns"])),
    ('low_pass_filter', LowPassFilter(cutoff_frequency=config["cutoff_frequency"], sampling_rate=config["data_frequency"], order=config["order"])),
    ('scale_xyz_data', ScaleXYZData(scaler_type=config["scaler_type"])),
    ('extract_features', ExtractFeatures(window_length=config["window_length"],
                                         window_step_size=config["window_step_size"],
                                         data_frequency=config["data_frequency"],
                                         selected_domains=config["selected_domains"],
                                         include_magnitude=config["include_magnitude"],
                                         features_label_columns=config["label_columns"])),
    ('pca_handler', PCAHandler(apply_pca=config["apply_pca"], variance=config["pca_variance"])),
    ('train_model', TrainModel(config=config)),
])

# Measure the time taken to run the pipeline
start_time = time.time()

# Run the pipeline; returns the model and a report
output_df = complete_training_model_pipeline.fit_transform(None)

end_time = time.time()
print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
LICENSE
ADDED (+21 lines)

MIT License

Copyright (c) 2024 mininato

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
_config.py
ADDED (+57 lines)

# Configuration file for the pipeline

config = {
    # Paths for Import Data
    "accel_path": "/Users/anhducduong/Documents/GitHub/EmotionRecognitionPipeline/EmotionRecognitionPipeline/AccelerometerMeasurements_backup.csv",  # Path to the accelerometer data
    "reports_path": "/Users/anhducduong/Documents/GitHub/EmotionRecognitionPipeline/EmotionRecognitionPipeline/UserTestingSelfReports.csv",  # Path to the self-reports data
    #"combined_data_path": "Path or Name of File of Combined Data File",  # Path to the combined data
    #"features_data_path": "Path or Name of File of Features Data File",  # Path to the features data
    #"model_path": "Path or Name of Trained Model File",  # Path to the trained model

    # Label Configuration
    "label_columns": ["valence", "arousal"],  # The emotion labels you are using
    "target_label": "arousal",  # The target label to predict (only one label can be selected)

    # Configuration for combined data
    "time_window": 3,  # Minutes before and after the self-report

    # Configuration for feature extraction
    "window_length": 60,  # Window length in seconds
    "window_step_size": 20,  # Step size in seconds; typically 10%-50% of window_length
    "data_frequency": 25,  # Data frequency in Hz
    "selected_domains": None,  # None = every domain; options: 'time_domain', 'spatial', 'frequency', 'statistical', 'wavelet'; multiple domains e.g. ["time_domain", "frequency"], order is not important
    "include_magnitude": True,  # Whether to include magnitude-based features

    # Configuration for low-pass filter
    "cutoff_frequency": 10,  # Cut-off frequency for the low-pass filter
    "order": 4,  # Order of the filter

    # Configuration for scaling
    "scaler_type": "standard",  # Possible scalers: 'standard' or 'minmax'

    # Configuration for PCA
    "apply_pca": False,  # Apply PCA or not
    "pca_variance": 0.95,  # PCA variance threshold

    # Configuration for model training
    "classifier": "xgboost",  # Default classifier ('xgboost', 'svm', 'randomforest')

    # Configuration for hyperparameter tuning
    "n_splits": 5,  # Number of splits for cross-validation
    "n_iter": 30,  # Number of iterations for hyperparameter tuning
    "n_jobs": -1,  # Number of jobs for parallel processing
    "n_points": 1,  # Number of points to sample in the hyperparameter space

    # Users who want a custom param_space can specify it here
    "param_space": {
        "learning_rate": (0.05, 0.2),
        "n_estimators": (200, 800),
        "max_depth": (4, 8),
        "min_child_weight": (1, 5),
        "subsample": (0.6, 0.9),
        "colsample_bytree": (0.6, 0.9),
        "gamma": (0, 5),
        "reg_alpha": (0, 5),
        "reg_lambda": (0, 5)
    },  # Set to None to use the defaults inside the TrainModel class
}
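Since `config` is a plain dictionary, a script can override individual settings before building a pipeline. A minimal sketch (the override values here are illustrative only):

from _config import config

config["time_window"] = 5  # widen the window around each self-report
config["selected_domains"] = ["time_domain", "frequency"]  # restrict feature domains
# Any pipeline built after this point picks up the overridden values via config[...]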
app.py
ADDED (+342 lines)

import gradio as gr
from pipeline_classes import CreateCombinedDataFrame, ScaleXYZData, ExtractFeatures, TrainModel, ClassifyMovementData, LowPassFilter, PCAHandler
from sklearn.pipeline import Pipeline
from _config import config
import pandas as pd
import numpy as np
import joblib
import json


# Define pipelines; step parameters are placeholders that are filled in via set_params at run time
combining_dataframes_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),
    ('create_combined_dataframe', CreateCombinedDataFrame(time_window=None, label_columns=None)),
])

feature_extraction_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=False, use_reports=False, use_combined=True, use_features=False)),
    ('low_pass_filter', LowPassFilter(cutoff_frequency=None, sampling_rate=None, order=None)),
    ('scale_xyz_data', ScaleXYZData(scaler_type=None)),
    ('extract_features', ExtractFeatures(window_length=None,
                                         window_step_size=None,
                                         data_frequency=None,
                                         selected_domains=None,
                                         include_magnitude=None,
                                         features_label_columns=None)),
])

training_model_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=False, use_reports=False, use_combined=False, use_features=True)),
    ('pca_handler', PCAHandler(apply_pca=None, variance=None)),
    ('train_model', TrainModel(classifier=None, train_label=None, target=None)),
])

analyzing_data_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=True, use_reports=False, use_combined=False, use_features=False)),
    ('low_pass_filter', LowPassFilter(cutoff_frequency=None, sampling_rate=None, order=None)),
    ('scale_xyz_data', ScaleXYZData(scaler_type=None)),
    ('extract_features', ExtractFeatures(window_length=None,
                                         window_step_size=None,
                                         data_frequency=None,
                                         selected_domains=None,
                                         include_magnitude=None,
                                         features_label_columns=None)),
    ('classify_movement_data', ClassifyMovementData(model_file=None)),
])

complete_training_model_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),
    ('create_combined_dataframe', CreateCombinedDataFrame(time_window=None, label_columns=None)),
    ('low_pass_filter', LowPassFilter(cutoff_frequency=None, sampling_rate=None, order=None)),
    ('scale_xyz_data', ScaleXYZData(scaler_type=None)),
    ('extract_features', ExtractFeatures(window_length=None,
                                         window_step_size=None,
                                         data_frequency=None,
                                         selected_domains=None,
                                         include_magnitude=None,
                                         features_label_columns=None)),
    ('pca_handler', PCAHandler(apply_pca=None, variance=None)),
    ('train_model', TrainModel(classifier=None, train_label=None, target=None)),
])

def execute_combine_pipeline(accel_file, report_file, time_window=None, label_columns=None):
    try:
        # Load data files only if paths are valid
        accel_data = pd.read_csv(accel_file) if accel_file else None
        report_data = pd.read_csv(report_file) if report_file else None

        # Validate inputs for the selected pipeline
        if accel_data is None or report_data is None:
            return "Error: Both accelerometer and self-report data files are required for this pipeline."
        combining_dataframes_pipeline.set_params(
            create_combined_dataframe__time_window=time_window,
            create_combined_dataframe__label_columns=label_columns.split(','))
        X = report_data, accel_data
        result = combining_dataframes_pipeline.fit_transform(X)
        output_file = "combine_dataframes_output.csv"
        result.to_csv(output_file, index=False)

        return output_file

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return str(e)


def execute_feature_extraction_pipeline(combined_file, cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns):
    try:
        combined_data = pd.read_csv(combined_file) if combined_file else None
        if combined_data is None:
            return "Error: Combined data file is required for this pipeline."

        feature_extraction_pipeline.set_params(
            low_pass_filter__cutoff_frequency=cutoff_frequency,
            low_pass_filter__order=order,
            low_pass_filter__sampling_rate=data_frequency,
            scale_xyz_data__scaler_type=scaler_type,
            extract_features__window_length=window_length,
            extract_features__window_step_size=window_step_size,
            extract_features__data_frequency=data_frequency,
            #extract_features__selected_domains=None,
            extract_features__include_magnitude=include_magnitude,
            extract_features__features_label_columns=features_label_columns.split(','))
        result = feature_extraction_pipeline.fit_transform(combined_data)
        output_file = "extract_features_output.csv"
        result.to_csv(output_file, index=False)
        return output_file

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return str(e)

def execute_training_pipeline(features_file, apply_pca, pca_variance, classifier, train_label, target):
    try:
        print(f"features_file: {features_file}")
        features_data = pd.read_csv(features_file) if features_file else None
        if features_data is None:
            return "Error: Features data file is required for this pipeline.", None

        training_model_pipeline.set_params(
            pca_handler__apply_pca=apply_pca,
            pca_handler__variance=pca_variance,
            train_model__classifier=classifier,
            train_model__train_label=train_label,
            train_model__target=target)

        X = features_data
        training_model_pipeline.fit(X)
        output_file, secondary_output_file = training_model_pipeline.named_steps['train_model'].get_output_files()
        return output_file, secondary_output_file

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return str(e), None

def execute_analyze_pipeline(accel_file, model_file, cutoff_frequency, order, scaler_type, window_length, data_frequency, include_magnitude, features_label_columns):
    try:
        accel_data = pd.read_csv(accel_file) if accel_file else None
        if accel_data is None:
            return "Error: Accelerometer data file is required for this pipeline."

        analyzing_data_pipeline.set_params(
            low_pass_filter__cutoff_frequency=cutoff_frequency,
            low_pass_filter__order=order,
            low_pass_filter__sampling_rate=data_frequency,
            scale_xyz_data__scaler_type=scaler_type,
            extract_features__window_length=window_length,
            extract_features__window_step_size=window_length,  # no step-size input on this tab, so windows do not overlap
            extract_features__data_frequency=data_frequency,
            #extract_features__selected_domains=None,
            extract_features__include_magnitude=include_magnitude,
            extract_features__features_label_columns=features_label_columns.split(','),
            classify_movement_data__model_file=model_file.name
        )

        result = analyzing_data_pipeline.fit_transform(accel_data)
        output_file = "analyze_data_output.csv"
        result.to_csv(output_file, index=False)
        return output_file

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return str(e)

def execute_complete_training_pipeline(accel_file, report_file, time_window, label_columns,
                                       cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
                                       apply_pca, pca_variance, classifier, train_label, target):
    try:
        accel_data = pd.read_csv(accel_file) if accel_file else None
        report_data = pd.read_csv(report_file) if report_file else None
        if accel_data is None or report_data is None:
            return "Error: Both accelerometer and self-report data files are required for this pipeline.", None

        complete_training_model_pipeline.set_params(
            create_combined_dataframe__time_window=time_window,
            create_combined_dataframe__label_columns=label_columns.split(','),
            low_pass_filter__cutoff_frequency=cutoff_frequency,
            low_pass_filter__order=order,
            low_pass_filter__sampling_rate=data_frequency,
            scale_xyz_data__scaler_type=scaler_type,
            extract_features__window_length=window_length,
            extract_features__window_step_size=window_step_size,
            extract_features__data_frequency=data_frequency,
            #extract_features__selected_domains=None,
            extract_features__include_magnitude=include_magnitude,
            extract_features__features_label_columns=label_columns.split(','),
            pca_handler__apply_pca=apply_pca,
            pca_handler__variance=pca_variance,
            train_model__classifier=classifier,
            train_model__train_label=label_columns,
            train_model__target=target
        )
        X = report_data, accel_data
        complete_training_model_pipeline.fit(X)
        output_file, secondary_output_file = complete_training_model_pipeline.named_steps['train_model'].get_output_files()
        return output_file, secondary_output_file

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return str(e), None

# Gradio Blocks interface
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Combine DataFrames"):
            accel_file = gr.File(label="Upload Accelerometer Data")
            report_file = gr.File(label="Upload Self-Report Data")
            time_window = gr.Number(label="Time Window (minutes)", value=2)
            label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
            combine_button = gr.Button("Combine DataFrames")
            combine_output = gr.File(label="Download Combined DataFrame")

            def combine_dataframes(accel_file, report_file, time_window, label_columns):
                return execute_combine_pipeline(accel_file, report_file, time_window, label_columns)

            combine_button.click(combine_dataframes, inputs=[accel_file, report_file, time_window, label_columns], outputs=combine_output)

        with gr.TabItem("Extract Features"):
            combined_file = gr.File(label="Upload Combined Data")

            cutoff_frequency = gr.Number(label="Cutoff Frequency (Hz)", value=10)
            order = gr.Number(label="Order", value=4)

            scaler_type = gr.Radio(label="Scaler Type", choices=["standard", "minmax"])

            window_length = gr.Number(label="Window Length (seconds)", value=60)
            window_step_size = gr.Number(label="Window Step Size (seconds)", value=20)
            data_frequency = gr.Number(label="Data Frequency (Hz)", value=25)

            #selected_domains = gr.Textbox(label="Only these domains (comma-separated); leave out to use all", value=None)
            include_magnitude = gr.Checkbox(label="Include Magnitude", value=True)
            features_label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")

            extract_button = gr.Button("Extract Features")
            extract_output = gr.File(label="Download Extracted Features")

            def extract_features(combined_file, cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns):
                return execute_feature_extraction_pipeline(combined_file,
                                                           cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency,
                                                           include_magnitude, features_label_columns)

            extract_button.click(extract_features, inputs=[combined_file, cutoff_frequency, order, scaler_type, window_length, window_step_size,
                                                           data_frequency, include_magnitude, features_label_columns], outputs=extract_output)

        with gr.TabItem("Train Model"):
            features_file = gr.File(label="Upload Features Data")

            apply_pca = gr.Checkbox(label="Apply PCA", value=False)
            pca_variance = gr.Number(label="PCA Variance", value=0.95)
            classifier = gr.Dropdown(label="Classifier", choices=["xgboost", "svm", "randomforest"], value="xgboost")
            train_label = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
            target = gr.Textbox(label="Target Label", value="arousal")

            train_button = gr.Button("Train Model")
            train_output_json = gr.File(label="Download Model JSON")
            train_output_pkl = gr.File(label="Download Model PKL")

            def train_model(features_file, apply_pca, pca_variance, classifier, train_label, target):
                return execute_training_pipeline(features_file, apply_pca, pca_variance, classifier, train_label, target)

            train_button.click(train_model, inputs=[features_file, apply_pca, pca_variance, classifier, train_label, target], outputs=[train_output_json, train_output_pkl])

        with gr.TabItem("Analyze Data"):
            accel_file = gr.File(label="Upload Accelerometer Data")
            model_file = gr.File(label="Upload Model")

            cutoff_frequency = gr.Number(label="Cutoff Frequency (Hz)", value=10)
            order = gr.Number(label="Order", value=4)

            scaler_type = gr.Radio(label="Scaler Type", choices=["standard", "minmax"])

            window_length = gr.Number(label="Window Length (seconds)", value=60)
            data_frequency = gr.Number(label="Data Frequency (Hz)", value=25)

            #selected_domains = gr.Textbox(label="Only these domains (comma-separated); leave out to use all", value=None)
            include_magnitude = gr.Checkbox(label="Include Magnitude", value=True)
            features_label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")

            analyze_button = gr.Button("Analyze Data")
            analyze_output = gr.File(label="Download Analyzed Data")

            def analyze_data(accel_file, model_file, cutoff_frequency, order, scaler_type, window_length, data_frequency, include_magnitude, features_label_columns):
                return execute_analyze_pipeline(accel_file, model_file, cutoff_frequency, order, scaler_type, window_length,
                                                data_frequency, include_magnitude, features_label_columns)

            analyze_button.click(analyze_data, inputs=[accel_file, model_file, cutoff_frequency, order, scaler_type, window_length,
                                                       data_frequency, include_magnitude, features_label_columns], outputs=analyze_output)

        with gr.TabItem("Complete Train Model"):
            accel_file = gr.File(label="Upload Accelerometer Data")
            report_file = gr.File(label="Upload Self-Report Data")

            time_window = gr.Number(label="Time Window (minutes)", value=2)
            label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")

            cutoff_frequency = gr.Number(label="Cutoff Frequency (Hz)", value=10)
            order = gr.Number(label="Order", value=4)

            scaler_type = gr.Radio(label="Scaler Type", choices=["standard", "minmax"])

            window_length = gr.Number(label="Window Length (seconds)", value=60)
            window_step_size = gr.Number(label="Window Step Size (seconds)", value=20)
            data_frequency = gr.Number(label="Data Frequency (Hz)", value=25)

            include_magnitude = gr.Checkbox(label="Include Magnitude", value=True)
            #features_label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")

            apply_pca = gr.Checkbox(label="Apply PCA", value=False)
            pca_variance = gr.Number(label="PCA Variance", value=0.95)
            classifier = gr.Dropdown(label="Classifier", choices=["xgboost", "svm", "randomforest"], value="xgboost")
            #train_label = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
            target = gr.Textbox(label="Target Label", value="arousal")

            complete_train_button = gr.Button("Complete Train Model")

            complete_train_output_pkl = gr.File(label="Download Model PKL")
            complete_train_output_json = gr.File(label="Download Model JSON")

            # Note: features_label_columns and train_label below refer to the textboxes defined
            # on earlier tabs, since this tab's own copies are commented out
            def complete_train_model(accel_file, report_file, time_window, label_columns,
                                     cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
                                     apply_pca, pca_variance, classifier, train_label, target):
                return execute_complete_training_pipeline(accel_file, report_file, time_window, label_columns,
                                                          cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
                                                          apply_pca, pca_variance, classifier, train_label, target)

            complete_train_button.click(complete_train_model, inputs=[accel_file, report_file, time_window, label_columns,
                                                                      cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
                                                                      apply_pca, pca_variance, classifier, train_label, target], outputs=[complete_train_output_pkl, complete_train_output_json])


demo.launch()
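By default `demo.launch()` serves on localhost only. If the app should be reachable from other machines, Gradio's standard launch options can be used; a sketch (host and port values are assumptions to adjust per deployment):

# Hypothetical alternative launch: bind to all interfaces on a fixed port
demo.launch(server_name="0.0.0.0", server_port=7860)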
pipeline_classes/__init__.py
ADDED (+10 lines)

# Description: This file is used to import all the classes in the pipeline_classes folder.

# from .import_data import ImportData  # import_data.py is not part of this upload (only its compiled .pyc), so scripts importing ImportData will fail
from .create_combineddataframe import CreateCombinedDataFrame
from .scale_xyzdata import ScaleXYZData
from .extract_features import ExtractFeatures
from .pcahandler import PCAHandler
from .train_model import TrainModel
from .classify_movementdata import ClassifyMovementData
from .lowpassfilter import LowPassFilter
pipeline_classes/__pycache__/__init__.cpython-310.pyc
ADDED (binary file, 563 Bytes)

pipeline_classes/__pycache__/__init__.cpython-313.pyc
ADDED (binary file, 622 Bytes)

pipeline_classes/__pycache__/classify_movementdata.cpython-310.pyc
ADDED (binary file, 1.48 kB)

pipeline_classes/__pycache__/classify_movementdata.cpython-313.pyc
ADDED (binary file, 2.05 kB)

pipeline_classes/__pycache__/create_combineddataframe.cpython-310.pyc
ADDED (binary file, 3.21 kB)

pipeline_classes/__pycache__/create_combineddataframe.cpython-313.pyc
ADDED (binary file, 4.57 kB)

pipeline_classes/__pycache__/extract_features.cpython-310.pyc
ADDED (binary file, 8.93 kB)

pipeline_classes/__pycache__/extract_features.cpython-313.pyc
ADDED (binary file, 18.3 kB)

pipeline_classes/__pycache__/import_data.cpython-313.pyc
ADDED (binary file, 2.77 kB)

pipeline_classes/__pycache__/lowpassfilter.cpython-310.pyc
ADDED (binary file, 2.42 kB)

pipeline_classes/__pycache__/lowpassfilter.cpython-313.pyc
ADDED (binary file, 2.75 kB)

pipeline_classes/__pycache__/pcahandler.cpython-310.pyc
ADDED (binary file, 1.18 kB)

pipeline_classes/__pycache__/pcahandler.cpython-313.pyc
ADDED (binary file, 1.64 kB)

pipeline_classes/__pycache__/scale_xyzdata.cpython-310.pyc
ADDED (binary file, 1.4 kB)

pipeline_classes/__pycache__/scale_xyzdata.cpython-313.pyc
ADDED (binary file, 1.82 kB)

pipeline_classes/__pycache__/train_model.cpython-310.pyc
ADDED (binary file, 5.32 kB)

pipeline_classes/__pycache__/train_model.cpython-313.pyc
ADDED (binary file, 8.5 kB)
pipeline_classes/classify_movementdata.py
ADDED (+40 lines)

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
from _config import config

# This class is used to classify the movement data using a pre-trained model
class ClassifyMovementData(BaseEstimator, TransformerMixin):
    def __init__(self, model_file=None):
        #self.model_path = model_path if model_path else config.get("model_path")
        self.model_file = model_file
        self.model = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.model is None:
            if self.model_file is None:
                raise ValueError("Model file is not provided.")
            try:
                self.model = joblib.load(self.model_file)  # Load the model
            except Exception as e:
                raise ValueError(f"Failed to load the model file: {e}")

        # Assuming `X` is a DataFrame of pre-extracted features
        predictions = self.model.predict(X)

        # Add predictions to the DataFrame as the first column
        X.insert(0, 'predicted_emotion', predictions)

        print("Data classified successfully.")

        # Export the labeled DataFrame to CSV
        #window_length_str = str(config["window_length"])
        output_file = "classified_movement_data.csv"
        X.to_csv(output_file, index=False)
        print(f"Classified movement data exported successfully to {output_file}.")

        return X
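A minimal usage sketch for this transformer outside the Gradio app; the file names here are assumptions, not outputs guaranteed by this upload:

import pandas as pd
from pipeline_classes import ClassifyMovementData

features = pd.read_csv("extract_features_output.csv")       # assumed: a feature CSV produced earlier
clf = ClassifyMovementData(model_file="trained_model.pkl")  # assumed model file name
labeled = clf.fit(features).transform(features)             # also writes classified_movement_data.csv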
pipeline_classes/create_combineddataframe.py
ADDED (+103 lines)

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from _config import config

class CreateCombinedDataFrame(BaseEstimator, TransformerMixin):
    def __init__(self, time_window, label_columns=None):
        self.time_window = time_window
        self.label_columns = label_columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df_reports, df_accel = X

        print(f"CreateCombinedDataFrame initialized with label_columns: {self.label_columns}")
        # Keep only reports with a valid engagement time and non-missing values in the chosen label columns
        valid_conditions = (df_reports['timeOfEngagement'] != 0)
        for label in self.label_columns:
            valid_conditions &= (df_reports[label] != "NONE")

        df_reports = df_reports[valid_conditions].copy()

        # No datetime conversion needed; timestamps remain as integers
        df_accel.rename(columns={'timestamp': 'timeOfNotification'}, inplace=True)

        print(f"Extracting accelerometer data with time_window: {self.time_window}")
        df_reports['accel_data'] = df_reports.apply(lambda row: self._extract_accel_data(row, df_accel), axis=1)

        print(f"Combining with label_columns: {self.label_columns}")
        combined_data = []

        for _, row in df_reports.iterrows():
            accel_data = row['accel_data']
            for _, accel_row in accel_data.iterrows():
                combined_row = {
                    'participantId': row['participantId'],          # Participant ID
                    'selfreport_time': row['timeOfNotification'],   # Self-report time
                    'accel_time': accel_row['timeOfNotification'],  # Accelerometer data time
                    'x': accel_row['x'],  # x-axis accelerometer data
                    'y': accel_row['y'],  # y-axis accelerometer data
                    'z': accel_row['z']   # z-axis accelerometer data
                }

                # Dynamically add emotion labels to the combined row
                for label in self.label_columns:
                    combined_row[label] = row[label]

                combined_data.append(combined_row)

        combined_df = pd.DataFrame(combined_data)

        # Convert integer timestamps back to datetime format for the CSV
        combined_df['selfreport_time'] = pd.to_datetime(combined_df['selfreport_time'], unit='ms')
        combined_df['accel_time'] = pd.to_datetime(combined_df['accel_time'], unit='ms')

        # Create groupid column (unique identifier based on participantId and selfreport_time)
        combined_df['groupid'] = combined_df.groupby(['participantId', 'selfreport_time']).ngroup() + 1
        col = combined_df.pop("groupid")  # Move groupid to the first column
        combined_df.insert(0, col.name, col)

        # Export the combined dataframe to CSV
        time_window_str = str(self.time_window)
        label_columns_str = "_".join(self.label_columns)
        file_name = f"combined_data_timewindow_{time_window_str}min_labels_{label_columns_str}.csv"
        combined_df.to_csv(file_name, index=False)
        print(f"Combined dataframe exported successfully to {file_name}.")

        return combined_df

    def _extract_accel_data(self, row, accel_data):
        time_delta = self.time_window * 60 * 1000  # Convert minutes to milliseconds
        start_time = row['timeOfNotification'] - time_delta  # Keep as integer
        end_time = row['timeOfNotification'] + time_delta  # Keep as integer
        participant_id = row['participantId']

        # Ensure accel_data['timeOfNotification'] is in integer format
        accel_data['timeOfNotification'] = accel_data['timeOfNotification'].astype(np.int64)

        # Log a warning if the desired time range exceeds the available data range
        if start_time < accel_data['timeOfNotification'].min() or end_time > accel_data['timeOfNotification'].max():
            print(
                f"Warning: Data does not cover the full {self.time_window}-minute window for participant {participant_id}. "
                f"Available range: {accel_data['timeOfNotification'].min()} to {accel_data['timeOfNotification'].max()}. "
                f"Requested range: {start_time} to {end_time}."
            )

        # Apply the filtering mask
        mask = (
            (accel_data['participantId'] == participant_id) &
            (accel_data['timeOfNotification'] >= max(start_time, accel_data['timeOfNotification'].min())) &
            (accel_data['timeOfNotification'] <= min(end_time, accel_data['timeOfNotification'].max()))
        )

        print("Start Time (ms):", start_time)
        print("End Time (ms):", end_time)
        print("Filtered Rows:\n", accel_data[mask])

        return accel_data[mask]
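A small sketch of the input schema this transformer expects, built only from the column names used in transform() above (toy values; timestamps are millisecond epoch integers):

import pandas as pd
from pipeline_classes import CreateCombinedDataFrame

df_reports = pd.DataFrame({
    "participantId": [1],
    "timeOfEngagement": [1_700_000_000_000],    # must be non-zero to pass the validity filter
    "timeOfNotification": [1_700_000_000_000],  # anchor time for the +/- time_window lookup
    "valence": ["HIGH"], "arousal": ["LOW"],    # label values must not be "NONE"
})
df_accel = pd.DataFrame({
    "participantId": [1, 1],
    "timestamp": [1_700_000_000_000, 1_700_000_060_000],  # renamed to timeOfNotification internally
    "x": [0.1, 0.2], "y": [0.0, 0.1], "z": [9.8, 9.7],
})
combiner = CreateCombinedDataFrame(time_window=3, label_columns=["valence", "arousal"])
combined = combiner.fit_transform((df_reports, df_accel))  # also writes the combined CSV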
pipeline_classes/extract_features.py
ADDED (+272 lines; the diff view below is truncated at line 220)

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.fftpack import fft
from scipy.signal import welch
import pywt
from _config import config

class ExtractFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, window_length, window_step_size, data_frequency, selected_domains=None, include_magnitude=False, features_label_columns=None):
        self.window_length = window_length
        self.window_step_size = window_step_size
        self.data_frequency = data_frequency
        self.selected_domains = selected_domains
        self.include_magnitude = include_magnitude
        self.features_label_columns = features_label_columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        features_list = []

        if 'groupid' in X.columns:  # Check for groupid column
            for groupid in X['groupid'].unique():  # Iterate over unique group IDs
                temp = X[X['groupid'] == groupid]  # Filter rows by group ID
                temp_ex = temp[['accel_time', 'x', 'y', 'z']].copy()  # Keep only the necessary columns (accel_time can be removed if unused)
                windows = self._window_data(temp_ex[['x', 'y', 'z']])  # Create windows of data

                for window in windows:
                    features = self._extract_features_from_window(window)  # Extract features from each window
                    features['groupid'] = groupid  # Add groupid to the features

                    # Dynamically add emotion labels to the features
                    for label in self.features_label_columns:
                        features[label] = temp[label].iloc[0]

                    features_list.append(pd.DataFrame([features]))  # Convert dictionary to DataFrame

        else:  # If there is no groupid, calculate features without it
            windows = self._window_data(X[['x', 'y', 'z']])
            for window in windows:
                features = self._extract_features_from_window(window)
                features_list.append(pd.DataFrame([features]))

        all_features = pd.concat(features_list, ignore_index=True)

        # Export features to CSV
        window_length_str = str(self.window_length)
        window_step_size_str = str(self.window_step_size)
        if self.selected_domains is None:  # All features are calculated if no domains are selected
            domain_str = "all_features"
        else:
            domain_str = "_".join(self.selected_domains)
        file_name = f"features_window_{window_length_str}_step_{window_step_size_str}_{domain_str}.csv"
        all_features.to_csv(file_name, index=False)

        print("All features extracted successfully.")
        return all_features

    # Time Domain Features
    def _calculate_magnitude(self, window):
        return np.sqrt(window[:, 0]**2 + window[:, 1]**2 + window[:, 2]**2)

    def _window_data(self, data):  # Create windows of the data
        window_samples = int(self.window_length * self.data_frequency)  # Samples per window, e.g. 60 s * 25 Hz = 1500 samples
        step_samples = int(self.window_step_size * self.data_frequency)  # Samples to move the window
        windows = [data[i:i + window_samples] for i in range(0, len(data) - window_samples + 1, step_samples)]
        return np.array(windows)

    def _extract_features_from_window(self, window):  # DONE: compute multiple domains at once
        all_features = {}

        if self.selected_domains is None or 'time_domain' in self.selected_domains:
            all_features.update(self._extract_time_domain_features(window))

        if self.selected_domains is None or 'spatial' in self.selected_domains:
            all_features.update(self._extract_spatial_features(window))

        if self.selected_domains is None or 'frequency' in self.selected_domains:
            all_features.update(self._extract_frequency_domain_features(window))

        if self.selected_domains is None or 'statistical' in self.selected_domains:
            all_features.update(self._extract_statistical_features(window))

        if self.selected_domains is None or 'wavelet' in self.selected_domains:
            all_features.update(self._extract_wavelet_features(window))

        return all_features

    def _extract_time_domain_features(self, window):
        features = {
            'mean_x': np.mean(window[:, 0]),
            'mean_y': np.mean(window[:, 1]),
            'mean_z': np.mean(window[:, 2]),
            'std_x': np.std(window[:, 0]),
            'std_y': np.std(window[:, 1]),
            'std_z': np.std(window[:, 2]),
            'variance_x': np.var(window[:, 0]),
            'variance_y': np.var(window[:, 1]),
            'variance_z': np.var(window[:, 2]),
            'rms_x': np.sqrt(np.mean(window[:, 0]**2)),
            'rms_y': np.sqrt(np.mean(window[:, 1]**2)),
            'rms_z': np.sqrt(np.mean(window[:, 2]**2)),
            'max_x': np.max(window[:, 0]),
            'max_y': np.max(window[:, 1]),
            'max_z': np.max(window[:, 2]),
            'min_x': np.min(window[:, 0]),
            'min_y': np.min(window[:, 1]),
            'min_z': np.min(window[:, 2]),
            'peak_to_peak_x': np.ptp(window[:, 0]),
            'peak_to_peak_y': np.ptp(window[:, 1]),
            'peak_to_peak_z': np.ptp(window[:, 2]),
            'skewness_x': pd.Series(window[:, 0]).skew(),
            'skewness_y': pd.Series(window[:, 1]).skew(),
            'skewness_z': pd.Series(window[:, 2]).skew(),
            'kurtosis_x': pd.Series(window[:, 0]).kurt(),
            'kurtosis_y': pd.Series(window[:, 1]).kurt(),
            'kurtosis_z': pd.Series(window[:, 2]).kurt(),
            'zero_crossing_rate_x': np.sum(np.diff(np.sign(window[:, 0])) != 0),
            'zero_crossing_rate_y': np.sum(np.diff(np.sign(window[:, 1])) != 0),
            'zero_crossing_rate_z': np.sum(np.diff(np.sign(window[:, 2])) != 0),
            'sma': np.sum(np.abs(window[:, 0])) + np.sum(np.abs(window[:, 1])) + np.sum(np.abs(window[:, 2])),  # Signal Magnitude Area
        }
        # print("Time domain features extracted successfully.")

        # Additional features for magnitude (xyz in one vector)
        if self.include_magnitude:
            magnitude = self._calculate_magnitude(window)
            features['mean_magnitude'] = np.mean(magnitude)
            features['std_magnitude'] = np.std(magnitude)
            features['variance_magnitude'] = np.var(magnitude)
            features['rms_magnitude'] = np.sqrt(np.mean(magnitude**2))
            features['max_magnitude'] = np.max(magnitude)
            features['min_magnitude'] = np.min(magnitude)
            features['peak_to_peak_magnitude'] = np.ptp(magnitude)
            features['skewness_magnitude'] = pd.Series(magnitude).skew()
            features['kurtosis_magnitude'] = pd.Series(magnitude).kurt()
            features['zero_crossing_rate_magnitude'] = np.sum(np.diff(np.sign(magnitude)) != 0)
            # print("Additional time domain features for magnitude extracted successfully.")

        return features

    # Spatial Features
    def _extract_spatial_features(self, window):
        features = {}

        # Euclidean Norm (Magnitude)
        magnitude = self._calculate_magnitude(window)
        features['euclidean_norm'] = np.mean(magnitude)  # or np.linalg.norm for each window

        # Tilt Angles (Pitch and Roll)
        pitch = np.arctan2(window[:, 1], np.sqrt(window[:, 0]**2 + window[:, 2]**2)) * (180 / np.pi)
        roll = np.arctan2(window[:, 0], np.sqrt(window[:, 1]**2 + window[:, 2]**2)) * (180 / np.pi)
        features['mean_pitch'] = np.mean(pitch)
        features['mean_roll'] = np.mean(roll)

        # Correlation between Axes
        features['correlation_xy'] = np.corrcoef(window[:, 0], window[:, 1])[0, 1]
        features['correlation_xz'] = np.corrcoef(window[:, 0], window[:, 2])[0, 1]
        features['correlation_yz'] = np.corrcoef(window[:, 1], window[:, 2])[0, 1]

        # print("Spatial features extracted successfully.")
        return features

    # Frequency Domain Features
    def _extract_frequency_domain_features(self, window):
        n = len(window)
        freq_values = np.fft.fftfreq(n, d=1/self.data_frequency)[:n // 2]
        fft_values = fft(window, axis=0)
        fft_magnitude = np.abs(fft_values)[:n // 2]

        features = {}

        # Spectral Entropy
        def spectral_entropy(signal):
            psd = np.square(signal)
            psd_norm = psd / np.sum(psd)
            return -np.sum(psd_norm * np.log(psd_norm + 1e-10))

        for i, axis in enumerate(['x', 'y', 'z']):
            # Dominant Frequency
            dominant_frequency = freq_values[np.argmax(fft_magnitude[:, i])]
            features[f'dominant_frequency_{axis}'] = dominant_frequency

            # Spectral Entropy
            entropy = spectral_entropy(fft_magnitude[:, i])
            features[f'spectral_entropy_{axis}'] = entropy

            # Power Spectral Density (PSD) and Energy
            f, psd_values = welch(window[:, i], fs=self.data_frequency, nperseg=n)
            features[f'psd_mean_{axis}'] = np.mean(psd_values)
            features[f'energy_{axis}'] = np.sum(psd_values**2)

            # Bandwidth (frequency range containing a significant portion of the energy)
            cumulative_energy = np.cumsum(psd_values)
            total_energy = cumulative_energy[-1]
            low_cutoff_idx = np.argmax(cumulative_energy > 0.1 * total_energy)
            high_cutoff_idx = np.argmax(cumulative_energy > 0.9 * total_energy)
            bandwidth = f[high_cutoff_idx] - f[low_cutoff_idx]
            features[f'bandwidth_{axis}'] = bandwidth

            # Spectral Centroid (center of mass of the spectrum)
            spectral_centroid = np.sum(f * psd_values) / np.sum(psd_values)
            features[f'spectral_centroid_{axis}'] = spectral_centroid

        if self.include_magnitude:
            # Magnitude-based Frequency Domain Features
            magnitude = self._calculate_magnitude(window)
            fft_magnitude_mag = np.abs(fft(magnitude))[:n // 2]

            # Dominant Frequency for Magnitude
            features['dominant_frequency_magnitude'] = freq_values[np.argmax(fft_magnitude_mag)]

            # Spectral Entropy for Magnitude
            features['spectral_entropy_magnitude'] = spectral_entropy(fft_magnitude_mag)

            # Power Spectral Density and Energy for Magnitude
|
| 221 |
+
f, psd_values_mag = welch(magnitude, fs=self.data_frequency, nperseg=n)
|
| 222 |
+
features['psd_mean_magnitude'] = np.mean(psd_values_mag)
|
| 223 |
+
features['energy_magnitude'] = np.sum(psd_values_mag**2)
|
| 224 |
+
|
| 225 |
+
# Bandwidth for Magnitude
|
| 226 |
+
cumulative_energy_mag = np.cumsum(psd_values_mag)
|
| 227 |
+
total_energy_mag = cumulative_energy_mag[-1]
|
| 228 |
+
low_cutoff_idx_mag = np.argmax(cumulative_energy_mag > 0.1 * total_energy_mag)
|
| 229 |
+
high_cutoff_idx_mag = np.argmax(cumulative_energy_mag > 0.9 * total_energy_mag)
|
| 230 |
+
bandwidth_mag = f[high_cutoff_idx_mag] - f[low_cutoff_idx_mag]
|
| 231 |
+
features['bandwidth_magnitude'] = bandwidth_mag
|
| 232 |
+
|
| 233 |
+
# Spectral Centroid for Magnitude
|
| 234 |
+
features['spectral_centroid_magnitude'] = np.sum(f * psd_values_mag) / np.sum(psd_values_mag)
|
| 235 |
+
|
| 236 |
+
# print(f"Frequency domain features extracted successfully.")
|
| 237 |
+
return features
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def _extract_statistical_features(self, window):
|
| 241 |
+
features = {
|
| 242 |
+
'25th_percentile_x': np.percentile(window[:, 0], 25),
|
| 243 |
+
'25th_percentile_y': np.percentile(window[:, 1], 25),
|
| 244 |
+
'25th_percentile_z': np.percentile(window[:, 2], 25),
|
| 245 |
+
'75th_percentile_x': np.percentile(window[:, 0], 75),
|
| 246 |
+
'75th_percentile_y': np.percentile(window[:, 1], 75),
|
| 247 |
+
'75th_percentile_z': np.percentile(window[:, 2], 75),
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
if self.include_magnitude:
|
| 251 |
+
magnitude = self._calculate_magnitude(window)
|
| 252 |
+
features['25th_percentile_magnitude'] = np.percentile(magnitude, 25)
|
| 253 |
+
features['75th_percentile_magnitude'] = np.percentile(magnitude, 75)
|
| 254 |
+
|
| 255 |
+
# print(f"Statistical features extracted successfully.")
|
| 256 |
+
return features
|
| 257 |
+
|
| 258 |
+
def _extract_wavelet_features(self, window, wavelet='db1'):
|
| 259 |
+
coeffs = pywt.wavedec(window, wavelet, axis=0, level=3)
|
| 260 |
+
features = {
|
| 261 |
+
'wavelet_energy_approx_x': np.sum(coeffs[0][:, 0]**2),
|
| 262 |
+
'wavelet_energy_approx_y': np.sum(coeffs[0][:, 1]**2),
|
| 263 |
+
'wavelet_energy_approx_z': np.sum(coeffs[0][:, 2]**2),
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
if self.include_magnitude:
|
| 267 |
+
magnitude = self._calculate_magnitude(window)
|
| 268 |
+
coeffs_magnitude = pywt.wavedec(magnitude, wavelet, level=3)
|
| 269 |
+
features['wavelet_energy_approx_magnitude'] = np.sum(coeffs_magnitude[0]**2)
|
| 270 |
+
|
| 271 |
+
# print(f"Wavelet features extracted successfully.")
|
| 272 |
+
return features
|
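As a sanity check on the bandwidth and spectral-centroid logic above, here is a standalone sketch on a single synthetic axis; the 250-sample window and 25 Hz sampling rate are illustrative assumptions, mirroring the 10%/90% cumulative-energy cutoffs in the code:

import numpy as np
from scipy.signal import welch

rng = np.random.default_rng(0)
window = rng.normal(size=250)            # one axis of a synthetic window
f, psd = welch(window, fs=25, nperseg=len(window))

cum = np.cumsum(psd)
low = np.argmax(cum > 0.1 * cum[-1])     # first bin past 10% of total energy
high = np.argmax(cum > 0.9 * cum[-1])    # first bin past 90% of total energy
print("bandwidth:", f[high] - f[low])
print("centroid:", np.sum(f * psd) / np.sum(psd))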
pipeline_classes/lowpassfilter.py
ADDED
@@ -0,0 +1,55 @@
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from scipy.signal import butter, filtfilt
+
+class LowPassFilter(BaseEstimator, TransformerMixin):
+    def __init__(self, cutoff_frequency=5, sampling_rate=25, order=4):
+        """
+        Initialize the LowPassFilter class.
+
+        Parameters:
+        - cutoff_frequency: The cutoff frequency for the low-pass filter (default: 5 Hz).
+        - sampling_rate: The sampling rate of the accelerometer data (default: 25 Hz).
+        - order: The order of the filter (default: 4).
+        """
+        self.cutoff_frequency = cutoff_frequency
+        self.sampling_rate = sampling_rate
+        self.order = order
+
+    def _butter_lowpass_filter(self, data):
+        """
+        Apply a Butterworth low-pass filter to the data.
+
+        Parameters:
+        - data: A NumPy array containing the accelerometer data to be filtered.
+
+        Returns:
+        - A filtered NumPy array.
+        """
+        nyquist = 0.5 * self.sampling_rate
+        normalized_cutoff = self.cutoff_frequency / nyquist
+        b, a = butter(self.order, normalized_cutoff, btype='low', analog=False)
+        filtered_data = filtfilt(b, a, data, axis=0)
+        return filtered_data
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        """
+        Apply the low-pass filter to the accelerometer data.
+
+        Parameters:
+        - X: A DataFrame with 'x', 'y', and 'z' columns representing the accelerometer data.
+
+        Returns:
+        - The DataFrame with filtered 'x', 'y', and 'z' columns.
+        """
+        if 'x' in X.columns and 'y' in X.columns and 'z' in X.columns:
+            X[['x', 'y', 'z']] = self._butter_lowpass_filter(X[['x', 'y', 'z']].values)
+            print("Low-pass filter applied successfully.")
+        else:
+            raise ValueError("The input DataFrame must contain 'x', 'y', and 'z' columns.")
+
+        return X
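For context, a minimal usage sketch of this transformer on synthetic data; the `pipeline_classes` import and the 25 Hz / 5 Hz settings are assumptions for illustration:

import numpy as np
import pandas as pd
from pipeline_classes import LowPassFilter  # assumed package export

# 100 samples of noisy synthetic accelerometer data at an assumed 25 Hz
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=['x', 'y', 'z'])

lpf = LowPassFilter(cutoff_frequency=5, sampling_rate=25, order=4)
filtered = lpf.fit_transform(df)  # returns the same DataFrame with smoothed axes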
pipeline_classes/pcahandler.py
ADDED
@@ -0,0 +1,24 @@
+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.decomposition import PCA
+from _config import config
+
+class PCAHandler(BaseEstimator, TransformerMixin):
+    def __init__(self, apply_pca=False, variance=0.95):
+        self.apply_pca = apply_pca
+        self.variance = variance
+        self.pca = None
+
+    def fit(self, X, y=None):
+        if self.apply_pca:
+            self.pca = PCA(n_components=self.variance)
+            self.pca.fit(X)
+        return self
+
+    def transform(self, X):
+        if self.apply_pca and self.pca:
+            X_transformed = self.pca.transform(X)
+            return pd.DataFrame(X_transformed, index=X.index)
+
+        return X
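A short sketch of how this handler behaves (synthetic frame; the import path is an assumption): passing a float `variance` makes scikit-learn's PCA keep just enough components to explain that fraction of the variance.

import numpy as np
import pandas as pd
from pipeline_classes import PCAHandler  # assumed package export

rng = np.random.default_rng(0)
features = pd.DataFrame(rng.normal(size=(50, 10)))

pca = PCAHandler(apply_pca=True, variance=0.95)  # keep components explaining 95% of variance
reduced = pca.fit(features).transform(features)
print(reduced.shape)  # same rows and index, usually fewer columns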
pipeline_classes/scale_xyzdata.py
ADDED
@@ -0,0 +1,28 @@
+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from _config import config
+
+class ScaleXYZData(BaseEstimator, TransformerMixin):
+    def __init__(self, scaler_type='standard'):
+        self.scaler_type = scaler_type
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        columns_to_scale = ['x', 'y', 'z']
+        if self.scaler_type == 'standard':  # Scale the columns using StandardScaler or MinMaxScaler
+            scaler = StandardScaler()
+        elif self.scaler_type == 'minmax':
+            scaler = MinMaxScaler()
+        elif self.scaler_type == 'none':
+            return X  # Return the DataFrame without scaling
+        else:
+            raise ValueError("Invalid scaler_type. Expected 'standard', 'minmax', or 'none'.")  # Raise an error if scaler_type is invalid
+        scaled_columns = scaler.fit_transform(X[columns_to_scale])
+        scaled_df = pd.DataFrame(scaled_columns, columns=columns_to_scale, index=X.index)
+        X[columns_to_scale] = scaled_df
+        print("Data scaled successfully.")
+        return X
pipeline_classes/train_model.py
ADDED
@@ -0,0 +1,195 @@
+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.model_selection import StratifiedGroupKFold
+from skopt import BayesSearchCV
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import SVC
+from xgboost import XGBClassifier
+import joblib
+from skopt.space import Real, Integer, Categorical
+from sklearn.metrics import classification_report, accuracy_score
+import json
+from sklearn.preprocessing import LabelEncoder
+#from _config import config
+
+class TrainModel(BaseEstimator, TransformerMixin):
+    def __init__(self, classifier, train_label, target):
+        #self.config = config
+        #self.target = config.get("target_label", None)  # User-defined target label in config
+        self.classifier = classifier
+        self.train_label = train_label
+        self.target = target
+        self.label_encoder = LabelEncoder()
+        #self.selected_domains = self.config.get("selected_domains", "All domains")  # Default to all domains if None
+
+        #if not self.target:
+        #    raise ValueError("No target label specified in the config. Please set 'target_label'.")
+
+    def get_default_param_space(self, classifier):
+        """Returns the default hyperparameter space for a given classifier."""
+        if classifier == 'xgboost':
+            return {
+                'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
+                'n_estimators': Integer(100, 1000),
+                'max_depth': Integer(3, 10),
+                'min_child_weight': (1, 10),
+                'subsample': (0.5, 1.0),
+                'colsample_bytree': (0.5, 1.0),
+                'gamma': (0, 10),
+                'reg_alpha': (0, 10),
+                'reg_lambda': (0, 10),
+            }
+        elif classifier == 'svm':
+            return {
+                'C': Real(0.1, 10, prior='log-uniform'),
+                'kernel': Categorical(['linear', 'rbf'])
+            }
+        elif classifier == 'randomforest':
+            return {
+                'n_estimators': Integer(100, 1000),
+                'max_depth': Integer(3, 10)
+            }
+        else:
+            raise ValueError(f"Unsupported classifier type: {classifier}")
+
+    def fit(self, X, y=None):
+        # Ensure the target column exists in the dataset
+        if self.target not in X.columns:
+            raise ValueError(f"Target label '{self.target}' not found in the dataset.")
+
+        # Fit the label encoder on the target column
+        print(f"Encoding the target labels for '{self.target}'...")
+        self.label_encoder.fit(X[self.target])
+
+        # Print the mapping between original labels and encoded labels
+        original_labels = list(self.label_encoder.classes_)
+        encoded_labels = list(range(len(original_labels)))
+        label_mapping = dict(zip(encoded_labels, original_labels))
+        print(f"Label encoding complete. Mapping: {label_mapping}")
+
+        # Transform the target column and add it as 'encoded_target'
+        X['encoded_target'] = self.label_encoder.transform(X[self.target])
+
+        # Value counts for the encoded target
+        value_counts = X['encoded_target'].value_counts().to_dict()
+        print(f"Value counts for encoded target: {value_counts}")
+
+        # Pop unnecessary columns (groupid, emotion labels not being used, etc.)
+        groups = X.pop('groupid')
+        print("Group IDs popped from the dataset.")
+
+        # Pop the label columns which aren't used
+        for label in self.train_label.split(","):
+            X.pop(label)
+        print("Label columns popped from the dataset.")
+
+        # Pop the encoded target as y
+        y = X.pop('encoded_target')
+        print("Encoded target column popped from the dataset.")
+
+        # Store the feature names for later use
+        feature_names = X.columns.tolist()
+
+        # Choose classifier
+        classifier = self.classifier
+        if classifier == 'xgboost':
+            model = XGBClassifier(objective='multi:softmax', random_state=42)
+        elif classifier == 'svm':
+            model = SVC(probability=True)
+        elif classifier == 'randomforest':
+            model = RandomForestClassifier(random_state=42)
+        else:
+            raise ValueError(f"Unsupported classifier type: {classifier}")
+
+        print(f"Training the model using {classifier}...")
+
+        # Use user-defined param_space if provided, otherwise use default
+        default_param_space = self.get_default_param_space(classifier)
+        param_space = default_param_space
+
+        # Hyperparameter tuning using Bayesian optimization
+        sgkf = StratifiedGroupKFold(n_splits=5)
+        print(f"Parameter space being used: {param_space}")
+        if param_space is None:
+            raise ValueError("Parameter space cannot be None. Please check the classifier configuration.")
+
+        opt = BayesSearchCV(
+            estimator=model,
+            search_spaces=param_space,
+            cv=sgkf,
+            n_iter=5,
+            n_jobs=-1,
+            n_points=1,
+            verbose=1,
+            scoring='accuracy'
+        )
+
+        print("Hyperparameter tuning in progress...")
+
+        # Fit the model using the encoded target
+        opt.fit(X, y, groups=groups)
+        self.best_model = opt.best_estimator_
+        print(f"Best parameters found: {opt.best_params_}")
+
+        # Print classification metrics
+        y_pred = self.best_model.predict(X)
+        accuracy = accuracy_score(y, y_pred)
+        report = classification_report(y, y_pred, target_names=self.label_encoder.classes_, output_dict=True)
+
+        # Save classification report
+        classification_report_json = report
+        with open(f'classification_report_{self.target}.json', 'w') as f:
+            json.dump(classification_report_json, f, indent=4)
+
+        print(f"Accuracy: {accuracy}")
+        print(f"Classification Report:\n{report}")
+
+        # Save the best model with the target label in the file name
+        model_name = f"{classifier}_best_model_{self.target}.pkl"
+        joblib.dump(self.best_model, model_name)
+        print("Model saved successfully.")
+
+        # Save model metadata
+        model_metadata = {
+            "best_params": opt.best_params_,
+            "accuracy": accuracy,
+            "classification_report": classification_report_json,
+            "label_mapping": label_mapping,
+            "model_name": model_name,
+            "value_counts": value_counts,
+            #"selected_domains": self.selected_domains,
+            #"include_magnitude": self.config.get("include_magnitude", True)
+        }
+
+        if hasattr(self.best_model, "feature_importances_"):
+            feature_importances = self.best_model.feature_importances_
+            # Convert feature importances to native Python floats
+            feature_importance_dict = {feature: float(importance) for feature, importance in zip(feature_names, feature_importances)}
+            model_metadata["feature_importances"] = feature_importance_dict
+            print("Feature Importances:")
+            for feature, importance in feature_importance_dict.items():
+                print(f"{feature}: {importance:.4f}")
+
+        # Save metadata with the target name in the file name
+        metadata_file = f"{classifier}_model_metadata_{self.target}.json"
+        with open(metadata_file, "w") as f:
+            json.dump(model_metadata, f, indent=4)
+        print(f"Model metadata saved to {metadata_file}.")
+
+        # Save file paths internally for later retrieval
+        self.model_file = model_name
+        self.metadata_file = metadata_file
+
+        return self
+
+    def get_output_files(self):
+        return self.model_file, self.metadata_file
+
+    def transform(self, X):
+        return X  # Placeholder for transform step (not needed for training)
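For orientation, a hedged sketch of the input frame `TrainModel.fit` expects: feature columns plus a `groupid` column, the target column, and a comma-separated `train_label` string naming every label column to drop before training (including the target itself, which survives as `encoded_target`). All column names below are hypothetical, and the import path is an assumption:

import numpy as np
import pandas as pd
from pipeline_classes import TrainModel  # assumed package export

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f'f{i}' for i in range(5)])
df['groupid'] = rng.integers(0, 10, size=200)          # group ids for StratifiedGroupKFold
df['valence'] = rng.choice(['low', 'high'], size=200)  # hypothetical target label
df['arousal'] = rng.choice(['low', 'high'], size=200)  # hypothetical unused label

trainer = TrainModel(classifier='randomforest', train_label='valence,arousal', target='valence')
trainer.fit(df)  # runs the Bayesian search and writes the .pkl and .json artifacts
model_file, metadata_file = trainer.get_output_files()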
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+pandas == 2.2.3
+numpy == 2.1.2
+scikit-learn == 1.5.2
+scikit-optimize == 0.10.2
+xgboost == 2.1.1
+joblib == 1.4.2
+PyWavelets == 1.7.0
+scipy == 1.14.1
+gradio == 5.8.0
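All pins are exact versions, so a fresh virtual environment followed by `pip install -r requirements.txt` is the cleanest way to reproduce this stack.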