mininato committed on
Commit 2962055 · verified · 1 Parent(s): eac5305

Upload 34 files

Files changed (34)
  1. 01_combining_dataframes_pipeline.py +26 -0
  2. 02_feature_extraction_pipeline.py +27 -0
  3. 03_training_model_pipeline.py +21 -0
  4. 04_analyzing_data_pipeline.py +24 -0
  5. 05_complete_trainmodel_pipeline.py +30 -0
  6. LICENSE +21 -0
  7. _config.py +57 -0
  8. app.py +342 -0
  9. pipeline_classes/__init__.py +10 -0
  10. pipeline_classes/__pycache__/__init__.cpython-310.pyc +0 -0
  11. pipeline_classes/__pycache__/__init__.cpython-313.pyc +0 -0
  12. pipeline_classes/__pycache__/classify_movementdata.cpython-310.pyc +0 -0
  13. pipeline_classes/__pycache__/classify_movementdata.cpython-313.pyc +0 -0
  14. pipeline_classes/__pycache__/create_combineddataframe.cpython-310.pyc +0 -0
  15. pipeline_classes/__pycache__/create_combineddataframe.cpython-313.pyc +0 -0
  16. pipeline_classes/__pycache__/extract_features.cpython-310.pyc +0 -0
  17. pipeline_classes/__pycache__/extract_features.cpython-313.pyc +0 -0
  18. pipeline_classes/__pycache__/import_data.cpython-313.pyc +0 -0
  19. pipeline_classes/__pycache__/lowpassfilter.cpython-310.pyc +0 -0
  20. pipeline_classes/__pycache__/lowpassfilter.cpython-313.pyc +0 -0
  21. pipeline_classes/__pycache__/pcahandler.cpython-310.pyc +0 -0
  22. pipeline_classes/__pycache__/pcahandler.cpython-313.pyc +0 -0
  23. pipeline_classes/__pycache__/scale_xyzdata.cpython-310.pyc +0 -0
  24. pipeline_classes/__pycache__/scale_xyzdata.cpython-313.pyc +0 -0
  25. pipeline_classes/__pycache__/train_model.cpython-310.pyc +0 -0
  26. pipeline_classes/__pycache__/train_model.cpython-313.pyc +0 -0
  27. pipeline_classes/classify_movementdata.py +40 -0
  28. pipeline_classes/create_combineddataframe.py +103 -0
  29. pipeline_classes/extract_features.py +272 -0
  30. pipeline_classes/lowpassfilter.py +55 -0
  31. pipeline_classes/pcahandler.py +24 -0
  32. pipeline_classes/scale_xyzdata.py +28 -0
  33. pipeline_classes/train_model.py +195 -0
  34. requirements.txt +9 -0
01_combining_dataframes_pipeline.py ADDED
@@ -0,0 +1,26 @@
+ from sklearn.pipeline import Pipeline
+ from pipeline_classes import CreateCombinedDataFrame
+ from _config import config
+ import time
+ import pandas as pd
+
+ accel_data = pd.read_csv(config["accel_path"])
+ reports_data = pd.read_csv(config["reports_path"])
+
+ X = (reports_data, accel_data)
+
+ # This pipeline combines the self-report and accelerometer dataframes within a given time window into a single dataframe and exports it as a CSV file
+ combining_dataframes_pipeline = Pipeline([
+     #('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),  # alternative: load both inputs from the paths in _config.py
+     ('create_combined_dataframe', CreateCombinedDataFrame(time_window=config["time_window"], label_columns=config["label_columns"])),
+ ])
+
+ # Measure the time taken to run the pipeline
+ start_time = time.time()
+
+ # Run the pipeline; returns the combined dataframe
+ output_df = combining_dataframes_pipeline.fit_transform(X)
+
+ end_time = time.time()
+ print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
02_feature_extraction_pipeline.py ADDED
@@ -0,0 +1,27 @@
+ from sklearn.pipeline import Pipeline
+ from pipeline_classes import ImportData, LowPassFilter, ScaleXYZData, ExtractFeatures
+ from _config import config
+ import time
+
+ # This pipeline extracts features from the combined dataframe and exports them to a CSV file
+ feature_extraction_pipeline = Pipeline([
+     ('import_data', ImportData(use_accel=False, use_reports=False, use_combined=True, use_features=False)),  # loads the combined data
+     ('low_pass_filter', LowPassFilter(cutoff_frequency=config["cutoff_frequency"], sampling_rate=config["data_frequency"], order=config["order"])),
+     ('scale_xyz_data', ScaleXYZData(scaler_type=config["scaler_type"])),
+     ('extract_features', ExtractFeatures(window_length=config["window_length"],
+                                          window_step_size=config["window_step_size"],
+                                          data_frequency=config["data_frequency"],
+                                          selected_domains=config["selected_domains"],
+                                          include_magnitude=config["include_magnitude"],
+                                          features_label_columns=config["label_columns"])),  # parameter name matches ExtractFeatures.__init__
+ ])
+
+ # Measure the time taken to run the pipeline
+ start_time = time.time()
+
+ # Run the pipeline; returns the feature dataframe
+ output_df = feature_extraction_pipeline.fit_transform(None)
+
+ end_time = time.time()
+ print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
03_training_model_pipeline.py ADDED
@@ -0,0 +1,21 @@
+ from sklearn.pipeline import Pipeline
+ from pipeline_classes import ImportData, PCAHandler, TrainModel
+ from _config import config
+ import time
+
+ # This pipeline trains a model on the feature dataframe, exports the model to a pickle file, and writes general information to a JSON file
+ training_model_pipeline = Pipeline([
+     ('import_data', ImportData(use_accel=False, use_reports=False, use_combined=False, use_features=True)),
+     ('pca_handler', PCAHandler(apply_pca=config["apply_pca"], variance=config["pca_variance"])),
+     ('train_model', TrainModel(classifier=config["classifier"], train_label=config["label_columns"], target=config["target_label"])),  # matches the TrainModel(classifier, train_label, target) signature in pipeline_classes/train_model.py
+ ])
+
+ # Measure the time taken to run the pipeline
+ start_time = time.time()
+
+ # Run the pipeline; returns the model and a report
+ output_df = training_model_pipeline.fit_transform(None)
+
+ end_time = time.time()
+ print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
04_analyzing_data_pipeline.py ADDED
@@ -0,0 +1,24 @@
+ from sklearn.pipeline import Pipeline
+ from pipeline_classes import ImportData, LowPassFilter, ScaleXYZData, ExtractFeatures, ClassifyMovementData
+ from _config import config
+ import time
+
+ # This pipeline analyzes data that hasn't been classified yet and exports the classified dataframe as a CSV file
+ analyzing_data_pipeline = Pipeline([
+     ('import_data', ImportData(use_accel=True, use_reports=False, use_combined=False, use_features=False)),  # loads the accelerometer data
+     ('low_pass_filter', LowPassFilter(cutoff_frequency=config["cutoff_frequency"], sampling_rate=config["data_frequency"], order=config["order"])),
+     ('scale_xyz_data', ScaleXYZData(scaler_type=config["scaler_type"])),
+     ('extract_features', ExtractFeatures(window_length=config['window_length'], window_step_size=config["window_step_size"], data_frequency=config["data_frequency"],
+                                          selected_domains=config['selected_domains'], include_magnitude=config['include_magnitude'])),
+     ('classify_movement_data', ClassifyMovementData()),  # no model_file is passed here; see the note after this script
+ ])
+
+ # Measure the time taken to run the pipeline
+ start_time = time.time()
+
+ # Run the pipeline; returns the classified dataframe
+ output_df = analyzing_data_pipeline.fit_transform(None)
+
+ end_time = time.time()
+ print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
05_complete_trainmodel_pipeline.py ADDED
@@ -0,0 +1,30 @@
+ from pipeline_classes import ImportData, LowPassFilter, ScaleXYZData, ExtractFeatures, CreateCombinedDataFrame, TrainModel, PCAHandler
+ from _config import config
+ from sklearn.pipeline import Pipeline
+ import time
+
+ # This is the complete pipeline: it trains a model on the combined dataframe, exports the model to a pickle file, and writes general information to a JSON file
+ complete_training_model_pipeline = Pipeline([
+     ('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),
+     ('create_combined_dataframe', CreateCombinedDataFrame(time_window=config["time_window"], label_columns=config["label_columns"])),
+     ('low_pass_filter', LowPassFilter(cutoff_frequency=config["cutoff_frequency"], sampling_rate=config["data_frequency"], order=config["order"])),
+     ('scale_xyz_data', ScaleXYZData(scaler_type=config["scaler_type"])),
+     ('extract_features', ExtractFeatures(window_length=config["window_length"],
+                                          window_step_size=config["window_step_size"],
+                                          data_frequency=config["data_frequency"],
+                                          selected_domains=config["selected_domains"],
+                                          include_magnitude=config["include_magnitude"],
+                                          features_label_columns=config["label_columns"])),  # parameter name matches ExtractFeatures.__init__
+     ('pca_handler', PCAHandler(apply_pca=config["apply_pca"], variance=config["pca_variance"])),
+     ('train_model', TrainModel(classifier=config["classifier"], train_label=config["label_columns"], target=config["target_label"])),  # matches the TrainModel(classifier, train_label, target) signature
+ ])
+
+ # Measure the time taken to run the pipeline
+ start_time = time.time()
+
+ # Run the pipeline; returns the model and a report
+ output_df = complete_training_model_pipeline.fit_transform(None)
+
+ end_time = time.time()
+ print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 mininato
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
_config.py ADDED
@@ -0,0 +1,57 @@
+ # Configuration file for the pipeline
+
+ config = {
+     # Paths for Import Data
+     "accel_path": "/Users/anhducduong/Documents/GitHub/EmotionRecognitionPipeline/EmotionRecognitionPipeline/AccelerometerMeasurements_backup.csv",  # Path to the accelerometer data
+     "reports_path": "/Users/anhducduong/Documents/GitHub/EmotionRecognitionPipeline/EmotionRecognitionPipeline/UserTestingSelfReports.csv",  # Path to the self-reports data
+     #"combined_data_path": "Path or Name of File of Combined Data File",  # Path to the combined data
+     #"features_data_path": "Path or Name of File of Features Data File",  # Path to the features data
+     #"model_path": "Path or Name of Trained Model File",  # Path to the trained model
+
+     # Label configuration
+     "label_columns": ["valence", "arousal"],  # The emotion labels you are using
+     "target_label": "arousal",  # The target label to predict (only one label can be selected)
+
+     # Configuration for combined data
+     "time_window": 3,  # Minutes before and after the self-report
+
+     # Configuration for feature extraction
+     "window_length": 60,  # Window length in seconds (e.g., 60)
+     "window_step_size": 20,  # Step size in seconds; typically 10%-50% of window_length (e.g., 20)
+     "data_frequency": 25,  # Data frequency in Hz
+     "selected_domains": None,  # Default (None): every domain / options: 'time_domain', 'spatial', 'frequency', 'statistical', 'wavelet' / multiple domains: ["time_domain", "frequency"] / order is not important
+     "include_magnitude": True,  # Whether to include magnitude-based features
+
+     # Configuration for low-pass filter
+     "cutoff_frequency": 10,  # Cut-off frequency for the low-pass filter
+     "order": 4,  # Order of the filter
+
+     # Configuration for scaling
+     "scaler_type": "standard",  # Possible scalers: 'standard', 'minmax', or 'none'
+
+     # Configuration for PCA
+     "apply_pca": False,  # Apply PCA or not
+     "pca_variance": 0.95,  # PCA variance threshold
+
+     # Configuration for model training
+     "classifier": "xgboost",  # Default classifier ('xgboost', 'svm', 'randomforest')
+
+     # Configuration for hyperparameter tuning
+     "n_splits": 5,  # Number of splits for cross-validation
+     "n_iter": 30,  # Number of iterations for hyperparameter tuning
+     "n_jobs": -1,  # Number of jobs for parallel processing
+     "n_points": 1,  # Number of points to sample in the hyperparameter space
+
+     # If you want a custom param_space, specify it here
+     "param_space": {
+         "learning_rate": (0.05, 0.2),
+         "n_estimators": (200, 800),
+         "max_depth": (4, 8),
+         "min_child_weight": (1, 5),
+         "subsample": (0.6, 0.9),
+         "colsample_bytree": (0.6, 0.9),
+         "gamma": (0, 5),
+         "reg_alpha": (0, 5),
+         "reg_lambda": (0, 5)
+     },  # Set to None to use the defaults inside the TrainModel class
+ }
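The plain (low, high) tuples in param_space work because skopt's BayesSearchCV also accepts tuple bounds and infers the dimension type from the value types; the explicit skopt.space objects used in train_model.py are equivalent. A sketch of that equivalence (illustrative only, not project code):

from skopt.space import Real, Integer

as_tuples = {
    "learning_rate": (0.05, 0.2),   # float bounds -> treated as Real(0.05, 0.2)
    "n_estimators": (200, 800),     # int bounds   -> treated as Integer(200, 800)
}
explicit = {
    "learning_rate": Real(0.05, 0.2),
    "n_estimators": Integer(200, 800),
}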
app.py ADDED
@@ -0,0 +1,342 @@
+ import gradio as gr
+ from pipeline_classes import CreateCombinedDataFrame, ScaleXYZData, ExtractFeatures, TrainModel, ClassifyMovementData, LowPassFilter, PCAHandler
+ from sklearn.pipeline import Pipeline
+ from _config import config
+ import pandas as pd
+ import numpy as np
+ import joblib
+ import json
+
+
+ # Define pipelines; all parameters start as None and are filled in via set_params() from the UI inputs
+ combining_dataframes_pipeline = Pipeline([
+     #('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),
+     ('create_combined_dataframe', CreateCombinedDataFrame(time_window=None, label_columns=None)),
+ ])
+
+ feature_extraction_pipeline = Pipeline([
+     #('import_data', ImportData(use_accel=False, use_reports=False, use_combined=True, use_features=False)),
+     ('low_pass_filter', LowPassFilter(cutoff_frequency=None, sampling_rate=None, order=None)),
+     ('scale_xyz_data', ScaleXYZData(scaler_type=None)),
+     ('extract_features', ExtractFeatures(window_length=None,
+                                          window_step_size=None,
+                                          data_frequency=None,
+                                          selected_domains=None,
+                                          include_magnitude=None,
+                                          features_label_columns=None)),
+ ])
+
+ training_model_pipeline = Pipeline([
+     #('import_data', ImportData(use_accel=False, use_reports=False, use_combined=False, use_features=True)),
+     ('pca_handler', PCAHandler(apply_pca=None, variance=None)),
+     ('train_model', TrainModel(classifier=None, train_label=None, target=None)),
+ ])
+
+ analyzing_data_pipeline = Pipeline([
+     #('import_data', ImportData(use_accel=True, use_reports=False, use_combined=False, use_features=False)),
+     ('low_pass_filter', LowPassFilter(cutoff_frequency=None, sampling_rate=None, order=None)),
+     ('scale_xyz_data', ScaleXYZData(scaler_type=None)),
+     ('extract_features', ExtractFeatures(window_length=None,
+                                          window_step_size=None,
+                                          data_frequency=None,
+                                          selected_domains=None,
+                                          include_magnitude=None,
+                                          features_label_columns=None)),
+     ('classify_movement_data', ClassifyMovementData(model_file=None)),
+ ])
+
+ complete_training_model_pipeline = Pipeline([
+     #('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),
+     ('create_combined_dataframe', CreateCombinedDataFrame(time_window=None, label_columns=None)),
+     ('low_pass_filter', LowPassFilter(cutoff_frequency=None, sampling_rate=None, order=None)),
+     ('scale_xyz_data', ScaleXYZData(scaler_type=None)),
+     ('extract_features', ExtractFeatures(window_length=None,
+                                          window_step_size=None,
+                                          data_frequency=None,
+                                          selected_domains=None,
+                                          include_magnitude=None,
+                                          features_label_columns=None)),
+     ('pca_handler', PCAHandler(apply_pca=None, variance=None)),
+     ('train_model', TrainModel(classifier=None, train_label=None, target=None)),
+ ])
+
+ def execute_combine_pipeline(accel_file, report_file, time_window=None, label_columns=None):
+     try:
+         # Load data files only if paths are valid
+         accel_data = pd.read_csv(accel_file) if accel_file else None
+         report_data = pd.read_csv(report_file) if report_file else None
+
+         # Validate inputs for the selected pipeline
+         if accel_data is None or report_data is None:
+             return "Error: Both accelerometer and self-report data files are required for this pipeline."
+         combining_dataframes_pipeline.set_params(
+             create_combined_dataframe__time_window=time_window,
+             create_combined_dataframe__label_columns=label_columns.split(','))
+         X = report_data, accel_data
+         result = combining_dataframes_pipeline.fit_transform(X)
+         output_file = "combine_dataframes_output.csv"
+         result.to_csv(output_file, index=False)
+
+         return output_file
+
+     except Exception as e:
+         print(f"Error occurred: {str(e)}")
+         return str(e)
+
+
+ def execute_feature_extraction_pipeline(combined_file, cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns):
+     try:
+         combined_data = pd.read_csv(combined_file) if combined_file else None
+         if combined_data is None:
+             return "Error: Combined data file is required for this pipeline."
+
+         feature_extraction_pipeline.set_params(
+             low_pass_filter__cutoff_frequency=cutoff_frequency,
+             low_pass_filter__order=order,
+             low_pass_filter__sampling_rate=data_frequency,
+             scale_xyz_data__scaler_type=scaler_type,
+             extract_features__window_length=window_length,
+             extract_features__window_step_size=window_step_size,
+             extract_features__data_frequency=data_frequency,
+             #extract_features__selected_domains=None,
+             extract_features__include_magnitude=include_magnitude,
+             extract_features__features_label_columns=features_label_columns.split(','))
+         result = feature_extraction_pipeline.fit_transform(combined_data)
+         output_file = "extract_features_output.csv"
+         result.to_csv(output_file, index=False)
+         return output_file
+
+     except Exception as e:
+         print(f"Error occurred: {str(e)}")
+         return str(e)
+
+ def execute_training_pipeline(features_file, apply_pca, pca_variance, classifier, train_label, target):
+     try:
+         features_data = pd.read_csv(features_file) if features_file else None
+         if features_data is None:
+             return "Error: Features data file is required for this pipeline.", None
+
+         training_model_pipeline.set_params(
+             pca_handler__apply_pca=apply_pca,
+             pca_handler__variance=pca_variance,
+             train_model__classifier=classifier,
+             train_model__train_label=train_label,
+             train_model__target=target)
+
+         X = features_data
+         training_model_pipeline.fit(X)
+         output_file, secondary_output_file = training_model_pipeline.named_steps['train_model'].get_output_files()
+         return output_file, secondary_output_file
+
+     except Exception as e:
+         print(f"Error occurred: {str(e)}")
+         return str(e), None
+
+ def execute_analyze_pipeline(accel_file, model_file, cutoff_frequency, order, scaler_type, window_length, data_frequency, include_magnitude, features_label_columns):
+     try:
+         accel_data = pd.read_csv(accel_file) if accel_file else None
+         if accel_data is None:
+             return "Error: Accelerometer data file is required for this pipeline."
+
+         analyzing_data_pipeline.set_params(
+             low_pass_filter__cutoff_frequency=cutoff_frequency,
+             low_pass_filter__order=order,
+             low_pass_filter__sampling_rate=data_frequency,
+             scale_xyz_data__scaler_type=scaler_type,
+             extract_features__window_length=window_length,
+             extract_features__window_step_size=window_length,  # this tab has no step-size input, so windows advance by a full window (no overlap)
+             extract_features__data_frequency=data_frequency,
+             #extract_features__selected_domains=None,
+             extract_features__include_magnitude=include_magnitude,
+             extract_features__features_label_columns=features_label_columns.split(','),
+             classify_movement_data__model_file=model_file.name
+         )
+
+         result = analyzing_data_pipeline.fit_transform(accel_data)
+         output_file = "analyze_data_output.csv"
+         result.to_csv(output_file, index=False)
+         return output_file
+
+     except Exception as e:
+         print(f"Error occurred: {str(e)}")
+         return str(e)
+
+ def execute_complete_training_pipeline(accel_file, report_file, time_window, label_columns,
+                                        cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
+                                        apply_pca, pca_variance, classifier, train_label, target):
+     try:
+         accel_data = pd.read_csv(accel_file) if accel_file else None
+         report_data = pd.read_csv(report_file) if report_file else None
+         if accel_data is None or report_data is None:
+             return "Error: Both accelerometer and self-report data files are required for this pipeline.", None
+
+         complete_training_model_pipeline.set_params(
+             create_combined_dataframe__time_window=time_window,
+             create_combined_dataframe__label_columns=label_columns.split(','),
+             low_pass_filter__cutoff_frequency=cutoff_frequency,
+             low_pass_filter__order=order,
+             low_pass_filter__sampling_rate=data_frequency,
+             scale_xyz_data__scaler_type=scaler_type,
+             extract_features__window_length=window_length,
+             extract_features__window_step_size=window_step_size,
+             extract_features__data_frequency=data_frequency,
+             #extract_features__selected_domains=None,
+             extract_features__include_magnitude=include_magnitude,
+             extract_features__features_label_columns=label_columns.split(','),
+             pca_handler__apply_pca=apply_pca,
+             pca_handler__variance=pca_variance,
+             train_model__classifier=classifier,
+             train_model__train_label=label_columns,
+             train_model__target=target
+         )
+         X = report_data, accel_data
+         complete_training_model_pipeline.fit(X)
+         output_file, secondary_output_file = complete_training_model_pipeline.named_steps['train_model'].get_output_files()
+         return output_file, secondary_output_file
+
+     except Exception as e:
+         print(f"Error occurred: {str(e)}")
+         return str(e), None
+
+ # Gradio Blocks interface
+ with gr.Blocks() as demo:
+     with gr.Tabs():
+         with gr.TabItem("Combine DataFrames"):
+             accel_file = gr.File(label="Upload Accelerometer Data")
+             report_file = gr.File(label="Upload Self-Report Data")
+             time_window = gr.Number(label="Time Window (minutes)", value=2)
+             label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
+             combine_button = gr.Button("Combine DataFrames")
+             combine_output = gr.File(label="Download Combined DataFrame")
+
+             def combine_dataframes(accel_file, report_file, time_window, label_columns):
+                 output_file = execute_combine_pipeline(accel_file, report_file, time_window, label_columns)
+                 return output_file
+
+             combine_button.click(combine_dataframes, inputs=[accel_file, report_file, time_window, label_columns], outputs=combine_output)
+
+         with gr.TabItem("Extract Features"):
+             combined_file = gr.File(label="Upload Combined Data")
+
+             cutoff_frequency = gr.Number(label="Cutoff Frequency (Hz)", value=10)
+             order = gr.Number(label="Order", value=4)
+
+             scaler_type = gr.Radio(label="Scaler Type", choices=["standard", "minmax"])
+
+             window_length = gr.Number(label="Window Length (seconds)", value=60)
+             window_step_size = gr.Number(label="Window Step Size (seconds)", value=20)
+             data_frequency = gr.Number(label="Data Frequency (Hz)", value=25)
+
+             #selected_domains = gr.Textbox(label="Only these domains (comma-separated) / leave blank to use all", value=None)
+             include_magnitude = gr.Checkbox(label="Include Magnitude", value=True)
+             features_label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
+
+             extract_button = gr.Button("Extract Features")
+             extract_output = gr.File(label="Download Extracted Features")
+
+             def extract_features(combined_file, cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns):
+                 output_file = execute_feature_extraction_pipeline(combined_file,
+                                                                   cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency,
+                                                                   include_magnitude, features_label_columns)
+                 return output_file
+
+             extract_button.click(extract_features, inputs=[combined_file, cutoff_frequency, order, scaler_type, window_length, window_step_size,
+                                                            data_frequency, include_magnitude, features_label_columns], outputs=extract_output)
+
+         with gr.TabItem("Train Model"):
+             features_file = gr.File(label="Upload Features Data")
+
+             apply_pca = gr.Checkbox(label="Apply PCA", value=False)
+             pca_variance = gr.Number(label="PCA Variance", value=0.95)
+             classifier = gr.Dropdown(label="Classifier", choices=["xgboost", "svm", "randomforest"], value="xgboost")
+             train_label = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
+             target = gr.Textbox(label="Target Label", value="arousal")
+
+             train_button = gr.Button("Train Model")
+             train_output_json = gr.File(label="Download Model JSON")
+             train_output_pkl = gr.File(label="Download Model PKL")
+
+             def train_model(features_file, apply_pca, pca_variance, classifier, train_label, target):
+                 output_file, secondary_output_file = execute_training_pipeline(features_file, apply_pca, pca_variance, classifier, train_label, target)
+                 return output_file, secondary_output_file
+
+             train_button.click(train_model, inputs=[features_file, apply_pca, pca_variance, classifier, train_label, target], outputs=[train_output_json, train_output_pkl])
+
+         with gr.TabItem("Analyze Data"):
+             accel_file = gr.File(label="Upload Accelerometer Data")
+             model_file = gr.File(label="Upload Model")
+
+             cutoff_frequency = gr.Number(label="Cutoff Frequency (Hz)", value=10)
+             order = gr.Number(label="Order", value=4)
+
+             scaler_type = gr.Radio(label="Scaler Type", choices=["standard", "minmax"])
+
+             window_length = gr.Number(label="Window Length (seconds)", value=60)
+             data_frequency = gr.Number(label="Data Frequency (Hz)", value=25)
+
+             #selected_domains = gr.Textbox(label="Only these domains (comma-separated) / leave blank to use all", value=None)
+             include_magnitude = gr.Checkbox(label="Include Magnitude", value=True)
+             features_label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
+
+             analyze_button = gr.Button("Analyze Data")
+             analyze_output = gr.File(label="Download Analyzed Data")
+
+             def analyze_data(accel_file, model_file, cutoff_frequency, order, scaler_type, window_length, data_frequency, include_magnitude, features_label_columns):
+                 output_file = execute_analyze_pipeline(accel_file, model_file, cutoff_frequency, order, scaler_type, window_length,
+                                                        data_frequency, include_magnitude, features_label_columns)
+                 return output_file
+
+             analyze_button.click(analyze_data, inputs=[accel_file, model_file, cutoff_frequency, order, scaler_type, window_length,
+                                                        data_frequency, include_magnitude, features_label_columns], outputs=analyze_output)
+
+         with gr.TabItem("Complete Train Model"):
+             accel_file = gr.File(label="Upload Accelerometer Data")
+             report_file = gr.File(label="Upload Self-Report Data")
+
+             time_window = gr.Number(label="Time Window (minutes)", value=2)
+             label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
+
+             cutoff_frequency = gr.Number(label="Cutoff Frequency (Hz)", value=10)
+             order = gr.Number(label="Order", value=4)
+
+             scaler_type = gr.Radio(label="Scaler Type", choices=["standard", "minmax"])
+
+             window_length = gr.Number(label="Window Length (seconds)", value=60)
+             window_step_size = gr.Number(label="Window Step Size (seconds)", value=20)
+             data_frequency = gr.Number(label="Data Frequency (Hz)", value=25)
+
+             include_magnitude = gr.Checkbox(label="Include Magnitude", value=True)
+             #features_label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
+
+             apply_pca = gr.Checkbox(label="Apply PCA", value=False)
+             pca_variance = gr.Number(label="PCA Variance", value=0.95)
+             classifier = gr.Dropdown(label="Classifier", choices=["xgboost", "svm", "randomforest"], value="xgboost")
+             #train_label = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
+             target = gr.Textbox(label="Target Label", value="arousal")
+
+             complete_train_button = gr.Button("Complete Train Model")
+
+             complete_train_output_pkl = gr.File(label="Download Model PKL")
+             complete_train_output_json = gr.File(label="Download Model JSON")
+
+             def complete_train_model(accel_file, report_file, time_window, label_columns,
+                                      cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
+                                      apply_pca, pca_variance, classifier, train_label, target):
+                 output_file, secondary_output_file = execute_complete_training_pipeline(accel_file, report_file, time_window, label_columns,
+                                                                                         cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
+                                                                                         apply_pca, pca_variance, classifier, train_label, target)
+                 return output_file, secondary_output_file
+
+             # Note: features_label_columns and train_label below refer to the components defined in earlier tabs,
+             # since the corresponding textboxes in this tab are commented out
+             complete_train_button.click(complete_train_model, inputs=[accel_file, report_file, time_window, label_columns,
+                                                                       cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
+                                                                       apply_pca, pca_variance, classifier, train_label, target], outputs=[complete_train_output_pkl, complete_train_output_json])
+
+
+ demo.launch()
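demo.launch() serves the UI on Gradio's defaults (localhost, port 7860). A sketch of common alternatives, using standard Gradio launch parameters rather than anything project-specific:

# Bind to all interfaces, pin the port, or set share=True for a temporary public link
demo.launch(server_name="0.0.0.0", server_port=7860, share=False)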
pipeline_classes/__init__.py ADDED
@@ -0,0 +1,10 @@
+ # Description: This file is used to import all the classes in the pipeline_classes folder.
+
+ # from .import_data import ImportData  # import_data.py is not included in this upload (only its .pyc); scripts 02-05 import ImportData and will fail until it is added
+ from .create_combineddataframe import CreateCombinedDataFrame
+ from .scale_xyzdata import ScaleXYZData
+ from .extract_features import ExtractFeatures
+ from .pcahandler import PCAHandler
+ from .train_model import TrainModel
+ from .classify_movementdata import ClassifyMovementData
+ from .lowpassfilter import LowPassFilter
pipeline_classes/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (563 Bytes).
pipeline_classes/__pycache__/__init__.cpython-313.pyc ADDED
Binary file (622 Bytes).
pipeline_classes/__pycache__/classify_movementdata.cpython-310.pyc ADDED
Binary file (1.48 kB).
pipeline_classes/__pycache__/classify_movementdata.cpython-313.pyc ADDED
Binary file (2.05 kB).
pipeline_classes/__pycache__/create_combineddataframe.cpython-310.pyc ADDED
Binary file (3.21 kB).
pipeline_classes/__pycache__/create_combineddataframe.cpython-313.pyc ADDED
Binary file (4.57 kB).
pipeline_classes/__pycache__/extract_features.cpython-310.pyc ADDED
Binary file (8.93 kB).
pipeline_classes/__pycache__/extract_features.cpython-313.pyc ADDED
Binary file (18.3 kB).
pipeline_classes/__pycache__/import_data.cpython-313.pyc ADDED
Binary file (2.77 kB).
pipeline_classes/__pycache__/lowpassfilter.cpython-310.pyc ADDED
Binary file (2.42 kB).
pipeline_classes/__pycache__/lowpassfilter.cpython-313.pyc ADDED
Binary file (2.75 kB).
pipeline_classes/__pycache__/pcahandler.cpython-310.pyc ADDED
Binary file (1.18 kB).
pipeline_classes/__pycache__/pcahandler.cpython-313.pyc ADDED
Binary file (1.64 kB).
pipeline_classes/__pycache__/scale_xyzdata.cpython-310.pyc ADDED
Binary file (1.4 kB).
pipeline_classes/__pycache__/scale_xyzdata.cpython-313.pyc ADDED
Binary file (1.82 kB).
pipeline_classes/__pycache__/train_model.cpython-310.pyc ADDED
Binary file (5.32 kB).
pipeline_classes/__pycache__/train_model.cpython-313.pyc ADDED
Binary file (8.5 kB).
pipeline_classes/classify_movementdata.py ADDED
@@ -0,0 +1,40 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.base import BaseEstimator, TransformerMixin
+ import joblib
+ from _config import config
+
+ # This class classifies the movement data using a pre-trained model
+ class ClassifyMovementData(BaseEstimator, TransformerMixin):
+     def __init__(self, model_file=None):
+         #self.model_path = model_path if model_path else config.get("model_path")
+         self.model_file = model_file
+         self.model = None
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X):
+         if self.model is None:
+             if self.model_file is None:
+                 raise ValueError("Model file is not provided.")
+             try:
+                 self.model = joblib.load(self.model_file)  # Load the model
+             except Exception as e:
+                 raise ValueError(f"Failed to load the model file: {e}")
+
+         # Assuming `X` is a DataFrame of pre-extracted features.
+         predictions = self.model.predict(X)
+
+         # Add predictions to the DataFrame as the first column
+         X.insert(0, 'predicted_emotion', predictions)
+
+         print("Data classified successfully.")
+
+         # Export the labeled DataFrame to CSV
+         #window_length_str = str(config["window_length"])
+         output_file = "classified_movement_data.csv"
+         X.to_csv(output_file, index=False)
+         print(f"Classified movement data exported successfully to {output_file}.")
+
+         return X
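A minimal sketch of using this step outside a Pipeline. The file names are placeholders: "extract_features_output.csv" matches what app.py writes, and "trained_model.pkl" stands in for whatever TrainModel.get_output_files() actually returns:

import pandas as pd
from pipeline_classes import ClassifyMovementData

feature_df = pd.read_csv("extract_features_output.csv")     # an ExtractFeatures output; columns must match training
clf = ClassifyMovementData(model_file="trained_model.pkl")  # placeholder model path
labeled = clf.fit(feature_df).transform(feature_df)         # also writes classified_movement_data.csv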
pipeline_classes/create_combineddataframe.py ADDED
@@ -0,0 +1,103 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from _config import config
+
+ class CreateCombinedDataFrame(BaseEstimator, TransformerMixin):
+     def __init__(self, time_window, label_columns=None):
+         self.time_window = time_window
+         self.label_columns = label_columns  # must be set before transform(); there is no fallback default
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X):
+         df_reports, df_accel = X
+
+         print(f"CreateCombinedDataFrame initialized with label_columns: {self.label_columns}")
+         # Keep only answered reports whose chosen label columns are filled in
+         valid_conditions = (df_reports['timeOfEngagement'] != 0)
+         for label in self.label_columns:
+             valid_conditions &= (df_reports[label] != "NONE")
+
+         df_reports = df_reports[valid_conditions].copy()
+
+         # No datetime conversion needed; timestamps remain as integers
+         df_accel.rename(columns={'timestamp': 'timeOfNotification'}, inplace=True)
+
+         print(f"Extracting accelerometer data with time_window: {self.time_window}")
+         df_reports['accel_data'] = df_reports.apply(lambda row: self._extract_accel_data(row, df_accel), axis=1)
+
+         print(f"Combining with label_columns: {self.label_columns}")
+         combined_data = []
+
+         for _, row in df_reports.iterrows():
+             accel_data = row['accel_data']
+             for _, accel_row in accel_data.iterrows():
+                 combined_row = {
+                     'participantId': row['participantId'],          # Participant ID
+                     'selfreport_time': row['timeOfNotification'],   # Self-report time
+                     'accel_time': accel_row['timeOfNotification'],  # Accelerometer data time
+                     'x': accel_row['x'],                            # x-axis accelerometer data
+                     'y': accel_row['y'],                            # y-axis accelerometer data
+                     'z': accel_row['z']                             # z-axis accelerometer data
+                 }
+
+                 # Dynamically add emotion labels to the combined row
+                 for label in self.label_columns:
+                     combined_row[label] = row[label]
+
+                 combined_data.append(combined_row)
+
+         combined_df = pd.DataFrame(combined_data)
+
+         # Convert integer timestamps back to datetime format for the CSV
+         combined_df['selfreport_time'] = pd.to_datetime(combined_df['selfreport_time'], unit='ms')
+         combined_df['accel_time'] = pd.to_datetime(combined_df['accel_time'], unit='ms')
+
+         # Create groupid column (unique identifier based on participantId and selfreport_time)
+         combined_df['groupid'] = combined_df.groupby(['participantId', 'selfreport_time']).ngroup() + 1
+         col = combined_df.pop("groupid")  # Move groupid to the first column
+         combined_df.insert(0, col.name, col)
+
+         # Export the combined dataframe to CSV
+         time_window_str = str(self.time_window)
+         label_columns_str = "_".join(self.label_columns)
+         file_name = f"combined_data_timewindow_{time_window_str}min_labels_{label_columns_str}.csv"
+         combined_df.to_csv(file_name, index=False)
+         print(f"Combined dataframe exported successfully to {file_name}.")
+
+         return combined_df
+
+     def _extract_accel_data(self, row, accel_data):
+         time_delta = self.time_window * 60 * 1000  # Convert minutes to milliseconds
+         start_time = row['timeOfNotification'] - time_delta  # Keep as integer
+         end_time = row['timeOfNotification'] + time_delta  # Keep as integer
+         participant_id = row['participantId']
+
+         # Ensure accel_data['timeOfNotification'] is also in integer format
+         accel_data['timeOfNotification'] = accel_data['timeOfNotification'].astype(np.int64)
+
+         # Log a warning if the desired time range exceeds the available data range
+         if start_time < accel_data['timeOfNotification'].min() or end_time > accel_data['timeOfNotification'].max():
+             print(
+                 f"Warning: Data does not cover the full {self.time_window}-minute window for participant {participant_id}. "
+                 f"Available range: {accel_data['timeOfNotification'].min()} to {accel_data['timeOfNotification'].max()}. "
+                 f"Requested range: {start_time} to {end_time}."
+             )
+
+         # Apply the filtering mask
+         mask = (
+             (accel_data['participantId'] == participant_id) &
+             (accel_data['timeOfNotification'] >= max(start_time, accel_data['timeOfNotification'].min())) &
+             (accel_data['timeOfNotification'] <= min(end_time, accel_data['timeOfNotification'].max()))
+         )
+
+         print("Start Time (ms):", start_time)
+         print("End Time (ms):", end_time)
+         print("Filtered Rows:\n", accel_data[mask])
+
+         return accel_data[mask]
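A worked example of the window arithmetic in _extract_accel_data, using the time_window of 3 minutes from _config.py (the report timestamp is invented):

time_delta = 3 * 60 * 1000              # 180,000 ms on each side of the report
notification = 1_700_000_000_000        # hypothetical timeOfNotification (ms)
start_time = notification - time_delta  # 1_699_999_820_000
end_time = notification + time_delta    # 1_700_000_180_000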
pipeline_classes/extract_features.py ADDED
@@ -0,0 +1,272 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from scipy.fftpack import fft
+ from scipy.signal import welch
+ import pywt
+ from _config import config
+
+ class ExtractFeatures(BaseEstimator, TransformerMixin):
+     def __init__(self, window_length, window_step_size, data_frequency, selected_domains=None, include_magnitude=False, features_label_columns=None):
+         self.window_length = window_length
+         self.window_step_size = window_step_size
+         self.data_frequency = data_frequency
+         self.selected_domains = selected_domains
+         self.include_magnitude = include_magnitude
+         self.features_label_columns = features_label_columns  # only used when the input has a groupid column with labels
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X):
+         features_list = []
+
+         if 'groupid' in X.columns:  # Check for groupid column
+             for groupid in X['groupid'].unique():  # Iterate over unique group IDs
+                 temp = X[X['groupid'] == groupid]  # Filter rows by group ID
+                 temp_ex = temp[['accel_time', 'x', 'y', 'z']].copy()  # Keep only the necessary columns (accel_time can be removed if unused)
+                 windows = self._window_data(temp_ex[['x', 'y', 'z']])  # Create windows of data
+
+                 for window in windows:
+                     features = self._extract_features_from_window(window)  # Extract features from each window
+                     features['groupid'] = groupid  # Add groupid to the features
+
+                     # Dynamically add emotion labels to the features
+                     for label in self.features_label_columns:
+                         features[label] = temp[label].iloc[0]
+
+                     features_list.append(pd.DataFrame([features]))  # Convert dictionary to DataFrame
+
+         else:  # In case there's no groupid, calculate features without it
+             windows = self._window_data(X[['x', 'y', 'z']])
+             for window in windows:
+                 features = self._extract_features_from_window(window)
+                 features_list.append(pd.DataFrame([features]))
+
+         all_features = pd.concat(features_list, ignore_index=True)
+
+         # Export features to CSV
+         window_length_str = str(self.window_length)
+         window_step_size_str = str(self.window_step_size)
+         if self.selected_domains is None:  # All features are calculated if no domains are selected
+             domain_str = "all_features"
+         else:
+             domain_str = "_".join(self.selected_domains)
+         file_name = f"features_window_{window_length_str}_step_{window_step_size_str}_{domain_str}.csv"
+         all_features.to_csv(file_name, index=False)
+
+         print("All features extracted successfully.")
+         return all_features
+
+     def _calculate_magnitude(self, window):
+         return np.sqrt(window[:, 0]**2 + window[:, 1]**2 + window[:, 2]**2)
+
+     def _window_data(self, data):  # Create sliding windows over the data
+         window_samples = int(self.window_length * self.data_frequency)  # Samples per window, e.g. 60 s * 25 Hz = 1500
+         step_samples = int(self.window_step_size * self.data_frequency)  # Samples to advance between windows
+         windows = [data[i:i + window_samples] for i in range(0, len(data) - window_samples + 1, step_samples)]
+         return np.array(windows)
+
+     def _extract_features_from_window(self, window):  # DONE: compute several domains in one pass
+         all_features = {}
+
+         if self.selected_domains is None or 'time_domain' in self.selected_domains:
+             all_features.update(self._extract_time_domain_features(window))
+
+         if self.selected_domains is None or 'spatial' in self.selected_domains:
+             all_features.update(self._extract_spatial_features(window))
+
+         if self.selected_domains is None or 'frequency' in self.selected_domains:
+             all_features.update(self._extract_frequency_domain_features(window))
+
+         if self.selected_domains is None or 'statistical' in self.selected_domains:
+             all_features.update(self._extract_statistical_features(window))
+
+         if self.selected_domains is None or 'wavelet' in self.selected_domains:
+             all_features.update(self._extract_wavelet_features(window))
+
+         return all_features
+
+     # Time Domain Features
+     def _extract_time_domain_features(self, window):
+         features = {
+             'mean_x': np.mean(window[:, 0]),
+             'mean_y': np.mean(window[:, 1]),
+             'mean_z': np.mean(window[:, 2]),
+             'std_x': np.std(window[:, 0]),
+             'std_y': np.std(window[:, 1]),
+             'std_z': np.std(window[:, 2]),
+             'variance_x': np.var(window[:, 0]),
+             'variance_y': np.var(window[:, 1]),
+             'variance_z': np.var(window[:, 2]),
+             'rms_x': np.sqrt(np.mean(window[:, 0]**2)),
+             'rms_y': np.sqrt(np.mean(window[:, 1]**2)),
+             'rms_z': np.sqrt(np.mean(window[:, 2]**2)),
+             'max_x': np.max(window[:, 0]),
+             'max_y': np.max(window[:, 1]),
+             'max_z': np.max(window[:, 2]),
+             'min_x': np.min(window[:, 0]),
+             'min_y': np.min(window[:, 1]),
+             'min_z': np.min(window[:, 2]),
+             'peak_to_peak_x': np.ptp(window[:, 0]),
+             'peak_to_peak_y': np.ptp(window[:, 1]),
+             'peak_to_peak_z': np.ptp(window[:, 2]),
+             'skewness_x': pd.Series(window[:, 0]).skew(),
+             'skewness_y': pd.Series(window[:, 1]).skew(),
+             'skewness_z': pd.Series(window[:, 2]).skew(),
+             'kurtosis_x': pd.Series(window[:, 0]).kurt(),
+             'kurtosis_y': pd.Series(window[:, 1]).kurt(),
+             'kurtosis_z': pd.Series(window[:, 2]).kurt(),
+             'zero_crossing_rate_x': np.sum(np.diff(np.sign(window[:, 0])) != 0),
+             'zero_crossing_rate_y': np.sum(np.diff(np.sign(window[:, 1])) != 0),
+             'zero_crossing_rate_z': np.sum(np.diff(np.sign(window[:, 2])) != 0),
+             'sma': np.sum(np.abs(window[:, 0])) + np.sum(np.abs(window[:, 1])) + np.sum(np.abs(window[:, 2])),  # Signal Magnitude Area
+         }
+
+         # Additional features for magnitude (xyz combined into one vector)
+         if self.include_magnitude:
+             magnitude = self._calculate_magnitude(window)
+             features['mean_magnitude'] = np.mean(magnitude)
+             features['std_magnitude'] = np.std(magnitude)
+             features['variance_magnitude'] = np.var(magnitude)
+             features['rms_magnitude'] = np.sqrt(np.mean(magnitude**2))
+             features['max_magnitude'] = np.max(magnitude)
+             features['min_magnitude'] = np.min(magnitude)
+             features['peak_to_peak_magnitude'] = np.ptp(magnitude)
+             features['skewness_magnitude'] = pd.Series(magnitude).skew()
+             features['kurtosis_magnitude'] = pd.Series(magnitude).kurt()
+             features['zero_crossing_rate_magnitude'] = np.sum(np.diff(np.sign(magnitude)) != 0)
+
+         return features
+
+     # Spatial Features
+     def _extract_spatial_features(self, window):
+         features = {}
+
+         # Euclidean Norm (Magnitude)
+         magnitude = self._calculate_magnitude(window)
+         features['euclidean_norm'] = np.mean(magnitude)  # or np.linalg.norm for each window
+
+         # Tilt Angles (Pitch and Roll)
+         pitch = np.arctan2(window[:, 1], np.sqrt(window[:, 0]**2 + window[:, 2]**2)) * (180 / np.pi)
+         roll = np.arctan2(window[:, 0], np.sqrt(window[:, 1]**2 + window[:, 2]**2)) * (180 / np.pi)
+         features['mean_pitch'] = np.mean(pitch)
+         features['mean_roll'] = np.mean(roll)
+
+         # Correlation between Axes
+         features['correlation_xy'] = np.corrcoef(window[:, 0], window[:, 1])[0, 1]
+         features['correlation_xz'] = np.corrcoef(window[:, 0], window[:, 2])[0, 1]
+         features['correlation_yz'] = np.corrcoef(window[:, 1], window[:, 2])[0, 1]
+
+         return features
+
+     # Frequency Domain Features
+     def _extract_frequency_domain_features(self, window):
+         n = len(window)
+         freq_values = np.fft.fftfreq(n, d=1/self.data_frequency)[:n // 2]
+         fft_values = fft(window, axis=0)
+         fft_magnitude = np.abs(fft_values)[:n // 2]
+
+         features = {}
+
+         # Spectral Entropy
+         def spectral_entropy(signal):
+             psd = np.square(signal)
+             psd_norm = psd / np.sum(psd)
+             return -np.sum(psd_norm * np.log(psd_norm + 1e-10))
+
+         for i, axis in enumerate(['x', 'y', 'z']):
+             # Dominant Frequency
+             dominant_frequency = freq_values[np.argmax(fft_magnitude[:, i])]
+             features[f'dominant_frequency_{axis}'] = dominant_frequency
+
+             # Spectral Entropy
+             entropy = spectral_entropy(fft_magnitude[:, i])
+             features[f'spectral_entropy_{axis}'] = entropy
+
+             # Power Spectral Density (PSD) and Energy
+             f, psd_values = welch(window[:, i], fs=self.data_frequency, nperseg=n)
+             features[f'psd_mean_{axis}'] = np.mean(psd_values)
+             features[f'energy_{axis}'] = np.sum(psd_values**2)
+
+             # Bandwidth (frequency range containing the significant portion of the energy)
+             cumulative_energy = np.cumsum(psd_values)
+             total_energy = cumulative_energy[-1]
+             low_cutoff_idx = np.argmax(cumulative_energy > 0.1 * total_energy)
+             high_cutoff_idx = np.argmax(cumulative_energy > 0.9 * total_energy)
+             bandwidth = f[high_cutoff_idx] - f[low_cutoff_idx]
+             features[f'bandwidth_{axis}'] = bandwidth
+
+             # Spectral Centroid (center of mass of the spectrum)
+             spectral_centroid = np.sum(f * psd_values) / np.sum(psd_values)
+             features[f'spectral_centroid_{axis}'] = spectral_centroid
+
+         if self.include_magnitude:
+             # Magnitude-based Frequency Domain Features
+             magnitude = self._calculate_magnitude(window)
+             fft_magnitude_mag = np.abs(fft(magnitude))[:n // 2]
+
+             # Dominant Frequency for Magnitude
+             features['dominant_frequency_magnitude'] = freq_values[np.argmax(fft_magnitude_mag)]
+
+             # Spectral Entropy for Magnitude
+             features['spectral_entropy_magnitude'] = spectral_entropy(fft_magnitude_mag)
+
+             # Power Spectral Density and Energy for Magnitude
+             f, psd_values_mag = welch(magnitude, fs=self.data_frequency, nperseg=n)
+             features['psd_mean_magnitude'] = np.mean(psd_values_mag)
+             features['energy_magnitude'] = np.sum(psd_values_mag**2)
+
+             # Bandwidth for Magnitude
+             cumulative_energy_mag = np.cumsum(psd_values_mag)
+             total_energy_mag = cumulative_energy_mag[-1]
+             low_cutoff_idx_mag = np.argmax(cumulative_energy_mag > 0.1 * total_energy_mag)
+             high_cutoff_idx_mag = np.argmax(cumulative_energy_mag > 0.9 * total_energy_mag)
+             bandwidth_mag = f[high_cutoff_idx_mag] - f[low_cutoff_idx_mag]
+             features['bandwidth_magnitude'] = bandwidth_mag
+
+             # Spectral Centroid for Magnitude
+             features['spectral_centroid_magnitude'] = np.sum(f * psd_values_mag) / np.sum(psd_values_mag)
+
+         return features
+
+     # Statistical Features
+     def _extract_statistical_features(self, window):
+         features = {
+             '25th_percentile_x': np.percentile(window[:, 0], 25),
+             '25th_percentile_y': np.percentile(window[:, 1], 25),
+             '25th_percentile_z': np.percentile(window[:, 2], 25),
+             '75th_percentile_x': np.percentile(window[:, 0], 75),
+             '75th_percentile_y': np.percentile(window[:, 1], 75),
+             '75th_percentile_z': np.percentile(window[:, 2], 75),
+         }
+
+         if self.include_magnitude:
+             magnitude = self._calculate_magnitude(window)
+             features['25th_percentile_magnitude'] = np.percentile(magnitude, 25)
+             features['75th_percentile_magnitude'] = np.percentile(magnitude, 75)
+
+         return features
+
+     # Wavelet Features
+     def _extract_wavelet_features(self, window, wavelet='db1'):
+         coeffs = pywt.wavedec(window, wavelet, axis=0, level=3)
+         features = {
+             'wavelet_energy_approx_x': np.sum(coeffs[0][:, 0]**2),
+             'wavelet_energy_approx_y': np.sum(coeffs[0][:, 1]**2),
+             'wavelet_energy_approx_z': np.sum(coeffs[0][:, 2]**2),
+         }
+
+         if self.include_magnitude:
+             magnitude = self._calculate_magnitude(window)
+             coeffs_magnitude = pywt.wavedec(magnitude, wavelet, level=3)
+             features['wavelet_energy_approx_magnitude'] = np.sum(coeffs_magnitude[0]**2)
+
+         return features
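A sketch of restricting extraction to a subset of domains; the recognized names are exactly the ones checked in _extract_features_from_window ('time_domain', 'spatial', 'frequency', 'statistical', 'wavelet'):

from pipeline_classes import ExtractFeatures

fx = ExtractFeatures(window_length=60, window_step_size=20, data_frequency=25,
                     selected_domains=["time_domain", "frequency"],  # order does not matter
                     include_magnitude=True,
                     features_label_columns=["valence", "arousal"])
# features = fx.fit_transform(combined_df)  # combined_df: output of CreateCombinedDataFrame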
pipeline_classes/lowpassfilter.py ADDED
@@ -0,0 +1,55 @@
+ import numpy as np
+ import pandas as pd
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from scipy.signal import butter, filtfilt
+
+ class LowPassFilter(BaseEstimator, TransformerMixin):
+     def __init__(self, cutoff_frequency, sampling_rate, order):
+         """
+         Initialize the LowPassFilter class.
+
+         Parameters:
+         - cutoff_frequency: The cutoff frequency for the low-pass filter (e.g., 10 Hz).
+         - sampling_rate: The sampling rate of the accelerometer data (e.g., 25 Hz).
+         - order: The order of the filter (e.g., 4).
+         """
+         self.cutoff_frequency = cutoff_frequency
+         self.sampling_rate = sampling_rate
+         self.order = order
+
+     def _butter_lowpass_filter(self, data):
+         """
+         Apply a Butterworth low-pass filter to the data.
+
+         Parameters:
+         - data: A NumPy array containing the accelerometer data to be filtered.
+
+         Returns:
+         - A filtered NumPy array.
+         """
+         nyquist = 0.5 * self.sampling_rate
+         normalized_cutoff = self.cutoff_frequency / nyquist
+         b, a = butter(self.order, normalized_cutoff, btype='low', analog=False)
+         filtered_data = filtfilt(b, a, data, axis=0)
+         return filtered_data
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X):
+         """
+         Apply the low-pass filter to the accelerometer data.
+
+         Parameters:
+         - X: A DataFrame with 'x', 'y', and 'z' columns representing the accelerometer data.
+
+         Returns:
+         - The DataFrame with filtered 'x', 'y', and 'z' columns.
+         """
+         if 'x' in X.columns and 'y' in X.columns and 'z' in X.columns:
+             X[['x', 'y', 'z']] = self._butter_lowpass_filter(X[['x', 'y', 'z']].values)
+             print("Low-pass filter applied successfully.")
+         else:
+             raise ValueError("The input DataFrame must contain 'x', 'y', and 'z' columns.")
+
+         return X
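Two properties of this implementation worth noting: the cutoff is normalized by the Nyquist frequency (so cutoff_frequency must stay below sampling_rate / 2), and filtfilt runs the filter forward and backward, giving a zero-phase result. A standalone sketch with synthetic data:

import numpy as np
import pandas as pd
from pipeline_classes import LowPassFilter

t = np.arange(0, 10, 1 / 25)  # 10 s of synthetic data at 25 Hz
df = pd.DataFrame({"x": np.sin(2 * np.pi * 1 * t),   # 1 Hz component (kept)
                   "y": np.sin(2 * np.pi * 12 * t),  # 12 Hz component (attenuated)
                   "z": np.zeros_like(t)})
lp = LowPassFilter(cutoff_frequency=10, sampling_rate=25, order=4)  # normalized cutoff = 10 / 12.5 = 0.8
filtered = lp.fit(df).transform(df)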
pipeline_classes/pcahandler.py ADDED
@@ -0,0 +1,24 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.decomposition import PCA
+ from _config import config
+
+ class PCAHandler(BaseEstimator, TransformerMixin):
+     def __init__(self, apply_pca=False, variance=0.95):
+         self.apply_pca = apply_pca
+         self.variance = variance  # float in (0, 1): fraction of variance the retained components must explain
+         self.pca = None
+
+     def fit(self, X, y=None):
+         if self.apply_pca:
+             self.pca = PCA(n_components=self.variance)  # a float n_components selects the component count by explained variance
+             self.pca.fit(X)
+         return self
+
+     def transform(self, X):
+         if self.apply_pca and self.pca:
+             X_transformed = self.pca.transform(X)
+             return pd.DataFrame(X_transformed, index=X.index)
+
+         return X
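Passing a float to PCA's n_components is the scikit-learn idiom for "keep the smallest number of components whose cumulative explained variance reaches that fraction". A sketch with random data:

import numpy as np
import pandas as pd
from pipeline_classes import PCAHandler

X = pd.DataFrame(np.random.default_rng(0).normal(size=(100, 10)))
handler = PCAHandler(apply_pca=True, variance=0.95)
reduced = handler.fit(X).transform(X)
print(handler.pca.n_components_)                     # number of retained components
print(handler.pca.explained_variance_ratio_.sum())   # >= 0.95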
pipeline_classes/scale_xyzdata.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import pandas as pd
+ import numpy as np
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.preprocessing import StandardScaler, MinMaxScaler
+ from _config import config
+
+ class ScaleXYZData(BaseEstimator, TransformerMixin):
+     def __init__(self, scaler_type='standard'):
+         self.scaler_type = scaler_type
+
+     def fit(self, X, y=None):
+         return self
+
+     def transform(self, X):
+         columns_to_scale = ['x', 'y', 'z']
+         # Scale the columns using StandardScaler or MinMaxScaler
+         if self.scaler_type == 'standard':
+             scaler = StandardScaler()
+         elif self.scaler_type == 'minmax':
+             scaler = MinMaxScaler()
+         elif self.scaler_type == 'none':
+             return X  # Return the DataFrame without scaling
+         else:
+             # Raise an error if scaler_type is invalid
+             raise ValueError("Invalid scaler_type. Expected 'standard', 'minmax', or 'none'.")
+         scaled_columns = scaler.fit_transform(X[columns_to_scale])
+         scaled_df = pd.DataFrame(scaled_columns, columns=columns_to_scale, index=X.index)
+         X[columns_to_scale] = scaled_df
+         print("Data scaled successfully.")
+         return X
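A short usage sketch (the readings are made up):

    import pandas as pd
    from pipeline_classes import ScaleXYZData

    df = pd.DataFrame({'x': [0.1, 0.4, 0.2],
                       'y': [9.6, 9.8, 9.7],
                       'z': [0.0, 0.1, -0.1]})

    # 'standard' -> zero mean / unit variance; 'minmax' -> [0, 1]; 'none' -> passthrough
    scaled = ScaleXYZData(scaler_type='standard').fit_transform(df)
    print(scaled)

One caveat: the scaler is fit inside transform, so every call re-fits on the data it receives; train and inference data are therefore scaled independently rather than with shared statistics.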
pipeline_classes/train_model.py ADDED
@@ -0,0 +1,195 @@
+ import pandas as pd
+ import numpy as np
+ from sklearn.base import BaseEstimator, TransformerMixin
+ from sklearn.model_selection import StratifiedGroupKFold
+ from skopt import BayesSearchCV
+ from sklearn.ensemble import RandomForestClassifier
+ from sklearn.svm import SVC
+ from xgboost import XGBClassifier
+ import joblib
+ from skopt.space import Real, Integer, Categorical
+ from sklearn.metrics import classification_report, accuracy_score
+ import json
+ from sklearn.preprocessing import LabelEncoder
+
+ class TrainModel(BaseEstimator, TransformerMixin):
+     def __init__(self, classifier, train_label, target):
+         self.classifier = classifier
+         self.train_label = train_label  # comma-separated label columns, including the target
+         self.target = target
+         self.label_encoder = LabelEncoder()
+
+     def get_default_param_space(self, classifier):
+         """Return the default hyperparameter search space for a given classifier."""
+         if classifier == 'xgboost':
+             return {
+                 'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
+                 'n_estimators': Integer(100, 1000),
+                 'max_depth': Integer(3, 10),
+                 'min_child_weight': Integer(1, 10),
+                 'subsample': Real(0.5, 1.0),
+                 'colsample_bytree': Real(0.5, 1.0),
+                 'gamma': Real(0, 10),
+                 'reg_alpha': Real(0, 10),
+                 'reg_lambda': Real(0, 10),
+             }
+         elif classifier == 'svm':
+             return {
+                 'C': Real(0.1, 10, prior='log-uniform'),
+                 'kernel': Categorical(['linear', 'rbf'])
+             }
+         elif classifier == 'randomforest':
+             return {
+                 'n_estimators': Integer(100, 1000),
+                 'max_depth': Integer(3, 10)
+             }
+         else:
+             raise ValueError(f"Unsupported classifier type: {classifier}")
+
+     def fit(self, X, y=None):
+         # Ensure the target column exists in the dataset
+         if self.target not in X.columns:
+             raise ValueError(f"Target label '{self.target}' not found in the dataset.")
+
+         # Fit the label encoder on the target column
+         print(f"Encoding the target labels for '{self.target}'...")
+         self.label_encoder.fit(X[self.target])
+
+         # Print the mapping between original labels and encoded labels
+         original_labels = list(self.label_encoder.classes_)
+         encoded_labels = list(range(len(original_labels)))
+         label_mapping = dict(zip(encoded_labels, original_labels))
+         print(f"Label encoding complete. Mapping: {label_mapping}")
+
+         # Transform the target column and add it as 'encoded_target'
+         X['encoded_target'] = self.label_encoder.transform(X[self.target])
+
+         # Value counts for the encoded target (cast to plain ints so they are JSON-serializable)
+         value_counts = {int(k): int(v) for k, v in X['encoded_target'].value_counts().items()}
+         print(f"Value counts for encoded target: {value_counts}")
+
+         # Pop the group IDs, which drive the grouped cross-validation and are not features
+         groups = X.pop('groupid')
+         print("Group IDs popped from the dataset.")
+
+         # Pop the label columns which aren't used as features; train_label is expected to
+         # include the target column as well, since its information now lives in 'encoded_target'
+         for label in self.train_label.split(","):
+             X.pop(label)
+         print("Label columns popped from the dataset.")
+
+         # Pop the encoded target as y
+         y = X.pop('encoded_target')
+         print("Encoded target column popped from the dataset.")
+
+         # Store the feature names for later use
+         feature_names = X.columns.tolist()
+
+         # Choose classifier
+         classifier = self.classifier
+         if classifier == 'xgboost':
+             model = XGBClassifier(objective='multi:softmax', random_state=42)
+         elif classifier == 'svm':
+             model = SVC(probability=True)
+         elif classifier == 'randomforest':
+             model = RandomForestClassifier(random_state=42)
+         else:
+             raise ValueError(f"Unsupported classifier type: {classifier}")
+
+         print(f"Training the model using {classifier}...")
+
+         # Use the default parameter space for the chosen classifier
+         param_space = self.get_default_param_space(classifier)
+         print(f"Parameter space being used: {param_space}")
+
+         # Hyperparameter tuning using Bayesian optimization with stratified, grouped folds
+         sgkf = StratifiedGroupKFold(n_splits=5)
+         opt = BayesSearchCV(
+             estimator=model,
+             search_spaces=param_space,
+             cv=sgkf,
+             n_iter=5,
+             n_jobs=-1,
+             n_points=1,
+             verbose=1,
+             scoring='accuracy'
+         )
+
+         print("Hyperparameter tuning in progress...")
+
+         # Fit the model using the encoded target
+         opt.fit(X, y, groups=groups)
+         self.best_model = opt.best_estimator_
+         print(f"Best parameters found: {opt.best_params_}")
+
+         # Classification metrics (note: computed on the training data, so they are optimistic)
+         y_pred = self.best_model.predict(X)
+         accuracy = accuracy_score(y, y_pred)
+         report = classification_report(y, y_pred, target_names=self.label_encoder.classes_, output_dict=True)
+
+         # Save classification report
+         with open(f'classification_report_{self.target}.json', 'w') as f:
+             json.dump(report, f, indent=4)
+
+         print(f"Accuracy: {accuracy}")
+         print(f"Classification Report:\n{report}")
+
+         # Save the best model with the target label in the file name
+         model_name = f"{classifier}_best_model_{self.target}.pkl"
+         joblib.dump(self.best_model, model_name)
+         print("Model saved successfully.")
+
+         # Save model metadata (cast NumPy scalars to native Python types for JSON)
+         model_metadata = {
+             "best_params": {k: (v.item() if hasattr(v, 'item') else v) for k, v in opt.best_params_.items()},
+             "accuracy": float(accuracy),
+             "classification_report": report,
+             "label_mapping": label_mapping,
+             "model_name": model_name,
+             "value_counts": value_counts,
+         }
+
+         if hasattr(self.best_model, "feature_importances_"):
+             feature_importances = self.best_model.feature_importances_
+             # Convert feature importances to native Python floats
+             feature_importance_dict = {feature: float(importance) for feature, importance in zip(feature_names, feature_importances)}
+             model_metadata["feature_importances"] = feature_importance_dict
+             print("Feature Importances:")
+             for feature, importance in feature_importance_dict.items():
+                 print(f"{feature}: {importance:.4f}")
+
+         # Save metadata with the target name in the file name
+         metadata_file = f"{classifier}_model_metadata_{self.target}.json"
+         with open(metadata_file, "w") as f:
+             json.dump(model_metadata, f, indent=4)
+         print(f"Model metadata saved to {metadata_file}.")
+
+         # Save file paths internally for later retrieval
+         self.model_file = model_name
+         self.metadata_file = metadata_file
+
+         return self
+
+     def get_output_files(self):
+         return self.model_file, self.metadata_file
+
+     def transform(self, X):
+         return X  # Placeholder for transform step (not needed for training)
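A minimal sketch of how TrainModel might be driven (the CSV path and the 'angriness'/'happiness' label columns are hypothetical, and it assumes TrainModel is exported from pipeline_classes like the other classes; the expected input is a feature table with a 'groupid' column plus the comma-separated label columns passed as train_label, including the target):

    import pandas as pd
    from pipeline_classes import TrainModel

    # Hypothetical feature table with extracted features, a 'groupid'
    # participant column, and the self-report label columns
    features_df = pd.read_csv('features.csv')

    trainer = TrainModel(
        classifier='randomforest',
        train_label='angriness,happiness',  # all label columns, target included
        target='angriness',                 # the label to predict
    )
    trainer.fit(features_df)

    model_file, metadata_file = trainer.get_output_files()
    print('Artifacts written:', model_file, metadata_file)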
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ pandas==2.2.3
+ numpy==2.1.2
+ scikit-learn==1.5.2
+ scikit-optimize==0.10.2
+ xgboost==2.1.1
+ joblib==1.4.2
+ PyWavelets==1.7.0
+ scipy==1.14.1
+ gradio==5.8.0