Upload 34 files
- 01_combining_dataframes_pipeline.py +26 -0
- 02_feature_extraction_pipeline.py +27 -0
- 03_training_model_pipeline.py +21 -0
- 04_analyzing_data_pipeline.py +24 -0
- 05_complete_trainmodel_pipeline.py +30 -0
- LICENSE +21 -0
- _config.py +57 -0
- app.py +342 -0
- pipeline_classes/__init__.py +10 -0
- pipeline_classes/__pycache__/__init__.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/__init__.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/classify_movementdata.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/classify_movementdata.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/create_combineddataframe.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/create_combineddataframe.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/extract_features.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/extract_features.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/import_data.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/lowpassfilter.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/lowpassfilter.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/pcahandler.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/pcahandler.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/scale_xyzdata.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/scale_xyzdata.cpython-313.pyc +0 -0
- pipeline_classes/__pycache__/train_model.cpython-310.pyc +0 -0
- pipeline_classes/__pycache__/train_model.cpython-313.pyc +0 -0
- pipeline_classes/classify_movementdata.py +40 -0
- pipeline_classes/create_combineddataframe.py +103 -0
- pipeline_classes/extract_features.py +272 -0
- pipeline_classes/lowpassfilter.py +55 -0
- pipeline_classes/pcahandler.py +24 -0
- pipeline_classes/scale_xyzdata.py +28 -0
- pipeline_classes/train_model.py +195 -0
- requirements.txt +9 -0
01_combining_dataframes_pipeline.py
ADDED (+26 lines)

from sklearn.pipeline import Pipeline
from pipeline_classes import CreateCombinedDataFrame
from _config import config
import time
import pandas as pd

accel_data = pd.read_csv(config["accel_path"])
reports_data = pd.read_csv(config["reports_path"])

X = (reports_data, accel_data)

# This pipeline combines the self-report and accelerometer dataframes within a given time window
# into a single dataframe and exports it as a CSV file
combining_dataframes_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),  # input path to self-reports data
    ('create_combined_dataframe', CreateCombinedDataFrame(time_window=config["time_window"], label_columns=config["label_columns"])),
])

# Measure the time taken to run the pipeline
start_time = time.time()

# Run the pipeline; returns the combined dataframe
output_df = combining_dataframes_pipeline.fit_transform(X)

end_time = time.time()
print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
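Each of the runner scripts below repeats this same start/end timing pattern. A small shared helper could factor it out; this is a hypothetical sketch (the `timed` helper is not part of the upload):

import time
from contextlib import contextmanager

@contextmanager
def timed(label="Pipeline"):
    # Prints elapsed minutes and seconds when the with-block exits
    start = time.time()
    yield
    elapsed = time.time() - start
    print(f"{label} took {int(elapsed // 60)} minutes and {elapsed % 60:.2f} seconds")

# Usage:
#   with timed("Combining dataframes"):
#       output_df = combining_dataframes_pipeline.fit_transform(X)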
02_feature_extraction_pipeline.py
ADDED (+27 lines)

from sklearn.pipeline import Pipeline
from pipeline_classes import ImportData, LowPassFilter, ScaleXYZData, ExtractFeatures
from _config import config
import time

# This pipeline extracts features from the combined dataframe and exports them to a CSV file
feature_extraction_pipeline = Pipeline([
    ('import_data', ImportData(use_accel=False, use_reports=False, use_combined=True, use_features=False)),  # input path to combined data
    ('low_pass_filter', LowPassFilter(cutoff_frequency=config["cutoff_frequency"], sampling_rate=config["data_frequency"], order=config["order"])),
    ('scale_xyz_data', ScaleXYZData(scaler_type=config["scaler_type"])),
    ('extract_features', ExtractFeatures(window_length=config["window_length"],
                                         window_step_size=config["window_step_size"],
                                         data_frequency=config["data_frequency"],
                                         selected_domains=config["selected_domains"],
                                         include_magnitude=config["include_magnitude"],
                                         features_label_columns=config["label_columns"])),  # keyword name per ExtractFeatures.__init__
])

# Measure the time taken to run the pipeline
start_time = time.time()

# Run the pipeline; returns the feature dataframe
output_df = feature_extraction_pipeline.fit_transform(None)

end_time = time.time()
print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
03_training_model_pipeline.py
ADDED (+21 lines)

from sklearn.pipeline import Pipeline
from pipeline_classes import ImportData, PCAHandler, TrainModel
from _config import config
import time

# This pipeline trains a model on the feature dataframe, exports the model to a pickle file, and writes general information to a JSON file
training_model_pipeline = Pipeline([
    ('import_data', ImportData(use_accel=False, use_reports=False, use_combined=False, use_features=True)),
    ('pca_handler', PCAHandler(apply_pca=config["apply_pca"], variance=config["pca_variance"])),
    ('train_model', TrainModel(config=config)),
])

# Measure the time taken to run the pipeline
start_time = time.time()

# Run the pipeline; returns the model and a report
output_df = training_model_pipeline.fit_transform(None)

end_time = time.time()
print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
04_analyzing_data_pipeline.py
ADDED (+24 lines)

from sklearn.pipeline import Pipeline
from pipeline_classes import ImportData, LowPassFilter, ScaleXYZData, ExtractFeatures, ClassifyMovementData
from _config import config
import time

# This pipeline analyzes data that hasn't been classified yet and exports the classified dataframe as a CSV file
analyzing_data_pipeline = Pipeline([
    ('import_data', ImportData(use_accel=True, use_reports=False, use_combined=False, use_features=False)),  # input path to accelerometer data
    ('low_pass_filter', LowPassFilter(cutoff_frequency=config["cutoff_frequency"], sampling_rate=config["data_frequency"], order=config["order"])),
    ('scale_xyz_data', ScaleXYZData(scaler_type=config["scaler_type"])),
    ('extract_features', ExtractFeatures(window_length=config["window_length"],
                                         window_step_size=config["window_step_size"],
                                         data_frequency=config["data_frequency"],
                                         selected_domains=config["selected_domains"],
                                         include_magnitude=config["include_magnitude"])),
    ('classify_movement_data', ClassifyMovementData()),  # a model_file must be supplied (e.g. via set_params) before transform, or it raises ValueError
])

# Measure the time taken to run the pipeline
start_time = time.time()

# Run the pipeline; returns the classified dataframe
output_df = analyzing_data_pipeline.fit_transform(None)

end_time = time.time()
print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
05_complete_trainmodel_pipeline.py
ADDED (+30 lines)

from pipeline_classes import ImportData, LowPassFilter, ScaleXYZData, ExtractFeatures, CreateCombinedDataFrame, TrainModel, PCAHandler
from _config import config
from sklearn.pipeline import Pipeline
import time

# This is the complete pipeline: it trains a model on the combined dataframe, exports the model to a pickle file, and writes general information to a JSON file
complete_training_model_pipeline = Pipeline([
    ('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),
    ('create_combined_dataframe', CreateCombinedDataFrame(time_window=config["time_window"], label_columns=config["label_columns"])),
    ('low_pass_filter', LowPassFilter(cutoff_frequency=config["cutoff_frequency"], sampling_rate=config["data_frequency"], order=config["order"])),
    ('scale_xyz_data', ScaleXYZData(scaler_type=config["scaler_type"])),
    ('extract_features', ExtractFeatures(window_length=config["window_length"],
                                         window_step_size=config["window_step_size"],
                                         data_frequency=config["data_frequency"],
                                         selected_domains=config["selected_domains"],
                                         include_magnitude=config["include_magnitude"],
                                         features_label_columns=config["label_columns"])),
    ('pca_handler', PCAHandler(apply_pca=config["apply_pca"], variance=config["pca_variance"])),
    ('train_model', TrainModel(config=config)),
])

# Measure the time taken to run the pipeline
start_time = time.time()

# Run the pipeline; returns the model and a report
output_df = complete_training_model_pipeline.fit_transform(None)

end_time = time.time()
print(f"Time taken: {int((end_time - start_time) // 60)} minutes and {(end_time - start_time) % 60:.2f} seconds")
LICENSE
ADDED (+21 lines)

MIT License

Copyright (c) 2024 mininato

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
_config.py
ADDED (+57 lines)

# Configuration file for the pipeline

config = {
    # Paths for Import Data
    "accel_path": "/Users/anhducduong/Documents/GitHub/EmotionRecognitionPipeline/EmotionRecognitionPipeline/AccelerometerMeasurements_backup.csv",  # Path to the accelerometer data
    "reports_path": "/Users/anhducduong/Documents/GitHub/EmotionRecognitionPipeline/EmotionRecognitionPipeline/UserTestingSelfReports.csv",  # Path to the self-reports data
    #"combined_data_path": "Path or Name of File of Combined Data File",  # Path to the combined data
    #"features_data_path": "Path or Name of File of Features Data File",  # Path to the features data
    #"model_path": "Path or Name of Trained Model File",  # Path to the trained model

    # Label Configuration
    "label_columns": ["valence", "arousal"],  # The emotion labels you are using
    "target_label": "arousal",  # The target label to predict (only one label can be selected)

    # Configuration for combined data
    "time_window": 3,  # Minutes before and after the self-report

    # Configuration for feature extraction
    "window_length": 60,  # Window length in seconds
    "window_step_size": 20,  # Step size in seconds; typically 10%-50% of window_length
    "data_frequency": 25,  # Data frequency in Hz
    "selected_domains": None,  # None = every domain; options: 'time_domain', 'spatial', 'frequency', 'statistical', 'wavelet'; multiple domains e.g. ["time_domain", "frequency"], order is not important
    "include_magnitude": True,  # Whether to include magnitude-based features

    # Configuration for low-pass filter
    "cutoff_frequency": 10,  # Cut-off frequency for the low-pass filter
    "order": 4,  # Order of the filter

    # Configuration for scaling
    "scaler_type": "standard",  # Possible scalers: 'standard' or 'minmax'

    # Configuration for PCA
    "apply_pca": False,  # Apply PCA or not
    "pca_variance": 0.95,  # PCA variance threshold

    # Configuration for model training
    "classifier": "xgboost",  # Default classifier ('xgboost', 'svm', 'randomforest')

    # Configuration for hyperparameter tuning
    "n_splits": 5,  # Number of splits for cross-validation
    "n_iter": 30,  # Number of iterations for hyperparameter tuning
    "n_jobs": -1,  # Number of jobs for parallel processing
    "n_points": 1,  # Number of points to sample in the hyperparameter space

    # Users who want a custom param_space can specify it here
    "param_space": {
        "learning_rate": (0.05, 0.2),
        "n_estimators": (200, 800),
        "max_depth": (4, 8),
        "min_child_weight": (1, 5),
        "subsample": (0.6, 0.9),
        "colsample_bytree": (0.6, 0.9),
        "gamma": (0, 5),
        "reg_alpha": (0, 5),
        "reg_lambda": (0, 5)
    },  # Set to None to use the defaults inside the TrainModel class
}
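Since `config` is a plain dictionary, a script can override individual settings before building a pipeline. A minimal sketch (the override values here are illustrative only):

from _config import config

config["time_window"] = 5  # widen the window around each self-report
config["selected_domains"] = ["time_domain", "frequency"]  # restrict feature domains
# Any pipeline built after this point picks up the overridden values via config[...]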
app.py
ADDED (+342 lines)

import gradio as gr
from pipeline_classes import CreateCombinedDataFrame, ScaleXYZData, ExtractFeatures, TrainModel, ClassifyMovementData, LowPassFilter, PCAHandler
from sklearn.pipeline import Pipeline
from _config import config
import pandas as pd
import numpy as np
import joblib
import json


# Define pipelines; step parameters are placeholders that are filled in via set_params at run time
combining_dataframes_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),
    ('create_combined_dataframe', CreateCombinedDataFrame(time_window=None, label_columns=None)),
])

feature_extraction_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=False, use_reports=False, use_combined=True, use_features=False)),
    ('low_pass_filter', LowPassFilter(cutoff_frequency=None, sampling_rate=None, order=None)),
    ('scale_xyz_data', ScaleXYZData(scaler_type=None)),
    ('extract_features', ExtractFeatures(window_length=None,
                                         window_step_size=None,
                                         data_frequency=None,
                                         selected_domains=None,
                                         include_magnitude=None,
                                         features_label_columns=None)),
])

training_model_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=False, use_reports=False, use_combined=False, use_features=True)),
    ('pca_handler', PCAHandler(apply_pca=None, variance=None)),
    ('train_model', TrainModel(classifier=None, train_label=None, target=None)),
])

analyzing_data_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=True, use_reports=False, use_combined=False, use_features=False)),
    ('low_pass_filter', LowPassFilter(cutoff_frequency=None, sampling_rate=None, order=None)),
    ('scale_xyz_data', ScaleXYZData(scaler_type=None)),
    ('extract_features', ExtractFeatures(window_length=None,
                                         window_step_size=None,
                                         data_frequency=None,
                                         selected_domains=None,
                                         include_magnitude=None,
                                         features_label_columns=None)),
    ('classify_movement_data', ClassifyMovementData(model_file=None)),
])

complete_training_model_pipeline = Pipeline([
    #('import_data', ImportData(use_accel=True, use_reports=True, use_combined=False, use_features=False)),
    ('create_combined_dataframe', CreateCombinedDataFrame(time_window=None, label_columns=None)),
    ('low_pass_filter', LowPassFilter(cutoff_frequency=None, sampling_rate=None, order=None)),
    ('scale_xyz_data', ScaleXYZData(scaler_type=None)),
    ('extract_features', ExtractFeatures(window_length=None,
                                         window_step_size=None,
                                         data_frequency=None,
                                         selected_domains=None,
                                         include_magnitude=None,
                                         features_label_columns=None)),
    ('pca_handler', PCAHandler(apply_pca=None, variance=None)),
    ('train_model', TrainModel(classifier=None, train_label=None, target=None)),
])

def execute_combine_pipeline(accel_file, report_file, time_window=None, label_columns=None):
    try:
        # Load data files only if paths are valid
        accel_data = pd.read_csv(accel_file) if accel_file else None
        report_data = pd.read_csv(report_file) if report_file else None

        # Validate inputs for the selected pipeline
        if accel_data is None or report_data is None:
            return "Error: Both accelerometer and self-report data files are required for this pipeline."
        combining_dataframes_pipeline.set_params(
            create_combined_dataframe__time_window=time_window,
            create_combined_dataframe__label_columns=label_columns.split(','))
        X = report_data, accel_data
        result = combining_dataframes_pipeline.fit_transform(X)
        output_file = "combine_dataframes_output.csv"
        result.to_csv(output_file, index=False)

        return output_file

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return str(e)


def execute_feature_extraction_pipeline(combined_file, cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns):
    try:
        combined_data = pd.read_csv(combined_file) if combined_file else None
        if combined_data is None:
            return "Error: Combined data file is required for this pipeline."

        feature_extraction_pipeline.set_params(
            low_pass_filter__cutoff_frequency=cutoff_frequency,
            low_pass_filter__order=order,
            low_pass_filter__sampling_rate=data_frequency,
            scale_xyz_data__scaler_type=scaler_type,
            extract_features__window_length=window_length,
            extract_features__window_step_size=window_step_size,
            extract_features__data_frequency=data_frequency,
            #extract_features__selected_domains=None,
            extract_features__include_magnitude=include_magnitude,
            extract_features__features_label_columns=features_label_columns.split(','))
        result = feature_extraction_pipeline.fit_transform(combined_data)
        output_file = "extract_features_output.csv"
        result.to_csv(output_file, index=False)
        return output_file

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return str(e)

def execute_training_pipeline(features_file, apply_pca, pca_variance, classifier, train_label, target):
    try:
        print(f"features_file: {features_file}")
        features_data = pd.read_csv(features_file) if features_file else None
        if features_data is None:
            return "Error: Features data file is required for this pipeline.", None

        training_model_pipeline.set_params(
            pca_handler__apply_pca=apply_pca,
            pca_handler__variance=pca_variance,
            train_model__classifier=classifier,
            train_model__train_label=train_label,
            train_model__target=target)

        X = features_data
        training_model_pipeline.fit(X)
        output_file, secondary_output_file = training_model_pipeline.named_steps['train_model'].get_output_files()
        return output_file, secondary_output_file

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return str(e), None

def execute_analyze_pipeline(accel_file, model_file, cutoff_frequency, order, scaler_type, window_length, data_frequency, include_magnitude, features_label_columns):
    try:
        accel_data = pd.read_csv(accel_file) if accel_file else None
        if accel_data is None:
            return "Error: Accelerometer data file is required for this pipeline."

        analyzing_data_pipeline.set_params(
            low_pass_filter__cutoff_frequency=cutoff_frequency,
            low_pass_filter__order=order,
            low_pass_filter__sampling_rate=data_frequency,
            scale_xyz_data__scaler_type=scaler_type,
            extract_features__window_length=window_length,
            extract_features__window_step_size=window_length,  # no step-size input on this tab, so windows do not overlap
            extract_features__data_frequency=data_frequency,
            #extract_features__selected_domains=None,
            extract_features__include_magnitude=include_magnitude,
            extract_features__features_label_columns=features_label_columns.split(','),
            classify_movement_data__model_file=model_file.name
        )

        result = analyzing_data_pipeline.fit_transform(accel_data)
        output_file = "analyze_data_output.csv"
        result.to_csv(output_file, index=False)
        return output_file

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return str(e)

def execute_complete_training_pipeline(accel_file, report_file, time_window, label_columns,
                                       cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
                                       apply_pca, pca_variance, classifier, train_label, target):
    try:
        accel_data = pd.read_csv(accel_file) if accel_file else None
        report_data = pd.read_csv(report_file) if report_file else None
        if accel_data is None or report_data is None:
            return "Error: Both accelerometer and self-report data files are required for this pipeline.", None

        complete_training_model_pipeline.set_params(
            create_combined_dataframe__time_window=time_window,
            create_combined_dataframe__label_columns=label_columns.split(','),
            low_pass_filter__cutoff_frequency=cutoff_frequency,
            low_pass_filter__order=order,
            low_pass_filter__sampling_rate=data_frequency,
            scale_xyz_data__scaler_type=scaler_type,
            extract_features__window_length=window_length,
            extract_features__window_step_size=window_step_size,
            extract_features__data_frequency=data_frequency,
            #extract_features__selected_domains=None,
            extract_features__include_magnitude=include_magnitude,
            extract_features__features_label_columns=label_columns.split(','),
            pca_handler__apply_pca=apply_pca,
            pca_handler__variance=pca_variance,
            train_model__classifier=classifier,
            train_model__train_label=label_columns,
            train_model__target=target
        )
        X = report_data, accel_data
        complete_training_model_pipeline.fit(X)
        output_file, secondary_output_file = complete_training_model_pipeline.named_steps['train_model'].get_output_files()
        return output_file, secondary_output_file

    except Exception as e:
        print(f"Error occurred: {str(e)}")
        return str(e), None

# Gradio Blocks interface
with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Combine DataFrames"):
            accel_file = gr.File(label="Upload Accelerometer Data")
            report_file = gr.File(label="Upload Self-Report Data")
            time_window = gr.Number(label="Time Window (minutes)", value=2)
            label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
            combine_button = gr.Button("Combine DataFrames")
            combine_output = gr.File(label="Download Combined DataFrame")

            def combine_dataframes(accel_file, report_file, time_window, label_columns):
                return execute_combine_pipeline(accel_file, report_file, time_window, label_columns)

            combine_button.click(combine_dataframes, inputs=[accel_file, report_file, time_window, label_columns], outputs=combine_output)

        with gr.TabItem("Extract Features"):
            combined_file = gr.File(label="Upload Combined Data")

            cutoff_frequency = gr.Number(label="Cutoff Frequency (Hz)", value=10)
            order = gr.Number(label="Order", value=4)

            scaler_type = gr.Radio(label="Scaler Type", choices=["standard", "minmax"])

            window_length = gr.Number(label="Window Length (seconds)", value=60)
            window_step_size = gr.Number(label="Window Step Size (seconds)", value=20)
            data_frequency = gr.Number(label="Data Frequency (Hz)", value=25)

            #selected_domains = gr.Textbox(label="Only these domains (comma-separated); leave out to use all", value=None)
            include_magnitude = gr.Checkbox(label="Include Magnitude", value=True)
            features_label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")

            extract_button = gr.Button("Extract Features")
            extract_output = gr.File(label="Download Extracted Features")

            def extract_features(combined_file, cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns):
                return execute_feature_extraction_pipeline(combined_file,
                                                           cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency,
                                                           include_magnitude, features_label_columns)

            extract_button.click(extract_features, inputs=[combined_file, cutoff_frequency, order, scaler_type, window_length, window_step_size,
                                                           data_frequency, include_magnitude, features_label_columns], outputs=extract_output)

        with gr.TabItem("Train Model"):
            features_file = gr.File(label="Upload Features Data")

            apply_pca = gr.Checkbox(label="Apply PCA", value=False)
            pca_variance = gr.Number(label="PCA Variance", value=0.95)
            classifier = gr.Dropdown(label="Classifier", choices=["xgboost", "svm", "randomforest"], value="xgboost")
            train_label = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
            target = gr.Textbox(label="Target Label", value="arousal")

            train_button = gr.Button("Train Model")
            train_output_json = gr.File(label="Download Model JSON")
            train_output_pkl = gr.File(label="Download Model PKL")

            def train_model(features_file, apply_pca, pca_variance, classifier, train_label, target):
                return execute_training_pipeline(features_file, apply_pca, pca_variance, classifier, train_label, target)

            train_button.click(train_model, inputs=[features_file, apply_pca, pca_variance, classifier, train_label, target], outputs=[train_output_json, train_output_pkl])

        with gr.TabItem("Analyze Data"):
            accel_file = gr.File(label="Upload Accelerometer Data")
            model_file = gr.File(label="Upload Model")

            cutoff_frequency = gr.Number(label="Cutoff Frequency (Hz)", value=10)
            order = gr.Number(label="Order", value=4)

            scaler_type = gr.Radio(label="Scaler Type", choices=["standard", "minmax"])

            window_length = gr.Number(label="Window Length (seconds)", value=60)
            data_frequency = gr.Number(label="Data Frequency (Hz)", value=25)

            #selected_domains = gr.Textbox(label="Only these domains (comma-separated); leave out to use all", value=None)
            include_magnitude = gr.Checkbox(label="Include Magnitude", value=True)
            features_label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")

            analyze_button = gr.Button("Analyze Data")
            analyze_output = gr.File(label="Download Analyzed Data")

            def analyze_data(accel_file, model_file, cutoff_frequency, order, scaler_type, window_length, data_frequency, include_magnitude, features_label_columns):
                return execute_analyze_pipeline(accel_file, model_file, cutoff_frequency, order, scaler_type, window_length,
                                                data_frequency, include_magnitude, features_label_columns)

            analyze_button.click(analyze_data, inputs=[accel_file, model_file, cutoff_frequency, order, scaler_type, window_length,
                                                       data_frequency, include_magnitude, features_label_columns], outputs=analyze_output)

        with gr.TabItem("Complete Train Model"):
            accel_file = gr.File(label="Upload Accelerometer Data")
            report_file = gr.File(label="Upload Self-Report Data")

            time_window = gr.Number(label="Time Window (minutes)", value=2)
            label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")

            cutoff_frequency = gr.Number(label="Cutoff Frequency (Hz)", value=10)
            order = gr.Number(label="Order", value=4)

            scaler_type = gr.Radio(label="Scaler Type", choices=["standard", "minmax"])

            window_length = gr.Number(label="Window Length (seconds)", value=60)
            window_step_size = gr.Number(label="Window Step Size (seconds)", value=20)
            data_frequency = gr.Number(label="Data Frequency (Hz)", value=25)

            include_magnitude = gr.Checkbox(label="Include Magnitude", value=True)
            #features_label_columns = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")

            apply_pca = gr.Checkbox(label="Apply PCA", value=False)
            pca_variance = gr.Number(label="PCA Variance", value=0.95)
            classifier = gr.Dropdown(label="Classifier", choices=["xgboost", "svm", "randomforest"], value="xgboost")
            #train_label = gr.Textbox(label="Label Columns (comma-separated)", value="valence,arousal")
            target = gr.Textbox(label="Target Label", value="arousal")

            complete_train_button = gr.Button("Complete Train Model")

            complete_train_output_pkl = gr.File(label="Download Model PKL")
            complete_train_output_json = gr.File(label="Download Model JSON")

            # Note: features_label_columns and train_label below refer to the textboxes defined
            # on earlier tabs, since this tab's own copies are commented out
            def complete_train_model(accel_file, report_file, time_window, label_columns,
                                     cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
                                     apply_pca, pca_variance, classifier, train_label, target):
                return execute_complete_training_pipeline(accel_file, report_file, time_window, label_columns,
                                                          cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
                                                          apply_pca, pca_variance, classifier, train_label, target)

            complete_train_button.click(complete_train_model, inputs=[accel_file, report_file, time_window, label_columns,
                                                                      cutoff_frequency, order, scaler_type, window_length, window_step_size, data_frequency, include_magnitude, features_label_columns,
                                                                      apply_pca, pca_variance, classifier, train_label, target], outputs=[complete_train_output_pkl, complete_train_output_json])


demo.launch()
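By default `demo.launch()` serves on localhost only. If the app should be reachable from other machines, Gradio's standard launch options can be used; a sketch (host and port values are assumptions to adjust per deployment):

# Hypothetical alternative launch: bind to all interfaces on a fixed port
demo.launch(server_name="0.0.0.0", server_port=7860)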
pipeline_classes/__init__.py
ADDED (+10 lines)

# Description: This file is used to import all the classes in the pipeline_classes folder.

# from .import_data import ImportData  # import_data.py is not part of this upload (only its compiled .pyc), so scripts importing ImportData will fail
from .create_combineddataframe import CreateCombinedDataFrame
from .scale_xyzdata import ScaleXYZData
from .extract_features import ExtractFeatures
from .pcahandler import PCAHandler
from .train_model import TrainModel
from .classify_movementdata import ClassifyMovementData
from .lowpassfilter import LowPassFilter
pipeline_classes/__pycache__/__init__.cpython-310.pyc
ADDED (binary file, 563 Bytes)

pipeline_classes/__pycache__/__init__.cpython-313.pyc
ADDED (binary file, 622 Bytes)

pipeline_classes/__pycache__/classify_movementdata.cpython-310.pyc
ADDED (binary file, 1.48 kB)

pipeline_classes/__pycache__/classify_movementdata.cpython-313.pyc
ADDED (binary file, 2.05 kB)

pipeline_classes/__pycache__/create_combineddataframe.cpython-310.pyc
ADDED (binary file, 3.21 kB)

pipeline_classes/__pycache__/create_combineddataframe.cpython-313.pyc
ADDED (binary file, 4.57 kB)

pipeline_classes/__pycache__/extract_features.cpython-310.pyc
ADDED (binary file, 8.93 kB)

pipeline_classes/__pycache__/extract_features.cpython-313.pyc
ADDED (binary file, 18.3 kB)

pipeline_classes/__pycache__/import_data.cpython-313.pyc
ADDED (binary file, 2.77 kB)

pipeline_classes/__pycache__/lowpassfilter.cpython-310.pyc
ADDED (binary file, 2.42 kB)

pipeline_classes/__pycache__/lowpassfilter.cpython-313.pyc
ADDED (binary file, 2.75 kB)

pipeline_classes/__pycache__/pcahandler.cpython-310.pyc
ADDED (binary file, 1.18 kB)

pipeline_classes/__pycache__/pcahandler.cpython-313.pyc
ADDED (binary file, 1.64 kB)

pipeline_classes/__pycache__/scale_xyzdata.cpython-310.pyc
ADDED (binary file, 1.4 kB)

pipeline_classes/__pycache__/scale_xyzdata.cpython-313.pyc
ADDED (binary file, 1.82 kB)

pipeline_classes/__pycache__/train_model.cpython-310.pyc
ADDED (binary file, 5.32 kB)

pipeline_classes/__pycache__/train_model.cpython-313.pyc
ADDED (binary file, 8.5 kB)
pipeline_classes/classify_movementdata.py
ADDED (+40 lines)

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import joblib
from _config import config

# This class is used to classify the movement data using a pre-trained model
class ClassifyMovementData(BaseEstimator, TransformerMixin):
    def __init__(self, model_file=None):
        #self.model_path = model_path if model_path else config.get("model_path")
        self.model_file = model_file
        self.model = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if self.model is None:
            if self.model_file is None:
                raise ValueError("Model file is not provided.")
            try:
                self.model = joblib.load(self.model_file)  # Load the model
            except Exception as e:
                raise ValueError(f"Failed to load the model file: {e}")

        # Assuming `X` is a DataFrame of pre-extracted features
        predictions = self.model.predict(X)

        # Add predictions to the DataFrame as the first column
        X.insert(0, 'predicted_emotion', predictions)

        print("Data classified successfully.")

        # Export the labeled DataFrame to CSV
        #window_length_str = str(config["window_length"])
        output_file = "classified_movement_data.csv"
        X.to_csv(output_file, index=False)
        print(f"Classified movement data exported successfully to {output_file}.")

        return X
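A minimal usage sketch for this transformer outside the Gradio app; the file names here are assumptions, not outputs guaranteed by this upload:

import pandas as pd
from pipeline_classes import ClassifyMovementData

features = pd.read_csv("extract_features_output.csv")       # assumed: a feature CSV produced earlier
clf = ClassifyMovementData(model_file="trained_model.pkl")  # assumed model file name
labeled = clf.fit(features).transform(features)             # also writes classified_movement_data.csv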
pipeline_classes/create_combineddataframe.py
ADDED (+103 lines)

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from _config import config

class CreateCombinedDataFrame(BaseEstimator, TransformerMixin):
    def __init__(self, time_window, label_columns=None):
        self.time_window = time_window
        self.label_columns = label_columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df_reports, df_accel = X

        print(f"CreateCombinedDataFrame initialized with label_columns: {self.label_columns}")
        # Keep only reports with a valid engagement time and non-missing values in the chosen label columns
        valid_conditions = (df_reports['timeOfEngagement'] != 0)
        for label in self.label_columns:
            valid_conditions &= (df_reports[label] != "NONE")

        df_reports = df_reports[valid_conditions].copy()

        # No datetime conversion needed; timestamps remain as integers
        df_accel.rename(columns={'timestamp': 'timeOfNotification'}, inplace=True)

        print(f"Extracting accelerometer data with time_window: {self.time_window}")
        df_reports['accel_data'] = df_reports.apply(lambda row: self._extract_accel_data(row, df_accel), axis=1)

        print(f"Combining with label_columns: {self.label_columns}")
        combined_data = []

        for _, row in df_reports.iterrows():
            accel_data = row['accel_data']
            for _, accel_row in accel_data.iterrows():
                combined_row = {
                    'participantId': row['participantId'],          # Participant ID
                    'selfreport_time': row['timeOfNotification'],   # Self-report time
                    'accel_time': accel_row['timeOfNotification'],  # Accelerometer data time
                    'x': accel_row['x'],  # x-axis accelerometer data
                    'y': accel_row['y'],  # y-axis accelerometer data
                    'z': accel_row['z']   # z-axis accelerometer data
                }

                # Dynamically add emotion labels to the combined row
                for label in self.label_columns:
                    combined_row[label] = row[label]

                combined_data.append(combined_row)

        combined_df = pd.DataFrame(combined_data)

        # Convert integer timestamps back to datetime format for the CSV
        combined_df['selfreport_time'] = pd.to_datetime(combined_df['selfreport_time'], unit='ms')
        combined_df['accel_time'] = pd.to_datetime(combined_df['accel_time'], unit='ms')

        # Create groupid column (unique identifier based on participantId and selfreport_time)
        combined_df['groupid'] = combined_df.groupby(['participantId', 'selfreport_time']).ngroup() + 1
        col = combined_df.pop("groupid")  # Move groupid to the first column
        combined_df.insert(0, col.name, col)

        # Export the combined dataframe to CSV
        time_window_str = str(self.time_window)
        label_columns_str = "_".join(self.label_columns)
        file_name = f"combined_data_timewindow_{time_window_str}min_labels_{label_columns_str}.csv"
        combined_df.to_csv(file_name, index=False)
        print(f"Combined dataframe exported successfully to {file_name}.")

        return combined_df

    def _extract_accel_data(self, row, accel_data):
        time_delta = self.time_window * 60 * 1000  # Convert minutes to milliseconds
        start_time = row['timeOfNotification'] - time_delta  # Keep as integer
        end_time = row['timeOfNotification'] + time_delta  # Keep as integer
        participant_id = row['participantId']

        # Ensure accel_data['timeOfNotification'] is in integer format
        accel_data['timeOfNotification'] = accel_data['timeOfNotification'].astype(np.int64)

        # Log a warning if the desired time range exceeds the available data range
        if start_time < accel_data['timeOfNotification'].min() or end_time > accel_data['timeOfNotification'].max():
            print(
                f"Warning: Data does not cover the full {self.time_window}-minute window for participant {participant_id}. "
                f"Available range: {accel_data['timeOfNotification'].min()} to {accel_data['timeOfNotification'].max()}. "
                f"Requested range: {start_time} to {end_time}."
            )

        # Apply the filtering mask
        mask = (
            (accel_data['participantId'] == participant_id) &
            (accel_data['timeOfNotification'] >= max(start_time, accel_data['timeOfNotification'].min())) &
            (accel_data['timeOfNotification'] <= min(end_time, accel_data['timeOfNotification'].max()))
        )

        print("Start Time (ms):", start_time)
        print("End Time (ms):", end_time)
        print("Filtered Rows:\n", accel_data[mask])

        return accel_data[mask]
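A small sketch of the input schema this transformer expects, built only from the column names used in transform() above (toy values; timestamps are millisecond epoch integers):

import pandas as pd
from pipeline_classes import CreateCombinedDataFrame

df_reports = pd.DataFrame({
    "participantId": [1],
    "timeOfEngagement": [1_700_000_000_000],    # must be non-zero to pass the validity filter
    "timeOfNotification": [1_700_000_000_000],  # anchor time for the +/- time_window lookup
    "valence": ["HIGH"], "arousal": ["LOW"],    # label values must not be "NONE"
})
df_accel = pd.DataFrame({
    "participantId": [1, 1],
    "timestamp": [1_700_000_000_000, 1_700_000_060_000],  # renamed to timeOfNotification internally
    "x": [0.1, 0.2], "y": [0.0, 0.1], "z": [9.8, 9.7],
})
combiner = CreateCombinedDataFrame(time_window=3, label_columns=["valence", "arousal"])
combined = combiner.fit_transform((df_reports, df_accel))  # also writes the combined CSV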
pipeline_classes/extract_features.py
ADDED (+272 lines; the diff view below is truncated at line 220)

import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from scipy.fftpack import fft
from scipy.signal import welch
import pywt
from _config import config

class ExtractFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, window_length, window_step_size, data_frequency, selected_domains=None, include_magnitude=False, features_label_columns=None):
        self.window_length = window_length
        self.window_step_size = window_step_size
        self.data_frequency = data_frequency
        self.selected_domains = selected_domains
        self.include_magnitude = include_magnitude
        self.features_label_columns = features_label_columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        features_list = []

        if 'groupid' in X.columns:  # Check for groupid column
            for groupid in X['groupid'].unique():  # Iterate over unique group IDs
                temp = X[X['groupid'] == groupid]  # Filter rows by group ID
                temp_ex = temp[['accel_time', 'x', 'y', 'z']].copy()  # Keep only the necessary columns (accel_time can be removed if unused)
                windows = self._window_data(temp_ex[['x', 'y', 'z']])  # Create windows of data

                for window in windows:
                    features = self._extract_features_from_window(window)  # Extract features from each window
                    features['groupid'] = groupid  # Add groupid to the features

                    # Dynamically add emotion labels to the features
                    for label in self.features_label_columns:
                        features[label] = temp[label].iloc[0]

                    features_list.append(pd.DataFrame([features]))  # Convert dictionary to DataFrame

        else:  # If there is no groupid, calculate features without it
            windows = self._window_data(X[['x', 'y', 'z']])
            for window in windows:
                features = self._extract_features_from_window(window)
                features_list.append(pd.DataFrame([features]))

        all_features = pd.concat(features_list, ignore_index=True)

        # Export features to CSV
        window_length_str = str(self.window_length)
        window_step_size_str = str(self.window_step_size)
        if self.selected_domains is None:  # All features are calculated if no domains are selected
            domain_str = "all_features"
        else:
            domain_str = "_".join(self.selected_domains)
        file_name = f"features_window_{window_length_str}_step_{window_step_size_str}_{domain_str}.csv"
        all_features.to_csv(file_name, index=False)

        print("All features extracted successfully.")
        return all_features

    # Time Domain Features
    def _calculate_magnitude(self, window):
        return np.sqrt(window[:, 0]**2 + window[:, 1]**2 + window[:, 2]**2)

    def _window_data(self, data):  # Create windows of the data
        window_samples = int(self.window_length * self.data_frequency)  # Samples per window, e.g. 60 s * 25 Hz = 1500 samples
        step_samples = int(self.window_step_size * self.data_frequency)  # Samples to move the window
        windows = [data[i:i + window_samples] for i in range(0, len(data) - window_samples + 1, step_samples)]
        return np.array(windows)

    def _extract_features_from_window(self, window):  # DONE: compute multiple domains at once
        all_features = {}

        if self.selected_domains is None or 'time_domain' in self.selected_domains:
            all_features.update(self._extract_time_domain_features(window))

        if self.selected_domains is None or 'spatial' in self.selected_domains:
            all_features.update(self._extract_spatial_features(window))

        if self.selected_domains is None or 'frequency' in self.selected_domains:
            all_features.update(self._extract_frequency_domain_features(window))

        if self.selected_domains is None or 'statistical' in self.selected_domains:
            all_features.update(self._extract_statistical_features(window))

        if self.selected_domains is None or 'wavelet' in self.selected_domains:
            all_features.update(self._extract_wavelet_features(window))

        return all_features

    def _extract_time_domain_features(self, window):
        features = {
            'mean_x': np.mean(window[:, 0]),
            'mean_y': np.mean(window[:, 1]),
            'mean_z': np.mean(window[:, 2]),
            'std_x': np.std(window[:, 0]),
            'std_y': np.std(window[:, 1]),
            'std_z': np.std(window[:, 2]),
            'variance_x': np.var(window[:, 0]),
            'variance_y': np.var(window[:, 1]),
            'variance_z': np.var(window[:, 2]),
            'rms_x': np.sqrt(np.mean(window[:, 0]**2)),
            'rms_y': np.sqrt(np.mean(window[:, 1]**2)),
            'rms_z': np.sqrt(np.mean(window[:, 2]**2)),
            'max_x': np.max(window[:, 0]),
            'max_y': np.max(window[:, 1]),
            'max_z': np.max(window[:, 2]),
            'min_x': np.min(window[:, 0]),
            'min_y': np.min(window[:, 1]),
            'min_z': np.min(window[:, 2]),
            'peak_to_peak_x': np.ptp(window[:, 0]),
            'peak_to_peak_y': np.ptp(window[:, 1]),
            'peak_to_peak_z': np.ptp(window[:, 2]),
            'skewness_x': pd.Series(window[:, 0]).skew(),
            'skewness_y': pd.Series(window[:, 1]).skew(),
            'skewness_z': pd.Series(window[:, 2]).skew(),
            'kurtosis_x': pd.Series(window[:, 0]).kurt(),
            'kurtosis_y': pd.Series(window[:, 1]).kurt(),
            'kurtosis_z': pd.Series(window[:, 2]).kurt(),
            'zero_crossing_rate_x': np.sum(np.diff(np.sign(window[:, 0])) != 0),
            'zero_crossing_rate_y': np.sum(np.diff(np.sign(window[:, 1])) != 0),
            'zero_crossing_rate_z': np.sum(np.diff(np.sign(window[:, 2])) != 0),
            'sma': np.sum(np.abs(window[:, 0])) + np.sum(np.abs(window[:, 1])) + np.sum(np.abs(window[:, 2])),  # Signal Magnitude Area
        }
        # print("Time domain features extracted successfully.")

        # Additional features for magnitude (xyz in one vector)
        if self.include_magnitude:
            magnitude = self._calculate_magnitude(window)
            features['mean_magnitude'] = np.mean(magnitude)
            features['std_magnitude'] = np.std(magnitude)
            features['variance_magnitude'] = np.var(magnitude)
            features['rms_magnitude'] = np.sqrt(np.mean(magnitude**2))
            features['max_magnitude'] = np.max(magnitude)
            features['min_magnitude'] = np.min(magnitude)
            features['peak_to_peak_magnitude'] = np.ptp(magnitude)
            features['skewness_magnitude'] = pd.Series(magnitude).skew()
            features['kurtosis_magnitude'] = pd.Series(magnitude).kurt()
            features['zero_crossing_rate_magnitude'] = np.sum(np.diff(np.sign(magnitude)) != 0)
            # print("Additional time domain features for magnitude extracted successfully.")

        return features

    # Spatial Features
    def _extract_spatial_features(self, window):
        features = {}

        # Euclidean Norm (Magnitude)
        magnitude = self._calculate_magnitude(window)
        features['euclidean_norm'] = np.mean(magnitude)  # or np.linalg.norm for each window

        # Tilt Angles (Pitch and Roll)
        pitch = np.arctan2(window[:, 1], np.sqrt(window[:, 0]**2 + window[:, 2]**2)) * (180 / np.pi)
        roll = np.arctan2(window[:, 0], np.sqrt(window[:, 1]**2 + window[:, 2]**2)) * (180 / np.pi)
        features['mean_pitch'] = np.mean(pitch)
        features['mean_roll'] = np.mean(roll)

        # Correlation between Axes
        features['correlation_xy'] = np.corrcoef(window[:, 0], window[:, 1])[0, 1]
        features['correlation_xz'] = np.corrcoef(window[:, 0], window[:, 2])[0, 1]
        features['correlation_yz'] = np.corrcoef(window[:, 1], window[:, 2])[0, 1]

        # print("Spatial features extracted successfully.")
        return features

    # Frequency Domain Features
    def _extract_frequency_domain_features(self, window):
        n = len(window)
        freq_values = np.fft.fftfreq(n, d=1/self.data_frequency)[:n // 2]
        fft_values = fft(window, axis=0)
        fft_magnitude = np.abs(fft_values)[:n // 2]

        features = {}

        # Spectral Entropy
        def spectral_entropy(signal):
            psd = np.square(signal)
            psd_norm = psd / np.sum(psd)
            return -np.sum(psd_norm * np.log(psd_norm + 1e-10))

        for i, axis in enumerate(['x', 'y', 'z']):
            # Dominant Frequency
            dominant_frequency = freq_values[np.argmax(fft_magnitude[:, i])]
            features[f'dominant_frequency_{axis}'] = dominant_frequency

            # Spectral Entropy
            entropy = spectral_entropy(fft_magnitude[:, i])
            features[f'spectral_entropy_{axis}'] = entropy

            # Power Spectral Density (PSD) and Energy
            f, psd_values = welch(window[:, i], fs=self.data_frequency, nperseg=n)
            features[f'psd_mean_{axis}'] = np.mean(psd_values)
            features[f'energy_{axis}'] = np.sum(psd_values**2)

            # Bandwidth (frequency range containing a significant portion of the energy)
            cumulative_energy = np.cumsum(psd_values)
            total_energy = cumulative_energy[-1]
            low_cutoff_idx = np.argmax(cumulative_energy > 0.1 * total_energy)
            high_cutoff_idx = np.argmax(cumulative_energy > 0.9 * total_energy)
            bandwidth = f[high_cutoff_idx] - f[low_cutoff_idx]
            features[f'bandwidth_{axis}'] = bandwidth

            # Spectral Centroid (center of mass of the spectrum)
            spectral_centroid = np.sum(f * psd_values) / np.sum(psd_values)
            features[f'spectral_centroid_{axis}'] = spectral_centroid

        if self.include_magnitude:
            # Magnitude-based Frequency Domain Features
            magnitude = self._calculate_magnitude(window)
            fft_magnitude_mag = np.abs(fft(magnitude))[:n // 2]

            # Dominant Frequency for Magnitude
            features['dominant_frequency_magnitude'] = freq_values[np.argmax(fft_magnitude_mag)]

            # Spectral Entropy for Magnitude
            features['spectral_entropy_magnitude'] = spectral_entropy(fft_magnitude_mag)

            # Power Spectral Density and Energy for Magnitude
|
| 221 |
+
f, psd_values_mag = welch(magnitude, fs=self.data_frequency, nperseg=n)
|
| 222 |
+
features['psd_mean_magnitude'] = np.mean(psd_values_mag)
|
| 223 |
+
features['energy_magnitude'] = np.sum(psd_values_mag**2)
|
| 224 |
+
|
| 225 |
+
# Bandwidth for Magnitude
|
| 226 |
+
cumulative_energy_mag = np.cumsum(psd_values_mag)
|
| 227 |
+
total_energy_mag = cumulative_energy_mag[-1]
|
| 228 |
+
low_cutoff_idx_mag = np.argmax(cumulative_energy_mag > 0.1 * total_energy_mag)
|
| 229 |
+
high_cutoff_idx_mag = np.argmax(cumulative_energy_mag > 0.9 * total_energy_mag)
|
| 230 |
+
bandwidth_mag = f[high_cutoff_idx_mag] - f[low_cutoff_idx_mag]
|
| 231 |
+
features['bandwidth_magnitude'] = bandwidth_mag
|
| 232 |
+
|
| 233 |
+
# Spectral Centroid for Magnitude
|
| 234 |
+
features['spectral_centroid_magnitude'] = np.sum(f * psd_values_mag) / np.sum(psd_values_mag)
|
| 235 |
+
|
| 236 |
+
# print(f"Frequency domain features extracted successfully.")
|
| 237 |
+
return features
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
def _extract_statistical_features(self, window):
|
| 241 |
+
features = {
|
| 242 |
+
'25th_percentile_x': np.percentile(window[:, 0], 25),
|
| 243 |
+
'25th_percentile_y': np.percentile(window[:, 1], 25),
|
| 244 |
+
'25th_percentile_z': np.percentile(window[:, 2], 25),
|
| 245 |
+
'75th_percentile_x': np.percentile(window[:, 0], 75),
|
| 246 |
+
'75th_percentile_y': np.percentile(window[:, 1], 75),
|
| 247 |
+
'75th_percentile_z': np.percentile(window[:, 2], 75),
|
| 248 |
+
}
|
| 249 |
+
|
| 250 |
+
if self.include_magnitude:
|
| 251 |
+
magnitude = self._calculate_magnitude(window)
|
| 252 |
+
features['25th_percentile_magnitude'] = np.percentile(magnitude, 25)
|
| 253 |
+
features['75th_percentile_magnitude'] = np.percentile(magnitude, 75)
|
| 254 |
+
|
| 255 |
+
# print(f"Statistical features extracted successfully.")
|
| 256 |
+
return features
|
| 257 |
+
|
| 258 |
+
def _extract_wavelet_features(self, window, wavelet='db1'):
|
| 259 |
+
coeffs = pywt.wavedec(window, wavelet, axis=0, level=3)
|
| 260 |
+
features = {
|
| 261 |
+
'wavelet_energy_approx_x': np.sum(coeffs[0][:, 0]**2),
|
| 262 |
+
'wavelet_energy_approx_y': np.sum(coeffs[0][:, 1]**2),
|
| 263 |
+
'wavelet_energy_approx_z': np.sum(coeffs[0][:, 2]**2),
|
| 264 |
+
}
|
| 265 |
+
|
| 266 |
+
if self.include_magnitude:
|
| 267 |
+
magnitude = self._calculate_magnitude(window)
|
| 268 |
+
coeffs_magnitude = pywt.wavedec(magnitude, wavelet, level=3)
|
| 269 |
+
features['wavelet_energy_approx_magnitude'] = np.sum(coeffs_magnitude[0]**2)
|
| 270 |
+
|
| 271 |
+
# print(f"Wavelet features extracted successfully.")
|
| 272 |
+
return features
|
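As a sanity check on the bandwidth and spectral-centroid logic above, here is a standalone sketch on a single synthetic axis; the 250-sample window and 25 Hz sampling rate are illustrative assumptions, mirroring the 10%/90% cumulative-energy cutoffs in the code:

import numpy as np
from scipy.signal import welch

rng = np.random.default_rng(0)
window = rng.normal(size=250)            # one axis of a synthetic window
f, psd = welch(window, fs=25, nperseg=len(window))

cum = np.cumsum(psd)
low = np.argmax(cum > 0.1 * cum[-1])     # first bin past 10% of total energy
high = np.argmax(cum > 0.9 * cum[-1])    # first bin past 90% of total energy
print("bandwidth:", f[high] - f[low])
print("centroid:", np.sum(f * psd) / np.sum(psd))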
pipeline_classes/lowpassfilter.py
ADDED
@@ -0,0 +1,55 @@
+import numpy as np
+import pandas as pd
+from sklearn.base import BaseEstimator, TransformerMixin
+from scipy.signal import butter, filtfilt
+
+class LowPassFilter(BaseEstimator, TransformerMixin):
+    def __init__(self, cutoff_frequency=5, sampling_rate=25, order=4):
+        """
+        Initialize the LowPassFilter class.
+
+        Parameters:
+        - cutoff_frequency: The cutoff frequency for the low-pass filter (default: 5 Hz).
+        - sampling_rate: The sampling rate of the accelerometer data (default: 25 Hz).
+        - order: The order of the filter (default: 4).
+        """
+        self.cutoff_frequency = cutoff_frequency
+        self.sampling_rate = sampling_rate
+        self.order = order
+
+    def _butter_lowpass_filter(self, data):
+        """
+        Apply a Butterworth low-pass filter to the data.
+
+        Parameters:
+        - data: A NumPy array containing the accelerometer data to be filtered.
+
+        Returns:
+        - A filtered NumPy array.
+        """
+        nyquist = 0.5 * self.sampling_rate
+        normalized_cutoff = self.cutoff_frequency / nyquist
+        b, a = butter(self.order, normalized_cutoff, btype='low', analog=False)
+        filtered_data = filtfilt(b, a, data, axis=0)
+        return filtered_data
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        """
+        Apply the low-pass filter to the accelerometer data.
+
+        Parameters:
+        - X: A DataFrame with 'x', 'y', and 'z' columns representing the accelerometer data.
+
+        Returns:
+        - The DataFrame with filtered 'x', 'y', and 'z' columns.
+        """
+        if 'x' in X.columns and 'y' in X.columns and 'z' in X.columns:
+            X[['x', 'y', 'z']] = self._butter_lowpass_filter(X[['x', 'y', 'z']].values)
+            print("Low-pass filter applied successfully.")
+        else:
+            raise ValueError("The input DataFrame must contain 'x', 'y', and 'z' columns.")
+
+        return X
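For context, a minimal usage sketch of this transformer on synthetic data; the `pipeline_classes` import and the 25 Hz / 5 Hz settings are assumptions for illustration:

import numpy as np
import pandas as pd
from pipeline_classes import LowPassFilter  # assumed package export

# 100 samples of noisy synthetic accelerometer data at an assumed 25 Hz
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(100, 3)), columns=['x', 'y', 'z'])

lpf = LowPassFilter(cutoff_frequency=5, sampling_rate=25, order=4)
filtered = lpf.fit_transform(df)  # returns the same DataFrame with smoothed axes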
pipeline_classes/pcahandler.py
ADDED
@@ -0,0 +1,24 @@
+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.decomposition import PCA
+from _config import config
+
+class PCAHandler(BaseEstimator, TransformerMixin):
+    def __init__(self, apply_pca=False, variance=0.95):
+        self.apply_pca = apply_pca
+        self.variance = variance
+        self.pca = None
+
+    def fit(self, X, y=None):
+        if self.apply_pca:
+            self.pca = PCA(n_components=self.variance)
+            self.pca.fit(X)
+        return self
+
+    def transform(self, X):
+        if self.apply_pca and self.pca:
+            X_transformed = self.pca.transform(X)
+            return pd.DataFrame(X_transformed, index=X.index)
+
+        return X
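A short sketch of how this handler behaves (synthetic frame; the import path is an assumption): passing a float `variance` makes scikit-learn's PCA keep just enough components to explain that fraction of the variance.

import numpy as np
import pandas as pd
from pipeline_classes import PCAHandler  # assumed package export

rng = np.random.default_rng(0)
features = pd.DataFrame(rng.normal(size=(50, 10)))

pca = PCAHandler(apply_pca=True, variance=0.95)  # keep components explaining 95% of variance
reduced = pca.fit(features).transform(features)
print(reduced.shape)  # same rows and index, usually fewer columns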
pipeline_classes/scale_xyzdata.py
ADDED
@@ -0,0 +1,28 @@
+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
+from _config import config
+
+class ScaleXYZData(BaseEstimator, TransformerMixin):
+    def __init__(self, scaler_type='standard'):
+        self.scaler_type = scaler_type
+
+    def fit(self, X, y=None):
+        return self
+
+    def transform(self, X):
+        columns_to_scale = ['x', 'y', 'z']
+        if self.scaler_type == 'standard':  # Scale the columns using StandardScaler or MinMaxScaler
+            scaler = StandardScaler()
+        elif self.scaler_type == 'minmax':
+            scaler = MinMaxScaler()
+        elif self.scaler_type == 'none':
+            return X  # Return the DataFrame without scaling
+        else:
+            raise ValueError("Invalid scaler_type. Expected 'standard', 'minmax', or 'none'.")  # Raise an error if scaler_type is invalid
+        scaled_columns = scaler.fit_transform(X[columns_to_scale])
+        scaled_df = pd.DataFrame(scaled_columns, columns=columns_to_scale, index=X.index)
+        X[columns_to_scale] = scaled_df
+        print("Data scaled successfully.")
+        return X
pipeline_classes/train_model.py
ADDED
@@ -0,0 +1,195 @@
+import pandas as pd
+import numpy as np
+from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.model_selection import StratifiedGroupKFold
+from skopt import BayesSearchCV
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.svm import SVC
+from xgboost import XGBClassifier
+import joblib
+from skopt.space import Real, Integer, Categorical
+from sklearn.metrics import classification_report, accuracy_score
+import json
+from sklearn.preprocessing import LabelEncoder
+#from _config import config
+
+class TrainModel(BaseEstimator, TransformerMixin):
+    def __init__(self, classifier, train_label, target):
+        #self.config = config
+        #self.target = config.get("target_label", None)  # User-defined target label in config
+        self.classifier = classifier
+        self.train_label = train_label
+        self.target = target
+        self.label_encoder = LabelEncoder()
+        #self.selected_domains = self.config.get("selected_domains", "All domains")  # Default to all domains if None
+
+        #if not self.target:
+        #    raise ValueError("No target label specified in the config. Please set 'target_label'.")
+
+    def get_default_param_space(self, classifier):
+        """Returns the default hyperparameter space for a given classifier."""
+        if classifier == 'xgboost':
+            return {
+                'learning_rate': Real(0.01, 0.3, prior='log-uniform'),
+                'n_estimators': Integer(100, 1000),
+                'max_depth': Integer(3, 10),
+                'min_child_weight': (1, 10),
+                'subsample': (0.5, 1.0),
+                'colsample_bytree': (0.5, 1.0),
+                'gamma': (0, 10),
+                'reg_alpha': (0, 10),
+                'reg_lambda': (0, 10),
+            }
+        elif classifier == 'svm':
+            return {
+                'C': Real(0.1, 10, prior='log-uniform'),
+                'kernel': Categorical(['linear', 'rbf'])
+            }
+        elif classifier == 'randomforest':
+            return {
+                'n_estimators': Integer(100, 1000),
+                'max_depth': Integer(3, 10)
+            }
+        else:
+            raise ValueError(f"Unsupported classifier type: {classifier}")
+
+    def fit(self, X, y=None):
+        # Ensure the target column exists in the dataset
+        if self.target not in X.columns:
+            raise ValueError(f"Target label '{self.target}' not found in the dataset.")
+
+        # Fit the label encoder on the target column
+        print(f"Encoding the target labels for '{self.target}'...")
+        self.label_encoder.fit(X[self.target])
+
+        # Print the mapping between original labels and encoded labels
+        original_labels = list(self.label_encoder.classes_)
+        encoded_labels = list(range(len(original_labels)))
+        label_mapping = dict(zip(encoded_labels, original_labels))
+        print(f"Label encoding complete. Mapping: {label_mapping}")
+
+        # Transform the target column and add it as 'encoded_target'
+        X['encoded_target'] = self.label_encoder.transform(X[self.target])
+
+        # Value counts for the encoded target
+        value_counts = X['encoded_target'].value_counts().to_dict()
+        print(f"Value counts for encoded target: {value_counts}")
+
+        # Pop unnecessary columns (groupid, emotion labels not being used, etc.)
+        groups = X.pop('groupid')
+        print("Group IDs popped from the dataset.")
+
+        # Pop the label columns which aren't used
+        for label in self.train_label.split(","):
+            X.pop(label)
+        print("Label columns popped from the dataset.")
+
+        # Pop the encoded target as y
+        y = X.pop('encoded_target')
+        print("Encoded target column popped from the dataset.")
+
+        # Store the feature names for later use
+        feature_names = X.columns.tolist()
+
+        # Choose classifier
+        classifier = self.classifier
+        if classifier == 'xgboost':
+            model = XGBClassifier(objective='multi:softmax', random_state=42)
+        elif classifier == 'svm':
+            model = SVC(probability=True)
+        elif classifier == 'randomforest':
+            model = RandomForestClassifier(random_state=42)
+        else:
+            raise ValueError(f"Unsupported classifier type: {classifier}")
+
+        print(f"Training the model using {classifier}...")
+
+        # Use user-defined param_space if provided, otherwise use default
+        default_param_space = self.get_default_param_space(classifier)
+        param_space = default_param_space
+
+        # Hyperparameter tuning using Bayesian optimization
+        sgkf = StratifiedGroupKFold(n_splits=5)
+        print(f"Parameter space being used: {param_space}")
+        if param_space is None:
+            raise ValueError("Parameter space cannot be None. Please check the classifier configuration.")
+
+        opt = BayesSearchCV(
+            estimator=model,
+            search_spaces=param_space,
+            cv=sgkf,
+            n_iter=5,
+            n_jobs=-1,
+            n_points=1,
+            verbose=1,
+            scoring='accuracy'
+        )
+
+        print("Hyperparameter tuning in progress...")
+
+        # Fit the model using the encoded target
+        opt.fit(X, y, groups=groups)
+        self.best_model = opt.best_estimator_
+        print(f"Best parameters found: {opt.best_params_}")
+
+        # Print classification metrics
+        y_pred = self.best_model.predict(X)
+        accuracy = accuracy_score(y, y_pred)
+        report = classification_report(y, y_pred, target_names=self.label_encoder.classes_, output_dict=True)
+
+        # Save classification report
+        classification_report_json = report
+        with open(f'classification_report_{self.target}.json', 'w') as f:
+            json.dump(classification_report_json, f, indent=4)
+
+        print(f"Accuracy: {accuracy}")
+        print(f"Classification Report:\n{report}")
+
+        # Save the best model with the target label in the file name
+        model_name = f"{classifier}_best_model_{self.target}.pkl"
+        joblib.dump(self.best_model, model_name)
+        print("Model saved successfully.")
+
+        # Save model metadata
+        model_metadata = {
+            "best_params": opt.best_params_,
+            "accuracy": accuracy,
+            "classification_report": classification_report_json,
+            "label_mapping": label_mapping,
+            "model_name": model_name,
+            "value_counts": value_counts,
+            #"selected_domains": self.selected_domains,
+            #"include_magnitude": self.config.get("include_magnitude", True)
+        }
+
+        if hasattr(self.best_model, "feature_importances_"):
+            feature_importances = self.best_model.feature_importances_
+            # Convert feature importances to native Python floats
+            feature_importance_dict = {feature: float(importance) for feature, importance in zip(feature_names, feature_importances)}
+            model_metadata["feature_importances"] = feature_importance_dict
+            print("Feature Importances:")
+            for feature, importance in feature_importance_dict.items():
+                print(f"{feature}: {importance:.4f}")
+
+        # Save metadata with the target name in the file name
+        metadata_file = f"{classifier}_model_metadata_{self.target}.json"
+        with open(metadata_file, "w") as f:
+            json.dump(model_metadata, f, indent=4)
+        print(f"Model metadata saved to {metadata_file}.")
+
+        # Save file paths internally for later retrieval
+        self.model_file = model_name
+        self.metadata_file = metadata_file
+
+        return self
+
+    def get_output_files(self):
+        return self.model_file, self.metadata_file
+
+    def transform(self, X):
+        return X  # Placeholder for transform step (not needed for training)
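For orientation, a hedged sketch of the input frame `TrainModel.fit` expects: feature columns plus a `groupid` column, the target column, and a comma-separated `train_label` string naming every label column to drop before training (including the target itself, which survives as `encoded_target`). All column names below are hypothetical, and the import path is an assumption:

import numpy as np
import pandas as pd
from pipeline_classes import TrainModel  # assumed package export

rng = np.random.default_rng(0)
df = pd.DataFrame(rng.normal(size=(200, 5)), columns=[f'f{i}' for i in range(5)])
df['groupid'] = rng.integers(0, 10, size=200)          # group ids for StratifiedGroupKFold
df['valence'] = rng.choice(['low', 'high'], size=200)  # hypothetical target label
df['arousal'] = rng.choice(['low', 'high'], size=200)  # hypothetical unused label

trainer = TrainModel(classifier='randomforest', train_label='valence,arousal', target='valence')
trainer.fit(df)  # runs the Bayesian search and writes the .pkl and .json artifacts
model_file, metadata_file = trainer.get_output_files()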
requirements.txt
ADDED
@@ -0,0 +1,9 @@
+pandas == 2.2.3
+numpy == 2.1.2
+scikit-learn == 1.5.2
+scikit-optimize == 0.10.2
+xgboost == 2.1.1
+joblib == 1.4.2
+PyWavelets == 1.7.0
+scipy == 1.14.1
+gradio == 5.8.0
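All pins are exact versions, so a fresh virtual environment followed by `pip install -r requirements.txt` is the cleanest way to reproduce this stack.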