# dMRI-IVIM-ML-Toolkit / src / ml_models.py
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import os
class IVIMRegressor:
    """
    Machine Learning wrapper for estimating IVIM/DKI parameters from diffusion MRI signals.

    This class provides a unified interface for training and applying various regression models
    (Random Forest, Extra Trees, MLP, etc.) to map signal attenuation curves directly to
    tissue parameters (D, f, D*, K), bypassing iterative non-linear least squares fitting.

    Supported architectures:
        - 'random_forest': Robust baseline, handles noise well.
        - 'extra_trees': Often faster and slightly more accurate than RF. In our experiments,
          this model showed superior robustness to noise.
        - 'mlp': Multi-layer Perceptron for capturing complex non-linear mappings.
        - 'xgboost': Gradient boosting (requires xgboost package).
        - 'svr': Support Vector Regression.
    """

    def __init__(self, model_type='extra_trees', params=None):
        """
        Args:
            model_type: One of 'random_forest', 'extra_trees', 'mlp', 'xgboost', 'svr'.
            params: Optional dict of hyperparameters. Recognized keys depend on the
                model type (see _build_model); unrecognized keys are ignored.
        """
        self.model_type = model_type
        # Avoid storing None so _build_model can call .get() unconditionally.
        self.params = params if params else {}
        self.model = self._build_model()

    def _build_model(self):
        """Instantiates the underlying estimator for self.model_type.

        Returns:
            An unfitted sklearn-compatible regressor.

        Raises:
            ValueError: If self.model_type is not one of the supported architectures.
        """
        if self.model_type == 'random_forest':
            # Default params from paper/notebook
            n_estimators = self.params.get('n_estimators', 100)
            return RandomForestRegressor(n_estimators=n_estimators, random_state=42, n_jobs=-1)
        elif self.model_type == 'extra_trees':
            n_estimators = self.params.get('n_estimators', 100)
            return ExtraTreesRegressor(n_estimators=n_estimators, random_state=42, n_jobs=-1)
        elif self.model_type == 'mlp':
            hidden_layer_sizes = self.params.get('hidden_layer_sizes', (100, 50))
            return MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, max_iter=500, random_state=42)
        elif self.model_type == 'xgboost':
            # Import lazily so xgboost stays an optional dependency.
            try:
                from xgboost import XGBRegressor
                return XGBRegressor(n_estimators=1000, learning_rate=0.01, n_jobs=-1, random_state=42)
            except ImportError:
                print("XGBoost not installed. Falling back to Random Forest.")
                # FIX: use all cores (n_jobs=-1), consistent with the 'random_forest' branch.
                return RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        elif self.model_type == 'svr':
            C = self.params.get('C', 100)
            return SVR(C=C)
        else:
            raise ValueError(f"Unknown model type: {self.model_type}")

    def train(self, X, y, test_size=0.2, verbose=True):
        """
        Trains the regression model using the provided signal-parameter pairs.

        Args:
            X: Input feature matrix (Normalized Signal vs b-values). Shape: [n_samples, n_b_values]
            y: Target parameter vector (e.g., Diffusion Coefficient D). Shape: [n_samples]
            test_size: Fraction of data to reserve for validation (default: 0.2).
            verbose: If True, prints training progress and validation metrics.

        Returns:
            Dictionary containing validation metrics (MAE, MSE, RMSE, R2).
        """
        # Fixed random_state keeps the train/validation split reproducible across runs.
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
        if verbose:
            print(f"Training {self.model_type} on {len(X_train)} samples...")
        self.model.fit(X_train, y_train)
        # Evaluate on the held-out split.
        predictions = self.model.predict(X_test)
        metrics = self._evaluate(y_test, predictions)
        if verbose:
            print("--- Validation Metrics ---")
            for k, v in metrics.items():
                print(f"{k}: {v:.6f}")
        return metrics

    def predict(self, X):
        """Predicts parameters for new data. X shape: [n_samples, n_b_values]."""
        return self.model.predict(X)

    def _evaluate(self, y_true, y_pred):
        """Returns a dict of regression metrics (MAE, MSE, RMSE, R2)."""
        # Compute MSE once and reuse it for RMSE instead of scoring twice.
        mse = mean_squared_error(y_true, y_pred)
        return {
            'MAE': mean_absolute_error(y_true, y_pred),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': r2_score(y_true, y_pred)
        }

    def save(self, filepath):
        """Saves the trained model to disk (creates parent directories if needed)."""
        # Robustness: joblib.dump fails if the target directory does not exist.
        parent = os.path.dirname(filepath)
        if parent:
            os.makedirs(parent, exist_ok=True)
        joblib.dump(self.model, filepath)
        print(f"Model saved to {filepath}")

    def load(self, filepath):
        """Loads a trained model from disk.

        Raises:
            FileNotFoundError: If filepath does not exist.
        """
        if os.path.exists(filepath):
            self.model = joblib.load(filepath)
            print(f"Model loaded from {filepath}")
        else:
            raise FileNotFoundError(f"Model file not found: {filepath}")
def load_training_data(data_dir, dataset_name='MR701'):
    """
    Helper to load X and Y CSV files from the data directory.

    Expected format: Data_X2_{dataset}.csv and Data_Y_{dataset}.csv

    Args:
        data_dir: Directory containing the CSV data files.
        dataset_name: Dataset identifier used in the file names (default: 'MR701').

    Returns:
        Tuple (X, Y) of numpy arrays: signal features and target parameters.

    Raises:
        FileNotFoundError: If either expected CSV file is missing.
    """
    x_path = os.path.join(data_dir, f'Data_X2_{dataset_name}.csv')
    y_path = os.path.join(data_dir, f'Data_Y_{dataset_name}.csv')
    if not os.path.exists(x_path) or not os.path.exists(y_path):
        raise FileNotFoundError(f"Data files not found for {dataset_name} in {data_dir}")
    # BUG FIX: np.loadtxt defaults to whitespace delimiters, but these are CSV files
    # (comma-separated); without delimiter=',' multi-column rows fail to parse.
    X = np.loadtxt(x_path, delimiter=',')
    Y = np.loadtxt(y_path, delimiter=',')  # Assuming Y contains [D, f, D*, K] columns or similar
    return X, Y