import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR


class IVIMRegressor:
    """
    Machine Learning wrapper for estimating IVIM/DKI parameters from diffusion MRI signals.

    This class provides a unified interface for training and applying various
    regression models (Random Forest, Extra Trees, MLP, etc.) to map signal
    attenuation curves directly to tissue parameters (D, f, D*, K), bypassing
    iterative non-linear least squares fitting.

    Supported architectures:
    - 'random_forest': Robust baseline, handles noise well.
    - 'extra_trees': Often faster and slightly more accurate than RF.
      In our experiments, this model showed superior robustness to noise.
    - 'mlp': Multi-layer Perceptron for capturing complex non-linear mappings.
    - 'xgboost': Gradient boosting (requires xgboost package; falls back to
      Random Forest when the package cannot be imported).
    - 'svr': Support Vector Regression.
    """

    def __init__(self, model_type='extra_trees', params=None):
        """
        Create the wrapper and build the underlying estimator immediately.

        Args:
            model_type: One of 'random_forest', 'extra_trees', 'mlp',
                'xgboost', 'svr'.
            params: Optional dict of hyper-parameter overrides. Keys read by
                the builders: 'n_estimators', 'hidden_layer_sizes',
                'learning_rate', 'C'. Unknown keys are ignored.

        Raises:
            ValueError: If ``model_type`` is not a supported architecture.
        """
        self.model_type = model_type
        self.params = params if params else {}
        self.model = self._build_model()

    def _build_model(self):
        """
        Instantiate the estimator selected by ``self.model_type``.

        Returns:
            An unfitted regressor exposing ``fit`` / ``predict``.

        Raises:
            ValueError: If ``self.model_type`` is not recognised.
        """
        if self.model_type == 'random_forest':
            # Default params from paper/notebook
            n_estimators = self.params.get('n_estimators', 100)
            return RandomForestRegressor(n_estimators=n_estimators, random_state=42, n_jobs=-1)

        if self.model_type == 'extra_trees':
            n_estimators = self.params.get('n_estimators', 100)
            return ExtraTreesRegressor(n_estimators=n_estimators, random_state=42, n_jobs=-1)

        if self.model_type == 'mlp':
            hidden_layer_sizes = self.params.get('hidden_layer_sizes', (100, 50))
            return MLPRegressor(hidden_layer_sizes=hidden_layer_sizes, max_iter=500, random_state=42)

        if self.model_type == 'xgboost':
            # Only the import is guarded: a failure while constructing the
            # regressor should surface instead of triggering the fallback.
            try:
                from xgboost import XGBRegressor
            except ImportError:
                # Best-effort fallback; note that self.model_type still
                # reads 'xgboost' afterwards.
                print("XGBoost not installed. Falling back to Random Forest.")
                return RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
            # Honour user overrides like the other branches do; the defaults
            # are the values that were previously hard-coded.
            n_estimators = self.params.get('n_estimators', 1000)
            learning_rate = self.params.get('learning_rate', 0.01)
            return XGBRegressor(
                n_estimators=n_estimators,
                learning_rate=learning_rate,
                n_jobs=-1,
                random_state=42,
            )

        if self.model_type == 'svr':
            C = self.params.get('C', 100)
            return SVR(C=C)

        raise ValueError(f"Unknown model type: {self.model_type}")

    def train(self, X, y, test_size=0.2, verbose=True):
        """
        Trains the regression model using the provided signal-parameter pairs.

        The data is split once with a fixed seed (random_state=42); the model
        is fitted on the training part and scored on the held-out part.

        Args:
            X: Input feature matrix (Normalized Signal vs b-values).
               Shape: [n_samples, n_b_values]
            y: Target parameter vector (e.g., Diffusion Coefficient D).
               Shape: [n_samples]
            test_size: Fraction of data to reserve for validation (default: 0.2).
            verbose: If True, prints training progress and validation metrics.

        Returns:
            Dictionary containing validation metrics (MAE, MSE, RMSE, R2).
        """
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        if verbose:
            print(f"Training {self.model_type} on {len(X_train)} samples...")

        self.model.fit(X_train, y_train)

        # Evaluate on the held-out split
        predictions = self.model.predict(X_test)
        metrics = self._evaluate(y_test, predictions)

        if verbose:
            print("--- Validation Metrics ---")
            for k, v in metrics.items():
                print(f"{k}: {v:.6f}")

        return metrics

    def predict(self, X):
        """Predicts parameters for new data."""
        return self.model.predict(X)

    def _evaluate(self, y_true, y_pred):
        """
        Compute regression metrics for a set of predictions.

        Args:
            y_true: Ground-truth target values.
            y_pred: Model predictions aligned with ``y_true``.

        Returns:
            Dict with keys 'MAE', 'MSE', 'RMSE' and 'R2'.
        """
        # Compute the MSE once and derive the RMSE from it.
        mse = mean_squared_error(y_true, y_pred)
        return {
            'MAE': mean_absolute_error(y_true, y_pred),
            'MSE': mse,
            'RMSE': np.sqrt(mse),
            'R2': r2_score(y_true, y_pred),
        }

    def save(self, filepath):
        """Saves the trained model to disk."""
        joblib.dump(self.model, filepath)
        print(f"Model saved to {filepath}")

    def load(self, filepath):
        """
        Loads a trained model from disk, replacing ``self.model``.

        ``self.model_type`` and ``self.params`` are left untouched, so they
        may no longer describe the loaded estimator.

        Raises:
            FileNotFoundError: If ``filepath`` does not exist.
        """
        if not os.path.exists(filepath):
            raise FileNotFoundError(f"Model file not found: {filepath}")
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load model files from trusted sources.
        self.model = joblib.load(filepath)
        print(f"Model loaded from {filepath}")


def load_training_data(data_dir, dataset_name='MR701', delimiter=None):
    """
    Helper to load X and Y CSV files from the data directory.

    Expected format: Data_X2_{dataset}.csv and Data_Y_{dataset}.csv

    Args:
        data_dir: Directory containing the two data files.
        dataset_name: Dataset tag used to build the file names.
        delimiter: Column separator forwarded to ``np.loadtxt``. The default
            (None) keeps the previous behaviour of splitting on whitespace;
            pass ',' for comma-separated files.

    Returns:
        Tuple ``(X, Y)`` of NumPy arrays as read from the two files.

    Raises:
        FileNotFoundError: If either file is missing.
    """
    x_path = os.path.join(data_dir, f'Data_X2_{dataset_name}.csv')
    y_path = os.path.join(data_dir, f'Data_Y_{dataset_name}.csv')

    if not os.path.exists(x_path) or not os.path.exists(y_path):
        raise FileNotFoundError(f"Data files not found for {dataset_name} in {data_dir}")

    X = np.loadtxt(x_path, delimiter=delimiter)
    Y = np.loadtxt(y_path, delimiter=delimiter)

    # Assuming Y contains [D, f, D*, K] columns or similar
    return X, Y