skamus48
/

dMRI-IVIM-ML-Toolkit

medical-imaging

Model card Files Files and versions

dMRI-IVIM-ML-Toolkit / train_pretrained.py

skamus48's picture

Upload 26 files

5d0dc03 verified 4 months ago

history blame contribute delete

4.13 kB

	import numpy as np
	import joblib
	import os
	import sys
	from sklearn.ensemble import ExtraTreesRegressor
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import mean_squared_error

	# Ensure output directory exists
	if not os.path.exists('models'):
	os.makedirs('models')

	def generate_comprehensive_training_data(n_samples=100000):
	"""
	Generates a large, comprehensive dataset covering a wide range of b-values
	and physiological parameters to create a robust pre-trained model.
	"""
	print(f"Generating {n_samples} synthetic samples for pre-training...")

	# Extended set of b-values to cover various clinical protocols (Prostate, Brain, etc.)
	# We include a superset of common b-values.
	# NOTE: For the pre-trained model to be universal, it ideally needs to be trained
	# on the SPECIFIC b-values of the target protocol.
	# However, for a general purpose 'demo' model, we will use a standard clinical set.
	# Users should ideally retrain for their specific protocol, but this serves as a strong baseline.
	b_values = np.array([0, 10, 20, 30, 50, 80, 100, 150, 200, 400, 600, 800, 1000, 1500, 2000])

	# Random parameters within broad physiological ranges
	D = np.random.uniform(0.0001, 0.004, n_samples) # Diffusion coefficient (mm2/s)
	f = np.random.uniform(0.01, 0.4, n_samples) # Perfusion fraction
	D_star = np.random.uniform(0.005, 0.1, n_samples) # Pseudo-diffusion (mm2/s)
	K = np.random.uniform(0, 2.5, n_samples) # Kurtosis (dimensionless)

	X = []
	Y = [] # Targets: [D, f, D*, K]

	for i in range(n_samples):
	# Signal model: IVIM-DKI
	# S/S0 = f * exp(-bD) + (1-f) * exp(-bD + 1/6 b^2 * D^2 * K)

	# Diffusion term with Kurtosis
	# Note: We clip the exponent to avoid overflow/numerical issues at high b-values/K
	exponent_diff = -b_values * D[i] + (1/6) * (b_values*2) (D[i]*2) K[i]
	term_diff = np.exp(exponent_diff)

	# Perfusion term
	term_perf = np.exp(-b_values * D_star[i])

	signal = f[i] * term_perf + (1 - f[i]) * term_diff

	# Add Rician noise (approximated as Gaussian for simplicity in this large batch)
	# SNR varying between 20 and 100
	snr = np.random.uniform(20, 100)
	sigma = 1.0 / snr
	noise = np.random.normal(0, sigma, size=len(b_values))

	signal_noisy = signal + noise

	# Rician correction (magnitude)
	signal_noisy = np.sqrt(signal_noisy2 + noise2) # Simple approximation

	# Normalize (though it's already relative to S0=1)
	signal_norm = signal_noisy / np.max(signal_noisy)

	X.append(signal_norm)
	Y.append([D[i], f[i], D_star[i], K[i]])

	return np.array(X), np.array(Y), b_values

	def train_and_save():
	X, Y, b_values = generate_comprehensive_training_data(n_samples=50000)

	print("Training ExtraTreesRegressor (Multi-output)...")
	# Optimized for model size < 100MB for GitHub hosting
	model = ExtraTreesRegressor(
	n_estimators=50, # Reduced from 100
	max_depth=20, # Limit depth to prevent massive trees
	min_samples_leaf=5, # Prune leaves to reduce size
	min_samples_split=10,
	n_jobs=-1,
	random_state=42,
	verbose=1
	)

	model.fit(X, Y)

	# Evaluate on a small subset
	preds = model.predict(X[:1000])
	mse = mean_squared_error(Y[:1000], preds)
	print(f"Training MSE: {mse:.6f}")

	# Save model
	model_path = 'models/ivim_dki_extratrees.joblib'
	joblib.dump(model, model_path, compress=3) # Compress to keep file size small
	print(f"Model saved to {model_path}")

	# Save b-values metadata so we know what protocol this model expects
	np.save('models/b_values_config.npy', b_values)
	print("Configuration saved.")

	if __name__ == "__main__":
	train_and_save()