Spaces:

dev-jas
/

polymer-aging-with-ml

Sleeping

polymer-aging-with-ml / backend /utils /train.py

devjas1

Initial Release: Polymer Aging With ML [Standalone Appliance]

4a0e21d about 1 month ago

6.74 kB

	"""
	Main Training Script

	This script orchestrates the model training process. It is configuration-driven
	and uses MLflow for experiment tracking.

	Usage:
	python scripts/train.py --config-path configs/base_config.yaml
	"""

	from pathlib import Path
	import sys
	import argparse
	import yaml
	from typing import Dict, Optional, Any

	import pandas as pd
	import torch
	import mlflow
	from torch.utils.data import DataLoader, TensorDataset
	from tqdm import tqdm

	# Ensure the backend is in the path to import registry and preprocessing
	sys.path.append(str(Path(__file__).resolve().parents[1]))

	from config import TARGET_LEN
	from backend.utils.preprocessing import preprocess_spectrum
	from models.registry import build


	def load_data(data_path: Path, target_len: int):
	    """Load a CSV of spectra and wrap it in a ``TensorDataset``.

	    The CSV is expected to provide a ``spectrum`` column whose cells are
	    whitespace-separated numbers and a ``label`` column of integer class ids.
	    Each spectrum is run through ``preprocess_spectrum`` with
	    ``modality='raman'``.

	    This remains a placeholder loader: storing spectra as text is convenient
	    for a demo, but a binary format (e.g. ``np.load``) is more robust.

	    Args:
	        data_path: Location of the CSV file to read.
	        target_len: Currently unused by this function; kept so existing
	            callers that pass it keep working.

	    Returns:
	        A ``TensorDataset`` of ``(features, labels)``. ``features`` is float32
	        with a singleton dimension inserted at index 1; ``labels`` is int64.

	    Raises:
	        KeyError: If the ``spectrum`` or ``label`` column is missing.
	        ValueError: If a spectrum token cannot be parsed as a float.
	        RuntimeError: If the processed spectra do not all have the same size.
	    """
	    df = pd.read_csv(data_path)

	    processed = []
	    # Iterate the single column that is needed instead of ``iterrows()``,
	    # which materialises a full Series object for every row.
	    for raw_spectrum in tqdm(
	        df['spectrum'], total=len(df), desc=f"Processing {data_path.name}"
	    ):
	        # Tokenise once and reuse the tokens for both the x and y values.
	        tokens = raw_spectrum.split()
	        # Dummy x_values, as preprocess_spectrum primarily uses y_values
	        x_values = range(len(tokens))
	        y_values = [float(token) for token in tokens]
	        _, y_processed = preprocess_spectrum(
	            x_values, y_values, modality='raman')
	        # Convert per row so the final stack is a single contiguous copy;
	        # ``torch.tensor`` on a list of arrays goes element by element.
	        processed.append(torch.as_tensor(y_processed, dtype=torch.float32))

	    if processed:
	        stacked = torch.stack(processed)
	    else:
	        # ``torch.stack`` rejects an empty list; keep the (0, 1) feature
	        # shape that an empty CSV has always produced.
	        stacked = torch.empty(0, dtype=torch.float32)

	    features = stacked.unsqueeze(1)
	    labels = torch.tensor(df['label'].values, dtype=torch.long)

	    return TensorDataset(features, labels)


	def _run_epoch(model, loader, criterion, device, desc, optimizer=None):
	    """Run one pass over ``loader`` and return the mean of the per-batch losses.

	    With an ``optimizer`` the model is switched to training mode and updated
	    after every batch. Without one the model is switched to eval mode and
	    gradient tracking is disabled for the whole pass.
	    """
	    training = optimizer is not None
	    model.train(training)  # train(False) is the same as eval()
	    running_loss = 0.0
	    with torch.set_grad_enabled(training):
	        for features, labels in tqdm(loader, desc=desc):
	            features, labels = features.to(device), labels.to(device)

	            if training:
	                optimizer.zero_grad()
	            loss = criterion(model(features), labels)
	            if training:
	                loss.backward()
	                optimizer.step()
	            running_loss += loss.item()

	    return running_loss / len(loader)


	def train(config: dict, jobs_db: Optional[Dict[str, Any]] = None, job_id: Optional[str] = None):
	    """Train a model as described by ``config`` and track the run in MLflow.

	    Args:
	        config: Training configuration. Keys read here:
	            ``experiment_name``, ``run_name`` (optional, defaults to
	            ``'default_run'``), ``data_dir``, ``train_csv``, ``val_csv``,
	            ``batch_size``, ``model_name`` (passed to ``build``),
	            ``optimizer`` (attribute name looked up on ``torch.optim``),
	            ``learning_rate``, ``loss_function`` (attribute name looked up on
	            ``torch.nn``) and ``epochs``.
	        jobs_db: Optional mapping of job id to a mutable job record. When
	            given together with ``job_id``, the record's ``mlflow_run_id``,
	            ``status``, ``progress``, ``current_epoch``, ``error`` and
	            ``metrics['train_loss']`` / ``metrics['val_loss']`` lists are
	            updated. The ``metrics`` lists must already exist.
	        job_id: Key of the record inside ``jobs_db`` to update.

	    Raises:
	        ValueError: If the training or validation dataset is empty.
	        Exception: Any other failure is printed, recorded on the job record
	            (status ``'FAILED'``) and then re-raised unchanged.
	    """
	    try:
	        # --- MLflow Setup ---
	        mlflow.set_experiment(config['experiment_name'])
	        run_name = config.get('run_name', 'default_run')
	        with mlflow.start_run(run_name=run_name) as run:
	            mlflow.log_params(config)
	            if jobs_db and job_id:
	                jobs_db[job_id]['mlflow_run_id'] = run.info.run_id
	                jobs_db[job_id]['status'] = 'RUNNING'
	            print(f"MLflow Run ID: {run.info.run_id}")

	            # --- Data Loading ---
	            data_dir = Path(config['data_dir'])
	            train_dataset = load_data(data_dir / config['train_csv'], TARGET_LEN)
	            val_dataset = load_data(data_dir / config['val_csv'], TARGET_LEN)

	            # Fail fast: a loader with zero batches would otherwise end in a
	            # ZeroDivisionError when the epoch loss is averaged -- for the
	            # validation set only after a complete training epoch has run.
	            if len(train_dataset) == 0:
	                raise ValueError(
	                    f"Training dataset '{config['train_csv']}' is empty.")
	            if len(val_dataset) == 0:
	                raise ValueError(
	                    f"Validation dataset '{config['val_csv']}' is empty.")

	            train_loader = DataLoader(
	                train_dataset, batch_size=config['batch_size'], shuffle=True)
	            val_loader = DataLoader(val_dataset, batch_size=config['batch_size'])

	            # --- Model, Optimizer, Loss ---
	            device = "cuda" if torch.cuda.is_available() else "cpu"
	            print(f"Using device: {device}")

	            model = build(config['model_name'], TARGET_LEN).to(device)
	            optimizer = getattr(torch.optim, config['optimizer'])(
	                model.parameters(), lr=config['learning_rate'])
	            criterion = getattr(torch.nn, config['loss_function'])()

	            # --- Training Loop ---
	            best_val_loss = float('inf')
	            epochs = config['epochs']
	            for epoch in range(epochs):
	                avg_train_loss = _run_epoch(
	                    model, train_loader, criterion, device,
	                    desc=f"Epoch {epoch + 1}/{epochs} [Train]",
	                    optimizer=optimizer)
	                mlflow.log_metric("train_loss", avg_train_loss, step=epoch)

	                # --- Validation Loop ---
	                avg_val_loss = _run_epoch(
	                    model, val_loader, criterion, device,
	                    desc=f"Epoch {epoch + 1}/{epochs} [Val]")
	                mlflow.log_metric("val_loss", avg_val_loss, step=epoch)
	                print(
	                    f"Epoch {epoch+1}: Train Loss: {avg_train_loss:.4f}, Val Loss: {avg_val_loss:.4f}")

	                # --- Progress Update for Web UI ---
	                if jobs_db and job_id:
	                    jobs_db[job_id]['progress'] = (epoch + 1) / epochs
	                    jobs_db[job_id]['metrics']['train_loss'].append(avg_train_loss)
	                    jobs_db[job_id]['metrics']['val_loss'].append(avg_val_loss)
	                    jobs_db[job_id]['current_epoch'] = epoch + 1

	                # --- Save Best Model ---
	                # Every improvement is logged under the "model" artifact path
	                # and registered as "<run_name>_best".
	                if avg_val_loss < best_val_loss:
	                    best_val_loss = avg_val_loss
	                    mlflow.pytorch.log_model(
	                        model, "model", registered_model_name=f"{run_name}_best")
	                    print(
	                        f"New best model saved at epoch {epoch+1} with validation loss: {best_val_loss:.4f}")

	            if jobs_db and job_id:
	                jobs_db[job_id]['status'] = 'COMPLETED'
	                jobs_db[job_id]['progress'] = 1.0
	            print("✅ Training complete.")

	    except Exception as e:
	        # Top-level boundary: record the failure for the web UI, then let the
	        # original exception propagate to the caller.
	        print(f"❌ Training failed: {e}")
	        if jobs_db and job_id:
	            jobs_db[job_id]['status'] = 'FAILED'
	            jobs_db[job_id]['error'] = str(e)
	        raise


	if __name__ == "__main__":
	    # Command-line entry point: read the YAML file named by --config-path
	    # and hand the parsed configuration to train().
	    cli = argparse.ArgumentParser(description="Train a spectral classification model.")
	    cli.add_argument(
	        "--config-path",
	        required=True,
	        type=Path,
	        help="Path to the YAML configuration file.",
	    )
	    cli_args = cli.parse_args()

	    with cli_args.config_path.open(encoding='utf-8') as config_file:
	        training_config = yaml.safe_load(config_file)

	    # No jobs_db / job_id here: web-specific job tracking is not used from the CLI.
	    train(config=training_config)