""" Walnut Rancidity Predictor — Training Script Stacked LSTM with Attention, multi-task outputs. Memory-efficient: reads CSV in chunks and builds a fixed-size sample. """ import os, math, time, json, gc from pathlib import Path import numpy as np import pandas as pd import torch import torch.nn as nn from torch.utils.data import Dataset, DataLoader, TensorDataset from sklearn.preprocessing import StandardScaler from sklearn.metrics import roc_auc_score, mean_absolute_error import joblib # ── Reproducibility ────────────────────────────────────────────────────────── SEED = 42 torch.manual_seed(SEED) np.random.seed(SEED) # ── Paths ───────────────────────────────────────────────────────────────────── DATA_PATH = Path("data/walnut_storage_timeseries.csv") MODEL_DIR = Path("models") MODEL_PATH = MODEL_DIR / "walnut_rancidity_lstm_attention.pt" MODEL_DIR.mkdir(exist_ok=True) # ── Hyper-parameters ────────────────────────────────────────────────────────── FEATURE_COLS = [ "temperature", "humidity", "moisture", "oxygen", "peroxide_value", "free_fatty_acids", "hexanal_level", "oxidation_index", ] TARGET_COLS = [ "rancidity_probability", "shelf_life_remaining_days", "decay_curve_value", ] SEQ_LEN = 30 BATCH_SIZE = 64 EPOCHS = 20 LR = 1e-3 HIDDEN = 64 N_LAYERS = 3 DROPOUT = 0.2 MAX_SEQS = 90000 # cap sequences to keep memory manageable VAL_FRAC = 0.12 TEST_FRAC = 0.13 LOSS_W = {"bce": 1.0, "mse_shelf": 0.5, "mse_decay": 0.5} # ── Model ───────────────────────────────────────────────────────────────────── class Attention(nn.Module): def __init__(self, hidden_size: int): super().__init__() self.attn = nn.Linear(hidden_size, 1) def forward(self, lstm_out: torch.Tensor) -> torch.Tensor: scores = self.attn(lstm_out).squeeze(-1) weights = torch.softmax(scores, dim=-1) context = (weights.unsqueeze(-1) * lstm_out).sum(dim=1) return context class WalnutLSTMAttention(nn.Module): def __init__(self, n_features: int, hidden: int, n_layers: int, dropout: float): super().__init__() self.lstm = nn.LSTM( input_size=n_features, hidden_size=hidden, num_layers=n_layers, dropout=dropout if n_layers > 1 else 0.0, batch_first=True, ) self.attn = Attention(hidden) self.dropout = nn.Dropout(dropout) self.head_rancidity = nn.Sequential( nn.Linear(hidden, 32), nn.ReLU(), nn.Linear(32, 1), nn.Sigmoid(), ) self.head_shelf_life = nn.Sequential( nn.Linear(hidden, 32), nn.ReLU(), nn.Linear(32, 1), ) self.head_decay = nn.Sequential( nn.Linear(hidden, 32), nn.ReLU(), nn.Linear(32, 1), nn.Sigmoid(), ) def forward(self, x: torch.Tensor): lstm_out, _ = self.lstm(x) context = self.attn(lstm_out) context = self.dropout(context) rp = self.head_rancidity(context).squeeze(-1) sl = self.head_shelf_life(context).squeeze(-1) dc = self.head_decay(context).squeeze(-1) return rp, sl, dc # ── Data loading ────────────────────────────────────────────────────────────── def load_sequences(max_seqs: int = MAX_SEQS): """ Read CSV sequence-by-sequence (groupby seq_id) and extract the last SEQ_LEN rows as one training window per sequence. Keeps peak memory low by processing one group at a time. """ print(f" Reading {DATA_PATH} …") df = pd.read_csv(DATA_PATH, dtype={ "sequence_id": np.int32, "day": np.int16, "temperature": np.float32, "humidity": np.float32, "moisture": np.float32, "oxygen": np.float32, "peroxide_value": np.float32, "free_fatty_acids": np.float32, "hexanal_level": np.float32, "oxidation_index": np.float32, "rancidity_probability": np.float32, "shelf_life_remaining_days": np.float32, "decay_curve_value": np.float32, }) print(f" Loaded {len(df):,} rows, {df['sequence_id'].nunique():,} sequences") X_list, y_list = [], [] grouped = df.groupby("sequence_id", sort=False) for seq_id, grp in grouped: if len(X_list) >= max_seqs: break grp = grp.sort_values("day") feats = grp[FEATURE_COLS].values # (T, 8) tgts = grp[TARGET_COLS].values # (T, 3) n = len(feats) if n < SEQ_LEN: continue # One window: last SEQ_LEN timesteps X_list.append(feats[-SEQ_LEN:]) y_list.append(tgts[-1]) del df gc.collect() X = np.stack(X_list, axis=0).astype(np.float32) # (N, SEQ_LEN, 8) y = np.stack(y_list, axis=0).astype(np.float32) # (N, 3) # Normalise shelf life [0,1] for training (denorm in metrics) y[:, 1] /= 180.0 print(f" Built {len(X):,} samples shape={X.shape}") return X, y def rmse(pred: np.ndarray, true: np.ndarray) -> float: return float(np.sqrt(np.mean((pred - true) ** 2))) # ── Main ────────────────────────────────────────────────────────────────────── def train(): print("=" * 60) print("Walnut Rancidity Predictor — Training") print("=" * 60) # 1. Load print("\n[1/5] Loading sequences …") X, y = load_sequences(MAX_SEQS) N = len(X) # 2. Split print("\n[2/5] Splitting train/val/test …") rng = np.random.default_rng(SEED) idx = rng.permutation(N) n_test = int(N * TEST_FRAC) n_val = int(N * VAL_FRAC) te_idx = idx[:n_test] va_idx = idx[n_test:n_test + n_val] tr_idx = idx[n_test + n_val:] # 3. Scale features print("\n[3/5] Fitting StandardScaler …") scaler = StandardScaler() X_flat = X[tr_idx].reshape(-1, X.shape[-1]) scaler.fit(X_flat) joblib.dump(scaler, MODEL_DIR / "feature_scaler.pkl") def scale_split(indices): Xs = X[indices].copy() shape = Xs.shape Xs = scaler.transform(Xs.reshape(-1, shape[-1])).reshape(shape) ys = y[indices] return torch.tensor(Xs, dtype=torch.float32), torch.tensor(ys, dtype=torch.float32) X_tr, y_tr = scale_split(tr_idx) X_va, y_va = scale_split(va_idx) X_te, y_te = scale_split(te_idx) print(f" Train: {len(X_tr):,} Val: {len(X_va):,} Test: {len(X_te):,}") del X, y; gc.collect() tr_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=BATCH_SIZE, shuffle=True) va_loader = DataLoader(TensorDataset(X_va, y_va), batch_size=BATCH_SIZE, shuffle=False) te_loader = DataLoader(TensorDataset(X_te, y_te), batch_size=BATCH_SIZE, shuffle=False) # 4. Model print("\n[4/5] Building model …") n_feat = len(FEATURE_COLS) model = WalnutLSTMAttention(n_feat, HIDDEN, N_LAYERS, DROPOUT) device = torch.device("cpu") total_params = sum(p.numel() for p in model.parameters()) print(f" Parameters: {total_params:,}") optimizer = torch.optim.Adam(model.parameters(), lr=LR) bce_fn = nn.BCELoss() mse_fn = nn.MSELoss() best_val_loss = math.inf history = [] # 5. Training print(f"\n[5/5] Training for {EPOCHS} epochs …") for epoch in range(1, EPOCHS + 1): t0 = time.time() model.train() tr_losses = [] for xb, yb in tr_loader: rp_pred, sl_pred, dc_pred = model(xb) loss = (LOSS_W["bce"] * bce_fn(rp_pred, yb[:, 0]) + LOSS_W["mse_shelf"] * mse_fn(sl_pred, yb[:, 1]) + LOSS_W["mse_decay"] * mse_fn(dc_pred, yb[:, 2])) optimizer.zero_grad() loss.backward() nn.utils.clip_grad_norm_(model.parameters(), 1.0) optimizer.step() tr_losses.append(loss.item()) model.eval() val_losses, rp_preds, rp_trues = [], [], [] with torch.no_grad(): for xb, yb in va_loader: rp_pred, sl_pred, dc_pred = model(xb) val_loss = (LOSS_W["bce"] * bce_fn(rp_pred, yb[:, 0]) + LOSS_W["mse_shelf"] * mse_fn(sl_pred, yb[:, 1]) + LOSS_W["mse_decay"] * mse_fn(dc_pred, yb[:, 2])) val_losses.append(val_loss.item()) rp_preds.extend(rp_pred.numpy()) rp_trues.extend(yb[:, 0].numpy()) avg_tr = float(np.mean(tr_losses)) avg_val = float(np.mean(val_losses)) try: auc = roc_auc_score((np.array(rp_trues) > 0.5).astype(int), rp_preds) except Exception: auc = float("nan") elapsed = time.time() - t0 print(f" Epoch {epoch:2d}/{EPOCHS} " f"train={avg_tr:.4f} val={avg_val:.4f} " f"AUC={auc:.4f} {elapsed:.1f}s") history.append({"epoch": epoch, "train_loss": avg_tr, "val_loss": avg_val, "auc": auc}) if avg_val < best_val_loss: best_val_loss = avg_val torch.save({ "epoch": epoch, "model_state": model.state_dict(), "optimizer": optimizer.state_dict(), "val_loss": avg_val, "config": { "n_features": n_feat, "hidden": HIDDEN, "n_layers": N_LAYERS, "dropout": DROPOUT, "seq_len": SEQ_LEN, }, }, MODEL_PATH) print(f" ✓ Best model saved (val={avg_val:.4f})") # Evaluate on test set print("\nTest evaluation …") ckpt = torch.load(MODEL_PATH, map_location="cpu") model.load_state_dict(ckpt["model_state"]) model.eval() rp_preds, rp_trues = [], [] sl_preds, sl_trues = [], [] dc_preds, dc_trues = [], [] with torch.no_grad(): for xb, yb in te_loader: rp_p, sl_p, dc_p = model(xb) rp_preds.extend(rp_p.numpy()); rp_trues.extend(yb[:, 0].numpy()) sl_preds.extend(sl_p.numpy()); sl_trues.extend(yb[:, 1].numpy()) dc_preds.extend(dc_p.numpy()); dc_trues.extend(yb[:, 2].numpy()) rp_arr, rp_t = np.array(rp_preds), np.array(rp_trues) sl_arr, sl_t = np.array(sl_preds), np.array(sl_trues) dc_arr, dc_t = np.array(dc_preds), np.array(dc_trues) try: test_auc = roc_auc_score((rp_t > 0.5).astype(int), rp_arr) except Exception: test_auc = float("nan") metrics = { "rancidity_AUC": round(float(test_auc), 4), "rancidity_MAE": round(float(mean_absolute_error(rp_t, rp_arr)), 4), "rancidity_RMSE": round(rmse(rp_arr, rp_t), 4), "shelf_life_MAE_days": round(float(mean_absolute_error(sl_t * 180, sl_arr * 180)), 2), "shelf_life_RMSE_days": round(rmse(sl_arr * 180, sl_t * 180), 2), "decay_MAE": round(float(mean_absolute_error(dc_t, dc_arr)), 4), "decay_RMSE": round(rmse(dc_arr, dc_t), 4), "best_val_loss": round(best_val_loss, 4), } print("\nTest Metrics:") for k, v in metrics.items(): print(f" {k}: {v}") with open(MODEL_DIR / "metrics.json", "w") as f: json.dump(metrics, f, indent=2) with open(MODEL_DIR / "training_history.json", "w") as f: json.dump(history, f, indent=2) print(f"\nModel → {MODEL_PATH}") print("Training complete.") return metrics if __name__ == "__main__": train()