| """ |
| Walnut Rancidity Predictor — Training Script |
| Stacked LSTM with Attention, multi-task outputs. |
| Memory-efficient: reads CSV in chunks and builds a fixed-size sample. |
| """ |

import math, time, json, gc
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, mean_absolute_error
import joblib


SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)


DATA_PATH = Path("data/walnut_storage_timeseries.csv")
MODEL_DIR = Path("models")
MODEL_PATH = MODEL_DIR / "walnut_rancidity_lstm_attention.pt"
MODEL_DIR.mkdir(exist_ok=True)
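# Artifacts written by this script: the model checkpoint (MODEL_PATH),
# models/feature_scaler.pkl, models/metrics.json, models/training_history.json.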


FEATURE_COLS = [
    "temperature", "humidity", "moisture", "oxygen",
    "peroxide_value", "free_fatty_acids", "hexanal_level", "oxidation_index",
]
TARGET_COLS = [
    "rancidity_probability",
    "shelf_life_remaining_days",
    "decay_curve_value",
]


SEQ_LEN = 30
BATCH_SIZE = 64
EPOCHS = 20
LR = 1e-3
HIDDEN = 64
N_LAYERS = 3
DROPOUT = 0.2
MAX_SEQS = 90000
VAL_FRAC = 0.12
TEST_FRAC = 0.13


LOSS_W = {"bce": 1.0, "mse_shelf": 0.5, "mse_decay": 0.5}
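# With the weights above, the training loop below optimizes
#   L = 1.0 * BCE(rancidity) + 0.5 * MSE(shelf_life) + 0.5 * MSE(decay)
# per batch; all three heads share the LSTM encoder and attention context.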


class Attention(nn.Module):
    """Attention pooling: collapses per-timestep LSTM outputs into one context vector."""

    def __init__(self, hidden_size: int):
        super().__init__()
        self.attn = nn.Linear(hidden_size, 1)

    def forward(self, lstm_out: torch.Tensor) -> torch.Tensor:
        # lstm_out: (batch, seq_len, hidden)
        scores = self.attn(lstm_out).squeeze(-1)                  # (batch, seq_len)
        weights = torch.softmax(scores, dim=-1)                   # attention over time steps
        context = (weights.unsqueeze(-1) * lstm_out).sum(dim=1)   # (batch, hidden)
        return context
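
# Shape sketch (illustrative only, not executed during training):
#   Attention(HIDDEN)(torch.randn(8, SEQ_LEN, HIDDEN)).shape == (8, HIDDEN)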


class WalnutLSTMAttention(nn.Module):
    def __init__(self, n_features: int, hidden: int, n_layers: int, dropout: float):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden,
            num_layers=n_layers,
            dropout=dropout if n_layers > 1 else 0.0,  # inter-layer dropout needs >1 layer
            batch_first=True,
        )
        self.attn = Attention(hidden)
        self.dropout = nn.Dropout(dropout)

        # Three task heads share the attention context. Rancidity and decay
        # are bounded in [0, 1] (Sigmoid); shelf life is unbounded regression.
        self.head_rancidity = nn.Sequential(
            nn.Linear(hidden, 32), nn.ReLU(),
            nn.Linear(32, 1), nn.Sigmoid(),
        )
        self.head_shelf_life = nn.Sequential(
            nn.Linear(hidden, 32), nn.ReLU(),
            nn.Linear(32, 1),
        )
        self.head_decay = nn.Sequential(
            nn.Linear(hidden, 32), nn.ReLU(),
            nn.Linear(32, 1), nn.Sigmoid(),
        )

    def forward(self, x: torch.Tensor):
        lstm_out, _ = self.lstm(x)       # (batch, seq_len, hidden)
        context = self.attn(lstm_out)    # (batch, hidden)
        context = self.dropout(context)
        rp = self.head_rancidity(context).squeeze(-1)   # rancidity probability
        sl = self.head_shelf_life(context).squeeze(-1)  # normalized shelf life
        dc = self.head_decay(context).squeeze(-1)       # decay curve value
        return rp, sl, dc
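
# Usage sketch (illustrative; mirrors the defaults above):
#   model = WalnutLSTMAttention(len(FEATURE_COLS), HIDDEN, N_LAYERS, DROPOUT)
#   rp, sl, dc = model(torch.randn(4, SEQ_LEN, len(FEATURE_COLS)))  # each: shape (4,)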


def load_sequences(max_seqs: int = MAX_SEQS):
    """
    Read the CSV with compact dtypes, group by sequence_id, and extract the
    last SEQ_LEN rows of each sequence as one training window.
    Processing one group at a time keeps peak memory low.
    """
    print(f" Reading {DATA_PATH} …")
    df = pd.read_csv(DATA_PATH, dtype={
        "sequence_id": np.int32,
        "day": np.int16,
        "temperature": np.float32,
        "humidity": np.float32,
        "moisture": np.float32,
        "oxygen": np.float32,
        "peroxide_value": np.float32,
        "free_fatty_acids": np.float32,
        "hexanal_level": np.float32,
        "oxidation_index": np.float32,
        "rancidity_probability": np.float32,
        "shelf_life_remaining_days": np.float32,
        "decay_curve_value": np.float32,
    })
    print(f" Loaded {len(df):,} rows, {df['sequence_id'].nunique():,} sequences")

    X_list, y_list = [], []
    grouped = df.groupby("sequence_id", sort=False)

    for _, grp in grouped:
        if len(X_list) >= max_seqs:
            break
        grp = grp.sort_values("day")
        feats = grp[FEATURE_COLS].values
        tgts = grp[TARGET_COLS].values

        n = len(feats)
        if n < SEQ_LEN:
            continue  # skip sequences shorter than one full window

        # One sample per sequence: the last SEQ_LEN days of features,
        # with targets taken from the final day.
        X_list.append(feats[-SEQ_LEN:])
        y_list.append(tgts[-1])

    del df
    gc.collect()

    X = np.stack(X_list, axis=0).astype(np.float32)
    y = np.stack(y_list, axis=0).astype(np.float32)

    # Normalize the shelf-life target to [0, 1] using a 180-day horizon;
    # the test metrics undo this scaling to report days.
    y[:, 1] /= 180.0

    print(f" Built {len(X):,} samples shape={X.shape}")
    return X, y
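
# Expected CSV layout (the schema load_sequences assumes): one row per
# sequence-day, with columns sequence_id, day, the eight FEATURE_COLS,
# and the three TARGET_COLS.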


def rmse(pred: np.ndarray, true: np.ndarray) -> float:
    return float(np.sqrt(np.mean((pred - true) ** 2)))


def train():
    print("=" * 60)
    print("Walnut Rancidity Predictor — Training")
    print("=" * 60)

    print("\n[1/5] Loading sequences …")
    X, y = load_sequences(MAX_SEQS)
    N = len(X)

    print("\n[2/5] Splitting train/val/test …")
    # Shuffle once with the global seed, then carve off test and val blocks.
    rng = np.random.default_rng(SEED)
    idx = rng.permutation(N)
    n_test = int(N * TEST_FRAC)
    n_val = int(N * VAL_FRAC)
    te_idx = idx[:n_test]
    va_idx = idx[n_test:n_test + n_val]
    tr_idx = idx[n_test + n_val:]

    print("\n[3/5] Fitting StandardScaler …")
    # Fit the scaler on the training split only so validation/test
    # statistics never leak into preprocessing.
    scaler = StandardScaler()
    X_flat = X[tr_idx].reshape(-1, X.shape[-1])
    scaler.fit(X_flat)
    joblib.dump(scaler, MODEL_DIR / "feature_scaler.pkl")

    def scale_split(indices):
        Xs = X[indices].copy()
        shape = Xs.shape
        Xs = scaler.transform(Xs.reshape(-1, shape[-1])).reshape(shape)
        ys = y[indices]
        return torch.tensor(Xs, dtype=torch.float32), torch.tensor(ys, dtype=torch.float32)

    X_tr, y_tr = scale_split(tr_idx)
    X_va, y_va = scale_split(va_idx)
    X_te, y_te = scale_split(te_idx)
    print(f" Train: {len(X_tr):,} Val: {len(X_va):,} Test: {len(X_te):,}")

    del X, y; gc.collect()

    tr_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=BATCH_SIZE, shuffle=True)
    va_loader = DataLoader(TensorDataset(X_va, y_va), batch_size=BATCH_SIZE, shuffle=False)
    te_loader = DataLoader(TensorDataset(X_te, y_te), batch_size=BATCH_SIZE, shuffle=False)

    print("\n[4/5] Building model …")
    n_feat = len(FEATURE_COLS)
    model = WalnutLSTMAttention(n_feat, HIDDEN, N_LAYERS, DROPOUT)  # CPU-only script
    total_params = sum(p.numel() for p in model.parameters())
    print(f" Parameters: {total_params:,}")

    optimizer = torch.optim.Adam(model.parameters(), lr=LR)
    bce_fn = nn.BCELoss()   # rancidity head already applies Sigmoid
    mse_fn = nn.MSELoss()

    best_val_loss = math.inf
    history = []

    print(f"\n[5/5] Training for {EPOCHS} epochs …")
    for epoch in range(1, EPOCHS + 1):
        t0 = time.time()
        model.train()
        tr_losses = []

        for xb, yb in tr_loader:
            rp_pred, sl_pred, dc_pred = model(xb)
            loss = (LOSS_W["bce"] * bce_fn(rp_pred, yb[:, 0])
                    + LOSS_W["mse_shelf"] * mse_fn(sl_pred, yb[:, 1])
                    + LOSS_W["mse_decay"] * mse_fn(dc_pred, yb[:, 2]))
            optimizer.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)  # cap gradient norm at 1.0
            optimizer.step()
            tr_losses.append(loss.item())

        model.eval()
        val_losses, rp_preds, rp_trues = [], [], []
        with torch.no_grad():
            for xb, yb in va_loader:
                rp_pred, sl_pred, dc_pred = model(xb)
                val_loss = (LOSS_W["bce"] * bce_fn(rp_pred, yb[:, 0])
                            + LOSS_W["mse_shelf"] * mse_fn(sl_pred, yb[:, 1])
                            + LOSS_W["mse_decay"] * mse_fn(dc_pred, yb[:, 2]))
                val_losses.append(val_loss.item())
                rp_preds.extend(rp_pred.numpy())
                rp_trues.extend(yb[:, 0].numpy())

        avg_tr = float(np.mean(tr_losses))
        avg_val = float(np.mean(val_losses))
        try:
            # Binarize the soft rancidity target at 0.5 to compute AUC.
            auc = roc_auc_score((np.array(rp_trues) > 0.5).astype(int), rp_preds)
        except ValueError:  # e.g. only one class present in the validation labels
            auc = float("nan")

        elapsed = time.time() - t0
        print(f" Epoch {epoch:2d}/{EPOCHS} "
              f"train={avg_tr:.4f} val={avg_val:.4f} "
              f"AUC={auc:.4f} {elapsed:.1f}s")

        history.append({"epoch": epoch, "train_loss": avg_tr,
                        "val_loss": avg_val, "auc": auc})

        # Checkpoint on validation improvement; the config dict lets inference
        # code rebuild the architecture without importing this script's globals.
        if avg_val < best_val_loss:
            best_val_loss = avg_val
            torch.save({
                "epoch": epoch,
                "model_state": model.state_dict(),
                "optimizer": optimizer.state_dict(),
                "val_loss": avg_val,
                "config": {
                    "n_features": n_feat,
                    "hidden": HIDDEN,
                    "n_layers": N_LAYERS,
                    "dropout": DROPOUT,
                    "seq_len": SEQ_LEN,
                },
            }, MODEL_PATH)
            print(f" ✓ Best model saved (val={avg_val:.4f})")

    print("\nTest evaluation …")
    ckpt = torch.load(MODEL_PATH, map_location="cpu")
    model.load_state_dict(ckpt["model_state"])
    model.eval()

    rp_preds, rp_trues = [], []
    sl_preds, sl_trues = [], []
    dc_preds, dc_trues = [], []

    with torch.no_grad():
        for xb, yb in te_loader:
            rp_p, sl_p, dc_p = model(xb)
            rp_preds.extend(rp_p.numpy()); rp_trues.extend(yb[:, 0].numpy())
            sl_preds.extend(sl_p.numpy()); sl_trues.extend(yb[:, 1].numpy())
            dc_preds.extend(dc_p.numpy()); dc_trues.extend(yb[:, 2].numpy())

    rp_arr, rp_t = np.array(rp_preds), np.array(rp_trues)
    sl_arr, sl_t = np.array(sl_preds), np.array(sl_trues)
    dc_arr, dc_t = np.array(dc_preds), np.array(dc_trues)

    try:
        test_auc = roc_auc_score((rp_t > 0.5).astype(int), rp_arr)
    except ValueError:
        test_auc = float("nan")

    # Shelf-life metrics are rescaled by the 180-day horizon to read in days.
    metrics = {
        "rancidity_AUC": round(float(test_auc), 4),
        "rancidity_MAE": round(float(mean_absolute_error(rp_t, rp_arr)), 4),
        "rancidity_RMSE": round(rmse(rp_arr, rp_t), 4),
        "shelf_life_MAE_days": round(float(mean_absolute_error(sl_t * 180, sl_arr * 180)), 2),
        "shelf_life_RMSE_days": round(rmse(sl_arr * 180, sl_t * 180), 2),
        "decay_MAE": round(float(mean_absolute_error(dc_t, dc_arr)), 4),
        "decay_RMSE": round(rmse(dc_arr, dc_t), 4),
        "best_val_loss": round(best_val_loss, 4),
    }

    print("\nTest Metrics:")
    for k, v in metrics.items():
        print(f" {k}: {v}")

    with open(MODEL_DIR / "metrics.json", "w") as f:
        json.dump(metrics, f, indent=2)
    with open(MODEL_DIR / "training_history.json", "w") as f:
        json.dump(history, f, indent=2)

    print(f"\nModel → {MODEL_PATH}")
    print("Training complete.")
    return metrics


if __name__ == "__main__":
    train()
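
# Inference sketch (illustrative; rebuilds the model from the artifacts this
# script writes; `raw_window` is a hypothetical (SEQ_LEN, 8) array of
# unscaled feature rows):
#   scaler = joblib.load(MODEL_DIR / "feature_scaler.pkl")
#   ckpt = torch.load(MODEL_PATH, map_location="cpu")
#   cfg = ckpt["config"]
#   model = WalnutLSTMAttention(cfg["n_features"], cfg["hidden"],
#                               cfg["n_layers"], cfg["dropout"])
#   model.load_state_dict(ckpt["model_state"]); model.eval()
#   x = torch.tensor(scaler.transform(raw_window), dtype=torch.float32).unsqueeze(0)
#   rp, sl, dc = model(x)
#   shelf_days = sl.item() * 180.0  # undo the shelf-life normalization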