"""
Walnut Rancidity Predictor — Training Script
Stacked LSTM with Attention, multi-task outputs.
Memory-efficient: reads CSV in chunks and builds a fixed-size sample.
"""
import math, time, json, gc
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, mean_absolute_error
import joblib
# ── Reproducibility ──────────────────────────────────────────────────────────
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)
# ── Paths ─────────────────────────────────────────────────────────────────────
DATA_PATH = Path("data/walnut_storage_timeseries.csv")
MODEL_DIR = Path("models")
MODEL_PATH = MODEL_DIR / "walnut_rancidity_lstm_attention.pt"
MODEL_DIR.mkdir(parents=True, exist_ok=True)
# ── Hyper-parameters ──────────────────────────────────────────────────────────
FEATURE_COLS = [
"temperature", "humidity", "moisture", "oxygen",
"peroxide_value", "free_fatty_acids", "hexanal_level", "oxidation_index",
]
TARGET_COLS = [
"rancidity_probability",
"shelf_life_remaining_days",
"decay_curve_value",
]
SEQ_LEN = 30
BATCH_SIZE = 64
EPOCHS = 20
LR = 1e-3
HIDDEN = 64
N_LAYERS = 3
DROPOUT = 0.2
MAX_SEQS = 90000 # cap sequences to keep memory manageable
VAL_FRAC = 0.12
TEST_FRAC = 0.13
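SHELF_LIFE_MAX_DAYS = 180.0  # scale used to map shelf life into [0, 1] (see load_sequences)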
LOSS_W = {"bce": 1.0, "mse_shelf": 0.5, "mse_decay": 0.5}
# ── Model ─────────────────────────────────────────────────────────────────────
class Attention(nn.Module):
def __init__(self, hidden_size: int):
super().__init__()
self.attn = nn.Linear(hidden_size, 1)
    def forward(self, lstm_out: torch.Tensor) -> torch.Tensor:
        # lstm_out: (B, T, H) → per-timestep attention scores: (B, T)
        scores = self.attn(lstm_out).squeeze(-1)
        # Softmax over time, then a weighted sum of the hidden states
        weights = torch.softmax(scores, dim=-1)
        context = (weights.unsqueeze(-1) * lstm_out).sum(dim=1)  # (B, H)
        return context
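# Illustrative shape check (not part of training): Attention(HIDDEN) applied to a
# (2, SEQ_LEN, HIDDEN) tensor returns a (2, HIDDEN) context vector — a convex
# combination of the hidden states across the SEQ_LEN timesteps.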
class WalnutLSTMAttention(nn.Module):
def __init__(self, n_features: int, hidden: int, n_layers: int, dropout: float):
super().__init__()
self.lstm = nn.LSTM(
input_size=n_features,
hidden_size=hidden,
num_layers=n_layers,
dropout=dropout if n_layers > 1 else 0.0,
batch_first=True,
)
self.attn = Attention(hidden)
self.dropout = nn.Dropout(dropout)
self.head_rancidity = nn.Sequential(
nn.Linear(hidden, 32), nn.ReLU(),
nn.Linear(32, 1), nn.Sigmoid(),
)
self.head_shelf_life = nn.Sequential(
nn.Linear(hidden, 32), nn.ReLU(),
nn.Linear(32, 1),
)
self.head_decay = nn.Sequential(
nn.Linear(hidden, 32), nn.ReLU(),
nn.Linear(32, 1), nn.Sigmoid(),
)
    def forward(self, x: torch.Tensor):
        # x: (B, SEQ_LEN, n_features)
        lstm_out, _ = self.lstm(x)     # (B, SEQ_LEN, hidden)
        context = self.attn(lstm_out)  # (B, hidden)
        context = self.dropout(context)
        rp = self.head_rancidity(context).squeeze(-1)   # rancidity probability, (B,)
        sl = self.head_shelf_life(context).squeeze(-1)  # scaled shelf life, (B,)
        dc = self.head_decay(context).squeeze(-1)       # decay curve value, (B,)
        return rp, sl, dc
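# Design note: the rancidity and decay heads end in Sigmoid because both targets
# lie in [0, 1]; the shelf-life head stays linear, since its target is scaled to
# [0, 1] in load_sequences and denormalised only when reporting metrics.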
# ── Data loading ──────────────────────────────────────────────────────────────
def load_sequences(max_seqs: int = MAX_SEQS):
"""
Read CSV sequence-by-sequence (groupby seq_id) and extract the last
SEQ_LEN rows as one training window per sequence.
Keeps peak memory low by processing one group at a time.
"""
print(f" Reading {DATA_PATH} …")
df = pd.read_csv(DATA_PATH, dtype={
"sequence_id": np.int32,
"day": np.int16,
"temperature": np.float32,
"humidity": np.float32,
"moisture": np.float32,
"oxygen": np.float32,
"peroxide_value": np.float32,
"free_fatty_acids": np.float32,
"hexanal_level": np.float32,
"oxidation_index": np.float32,
"rancidity_probability": np.float32,
"shelf_life_remaining_days": np.float32,
"decay_curve_value": np.float32,
})
print(f" Loaded {len(df):,} rows, {df['sequence_id'].nunique():,} sequences")
X_list, y_list = [], []
grouped = df.groupby("sequence_id", sort=False)
for seq_id, grp in grouped:
if len(X_list) >= max_seqs:
break
grp = grp.sort_values("day")
feats = grp[FEATURE_COLS].values # (T, 8)
tgts = grp[TARGET_COLS].values # (T, 3)
n = len(feats)
if n < SEQ_LEN:
continue
# One window: last SEQ_LEN timesteps
X_list.append(feats[-SEQ_LEN:])
y_list.append(tgts[-1])
del df
gc.collect()
X = np.stack(X_list, axis=0).astype(np.float32) # (N, SEQ_LEN, 8)
y = np.stack(y_list, axis=0).astype(np.float32) # (N, 3)
    # Scale shelf life into [0, 1] for training (denormalised when computing metrics)
    y[:, 1] /= SHELF_LIFE_MAX_DAYS
print(f" Built {len(X):,} samples shape={X.shape}")
return X, y
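# Quick sanity check (illustrative only, assumes the CSV exists):
#   X, y = load_sequences(max_seqs=100)
#   assert X.shape[1:] == (SEQ_LEN, len(FEATURE_COLS)) and y.shape[1] == len(TARGET_COLS)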
def rmse(pred: np.ndarray, true: np.ndarray) -> float:
return float(np.sqrt(np.mean((pred - true) ** 2)))
# ── Main ──────────────────────────────────────────────────────────────────────
def train():
print("=" * 60)
print("Walnut Rancidity Predictor — Training")
print("=" * 60)
# 1. Load
print("\n[1/5] Loading sequences …")
X, y = load_sequences(MAX_SEQS)
N = len(X)
# 2. Split
print("\n[2/5] Splitting train/val/test …")
rng = np.random.default_rng(SEED)
idx = rng.permutation(N)
    n_test = int(N * TEST_FRAC)
    n_val = int(N * VAL_FRAC)
    te_idx = idx[:n_test]                # TEST_FRAC (13%) of samples
    va_idx = idx[n_test:n_test + n_val]  # VAL_FRAC (12%)
    tr_idx = idx[n_test + n_val:]        # remaining 75% for training
# 3. Scale features
print("\n[3/5] Fitting StandardScaler …")
    scaler = StandardScaler()
    # Fit on the training split only, to avoid leaking val/test statistics
    X_flat = X[tr_idx].reshape(-1, X.shape[-1])
    scaler.fit(X_flat)
joblib.dump(scaler, MODEL_DIR / "feature_scaler.pkl")
def scale_split(indices):
Xs = X[indices].copy()
shape = Xs.shape
Xs = scaler.transform(Xs.reshape(-1, shape[-1])).reshape(shape)
ys = y[indices]
return torch.tensor(Xs, dtype=torch.float32), torch.tensor(ys, dtype=torch.float32)
X_tr, y_tr = scale_split(tr_idx)
X_va, y_va = scale_split(va_idx)
X_te, y_te = scale_split(te_idx)
print(f" Train: {len(X_tr):,} Val: {len(X_va):,} Test: {len(X_te):,}")
del X, y; gc.collect()
tr_loader = DataLoader(TensorDataset(X_tr, y_tr), batch_size=BATCH_SIZE, shuffle=True)
va_loader = DataLoader(TensorDataset(X_va, y_va), batch_size=BATCH_SIZE, shuffle=False)
te_loader = DataLoader(TensorDataset(X_te, y_te), batch_size=BATCH_SIZE, shuffle=False)
# 4. Model
print("\n[4/5] Building model …")
n_feat = len(FEATURE_COLS)
    model = WalnutLSTMAttention(n_feat, HIDDEN, N_LAYERS, DROPOUT)
    device = torch.device("cpu")  # CPU-only training; keep model and tensors on CPU
    model.to(device)
total_params = sum(p.numel() for p in model.parameters())
print(f" Parameters: {total_params:,}")
optimizer = torch.optim.Adam(model.parameters(), lr=LR)
bce_fn = nn.BCELoss()
mse_fn = nn.MSELoss()
best_val_loss = math.inf
history = []
# 5. Training
print(f"\n[5/5] Training for {EPOCHS} epochs …")
for epoch in range(1, EPOCHS + 1):
t0 = time.time()
model.train()
tr_losses = []
for xb, yb in tr_loader:
rp_pred, sl_pred, dc_pred = model(xb)
            # Weighted multi-task loss: BCE on rancidity, MSE on shelf life and decay
            loss = (LOSS_W["bce"] * bce_fn(rp_pred, yb[:, 0])
                    + LOSS_W["mse_shelf"] * mse_fn(sl_pred, yb[:, 1])
                    + LOSS_W["mse_decay"] * mse_fn(dc_pred, yb[:, 2]))
optimizer.zero_grad()
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optimizer.step()
tr_losses.append(loss.item())
model.eval()
val_losses, rp_preds, rp_trues = [], [], []
with torch.no_grad():
for xb, yb in va_loader:
rp_pred, sl_pred, dc_pred = model(xb)
val_loss = (LOSS_W["bce"] * bce_fn(rp_pred, yb[:, 0])
+ LOSS_W["mse_shelf"] * mse_fn(sl_pred, yb[:, 1])
+ LOSS_W["mse_decay"] * mse_fn(dc_pred, yb[:, 2]))
val_losses.append(val_loss.item())
rp_preds.extend(rp_pred.numpy())
rp_trues.extend(yb[:, 0].numpy())
avg_tr = float(np.mean(tr_losses))
avg_val = float(np.mean(val_losses))
        try:
            # Soft targets are thresholded at 0.5; AUC is undefined when only one class appears
            auc = roc_auc_score((np.array(rp_trues) > 0.5).astype(int), rp_preds)
        except Exception:
            auc = float("nan")
elapsed = time.time() - t0
print(f" Epoch {epoch:2d}/{EPOCHS} "
f"train={avg_tr:.4f} val={avg_val:.4f} "
f"AUC={auc:.4f} {elapsed:.1f}s")
history.append({"epoch": epoch, "train_loss": avg_tr,
"val_loss": avg_val, "auc": auc})
if avg_val < best_val_loss:
best_val_loss = avg_val
torch.save({
"epoch": epoch,
"model_state": model.state_dict(),
"optimizer": optimizer.state_dict(),
"val_loss": avg_val,
"config": {
"n_features": n_feat,
"hidden": HIDDEN,
"n_layers": N_LAYERS,
"dropout": DROPOUT,
"seq_len": SEQ_LEN,
},
}, MODEL_PATH)
print(f" ✓ Best model saved (val={avg_val:.4f})")
# Evaluate on test set
print("\nTest evaluation …")
ckpt = torch.load(MODEL_PATH, map_location="cpu")
model.load_state_dict(ckpt["model_state"])
model.eval()
rp_preds, rp_trues = [], []
sl_preds, sl_trues = [], []
dc_preds, dc_trues = [], []
with torch.no_grad():
for xb, yb in te_loader:
rp_p, sl_p, dc_p = model(xb)
rp_preds.extend(rp_p.numpy()); rp_trues.extend(yb[:, 0].numpy())
sl_preds.extend(sl_p.numpy()); sl_trues.extend(yb[:, 1].numpy())
dc_preds.extend(dc_p.numpy()); dc_trues.extend(yb[:, 2].numpy())
rp_arr, rp_t = np.array(rp_preds), np.array(rp_trues)
sl_arr, sl_t = np.array(sl_preds), np.array(sl_trues)
dc_arr, dc_t = np.array(dc_preds), np.array(dc_trues)
try:
test_auc = roc_auc_score((rp_t > 0.5).astype(int), rp_arr)
except Exception:
test_auc = float("nan")
metrics = {
"rancidity_AUC": round(float(test_auc), 4),
"rancidity_MAE": round(float(mean_absolute_error(rp_t, rp_arr)), 4),
"rancidity_RMSE": round(rmse(rp_arr, rp_t), 4),
"shelf_life_MAE_days": round(float(mean_absolute_error(sl_t * 180, sl_arr * 180)), 2),
"shelf_life_RMSE_days": round(rmse(sl_arr * 180, sl_t * 180), 2),
"decay_MAE": round(float(mean_absolute_error(dc_t, dc_arr)), 4),
"decay_RMSE": round(rmse(dc_arr, dc_t), 4),
"best_val_loss": round(best_val_loss, 4),
}
print("\nTest Metrics:")
for k, v in metrics.items():
print(f" {k}: {v}")
with open(MODEL_DIR / "metrics.json", "w") as f:
json.dump(metrics, f, indent=2)
with open(MODEL_DIR / "training_history.json", "w") as f:
json.dump(history, f, indent=2)
print(f"\nModel → {MODEL_PATH}")
print("Training complete.")
return metrics
if __name__ == "__main__":
train()