cla1r3
/

Neural-network

Model card Files Files and versions

xet

Community

cla1r3 commited on Dec 30, 2025

Commit

15a4e7c

verified ·

1 Parent(s): a3d5117

Upload neural_network.py

Browse files

Files changed (1) hide show

neural_network.py +640 -0

neural_network.py ADDED Viewed

	@@ -0,0 +1,640 @@

+# -*- coding: utf-8 -*-
+"""neural network
+Automatically generated by Colab.
+Original file is located at
+    https://colab.research.google.com/drive/13Vym7d6JDkWLa9cv9p8h_amR_3uUnGp9
+"""
+# Cell A: Upload training dataset google sheets (CSV file)
+from google.colab import files
+import pandas as pd
+import io
+uploaded = files.upload()
+# Cell B: Define liability predictor model
+import torch
+import torch.nn as nn
+class LiabilityPredictor(nn.Module):
+    def __init__(
+        self,
+        input_dim: int = 640,
+        output_dim: int = 4,
+        hidden_dims=(128, 64),
+        dropout: float = 0.10,
+        activation: str = "gelu",
+        use_layernorm: bool = True,
+    ):
+        super().__init__()
+# Choose activation function. Converts "gelu" string into actual PyTorch layer.
+        act_layer = {
+            "relu": nn.ReLU,
+            "gelu": nn.GELU,
+            "silu": nn.SiLU,
+        }.get(activation.lower())
+        if act_layer is None:
+            raise ValueError(f"Unknown activation='{activation}'. Use 'relu', 'gelu', or 'silu'.")
+        layers = []
+        if use_layernorm:
+            layers.append(nn.LayerNorm(input_dim))
+        prev = input_dim
+        for h in hidden_dims:
+            layers.append(nn.Linear(prev, h))
+            if use_layernorm:
+                layers.append(nn.LayerNorm(h))
+            layers.append(act_layer())
+            if dropout and dropout > 0:
+                layers.append(nn.Dropout(dropout))
+            prev = h
+        layers.append(nn.Linear(prev, output_dim))
+        self.net = nn.Sequential(*layers)
+        self._init_weights()
+    def _init_weights(self): #Xavier initialisation
+        # Stable init for small-data regression
+        for m in self.modules():
+            if isinstance(m, nn.Linear):
+                nn.init.xavier_uniform_(m.weight)
+                if m.bias is not None:
+                    nn.init.zeros_(m.bias)
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        # Guardrails: ensure correct dtype/shape
+        if x.dim() == 1:
+            x = x.unsqueeze(0)  # (640,) -> (1, 640)
+        if x.dim() != 2:
+            raise ValueError(f"Expected x to have shape (batch, features). Got {tuple(x.shape)}")
+        return self.net(x.float())
+# Cell C: Create dataset
+import torch
+from torch.utils.data import Dataset
+import pandas as pd
+from transformers import AutoModel, AutoTokenizer
+import numpy as np
+MODEL_NAME = "facebook/esm2_t6_8M_UR50D"
+CSV_PATH = "trainingdataset  - Sheet 1.csv"
+df = pd.read_csv(CSV_PATH)
+target_cols = ['polyreactivity', 'hydrophobicity', 'aggregation', 'charge_patch']
+for col in target_cols:
+    df[col] = pd.to_numeric(df[col], errors='coerce')
+df = df.dropna(subset=['VH','VL'] + target_cols).reset_index(drop=True)
+y = df[target_cols].values
+print("Target order:", target_cols)
+print("Rows kept:", len(df))
+# Load ESM-2
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+esm_model = AutoModel.from_pretrained(MODEL_NAME)
+esm_model.eval()
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+esm_model.to(device)
+hidden_size = esm_model.config.hidden_size
+def embed_sequences_meanpool_scoring_style(seqs, batch_size=8):
+    unique_seqs = list(dict.fromkeys(seqs))
+    seq_to_vec = {}
+    for i in range(0, len(unique_seqs), batch_size):
+        batch_seqs = unique_seqs[i:i + batch_size]
+        tokenized = tokenizer(
+            batch_seqs,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+        )
+        tokenized = {k: v.to(device) for k, v in tokenized.items()}
+        with torch.inference_mode():
+            out = esm_model(**tokenized)
+        token_emb = out.last_hidden_state
+        attn = tokenized["attention_mask"].float()
+        pooled = (token_emb * attn.unsqueeze(-1)).sum(dim=1)
+        pooled = pooled / attn.sum(dim=1).clamp(min=1).unsqueeze(-1)
+        pooled = pooled.detach().cpu()
+        for s, v in zip(batch_seqs, pooled):
+            seq_to_vec[s] = v
+    return seq_to_vec
+all_seqs = df["VH"].tolist() + df["VL"].tolist()
+seq_to_vec = embed_sequences_meanpool_scoring_style(all_seqs, batch_size=8)
+X_tensors = []
+for _, row in df.iterrows():
+    vh_vec = seq_to_vec[row["VH"]]
+    vl_vec = seq_to_vec[row["VL"]]
+    assert vh_vec.shape == (hidden_size,), f"VH vec shape {vh_vec.shape} != ({hidden_size},)"
+    assert vl_vec.shape == (hidden_size,), f"VL vec shape {vl_vec.shape} != ({hidden_size},)"
+# Concatenate VH + VL
+    combined_vec = torch.cat([vh_vec, vl_vec], dim=0)  # (640,)
+    X_tensors.append(combined_vec)
+X = torch.stack(X_tensors, dim=0).numpy()
+assert X.shape[1] == 2 * hidden_size, f"Expected {2*hidden_size} features, got {X.shape[1]}"
+assert X.shape[0] == y.shape[0], f"X rows {X.shape[0]} != y rows {y.shape[0]}"
+# Create dataset object
+class AntibodyDataset(Dataset):
+    def __init__(self, X, y):
+        self.X = torch.tensor(X, dtype=torch.float32)
+        self.y = torch.tensor(y, dtype=torch.float32)
+    def __len__(self):
+        return len(self.X)
+    def __getitem__(self, idx):
+        return self.X[idx], self.y[idx]
+dataset = AntibodyDataset(X, y)
+print(
+    f"Dataset created: {len(dataset)} samples | "
+    f"X shape: {X.shape} | y shape: {y.shape}"
+)
+# Double-check
+print("First name:", df["name"].iloc[0] if "name" in df.columns else "(no 'name' column)")
+print("First y row:", y[0])
+# Cell D (REPLACEMENT): Evaluation and training data using five-fold CV
+!pip -q install scikit-learn
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import Dataset, DataLoader
+from sklearn.model_selection import KFold
+# Dataset wrapper (raw y stored; z-scoring is done per fold)
+class AntibodyDatasetRaw(Dataset):
+    def __init__(self, X_np, y_np):
+        self.X = torch.tensor(X_np, dtype=torch.float32)
+        self.y = torch.tensor(y_np, dtype=torch.float32)
+    def __len__(self):
+        return self.X.shape[0]
+    def __getitem__(self, idx):
+        return self.X[idx], self.y[idx]
+def mae_rmse_r2(y_true, y_pred):
+    err = y_pred - y_true
+    mae = np.mean(np.abs(err), axis=0)
+    rmse = np.sqrt(np.mean(err**2, axis=0))
+    ss_res = np.sum((y_true - y_pred)**2, axis=0)
+    ss_tot = np.sum((y_true - np.mean(y_true, axis=0))**2, axis=0) + 1e-12
+    r2 = 1.0 - (ss_res / ss_tot)
+    return mae, rmse, r2
+def train_one_fold(X_train, y_train_raw, X_val, y_val_raw,
+                   hidden_dims=(128,64), dropout=0.10,
+                   batch_size=16, max_epochs=200,
+                   lr=3e-4, weight_decay=1e-4,
+                   patience=12, min_delta=1e-4):
+    # ----- z-score targets using TRAIN only (no leakage) -----
+    y_mean = y_train_raw.mean(axis=0)
+    y_std  = y_train_raw.std(axis=0) + 1e-8
+    y_train_z = (y_train_raw - y_mean) / y_std
+    y_val_z   = (y_val_raw   - y_mean) / y_std
+    train_ds = AntibodyDatasetRaw(X_train, y_train_z)
+    val_ds   = AntibodyDatasetRaw(X_val,   y_val_z)
+    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
+    val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)
+    # ----- model -----
+    model = LiabilityPredictor(
+        input_dim=X_train.shape[1],
+        hidden_dims=hidden_dims,
+        dropout=dropout
+    ).to(device)
+    loss_fn = nn.MSELoss()
+    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
+    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
+        optimizer, mode="min", factor=0.5, patience=3, min_lr=1e-5
+    )
+    best_val = float("inf")
+    best_state = None
+    bad = 0
+    best_ep = 0
+    def epoch_loss(loader, train: bool):
+        model.train() if train else model.eval()
+        total, n = 0.0, 0
+        for xb, yb in loader:
+            xb = xb.to(device)
+            yb = yb.to(device)
+            if train:
+                optimizer.zero_grad()
+            with torch.set_grad_enabled(train):
+                pred = model(xb)
+                loss = loss_fn(pred, yb)
+                if train:
+                    loss.backward()
+                    optimizer.step()
+            bs = xb.size(0)
+            total += loss.item() * bs
+            n += bs
+        return total / max(n, 1)
+    @torch.no_grad()
+    def predict_val_raw():
+        model.eval()
+        preds_z = []
+        for xb, _ in val_loader:
+            xb = xb.to(device)
+            pz = model(xb).cpu().numpy()
+            preds_z.append(pz)
+        preds_z = np.vstack(preds_z)
+        return preds_z * y_std + y_mean
+    # training loop
+    train_loss_hist = []
+    val_loss_hist = []
+    for ep in range(1, max_epochs + 1):
+        tr = epoch_loss(train_loader, True)
+        va = epoch_loss(val_loader, False)
+        train_loss_hist.append(tr)
+        val_loss_hist.append(va)
+        scheduler.step(va)
+        if va < best_val - min_delta:
+            best_val = va
+            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
+            bad = 0
+        else:
+            bad += 1
+            if bad >= patience:
+                break
+    model.load_state_dict(best_state)
+    # Predictions in raw units + metrics
+    y_pred_raw = predict_val_raw()
+    mae, rmse, r2 = mae_rmse_r2(y_val_raw, y_pred_raw)
+    # Baseline: Predict TRAIN mean in raw units
+    base_pred = np.tile(y_mean.reshape(1,-1), (y_val_raw.shape[0], 1))
+    b_mae, b_rmse, b_r2 = mae_rmse_r2(y_val_raw, base_pred)
+    return (mae, rmse, r2), (b_mae, b_rmse, b_r2), (train_loss_hist, val_loss_hist)
+# Run 5-fold CV
+X_np = X.astype(np.float32)
+y_np = y.astype(np.float32)
+kf = KFold(n_splits=5, shuffle=True, random_state=42)
+fold_metrics = []
+fold_baseline = []
+fold_histories = []
+for fold, (tr_idx, va_idx) in enumerate(kf.split(X_np), start=1):
+    X_tr, X_va = X_np[tr_idx], X_np[va_idx]
+    y_tr, y_va = y_np[tr_idx], y_np[va_idx]
+    (mae, rmse, r2), (b_mae, b_rmse, b_r2), (tr_hist, va_hist) = train_one_fold(
+        X_tr, y_tr, X_va, y_va,
+        hidden_dims=(128,64),
+        dropout=0.10,
+        batch_size=16,
+        max_epochs=200,
+        lr=3e-4,
+        weight_decay=1e-4,
+        patience=12
+    )
+    fold_metrics.append((mae, rmse, r2))
+    fold_baseline.append((b_mae, b_rmse, b_r2))
+    fold_histories.append((tr_hist, va_hist))
+    print(f"\nFold {fold}/5")
+    print("  NN MAE :", dict(zip(target_cols, mae)))
+    print("  NN R2  :", dict(zip(target_cols, r2)))
+    print("  BASE MAE:", dict(zip(target_cols, b_mae)))
+    print("  BASE R2 :", dict(zip(target_cols, b_r2)))
+print("\nDone. Run Cell E for plots + summary + final training.")
+# Cell E: Post-CV plots + conclusion stats + Train final deployment model + Save
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import Dataset, DataLoader
+import pandas as pd # Import pandas for nice tables
+# 1) CV summary plots + conclusions
+K = len(fold_metrics)
+T = len(target_cols)
+nn_mae = np.stack([m[0] for m in fold_metrics], axis=0)   # (K,4)
+nn_rmse= np.stack([m[1] for m in fold_metrics], axis=0)
+nn_r2  = np.stack([m[2] for m in fold_metrics], axis=0)
+b_mae  = np.stack([m[0] for m in fold_baseline], axis=0)
+b_rmse = np.stack([m[1] for m in fold_baseline], axis=0)
+b_r2   = np.stack([m[2] for m in fold_baseline], axis=0)
+def mean_std(a):
+    return a.mean(axis=0), a.std(axis=0)
+nn_mae_m, nn_mae_s = mean_std(nn_mae)
+nn_r2_m,  nn_r2_s  = mean_std(nn_r2)
+b_mae_m,  b_mae_s  = mean_std(b_mae)
+b_r2_m,   b_r2_s   = mean_std(b_r2)
+x = np.arange(T)
+w = 0.35
+plt.figure()
+plt.bar(x - w/2, nn_mae_m, yerr=nn_mae_s, width=w, label="NN")
+plt.bar(x + w/2, b_mae_m,  yerr=b_mae_s,  width=w, label="Baseline")
+plt.xticks(x, target_cols, rotation=30, ha="right")
+plt.ylabel("MAE (raw units)")
+plt.title("5-Fold CV: MAE per target (mean ± std)")
+plt.legend()
+plt.show()
+plt.figure()
+plt.bar(x - w/2, nn_r2_m, yerr=nn_r2_s, width=w, label="NN")
+plt.bar(x + w/2, b_r2_m,  yerr=b_r2_s,  width=w, label="Baseline")
+plt.xticks(x, target_cols, rotation=30, ha="right")
+plt.ylabel("R²")
+plt.title("5-Fold CV: R² per target (mean ± std)")
+plt.legend()
+plt.show()
+# Worst-target MAE: because you need all four good
+nn_worst_mae = nn_mae.max(axis=1)
+b_worst_mae  = b_mae.max(axis=1)
+print("Worst-target MAE across folds:")
+worst_mae_df = pd.DataFrame({
+    'Metric': ['NN worst-MAE mean ± std', 'BASE worst-MAE mean ± std'],
+    'Value': [f"{nn_worst_mae.mean():.4f} ± {nn_worst_mae.std():.4f}", f"{b_worst_mae.mean():.4f} ± {b_worst_mae.std():.4f}"]
+})
+display(worst_mae_df)
+print("\nPer-target summary (mean ± std):")
+per_target_summary_data = []
+for i, t in enumerate(target_cols):
+    per_target_summary_data.append({
+        'Target': t,
+        'NN MAE': f"{nn_mae_m[i]:.4f}±{nn_mae_s[i]:.4f}",
+        'NN R2': f"{nn_r2_m[i]:.4f}±{nn_r2_s[i]:.4f}",
+        'BASE MAE': f"{b_mae_m[i]:.4f}±{b_mae_s[i]:.4f}",
+        'BASE R2': f"{b_r2_m[i]:.4f}±{b_r2_s[i]:.4f}"
+    })
+per_target_df = pd.DataFrame(per_target_summary_data)
+display(per_target_df)
+print("\nOverall (mean across targets):")
+overall_summary_data = [
+    {
+        'Model': 'NN',
+        'MAE_mean': f"{nn_mae_m.mean():.4f} ± {nn_mae_s.mean():.4f}",
+        'R2_mean': f"{nn_r2_m.mean():.4f} ± {nn_r2_s.mean():.4f}"
+    },
+    {
+        'Model': 'BASE',
+        'MAE_mean': f"{b_mae_m.mean():.4f} ± {b_mae_s.mean():.4f}",
+        'R2_mean': f"{b_r2_m.mean():.4f} ± {b_r2_s.mean():.4f}"
+    }
+]
+overall_df = pd.DataFrame(overall_summary_data)
+display(overall_df)
+from sklearn.model_selection import train_test_split
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+import torch.nn as nn
+import torch.optim as optim
+from torch.utils.data import Dataset, DataLoader
+import numpy as np
+import matplotlib.pyplot as plt
+# Safety checks
+if "fold_histories" not in globals() or len(fold_histories) == 0:
+    raise ValueError("fold_histories not found or empty. Make sure you appended (tr_hist, va_hist) inside the CV fold loop.")
+# Determine the minimum number of epochs ran across folds (due to early stopping)
+min_len = min(len(tr) for tr, _ in fold_histories)
+print("CV folds:", len(fold_histories))
+print("Min epochs across folds (truncate to this):", min_len)
+print("Epochs per fold:", [len(tr) for tr, _ in fold_histories])
+# Truncate each fold to min_len so curves align by epoch index
+tr_mat = np.array([tr[:min_len] for tr, _ in fold_histories], dtype=np.float32)  # shape: (K, min_len)
+va_mat = np.array([va[:min_len] for _, va in fold_histories], dtype=np.float32)  # shape: (K, min_len)
+# Compute mean ± std across folds for each epoch
+tr_mean = tr_mat.mean(axis=0)
+tr_std  = tr_mat.std(axis=0)
+va_mean = va_mat.mean(axis=0)
+va_std  = va_mat.std(axis=0)
+# Plot mean curves with ±1 std shading
+x = np.arange(1, min_len + 1)
+plt.figure()
+plt.plot(x, tr_mean, label="CV train loss (mean)")
+plt.plot(x, va_mean, label="CV val loss (mean)")
+plt.fill_between(x, tr_mean - tr_std, tr_mean + tr_std, alpha=0.2)
+plt.fill_between(x, va_mean - va_std, va_mean + va_std, alpha=0.2)
+plt.xlabel("Epoch")
+plt.ylabel("MSE in z-space")
+plt.title("5-Fold CV Learning Curves (truncated to min epoch, mean ± std)")
+plt.axhline(1.0, linestyle=":", label="z-space baseline (~1.0)")
+plt.legend()
+plt.show()
+# Train deployable model on ALL data
+X_all = X.astype(np.float32)
+y_all = y.astype(np.float32)
+y_mean_full = y_all.mean(axis=0)
+y_std_full  = y_all.std(axis=0) + 1e-8
+y_z_full = (y_all - y_mean_full) / y_std_full
+class AntibodyDatasetZ(Dataset):
+    def __init__(self, X_np, y_z_np):
+        self.X = torch.tensor(X_np, dtype=torch.float32)
+        self.y = torch.tensor(y_z_np, dtype=torch.float32)
+    def __len__(self):
+        return len(self.X)
+    def __getitem__(self, idx):
+        return self.X[idx], self.y[idx]
+ds_full = AntibodyDatasetZ(X_all, y_z_full)
+loader_full = DataLoader(ds_full, batch_size=16, shuffle=True)
+final_model = LiabilityPredictor(input_dim=640, hidden_dims=(128,64), dropout=0.10).to(device)
+optimizer_final = optim.Adam(final_model.parameters(), lr= 1e-4, weight_decay=1e-4)
+epochs_final = min_len
+loss_hist_full = []
+loss_fn = nn.MSELoss()
+final_model.train()
+for ep in range(1, epochs_final+1):
+    total, n = 0.0, 0
+    for xb, yb in loader_full:
+        xb, yb = xb.to(device), yb.to(device)
+        optimizer_final.zero_grad()
+        pred = final_model(xb)
+        loss = loss_fn(pred, yb)
+        loss.backward()
+        optimizer_final.step()
+        total += loss.item() * xb.size(0)
+        n += xb.size(0)
+    loss_epoch = total / max(n, 1)
+    loss_hist_full.append(loss_epoch)
+    if ep % 10 == 0 or ep == 1:
+        print(f"[FINAL-ALL] Epoch {ep:03d} | train_loss(zMSE) {loss_epoch:.4f}")
+import numpy as np
+def movavg(x, w=7):
+    x = np.array(x)
+    if len(x) < w: return x
+    return np.convolve(x, np.ones(w)/w, mode="valid")
+plt.figure()
+plt.plot(np.arange(1, epochs_final+1), loss_hist_full, label="train loss (all data)")
+plt.xlabel("Epoch")
+plt.ylabel("MSE in z-space")
+plt.title("Deployable Model Training Curve (ALL data)")
+plt.legend()
+plt.show()
+final_artifacts = {
+    "state_dict": final_model.state_dict(),
+    "y_mean": y_mean_full,
+    "y_std": y_std_full,
+    "target_cols": target_cols,
+    "trained_on": "ALL_DATA_FINAL_MODEL_CELL_E",
+    "epochs_final": epochs_final,
+}
+# Cell F: Plot graphs to visualise loss and accuracy
+import numpy as np
+import matplotlib.pyplot as plt
+import torch
+print("y_mean:", y_mean_full)
+print("y_std:", y_std_full)
+final_model.eval()
+y_true_z_list = []
+y_pred_z_list = []
+with torch.no_grad():
+    for xb, yb in loader_full:
+        xb = xb.to(device)
+        pred_z = final_model(xb).cpu().numpy()   # (batch, 4) in z-space
+        y_pred_z_list.append(pred_z)
+        y_true_z_list.append(yb.numpy())             # (batch, 4) in z-space
+y_true_z = np.vstack(y_true_z_list)
+y_pred_z = np.vstack(y_pred_z_list)
+# Unscale HERE
+y_true = y_true_z * y_std_full + y_mean_full
+y_pred = y_pred_z * y_std_full + y_mean_full
+def pearsonr(a, b):
+    a = a - a.mean()
+    b = b - b.mean()
+    return float((a @ b) / (np.sqrt((a @ a) * (b @ b)) + 1e-12))
+def spearmanr(a, b):
+    ra = a.argsort().argsort().astype(float)
+    rb = b.argsort().argsort().astype(float)
+    return pearsonr(ra, rb)
+for j, name in enumerate(target_cols):
+    p = pearsonr(y_true[:, j], y_pred[:, j])
+    s = spearmanr(y_true[:, j], y_pred[:, j])
+    plt.figure()
+    plt.scatter(y_true[:, j], y_pred[:, j])
+    lo = min(y_true[:, j].min(), y_pred[:, j].min())
+    hi = max(y_true[:, j].max(), y_pred[:, j].max())
+    plt.plot([lo, hi], [lo, hi], linestyle="--")
+    plt.xlabel(f"True {name}")
+    plt.ylabel(f"Predicted {name}")
+    plt.title(f"{name} (val)  R={p:.2f}  ρ={s:.2f}")
+    plt.show()
+import torch
+artifact = {
+    "state_dict": final_model.state_dict(),
+    "y_mean": y_mean_full,
+    "y_std": y_std_full,
+    "target_cols": target_cols,
+    "input_dim": 640,
+    "hidden_dims": (128, 64),
+    "dropout": 0.10,
+}
+torch.save(artifact, "liability_predictor.pt")
+print("Saved:", "liability_predictor.pt")
+from google.colab import files
+files.download("liability_predictor.pt")