NeoDiscoveryAdmin committed on
Commit
20eb53e
Β·
1 Parent(s): 797907e

add test model and the train, test files

Browse files
TabM_NEO_training_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd1a0a6368c7837eab7a6ecc41bcf96b9245b7ce1380f738b360acafa2da388a
3
+ size 289388
TabM_NEO_training_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66dc7122b51b67db5eaca4745514d494f70b1e359dcc1bf2aa2ebf11c765a2e9
3
+ size 5572025
TabM_NEO_training_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b4d7648d06b2e5795bd0032f0cba34386e86ee5a1fadd4d05ed9ad0eca9fffe
3
+ size 288915
TabM_NEO_training_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4695839cb1fb6ed3d5acb1f516085eaef3e45e90c16638ffcceb60e61f97f14b
3
+ size 637113
data/tabm_test.tsv CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a30b0d4d859c7a2844539d28ef98c9ac23add6df054cefb242d23b117ea47dc
3
+ size 3686362
data/tabm_train.tsv CHANGED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb0e395c5f8f3af544f9a4896626d5e51212564777a2e751bab6c4296634176a
3
+ size 13137157
run_tabm_hyperopt.sh ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # TabM Hyperparameter Search (sum_exp_rank) + Final Training + Evaluation
4
+ set -e
5
+
6
+ START_TS="$(date '+%F %T')"
7
+ START_EPOCH="$(date +%s)"
8
+ echo "[Start] ${START_TS}"
9
+
10
+ OUT_DIR="tabm_results_hyperopt_parallel"
11
+ mkdir -p "$OUT_DIR"
12
+
13
+ echo "[Hyperopt] Search for TabM hyperparameters (sum_exp_rank) and train the final model..."
14
+ python src/tabm_train.py \
15
+ --data_file data/tabm_train.tsv \
16
+ --model_out "$OUT_DIR/tabm_hyperopt_best.pth" \
17
+ --max_evals 30 \
18
+ --cv_folds 5 \
19
+ --epochs 20 \
20
+ --final_epochs 40 \
21
+ --batch_size 128 \
22
+ --alpha 0.005 \
23
+ --tune_k \
24
+ --device auto \
25
+ --nr_hyperopt_rep 4
26
+
27
+ MODEL_GLOB="$OUT_DIR/tabm_hyperopt_best_rep*.pth"
28
+
29
+ echo "Start evaluating (weighted average of multiple models)..."
30
+
31
+ python src/tabm_eval.py \
32
+ --model_glob "$MODEL_GLOB" \
33
+ --data_file achieve_features_test.tsv \
34
+ --output_file "$OUT_DIR/TabM_NEO_test.txt" \
35
+ --output_xlsx "$OUT_DIR/TabM_NEO_test.xlsx" \
36
+ --tesla_file "$OUT_DIR/TabM_NEO_test_tesla.txt" \
37
+ --tesla_xlsx "$OUT_DIR/TabM_NEO_test_tesla.xlsx" \
38
+ --device auto --batch_size 1024 --skip_no_cd8
39
+
40
+ echo "Evaluation completed!"
41
+
42
+ END_TS="$(date '+%F %T')"
43
+ END_EPOCH="$(date +%s)"
44
+ ELAPSED=$((END_EPOCH - START_EPOCH))
45
+ H=$((ELAPSED/3600))
46
+ M=$(((ELAPSED%3600)/60))
47
+ S=$((ELAPSED%60))
48
+ printf "[End] %s | Total elapsed: %02d:%02d:%02d\n" "$END_TS" "$H" "$M" "$S"
src/tabm_eval.py ADDED
@@ -0,0 +1,382 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import os
6
+ import numpy as np
7
+ import pandas as pd
8
+ import torch
9
+ import tabm
10
+ from sklearn.metrics import precision_recall_curve, auc
11
+
12
def normalize_rt(s: pd.Series) -> pd.Series:
    """Canonicalize response-type labels: cast to str, strip whitespace, uppercase."""
    return s.astype(str).str.strip().str.upper()

def compute_patient_metrics(df_p: pd.DataFrame, y_prob: np.ndarray, alpha: float = 0.005) -> tuple:
    """Rank one patient's peptides by predicted probability and score the ranking.

    Parameters
    ----------
    df_p : pd.DataFrame
        Per-patient rows; must contain 'response_type', 'mutant_seq', 'gene'.
    y_prob : np.ndarray
        Predicted positive-class probability, aligned row-for-row with df_p.
    alpha : float
        Decay rate of the exponential-rank score (previously hard-coded 0.005;
        now a parameter with the same default, so existing callers are unchanged).

    Returns
    -------
    tuple of (sorted predictions, sorted frame, top-20/50/100 correct and
    tested counts, number of immunogenic peptides, positive ranks, exp-rank
    score, and comma-joined rank / mutant-seq / gene strings for CD8 hits).
    """
    X_r = df_p.copy()
    X_r['ML_pred'] = y_prob
    # A peptide counts as immunogenic ("response") when its type is CD8.
    X_r['response'] = (normalize_rt(X_r['response_type']) == 'CD8').astype(int)

    # Best-scoring peptides first; row position == rank (0 = top prediction).
    X_r = X_r.sort_values(by=['ML_pred'], ascending=False).reset_index(drop=True)

    idx_pos = np.where(X_r['response'].to_numpy() == 1)[0]
    idx_tested = np.where(normalize_rt(X_r['response_type']) == 'NEGATIVE')[0]

    def topk_counts(k: int):
        # correct = CD8 hits within the top-k; tested = hits + tested negatives.
        k_eff = min(k, len(X_r))
        nr_correct = int(np.sum(idx_pos < k_eff))
        nr_tested = nr_correct + int(np.sum(idx_tested < k_eff))
        return nr_correct, nr_tested

    nr_correct20, nr_tested20 = topk_counts(20)
    nr_correct50, nr_tested50 = topk_counts(50)
    nr_correct100, nr_tested100 = topk_counts(100)

    nr_immuno = int(np.sum(X_r['response'] == 1))

    # Exponential-rank score: a positive at rank r contributes exp(-alpha*r),
    # so hits near the top of the list contribute close to 1 each.
    score = float(np.sum(np.exp(-alpha * idx_pos)))

    if nr_immuno > 0:
        # np.where already returns ascending indices; argsort kept for safety.
        sort_idx = np.argsort(idx_pos)
        ranks_str = ",".join([f"{int(r+1)}" for r in idx_pos[sort_idx]])
        mut_seqs = X_r.loc[X_r['response'] == 1, 'mutant_seq'].to_numpy()
        mut_seqs_str = ",".join([str(s) for s in mut_seqs[sort_idx]])
        genes = X_r.loc[X_r['response'] == 1, 'gene'].to_numpy()
        genes_str = ",".join([str(g) for g in genes[sort_idx]])
    else:
        ranks_str = ""
        mut_seqs_str = ""
        genes_str = ""

    return (X_r['ML_pred'].to_numpy(), X_r,
            nr_correct20, nr_tested20,
            nr_correct50, nr_tested50,
            nr_correct100, nr_tested100,
            nr_immuno, idx_pos, score,
            ranks_str, mut_seqs_str, genes_str)
60
+
61
+
62
def predict_in_batches(model, X_all, device, batch_size=1024):
    """Run the TabM ensemble over X_all in fixed-size chunks.

    Averages the k per-ensemble predictions, applies a 2-class softmax, and
    returns the positive-class probabilities as a 1-D numpy array.
    """
    model.eval()
    chunks = []

    with torch.inference_mode():
        for start in range(0, len(X_all), batch_size):
            stop = min(start + batch_size, len(X_all))
            xb = X_all[start:stop].to(device)

            # (B, k, 2) -> mean over the k heads -> softmax -> P(class 1)
            logits = model(xb).mean(1)
            pos = torch.softmax(logits, dim=1)[:, 1]

            chunks.append(pos.cpu())

            # Free the device-side tensors promptly to cap GPU memory use.
            del xb, logits, pos
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    return torch.cat(chunks, dim=0).numpy()
81
+
82
def main():
    """CLI entry point: score a TSV of peptides with one or more TabM
    checkpoints (equal-weight probability average), then write per-patient
    ranking metrics and optional TESLA scores (TTIF/FR/AUPRC)."""

    ap = argparse.ArgumentParser(description="TabM model evaluation, output format consistent with TestVotingClassifier")
    ap.add_argument("--model_file", type=str, required=False, help="TabM model file, e.g. tabm_results/tabm_model.pth (mutually exclusive with --model_files/--model_glob, choose one of three)")
    ap.add_argument("--model_files", type=str, nargs='*', default=None, help="Multiple model files for equal-weighted average prediction")
    ap.add_argument("--model_glob", type=str, default=None, help="Use wildcards to match multiple model files (e.g. tabm_results/tabm_hyperopt_best_rep*.pth)")
    ap.add_argument("--data_file", type=str, required=True, help="Input TSV: TestVoting_selection_neopep.tsv")
    ap.add_argument("--output_file", type=str, required=True, help="Main result output file (header consistent with original)")
    ap.add_argument("--tesla_file", type=str, default=None, help="TESLA score output file (for neopep task)")
    ap.add_argument("--output_xlsx", type=str, default=None, help="Main result Excel output path (optional)")
    ap.add_argument("--tesla_xlsx", type=str, default=None, help="TESLA result Excel output path (optional)")
    ap.add_argument("--dataset_name", type=str, default=None, help="If no dataset column exists, use this value as Dataset column in TESLA")
    ap.add_argument("--skip_no_cd8", action="store_true", help="Skip patients without CD8")
    ap.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"],
                    help="Device selection: auto/cuda/cpu")
    ap.add_argument("--batch_size", type=int, default=1024,
                    help="Batch size to avoid GPU memory overflow (default 1024)")
    args = ap.parse_args()

    # device selection
    if args.device == "auto":
        if torch.cuda.is_available():
            device = torch.device('cuda:0')
            print(f"πŸš€ Auto-selected GPU: {torch.cuda.get_device_name(0)}")
            print(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
        else:
            device = torch.device('cpu')
            print("⚠️ No GPU detected, using CPU")
    elif args.device == "cuda":
        if torch.cuda.is_available():
            device = torch.device('cuda:0')
            print(f"πŸš€ Force using GPU: {torch.cuda.get_device_name(0)}")
        else:
            raise RuntimeError("CUDA specified but no GPU detected")
    else:
        device = torch.device('cpu')
        print("️ Using CPU")

    print(f" Batch size: {args.batch_size}")

    # Read data
    df = pd.read_csv(args.data_file, sep="\t", header=0, low_memory=False)
    print(f"πŸ“ˆ Data shape: {df.shape}")

    # Required columns check
    required_cols = ["patient", "response_type", "gene", "mutant_seq"]
    for c in required_cols:
        if c not in df.columns:
            raise KeyError(f"Missing required column: {c}")

    # Feature columns = all columns except metadata columns
    feature_cols = [c for c in df.columns if c not in required_cols]
    # Dynamically read numeric features (no fixed column count processing)
    X_all = df[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0).to_numpy()
    print(f" Number of features: {X_all.shape[1]}")

    # model files parsing: explicit list > glob > single file; at least one required.
    import glob as _glob
    model_paths: list[str] = []
    if args.model_files:
        model_paths.extend(list(args.model_files))
    if args.model_glob:
        model_paths.extend(sorted(_glob.glob(args.model_glob)))
    if not model_paths and args.model_file:
        model_paths = [args.model_file]
    if not model_paths:
        raise FileNotFoundError("No model files found, please check!")

    # weights_only=False: the checkpoint stores an argparse.Namespace under 'args'.
    first_ckpt = torch.load(model_paths[0], map_location='cpu', weights_only=False)
    model_args = first_ckpt['args']

    def _predict_with_model(model_path: str, X_all_np: np.ndarray) -> np.ndarray:
        # Rebuild one checkpoint's TabM from its saved hyperparameters and
        # return positive-class probabilities for every row of X_all_np.
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Model file not existed: {model_path}")
        ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
        m_args = ckpt['args']
        X_np = X_all_np
        # Restrict to the feature subset the model was trained on (best-effort;
        # out-of-range indices are dropped rather than raising).
        if ckpt.get("used_feature_idx") is not None:
            try:
                ufi = ckpt["used_feature_idx"]
                import numpy as _np
                ufi_arr = _np.array(ufi, dtype=int)
                max_idx = X_np.shape[1] - 1
                ufi_arr = ufi_arr[(ufi_arr >= 0) & (ufi_arr <= max_idx)]
                if len(ufi_arr) > 0:
                    X_np = X_np[:, ufi_arr]
            except Exception:
                pass
        X_tensor_cpu = torch.as_tensor(X_np, dtype=torch.float32)
        # Recreate the numeric-feature embedding module, mirroring training.
        num_embeddings = None
        if getattr(m_args, 'use_embeddings', False):
            if m_args.embedding_type == 'linear':
                import rtdl_num_embeddings
                num_embeddings = rtdl_num_embeddings.LinearReLUEmbeddings(X_tensor_cpu.shape[1])
            elif m_args.embedding_type == 'periodic':
                import rtdl_num_embeddings
                num_embeddings = rtdl_num_embeddings.PeriodicEmbeddings(X_tensor_cpu.shape[1], lite=False)
            elif m_args.embedding_type == 'piecewise':
                import rtdl_num_embeddings
                # NOTE(review): bins are recomputed on the *test* features here,
                # not restored from training — confirm this matches training-time bins.
                num_embeddings = rtdl_num_embeddings.PiecewiseLinearEmbeddings(
                    rtdl_num_embeddings.compute_bins(X_tensor_cpu, n_bins=48),
                    d_embedding=16,
                    activation=False,
                    version='B',
                )
        model = tabm.TabM.make(
            n_num_features=X_tensor_cpu.shape[1],
            cat_cardinalities=[],
            d_out=2,
            k=m_args.k,
            n_blocks=m_args.n_blocks,
            d_block=m_args.d_block,
            num_embeddings=num_embeddings,
            arch_type=getattr(m_args, 'arch_type', 'tabm'),
        )
        model.load_state_dict(ckpt['model_state_dict'])
        model.to(device)
        model.eval()
        bs = max(256, args.batch_size)
        probs_list = []
        n = len(X_tensor_cpu)
        with torch.inference_mode():
            for i in range(0, n, bs):
                j = min(i + bs, n)
                xb = X_tensor_cpu[i:j].to(device)
                # Average the k ensemble heads, then softmax over the 2 classes.
                logits = model(xb).mean(1)
                probs = torch.softmax(logits, dim=1)[:, 1].detach().cpu().numpy()
                probs_list.append(probs)
                del xb, logits
                if torch.cuda.is_available() and device.type == 'cuda':
                    torch.cuda.empty_cache()
                if (i // bs) % 50 == 0:
                    print(f" batch {i//bs}/{(n+bs-1)//bs}")
        return np.concatenate(probs_list, axis=0)

    def _stringify(v):
        # repr first, str as fallback; never raise while logging.
        try:
            return repr(v)
        except Exception:
            try:
                return str(v)
            except Exception:
                return "<unprintable>"

    print("===== Saved Hyperparameters from checkpoint['args'] =====")
    if hasattr(model_args, "__dict__"):
        hp_items = sorted(vars(model_args).items())
    elif isinstance(model_args, dict):
        hp_items = sorted(model_args.items())
    else:
        try:
            hp_items = sorted(model_args.__dict__.items())
        except Exception:
            hp_items = []
            print("⚠️ Unable to enumerate contents of model_args")
    for key, val in hp_items:
        print(f"- {key}: {_stringify(val)}")
    print("=========================================================")

    def _p_dict(title, d):
        # Print a titled key/value section; best-effort, never raises.
        try:
            print(title)
            for k in sorted(d.keys()):
                try:
                    print(f"- {k}: {repr(d[k])}")
                except Exception:
                    print(f"- {k}: <unprintable>")
            print("=" * len(title))
        except Exception:
            pass

    if isinstance(first_ckpt.get("training_args"), dict):
        _p_dict("===== checkpoint['training_args'] =====", first_ckpt["training_args"])

    if isinstance(first_ckpt.get("best_params"), dict):
        _p_dict("===== checkpoint['best_params'] =====", first_ckpt["best_params"])

    if isinstance(first_ckpt.get("full_args"), dict):
        _p_dict("===== checkpoint['full_args'] =====", first_ckpt["full_args"])

    if first_ckpt.get("used_feature_idx") is not None:
        try:
            ufi = first_ckpt["used_feature_idx"]
            print("===== used_feature_idx =====")
            print(f"- length: {len(ufi)}")
            print(f"- head: {list(ufi[:10])}")
            print("=" * 25)
        except Exception:
            print("===== used_feature_idx =====\n<unprintable>\n============================")

    # Environment diagnostics (versions, device) — informational only.
    try:
        print("===== Environment =====")
        print(f"- torch: {torch.__version__}")
        print(f"- cuda available: {torch.cuda.is_available()}")
        if torch.cuda.is_available():
            print(f"- device: {torch.cuda.get_device_name(0)}")
            print(f"- cuda version: {torch.version.cuda}")
        import tabm as _tabm_mod
        print(f"- tabm: {getattr(_tabm_mod, '__version__', 'unknown')}")
        print("========================")
    except Exception:
        pass

    # Equal-weight ensemble: average the per-model probabilities.
    n_models = len(model_paths)
    print(f"πŸ”— Loading {n_models} models for equal-weighted average prediction...")
    y_prob_all = None
    for mp in model_paths:
        print(f" -> {mp}")
        probs = _predict_with_model(mp, X_all)
        if y_prob_all is None:
            y_prob_all = probs.astype(np.float64)
        else:
            y_prob_all += probs
    y_prob_all = (y_prob_all / float(n_models)).astype(np.float64)

    print(f"βœ… Prediction completed, total {len(y_prob_all)} samples; number of models={n_models}")

    rows_main = []
    rows_tesla = []

    # Append mode: the header is only written when the file is new/empty,
    # so repeated runs accumulate rows in one results file.
    need_header = (not os.path.exists(args.output_file)) or (os.path.getsize(args.output_file) == 0)
    with open(args.output_file, "a", encoding="utf-8") as f:
        if need_header:
            f.write("Patient\tNr_correct_top20\tNr_tested_top20\tNr_correct_top50\tNr_tested_top50\t"
                    "Nr_correct_top100\tNr_tested_top100\tNr_immunogenic\tNr_peptides\tClf_score\t"
                    "CD8_ranks\tCD8_mut_seqs\tCD8_genes\n")

        for patient, df_p in df.groupby("patient", sort=False):
            has_cd8 = (normalize_rt(df_p["response_type"]) == "CD8").any()
            if args.skip_no_cd8 and not has_cd8:
                continue

            # df has a fresh RangeIndex, so df_p.index positions index y_prob_all directly.
            idx = df_p.index.to_numpy()
            y_prob = y_prob_all[idx]

            (y_pred_sorted, X_sorted,
             nr_correct20, nr_tested20,
             nr_correct50, nr_tested50,
             nr_correct100, nr_tested100,
             nr_immuno, r, score,
             ranks_str, mut_seqs_str, genes_str) = compute_patient_metrics(df_p, y_prob)

            f.write(f"{patient}\t{nr_correct20}\t{nr_tested20}\t{nr_correct50}\t{nr_tested50}\t"
                    f"{nr_correct100}\t{nr_tested100}\t{nr_immuno}\t{len(df_p)}\t{score:.6f}\t"
                    f"{ranks_str}\t{mut_seqs_str}\t{genes_str}\n")

            rows_main.append({
                "Patient": patient,
                "Nr_correct_top20": nr_correct20,
                "Nr_tested_top20": nr_tested20,
                "Nr_correct_top50": nr_correct50,
                "Nr_tested_top50": nr_tested50,
                "Nr_correct_top100": nr_correct100,
                "Nr_tested_top100": nr_tested100,
                "Nr_immunogenic": nr_immuno,
                "Nr_peptides": len(df_p),
                "Clf_score": score,
                "CD8_ranks": ranks_str,
                "CD8_mut_seqs": mut_seqs_str,
                "CD8_genes": genes_str,
            })

            # TESLA-style metrics: TTIF (top-20 precision among tested),
            # FR (fraction of immunogenic recovered in top-100), and AUPRC
            # computed over tested peptides only.
            if args.tesla_file or args.tesla_xlsx:
                if "dataset" in df_p.columns:
                    dataset_val = str(df_p["dataset"].iloc[0])
                else:
                    dataset_val = args.dataset_name if args.dataset_name is not None else ""
                idx_nt = X_sorted['response_type'].astype(str) != 'not_tested'
                y_pred_tesla = pd.Series(y_pred_sorted)[idx_nt].to_numpy()
                y_tesla = X_sorted.loc[idx_nt, 'response'].to_numpy()
                ttif = (nr_correct20 / nr_tested20) if nr_tested20 > 0 else 0.0
                fr = (nr_correct100 / nr_immuno) if nr_immuno > 0 else 0.0
                precision, recall, _ = precision_recall_curve(y_tesla, y_pred_tesla)
                auprc = auc(recall, precision)

                if args.tesla_file:
                    new_tesla = (not os.path.exists(args.tesla_file)) or (os.path.getsize(args.tesla_file) == 0)
                    with open(args.tesla_file, "a", encoding="utf-8") as tf:
                        if new_tesla:
                            tf.write("Dataset\tPatient\tTTIF\tFR\tAUPRC\n")
                        tf.write(f"{dataset_val}\t{patient}\t{ttif:.3f}\t{fr:.3f}\t{auprc:.3f}\n")

                rows_tesla.append({
                    "Dataset": dataset_val,
                    "Patient": patient,
                    "TTIF": ttif,
                    "FR": fr,
                    "AUPRC": auprc,
                })

    # Optional Excel mirrors of the text outputs.
    if args.output_xlsx and rows_main:
        os.makedirs(os.path.dirname(args.output_xlsx) or '.', exist_ok=True)
        pd.DataFrame(rows_main).to_excel(args.output_xlsx, index=False)
    if args.tesla_xlsx and rows_tesla:
        os.makedirs(os.path.dirname(args.tesla_xlsx) or '.', exist_ok=True)
        pd.DataFrame(rows_tesla).to_excel(args.tesla_xlsx, index=False)

    print(f" Evaluation completed! Processed {len(rows_main)} patients")

if __name__ == "__main__":
    main()
src/tabm_train.py ADDED
@@ -0,0 +1,487 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import os
6
+ import random
7
+ from copy import deepcopy
8
+ from typing import Any, Dict
9
+
10
+ import numpy as np
11
+ import pandas as pd
12
+ from hyperopt import fmin, tpe, hp, Trials, STATUS_OK
13
+ from hyperopt.pyll.base import scope
14
+ from sklearn.model_selection import StratifiedKFold
15
+
16
+ import torch
17
+ import torch.nn as nn
18
+ import torch.optim
19
+ from torch import Tensor
20
+
21
+ import tabm
22
+ import rtdl_num_embeddings
23
+
24
def set_seed(seed: int) -> None:
    """Seed the torch, NumPy and Python RNGs from one base value.

    The +1/+2 offsets keep the three generators from sharing an identical seed.
    """
    torch.manual_seed(seed + 2)
    np.random.seed(seed + 1)
    random.seed(seed)
28
+
29
def _dump_model_info_sidecar(model_path: str) -> None:
    """Write a human-readable ``<model>.info.txt`` next to a saved checkpoint.

    Dumps the checkpoint's hyperparameters, training args, selected feature
    indices, and the runtime environment. Deliberately best-effort: every
    stage is wrapped in try/except so a malformed checkpoint or unwritable
    sidecar never aborts the surrounding training run.
    """
    try:
        if not os.path.exists(model_path):
            return
        # weights_only=False: the checkpoint stores an argparse.Namespace.
        ckpt = torch.load(model_path, map_location='cpu', weights_only=False)
        sidecar = os.path.splitext(model_path)[0] + ".info.txt"
        with open(sidecar, "w", encoding="utf-8") as f:
            def _p(title: str, d):
                # Write one titled section; handles Namespace-like objects and dicts.
                try:
                    f.write(title + "\n")
                    if hasattr(d, "__dict__"):
                        items = sorted(vars(d).items())
                    elif isinstance(d, dict):
                        items = sorted(d.items())
                    else:
                        try:
                            items = sorted(d.__dict__.items())
                        except Exception:
                            items = []
                    for k, v in items:
                        try:
                            f.write(f"- {k}: {repr(v)}\n")
                        except Exception:
                            f.write(f"- {k}: <unprintable>\n")
                    f.write("=" * len(title) + "\n")
                except Exception:
                    pass

            _p("===== checkpoint['args'] =====", ckpt.get('args'))
            _p("===== checkpoint['training_args'] =====", ckpt.get('training_args', {}))
            _p("===== checkpoint['best_params'] =====", ckpt.get('best_params', {}))
            _p("===== checkpoint['full_args'] =====", ckpt.get('full_args', {}))

            if ckpt.get("used_feature_idx") is not None:
                ufi = ckpt["used_feature_idx"]
                f.write("===== used_feature_idx =====\n")
                try:
                    f.write(f"- length: {len(ufi)}\n")
                    f.write(f"- head: {list(ufi[:10])}\n")
                except Exception:
                    f.write("<unprintable>\n")
                f.write("=" * 25 + "\n")

            # ENVs Info
            try:
                f.write("===== Environment =====\n")
                f.write(f"- torch: {torch.__version__}\n")
                f.write(f"- cuda available: {torch.cuda.is_available()}\n")
                if torch.cuda.is_available():
                    f.write(f"- device: {torch.cuda.get_device_name(0)}\n")
                    f.write(f"- cuda version: {torch.version.cuda}\n")
                import tabm as _tabm_mod
                f.write(f"- tabm: {getattr(_tabm_mod, '__version__', 'unknown')}\n")
                f.write("========================\n")
            except Exception:
                pass
    except Exception:
        pass
87
def load_training_data(data_file: str) -> tuple[np.ndarray, np.ndarray]:
    """Load a TSV training table into (X, y).

    The label column is the one named 'label' when present, otherwise the
    first column; every other column is a numeric feature. Everything is read
    as strings and coerced, so non-numeric cells become 0 (labels) / 0.0
    (features) instead of raising.
    """
    frame = pd.read_csv(
        data_file,
        sep='\t',
        header=0,
        dtype=str,
        keep_default_na=False,
        na_filter=False,
        engine='python',
    )

    if frame.shape[0] == 0 or frame.shape[1] < 2:
        raise ValueError(
            f"Incorrect training data format: {data_file}, requires at least 1 label column + 1 feature column, actual shape={frame.shape}"
        )

    # Prefer an explicit 'label' column; fall back to the first column.
    label_col = 'label' if 'label' in frame.columns else frame.columns[0]

    # Labels as int64; unparseable entries become 0.
    y = pd.to_numeric(frame[label_col], errors='coerce').fillna(0).astype(np.int64).to_numpy()

    feature_cols = [c for c in frame.columns if c != label_col]
    if not feature_cols:
        raise ValueError("No feature columns found")

    # Features as float32; unparseable entries become 0.0.
    X = frame[feature_cols].apply(pd.to_numeric, errors='coerce').fillna(0.0).to_numpy(dtype=np.float32)

    return X, y
120
+
121
+ def build_num_embeddings(embedding_type: str, X_fold: np.ndarray) -> tuple[Any, np.ndarray]:
122
+ used_idx = np.arange(X_fold.shape[1])
123
+ if embedding_type == 'piecewise':
124
+ var = X_fold.var(axis=0)
125
+ used_idx = np.where(var > 0.0)[0]
126
+ X_fold = X_fold[:, used_idx]
127
+ if len(used_idx) < 1:
128
+ return None, used_idx
129
+ try:
130
+ X_tensor = torch.as_tensor(X_fold, dtype=torch.float32)
131
+ num_embeddings = rtdl_num_embeddings.PiecewiseLinearEmbeddings(
132
+ rtdl_num_embeddings.compute_bins(X_tensor, n_bins=48),
133
+ d_embedding=16,
134
+ activation=False,
135
+ version='B',
136
+ )
137
+ return num_embeddings, used_idx
138
+ except Exception:
139
+ return None, used_idx
140
+ elif embedding_type == 'linear':
141
+ return rtdl_num_embeddings.LinearReLUEmbeddings(X_fold.shape[1]), used_idx
142
+ elif embedding_type == 'periodic':
143
+ return rtdl_num_embeddings.PeriodicEmbeddings(X_fold.shape[1], lite=False), used_idx
144
+ else:
145
+ return None, used_idx
146
+
147
def make_model(n_features: int,
               k: int,
               n_blocks: int,
               d_block: int,
               num_embeddings: Any,
               arch_type: str = 'tabm') -> nn.Module:
    """Build a binary-classification TabM (d_out=2, no categorical features).

    ``k`` is the ensemble size, ``n_blocks``/``d_block`` size the MLP backbone,
    and ``num_embeddings`` is an optional numeric-feature embedding module
    (e.g. from build_num_embeddings) or None.
    """
    return tabm.TabM.make(
        n_num_features=n_features,
        cat_cardinalities=[],
        d_out=2,
        k=k,
        n_blocks=n_blocks,
        d_block=d_block,
        num_embeddings=num_embeddings,
        arch_type=arch_type,
    )
163
+
164
def train_one_epoch(model: nn.Module,
                    X: torch.Tensor,
                    y: torch.Tensor,
                    optimizer: torch.optim.Optimizer,
                    batch_size: int,
                    device: torch.device) -> float:
    """Run one shuffled pass over (X, y) and return the mean per-batch loss.

    TabM emits k predictions per sample; with shared training batches the
    targets are repeated k times so cross-entropy covers every ensemble head.
    Gradients are clipped to unit norm before each optimizer step.
    """
    model.train()
    perm = torch.randperm(len(X), device=device)
    mini_batches = perm.split(batch_size)
    share_training_batches = True

    def _ensemble_ce(pred: Tensor, target: Tensor) -> Tensor:
        # (B, k, 2) -> (B*k, 2)
        flat_pred = pred.flatten(0, 1)
        if share_training_batches:
            flat_target = target.repeat_interleave(model.backbone.k)
        else:
            flat_target = target.flatten(0, 1)
        return nn.functional.cross_entropy(flat_pred, flat_target)

    running = 0.0
    for sel in mini_batches:
        optimizer.zero_grad()
        out = model(X[sel])
        loss = _ensemble_ce(out, y[sel])
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        running += float(loss.detach().cpu())
    return running / max(1, len(mini_batches))
194
+
195
def sum_rank_correct_numpy(y_true: np.ndarray, y_prob: np.ndarray, alpha: float = 0.005) -> float:
    """Exponential-rank score: sum of exp(-alpha * rank) over true positives.

    Samples are ranked by descending predicted probability (rank 0 = best),
    so positives near the top contribute close to 1 and deep ones near 0.
    """
    order = np.argsort(-y_prob)
    pos_ranks = np.flatnonzero(y_true[order] == 1)
    return float(np.exp(-alpha * pos_ranks).sum())

@torch.inference_mode()
def evaluate_sum_exp_rank(model: nn.Module, X: torch.Tensor, y: torch.Tensor, device: torch.device, alpha: float = 0.005) -> float:
    """Score a model on (X, y) with the exponential-rank metric (higher is better).

    Inference runs in chunks; the k ensemble logits are averaged before the
    2-class softmax, and P(class 1) feeds the ranking.
    """
    model.eval()
    chunk = 8096
    pieces = []
    for sel in torch.arange(len(X), device=device).split(chunk):
        pieces.append(model(X[sel]).mean(1))
    logits = torch.cat(pieces)
    probs_pos = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
    return sum_rank_correct_numpy(y.cpu().numpy(), probs_pos, alpha)
212
+
213
+
214
def objective(params: Dict[str, Any],
              X: np.ndarray,
              y: np.ndarray,
              device: torch.device,
              seed: int,
              cv_folds: int,
              epochs: int,
              batch_size: int,
              alpha: float = 0.005) -> Dict[str, Any]:
    """Hyperopt objective: mean sum-exp-rank score over stratified K-fold CV.

    ``loss`` is the negated mean score because hyperopt minimizes while the
    metric is higher-is-better.
    """
    # Unpack the sampled hyperparameters.
    k = int(params.get('k', 32))
    n_blocks = int(params['n_blocks'])
    d_block = int(params['d_block'])
    lr = float(params['lr'])
    wd_choice = params['weight_decay_choice']  # 0 or sampled
    weight_decay = 0.0 if wd_choice == 0 else float(params['weight_decay_val'])
    embedding_type = params['embedding_type']  # 'none'/'linear'/'periodic'/'piecewise'
    arch_type = params['arch_type']  # 'tabm'/'tabm-mini'

    cv = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=seed)
    ap_scores: list[float] = []

    for train_idx, val_idx in cv.split(X, y):
        X_tr = X[train_idx]
        y_tr = y[train_idx]
        X_va = X[val_idx]
        y_va = y[val_idx]

        # Embeddings are fitted on the fold's training split only; 'piecewise'
        # may drop zero-variance columns (used_idx reflects the kept subset).
        num_embeddings, used_idx = build_num_embeddings(embedding_type, X_tr)
        # NOTE(review): convoluted but equivalent to selecting used_idx columns
        # whenever the subset differs (or embedding_type is 'piecewise').
        X_tr_used = X_tr[:, used_idx] if len(used_idx) != X_tr.shape[1] else (X_tr if embedding_type != 'piecewise' else X_tr[:, used_idx])
        X_va_used = X_va[:, used_idx] if embedding_type == 'piecewise' else X_va

        n_features = X_tr_used.shape[1]
        model = make_model(n_features, k, n_blocks, d_block, num_embeddings, arch_type).to(device)
        optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

        X_tr_t = torch.as_tensor(X_tr_used, device=device)
        y_tr_t = torch.as_tensor(y_tr, device=device)
        X_va_t = torch.as_tensor(X_va_used, device=device)
        y_va_t = torch.as_tensor(y_va, device=device)

        for _ in range(epochs):
            train_one_epoch(model, X_tr_t, y_tr_t, optimizer, batch_size, device)

        # Validation score for this fold.
        score = evaluate_sum_exp_rank(model, X_va_t, y_va_t, device, alpha)
        ap_scores.append(score)

    mean_score = float(np.mean(ap_scores))
    return {"loss": -mean_score, "status": STATUS_OK, "score": mean_score}
263
+
264
def train_final(X: np.ndarray,
                y: np.ndarray,
                best_params: Dict[str, Any],
                device: torch.device,
                final_epochs: int,
                batch_size: int,
                output_path: str,
                seed: int,
                alpha: float = 0.005) -> None:
    """Train the final TabM on the full data with the best hyperparameters and
    save a self-describing checkpoint (weights + args + feature indices)."""
    # Unpack the tuned hyperparameters.
    k = int(best_params.get('k', 32))
    n_blocks = int(best_params['n_blocks'])
    d_block = int(best_params['d_block'])
    lr = float(best_params['lr'])
    wd_choice = best_params['weight_decay_choice']
    weight_decay = 0.0 if wd_choice == 0 else float(best_params['weight_decay_val'])
    embedding_type = best_params['embedding_type']
    arch_type = best_params['arch_type']

    # 'piecewise' embeddings may drop zero-variance columns; used_idx records
    # the kept subset so evaluation can apply the same selection.
    num_embeddings, used_idx = build_num_embeddings(embedding_type, X)
    X_used = X[:, used_idx] if embedding_type == 'piecewise' else X
    n_features = X_used.shape[1]

    model = make_model(n_features, k, n_blocks, d_block, num_embeddings, arch_type).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    X_t = torch.as_tensor(X_used, device=device)
    y_t = torch.as_tensor(y, device=device)

    for _ in range(final_epochs):
        train_one_epoch(model, X_t, y_t, optimizer, batch_size, device)

    os.makedirs(os.path.dirname(output_path) or '.', exist_ok=True)
    # 'args' mirrors what tabm_eval.py needs to rebuild the architecture;
    # the remaining keys are informational metadata.
    torch.save({
        "model_state_dict": model.state_dict(),
        "args": argparse.Namespace(
            k=k,
            n_blocks=n_blocks,
            d_block=d_block,
            use_embeddings=True if embedding_type in ("linear", "periodic", "piecewise") else False,
            embedding_type=embedding_type,
            arch_type=arch_type,
        ),
        "best_params": deepcopy(best_params),
        "training_args": {
            "lr": lr,
            "weight_decay_choice": wd_choice,
            "weight_decay_val": weight_decay,
            "batch_size": batch_size,
            "final_epochs": final_epochs,
            "seed": seed,
            "alpha": alpha,
            "device": str(device),
        },
        "used_feature_idx": used_idx,
        "full_args": dict(
            best_params=deepcopy(best_params),
            final_epochs=final_epochs, batch_size=batch_size,
            seed=seed, alpha=alpha, device=str(device),
        ),
        "search_space": "hyperopt space v1",
    }, output_path)
    print(f"Final models saved into: {output_path}")
    # Write a human-readable .info.txt next to the checkpoint.
    _dump_model_info_sidecar(output_path)
327
+
328
def hyperopt_search(X: np.ndarray,
                    y: np.ndarray,
                    device: torch.device,
                    seed: int,
                    cv_folds: int,
                    epochs: int,
                    batch_size: int,
                    alpha: float,
                    tune_k: bool,
                    max_evals: int) -> tuple[dict, float]:
    """TPE search over TabM hyperparameters; returns (best_params, best_score).

    The returned dict has hyperopt's index-encoded choices decoded back to
    concrete values ('embedding_type'/'arch_type' strings, ints, floats).
    """
    space = {
        "n_blocks": scope.int(hp.quniform("n_blocks", 2, 5, 1)),
        "d_block": scope.int(hp.quniform("d_block", 64, 1024, 16)),
        "lr": hp.loguniform("lr", np.log(1e-4), np.log(5e-3)),
        # weight_decay_choice==0 disables weight decay entirely.
        "weight_decay_choice": hp.choice("weight_decay_choice", [0, 1]),
        "weight_decay_val": hp.loguniform("weight_decay_val", np.log(1e-4), np.log(1e-1)),
        "embedding_type": hp.choice("embedding_type", ["none", "linear", "periodic", "piecewise"]),
        "arch_type": hp.choice("arch_type", ["tabm", "tabm-mini"]),
    }
    if tune_k:
        space["k"] = scope.int(hp.quniform("k", 16, 32, 8))
    else:
        space["k"] = 32

    def obj_fn(hparams):
        return objective(hparams, X, y, device, seed, cv_folds, epochs, batch_size, alpha)

    trials = Trials()
    best = fmin(fn=obj_fn, space=space, algo=tpe.suggest, max_evals=max_evals, trials=trials)
    # Recover the best trial directly from trials (loss = negated score).
    best_trial = min(trials.trials, key=lambda t: t["result"]["loss"])
    best_ap = -best_trial["result"]["loss"]
    best_params = best_trial["misc"]["vals"].copy()

    # trials.misc.vals stores each value as a one-element list, and hp.choice
    # values as the *index* into the choice list — decode both here.
    emb_choices = ["none", "linear", "periodic", "piecewise"]
    best_params["embedding_type"] = emb_choices[int(best_params["embedding_type"][0])] if isinstance(best_params["embedding_type"], list) else best_params["embedding_type"]
    arch_choices = ["tabm", "tabm-mini"]
    best_params["arch_type"] = arch_choices[int(best_params["arch_type"][0])] if isinstance(best_params["arch_type"], list) else best_params["arch_type"]
    if isinstance(best_params.get("k", 32), list):
        best_params["k"] = int(best_params["k"][0])
    for k_ in ["n_blocks", "d_block", "weight_decay_choice"]:
        if isinstance(best_params[k_], list):
            best_params[k_] = int(best_params[k_][0])
    for k_ in ["lr", "weight_decay_val"]:
        if isinstance(best_params[k_], list):
            best_params[k_] = float(best_params[k_][0])

    return best_params, float(best_ap)
375
+
376
def run_one_pipeline(rep_idx: int,
                     X: np.ndarray,
                     y: np.ndarray,
                     device_str: str,
                     args_dict: dict,
                     out_dir: str,
                     base: str,
                     ext: str) -> str:
    """Execute one independent search-then-train repetition.

    Seeds the RNGs with a per-repetition offset, runs the hyperparameter
    search, trains the final model with the winning configuration, and
    returns the path of the saved checkpoint.
    """
    torch_device = torch.device(device_str)
    # Offset the base seed so parallel repetitions explore different trials.
    seed_for_rep = int(args_dict["seed"]) + 997 * int(rep_idx)
    set_seed(seed_for_rep)

    print(f"[rep {rep_idx}] 🔍 Starting hyperparameter search (max_evals={args_dict['max_evals']}) ...")
    winning_params, winning_score = hyperopt_search(
        X,
        y,
        torch_device,
        seed=seed_for_rep,
        cv_folds=args_dict["cv_folds"],
        epochs=args_dict["epochs"],
        batch_size=args_dict["batch_size"],
        alpha=args_dict["alpha"],
        tune_k=args_dict["tune_k"],
        max_evals=args_dict["max_evals"],
    )
    print(f"[rep {rep_idx}] 🎯 Best sum_exp_rank={winning_score:.6f}")
    print(f"[rep {rep_idx}] 🎯 Best parameters={winning_params}")

    checkpoint_path = os.path.join(out_dir, f"{base}_rep{rep_idx}{ext}")
    print(f"[rep {rep_idx}] 🏋️ Starting final training and saving to: {checkpoint_path}")
    train_final(
        X,
        y,
        winning_params,
        torch_device,
        final_epochs=args_dict["final_epochs"],
        batch_size=args_dict["batch_size"],
        output_path=checkpoint_path,
        seed=seed_for_rep,
        alpha=args_dict["alpha"],
    )
    return checkpoint_path
413
+
414
def _select_device(choice: str) -> torch.device:
    """Resolve a device option ('auto'/'cuda'/'cpu') to a torch.device.

    Prints diagnostics about the chosen device.

    Raises:
        RuntimeError: if 'cuda' is requested but no GPU is available.
    """
    if choice == "auto":
        if torch.cuda.is_available():
            device = torch.device('cuda:0')
            print(f"🚀 Detected GPU: {torch.cuda.get_device_name(0)}")
            print(f" GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
            print(f" CUDA Version: {torch.version.cuda}")
        else:
            device = torch.device('cpu')
            print("⚠️ No GPU detected, using CPU")
    elif choice == "cuda":
        if not torch.cuda.is_available():
            raise RuntimeError("CUDA specified but no GPU detected")
        device = torch.device('cuda:0')
        print(f"🚀 Forcing GPU usage: {torch.cuda.get_device_name(0)}")
    else:
        device = torch.device('cpu')
        print("🖥️ Using CPU")
    return device


def main():
    """CLI entry point.

    Parses arguments, loads the training data, and runs
    ``--nr_hyperopt_rep`` independent hyperparameter-search +
    final-training pipelines in parallel worker processes.
    """
    ap = argparse.ArgumentParser(description="TabM hyperparameter search (Hyperopt) with internal cross-validation, target=AUPRC; training set only, no external validation/test")
    ap.add_argument("--data_file", type=str, default="Neopep_ml_with_labels.txt", help="Training data TSV")
    ap.add_argument("--model_out", type=str, default="tabm_results/tabm_hyperopt_best.pth", help="Final model save path (or base name within directory)")
    ap.add_argument("--max_evals", type=int, default=30, help="Number of Hyperopt evaluations per parallel repetition")
    ap.add_argument("--cv_folds", type=int, default=5, help="Number of cross-validation folds")
    ap.add_argument("--epochs", type=int, default=40, help="Training epochs per fold")
    ap.add_argument("--final_epochs", type=int, default=120, help="Final model training epochs")
    ap.add_argument("--batch_size", type=int, default=256, help="Batch size")
    ap.add_argument("--seed", type=int, default=42, help="Random seed (each repetition will be offset when running in parallel)")
    ap.add_argument("--alpha", type=float, default=0.005, help="Alpha for sum_exp_rank")
    ap.add_argument("--tune_k", action="store_true", help="Whether to search for k together (default fixed at 32)")
    # BUG FIX: constrain the accepted values — previously any unknown string
    # (e.g. "gpu") silently fell through to CPU.
    ap.add_argument("--device", type=str, default="auto", choices=["auto", "cuda", "cpu"], help="Device selection: auto/cuda/cpu")
    ap.add_argument("--nr_hyperopt_rep", type=int, default=1, help="Parallel repetition count: each independent hyperparameter search + final training")
    args = ap.parse_args()

    if args.nr_hyperopt_rep < 1:
        ap.error("--nr_hyperopt_rep must be >= 1")

    set_seed(args.seed)

    # Device selection (validated by argparse choices above).
    device = _select_device(args.device)

    X, y = load_training_data(args.data_file)
    print(f"Training data: {X.shape}, Positive sample ratio: {np.mean(y):.5f}")

    out_dir = os.path.dirname(args.model_out) or '.'
    os.makedirs(out_dir, exist_ok=True)
    base = os.path.splitext(os.path.basename(args.model_out))[0]
    ext = os.path.splitext(args.model_out)[1] or '.pth'

    # Plain-dict snapshot of the arguments so it pickles cleanly into the
    # spawned worker processes.
    args_dict = {
        "seed": int(args.seed),
        "cv_folds": int(args.cv_folds),
        "epochs": int(args.epochs),
        "final_epochs": int(args.final_epochs),
        "batch_size": int(args.batch_size),
        "alpha": float(args.alpha),
        "tune_k": bool(args.tune_k),
        "max_evals": int(args.max_evals),
    }

    # 'spawn' avoids CUDA-context inheritance problems that fork() can cause.
    from multiprocessing import get_context
    ctx = get_context('spawn')
    repeats = int(args.nr_hyperopt_rep)
    print(f"🧵 Parallel repetitions: {repeats} (each independent hyperparameter search + final training)")

    with ctx.Pool(processes=repeats) as pool:
        paths = pool.starmap(
            run_one_pipeline,
            [(i, X, y, str(device), args_dict, out_dir, base, ext) for i in range(repeats)]
        )
    print("Saved model files:")
    for p in sorted(paths):
        print("-", p)
485
+
486
# Script entry point: delegate to main() when executed directly.
if __name__ == "__main__":
    main()