| | import os
|
| | import pandas as pd
|
| | import numpy as np
|
| | from sklearn.model_selection import KFold
|
| | from sklearn.metrics import mean_squared_error, r2_score
|
| | from scipy.stats import pearsonr, ttest_ind
|
| | from catboost import CatBoostRegressor
|
| |
|
| |
|
# Load the precomputed ESM2 embeddings plus physical descriptors for each
# ligand-receptor pair.
data = pd.read_csv("embeddings/ESM2_interaction.csv")

# Each feature column holds a comma-separated string of floats; missing cells
# become empty feature lists. Parse every column in a single pass.
feature_columns = ["Ligand Features", "Receptor Features", "Physical Features"]
for column in feature_columns:
    data[column] = (
        data[column]
        .fillna("")
        .apply(lambda cell: [float(tok) for tok in str(cell).split(",") if tok.strip()])
    )

# Stack the per-row float lists into 2-D design matrices (one row per pair).
X_ligand = np.vstack(data["Ligand Features"].values)
X_receptor = np.vstack(data["Receptor Features"].values)
X_physical = np.vstack(data["Physical Features"].values)

# Regression target: log10 of the dissociation constant KD (molar).
raw_y = data["KD(M)"].values
y = np.log10(raw_y)
|
| |
|
# One metrics record per (repeat, feature-set, fold) combination.
records = []

# The concatenated feature matrices do not depend on the repeat or fold, so
# build them once instead of re-running np.hstack inside every repeat
# iteration (the original rebuilt both large matrices 5 times each).
X_base = np.hstack([X_ligand, X_receptor])   # sequence embeddings only
X_full = np.hstack([X_base, X_physical])     # embeddings + physical descriptors

# 5 repeats of 5-fold CV; each repeat reshuffles with its own seed so fold
# assignments differ across repeats but remain reproducible. KFold.split only
# depends on the number of rows, so both feature sets see identical folds.
for repeat in range(1, 6):
    kf = KFold(n_splits=5, shuffle=True, random_state=repeat)

    # Ablation: train once without and once with the physical features.
    for include_phys in (False, True):
        X_data = X_full if include_phys else X_base

        for fold_idx, (train_idx, test_idx) in enumerate(kf.split(X_data), start=1):
            X_train, X_test = X_data[train_idx], X_data[test_idx]
            y_train, y_test = y[train_idx], y[test_idx]

            model = CatBoostRegressor(
                iterations=2000,
                learning_rate=0.08,
                depth=4,
                verbose=500,
                task_type="GPU",
                devices="0",
            )
            model.fit(X_train, y_train)

            # Held-out metrics on the log10(KD) scale.
            preds = model.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, preds))
            r2 = r2_score(y_test, preds)
            pcc = pearsonr(y_test, preds)[0]

            records.append({
                "repeat": repeat,
                "fold": fold_idx,
                "with_physical": include_phys,
                "pearson_r": pcc,
                "r2": r2,
                "rmse": rmse,
            })
|
| |
|
| |
|
# Collect every fold-level record into one table so downstream analysis can
# re-aggregate however it likes.
metrics_df = pd.DataFrame(records)

# Persist the full metrics table under metrics/.
output_dir = "metrics"
os.makedirs(output_dir, exist_ok=True)
output_file = os.path.join(output_dir, "InteractionMetrics.csv")
metrics_df.to_csv(output_file, index=False)
print(f"All metrics saved to {output_file}")
|
| |
|
| |
|
# Welch's t-test per metric: does adding physical features shift the score
# distribution across the folds? (equal_var=False -> unequal variances.)
results = {}
phys_mask = metrics_df["with_physical"]
for metric in ("pearson_r", "r2", "rmse"):
    with_scores = metrics_df.loc[phys_mask, metric]
    without_scores = metrics_df.loc[~phys_mask, metric]
    # Store plain (t, p) tuples, matching the original container contents.
    results[metric] = tuple(ttest_ind(with_scores, without_scores, equal_var=False))

print("\nT test results comparing with vs without physical features:")
for m, (t_stat, p_val) in results.items():
    print(f"{m} → t = {t_stat:.3f}, p = {p_val:.3f}")
|
| |
|