File size: 7,224 Bytes

813c6b1

import pandas as pd
import numpy as np
import optuna
from optuna.trial import TrialState
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
import os
from datasets import load_from_disk
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"

def save_and_plot_predictions(y_true_train, y_pred_train, y_true_val, y_pred_val, output_path):
    os.makedirs(output_path, exist_ok=True)

    # Save training predictions
    train_df = pd.DataFrame({'True Permeability': y_true_train, 'Predicted Permeability': y_pred_train})
    train_df.to_csv(os.path.join(output_path, 'train_predictions.csv'), index=False)

    # Save validation predictions
    val_df = pd.DataFrame({'True Permeability': y_true_val, 'Predicted Permeability': y_pred_val})
    val_df.to_csv(os.path.join(output_path, 'val_predictions.csv'), index=False)

    # Plot training predictions
    plot_correlation(
        y_true_train,
        y_pred_train,
        title="Training Set Correlation Plot",
        output_file=os.path.join(output_path, 'train_correlation.png'),
    )

    # Plot validation predictions
    plot_correlation(
        y_true_val,
        y_pred_val,
        title="Validation Set Correlation Plot",
        output_file=os.path.join(output_path, 'val_correlation.png'),
    )

def plot_correlation(y_true, y_pred, title, output_file):
    spearman_corr, _ = spearmanr(y_true, y_pred)

    # Scatter plot
    plt.figure(figsize=(10, 8))
    plt.scatter(y_true, y_pred, alpha=0.5, label='Data points', color='#BC80FF')
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='teal', linestyle='--', label='Ideal fit')

    # Add annotations
    plt.title(f"{title}\nSpearman Correlation: {spearman_corr:.3f}")
    plt.xlabel("True Permeability (logP)")
    plt.ylabel("Predicted Affinity (logP)")
    plt.legend()

    # Save and show the plot
    plt.tight_layout()
    plt.savefig(output_file)
    plt.show()

# Load dataset
dataset = load_from_disk(f'{base_path}/data/permeability')

# Extract sequences, labels, and embeddings
sequences = np.stack(dataset['sequence'])
labels = np.stack(dataset['labels'])  # Regression labels
embeddings = np.stack(dataset['embedding'])  # Pre-trained embeddings

# Function to compute Morgan fingerprints
def compute_morgan_fingerprints(smiles_list, radius=2, n_bits=2048):
    fps = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            fps.append(np.array(fp))
        else:
            # If the SMILES string is invalid, use a zero vector
            fps.append(np.zeros(n_bits))
            print(f"Invalid SMILES: {smiles}")
    return np.array(fps)

# Compute Morgan fingerprints for the sequences
#morgan_fingerprints = compute_morgan_fingerprints(sequences)

# Concatenate embeddings with Morgan fingerprints
#input_features = np.concatenate([embeddings, morgan_fingerprints], axis=1)
input_features = embeddings

# Initialize global variables
best_model_path = f"{base_path}/src/permeability"
os.makedirs(best_model_path, exist_ok=True)

def trial_info_callback(study, trial):
    if study.best_trial == trial:
        print(f"Trial {trial.number}:")
        print(f"  MSE: {trial.value}")

def objective(trial):
    # Define hyperparameters
    params = {
        'objective': 'reg:squarederror',
        'lambda': trial.suggest_float('lambda', 0.1, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 0.1, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 0.1),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'tree_method': 'hist',
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 10.0, log=True),
        'device': 'cuda:6',
    }
    num_boost_round = trial.suggest_int('num_boost_round', 10, 1000)

    # Train-validation split
    X_train, X_val, y_train, y_val = train_test_split(input_features, labels, test_size=0.2, random_state=42)

    # Convert data to DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)

    # Train XGBoost
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dvalid, "validation")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # Predict and evaluate
    preds_train = model.predict(dtrain)
    preds_val = model.predict(dvalid)
    
    mse = mean_squared_error(y_val, preds_val)
    
    # Calculate Spearman Rank Correlation for both train and validation
    spearman_train, _ = spearmanr(y_train, preds_train)
    spearman_val, _ = spearmanr(y_val, preds_val)
    print(f"Train Spearman: {spearman_train:.4f}, Val Spearman: {spearman_val:.4f}")

    # Save the best model
    if trial.study.user_attrs.get("best_mse", np.inf) > mse:
        trial.study.set_user_attr("best_mse", mse)
        trial.study.set_user_attr("best_spearman_train", spearman_train)
        trial.study.set_user_attr("best_spearman_val", spearman_val)
        trial.study.set_user_attr("best_trial", trial.number)
        model.save_model(os.path.join(best_model_path, "best_model.json"))
        save_and_plot_predictions(y_train, preds_train, y_val, preds_val, best_model_path)
        print(f"✓ NEW BEST! Trial {trial.number}: MSE={mse:.4f}, Train Spearman={spearman_train:.4f}, Val Spearman={spearman_val:.4f}")

    return mse

if __name__ == "__main__":
    study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=200, callbacks=[trial_info_callback])

    # Prepare summary text
    summary = []
    summary.append("\n" + "="*60)
    summary.append("OPTIMIZATION COMPLETE")
    summary.append("="*60)
    summary.append(f"Number of finished trials: {len(study.trials)}")
    summary.append(f"\nBest Trial: #{study.user_attrs.get('best_trial', 'N/A')}")
    summary.append(f"Best MSE: {study.best_trial.value:.4f}")
    summary.append(f"Best Training Spearman Correlation: {study.user_attrs.get('best_spearman_train', None):.4f}")
    summary.append(f"Best Validation Spearman Correlation: {study.user_attrs.get('best_spearman_val', None):.4f}")
    summary.append(f"\nBest hyperparameters:")
    for key, value in study.best_trial.params.items():
        summary.append(f"  {key}: {value}")
    summary.append("="*60)
    
    # Print to console
    for line in summary:
        print(line)
    
    # Save to file
    metrics_file = os.path.join(best_model_path, "optimization_metrics.txt")
    with open(metrics_file, 'w') as f:
        f.write('\n'.join(summary))
    print(f"\n✓ Metrics saved to: {metrics_file}")