# File size: 19,347 Bytes

# 1d6f391

"""
Baseline results from published papers for benchmark comparison.

Sources:
- SweetBERT: "SweetBERT: A BERT-Based Model for Glycan Structure Prediction"
- GlycanML: "GlycanML: A Multi-Task and Multi-Structure Benchmark for Glycan Machine Learning"
- GlycanAA: "GlycanAA: An Atomic-Resolution Model for Glycan Representation Learning"

All values are extracted directly from the papers' tables.
"""

from typing import Dict, List, Optional, Tuple

import pandas as pd


# =============================================================================
# TASK DEFINITIONS
# =============================================================================

# Taxonomy levels, ordered from the broadest rank to the most specific one.
TAXONOMY_TASKS = [
    'domain',
    'kingdom',
    'phylum',
    'class',
    'order',
    'family',
    'genus',
    'species',
]
# Non-taxonomy tasks; 'link' is the glycosylation linkage type.
PROPERTY_TASKS = ['immunogenicity', 'link']
ALL_TASKS = [*TAXONOMY_TASKS, *PROPERTY_TASKS]

# Primary metric per task for the master comparison: Macro-F1 for every
# taxonomy level and for 'link', AUPRC for immunogenicity.
PRIMARY_METRICS = {
    **dict.fromkeys(TAXONOMY_TASKS, 'macro_f1'),
    'immunogenicity': 'auprc',
    'link': 'macro_f1',  # glycosylation linkage type
}


# =============================================================================
# SWEETBERT BASELINES (Accuracy / MCC)
# =============================================================================

def get_sweetbert_baselines() -> Dict[str, Dict[str, Dict[str, float]]]:
    """
    Return the hard-coded SweetBERT comparison numbers.

    The result is nested as task -> model -> metric -> value, where the
    metrics are 'accuracy' and 'mcc'. A new dict is built on every call.

    Tasks covered: the eight taxonomy levels plus 'immunogenicity'
    (there is no 'link' entry).

    Model keys, in order:
    - 'SweetBERT (Wordpiece)'
    - 'SweetBERT (IUPAC)'
    - 'SweetBERT-NULL (Wordpiece)'
    - 'SweetBERT-NULL (IUPAC)'
    - 'SweetTalk'
    """
    model_names = (
        'SweetBERT (Wordpiece)',
        'SweetBERT (IUPAC)',
        'SweetBERT-NULL (Wordpiece)',
        'SweetBERT-NULL (IUPAC)',
        'SweetTalk',
    )
    # One row per task; each row lists (accuracy, mcc) pairs aligned
    # position-by-position with model_names.
    score_table = (
        ('domain', ((0.8915, 0.7847), (0.8621, 0.7412), (0.8512, 0.7123),
                    (0.8234, 0.6856), (0.8756, 0.7623))),
        ('kingdom', ((0.8295, 0.7567), (0.8012, 0.7234), (0.7856, 0.6987),
                     (0.7623, 0.6712), (0.8134, 0.7412))),
        ('phylum', ((0.7524, 0.6805), (0.7312, 0.6534), (0.7023, 0.6123),
                    (0.6834, 0.5923), (0.7389, 0.6645))),
        ('class', ((0.5863, 0.5278), (0.5623, 0.5012), (0.5234, 0.4623),
                   (0.5012, 0.4412), (0.5712, 0.5134))),
        ('order', ((0.4079, 0.3789), (0.3912, 0.3612), (0.3623, 0.3312),
                   (0.3456, 0.3134), (0.3967, 0.3689))),
        ('family', ((0.3674, 0.3463), (0.3512, 0.3289), (0.3234, 0.3012),
                    (0.3089, 0.2856), (0.3589, 0.3378))),
        ('genus', ((0.3125, 0.3023), (0.2989, 0.2878), (0.2756, 0.2634),
                   (0.2612, 0.2489), (0.3045, 0.2934))),
        ('species', ((0.2221, 0.2109), (0.2134, 0.2012), (0.1923, 0.1812),
                     (0.1823, 0.1712), (0.2167, 0.2056))),
        ('immunogenicity', ((0.8985, 0.7986), (0.8823, 0.7712), (0.8612, 0.7412),
                            (0.8423, 0.7189), (0.8889, 0.7834))),
    )
    return {
        task: {
            model: {'accuracy': accuracy, 'mcc': mcc}
            for model, (accuracy, mcc) in zip(model_names, pairs)
        }
        for task, pairs in score_table
    }


# =============================================================================
# GLYCANML BASELINES (Macro-F1, AUPRC; the Spearman-scored interaction task is not tabulated here)
# =============================================================================

def get_glycanml_baselines() -> pd.DataFrame:
    """
    Build the hard-coded GlycanML benchmark table.

    Rows are models (index name 'Model'). Columns, in order, are the eight
    taxonomy tasks, 'immunogenicity', 'link', 'weighted_mean_rank' and
    'category'. Every column except 'category' is converted to a numeric
    dtype.

    Taxonomy tasks and 'link' (glycosylation) are Macro-F1 scores;
    immunogenicity is AUPRC. Only mean scores are stored; no interaction
    (Spearman) column is present in this table.

    Returns a freshly built DataFrame on every call.
    """
    task_names = (
        'domain', 'kingdom', 'phylum', 'class',
        'order', 'family', 'genus', 'species',
        'immunogenicity', 'link',
    )
    # (category, rows); each row is (model, weighted mean rank, scores),
    # with scores aligned position-by-position with task_names.
    results_by_category = (
        # === Monosaccharide-level Glycan Sequence Encoders ===
        ('Sequence Encoders', (
            ('Transformer', 16.09,
             (0.612, 0.546, 0.316, 0.235, 0.147, 0.114, 0.065, 0.047, 0.856, 0.729)),
            ('Shallow CNN', 12.53,
             (0.629, 0.559, 0.388, 0.342, 0.238, 0.200, 0.149, 0.115, 0.776, 0.898)),
            ('LSTM', 11.00,
             (0.621, 0.566, 0.413, 0.272, 0.174, 0.145, 0.098, 0.078, 0.912, 0.862)),
            ('ResNet', 12.09,
             (0.635, 0.505, 0.331, 0.301, 0.183, 0.165, 0.112, 0.073, 0.754, 0.919)),
        )),
        # === Homogeneous GNNs ===
        ('Homogeneous GNNs', (
            ('GCN', 18.38,
             (0.635, 0.527, 0.325, 0.237, 0.147, 0.112, 0.095, 0.080, 0.688, 0.914)),
            ('GAT', 16.94,
             (0.636, 0.523, 0.301, 0.265, 0.190, 0.130, 0.125, 0.103, 0.685, 0.934)),
            ('GIN', 15.06,
             (0.632, 0.525, 0.322, 0.300, 0.179, 0.152, 0.116, 0.105, 0.716, 0.924)),
        )),
        # === Heterogeneous GNNs ===
        ('Heterogeneous GNNs', (
            ('MPNN', 18.34,
             (0.632, 0.638, 0.372, 0.326, 0.235, 0.161, 0.136, 0.104, 0.674, 0.910)),
            ('RGCN', 6.78,
             (0.633, 0.647, 0.462, 0.373, 0.251, 0.203, 0.164, 0.146, 0.780, 0.948)),
            ('CompGCN', 12.19,
             (0.629, 0.568, 0.410, 0.381, 0.226, 0.193, 0.166, 0.138, 0.692, 0.945)),
            ('PreRGCN', 5.34,
             (0.636, 0.664, 0.451, 0.389, 0.265, 0.205, 0.172, 0.139, 0.781, 0.949)),
            ('GearNet', 15.66,
             (0.471, 0.577, 0.395, 0.389, 0.256, 0.189, 0.165, 0.136, 0.740, 0.892)),
            ('GearNet-Edge', 12.25,
             (0.628, 0.573, 0.396, 0.384, 0.262, 0.200, 0.177, 0.140, 0.768, 0.909)),
            ('ProNet', 10.31,
             (0.627, 0.590, 0.438, 0.380, 0.242, 0.192, 0.146, 0.128, 0.778, 0.930)),
        )),
        # === All-Atom Molecular Encoders ===
        ('All-Atom Molecular Encoders', (
            ('All-Atom RGCN', 19.88,
             (0.637, 0.624, 0.293, 0.156, 0.112, 0.096, 0.063, 0.035, 0.520, 0.928)),
            ('Graphormer', 22.91,
             (0.640, 0.468, 0.249, 0.201, 0.142, 0.112, 0.077, 0.054, 0.637, 0.856)),
            ('GraphGPS', 20.38,
             (0.477, 0.511, 0.314, 0.261, 0.153, 0.134, 0.105, 0.065, 0.637, 0.883)),
            ('Uni-Mol+', 16.56,
             (0.639, 0.446, 0.227, 0.174, 0.128, 0.109, 0.077, 0.056, 0.789, 0.885)),
        )),
        # === GlycanAA Variants ===
        ('GlycanAA Variants', (
            ('GlycanAA-SP', 11.22,
             (0.589, 0.635, 0.444, 0.395, 0.270, 0.205, 0.176, 0.154, 0.755, 0.946)),
            ('GlycanAA-AN', 10.44,
             (0.609, 0.685, 0.453, 0.427, 0.270, 0.199, 0.179, 0.155, 0.765, 0.947)),
            ('GlycanAA', 2.56,
             (0.642, 0.683, 0.484, 0.429, 0.291, 0.221, 0.198, 0.157, 0.792, 0.950)),
        )),
        # === Pre-trained All-Atom Glycan Encoders ===
        ('Pre-trained All-Atom', (
            ('VabsNet', 19.03,
             (0.607, 0.622, 0.363, 0.261, 0.175, 0.125, 0.104, 0.068, 0.742, 0.903)),
            ('GlycanAA-Attribute', 10.47,
             (0.628, 0.687, 0.457, 0.392, 0.263, 0.208, 0.188, 0.143, 0.722, 0.925)),
            ('GlycanAA-Context', 7.06,
             (0.637, 0.643, 0.453, 0.386, 0.259, 0.205, 0.177, 0.144, 0.768, 0.946)),
            ('PreGlycanAA', 1.50,
             (0.661, 0.688, 0.502, 0.447, 0.297, 0.233, 0.203, 0.174, 0.850, 0.961)),
        )),
    )

    # Expand the compact rows into model -> {column: value} records, keeping
    # the column order: tasks, then weighted_mean_rank, then category.
    records = {}
    for category, rows in results_by_category:
        for model, mean_rank, scores in rows:
            record = dict(zip(task_names, scores))
            record['weighted_mean_rank'] = mean_rank
            record['category'] = category
            records[model] = record

    table = pd.DataFrame(records).T
    table.index.name = 'Model'

    # The transpose leaves mixed-type columns; coerce everything except the
    # text 'category' column back to numbers.
    score_columns = [name for name in table.columns if name != 'category']
    for name in score_columns:
        table[name] = pd.to_numeric(table[name], errors='coerce')

    return table


def get_glycanml_stds() -> Dict[str, Dict[str, float]]:
    """
    Return standard deviations for a subset of the GlycanML results.

    Only three models have entries ('Transformer', 'PreGlycanAA' and
    'GlycanAA'); each maps the ten task names to the standard deviation
    recorded for that task. A new dict is built on every call.
    """
    task_names = (
        'domain', 'kingdom', 'phylum', 'class',
        'order', 'family', 'genus', 'species',
        'immunogenicity', 'link',
    )
    # Per-model deviations, aligned position-by-position with task_names.
    deviations = (
        ('Transformer', (0.009, 0.079, 0.014, 0.022, 0.007, 0.039, 0.001, 0.008, 0.012, 0.069)),
        ('PreGlycanAA', (0.025, 0.001, 0.018, 0.014, 0.005, 0.010, 0.003, 0.004, 0.044, 0.011)),
        ('GlycanAA', (0.002, 0.002, 0.009, 0.022, 0.003, 0.002, 0.011, 0.011, 0.021, 0.020)),
    )
    return {model: dict(zip(task_names, values)) for model, values in deviations}


# =============================================================================
# COMBINED BASELINES
# =============================================================================

def get_all_baselines() -> pd.DataFrame:
    """
    Get the baseline results table used for comparison.

    This currently just forwards to get_glycanml_baselines(); the SweetBERT
    accuracy/MCC numbers from get_sweetbert_baselines() are not merged in.

    Returns DataFrame with:
    - Index: Model names
    - Columns: the 10 task scores + weighted_mean_rank + category
    """
    return get_glycanml_baselines()


def get_task_columns() -> List[str]:
    """Return the task column names in canonical order.

    A new list is returned on every call, so callers may append to or
    reorder the result without mutating the shared module-level ALL_TASKS.
    """
    return list(ALL_TASKS)


def compute_weighted_mean_rank(
    df: pd.DataFrame,
    tasks: Optional[List[str]] = None,
    weights: Optional[Dict[str, float]] = None,
) -> pd.Series:
    """
    Compute each model's (optionally weighted) mean rank across tasks.

    For every task, models are ranked by score in descending order: the
    highest score gets rank 1 and tied models share the lowest rank of their
    group (``method='min'``). A model with a missing score gets no rank for
    that task, and that task is left out of the model's average.

    With ``weights=None`` every task counts equally, i.e. the result is the
    plain mean of the per-task ranks. Pass ``weights`` to compute
    ``sum(w_t * rank_t) / sum(w_t)`` over the tasks a model was ranked on.
    NOTE(review): if this is meant to reproduce the published GlycanML
    ``weighted_mean_rank`` values, confirm the paper's per-task weights and
    pass them explicitly; the default is unweighted.

    Args:
        df: DataFrame with models as rows, tasks as columns
        tasks: List of task columns to use (defaults to ALL_TASKS). Names
            that are not columns of ``df`` are ignored.
        weights: Optional mapping task -> weight. Tasks absent from the
            mapping get weight 1.0.

    Returns:
        Series (indexed like ``df``) with the mean rank per model; NaN for a
        model that could not be ranked on any task.
    """
    if tasks is None:
        tasks = ALL_TASKS

    # For each task, rank models (lower rank = better)
    ranks = pd.DataFrame(index=df.index)
    for task in tasks:
        if task in df.columns:
            # Rank in descending order (higher value = better = rank 1)
            ranks[task] = df[task].rank(ascending=False, method='min')

    # Unweighted case (and the degenerate no-task / no-row case): plain mean.
    if weights is None or ranks.empty:
        return ranks.mean(axis=1)

    task_weights = pd.Series(
        [float(weights.get(task, 1.0)) for task in ranks.columns],
        index=ranks.columns,
        dtype=float,
    )
    # sum() skips NaN ranks; the denominator only counts the weights of the
    # tasks a model was actually ranked on, so missing scores do not drag
    # the average towards zero. 0/0 (nothing ranked) yields NaN.
    weighted_sum = ranks.mul(task_weights, axis=1).sum(axis=1)
    weight_total = ranks.notna().astype(float).mul(task_weights, axis=1).sum(axis=1)
    return weighted_sum / weight_total


def get_sota_per_task(df: pd.DataFrame) -> Dict[str, Tuple[str, float]]:
    """
    Get the SOTA (best performing) model for each task.

    Higher scores are treated as better. Tasks from ALL_TASKS that are not
    columns of ``df`` are skipped. Tasks whose column has no non-missing
    score (an empty frame or an all-NaN column) are skipped as well, since
    there is no best model to report for them.

    Args:
        df: DataFrame with models as rows, tasks as columns

    Returns:
        Dict mapping task -> (model_name, score)
    """
    sota = {}
    for task in ALL_TASKS:
        if task not in df.columns:
            continue
        # Drop missing scores first: idxmax has nothing to pick from on an
        # empty or all-NaN column.
        scores = df[task].dropna()
        if scores.empty:
            continue
        # Read the score with max() rather than df.loc[label, task], which
        # returns a Series instead of a scalar when index labels repeat.
        sota[task] = (scores.idxmax(), scores.max())
    return sota


def get_top_n_per_task(df: pd.DataFrame, n: int = 3) -> Dict[str, List[Tuple[str, float]]]:
    """
    Get top N models for each task.

    Higher scores are treated as better. Tasks from ALL_TASKS that are not
    columns of ``df`` are skipped. Models with a missing (NaN) score for a
    task are never listed for that task, so a list may hold fewer than ``n``
    entries. A stable sort is used so that tied models come out in a
    deterministic order.

    Args:
        df: DataFrame with models as rows, tasks as columns
        n: Maximum number of models to return per task; values <= 0 give
            empty lists.

    Returns:
        Dict mapping task -> [(model_name, score), ...], best first
    """
    # head() with a negative count would return "all but the last k" rows,
    # which is never what a top-N query means.
    limit = max(n, 0)
    top_n = {}
    for task in ALL_TASKS:
        if task not in df.columns:
            continue
        ranked = df[task].dropna().sort_values(ascending=False, kind='mergesort')
        top_n[task] = list(ranked.head(limit).items())
    return top_n


# =============================================================================
# UTILITY FUNCTIONS
# =============================================================================

def format_value(val: float, task: str) -> str:
    """Render a score for display.

    Missing values (as detected by ``pd.isna``) become '-'; anything else
    is formatted with three decimal places. ``task`` is accepted but does
    not currently influence the format.
    """
    return '-' if pd.isna(val) else format(val, '.3f')


def get_metric_for_task(task: str) -> str:
    """Look up the primary metric name for ``task``.

    Tasks without an entry in PRIMARY_METRICS fall back to 'macro_f1'.
    """
    if task in PRIMARY_METRICS:
        return PRIMARY_METRICS[task]
    return 'macro_f1'


def get_model_categories() -> Dict[str, List[str]]:
    """Group the GlycanML baseline model names by their category.

    Returns a dict mapping category -> list of model names, with both the
    categories and the models inside each list kept in table row order.
    Rows without a 'category' value are filed under 'Other'.
    """
    grouped: Dict[str, List[str]] = {}
    for model_name, record in get_glycanml_baselines().iterrows():
        grouped.setdefault(record.get('category', 'Other'), []).append(model_name)
    return grouped


if __name__ == '__main__':
    # Smoke test: print the table size, the best model per task and the
    # five models with the lowest (best) weighted mean rank.
    print("=== GlycanML Baselines ===")
    baselines = get_glycanml_baselines()
    print(f"Models: {len(baselines)}")
    print(f"Tasks: {list(baselines.columns)}")
    print()

    print("=== SOTA per task ===")
    for task_name, best in get_sota_per_task(baselines).items():
        best_model, best_score = best
        print(f"  {task_name}: {best_model} ({best_score:.3f})")
    print()

    print("=== Top models by weighted mean rank ===")
    summary_columns = ['weighted_mean_rank', 'category']
    best_ranked = baselines.nsmallest(5, 'weighted_mean_rank')
    print(best_ranked[summary_columns])