# File size: 20,370 Bytes

# 714cf46

#!/usr/bin/env python
"""
Utility script to list Protify supported models and datasets.
"""
import argparse
import sys


# Strings shared by several entries are defined once so they cannot drift apart.
_PLM = 'Protein language model'
_ESM2_CITATION = 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
_PROTTRANS_CITATION = "Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life's Code Through Self-Supervised Learning."
# The Ankh paper was published by Elnaggar et al. in 2023 (it was previously
# misattributed here to a different first author and year).
_ANKH_CITATION = 'Elnaggar et al. (2023). Ankh: Optimized Protein Language Model Unlocks General-Purpose Modelling.'

# Human-readable metadata for each model name. Every entry carries the same
# four keys: 'description', 'size' (parameter count), 'type' and 'citation'.
model_descriptions = {
    'ESM2-8': {
        'description': 'Small protein language model (8M parameters) from Meta AI that learns evolutionary information from millions of protein sequences.',
        'size': '8M parameters',
        'type': _PLM,
        'citation': _ESM2_CITATION,
    },
    'ESM2-35': {
        'description': 'Medium-sized protein language model (35M parameters) trained on evolutionary data.',
        'size': '35M parameters',
        'type': _PLM,
        'citation': _ESM2_CITATION,
    },
    'ESM2-150': {
        'description': 'Large protein language model (150M parameters) with improved protein structure prediction capabilities.',
        'size': '150M parameters',
        'type': _PLM,
        'citation': _ESM2_CITATION,
    },
    'ESM2-650': {
        'description': 'Very large protein language model (650M parameters) offering state-of-the-art performance on many protein prediction tasks.',
        'size': '650M parameters',
        'type': _PLM,
        'citation': _ESM2_CITATION,
    },
    'ESM2-3B': {
        'description': 'Largest ESM2 protein language model (3B parameters) with exceptional capability for protein structure and function prediction.',
        'size': '3B parameters',
        'type': _PLM,
        'citation': _ESM2_CITATION,
    },
    'Random': {
        'description': 'Baseline model with randomly initialized weights, serving as a negative control.',
        'size': 'Varies',
        'type': 'Baseline control',
        'citation': 'N/A',
    },
    'Random-Transformer': {
        'description': 'Randomly initialized transformer model serving as a homology-based control.',
        'size': 'Varies',
        'type': 'Baseline control',
        'citation': 'N/A',
    },
    # The "C" in ESMC stands for Cambrian (ESM Cambrian), not "classification".
    'ESMC-300': {
        'description': 'ESM Cambrian (ESM C) protein representation model with 300M parameters.',
        'size': '300M parameters',
        'type': _PLM,
        'citation': 'N/A',
    },
    'ESMC-600': {
        'description': 'Larger ESM Cambrian (ESM C) protein representation model with 600M parameters.',
        'size': '600M parameters',
        'type': _PLM,
        'citation': 'N/A',
    },
    'ProtBert': {
        'description': 'BERT-based protein language model trained on protein sequences from UniRef.',
        'size': '420M parameters',
        'type': _PLM,
        'citation': _PROTTRANS_CITATION,
    },
    'ProtBert-BFD': {
        'description': 'BERT-based protein language model trained on BFD database with improved performance.',
        'size': '420M parameters',
        'type': _PLM,
        'citation': _PROTTRANS_CITATION,
    },
    'ProtT5': {
        'description': 'T5-based protein language model capable of both encoding and generation tasks.',
        'size': '3B parameters',
        'type': _PLM,
        'citation': _PROTTRANS_CITATION,
    },
    # ProtT5-XL is the 3B-parameter ProtTrans T5 model; 11B is ProtT5-XXL.
    'ProtT5-XL-UniRef50-full-prec': {
        'description': 'Extra large T5-based protein language model trained on UniRef50 with full precision.',
        'size': '3B parameters',
        'type': _PLM,
        'citation': _PROTTRANS_CITATION,
    },
    'ANKH-Base': {
        'description': 'Base version of the ANKH protein language model focused on protein structure understanding.',
        'size': '400M parameters',
        'type': _PLM,
        'citation': _ANKH_CITATION,
    },
    'ANKH-Large': {
        'description': 'Large version of the ANKH protein language model with improved structural predictions.',
        'size': '1.2B parameters',
        'type': _PLM,
        'citation': _ANKH_CITATION,
    },
    'ANKH2-Large': {
        'description': 'Improved second generation ANKH protein language model.',
        'size': '1.2B parameters',
        'type': _PLM,
        'citation': _ANKH_CITATION,
    },
    # NOTE(review): the GLM2 / DPLM / DSM descriptions below expand the
    # acronyms generically ("general", "deep"); confirm against the upstream
    # model cards before relying on them.
    'GLM2-150': {
        'description': 'Medium-sized general language model adapted for protein sequences.',
        'size': '150M parameters',
        'type': _PLM,
        'citation': 'N/A',
    },
    'GLM2-650': {
        'description': 'Large general language model adapted for protein sequences.',
        'size': '650M parameters',
        'type': _PLM,
        'citation': 'N/A',
    },
    'GLM2-GAIA': {
        'description': 'Specialized GLM protein language model with GAIA architecture improvements.',
        'size': '1B+ parameters',
        'type': _PLM,
        'citation': 'N/A',
    },
    'DPLM-150': {
        'description': 'Deep protein language model with 150M parameters focused on protein structure.',
        'size': '150M parameters',
        'type': _PLM,
        'citation': 'N/A',
    },
    'DPLM-650': {
        'description': 'Larger deep protein language model with 650M parameters.',
        'size': '650M parameters',
        'type': _PLM,
        'citation': 'N/A',
    },
    'DPLM-3B': {
        'description': 'Largest deep protein language model in the DPLM family with 3B parameters.',
        'size': '3B parameters',
        'type': _PLM,
        'citation': 'N/A',
    },
    'DSM-150': {
        'description': 'Deep language model for proteins with 150M parameters.',
        'size': '150M parameters',
        'type': _PLM,
        'citation': 'N/A',
    },
    'DSM-650': {
        'description': 'Deep language model for proteins with 650M parameters.',
        'size': '650M parameters',
        'type': _PLM,
        'citation': 'N/A',
    },
}


# Keys stored for every dataset, in the order the values appear in each row
# of _DATASET_ROWS (after the leading dataset name).
_DATASET_FIELDS = ('description', 'type', 'task', 'citation')

# One row per dataset: (name, description, type, task, citation).
_DATASET_ROWS = (
    ('EC',
     'Enzyme Commission numbers dataset for predicting enzyme function classification.',
     'Multi-label classification', 'Protein function prediction', 'Gleghorn Lab'),
    ('GO-CC',
     'Gene Ontology Cellular Component dataset for predicting protein localization in cells.',
     'Multi-label classification', 'Protein localization prediction', 'Gleghorn Lab'),
    ('GO-BP',
     'Gene Ontology Biological Process dataset for predicting protein involvement in biological processes.',
     'Multi-label classification', 'Protein function prediction', 'Gleghorn Lab'),
    ('GO-MF',
     'Gene Ontology Molecular Function dataset for predicting protein molecular functions.',
     'Multi-label classification', 'Protein function prediction', 'Gleghorn Lab'),
    ('MB',
     'Metal ion binding dataset for predicting protein-metal interactions.',
     'Classification', 'Protein-metal binding prediction', 'Gleghorn Lab'),
    ('DeepLoc-2',
     'Binary classification dataset for predicting protein localization in 2 categories.',
     'Binary classification', 'Protein localization prediction', 'Gleghorn Lab'),
    ('DeepLoc-10',
     'Multi-class classification dataset for predicting protein localization in 10 categories.',
     'Multi-class classification', 'Protein localization prediction', 'Gleghorn Lab'),
    ('enzyme-kcat',
     'Dataset for predicting enzyme catalytic rate constants (kcat).',
     'Regression', 'Enzyme kinetics prediction', 'Gleghorn Lab'),
    ('solubility',
     'Dataset for predicting protein solubility properties.',
     'Binary classification', 'Protein solubility prediction', 'Gleghorn Lab'),
    ('localization',
     'Dataset for predicting subcellular localization of proteins.',
     'Multi-class classification', 'Protein localization prediction', 'Gleghorn Lab'),
    ('temperature-stability',
     'Dataset for predicting protein stability at different temperatures.',
     'Binary classification', 'Protein stability prediction', 'Gleghorn Lab'),
    ('peptide-HLA-MHC-affinity',
     'Dataset for predicting peptide binding affinity to HLA/MHC complexes.',
     'Protein-protein interaction', 'Binding affinity prediction', 'Gleghorn Lab'),
    ('optimal-temperature',
     'Dataset for predicting the optimal temperature for protein function.',
     'Regression', 'Protein property prediction', 'Gleghorn Lab'),
    ('optimal-ph',
     'Dataset for predicting the optimal pH for protein function.',
     'Regression', 'Protein property prediction', 'Gleghorn Lab'),
    ('material-production',
     'Dataset for predicting protein suitability for material production.',
     'Classification', 'Protein application prediction', 'Gleghorn Lab'),
    ('fitness-prediction',
     'Dataset for predicting protein fitness in various environments.',
     'Classification', 'Protein fitness prediction', 'Gleghorn Lab'),
    ('number-of-folds',
     'Dataset for predicting the number of structural folds in proteins.',
     'Classification', 'Protein structure prediction', 'Gleghorn Lab'),
    ('cloning-clf',
     'Dataset for predicting protein suitability for cloning operations.',
     'Classification', 'Protein engineering prediction', 'Gleghorn Lab'),
    ('stability-prediction',
     'Dataset for predicting overall protein stability.',
     'Classification', 'Protein stability prediction', 'Gleghorn Lab'),
    ('human-ppi',
     'Dataset for predicting human protein-protein interactions.',
     'Protein-protein interaction', 'PPI prediction', 'Gleghorn Lab'),
    ('SecondaryStructure-3',
     'Dataset for predicting protein secondary structure in 3 classes.',
     'Token-wise classification', 'Protein structure prediction', 'Gleghorn Lab'),
    ('SecondaryStructure-8',
     'Dataset for predicting protein secondary structure in 8 classes.',
     'Token-wise classification', 'Protein structure prediction', 'Gleghorn Lab'),
    ('fluorescence-prediction',
     'Dataset for predicting protein fluorescence properties.',
     'Token-wise regression', 'Protein property prediction', 'Gleghorn Lab'),
    ('plastic',
     'Dataset for predicting protein capability for plastic degradation.',
     'Classification', 'Enzyme function prediction', 'Gleghorn Lab'),
    ('gold-ppi',
     'Gold standard dataset for protein-protein interaction prediction.',
     'Protein-protein interaction', 'PPI prediction', 'Synthyra/bernett_gold_ppi'),
    ('human-ppi-pinui',
     'Human protein-protein interaction dataset from PiNUI.',
     'Protein-protein interaction', 'PPI prediction', 'Gleghorn Lab'),
    ('yeast-ppi-pinui',
     'Yeast protein-protein interaction dataset from PiNUI.',
     'Protein-protein interaction', 'PPI prediction', 'Gleghorn Lab'),
    ('shs27-ppi',
     'SHS27k dataset containing 27,000 protein-protein interactions.',
     'Protein-protein interaction', 'PPI prediction', 'Synthyra/SHS27k'),
    ('shs148-ppi',
     'SHS148k dataset containing 148,000 protein-protein interactions.',
     'Protein-protein interaction', 'PPI prediction', 'Synthyra/SHS148k'),
    ('PPA-ppi',
     'Protein-Protein Affinity dataset for quantitative binding predictions.',
     'Protein-protein interaction', 'PPI affinity prediction', 'Synthyra/ProteinProteinAffinity'),
)

# Mapping of dataset name -> metadata dict, assembled from the rows above in
# declaration order.
dataset_descriptions = {
    row_name: dict(zip(_DATASET_FIELDS, row_values))
    for row_name, *row_values in _DATASET_ROWS
}


def list_models(show_standard_only=False):
    """Print a table of supported models with their type, size and description.

    Model names are imported from ``.base_models.get_base_models``.
    Descriptions come from ``.base_models.model_descriptions`` when that
    module can be imported, and from this module's own ``model_descriptions``
    table otherwise. Models without an entry are shown with ``Unknown``
    placeholders.

    Args:
        show_standard_only: If true, list only ``standard_models``; otherwise
            list every name in ``currently_supported_models``.

    Returns:
        None. Everything is written to stdout; a failed import of the model
        lists is reported there instead of being raised.
    """
    try:
        from .base_models.get_base_models import currently_supported_models, standard_models
    except ImportError as e:
        # Without the name lists there is nothing to tabulate.
        print(f"Error loading model information: {e}")
        print("\n=== Models ===\n")
        print("Could not load model lists. Please check your installation.")
        return

    try:
        # Imported under an alias so the module-level table stays reachable.
        from .base_models.model_descriptions import model_descriptions as descriptions
    except ImportError:
        descriptions = model_descriptions

    if show_standard_only:
        names = list(standard_models)
        print("\n=== Standard Models ===\n")
    else:
        names = list(currently_supported_models)
        print("\n=== All Supported Models ===\n")

    # Resolve every cell first so widths are computed from exactly what is
    # printed, including the 'Unknown' placeholders.
    rows = []
    for model_name in names:
        info = descriptions.get(model_name, {})
        rows.append((
            model_name,
            info.get('type', 'Unknown'),
            info.get('size', 'Unknown'),
            info.get('description', 'No description available'),
        ))

    # Including the header labels keeps max() from ever seeing an empty
    # sequence and keeps the header aligned when all values are short.
    header = ('Model', 'Type', 'Size')
    name_w, type_w, size_w = (
        max(len(cells[col]) for cells in [header, *rows]) for col in range(3)
    )

    print(f"{header[0]:<{name_w + 2}}{header[1]:<{type_w + 2}}{header[2]:<{size_w + 2}}Description")
    print("-" * (name_w + type_w + size_w + 50))

    for model_name, model_type, model_size, description in rows:
        print(f"{model_name:<{name_w + 2}}{model_type:<{type_w + 2}}{model_size:<{size_w + 2}}{description}")


def list_datasets(show_standard_only=False):
    """Print a table of supported datasets with their type, task and description.

    Dataset names are imported from ``.data.supported_datasets``.
    Descriptions come from ``.data.dataset_descriptions`` when that module
    can be imported, and from this module's own ``dataset_descriptions``
    table otherwise. Datasets without an entry are shown with ``Unknown``
    placeholders.

    Args:
        show_standard_only: If true, list only the names in
            ``standard_data_benchmark`` that are also present in
            ``supported_datasets``; otherwise list every supported dataset.

    Returns:
        None. Everything is written to stdout; a failed import of the dataset
        lists is reported there instead of being raised.
    """
    try:
        from .data.supported_datasets import supported_datasets, standard_data_benchmark
    except ImportError as e:
        # Without the name lists there is nothing to tabulate.
        print(f"Error loading dataset information: {e}")
        print("\n=== Datasets ===\n")
        print("Could not load dataset lists. Please check your installation.")
        return

    try:
        # Imported under an alias so the module-level table stays reachable.
        from .data.dataset_descriptions import dataset_descriptions as descriptions
    except ImportError:
        descriptions = dataset_descriptions

    if show_standard_only:
        # dict.fromkeys keeps benchmark order while dropping repeated names.
        names = list(dict.fromkeys(
            name for name in standard_data_benchmark if name in supported_datasets
        ))
        print("\n=== Standard Benchmark Datasets ===\n")
    else:
        names = list(supported_datasets)
        print("\n=== All Supported Datasets ===\n")

    # Resolve every cell first so widths are computed from exactly what is
    # printed, including the 'Unknown' placeholders.
    rows = []
    for dataset_name in names:
        info = descriptions.get(dataset_name, {})
        rows.append((
            dataset_name,
            info.get('type', 'Unknown'),
            info.get('task', 'Unknown'),
            info.get('description', 'No description available'),
        ))

    # Including the header labels keeps max() from ever seeing an empty
    # sequence and keeps the header aligned when all values are short.
    header = ('Dataset', 'Type', 'Task')
    name_w, type_w, task_w = (
        max(len(cells[col]) for cells in [header, *rows]) for col in range(3)
    )

    print(f"{header[0]:<{name_w + 2}}{header[1]:<{type_w + 2}}{header[2]:<{task_w + 2}}Description")
    print("-" * (name_w + type_w + task_w + 50))

    for dataset_name, dataset_type, dataset_task, description in rows:
        print(f"{dataset_name:<{name_w + 2}}{dataset_type:<{type_w + 2}}{dataset_task:<{task_w + 2}}{description}")


def main(argv=None):
    """Parse command-line flags and print the requested listings.

    With ``--all``, or when neither ``--models`` nor ``--datasets`` is given,
    both listings are printed with a separator line between them. This
    includes the case where ``--standard-only`` is the only flag, which
    previously produced no output at all.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, as before.

    Returns:
        None. argparse raises ``SystemExit`` for ``--help`` and for invalid
        arguments.
    """
    parser = argparse.ArgumentParser(description='List Protify supported models and datasets')
    parser.add_argument('--models', action='store_true', help='List supported models')
    parser.add_argument('--datasets', action='store_true', help='List supported datasets')
    parser.add_argument('--standard-only', action='store_true', help='Show only standard models/datasets')
    parser.add_argument('--all', action='store_true', help='Show both models and datasets')

    args = parser.parse_args(argv)

    # Decide from the parsed flags rather than from len(sys.argv), so that
    # modifier-only invocations such as `--standard-only` still list both.
    show_everything = args.all or not (args.models or args.datasets)
    if show_everything:
        list_models(args.standard_only)
        print("\n" + "="*80 + "\n")
        list_datasets(args.standard_only)
        return

    if args.models:
        list_models(args.standard_only)

    if args.datasets:
        list_datasets(args.standard_only)


if __name__ == "__main__":
    main()