#!/usr/bin/env python
"""
Utility script to list Protify supported models and datasets.
"""
import argparse
import sys
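# Example invocations (illustrative: the exact module path depends on how the
# Protify package is installed, so `protify.list_supported` below is an assumed
# name). Because the script uses package-relative imports, run it as a module
# rather than as a standalone file:
#
#   python -m protify.list_supported                          # models and datasets
#   python -m protify.list_supported --models                 # models only
#   python -m protify.list_supported --datasets --standard-only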
model_descriptions = {
    'ESM2-8': {
        'description': 'Small protein language model (8M parameters) from Meta AI that learns evolutionary information from millions of protein sequences.',
        'size': '8M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic-level protein structure with a language model.'
    },
    'ESM2-35': {
        'description': 'Medium-sized protein language model (35M parameters) trained on evolutionary data.',
        'size': '35M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic-level protein structure with a language model.'
    },
    'ESM2-150': {
        'description': 'Large protein language model (150M parameters) with improved protein structure prediction capabilities.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic-level protein structure with a language model.'
    },
    'ESM2-650': {
        'description': 'Very large protein language model (650M parameters) offering state-of-the-art performance on many protein prediction tasks.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic-level protein structure with a language model.'
    },
    'ESM2-3B': {
        'description': 'Largest supported ESM2 protein language model (3B parameters) with exceptional capability for protein structure and function prediction.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic-level protein structure with a language model.'
    },
    'Random': {
        'description': 'Baseline model with randomly initialized weights, serving as a negative control.',
        'size': 'Varies',
        'type': 'Baseline control',
        'citation': 'N/A'
    },
    'Random-Transformer': {
        'description': 'Randomly initialized transformer model serving as a homology-based control.',
        'size': 'Varies',
        'type': 'Baseline control',
        'citation': 'N/A'
    },
    'ESMC-300': {
        'description': 'ESM Cambrian protein language model with 300M parameters, an efficiency-focused successor to ESM2.',
        'size': '300M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'ESMC-600': {
        'description': 'Larger ESM Cambrian protein language model with 600M parameters.',
        'size': '600M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'ProtBert': {
        'description': 'BERT-based protein language model trained on protein sequences from UniRef.',
        'size': '420M parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    'ProtBert-BFD': {
        'description': 'BERT-based protein language model trained on the BFD database with improved performance.',
        'size': '420M parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    'ProtT5': {
        'description': 'T5-based protein language model capable of both encoding and generation tasks.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    'ProtT5-XL-UniRef50-full-prec': {
        'description': 'Extra-large (XL) T5-based protein language model trained on UniRef50, stored in full precision.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    'ANKH-Base': {
        'description': 'Base version of the ANKH protein language model focused on protein structure understanding.',
        'size': '400M parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2023). Ankh: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
    },
    'ANKH-Large': {
        'description': 'Large version of the ANKH protein language model with improved structural predictions.',
        'size': '1.2B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2023). Ankh: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
    },
    'ANKH2-Large': {
        'description': 'Improved second-generation ANKH protein language model.',
        'size': '1.2B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2023). Ankh: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
    },
    'GLM2-150': {
        'description': 'Medium-sized general language model adapted for protein sequences.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'GLM2-650': {
        'description': 'Large general language model adapted for protein sequences.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'GLM2-GAIA': {
        'description': 'Specialized GLM protein language model with GAIA architecture improvements.',
        'size': '1B+ parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'DPLM-150': {
        'description': 'Diffusion protein language model (DPLM) with 150M parameters for protein sequence understanding and generation.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'DPLM-650': {
        'description': 'Larger diffusion protein language model with 650M parameters.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'DPLM-3B': {
        'description': 'Largest diffusion protein language model in the DPLM family with 3B parameters.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'DSM-150': {
        'description': 'Deep language model for proteins with 150M parameters.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'DSM-650': {
        'description': 'Deep language model for proteins with 650M parameters.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    }
}
dataset_descriptions = {
    'EC': {
        'description': 'Enzyme Commission numbers dataset for predicting enzyme function classification.',
        'type': 'Multi-label classification',
        'task': 'Protein function prediction',
        'citation': 'Gleghorn Lab'
    },
    'GO-CC': {
        'description': 'Gene Ontology Cellular Component dataset for predicting protein localization in cells.',
        'type': 'Multi-label classification',
        'task': 'Protein localization prediction',
        'citation': 'Gleghorn Lab'
    },
    'GO-BP': {
        'description': 'Gene Ontology Biological Process dataset for predicting protein involvement in biological processes.',
        'type': 'Multi-label classification',
        'task': 'Protein function prediction',
        'citation': 'Gleghorn Lab'
    },
    'GO-MF': {
        'description': 'Gene Ontology Molecular Function dataset for predicting protein molecular functions.',
        'type': 'Multi-label classification',
        'task': 'Protein function prediction',
        'citation': 'Gleghorn Lab'
    },
    'MB': {
        'description': 'Metal ion binding dataset for predicting protein-metal interactions.',
        'type': 'Classification',
        'task': 'Protein-metal binding prediction',
        'citation': 'Gleghorn Lab'
    },
    'DeepLoc-2': {
        'description': 'Binary classification dataset for predicting protein localization in 2 categories.',
        'type': 'Binary classification',
        'task': 'Protein localization prediction',
        'citation': 'Gleghorn Lab'
    },
    'DeepLoc-10': {
        'description': 'Multi-class classification dataset for predicting protein localization in 10 categories.',
        'type': 'Multi-class classification',
        'task': 'Protein localization prediction',
        'citation': 'Gleghorn Lab'
    },
    'enzyme-kcat': {
        'description': 'Dataset for predicting enzyme catalytic rate constants (kcat).',
        'type': 'Regression',
        'task': 'Enzyme kinetics prediction',
        'citation': 'Gleghorn Lab'
    },
    'solubility': {
        'description': 'Dataset for predicting protein solubility properties.',
        'type': 'Binary classification',
        'task': 'Protein solubility prediction',
        'citation': 'Gleghorn Lab'
    },
    'localization': {
        'description': 'Dataset for predicting subcellular localization of proteins.',
        'type': 'Multi-class classification',
        'task': 'Protein localization prediction',
        'citation': 'Gleghorn Lab'
    },
    'temperature-stability': {
        'description': 'Dataset for predicting protein stability at different temperatures.',
        'type': 'Binary classification',
        'task': 'Protein stability prediction',
        'citation': 'Gleghorn Lab'
    },
    'peptide-HLA-MHC-affinity': {
        'description': 'Dataset for predicting peptide binding affinity to HLA/MHC complexes.',
        'type': 'Protein-protein interaction',
        'task': 'Binding affinity prediction',
        'citation': 'Gleghorn Lab'
    },
    'optimal-temperature': {
        'description': 'Dataset for predicting the optimal temperature for protein function.',
        'type': 'Regression',
        'task': 'Protein property prediction',
        'citation': 'Gleghorn Lab'
    },
    'optimal-ph': {
        'description': 'Dataset for predicting the optimal pH for protein function.',
        'type': 'Regression',
        'task': 'Protein property prediction',
        'citation': 'Gleghorn Lab'
    },
    'material-production': {
        'description': 'Dataset for predicting protein suitability for material production.',
        'type': 'Classification',
        'task': 'Protein application prediction',
        'citation': 'Gleghorn Lab'
    },
    'fitness-prediction': {
        'description': 'Dataset for predicting protein fitness in various environments.',
        'type': 'Classification',
        'task': 'Protein fitness prediction',
        'citation': 'Gleghorn Lab'
    },
    'number-of-folds': {
        'description': 'Dataset for predicting the number of structural folds in proteins.',
        'type': 'Classification',
        'task': 'Protein structure prediction',
        'citation': 'Gleghorn Lab'
    },
    'cloning-clf': {
        'description': 'Dataset for predicting protein suitability for cloning operations.',
        'type': 'Classification',
        'task': 'Protein engineering prediction',
        'citation': 'Gleghorn Lab'
    },
    'stability-prediction': {
        'description': 'Dataset for predicting overall protein stability.',
        'type': 'Classification',
        'task': 'Protein stability prediction',
        'citation': 'Gleghorn Lab'
    },
    'human-ppi': {
        'description': 'Dataset for predicting human protein-protein interactions.',
        'type': 'Protein-protein interaction',
        'task': 'PPI prediction',
        'citation': 'Gleghorn Lab'
    },
    'SecondaryStructure-3': {
        'description': 'Dataset for predicting protein secondary structure in 3 classes.',
        'type': 'Token-wise classification',
        'task': 'Protein structure prediction',
        'citation': 'Gleghorn Lab'
    },
    'SecondaryStructure-8': {
        'description': 'Dataset for predicting protein secondary structure in 8 classes.',
        'type': 'Token-wise classification',
        'task': 'Protein structure prediction',
        'citation': 'Gleghorn Lab'
    },
    'fluorescence-prediction': {
        'description': 'Dataset for predicting protein fluorescence properties.',
        'type': 'Token-wise regression',
        'task': 'Protein property prediction',
        'citation': 'Gleghorn Lab'
    },
    'plastic': {
        'description': 'Dataset for predicting protein capability for plastic degradation.',
        'type': 'Classification',
        'task': 'Enzyme function prediction',
        'citation': 'Gleghorn Lab'
    },
    'gold-ppi': {
        'description': 'Gold standard dataset for protein-protein interaction prediction.',
        'type': 'Protein-protein interaction',
        'task': 'PPI prediction',
        'citation': 'Synthyra/bernett_gold_ppi'
    },
    'human-ppi-pinui': {
        'description': 'Human protein-protein interaction dataset from PiNUI.',
        'type': 'Protein-protein interaction',
        'task': 'PPI prediction',
        'citation': 'Gleghorn Lab'
    },
    'yeast-ppi-pinui': {
        'description': 'Yeast protein-protein interaction dataset from PiNUI.',
        'type': 'Protein-protein interaction',
        'task': 'PPI prediction',
        'citation': 'Gleghorn Lab'
    },
    'shs27-ppi': {
        'description': 'SHS27k dataset containing 27,000 protein-protein interactions.',
        'type': 'Protein-protein interaction',
        'task': 'PPI prediction',
        'citation': 'Synthyra/SHS27k'
    },
    'shs148-ppi': {
        'description': 'SHS148k dataset containing 148,000 protein-protein interactions.',
        'type': 'Protein-protein interaction',
        'task': 'PPI prediction',
        'citation': 'Synthyra/SHS148k'
    },
    'PPA-ppi': {
        'description': 'Protein-Protein Affinity dataset for quantitative binding predictions.',
        'type': 'Protein-protein interaction',
        'task': 'PPI affinity prediction',
        'citation': 'Synthyra/ProteinProteinAffinity'
    },
}


def list_models(show_standard_only=False):
    """List available models with descriptions if available."""
    try:
        from .base_models.get_base_models import currently_supported_models, standard_models
        # The packaged description table; within this function it shadows the
        # module-level model_descriptions defined above.
        from .base_models.model_descriptions import model_descriptions

        if show_standard_only:
            models_to_show = standard_models
            print("\n=== Standard Models ===\n")
        else:
            models_to_show = currently_supported_models
            print("\n=== All Supported Models ===\n")

        # Calculate maximum widths for formatting; the default keeps max() from
        # failing if no model in the list has a description entry.
        max_name_len = max(len(name) for name in models_to_show)
        max_type_len = max((len(model_descriptions.get(name, {}).get('type', 'Unknown')) for name in models_to_show if name in model_descriptions), default=len('Unknown'))
        max_size_len = max((len(model_descriptions.get(name, {}).get('size', 'Unknown')) for name in models_to_show if name in model_descriptions), default=len('Unknown'))

        # Print header
        print(f"{'Model':<{max_name_len+2}}{'Type':<{max_type_len+2}}{'Size':<{max_size_len+2}}Description")
        print("-" * (max_name_len + max_type_len + max_size_len + 50))

        # Print model information
        for model_name in models_to_show:
            if model_name in model_descriptions:
                model_info = model_descriptions[model_name]
                print(f"{model_name:<{max_name_len+2}}{model_info.get('type', 'Unknown'):<{max_type_len+2}}{model_info.get('size', 'Unknown'):<{max_size_len+2}}{model_info.get('description', 'No description available')}")
            else:
                print(f"{model_name:<{max_name_len+2}}{'Unknown':<{max_type_len+2}}{'Unknown':<{max_size_len+2}}No description available")
    except ImportError as e:
        # Fall back to a plain list of names if the description table cannot be imported.
        print(f"Error loading model information: {e}")
        print("\n=== Models ===\n")
        try:
            from .base_models.get_base_models import currently_supported_models, standard_models
            if show_standard_only:
                for model_name in standard_models:
                    print(f"- {model_name}")
            else:
                for model_name in currently_supported_models:
                    print(f"- {model_name}")
        except ImportError:
            print("Could not load model lists. Please check your installation.")
{model_name}") except ImportError: print("Could not load model lists. Please check your installation.") def list_datasets(show_standard_only=False): """List available datasets with descriptions if available""" try: from .data.supported_datasets import supported_datasets, standard_data_benchmark from .data.dataset_descriptions import dataset_descriptions if show_standard_only: datasets_to_show = {name: supported_datasets[name] for name in standard_data_benchmark if name in supported_datasets} print("\n=== Standard Benchmark Datasets ===\n") else: datasets_to_show = supported_datasets print("\n=== All Supported Datasets ===\n") # Calculate maximum widths for formatting max_name_len = max(len(name) for name in datasets_to_show) max_type_len = max(len(dataset_descriptions.get(name, {}).get('type', 'Unknown')) for name in datasets_to_show if name in dataset_descriptions) max_task_len = max(len(dataset_descriptions.get(name, {}).get('task', 'Unknown')) for name in datasets_to_show if name in dataset_descriptions) # Print header print(f"{'Dataset':<{max_name_len+2}}{'Type':<{max_type_len+2}}{'Task':<{max_task_len+2}}Description") print("-" * (max_name_len + max_type_len + max_task_len + 50)) # Print dataset information for dataset_name in datasets_to_show: if dataset_name in dataset_descriptions: dataset_info = dataset_descriptions[dataset_name] print(f"{dataset_name:<{max_name_len+2}}{dataset_info.get('type', 'Unknown'):<{max_type_len+2}}{dataset_info.get('task', 'Unknown'):<{max_task_len+2}}{dataset_info.get('description', 'No description available')}") else: print(f"{dataset_name:<{max_name_len+2}}{'Unknown':<{max_type_len+2}}{'Unknown':<{max_task_len+2}}No description available") except ImportError as e: print(f"Error loading dataset information: {e}") print("\n=== Datasets ===\n") try: from .data.supported_datasets import supported_datasets, standard_data_benchmark if show_standard_only: for dataset_name in standard_data_benchmark: if dataset_name in supported_datasets: print(f"- {dataset_name}: {supported_datasets[dataset_name]}") else: for dataset_name, dataset_source in supported_datasets.items(): print(f"- {dataset_name}: {dataset_source}") except ImportError: print("Could not load dataset lists. Please check your installation.") def main(): """Main function to run the script from command line""" parser = argparse.ArgumentParser(description='List Protify supported models and datasets') parser.add_argument('--models', action='store_true', help='List supported models') parser.add_argument('--datasets', action='store_true', help='List supported datasets') parser.add_argument('--standard-only', action='store_true', help='Show only standard models/datasets') parser.add_argument('--all', action='store_true', help='Show both models and datasets') args = parser.parse_args() if len(sys.argv) == 1 or args.all: list_models(args.standard_only) print("\n" + "="*80 + "\n") list_datasets(args.standard_only) return if args.models: list_models(args.standard_only) if args.datasets: list_datasets(args.standard_only) if __name__ == "__main__": main()