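# Web-page residue from the Hugging Face file viewer (not part of the module):
# path: directionality_probe / protify / resource_info.py
# uploader: nikraf
# commit: "Upload folder using huggingface_hub" (714cf46, verified)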
#!/usr/bin/env python
"""
Utility script to list Protify supported models and datasets.
"""
import argparse
import sys
# Human-readable metadata for each model name, used only for display.
# Every record has the same four string fields:
# 'description', 'size', 'type' and 'citation' ('N/A' when none is recorded).
model_descriptions = {
    'ESM2-8': {
        'description': 'Small protein language model (8M parameters) from Meta AI that learns evolutionary information from millions of protein sequences.',
        'size': '8M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
    },
    'ESM2-35': {
        'description': 'Medium-sized protein language model (35M parameters) trained on evolutionary data.',
        'size': '35M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
    },
    'ESM2-150': {
        'description': 'Large protein language model (150M parameters) with improved protein structure prediction capabilities.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
    },
    'ESM2-650': {
        'description': 'Very large protein language model (650M parameters) offering state-of-the-art performance on many protein prediction tasks.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
    },
    'ESM2-3B': {
        'description': 'Largest ESM2 protein language model (3B parameters) with exceptional capability for protein structure and function prediction.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
    },
    'Random': {
        'description': 'Baseline model with randomly initialized weights, serving as a negative control.',
        'size': 'Varies',
        'type': 'Baseline control',
        'citation': 'N/A'
    },
    'Random-Transformer': {
        'description': 'Randomly initialized transformer model serving as a homology-based control.',
        'size': 'Varies',
        'type': 'Baseline control',
        'citation': 'N/A'
    },
    # ESM C is "ESM Cambrian"; the "C" does not stand for classification.
    'ESMC-300': {
        'description': 'ESM Cambrian (ESM C) protein language model from EvolutionaryScale with 300M parameters, focused on representation learning.',
        'size': '300M parameters',
        'type': 'Protein language model',
        'citation': 'ESM Team (2024). ESM Cambrian: Revealing the mysteries of proteins with unsupervised learning.'
    },
    'ESMC-600': {
        'description': 'Larger ESM Cambrian (ESM C) protein language model with 600M parameters.',
        'size': '600M parameters',
        'type': 'Protein language model',
        'citation': 'ESM Team (2024). ESM Cambrian: Revealing the mysteries of proteins with unsupervised learning.'
    },
    'ProtBert': {
        'description': 'BERT-based protein language model trained on protein sequences from UniRef.',
        'size': '420M parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    'ProtBert-BFD': {
        'description': 'BERT-based protein language model trained on BFD database with improved performance.',
        'size': '420M parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    'ProtT5': {
        'description': 'T5-based protein language model capable of both encoding and generation tasks.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    # ProtT5-XL has roughly 3B parameters; 11B is the size of ProtT5-XXL.
    'ProtT5-XL-UniRef50-full-prec': {
        'description': 'Extra large T5-based protein language model trained on UniRef50 with full precision.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    # The Ankh paper is by Elnaggar et al. (arXiv, January 2023).
    'ANKH-Base': {
        'description': 'Base version of the ANKH protein language model focused on protein structure understanding.',
        'size': '400M parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2023). ANKH: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
    },
    'ANKH-Large': {
        'description': 'Large version of the ANKH protein language model with improved structural predictions.',
        'size': '1.2B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2023). ANKH: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
    },
    'ANKH2-Large': {
        'description': 'Improved second generation ANKH protein language model.',
        'size': '1.2B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2023). ANKH: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
    },
    # NOTE(review): the GLM2 entries presumably refer to the gLM2 *genomic*
    # language models rather than a "general" language model, and the GAIA
    # size looks doubtful -- confirm against the model registry before editing.
    'GLM2-150': {
        'description': 'Medium-sized general language model adapted for protein sequences.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'GLM2-650': {
        'description': 'Large general language model adapted for protein sequences.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'GLM2-GAIA': {
        'description': 'Specialized GLM protein language model with GAIA architecture improvements.',
        'size': '1B+ parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    # DPLM stands for "Diffusion Protein Language Model".
    'DPLM-150': {
        'description': 'Diffusion protein language model with 150M parameters for protein representation and generation.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'Wang et al. (2024). Diffusion Language Models Are Versatile Protein Learners.'
    },
    'DPLM-650': {
        'description': 'Larger diffusion protein language model with 650M parameters.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'Wang et al. (2024). Diffusion Language Models Are Versatile Protein Learners.'
    },
    'DPLM-3B': {
        'description': 'Largest diffusion protein language model in the DPLM family with 3B parameters.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'Wang et al. (2024). Diffusion Language Models Are Versatile Protein Learners.'
    },
    # NOTE(review): DSM presumably stands for "Diffusion Sequence Model";
    # confirm and update the two descriptions below if so.
    'DSM-150': {
        'description': 'Deep language model for proteins with 150M parameters.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'DSM-650': {
        'description': 'Deep language model for proteins with 650M parameters.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    }
}
# Human-readable metadata for each dataset name, used only for display.
# The table below is positional: (name, description, type, task, citation).
# It is expanded into {name: {'description', 'type', 'task', 'citation'}}.
dataset_descriptions = {
    name: dict(zip(('description', 'type', 'task', 'citation'), fields))
    for name, *fields in (
        ('EC',
         'Enzyme Commission numbers dataset for predicting enzyme function classification.',
         'Multi-label classification',
         'Protein function prediction',
         'Gleghorn Lab'),
        ('GO-CC',
         'Gene Ontology Cellular Component dataset for predicting protein localization in cells.',
         'Multi-label classification',
         'Protein localization prediction',
         'Gleghorn Lab'),
        ('GO-BP',
         'Gene Ontology Biological Process dataset for predicting protein involvement in biological processes.',
         'Multi-label classification',
         'Protein function prediction',
         'Gleghorn Lab'),
        ('GO-MF',
         'Gene Ontology Molecular Function dataset for predicting protein molecular functions.',
         'Multi-label classification',
         'Protein function prediction',
         'Gleghorn Lab'),
        ('MB',
         'Metal ion binding dataset for predicting protein-metal interactions.',
         'Classification',
         'Protein-metal binding prediction',
         'Gleghorn Lab'),
        ('DeepLoc-2',
         'Binary classification dataset for predicting protein localization in 2 categories.',
         'Binary classification',
         'Protein localization prediction',
         'Gleghorn Lab'),
        ('DeepLoc-10',
         'Multi-class classification dataset for predicting protein localization in 10 categories.',
         'Multi-class classification',
         'Protein localization prediction',
         'Gleghorn Lab'),
        ('enzyme-kcat',
         'Dataset for predicting enzyme catalytic rate constants (kcat).',
         'Regression',
         'Enzyme kinetics prediction',
         'Gleghorn Lab'),
        ('solubility',
         'Dataset for predicting protein solubility properties.',
         'Binary classification',
         'Protein solubility prediction',
         'Gleghorn Lab'),
        ('localization',
         'Dataset for predicting subcellular localization of proteins.',
         'Multi-class classification',
         'Protein localization prediction',
         'Gleghorn Lab'),
        ('temperature-stability',
         'Dataset for predicting protein stability at different temperatures.',
         'Binary classification',
         'Protein stability prediction',
         'Gleghorn Lab'),
        ('peptide-HLA-MHC-affinity',
         'Dataset for predicting peptide binding affinity to HLA/MHC complexes.',
         'Protein-protein interaction',
         'Binding affinity prediction',
         'Gleghorn Lab'),
        ('optimal-temperature',
         'Dataset for predicting the optimal temperature for protein function.',
         'Regression',
         'Protein property prediction',
         'Gleghorn Lab'),
        ('optimal-ph',
         'Dataset for predicting the optimal pH for protein function.',
         'Regression',
         'Protein property prediction',
         'Gleghorn Lab'),
        ('material-production',
         'Dataset for predicting protein suitability for material production.',
         'Classification',
         'Protein application prediction',
         'Gleghorn Lab'),
        ('fitness-prediction',
         'Dataset for predicting protein fitness in various environments.',
         'Classification',
         'Protein fitness prediction',
         'Gleghorn Lab'),
        ('number-of-folds',
         'Dataset for predicting the number of structural folds in proteins.',
         'Classification',
         'Protein structure prediction',
         'Gleghorn Lab'),
        ('cloning-clf',
         'Dataset for predicting protein suitability for cloning operations.',
         'Classification',
         'Protein engineering prediction',
         'Gleghorn Lab'),
        ('stability-prediction',
         'Dataset for predicting overall protein stability.',
         'Classification',
         'Protein stability prediction',
         'Gleghorn Lab'),
        ('human-ppi',
         'Dataset for predicting human protein-protein interactions.',
         'Protein-protein interaction',
         'PPI prediction',
         'Gleghorn Lab'),
        ('SecondaryStructure-3',
         'Dataset for predicting protein secondary structure in 3 classes.',
         'Token-wise classification',
         'Protein structure prediction',
         'Gleghorn Lab'),
        ('SecondaryStructure-8',
         'Dataset for predicting protein secondary structure in 8 classes.',
         'Token-wise classification',
         'Protein structure prediction',
         'Gleghorn Lab'),
        ('fluorescence-prediction',
         'Dataset for predicting protein fluorescence properties.',
         'Token-wise regression',
         'Protein property prediction',
         'Gleghorn Lab'),
        ('plastic',
         'Dataset for predicting protein capability for plastic degradation.',
         'Classification',
         'Enzyme function prediction',
         'Gleghorn Lab'),
        ('gold-ppi',
         'Gold standard dataset for protein-protein interaction prediction.',
         'Protein-protein interaction',
         'PPI prediction',
         'Synthyra/bernett_gold_ppi'),
        ('human-ppi-pinui',
         'Human protein-protein interaction dataset from PiNUI.',
         'Protein-protein interaction',
         'PPI prediction',
         'Gleghorn Lab'),
        ('yeast-ppi-pinui',
         'Yeast protein-protein interaction dataset from PiNUI.',
         'Protein-protein interaction',
         'PPI prediction',
         'Gleghorn Lab'),
        ('shs27-ppi',
         'SHS27k dataset containing 27,000 protein-protein interactions.',
         'Protein-protein interaction',
         'PPI prediction',
         'Synthyra/SHS27k'),
        ('shs148-ppi',
         'SHS148k dataset containing 148,000 protein-protein interactions.',
         'Protein-protein interaction',
         'PPI prediction',
         'Synthyra/SHS148k'),
        ('PPA-ppi',
         'Protein-Protein Affinity dataset for quantitative binding predictions.',
         'Protein-protein interaction',
         'PPI affinity prediction',
         'Synthyra/ProteinProteinAffinity'),
    )
}
def list_models(show_standard_only=False):
    """Print the supported models, as an aligned table when descriptions load.

    The model lists and the description table are imported lazily from
    sibling modules of the enclosing package. If the description table
    cannot be imported, a plain bullet list of model names is printed
    instead; if the model lists themselves cannot be imported, only an
    error message is printed.

    Args:
        show_standard_only: When true, show ``standard_models`` instead of
            ``currently_supported_models``.

    Returns:
        None. All output goes to stdout.
    """
    # The model lists are mandatory: without them there is nothing to print.
    try:
        from .base_models.get_base_models import currently_supported_models, standard_models
    except ImportError as e:
        print(f"Error loading model information: {e}")
        print("\n=== Models ===\n")
        print("Could not load model lists. Please check your installation.")
        return

    models_to_show = standard_models if show_standard_only else currently_supported_models

    # Descriptions are optional: degrade to a plain list of names.
    try:
        from .base_models.model_descriptions import model_descriptions
    except ImportError as e:
        print(f"Error loading model information: {e}")
        print("\n=== Models ===\n")
        for model_name in models_to_show:
            print(f"- {model_name}")
        return

    title = "Standard Models" if show_standard_only else "All Supported Models"
    print(f"\n=== {title} ===\n")

    header = ('Model', 'Type', 'Size', 'Description')
    rows = []
    for model_name in models_to_show:
        info = model_descriptions.get(model_name, {})
        rows.append((
            model_name,
            info.get('type', 'Unknown'),
            info.get('size', 'Unknown'),
            info.get('description', 'No description available'),
        ))

    # Measure every cell that will be printed, header and 'Unknown'
    # placeholders included. This keeps the columns aligned for models without
    # a description and avoids calling max() on an empty sequence.
    name_len, type_len, size_len = (
        max(len(row[col]) for row in (header, *rows)) for col in range(3)
    )

    def format_row(row):
        name, kind, size, description = row
        return f"{name:<{name_len + 2}}{kind:<{type_len + 2}}{size:<{size_len + 2}}{description}"

    print(format_row(header))
    print("-" * (name_len + type_len + size_len + 50))
    for row in rows:
        print(format_row(row))
def list_datasets(show_standard_only=False):
    """Print the supported datasets, as an aligned table when descriptions load.

    The dataset registry and the description table are imported lazily from
    sibling modules of the enclosing package. If the description table
    cannot be imported, a plain ``- name: source`` list is printed instead;
    if the registry itself cannot be imported, only an error message is
    printed.

    Args:
        show_standard_only: When true, show only the names listed in
            ``standard_data_benchmark`` that are also present in
            ``supported_datasets``.

    Returns:
        None. All output goes to stdout.
    """
    # The dataset registry is mandatory: without it there is nothing to print.
    try:
        from .data.supported_datasets import supported_datasets, standard_data_benchmark
    except ImportError as e:
        print(f"Error loading dataset information: {e}")
        print("\n=== Datasets ===\n")
        print("Could not load dataset lists. Please check your installation.")
        return

    if show_standard_only:
        datasets_to_show = {
            name: supported_datasets[name]
            for name in standard_data_benchmark
            if name in supported_datasets
        }
    else:
        datasets_to_show = supported_datasets

    # Descriptions are optional: degrade to a plain "name: source" list.
    try:
        from .data.dataset_descriptions import dataset_descriptions
    except ImportError as e:
        print(f"Error loading dataset information: {e}")
        print("\n=== Datasets ===\n")
        for dataset_name, dataset_source in datasets_to_show.items():
            print(f"- {dataset_name}: {dataset_source}")
        return

    title = "Standard Benchmark Datasets" if show_standard_only else "All Supported Datasets"
    print(f"\n=== {title} ===\n")

    header = ('Dataset', 'Type', 'Task', 'Description')
    rows = []
    for dataset_name in datasets_to_show:
        info = dataset_descriptions.get(dataset_name, {})
        rows.append((
            dataset_name,
            info.get('type', 'Unknown'),
            info.get('task', 'Unknown'),
            info.get('description', 'No description available'),
        ))

    # Measure every cell that will be printed, header and 'Unknown'
    # placeholders included. This keeps the columns aligned for datasets
    # without a description and avoids calling max() on an empty sequence.
    name_len, type_len, task_len = (
        max(len(row[col]) for row in (header, *rows)) for col in range(3)
    )

    def format_row(row):
        name, kind, task, description = row
        return f"{name:<{name_len + 2}}{kind:<{type_len + 2}}{task:<{task_len + 2}}{description}"

    print(format_row(header))
    print("-" * (name_len + type_len + task_len + 50))
    for row in rows:
        print(format_row(row))
def main(argv=None):
    """Parse command-line options and print the requested listings.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default),
            argparse reads ``sys.argv[1:]``.

    ``--models`` and ``--datasets`` select what is printed, and ``--all``
    selects both. When no selection is given (including ``--standard-only``
    on its own), both listings are printed. ``--standard-only`` restricts
    each listing to its standard subset.
    """
    parser = argparse.ArgumentParser(description='List Protify supported models and datasets')
    parser.add_argument('--models', action='store_true', help='List supported models')
    parser.add_argument('--datasets', action='store_true', help='List supported datasets')
    parser.add_argument('--standard-only', action='store_true', help='Show only standard models/datasets')
    parser.add_argument('--all', action='store_true', help='Show both models and datasets')
    args = parser.parse_args(argv)

    show_models = args.models or args.all
    show_datasets = args.datasets or args.all
    # Decide from the parsed flags rather than len(sys.argv), so that
    # "--standard-only" alone lists both kinds instead of printing nothing.
    if not (show_models or show_datasets):
        show_models = show_datasets = True

    if show_models:
        list_models(args.standard_only)
    if show_models and show_datasets:
        print("\n" + "="*80 + "\n")
    if show_datasets:
        list_datasets(args.standard_only)


if __name__ == "__main__":
    main()