Upload folder using huggingface_hub

714cf46 verified 20 days ago

20.4 kB

	#!/usr/bin/env python
	"""
	Utility script to list Protify supported models and datasets.
	"""
	import argparse
	import sys


	# Display metadata for each base model, keyed by model name. Every entry
	# carries the same four string fields: 'description', 'size', 'type' and
	# 'citation' ('N/A' when no citation is recorded).
	model_descriptions = {
	    'ESM2-8': {
	        'description': 'Small protein language model (8M parameters) from Meta AI that learns evolutionary information from millions of protein sequences.',
	        'size': '8M parameters',
	        'type': 'Protein language model',
	        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
	    },
	    'ESM2-35': {
	        'description': 'Medium-sized protein language model (35M parameters) trained on evolutionary data.',
	        'size': '35M parameters',
	        'type': 'Protein language model',
	        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
	    },
	    'ESM2-150': {
	        'description': 'Large protein language model (150M parameters) with improved protein structure prediction capabilities.',
	        'size': '150M parameters',
	        'type': 'Protein language model',
	        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
	    },
	    'ESM2-650': {
	        'description': 'Very large protein language model (650M parameters) offering state-of-the-art performance on many protein prediction tasks.',
	        'size': '650M parameters',
	        'type': 'Protein language model',
	        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
	    },
	    'ESM2-3B': {
	        'description': 'Largest ESM2 protein language model (3B parameters) with exceptional capability for protein structure and function prediction.',
	        'size': '3B parameters',
	        'type': 'Protein language model',
	        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
	    },
	    'Random': {
	        'description': 'Baseline model with randomly initialized weights, serving as a negative control.',
	        'size': 'Varies',
	        'type': 'Baseline control',
	        'citation': 'N/A'
	    },
	    'Random-Transformer': {
	        'description': 'Randomly initialized transformer model serving as a homology-based control.',
	        'size': 'Varies',
	        'type': 'Baseline control',
	        'citation': 'N/A'
	    },
	    'ESMC-300': {
	        'description': 'Protein language model optimized for classification tasks with 300M parameters.',
	        'size': '300M parameters',
	        'type': 'Protein language model',
	        'citation': 'N/A'
	    },
	    'ESMC-600': {
	        'description': 'Larger protein language model for classification with 600M parameters.',
	        'size': '600M parameters',
	        'type': 'Protein language model',
	        'citation': 'N/A'
	    },
	    'ProtBert': {
	        'description': 'BERT-based protein language model trained on protein sequences from UniRef.',
	        'size': '420M parameters',
	        'type': 'Protein language model',
	        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
	    },
	    'ProtBert-BFD': {
	        'description': 'BERT-based protein language model trained on BFD database with improved performance.',
	        'size': '420M parameters',
	        'type': 'Protein language model',
	        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
	    },
	    'ProtT5': {
	        'description': 'T5-based protein language model capable of both encoding and generation tasks.',
	        'size': '3B parameters',
	        'type': 'Protein language model',
	        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
	    },
	    'ProtT5-XL-UniRef50-full-prec': {
	        'description': 'Extra large T5-based protein language model trained on UniRef50 with full precision.',
	        # The XL checkpoint has roughly 3B parameters; 11B is the XXL variant.
	        'size': '3B parameters',
	        'type': 'Protein language model',
	        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
	    },
	    'ANKH-Base': {
	        'description': 'Base version of the ANKH protein language model focused on protein structure understanding.',
	        'size': '400M parameters',
	        'type': 'Protein language model',
	        'citation': 'Elnaggar et al. (2023). ANKH: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
	    },
	    'ANKH-Large': {
	        'description': 'Large version of the ANKH protein language model with improved structural predictions.',
	        'size': '1.2B parameters',
	        'type': 'Protein language model',
	        'citation': 'Elnaggar et al. (2023). ANKH: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
	    },
	    'ANKH2-Large': {
	        'description': 'Improved second generation ANKH protein language model.',
	        'size': '1.2B parameters',
	        'type': 'Protein language model',
	        'citation': 'Elnaggar et al. (2023). ANKH: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
	    },
	    'GLM2-150': {
	        'description': 'Medium-sized general language model adapted for protein sequences.',
	        'size': '150M parameters',
	        'type': 'Protein language model',
	        'citation': 'N/A'
	    },
	    'GLM2-650': {
	        'description': 'Large general language model adapted for protein sequences.',
	        'size': '650M parameters',
	        'type': 'Protein language model',
	        'citation': 'N/A'
	    },
	    'GLM2-GAIA': {
	        'description': 'Specialized GLM protein language model with GAIA architecture improvements.',
	        'size': '1B+ parameters',
	        'type': 'Protein language model',
	        'citation': 'N/A'
	    },
	    'DPLM-150': {
	        'description': 'Deep protein language model with 150M parameters focused on protein structure.',
	        'size': '150M parameters',
	        'type': 'Protein language model',
	        'citation': 'N/A'
	    },
	    'DPLM-650': {
	        'description': 'Larger deep protein language model with 650M parameters.',
	        'size': '650M parameters',
	        'type': 'Protein language model',
	        'citation': 'N/A'
	    },
	    'DPLM-3B': {
	        'description': 'Largest deep protein language model in the DPLM family with 3B parameters.',
	        'size': '3B parameters',
	        'type': 'Protein language model',
	        'citation': 'N/A'
	    },
	    'DSM-150': {
	        'description': 'Deep language model for proteins with 150M parameters.',
	        'size': '150M parameters',
	        'type': 'Protein language model',
	        'citation': 'N/A'
	    },
	    'DSM-650': {
	        'description': 'Deep language model for proteins with 650M parameters.',
	        'size': '650M parameters',
	        'type': 'Protein language model',
	        'citation': 'N/A'
	    }
	}


	# Field names shared by every dataset entry, in the order they are stored.
	_DATASET_FIELDS = ('description', 'type', 'task', 'citation')

	# Citation string used by most of the datasets below.
	_GLEGHORN_LAB = 'Gleghorn Lab'

	# One row per dataset: (name, description, type, task, citation).
	_DATASET_ROWS = (
	    (
	        'EC',
	        'Enzyme Commission numbers dataset for predicting enzyme function classification.',
	        'Multi-label classification',
	        'Protein function prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'GO-CC',
	        'Gene Ontology Cellular Component dataset for predicting protein localization in cells.',
	        'Multi-label classification',
	        'Protein localization prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'GO-BP',
	        'Gene Ontology Biological Process dataset for predicting protein involvement in biological processes.',
	        'Multi-label classification',
	        'Protein function prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'GO-MF',
	        'Gene Ontology Molecular Function dataset for predicting protein molecular functions.',
	        'Multi-label classification',
	        'Protein function prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'MB',
	        'Metal ion binding dataset for predicting protein-metal interactions.',
	        'Classification',
	        'Protein-metal binding prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'DeepLoc-2',
	        'Binary classification dataset for predicting protein localization in 2 categories.',
	        'Binary classification',
	        'Protein localization prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'DeepLoc-10',
	        'Multi-class classification dataset for predicting protein localization in 10 categories.',
	        'Multi-class classification',
	        'Protein localization prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'enzyme-kcat',
	        'Dataset for predicting enzyme catalytic rate constants (kcat).',
	        'Regression',
	        'Enzyme kinetics prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'solubility',
	        'Dataset for predicting protein solubility properties.',
	        'Binary classification',
	        'Protein solubility prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'localization',
	        'Dataset for predicting subcellular localization of proteins.',
	        'Multi-class classification',
	        'Protein localization prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'temperature-stability',
	        'Dataset for predicting protein stability at different temperatures.',
	        'Binary classification',
	        'Protein stability prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'peptide-HLA-MHC-affinity',
	        'Dataset for predicting peptide binding affinity to HLA/MHC complexes.',
	        'Protein-protein interaction',
	        'Binding affinity prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'optimal-temperature',
	        'Dataset for predicting the optimal temperature for protein function.',
	        'Regression',
	        'Protein property prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'optimal-ph',
	        'Dataset for predicting the optimal pH for protein function.',
	        'Regression',
	        'Protein property prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'material-production',
	        'Dataset for predicting protein suitability for material production.',
	        'Classification',
	        'Protein application prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'fitness-prediction',
	        'Dataset for predicting protein fitness in various environments.',
	        'Classification',
	        'Protein fitness prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'number-of-folds',
	        'Dataset for predicting the number of structural folds in proteins.',
	        'Classification',
	        'Protein structure prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'cloning-clf',
	        'Dataset for predicting protein suitability for cloning operations.',
	        'Classification',
	        'Protein engineering prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'stability-prediction',
	        'Dataset for predicting overall protein stability.',
	        'Classification',
	        'Protein stability prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'human-ppi',
	        'Dataset for predicting human protein-protein interactions.',
	        'Protein-protein interaction',
	        'PPI prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'SecondaryStructure-3',
	        'Dataset for predicting protein secondary structure in 3 classes.',
	        'Token-wise classification',
	        'Protein structure prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'SecondaryStructure-8',
	        'Dataset for predicting protein secondary structure in 8 classes.',
	        'Token-wise classification',
	        'Protein structure prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'fluorescence-prediction',
	        'Dataset for predicting protein fluorescence properties.',
	        'Token-wise regression',
	        'Protein property prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'plastic',
	        'Dataset for predicting protein capability for plastic degradation.',
	        'Classification',
	        'Enzyme function prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'gold-ppi',
	        'Gold standard dataset for protein-protein interaction prediction.',
	        'Protein-protein interaction',
	        'PPI prediction',
	        'Synthyra/bernett_gold_ppi',
	    ),
	    (
	        'human-ppi-pinui',
	        'Human protein-protein interaction dataset from PiNUI.',
	        'Protein-protein interaction',
	        'PPI prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'yeast-ppi-pinui',
	        'Yeast protein-protein interaction dataset from PiNUI.',
	        'Protein-protein interaction',
	        'PPI prediction',
	        _GLEGHORN_LAB,
	    ),
	    (
	        'shs27-ppi',
	        'SHS27k dataset containing 27,000 protein-protein interactions.',
	        'Protein-protein interaction',
	        'PPI prediction',
	        'Synthyra/SHS27k',
	    ),
	    (
	        'shs148-ppi',
	        'SHS148k dataset containing 148,000 protein-protein interactions.',
	        'Protein-protein interaction',
	        'PPI prediction',
	        'Synthyra/SHS148k',
	    ),
	    (
	        'PPA-ppi',
	        'Protein-Protein Affinity dataset for quantitative binding predictions.',
	        'Protein-protein interaction',
	        'PPI affinity prediction',
	        'Synthyra/ProteinProteinAffinity',
	    ),
	)

	# Display metadata for each dataset, keyed by dataset name. Each value maps
	# 'description', 'type', 'task' and 'citation' to a string.
	dataset_descriptions = {
	    name: dict(zip(_DATASET_FIELDS, fields))
	    for name, *fields in _DATASET_ROWS
	}


	def list_models(show_standard_only=False):
	    """Print the supported models as an aligned table on stdout.

	    Args:
	        show_standard_only: If true, list ``standard_models`` only;
	            otherwise list ``currently_supported_models``.

	    Nothing is returned. Import problems are reported on stdout instead of
	    being raised: if the model lists cannot be imported a short error is
	    printed, and if only the descriptions module is missing the model names
	    are printed as a plain bullet list.
	    """
	    try:
	        from .base_models.get_base_models import currently_supported_models, standard_models
	    except ImportError as e:
	        # Without the model lists there is nothing to show at all.
	        print(f"Error loading model information: {e}")
	        print("\n=== Models ===\n")
	        print("Could not load model lists. Please check your installation.")
	        return

	    models_to_show = standard_models if show_standard_only else currently_supported_models

	    try:
	        from .base_models.model_descriptions import model_descriptions
	    except ImportError as e:
	        # Descriptions are optional: fall back to a bare list of names.
	        print(f"Error loading model information: {e}")
	        print("\n=== Models ===\n")
	        for model_name in models_to_show:
	            print(f"- {model_name}")
	        return

	    if show_standard_only:
	        print("\n=== Standard Models ===\n")
	    else:
	        print("\n=== All Supported Models ===\n")

	    # Resolve every row up front so the column widths are computed from
	    # exactly what gets printed, including the 'Unknown' placeholders.
	    rows = []
	    for model_name in models_to_show:
	        model_info = model_descriptions.get(model_name, {})
	        rows.append((
	            model_name,
	            model_info.get('type', 'Unknown'),
	            model_info.get('size', 'Unknown'),
	            model_info.get('description', 'No description available'),
	        ))

	    # Seeding each width with its header label keeps max() from failing on
	    # an empty listing and stops a header from running into the next column.
	    headers = ('Model', 'Type', 'Size')
	    name_width, type_width, size_width = (
	        max([len(header)] + [len(row[column]) for row in rows])
	        for column, header in enumerate(headers)
	    )

	    # Print header
	    print(f"{'Model':<{name_width+2}}{'Type':<{type_width+2}}{'Size':<{size_width+2}}Description")
	    print("-" * (name_width + type_width + size_width + 50))

	    # Print model information
	    for model_name, model_type, model_size, description in rows:
	        print(f"{model_name:<{name_width+2}}{model_type:<{type_width+2}}{model_size:<{size_width+2}}{description}")


	def list_datasets(show_standard_only=False):
	    """Print the supported datasets as an aligned table on stdout.

	    Args:
	        show_standard_only: If true, list only the names in
	            ``standard_data_benchmark`` that are also present in
	            ``supported_datasets``; otherwise list every supported dataset.

	    Nothing is returned. Import problems are reported on stdout instead of
	    being raised: if the dataset lists cannot be imported a short error is
	    printed, and if only the descriptions module is missing each dataset is
	    printed as a ``- name: source`` bullet.
	    """
	    try:
	        from .data.supported_datasets import supported_datasets, standard_data_benchmark
	    except ImportError as e:
	        # Without the dataset lists there is nothing to show at all.
	        print(f"Error loading dataset information: {e}")
	        print("\n=== Datasets ===\n")
	        print("Could not load dataset lists. Please check your installation.")
	        return

	    if show_standard_only:
	        datasets_to_show = {
	            name: supported_datasets[name]
	            for name in standard_data_benchmark
	            if name in supported_datasets
	        }
	    else:
	        datasets_to_show = supported_datasets

	    try:
	        from .data.dataset_descriptions import dataset_descriptions
	    except ImportError as e:
	        # Descriptions are optional: fall back to name/source bullets.
	        print(f"Error loading dataset information: {e}")
	        print("\n=== Datasets ===\n")
	        for dataset_name, dataset_source in datasets_to_show.items():
	            print(f"- {dataset_name}: {dataset_source}")
	        return

	    if show_standard_only:
	        print("\n=== Standard Benchmark Datasets ===\n")
	    else:
	        print("\n=== All Supported Datasets ===\n")

	    # Resolve every row up front so the column widths are computed from
	    # exactly what gets printed, including the 'Unknown' placeholders.
	    rows = []
	    for dataset_name in datasets_to_show:
	        dataset_info = dataset_descriptions.get(dataset_name, {})
	        rows.append((
	            dataset_name,
	            dataset_info.get('type', 'Unknown'),
	            dataset_info.get('task', 'Unknown'),
	            dataset_info.get('description', 'No description available'),
	        ))

	    # Seeding each width with its header label keeps max() from failing on
	    # an empty listing and stops a header from running into the next column.
	    headers = ('Dataset', 'Type', 'Task')
	    name_width, type_width, task_width = (
	        max([len(header)] + [len(row[column]) for row in rows])
	        for column, header in enumerate(headers)
	    )

	    # Print header
	    print(f"{'Dataset':<{name_width+2}}{'Type':<{type_width+2}}{'Task':<{task_width+2}}Description")
	    print("-" * (name_width + type_width + task_width + 50))

	    # Print dataset information
	    for dataset_name, dataset_type, dataset_task, description in rows:
	        print(f"{dataset_name:<{name_width+2}}{dataset_type:<{type_width+2}}{dataset_task:<{task_width+2}}{description}")


	def main(argv=None):
	    """Parse command-line flags and print the requested listings.

	    Args:
	        argv: Optional list of argument strings to parse instead of the
	            process arguments; ``None`` (the default) lets argparse read
	            ``sys.argv[1:]``.

	    With ``--all``, or when neither ``--models`` nor ``--datasets`` is given,
	    both listings are printed with a separator line between them.
	    ``--standard-only`` narrows whichever listings are printed.
	    """
	    parser = argparse.ArgumentParser(description='List Protify supported models and datasets')
	    parser.add_argument('--models', action='store_true', help='List supported models')
	    parser.add_argument('--datasets', action='store_true', help='List supported datasets')
	    parser.add_argument('--standard-only', action='store_true', help='Show only standard models/datasets')
	    parser.add_argument('--all', action='store_true', help='Show both models and datasets')

	    args = parser.parse_args(argv)

	    # Decide from the parsed flags, not the raw argument count, so that a
	    # lone --standard-only still shows both sections instead of nothing.
	    if args.all or not (args.models or args.datasets):
	        list_models(args.standard_only)
	        print("\n" + "="*80 + "\n")
	        list_datasets(args.standard_only)
	        return

	    if args.models:
	        list_models(args.standard_only)

	    if args.datasets:
	        list_datasets(args.standard_only)


	# NOTE(review): the listing functions use package-relative imports, so this
	# presumably has to be run as a module inside its package (python -m ...)
	# for the listings to load -- confirm against the package layout.
	if __name__ == "__main__":
	    main()