File size: 20,370 Bytes
714cf46
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
#!/usr/bin/env python
"""
Utility script to list Protify supported models and datasets.
"""
import argparse
import sys


# Human-readable metadata for each model name.
#
# Every entry maps a model name to a dict with exactly four string fields:
#   'description' - one-sentence summary of the model
#   'size'        - approximate parameter count (free text)
#   'type'        - coarse category, e.g. 'Protein language model'
#   'citation'    - reference for the model, or 'N/A' when none is recorded
#
# NOTE(review): the GLM2-* and DSM-* descriptions and every 'N/A' citation
# have not been checked against the upstream model cards - confirm before
# relying on them.
model_descriptions = {
    'ESM2-8': {
        'description': 'Small protein language model (8M parameters) from Meta AI that learns evolutionary information from millions of protein sequences.',
        'size': '8M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
    },
    'ESM2-35': {
        'description': 'Medium-sized protein language model (35M parameters) trained on evolutionary data.',
        'size': '35M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
    },
    'ESM2-150': {
        'description': 'Large protein language model (150M parameters) with improved protein structure prediction capabilities.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
    },
    'ESM2-650': {
        'description': 'Very large protein language model (650M parameters) offering state-of-the-art performance on many protein prediction tasks.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
    },
    'ESM2-3B': {
        'description': 'Largest ESM2 protein language model (3B parameters) with exceptional capability for protein structure and function prediction.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'Lin et al. (2022). Evolutionary-scale prediction of atomic level protein structure with a language model.'
    },
    'Random': {
        'description': 'Baseline model with randomly initialized weights, serving as a negative control.',
        'size': 'Varies',
        'type': 'Baseline control',
        'citation': 'N/A'
    },
    'Random-Transformer': {
        'description': 'Randomly initialized transformer model serving as a homology-based control.',
        'size': 'Varies',
        'type': 'Baseline control',
        'citation': 'N/A'
    },
    # ESMC is ESM Cambrian (ESM C); the "C" does not stand for classification.
    'ESMC-300': {
        'description': 'ESM Cambrian (ESM C) protein language model from EvolutionaryScale with 300M parameters.',
        'size': '300M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'ESMC-600': {
        'description': 'Larger ESM Cambrian (ESM C) protein language model from EvolutionaryScale with 600M parameters.',
        'size': '600M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'ProtBert': {
        'description': 'BERT-based protein language model trained on protein sequences from UniRef.',
        'size': '420M parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    'ProtBert-BFD': {
        'description': 'BERT-based protein language model trained on BFD database with improved performance.',
        'size': '420M parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    'ProtT5': {
        'description': 'T5-based protein language model capable of both encoding and generation tasks.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    # ProtT5-XL has about 3B parameters; 11B is the size of ProtT5-XXL.
    'ProtT5-XL-UniRef50-full-prec': {
        'description': 'Extra large T5-based protein language model trained on UniRef50 with full precision.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2021). ProtTrans: Towards Cracking the Language of Life\'s Code Through Self-Supervised Learning.'
    },
    # The Ankh paper is by Elnaggar et al. (arXiv, January 2023).
    'ANKH-Base': {
        'description': 'Base version of the ANKH protein language model focused on protein structure understanding.',
        'size': '400M parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2023). ANKH: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
    },
    'ANKH-Large': {
        'description': 'Large version of the ANKH protein language model with improved structural predictions.',
        'size': '1.2B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2023). ANKH: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
    },
    'ANKH2-Large': {
        'description': 'Improved second generation ANKH protein language model.',
        'size': '1.2B parameters',
        'type': 'Protein language model',
        'citation': 'Elnaggar et al. (2023). ANKH: Optimized Protein Language Model Unlocks General-Purpose Modelling.'
    },
    'GLM2-150': {
        'description': 'Medium-sized general language model adapted for protein sequences.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'GLM2-650': {
        'description': 'Large general language model adapted for protein sequences.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'GLM2-GAIA': {
        'description': 'Specialized GLM protein language model with GAIA architecture improvements.',
        'size': '1B+ parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    # DPLM is the Diffusion Protein Language Model, not a "deep" one.
    'DPLM-150': {
        'description': 'Diffusion protein language model (DPLM) with 150M parameters.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'DPLM-650': {
        'description': 'Larger diffusion protein language model (DPLM) with 650M parameters.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'DPLM-3B': {
        'description': 'Largest diffusion protein language model in the DPLM family with 3B parameters.',
        'size': '3B parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'DSM-150': {
        'description': 'Deep language model for proteins with 150M parameters.',
        'size': '150M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    },
    'DSM-650': {
        'description': 'Deep language model for proteins with 650M parameters.',
        'size': '650M parameters',
        'type': 'Protein language model',
        'citation': 'N/A'
    }
}


# Field names of each metadata record, in the order the values appear in the
# rows of _DATASET_ROWS (after the leading dataset name).
_DATASET_FIELDS = ('description', 'type', 'task', 'citation')

# One row per dataset: (name, description, type, task, citation).
_DATASET_ROWS = [
    ('EC',
     'Enzyme Commission numbers dataset for predicting enzyme function classification.',
     'Multi-label classification', 'Protein function prediction', 'Gleghorn Lab'),
    ('GO-CC',
     'Gene Ontology Cellular Component dataset for predicting protein localization in cells.',
     'Multi-label classification', 'Protein localization prediction', 'Gleghorn Lab'),
    ('GO-BP',
     'Gene Ontology Biological Process dataset for predicting protein involvement in biological processes.',
     'Multi-label classification', 'Protein function prediction', 'Gleghorn Lab'),
    ('GO-MF',
     'Gene Ontology Molecular Function dataset for predicting protein molecular functions.',
     'Multi-label classification', 'Protein function prediction', 'Gleghorn Lab'),
    ('MB',
     'Metal ion binding dataset for predicting protein-metal interactions.',
     'Classification', 'Protein-metal binding prediction', 'Gleghorn Lab'),
    ('DeepLoc-2',
     'Binary classification dataset for predicting protein localization in 2 categories.',
     'Binary classification', 'Protein localization prediction', 'Gleghorn Lab'),
    ('DeepLoc-10',
     'Multi-class classification dataset for predicting protein localization in 10 categories.',
     'Multi-class classification', 'Protein localization prediction', 'Gleghorn Lab'),
    ('enzyme-kcat',
     'Dataset for predicting enzyme catalytic rate constants (kcat).',
     'Regression', 'Enzyme kinetics prediction', 'Gleghorn Lab'),
    ('solubility',
     'Dataset for predicting protein solubility properties.',
     'Binary classification', 'Protein solubility prediction', 'Gleghorn Lab'),
    ('localization',
     'Dataset for predicting subcellular localization of proteins.',
     'Multi-class classification', 'Protein localization prediction', 'Gleghorn Lab'),
    ('temperature-stability',
     'Dataset for predicting protein stability at different temperatures.',
     'Binary classification', 'Protein stability prediction', 'Gleghorn Lab'),
    ('peptide-HLA-MHC-affinity',
     'Dataset for predicting peptide binding affinity to HLA/MHC complexes.',
     'Protein-protein interaction', 'Binding affinity prediction', 'Gleghorn Lab'),
    ('optimal-temperature',
     'Dataset for predicting the optimal temperature for protein function.',
     'Regression', 'Protein property prediction', 'Gleghorn Lab'),
    ('optimal-ph',
     'Dataset for predicting the optimal pH for protein function.',
     'Regression', 'Protein property prediction', 'Gleghorn Lab'),
    ('material-production',
     'Dataset for predicting protein suitability for material production.',
     'Classification', 'Protein application prediction', 'Gleghorn Lab'),
    ('fitness-prediction',
     'Dataset for predicting protein fitness in various environments.',
     'Classification', 'Protein fitness prediction', 'Gleghorn Lab'),
    ('number-of-folds',
     'Dataset for predicting the number of structural folds in proteins.',
     'Classification', 'Protein structure prediction', 'Gleghorn Lab'),
    ('cloning-clf',
     'Dataset for predicting protein suitability for cloning operations.',
     'Classification', 'Protein engineering prediction', 'Gleghorn Lab'),
    ('stability-prediction',
     'Dataset for predicting overall protein stability.',
     'Classification', 'Protein stability prediction', 'Gleghorn Lab'),
    ('human-ppi',
     'Dataset for predicting human protein-protein interactions.',
     'Protein-protein interaction', 'PPI prediction', 'Gleghorn Lab'),
    ('SecondaryStructure-3',
     'Dataset for predicting protein secondary structure in 3 classes.',
     'Token-wise classification', 'Protein structure prediction', 'Gleghorn Lab'),
    ('SecondaryStructure-8',
     'Dataset for predicting protein secondary structure in 8 classes.',
     'Token-wise classification', 'Protein structure prediction', 'Gleghorn Lab'),
    ('fluorescence-prediction',
     'Dataset for predicting protein fluorescence properties.',
     'Token-wise regression', 'Protein property prediction', 'Gleghorn Lab'),
    ('plastic',
     'Dataset for predicting protein capability for plastic degradation.',
     'Classification', 'Enzyme function prediction', 'Gleghorn Lab'),
    ('gold-ppi',
     'Gold standard dataset for protein-protein interaction prediction.',
     'Protein-protein interaction', 'PPI prediction', 'Synthyra/bernett_gold_ppi'),
    ('human-ppi-pinui',
     'Human protein-protein interaction dataset from PiNUI.',
     'Protein-protein interaction', 'PPI prediction', 'Gleghorn Lab'),
    ('yeast-ppi-pinui',
     'Yeast protein-protein interaction dataset from PiNUI.',
     'Protein-protein interaction', 'PPI prediction', 'Gleghorn Lab'),
    ('shs27-ppi',
     'SHS27k dataset containing 27,000 protein-protein interactions.',
     'Protein-protein interaction', 'PPI prediction', 'Synthyra/SHS27k'),
    ('shs148-ppi',
     'SHS148k dataset containing 148,000 protein-protein interactions.',
     'Protein-protein interaction', 'PPI prediction', 'Synthyra/SHS148k'),
    ('PPA-ppi',
     'Protein-Protein Affinity dataset for quantitative binding predictions.',
     'Protein-protein interaction', 'PPI affinity prediction', 'Synthyra/ProteinProteinAffinity'),
]

# Dataset name -> {'description', 'type', 'task', 'citation'} metadata record,
# in the same order as the rows above.
dataset_descriptions = {
    name: dict(zip(_DATASET_FIELDS, values))
    for name, *values in _DATASET_ROWS
}


def list_models(show_standard_only=False):
    """Print a table of supported models to stdout.

    Args:
        show_standard_only: When true, list only ``standard_models``;
            otherwise list every name in ``currently_supported_models``.

    Returns:
        None. All output is printed.

    The model lists are required: if they cannot be imported, an installation
    hint is printed. The description table is optional: if only it cannot be
    imported, the model names are printed as a plain bullet list instead.
    Models without a description entry are shown with ``Unknown`` fields.
    """
    # Import the model lists on their own so a failure here is reported once,
    # without a second attempt at the very same import.
    try:
        from .base_models.get_base_models import currently_supported_models, standard_models
    except ImportError as e:
        print(f"Error loading model information: {e}")
        print("\n=== Models ===\n")
        print("Could not load model lists. Please check your installation.")
        return

    models_to_show = standard_models if show_standard_only else currently_supported_models

    try:
        from .base_models.model_descriptions import model_descriptions
    except ImportError as e:
        # Descriptions are a nicety; fall back to listing the bare names.
        print(f"Error loading model information: {e}")
        print("\n=== Models ===\n")
        for model_name in models_to_show:
            print(f"- {model_name}")
        return

    if show_standard_only:
        print("\n=== Standard Models ===\n")
    else:
        print("\n=== All Supported Models ===\n")

    # Resolve every cell up front so the column widths account for the
    # 'Unknown' placeholders as well as the real values.
    rows = []
    for model_name in models_to_show:
        model_info = model_descriptions.get(model_name, {})
        rows.append((
            model_name,
            model_info.get('type', 'Unknown'),
            model_info.get('size', 'Unknown'),
            model_info.get('description', 'No description available'),
        ))

    # Seeding each list with the header label keeps max() well-defined when
    # there are no rows (a bare max() over an empty sequence raises ValueError).
    name_len = max([len("Model")] + [len(row[0]) for row in rows])
    type_len = max([len("Type")] + [len(row[1]) for row in rows])
    size_len = max([len("Size")] + [len(row[2]) for row in rows])

    # Print header
    print(f"{'Model':<{name_len + 2}}{'Type':<{type_len + 2}}{'Size':<{size_len + 2}}Description")
    print("-" * (name_len + type_len + size_len + 50))

    # Print model information
    for name, model_type, size, description in rows:
        print(f"{name:<{name_len + 2}}{model_type:<{type_len + 2}}{size:<{size_len + 2}}{description}")


def list_datasets(show_standard_only=False):
    """Print a table of supported datasets to stdout.

    Args:
        show_standard_only: When true, list only the names in
            ``standard_data_benchmark`` that are also keys of
            ``supported_datasets``; otherwise list every supported dataset.

    Returns:
        None. All output is printed.

    The dataset registry is required: if it cannot be imported, an
    installation hint is printed. The description table is optional: if only
    it cannot be imported, each dataset is printed as ``- name: source``.
    Datasets without a description entry are shown with ``Unknown`` fields.
    """
    # Import the registry on its own so a failure here is reported once,
    # without a second attempt at the very same import.
    try:
        from .data.supported_datasets import supported_datasets, standard_data_benchmark
    except ImportError as e:
        print(f"Error loading dataset information: {e}")
        print("\n=== Datasets ===\n")
        print("Could not load dataset lists. Please check your installation.")
        return

    if show_standard_only:
        # Keep only benchmark names that are actually registered.
        datasets_to_show = {
            name: supported_datasets[name]
            for name in standard_data_benchmark
            if name in supported_datasets
        }
    else:
        datasets_to_show = supported_datasets

    try:
        from .data.dataset_descriptions import dataset_descriptions
    except ImportError as e:
        # Descriptions are a nicety; fall back to name/source pairs.
        print(f"Error loading dataset information: {e}")
        print("\n=== Datasets ===\n")
        for dataset_name, dataset_source in datasets_to_show.items():
            print(f"- {dataset_name}: {dataset_source}")
        return

    if show_standard_only:
        print("\n=== Standard Benchmark Datasets ===\n")
    else:
        print("\n=== All Supported Datasets ===\n")

    # Resolve every cell up front so the column widths account for the
    # 'Unknown' placeholders as well as the real values.
    rows = []
    for dataset_name in datasets_to_show:
        dataset_info = dataset_descriptions.get(dataset_name, {})
        rows.append((
            dataset_name,
            dataset_info.get('type', 'Unknown'),
            dataset_info.get('task', 'Unknown'),
            dataset_info.get('description', 'No description available'),
        ))

    # Seeding each list with the header label keeps max() well-defined when
    # there are no rows (a bare max() over an empty sequence raises ValueError).
    name_len = max([len("Dataset")] + [len(row[0]) for row in rows])
    type_len = max([len("Type")] + [len(row[1]) for row in rows])
    task_len = max([len("Task")] + [len(row[2]) for row in rows])

    # Print header
    print(f"{'Dataset':<{name_len + 2}}{'Type':<{type_len + 2}}{'Task':<{task_len + 2}}Description")
    print("-" * (name_len + type_len + task_len + 50))

    # Print dataset information
    for name, dataset_type, task, description in rows:
        print(f"{name:<{name_len + 2}}{dataset_type:<{type_len + 2}}{task:<{task_len + 2}}{description}")


def main(argv=None):
    """Parse command-line flags and print the requested listings.

    Args:
        argv: Optional list of argument strings to parse instead of the
            process arguments. ``None`` (the default) makes argparse read
            ``sys.argv[1:]``, which is the original behaviour.

    Flags:
        --models         list supported models
        --datasets       list supported datasets
        --standard-only  restrict the listing to the standard subset
        --all            list both models and datasets

    When neither ``--models`` nor ``--datasets`` is given, both listings are
    printed, the same as ``--all``.
    """
    parser = argparse.ArgumentParser(description='List Protify supported models and datasets')
    parser.add_argument('--models', action='store_true', help='List supported models')
    parser.add_argument('--datasets', action='store_true', help='List supported datasets')
    parser.add_argument('--standard-only', action='store_true', help='Show only standard models/datasets')
    parser.add_argument('--all', action='store_true', help='Show both models and datasets')

    args = parser.parse_args(argv)

    # Decide from the parsed flags rather than from len(sys.argv): with the
    # latter, passing only --standard-only selected no section at all and the
    # command printed nothing.
    show_everything = args.all or not (args.models or args.datasets)

    if show_everything:
        list_models(args.standard_only)
        print("\n" + "="*80 + "\n")
        list_datasets(args.standard_only)
        return

    if args.models:
        list_models(args.standard_only)

    if args.datasets:
        list_datasets(args.standard_only)


# Run the command-line interface only when this file is executed directly,
# not when it is imported as a module.
# NOTE(review): list_models/list_datasets use package-relative imports
# (``from .base_models...`` / ``from .data...``), which raise ImportError when
# the file is run as a plain script; presumably it is meant to be launched
# with ``python -m <package>.<module>`` - confirm.
if __name__ == "__main__":
    main()