| """ |
| Dataset descriptions derived from the README "Currently Supported Datasets" table. |
| |
| Each entry provides: |
| - type: One of {BC, MCC, MLC, SLC, R, TR, Various} |
| - task: Short task description |
| - description: Brief dataset blurb |
| - tokenwise: Whether the task is token-wise (per-residue) |
| - multiple_inputs: Whether the dataset involves multiple sequence inputs per sample |
| """ |
|
|
# Registry of supported datasets, keyed by dataset name.
#
# Every value is a dict with exactly these five fields:
#   type            - label/task code: BC, MCC, MLC, SLC, R, TR or Various
#   task            - short task description
#   description     - brief dataset blurb
#   tokenwise       - True when labels are per-token (per-residue)
#   multiple_inputs - True when a sample consists of more than one sequence
dataset_descriptions = {
    # ------------------------------------------------------------------
    # Multi-label classification (MLC), single sequence
    # ------------------------------------------------------------------
    "EC": {
        "type": "MLC",
        "task": "Protein function prediction",
        "description": "Enzyme Commission numbers dataset for predicting enzyme function classification.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "GO-CC": {
        "type": "MLC",
        "task": "Protein localization prediction",
        "description": "Gene Ontology Cellular Component dataset for predicting protein localization in cells.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "GO-BP": {
        "type": "MLC",
        "task": "Protein function prediction",
        "description": "Gene Ontology Biological Process dataset for predicting protein involvement in biological processes.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "GO-MF": {
        "type": "MLC",
        "task": "Protein function prediction",
        "description": "Gene Ontology Molecular Function dataset for predicting protein molecular functions.",
        "tokenwise": False,
        "multiple_inputs": False,
    },

    # ------------------------------------------------------------------
    # Binary classification (BC), single sequence
    # ------------------------------------------------------------------
    "MB": {
        "type": "BC",
        "task": "Protein-metal binding prediction",
        "description": "Metal ion binding dataset for predicting protein-metal interactions.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "DeepLoc-2": {
        "type": "BC",
        "task": "Protein localization prediction",
        "description": "Binary classification dataset for predicting protein localization in 2 categories.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "solubility": {
        "type": "BC",
        "task": "Protein solubility prediction",
        "description": "Dataset for predicting protein solubility properties.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "temperature-stability": {
        "type": "BC",
        "task": "Protein stability prediction",
        "description": "Dataset for predicting protein stability at different temperatures.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "material-production": {
        "type": "BC",
        "task": "Protein application prediction",
        "description": "Dataset for predicting protein suitability for material production.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "fitness-prediction": {
        "type": "BC",
        "task": "Protein fitness prediction",
        "description": "Dataset for predicting protein fitness in various environments.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "number-of-folds": {
        "type": "BC",
        "task": "Protein structure prediction",
        "description": "Dataset for predicting the number of structural folds in proteins.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "cloning-clf": {
        "type": "BC",
        "task": "Protein engineering prediction",
        "description": "Dataset for predicting protein suitability for cloning operations.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "stability-prediction": {
        "type": "BC",
        "task": "Protein stability prediction",
        "description": "Dataset for predicting overall protein stability.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "plastic": {
        "type": "BC",
        "task": "Enzyme function prediction",
        "description": "Dataset for predicting protein capability for plastic degradation.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "realness": {
        "type": "BC",
        "task": "Authenticity prediction",
        "description": "Protein realness dataset.",
        "tokenwise": False,
        "multiple_inputs": False,
    },

    # ------------------------------------------------------------------
    # Multi-class classification (MCC), single sequence, sequence-level
    # ------------------------------------------------------------------
    "DeepLoc-10": {
        "type": "MCC",
        "task": "Protein localization prediction",
        "description": "Multi-class classification dataset for predicting protein localization in 10 categories.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "Subcellular": {
        "type": "MCC",
        "task": "Protein localization prediction",
        "description": "Dataset for predicting subcellular localization of proteins.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "localization": {
        "type": "MCC",
        "task": "Protein localization prediction",
        "description": "Dataset for predicting subcellular localization of proteins.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "foldseek-fold": {
        "type": "MCC",
        "task": "Protein structure prediction",
        "description": "Dataset for protein fold classification using Foldseek.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "foldseek-inverse": {
        "type": "MCC",
        "task": "Protein structure prediction",
        "description": "Inverse protein fold prediction dataset.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "ec-active": {
        "type": "MCC",
        "task": "Enzyme function prediction",
        "description": "Dataset for predicting active enzyme classes.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "taxon_domain": {
        "type": "MCC",
        "task": "Taxonomic prediction",
        "description": "Taxonomic classification at domain level.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "taxon_kingdom": {
        "type": "MCC",
        "task": "Taxonomic prediction",
        "description": "Taxonomic classification at kingdom level.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "taxon_phylum": {
        "type": "MCC",
        "task": "Taxonomic prediction",
        "description": "Taxonomic classification at phylum level.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "taxon_class": {
        "type": "MCC",
        "task": "Taxonomic prediction",
        "description": "Taxonomic classification at class level.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "taxon_order": {
        "type": "MCC",
        "task": "Taxonomic prediction",
        "description": "Taxonomic classification at order level.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "taxon_family": {
        "type": "MCC",
        "task": "Taxonomic prediction",
        "description": "Taxonomic classification at family level.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "taxon_genus": {
        "type": "MCC",
        "task": "Taxonomic prediction",
        "description": "Taxonomic classification at genus level.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "taxon_species": {
        "type": "MCC",
        "task": "Taxonomic prediction",
        "description": "Taxonomic classification at species level.",
        "tokenwise": False,
        "multiple_inputs": False,
    },

    # ------------------------------------------------------------------
    # Regression (R), sequence-level
    # ------------------------------------------------------------------
    "enzyme-kcat": {
        "type": "R",
        "task": "Enzyme kinetics prediction",
        "description": "Dataset for predicting enzyme catalytic rate constants (kcat).",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "optimal-temperature": {
        "type": "R",
        "task": "Protein property prediction",
        "description": "Dataset for predicting the optimal temperature for protein function.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "optimal-ph": {
        "type": "R",
        "task": "Protein property prediction",
        "description": "Dataset for predicting the optimal pH for protein function.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "fluorescence-prediction": {
        "type": "R",
        "task": "Protein property prediction",
        "description": "Dataset for predicting protein fluorescence properties.",
        # Typed "R" (not the token-wise "TR" code), so this is flagged as a
        # sequence-level target rather than a per-residue one.
        "tokenwise": False,
        "multiple_inputs": False,
    },
    "PPA-ppi": {
        "type": "R",
        "task": "protein-protein affinity prediction",
        "description": "Protein-Protein Affinity dataset from Bindwell.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "million_full": {
        "type": "R",
        "task": "Protein fitness prediction",
        "description": "Large-scale enzyme variant dataset, from Millionfull preprint October 2025",
        "tokenwise": False,
        "multiple_inputs": False,
    },

    # ------------------------------------------------------------------
    # Single-label classification (SLC) over multiple input sequences
    # ------------------------------------------------------------------
    "gold-ppi": {
        "type": "SLC",
        "task": "PPI prediction",
        "description": "Gold standard dataset for protein-protein interaction prediction.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "human-ppi-saprot": {
        "type": "SLC",
        "task": "PPI prediction",
        "description": "Human protein-protein interaction dataset from SAProt paper.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "human-ppi-pinui": {
        "type": "SLC",
        "task": "PPI prediction",
        "description": "Human protein-protein interaction dataset from PiNUI.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "yeast-ppi-pinui": {
        "type": "SLC",
        "task": "PPI prediction",
        "description": "Yeast protein-protein interaction dataset from PiNUI.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "peptide-HLA-MHC-affinity": {
        "type": "SLC",
        "task": "Binding affinity prediction",
        "description": "Dataset for predicting peptide binding affinity to HLA/MHC complexes.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "ppi-mutation-effect": {
        "type": "SLC",
        "task": "PPI effect prediction",
        "description": "Compare wild type, mutated, and target sequence to determine if PPI is stronger or not.",
        "tokenwise": False,
        "multiple_inputs": True,
    },

    # ------------------------------------------------------------------
    # SHS27k / SHS148k / STRING PPI-type datasets (multiple inputs).
    # "-raw" entries are single-label; the random/dfs/bfs split variants
    # are multi-label.
    # ------------------------------------------------------------------
    "shs27-ppi-raw": {
        "type": "SLC",
        "task": "PPI type prediction",
        "description": "Raw SHS27k with single-label labels.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "shs148-ppi-raw": {
        "type": "SLC",
        "task": "PPI type prediction",
        "description": "Raw SHS148k with single-label labels.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "shs27-ppi-random": {
        "type": "MLC",
        "task": "PPI type prediction",
        "description": "SHS27k CD-Hit 40%, multi-label labels, randomized data splits.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "shs148-ppi-random": {
        "type": "MLC",
        "task": "PPI type prediction",
        "description": "SHS148k CD-Hit 40%, multi-label labels, randomized data splits.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "shs27-ppi-dfs": {
        "type": "MLC",
        "task": "PPI type prediction",
        "description": "SHS27k CD-Hit 40%, multi-label labels, data splits via depth first search.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "shs148-ppi-dfs": {
        "type": "MLC",
        "task": "PPI type prediction",
        "description": "SHS148k CD-Hit 40%, multi-label labels, data splits via depth first search.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "shs27-ppi-bfs": {
        "type": "MLC",
        "task": "PPI type prediction",
        "description": "SHS27k CD-Hit 40%, multi-label labels, data splits via breadth first search.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "shs148-ppi-bfs": {
        "type": "MLC",
        "task": "PPI type prediction",
        "description": "SHS148k CD-Hit 40%, multi-label labels, data splits via breadth first search.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "string-ppi-random": {
        "type": "MLC",
        "task": "PPI type prediction",
        "description": "STRING CD-Hit 40%, multi-label labels, randomized data splits.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "string-ppi-dfs": {
        "type": "MLC",
        "task": "PPI type prediction",
        "description": "STRING CD-Hit 40%, multi-label labels, data splits via depth first search.",
        "tokenwise": False,
        "multiple_inputs": True,
    },
    "string-ppi-bfs": {
        "type": "MLC",
        "task": "PPI type prediction",
        "description": "STRING CD-Hit 40%, multi-label labels, data splits via breadth first search.",
        "tokenwise": False,
        "multiple_inputs": True,
    },

    # ------------------------------------------------------------------
    # Token-wise (per-residue) tasks
    # ------------------------------------------------------------------
    "SecondaryStructure-3": {
        "type": "MCC",
        "task": "Protein structure prediction",
        "description": "Dataset for predicting protein secondary structure in 3 classes.",
        "tokenwise": True,
        "multiple_inputs": False,
    },
    "SecondaryStructure-8": {
        "type": "MCC",
        "task": "Protein structure prediction",
        "description": "Dataset for predicting protein secondary structure in 8 classes.",
        "tokenwise": True,
        "multiple_inputs": False,
    },
    "plddt": {
        "type": "TR",
        "task": "Confidence prediction",
        "description": "AlphaFold pLDDT confidence score prediction.",
        "tokenwise": True,
        "multiple_inputs": False,
    },

    # ------------------------------------------------------------------
    # Mixed / other
    # ------------------------------------------------------------------
    "diff_phylogeny": {
        "type": "Various",
        "task": "Phylogeny prediction",
        "description": "Differential phylogeny dataset.",
        "tokenwise": False,
        "multiple_inputs": False,
    },
}
|
|
|
|
|
|