directionality_probe / protify /data /dataset_descriptions.py
nikraf's picture
Upload folder using huggingface_hub
714cf46 verified
"""
Dataset descriptions derived from the README "Currently Supported Datasets" table.
Each entry provides:
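- type: One of {BC, MCC, MLC, SLC, R, TR, Various}: binary (BC), multi-class (MCC),
  multi-label (MLC), or single-label (SLC) classification; regression (R);
  TR is used only by the token-wise "plddt" entry (presumably token-wise regression)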
- task: Short task description
- description: Brief dataset blurb
- tokenwise: Whether the task is token-wise (per-residue)
- multiple_inputs: Whether the dataset involves multiple sequence inputs per sample
"""
def _entry(kind, task, description, *, tokenwise=False, multiple_inputs=False):
    """Build the metadata record for one dataset.

    Args:
        kind: Task-type code stored under the "type" key (e.g. "BC", "MLC", "R").
        task: Short task description.
        description: Brief dataset blurb.
        tokenwise: True when the entry is marked as token-wise (per-residue).
        multiple_inputs: True when a sample involves more than one input sequence.

    Returns:
        A new dict with the keys "type", "task", "description", "tokenwise"
        and "multiple_inputs", in that order.
    """
    return {
        "type": kind,
        "task": task,
        "description": description,
        "tokenwise": tokenwise,
        "multiple_inputs": multiple_inputs,
    }


# Maps dataset name -> metadata record. Both flags default to False, so only
# the token-wise and multi-input datasets spell them out explicitly.
dataset_descriptions = {
    # Multi-label classification (MLC)
    "EC": _entry(
        "MLC",
        "Protein function prediction",
        "Enzyme Commission numbers dataset for predicting enzyme function classification.",
    ),
    "GO-CC": _entry(
        "MLC",
        "Protein localization prediction",
        "Gene Ontology Cellular Component dataset for predicting protein localization in cells.",
    ),
    "GO-BP": _entry(
        "MLC",
        "Protein function prediction",
        "Gene Ontology Biological Process dataset for predicting protein involvement in biological processes.",
    ),
    "GO-MF": _entry(
        "MLC",
        "Protein function prediction",
        "Gene Ontology Molecular Function dataset for predicting protein molecular functions.",
    ),
    # Binary classification (BC)
    "MB": _entry(
        "BC",
        "Protein-metal binding prediction",
        "Metal ion binding dataset for predicting protein-metal interactions.",
    ),
    "DeepLoc-2": _entry(
        "BC",
        "Protein localization prediction",
        "Binary classification dataset for predicting protein localization in 2 categories.",
    ),
    "solubility": _entry(
        "BC",
        "Protein solubility prediction",
        "Dataset for predicting protein solubility properties.",
    ),
    "temperature-stability": _entry(
        "BC",
        "Protein stability prediction",
        "Dataset for predicting protein stability at different temperatures.",
    ),
    "material-production": _entry(
        "BC",
        "Protein application prediction",
        "Dataset for predicting protein suitability for material production.",
    ),
    "fitness-prediction": _entry(
        "BC",
        "Protein fitness prediction",
        "Dataset for predicting protein fitness in various environments.",
    ),
    "number-of-folds": _entry(
        "BC",
        "Protein structure prediction",
        "Dataset for predicting the number of structural folds in proteins.",
    ),
    "cloning-clf": _entry(
        "BC",
        "Protein engineering prediction",
        "Dataset for predicting protein suitability for cloning operations.",
    ),
    "stability-prediction": _entry(
        "BC",
        "Protein stability prediction",
        "Dataset for predicting overall protein stability.",
    ),
    "plastic": _entry(
        "BC",
        "Enzyme function prediction",
        "Dataset for predicting protein capability for plastic degradation.",
    ),
    "realness": _entry(
        "BC",
        "Authenticity prediction",
        "Protein realness dataset.",
    ),
    # Multi-class classification (MCC)
    "DeepLoc-10": _entry(
        "MCC",
        "Protein localization prediction",
        "Multi-class classification dataset for predicting protein localization in 10 categories.",
    ),
    "Subcellular": _entry(
        "MCC",
        "Protein localization prediction",
        "Dataset for predicting subcellular localization of proteins.",
    ),
    "localization": _entry(
        "MCC",
        "Protein localization prediction",
        "Dataset for predicting subcellular localization of proteins.",
    ),
    "foldseek-fold": _entry(
        "MCC",
        "Protein structure prediction",
        "Dataset for protein fold classification using Foldseek.",
    ),
    "foldseek-inverse": _entry(
        "MCC",
        "Protein structure prediction",
        "Inverse protein fold prediction dataset.",
    ),
    "ec-active": _entry(
        "MCC",
        "Enzyme function prediction",
        "Dataset for predicting active enzyme classes.",
    ),
    "taxon_domain": _entry(
        "MCC",
        "Taxonomic prediction",
        "Taxonomic classification at domain level.",
    ),
    "taxon_kingdom": _entry(
        "MCC",
        "Taxonomic prediction",
        "Taxonomic classification at kingdom level.",
    ),
    "taxon_phylum": _entry(
        "MCC",
        "Taxonomic prediction",
        "Taxonomic classification at phylum level.",
    ),
    "taxon_class": _entry(
        "MCC",
        "Taxonomic prediction",
        "Taxonomic classification at class level.",
    ),
    "taxon_order": _entry(
        "MCC",
        "Taxonomic prediction",
        "Taxonomic classification at order level.",
    ),
    "taxon_family": _entry(
        "MCC",
        "Taxonomic prediction",
        "Taxonomic classification at family level.",
    ),
    "taxon_genus": _entry(
        "MCC",
        "Taxonomic prediction",
        "Taxonomic classification at genus level.",
    ),
    "taxon_species": _entry(
        "MCC",
        "Taxonomic prediction",
        "Taxonomic classification at species level.",
    ),
    # Regression (R)
    "enzyme-kcat": _entry(
        "R",
        "Enzyme kinetics prediction",
        "Dataset for predicting enzyme catalytic rate constants (kcat).",
    ),
    "optimal-temperature": _entry(
        "R",
        "Protein property prediction",
        "Dataset for predicting the optimal temperature for protein function.",
    ),
    "optimal-ph": _entry(
        "R",
        "Protein property prediction",
        "Dataset for predicting the optimal pH for protein function.",
    ),
    # NOTE(review): this entry is flagged token-wise although it is typed "R";
    # every other "R" entry is not token-wise and "plddt" uses "TR" for
    # token-wise regression. Confirm whether the flag or the type is intended.
    "fluorescence-prediction": _entry(
        "R",
        "Protein property prediction",
        "Dataset for predicting protein fluorescence properties.",
        tokenwise=True,
    ),
    "PPA-ppi": _entry(
        "R",
        "protein-protein affinity prediction",
        "Protein-Protein Affinity dataset from Bindwell.",
        multiple_inputs=True,
    ),
    "million_full": _entry(
        "R",
        "Protein fitness prediction",
        "Large-scale enzyme variant dataset, from Millionfull preprint October 2025",
    ),
    # Single-label classification (SLC) and PPI
    "gold-ppi": _entry(
        "SLC",
        "PPI prediction",
        "Gold standard dataset for protein-protein interaction prediction.",
        multiple_inputs=True,
    ),
    "human-ppi-saprot": _entry(
        "SLC",
        "PPI prediction",
        "Human protein-protein interaction dataset from SAProt paper.",
        multiple_inputs=True,
    ),
    "human-ppi-pinui": _entry(
        "SLC",
        "PPI prediction",
        "Human protein-protein interaction dataset from PiNUI.",
        multiple_inputs=True,
    ),
    "yeast-ppi-pinui": _entry(
        "SLC",
        "PPI prediction",
        "Yeast protein-protein interaction dataset from PiNUI.",
        multiple_inputs=True,
    ),
    "peptide-HLA-MHC-affinity": _entry(
        "SLC",
        "Binding affinity prediction",
        "Dataset for predicting peptide binding affinity to HLA/MHC complexes.",
        multiple_inputs=True,
    ),
    "ppi-mutation-effect": _entry(
        "SLC",
        "PPI effect prediction",
        "Compare wild type, mutated, and target sequence to determine if PPI is stronger or not.",
        multiple_inputs=True,
    ),
    # SHS27k / SHS148k / STRING PPI sets: raw single-label (SLC) versions,
    # followed by multi-label (MLC) versions per split strategy.
    "shs27-ppi-raw": _entry(
        "SLC",
        "PPI type prediction",
        "Raw SHS27k with single-label labels.",
        multiple_inputs=True,
    ),
    "shs148-ppi-raw": _entry(
        "SLC",
        "PPI type prediction",
        "Raw SHS148k with single-label labels.",
        multiple_inputs=True,
    ),
    "shs27-ppi-random": _entry(
        "MLC",
        "PPI type prediction",
        "SHS27k CD-Hit 40%, multi-label labels, randomized data splits.",
        multiple_inputs=True,
    ),
    "shs148-ppi-random": _entry(
        "MLC",
        "PPI type prediction",
        "SHS148k CD-Hit 40%, multi-label labels, randomized data splits.",
        multiple_inputs=True,
    ),
    "shs27-ppi-dfs": _entry(
        "MLC",
        "PPI type prediction",
        "SHS27k CD-Hit 40%, multi-label labels, data splits via depth first search.",
        multiple_inputs=True,
    ),
    "shs148-ppi-dfs": _entry(
        "MLC",
        "PPI type prediction",
        "SHS148k CD-Hit 40%, multi-label labels, data splits via depth first search.",
        multiple_inputs=True,
    ),
    "shs27-ppi-bfs": _entry(
        "MLC",
        "PPI type prediction",
        "SHS27k CD-Hit 40%, multi-label labels, data splits via breadth first search.",
        multiple_inputs=True,
    ),
    "shs148-ppi-bfs": _entry(
        "MLC",
        "PPI type prediction",
        "SHS148k CD-Hit 40%, multi-label labels, data splits via breadth first search.",
        multiple_inputs=True,
    ),
    "string-ppi-random": _entry(
        "MLC",
        "PPI type prediction",
        "STRING CD-Hit 40%, multi-label labels, randomized data splits.",
        multiple_inputs=True,
    ),
    "string-ppi-dfs": _entry(
        "MLC",
        "PPI type prediction",
        "STRING CD-Hit 40%, multi-label labels, data splits via depth first search.",
        multiple_inputs=True,
    ),
    "string-ppi-bfs": _entry(
        "MLC",
        "PPI type prediction",
        "STRING CD-Hit 40%, multi-label labels, data splits via breadth first search.",
        multiple_inputs=True,
    ),
    # Token-wise tasks
    "SecondaryStructure-3": _entry(
        "MCC",
        "Protein structure prediction",
        "Dataset for predicting protein secondary structure in 3 classes.",
        tokenwise=True,
    ),
    "SecondaryStructure-8": _entry(
        "MCC",
        "Protein structure prediction",
        "Dataset for predicting protein secondary structure in 8 classes.",
        tokenwise=True,
    ),
    "plddt": _entry(
        "TR",
        "Confidence prediction",
        "AlphaFold pLDDT confidence score prediction.",
        tokenwise=True,
    ),
    # Misc
    "diff_phylogeny": _entry(
        "Various",
        "Phylogeny prediction",
        "Differential phylogeny dataset.",
    ),
}