Spaces:
Sleeping
Sleeping
File size: 6,356 Bytes
"""Curated Pfam markers for genome-driven phenotype + medium recommendation.

Each entry: Pfam ID -> (column_name, biological role / why it matters).

This file is the verified-correct subset only — every Pfam ID below was checked
against InterPro's DESC field (see scripts/23_verify_markers.py and
data/markers/_verification.tsv) and only those whose actual Pfam family matches
the claimed biology are included.

Future expansion (especially for nitrogen fixation, methanogenesis, sulfate
reduction, methylotrophy, vitamin biosynthesis) should layer on TIGRFAM or KOfam
function-defined markers, which are tighter than Pfam structural domains.
"""
from __future__ import annotations
# ----------------------------------------------------------------------------
# Phenotype markers
# ----------------------------------------------------------------------------
# Temperature-related markers (heat-shock / cold-shock / chaperone families).
# Maps Pfam accession -> (column_name, biological role / why it matters).
TEMPERATURE_MARKERS: dict[str, tuple[str, str]] = {
    "PF00012": ("Hsp70_DnaK", "Hsp70/DnaK chaperone — abundance scales with temp"),
    "PF00183": ("Hsp90", "Hsp90 chaperone"),
    "PF00118": ("Cpn60_GroEL", "Cpn60/GroEL chaperonin — strong thermophile signal"),
    "PF00011": ("Hsp20", "small heat-shock protein"),
    "PF00313": ("CSD_cold_shock", "cold-shock DNA-binding domain — psychrophile marker"),
    "PF02824": ("TGS_thermosome", "TGS domain (often archaeal thermosome / SecA / ObgE)"),
}
# pH-related markers (ATP synthase parts, Na+/H+ antiporters, proton channels).
# Maps Pfam accession -> (column_name, biological role / why it matters).
PH_MARKERS: dict[str, tuple[str, str]] = {
    "PF00006": ("ATP_synth_alphabeta", "F1 ATPase α/β nucleotide-binding — proton motive force"),
    "PF00306": ("ATP_synth_alphabeta_C", "F1 ATPase α/β C-terminal"),
    "PF00430": ("ATP_synth_F0_B", "F0 ATPase B/B' subunit"),
    "PF00999": ("NhaA_Na_H_exch", "NhaA-style Na+/H+ antiporter — alkaliphile signal"),
    "PF06450": ("NhaB_Na_H_exch", "NhaB Na+/H+ antiporter"),
    "PF00282": ("Pyridoxal_decarbox", "pyridoxal-dependent decarboxylase (acid resistance)"),
    "PF01618": ("MotA_TolQ_ExbB", "proton channel family — flagellar stator / TolQ"),
    "PF03224": ("V_ATPase_subH_N", "V-ATPase subunit H — acidophile / archaeal"),
}
# Oxygen-relationship markers: terminal oxidases and ROS defence alongside
# hydrogenase / ferredoxin families labelled as anaerobic signals.
# Maps Pfam accession -> (column_name, biological role / why it matters).
OXYGEN_MARKERS: dict[str, tuple[str, str]] = {
    "PF00115": ("COX1_aerobic", "heme-Cu terminal oxidase subunit I"),
    "PF02790": ("COX2_TM_aerobic", "cytochrome c oxidase II transmembrane"),
    "PF00116": ("COX2_periplasm_aero", "cytochrome c oxidase II periplasmic"),
    "PF13442": ("Cyt_CBB3_microaero", "cbb3-type cytochrome c oxidase — microaerophile signal"),
    "PF00355": ("Rieske_2Fe2S", "Rieske 2Fe-2S — cytochrome bc1 / aerobic resp."),
    "PF00199": ("Catalase", "H2O2 detox — aerobic defense"),
    "PF00081": ("SOD_FeMn", "Fe/Mn superoxide dismutase"),
    "PF00080": ("SOD_CuZn", "Cu/Zn superoxide dismutase"),
    "PF02906": ("FeFe_hyd_anaerobic", "[FeFe]-hydrogenase — strict-anaerobe marker"),
    "PF00374": ("NiFe_hyd_anaerobic", "[NiFe]-hydrogenase large subunit"),
    "PF00890": ("FAD_binding_FrdA", "FAD-binding (fumarate reductase / succinate DH)"),
    "PF00037": ("Fer4_FeS_4Fe4S", "4Fe-4S ferredoxin — anaerobic energy"),
}
# Salinity-related markers (osmosensing, K+ transport, compatible solutes).
# Maps Pfam accession -> (column_name, biological role / why it matters).
SALT_MARKERS: dict[str, tuple[str, str]] = {
    "PF02702": ("KdpD_osmosensor", "K+ channel histidine-kinase osmosensor"),
    "PF02386": ("TrkH_K_channel", "TrkH/H+/K+ cation transport"),
    "PF02028": ("BCCT_compatible", "BCCT family glycine-betaine/choline transporter"),
    "PF00528": ("BPD_transp_1", "binding-protein-dependent ABC transporter (broad)"),
    "PF06339": ("EctC_ectoine_synth", "ectoine synthase — halophile compatible-solute"),
    "PF01036": ("Bact_rhodopsin", "bacteriorhodopsin family — extreme halophile"),
}
# ----------------------------------------------------------------------------
# Media-component markers — drive recipe choice directly
# ----------------------------------------------------------------------------
# Vitamin / cofactor markers (B12, folate, pyridoxine, riboflavin pathways).
# Maps Pfam accession -> (column_name, biological role / why it matters).
VITAMIN_MARKERS: dict[str, tuple[str, str]] = {
    "PF00590": ("TP_methylase_B12", "tetrapyrrole methylase — B12/heme/F430 biosynthesis"),
    "PF01497": ("Peripla_BP_2", "periplasmic binding (B12, Fe-siderophore, etc.)"),
    "PF00763": ("THF_DHG_CYH_folate", "THF dehydrogenase/cyclohydrolase — folate path"),
    "PF02152": ("FolB_folate", "dihydroneopterin aldolase — folate path"),
    "PF03740": ("PdxJ_pyridoxine", "pyridoxine biosynthesis PdxJ"),
    "PF00926": ("DHBP_riboflavin", "DHBP synthase — riboflavin biosynthesis"),
}
# Nitrogen-metabolism markers (nitrogenase components, nitrite/sulfite reductase).
# Maps Pfam accession -> (column_name, biological role / why it matters).
NITROGEN_MARKERS: dict[str, tuple[str, str]] = {
    "PF00142": ("NifH_nitrogenase", "NifH Fe-protein — fixes atmospheric N2"),
    "PF00148": ("NifDK_nitrogenase", "Nitrogenase MoFe component 1"),
    "PF03460": ("NIR_SIR_ferredoxin", "nitrite/sulfite reductase ferredoxin half"),
}
# Carbon-source markers (RuBisCO chains and polysaccharide-degrading families).
# Maps Pfam accession -> (column_name, biological role / why it matters).
CARBON_MARKERS: dict[str, tuple[str, str]] = {
    "PF00016": ("RuBisCO_large_form1", "RuBisCO large chain — Calvin cycle autotrophy"),
    "PF00101": ("RuBisCO_small_form1", "RuBisCO small chain (Form I-specific)"),
    "PF00128": ("Alpha_amylase", "starch utilization"),
    "PF00150": ("Cellulase_GH5", "GH5 cellulase — plant-polymer carbon source"),
    "PF00553": ("CBM_cellulose", "cellulose-binding module"),
}
# Markers that do not belong to a single phenotype or medium category.
# Maps Pfam accession -> (column_name, biological role / why it matters).
SPECIAL_MARKERS: dict[str, tuple[str, str]] = {
    "PF00384": ("Molybdopterin_OR", "broad: covers AprA/NarG/FdhF type oxidoreductases"),
    "PF13361": ("UvrD_helicase_C", "DNA-repair helicase — positive control (in nearly all)"),
}
def all_markers() -> dict[str, tuple[str, str]]:
    """Return one flat mapping holding every marker from every category.

    The category tables are unpacked in a fixed order (temperature, pH,
    oxygen, salt, vitamin, nitrogen, carbon, special). If a Pfam ID were
    listed in more than one table, the entry from the later table would win.
    A fresh dict is built on every call.
    """
    return {
        **TEMPERATURE_MARKERS,
        **PH_MARKERS,
        **OXYGEN_MARKERS,
        **SALT_MARKERS,
        **VITAMIN_MARKERS,
        **NITROGEN_MARKERS,
        **CARBON_MARKERS,
        **SPECIAL_MARKERS,
    }
def category_for(pfam_id: str) -> str:
    """Return the category name of the first marker table containing *pfam_id*.

    Tables are checked in declaration order; ``"other"`` is returned when no
    table lists the given Pfam ID.
    """
    tables_by_name = {
        "temperature": TEMPERATURE_MARKERS,
        "ph": PH_MARKERS,
        "oxygen": OXYGEN_MARKERS,
        "salt": SALT_MARKERS,
        "vitamin": VITAMIN_MARKERS,
        "nitrogen": NITROGEN_MARKERS,
        "carbon": CARBON_MARKERS,
        "special": SPECIAL_MARKERS,
    }
    # Dict iteration follows insertion order, so the first hit matches the
    # priority of the category list above.
    matches = (name for name, table in tables_by_name.items() if pfam_id in table)
    return next(matches, "other")
|