File size: 6,356 Bytes
0ed74db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""Curated Pfam markers for genome-driven phenotype + medium recommendation.

Each entry: Pfam ID -> (column_name, biological role / why it matters).

This file is the verified-correct subset only β€” every Pfam ID below was checked
against InterPro's DESC field (see scripts/23_verify_markers.py and
data/markers/_verification.tsv) and only those whose actual Pfam family matches
the claimed biology are included.

Future expansion (especially for nitrogen fixation, methanogenesis, sulfate
reduction, methylotrophy, vitamin biosynthesis) should layer on TIGRFAM or KOfam
function-defined markers, which are tighter than Pfam structural domains.
"""
from __future__ import annotations

# ----------------------------------------------------------------------------
# Phenotype markers
# ----------------------------------------------------------------------------

TEMPERATURE_MARKERS: dict[str, tuple[str, str]] = {
    "PF00012": ("Hsp70_DnaK",       "Hsp70/DnaK chaperone β€” abundance scales with temp"),
    "PF00183": ("Hsp90",            "Hsp90 chaperone"),
    "PF00118": ("Cpn60_GroEL",      "Cpn60/GroEL chaperonin β€” strong thermophile signal"),
    "PF00011": ("Hsp20",            "small heat-shock protein"),
    "PF00313": ("CSD_cold_shock",   "cold-shock DNA-binding domain β€” psychrophile marker"),
    "PF02824": ("TGS_thermosome",   "TGS domain (often archaeal thermosome / SecA / ObgE)"),
}

PH_MARKERS: dict[str, tuple[str, str]] = {
    "PF00006": ("ATP_synth_alphabeta",  "F1 ATPase Ξ±/Ξ² nucleotide-binding β€” proton motive force"),
    "PF00306": ("ATP_synth_alphabeta_C","F1 ATPase Ξ±/Ξ² C-terminal"),
    "PF00430": ("ATP_synth_F0_B",       "F0 ATPase B/B' subunit"),
    "PF00999": ("NhaA_Na_H_exch",       "NhaA-style Na+/H+ antiporter β€” alkaliphile signal"),
    "PF06450": ("NhaB_Na_H_exch",       "NhaB Na+/H+ antiporter"),
    "PF00282": ("Pyridoxal_decarbox",   "pyridoxal-dependent decarboxylase (acid resistance)"),
    "PF01618": ("MotA_TolQ_ExbB",       "proton channel family β€” flagellar stator / TolQ"),
    "PF03224": ("V_ATPase_subH_N",      "V-ATPase subunit H β€” acidophile / archaeal"),
}

OXYGEN_MARKERS: dict[str, tuple[str, str]] = {
    "PF00115": ("COX1_aerobic",         "heme-Cu terminal oxidase subunit I"),
    "PF02790": ("COX2_TM_aerobic",      "cytochrome c oxidase II transmembrane"),
    "PF00116": ("COX2_periplasm_aero",  "cytochrome c oxidase II periplasmic"),
    "PF13442": ("Cyt_CBB3_microaero",   "cbb3-type cytochrome c oxidase β€” microaerophile signal"),
    "PF00355": ("Rieske_2Fe2S",         "Rieske 2Fe-2S β€” cytochrome bc1 / aerobic resp."),
    "PF00199": ("Catalase",             "H2O2 detox β€” aerobic defense"),
    "PF00081": ("SOD_FeMn",             "Fe/Mn superoxide dismutase"),
    "PF00080": ("SOD_CuZn",             "Cu/Zn superoxide dismutase"),
    "PF02906": ("FeFe_hyd_anaerobic",   "[FeFe]-hydrogenase β€” strict-anaerobe marker"),
    "PF00374": ("NiFe_hyd_anaerobic",   "[NiFe]-hydrogenase large subunit"),
    "PF00890": ("FAD_binding_FrdA",     "FAD-binding (fumarate reductase / succinate DH)"),
    "PF00037": ("Fer4_FeS_4Fe4S",       "4Fe-4S ferredoxin β€” anaerobic energy"),
}

SALT_MARKERS: dict[str, tuple[str, str]] = {
    "PF02702": ("KdpD_osmosensor",      "K+ channel histidine-kinase osmosensor"),
    "PF02386": ("TrkH_K_channel",       "TrkH/H+/K+ cation transport"),
    "PF02028": ("BCCT_compatible",      "BCCT family glycine-betaine/choline transporter"),
    "PF00528": ("BPD_transp_1",         "binding-protein-dependent ABC transporter (broad)"),
    "PF06339": ("EctC_ectoine_synth",   "ectoine synthase β€” halophile compatible-solute"),
    "PF01036": ("Bact_rhodopsin",       "bacteriorhodopsin family β€” extreme halophile"),
}

# ----------------------------------------------------------------------------
# Media-component markers β€” drive recipe choice directly
# ----------------------------------------------------------------------------

VITAMIN_MARKERS: dict[str, tuple[str, str]] = {
    "PF00590": ("TP_methylase_B12",     "tetrapyrrole methylase β€” B12/heme/F430 biosynthesis"),
    "PF01497": ("Peripla_BP_2",         "periplasmic binding (B12, Fe-siderophore, etc.)"),
    "PF00763": ("THF_DHG_CYH_folate",   "THF dehydrogenase/cyclohydrolase β€” folate path"),
    "PF02152": ("FolB_folate",          "dihydroneopterin aldolase β€” folate path"),
    "PF03740": ("PdxJ_pyridoxine",      "pyridoxine biosynthesis PdxJ"),
    "PF00926": ("DHBP_riboflavin",      "DHBP synthase β€” riboflavin biosynthesis"),
}

NITROGEN_MARKERS: dict[str, tuple[str, str]] = {
    "PF00142": ("NifH_nitrogenase",     "NifH Fe-protein β€” fixes atmospheric N2"),
    "PF00148": ("NifDK_nitrogenase",    "Nitrogenase MoFe component 1"),
    "PF03460": ("NIR_SIR_ferredoxin",   "nitrite/sulfite reductase ferredoxin half"),
}

CARBON_MARKERS: dict[str, tuple[str, str]] = {
    "PF00016": ("RuBisCO_large_form1",  "RuBisCO large chain β€” Calvin cycle autotrophy"),
    "PF00101": ("RuBisCO_small_form1",  "RuBisCO small chain (Form I-specific)"),
    "PF00128": ("Alpha_amylase",        "starch utilization"),
    "PF00150": ("Cellulase_GH5",        "GH5 cellulase β€” plant-polymer carbon source"),
    "PF00553": ("CBM_cellulose",        "cellulose-binding module"),
}

SPECIAL_MARKERS: dict[str, tuple[str, str]] = {
    "PF00384": ("Molybdopterin_OR",     "broad: covers AprA/NarG/FdhF type oxidoreductases"),
    "PF13361": ("UvrD_helicase_C",      "DNA-repair helicase β€” positive control (in nearly all)"),
}


def all_markers() -> dict[str, tuple[str, str]]:
    merged: dict[str, tuple[str, str]] = {}
    for category in (
        TEMPERATURE_MARKERS,
        PH_MARKERS,
        OXYGEN_MARKERS,
        SALT_MARKERS,
        VITAMIN_MARKERS,
        NITROGEN_MARKERS,
        CARBON_MARKERS,
        SPECIAL_MARKERS,
    ):
        merged.update(category)
    return merged


def category_for(pfam_id: str) -> str:
    for cat_name, cat in (
        ("temperature", TEMPERATURE_MARKERS),
        ("ph", PH_MARKERS),
        ("oxygen", OXYGEN_MARKERS),
        ("salt", SALT_MARKERS),
        ("vitamin", VITAMIN_MARKERS),
        ("nitrogen", NITROGEN_MARKERS),
        ("carbon", CARBON_MARKERS),
        ("special", SPECIAL_MARKERS),
    ):
        if pfam_id in cat:
            return cat_name
    return "other"