File size: 3,217 Bytes
6c30d74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
"""Test BacDive phenotype extraction against a fixture of the real v2 schema."""
from __future__ import annotations

from microbe_model.data.bacdive import _derive_optimum, extract_phenotypes

# Fixture modelled on a real /v2/fetch/24493 response (Phaeobacter gallaeciensis BS 107),
# trimmed down and assembled one top-level section at a time.
_GENUS = "Phaeobacter"
_SPECIES = "Phaeobacter gallaeciensis"

# "General": record id plus NCBI taxon ids, strain-level match listed before species-level.
_GENERAL = {
    "BacDive-ID": 24493,
    "NCBI tax id": [
        {"NCBI tax id": 1423144, "Matching level": "strain"},
        {"NCBI tax id": 60890, "Matching level": "species"},
    ],
}

# "Name and taxonomic classification": genus/species appear both inside LPSN and alongside it.
_TAXONOMY = {
    "LPSN": {
        "domain": "Bacteria",
        "phylum": "Pseudomonadota",
        "class": "Alphaproteobacteria",
        "order": "Rhodobacterales",
        "family": "Roseobacteraceae",
        "genus": _GENUS,
        "species": _SPECIES,
    },
    "genus": _GENUS,
    "species": _SPECIES,
}

# "culture temp" entries, given as (growth outcome, temperature string) pairs.
_CULTURE_TEMPS = [
    {"growth": outcome, "type": "growth", "temperature": temperature}
    for outcome, temperature in (
        ("positive", "25"),
        ("positive", "22"),
        ("positive", "5-30"),
        ("negative", "37"),
    )
]

# Genome assemblies as (INSDC accession, assembly level) pairs, in listed order.
_GENOMES = [
    {"INSDC accession": accession, "assembly level": level}
    for accession, level in (
        ("GCA_000511385", "complete"),
        ("GCA_000819625", "contig"),
    )
]

SAMPLE_RECORD = {
    "General": _GENERAL,
    "Name and taxonomic classification": _TAXONOMY,
    "Culture and growth conditions": {"culture temp": _CULTURE_TEMPS},
    "Physiology and metabolism": {
        "oxygen tolerance": [{"oxygen tolerance": "obligate aerobe"}],
    },
    "Sequence information": {"Genome sequences": _GENOMES},
}


def test_extract_phenotypes_real_schema() -> None:
    """Check each extracted phenotype field against the SAMPLE_RECORD fixture."""
    phenotypes = extract_phenotypes(SAMPLE_RECORD)

    expected = {
        "bacdive_id": 24493,
        "species": "Phaeobacter gallaeciensis",
        "genus": "Phaeobacter",
        "family": "Roseobacteraceae",
        "ncbi_taxon_id": 1423144,
        # The first accession listed under "Genome sequences" is the one expected.
        "genome_accession": "GCA_000511385",
        "oxygen_requirement": "obligate aerobe",
        # Positive-growth temps are 25, 22 and midpoint(5-30)=17.5, so the median is 22.
        "optimal_temperature_c": 22.0,
    }
    for field, wanted in expected.items():
        assert phenotypes[field] == wanted, field


def test_derive_optimum_prefers_explicit_optimum() -> None:
    """Expect the "optimum"-typed entry's value (37) even with growth entries around it."""
    growth_before = {"type": "growth", "growth": "positive", "temperature": "30"}
    explicit_optimum = {"type": "optimum", "temperature": "37"}
    growth_after = {"type": "growth", "growth": "positive", "temperature": "25"}

    derived = _derive_optimum(
        [growth_before, explicit_optimum, growth_after],
        "temperature",
    )

    assert derived == 37.0


def test_derive_optimum_falls_back_to_growth_median() -> None:
    """With no "optimum" entry, expect 25.0 from positive growth temps 20 and 30."""
    positive_growth = [
        {"type": "growth", "growth": "positive", "temperature": "20"},
        {"type": "growth", "growth": "positive", "temperature": "30"},
    ]
    # The negative-growth entry is expected to be ignored by the fallback.
    negative_growth = [
        {"type": "growth", "growth": "negative", "temperature": "45"},
    ]

    derived = _derive_optimum(positive_growth + negative_growth, "temperature")

    assert derived == 25.0


def test_extract_phenotypes_handles_missing_fields() -> None:
    """An empty record should yield None for id, genome accession and optimal temperature."""
    phenotypes = extract_phenotypes({})

    for field in ("bacdive_id", "genome_accession", "optimal_temperature_c"):
        assert phenotypes[field] is None, field