File size: 7,507 Bytes
1168cd6
93ba629
 
 
 
 
1168cd6
93ba629
 
 
 
 
 
1168cd6
 
 
 
 
93ba629
1168cd6
 
 
 
 
 
93ba629
1168cd6
 
 
 
 
 
 
 
 
93ba629
1168cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93ba629
1168cd6
 
 
93ba629
1168cd6
 
 
93ba629
 
1168cd6
 
93ba629
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1168cd6
93ba629
 
 
 
 
1168cd6
93ba629
 
 
 
1168cd6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93ba629
1168cd6
93ba629
1168cd6
93ba629
1168cd6
 
 
 
 
 
 
93ba629
1168cd6
 
93ba629
 
 
1168cd6
 
93ba629
1168cd6
 
 
93ba629
1168cd6
93ba629
1168cd6
 
 
 
93ba629
1168cd6
 
 
 
93ba629
1168cd6
 
 
 
93ba629
1168cd6
 
 
 
93ba629
1168cd6
 
 
 
93ba629
 
 
1168cd6
 
 
 
93ba629
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
# engine/schema.py
# ------------------------------------------------------------
# Core schema + Extended schema support
# ------------------------------------------------------------

from __future__ import annotations
from typing import Dict, List, Any, Tuple
import json
import os

# ============================
# CORE SCHEMA DEFINITIONS
# ============================

# Three-state answer set shared by every positive/negative/variable field.
POS_NEG_VAR = ["Positive", "Negative", "Variable"]
# Placeholder value for anything that is missing or not recorded.
UNKNOWN = "Unknown"
# Delimiter between the individual entries of a multienum value.
MULTI_SEPARATOR = ";"


# Allowed values for the enum-like fields that have their own value list.
ENUMS = {
    "Gram Stain": [
        "Positive",
        "Negative",
        "Variable",
    ],
    "Shape": [
        "Cocci",
        "Rods",
        "Bacilli",
        "Spiral",
        "Short Rods",
    ],
    "Haemolysis Type": [
        "None",
        "Beta",
        "Gamma",
        "Alpha",
    ],
}


def _enum_field(allowed: List[str]) -> Dict[str, Any]:
    """Build the metadata dict of a single-choice field."""
    return {"type": "enum", "allowed": allowed}


def _multienum_field(allowed: Any = None) -> Dict[str, Any]:
    """Build the metadata dict of a multi-value field.

    The "allowed" key is only present when a value list is supplied.
    """
    spec: Dict[str, Any] = {"type": "multienum", "separator": MULTI_SEPARATOR}
    if allowed is not None:
        spec["allowed"] = allowed
    return spec


def _pnv_fields(*names: str) -> Dict[str, Dict[str, Any]]:
    """Build one POS_NEG_VAR enum entry per name, keeping the given order."""
    return {name: _enum_field(POS_NEG_VAR) for name in names}


# Field name -> metadata. Insertion order is significant: FIELDS_ORDER below
# is derived from it.
SCHEMA: Dict[str, Dict[str, Any]] = {
    "Genus": {"type": "text", "required": True},
    "Species": {"type": "text", "required": False},

    "Gram Stain": _enum_field(ENUMS["Gram Stain"]),
    "Shape": _enum_field(ENUMS["Shape"]),
    "Colony Morphology": _multienum_field(),
    "Haemolysis": _enum_field(POS_NEG_VAR),
    "Haemolysis Type": _multienum_field(ENUMS["Haemolysis Type"]),

    **_pnv_fields("Motility", "Capsule", "Spore Formation"),

    "Growth Temperature": {"type": "range", "format": "low//high", "units": "°C"},
    "Oxygen Requirement": {"type": "text"},
    "Media Grown On": _multienum_field(),

    # NOTE(review): "Ornitihine" looks like a misspelling of "Ornithine". It is
    # kept byte-for-byte because it is a lookup key; confirm against stored
    # data before renaming.
    **_pnv_fields(
        "Catalase",
        "Oxidase",
        "Indole",
        "Urease",
        "Citrate",
        "Methyl Red",
        "VP",
        "H2S",
        "DNase",
        "ONPG",
        "Coagulase",
        "Lipase Test",
        "Nitrate Reduction",
        "NaCl Tolerant (>=6%)",
        "Lysine Decarboxylase",
        "Ornitihine Decarboxylase",
        "Arginine dihydrolase",
        "Gelatin Hydrolysis",
        "Esculin Hydrolysis",
        "Glucose Fermentation",
        "Lactose Fermentation",
        "Sucrose Fermentation",
        "Mannitol Fermentation",
        "Sorbitol Fermentation",
        "Maltose Fermentation",
        "Xylose Fermentation",
        "Rhamnose Fermentation",
        "Arabinose Fermentation",
        "Raffinose Fermentation",
        "Trehalose Fermentation",
        "Inositol Fermentation",
    ),

    "Extra Notes": {"type": "text"},
}


# Every field name, in schema declaration order.
FIELDS_ORDER: List[str] = list(SCHEMA)

# Fields whose value is a MULTI_SEPARATOR-delimited list.
MULTI_FIELDS: List[str] = [
    name for name in FIELDS_ORDER if SCHEMA[name].get("type") == "multienum"
]

# Enum fields whose allowed values equal POS_NEG_VAR (compared by value, so
# "Gram Stain" qualifies even though it uses its own list object).
PNV_FIELDS: List[str] = [
    name
    for name, spec in SCHEMA.items()
    if spec.get("type") == "enum" and spec.get("allowed") == POS_NEG_VAR
]

# ============================================================
# EXTENDED SCHEMA SUPPORT (needed for Stage 10C)
# ============================================================

def get_core_fields() -> List[str]:
    """List the core schema field names (columns in DB) in declaration order.

    Returns:
        A new list holding the keys of ``SCHEMA``; mutating it does not
        affect the schema.
    """
    return [*SCHEMA]


def load_extended_schema(path: str = "data/extended_schema.json") -> Dict[str, Any]:
    """Read the extended schema from a JSON file, never raising.

    Args:
        path: Location of the JSON file.

    Returns:
        The decoded JSON object when the file exists, can be read and holds
        a JSON object at the top level; an empty dict in every other case
        (missing file, read failure, invalid JSON, non-object top level).
    """
    if not os.path.exists(path):
        return {}

    try:
        with open(path, "r", encoding="utf-8") as handle:
            loaded = json.load(handle)
    except Exception:
        # Deliberately best-effort: any failure degrades to "no extended schema".
        return {}

    if isinstance(loaded, dict):
        return loaded
    return {}


def save_extended_schema(schema: Dict[str, Any], path: str = "data/extended_schema.json") -> None:
    """Write the extended schema to *path* as indented UTF-8 JSON.

    Any missing parent directories of *path* are created first. Non-ASCII
    characters are written as-is rather than escaped.

    Args:
        schema: JSON-serialisable mapping to persist.
        path: Destination file; it is overwritten if it already exists.

    Raises:
        OSError: If the directory cannot be created or the file cannot be
            written.
        TypeError: If *schema* contains a value ``json.dump`` cannot encode.
    """
    parent = os.path.dirname(path)
    # A bare filename has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError, so only create a directory when one is named.
    if parent:
        os.makedirs(parent, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(schema, f, indent=2, ensure_ascii=False)


# ============================================================
# NORMALIZATION / VALIDATION (existing logic preserved)
# ============================================================

def normalize_value(field: str, value: str) -> str:
    """Canonicalise a raw value according to the SCHEMA type of *field*.

    ``None``, blank input and any casing of "unknown" become ``UNKNOWN``.
    Otherwise the value is converted to a stripped string and handled by type:

    * enum: a case-insensitive match against the field's allowed values
      returns the canonical spelling; failing that, the shorthands "+"/"pos",
      "-"/"neg" and "var"/"v" (and the full words) map to "Positive",
      "Negative" and "Variable"; anything else is returned stripped.
    * multienum: the value is split on ``MULTI_SEPARATOR``, empty pieces are
      dropped, each piece takes the canonical spelling from the allowed list
      when there is one, and the pieces are re-joined with "; ".
    * range: every space character is removed.
    * any other type, or a field not in ``SCHEMA``: the stripped string.

    Args:
        field: Schema field name the value belongs to.
        value: Raw value; ``None`` and non-string values are accepted.

    Returns:
        The normalised string.
    """
    text = "" if value is None else str(value).strip()
    lowered = text.lower()
    if not text or lowered == "unknown":
        return UNKNOWN

    spec = SCHEMA.get(field, {})
    kind = spec.get("type")

    if kind == "range":
        return text.replace(" ", "")

    if kind == "multienum":
        choices = spec.get("allowed")
        cleaned = []
        for chunk in text.split(MULTI_SEPARATOR):
            piece = chunk.strip()
            if not piece:
                continue
            if choices:
                canonical = next(
                    (opt for opt in choices if opt.lower() == piece.lower()),
                    None,
                )
                piece = canonical if canonical else piece
            cleaned.append(piece)
        if not cleaned:
            return UNKNOWN
        return "; ".join(cleaned)

    if kind == "enum":
        # Exact (case-insensitive) matches take priority over the shorthands.
        for option in spec.get("allowed", []):
            if lowered == option.lower():
                return option
        shorthand = {
            "+": "Positive",
            "positive": "Positive",
            "pos": "Positive",
            "-": "Negative",
            "negative": "Negative",
            "neg": "Negative",
            "variable": "Variable",
            "var": "Variable",
            "v": "Variable",
        }
        return shorthand.get(lowered, text)

    return text


def validate_record(rec: Dict[str, Any]) -> Tuple[bool, List[str]]:
    """Check the values of a record against ``SCHEMA``.

    Only fields present in *rec* are checked, in ``FIELDS_ORDER`` order;
    keys that are not schema fields are ignored. ``UNKNOWN`` is accepted for
    every checked type.

    * enum: ``str(value)`` must be one of the field's allowed values.
    * multienum: the value must be a string; when the field declares an
      allowed list, every ``MULTI_SEPARATOR``-separated piece (stripped) must
      be in it.
    * range: ``str(value)`` must contain "//".
    * other types (e.g. text) are not checked.

    NOTE: the "required" flag in ``SCHEMA`` is not enforced here.

    Args:
        rec: Mapping of field name to value.

    Returns:
        ``(ok, issues)`` where *issues* is a list of human-readable problem
        descriptions and *ok* is True exactly when that list is empty.
    """
    issues: List[str] = []
    for field in FIELDS_ORDER:
        if field not in rec:
            continue
        val = rec[field]
        meta = SCHEMA[field]

        if meta["type"] == "enum":
            allowed = meta.get("allowed", [])
            if str(val) not in allowed + [UNKNOWN]:
                issues.append(f"{field}: '{val}' invalid")

        elif meta["type"] == "multienum":
            if val == UNKNOWN:
                continue
            if not isinstance(val, str):
                # Splitting a non-string (None, int, ...) would raise
                # AttributeError; report it as a validation issue instead.
                issues.append(f"{field}: '{val}' invalid")
                continue
            parts = [p.strip() for p in val.split(MULTI_SEPARATOR)]
            allowed = meta.get("allowed")
            if allowed:
                bad = [p for p in parts if p not in allowed]
                if bad:
                    issues.append(f"{field}: invalid values {bad}")

        elif meta["type"] == "range":
            if val == UNKNOWN:
                continue
            if "//" not in str(val):
                issues.append(f"{field}: malformed range '{val}'")
    return (len(issues) == 0), issues


def empty_record() -> Dict[str, str]:
    """Build a blank record containing every schema field.

    Returns:
        A new dict keyed by the ``SCHEMA`` field names in schema order, with
        "Genus" and "Species" set to the empty string and every other field
        set to ``UNKNOWN``.
    """
    blank_fields = ("Genus", "Species")
    return {
        name: ("" if name in blank_fields else UNKNOWN)
        for name in SCHEMA
    }