EphAsad commited on
Commit
33342cc
·
verified ·
1 Parent(s): eaf9a2b

Update engine/parser_rules.py

Browse files
Files changed (1) hide show
  1. engine/parser_rules.py +717 -694
engine/parser_rules.py CHANGED
@@ -1,699 +1,722 @@
1
- # engine/parser_rules.py
2
- # ------------------------------------------------------------
3
- # Rule-based core parser for microbiology descriptions.
4
- #
5
- # Stage 11E: built on the 0.68-accuracy version, with
6
- # targeted fixes for:
7
- # - MR → Methyl Red ("mr" keyword)
8
- # - Motility (motile/non-motile/nonmotile/immotile +
9
- # tumbling/swarming/corkscrew motility)
10
- # - H₂S (subscript 2 normalised) + "produces/doesn't produce H2S"
11
- # - "reduces nitrate" / "does not reduce nitrate"
12
- # - Oxygen parsing: avoid "aerobic" grabbing "anaerobic"
13
- # - Non-spore-forming → Negative (and avoid non- prefix mistake)
14
- # - Capsule "capsule positive/negative" phrasing
15
- # - NaCl tolerance in phrases like "in 6.5% NaCl"
16
- # - Growth Temperature always as "X//X" (including single temps)
17
- # - Sugar phrases "glucose positive" etc. → Fermentation fields
18
- # - Global non-fermenter patterns → all sugars Negative (if unset)
19
- # - Esculin negative → Esculin Hydrolysis Negative
20
- # - Colony morphology from "colonies dry, white and irregular..."
21
- # and "forming green colonies", etc.
22
- # - Diplococci / tetracocci / streptococci / staphylococci → Cocci
23
- # ------------------------------------------------------------
24
-
25
- from __future__ import annotations
26
-
27
- import re
28
- from typing import Dict, Any, List
29
-
30
-
31
- UNKNOWN = "Unknown"
32
-
33
- # ------------------------------------------------------------
34
- # Core fields and sugar mapping
35
- # ------------------------------------------------------------
36
-
37
- # Sugar name → core DB column
38
- SUGAR_FIELDS: Dict[str, str] = {
39
- "glucose": "Glucose Fermentation",
40
- "lactose": "Lactose Fermentation",
41
- "sucrose": "Sucrose Fermentation",
42
- "maltose": "Maltose Fermentation",
43
- "mannitol": "Mannitol Fermentation",
44
- "sorbitol": "Sorbitol Fermentation",
45
- "xylose": "Xylose Fermentation",
46
- "rhamnose": "Rhamnose Fermentation",
47
- "arabinose": "Arabinose Fermentation",
48
- "raffinose": "Raffinose Fermentation",
49
- "trehalose": "Trehalose Fermentation",
50
- "inositol": "Inositol Fermentation",
51
- }
52
-
53
- CORE_BOOL_FIELDS: Dict[str, List[str]] = {
54
- # field: [keywords to recognise the test name]
55
- "Catalase": ["catalase"],
56
- "Oxidase": ["oxidase"],
57
- "Indole": ["indole"],
58
- "Urease": ["urease"],
59
- "Citrate": ["citrate"],
60
- # MR: add "mr" as a keyword
61
- "Methyl Red": ["methyl red", "mr test", "mr"],
62
- "VP": ["voges-proskauer", "vp test", "vp"],
63
- # H2S: allow H2S and hydrogen sulfide
64
- "H2S": ["h2s", "hydrogen sulfide"],
65
- # DNase: handle "dnase" and "dnase test"
66
- "DNase": ["dnase", "dnase test"],
67
- "ONPG": ["onpg"],
68
- "Coagulase": ["coagulase"],
69
- "Lipase Test": ["lipase"],
70
- "Nitrate Reduction": ["nitrate reduction", "nitrate"],
71
- "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
72
- "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb"],
73
- "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb"],
74
- "Arginine dihydrolase": ["arginine dihydrolase"],
75
- "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase"],
76
- # Esculin Hydrolysis: also match plain "esculin"
77
- "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
78
- }
79
-
80
- # ------------------------------------------------------------
81
- # Generic helpers
82
- # ------------------------------------------------------------
83
-
84
- def _clean_text(text: str) -> str:
85
- """
86
- Normalise a few unicode oddities and collapse whitespace.
87
- Also:
88
- - strip degree symbols
89
- - normalise subscript ₂ → 2 for H₂S
90
- """
91
- if not text:
92
- return ""
93
- s = text.replace("°", "").replace("º", "")
94
- # normalise subscript 2 (H₂S → H2S)
95
- s = s.replace("₂", "2")
96
- # keep dashes as-is; regexes handle - and – explicitly
97
- return " ".join(s.split())
98
-
99
-
100
- def _norm(s: str) -> str:
101
- return s.strip().lower()
102
-
103
-
104
- def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
105
- """
106
- Write value to parsed[field] if:
107
- - field not present, or
108
- - we are replacing Unknown with a concrete value
109
- """
110
- if not value:
111
- return
112
- if field not in parsed or parsed[field] == UNKNOWN:
113
- parsed[field] = value
114
-
115
-
116
- def _value_from_pnv_context(segment: str) -> str | None:
117
- """
118
- Interpret a short phrase as Positive / Negative / Variable.
119
- Examples:
120
- "positive", "+", "pos" → Positive
121
- "negative", "neg", "-" → Negative
122
- "variable", "var", "v" → Variable
123
- """
124
- seg = _norm(segment)
125
- if seg in ["positive", "pos", "+"]:
126
- return "Positive"
127
- if seg in ["negative", "neg", "-"]:
128
- return "Negative"
129
- if seg in ["variable", "var", "v"]:
130
- return "Variable"
131
- return None
132
-
133
-
134
- # ------------------------------------------------------------
135
- # Gram stain and shape
136
- # ------------------------------------------------------------
137
-
138
- def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
139
- # Gram stain
140
- if "gram-positive" in text_lc or "gram positive" in text_lc:
141
- _set_if_stronger(parsed, "Gram Stain", "Positive")
142
- elif "gram-negative" in text_lc or "gram negative" in text_lc:
143
- _set_if_stronger(parsed, "Gram Stain", "Negative")
144
- elif "gram variable" in text_lc:
145
- _set_if_stronger(parsed, "Gram Stain", "Variable")
146
-
147
- # Shape
148
- # Prefer "short rods" over generic rods
149
- if "short rods" in text_lc:
150
- _set_if_stronger(parsed, "Shape", "Short Rods")
151
-
152
- # Cocci and variants (diplococci, tetracocci, etc.)
153
- if re.search(r"\bcocci\b", text_lc):
154
- _set_if_stronger(parsed, "Shape", "Cocci")
155
- if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
156
- _set_if_stronger(parsed, "Shape", "Cocci")
157
-
158
- # Rods / bacilli
159
- if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
160
- _set_if_stronger(parsed, "Shape", "Rods")
161
-
162
- # Spiral
163
- if "spiral" in text_lc or "spirochete" in text_lc:
164
- _set_if_stronger(parsed, "Shape", "Spiral")
165
-
166
-
167
- # ------------------------------------------------------------
168
- # Haemolysis
169
- # ------------------------------------------------------------
170
-
171
- def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
172
- """
173
- Handle haemolysis phrasing:
174
- - beta-haemolysis / beta-haemolytic / beta hemolysis / beta hemolytic
175
- - alpha- / gamma- / non-haemolytic
176
- Always set Haemolysis to Positive when a clear type is mentioned,
177
- except for "none"/gamma where it's Negative.
178
- """
179
- # Beta
180
- if re.search(r"beta[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
181
- _set_if_stronger(parsed, "Haemolysis Type", "Beta")
182
- _set_if_stronger(parsed, "Haemolysis", "Positive")
183
-
184
- # Alpha
185
- if re.search(r"alpha[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
186
- _set_if_stronger(parsed, "Haemolysis Type", "Alpha")
187
- _set_if_stronger(parsed, "Haemolysis", "Positive")
188
-
189
- # Gamma / non-haemolytic
190
- if re.search(r"gamma[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
191
- _set_if_stronger(parsed, "Haemolysis Type", "Gamma")
192
- _set_if_stronger(parsed, "Haemolysis", "Negative")
193
- if (
194
- "non-haemolytic" in text_lc
195
- or "non hemolytic" in text_lc
196
- or "non-hemolytic" in text_lc
197
- ):
198
- _set_if_stronger(parsed, "Haemolysis Type", "None")
199
- _set_if_stronger(parsed, "Haemolysis", "Negative")
200
-
201
- # Variable phrasing
202
- if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
203
- _set_if_stronger(parsed, "Haemolysis Type", "Variable")
204
- _set_if_stronger(parsed, "Haemolysis", "Variable")
205
-
206
-
207
- # ------------------------------------------------------------
208
- # Boolean test parser (core enzyme tests etc.)
209
- # ------------------------------------------------------------
210
-
211
- def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
212
- """
213
- For each test in CORE_BOOL_FIELDS, look for patterns like:
214
- "catalase positive", "positive for catalase", etc.
215
- Also handles "negative" and "variable" and a few special
216
- cases like nitrate reduction and H2S production.
217
- """
218
- for field, keywords in CORE_BOOL_FIELDS.items():
219
- for kw in keywords:
220
- # "... catalase positive"
221
- m1 = re.search(
222
- rf"{re.escape(kw)}[ \-]?(positive|negative|variable|pos|neg|\+|\-)",
223
- text_lc,
224
- )
225
- if m1:
226
- val = _value_from_pnv_context(m1.group(1))
227
- if val:
228
- _set_if_stronger(parsed, field, val)
229
- break
230
-
231
- # "positive for catalase"
232
- m2 = re.search(
233
- rf"(positive|negative|variable|pos|neg|\+|\-)\s+(for\s+)?{re.escape(kw)}",
234
- text_lc,
235
- )
236
- if m2:
237
- val = _value_from_pnv_context(m2.group(1))
238
- if val:
239
- _set_if_stronger(parsed, field, val)
240
- break
241
-
242
- # Special-case NaCl tolerance with explicit percentages
243
- if field == "NaCl Tolerant (>=6%)":
244
- # e.g. "grows in 6.5% NaCl", "grows at 10% NaCl"
245
- for m in re.finditer(
246
- r"(grows|growth)\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
247
- text_lc,
248
- ):
249
- try:
250
- conc = float(m.group(3))
251
- if conc >= 6.0:
252
- _set_if_stronger(parsed, field, "Positive")
253
- except Exception:
254
- pass
255
-
256
- # e.g. "NaCl tolerant up to 10%"
257
- for m in re.finditer(
258
- r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
259
- text_lc,
260
- ):
261
- try:
262
- conc = float(m.group(1))
263
- if conc >= 6.0:
264
- _set_if_stronger(parsed, field, "Positive")
265
- except Exception:
266
- pass
267
-
268
- # explicit negative phrasing: "does not grow in 7% NaCl"
269
  if re.search(
270
  r"does (not|n't) grow\s+(in|at)\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
271
  text_lc,
272
  ):
273
- _set_if_stronger(parsed, field, "Negative")
274
-
275
- # more general: "in 6.5% NaCl" (assume positive tolerance)
276
- for m in re.finditer(
277
- r"\bin\s*(\d+(?:\.\d+)?)\s*%?\s*nacl",
278
- text_lc,
279
- ):
280
- try:
281
- conc = float(m.group(1))
282
- if conc >= 6.0 and "does not" not in text_lc and "no growth" not in text_lc:
283
- _set_if_stronger(parsed, field, "Positive")
284
- except Exception:
285
- pass
286
-
287
- # Nitrate: "reduces nitrate" / "does not reduce nitrate"
288
- if re.search(r"reduces nitrate", text_lc):
289
- _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
290
- if re.search(r"does (not|n't) reduce nitrate", text_lc):
291
- _set_if_stronger(parsed, "Nitrate Reduction", "Negative")
292
-
293
- # H2S: "produces H2S", "H2S production", "does not produce H2S"
294
- if re.search(r"(produces|production of)\s+h2s", text_lc):
295
- _set_if_stronger(parsed, "H2S", "Positive")
296
- if re.search(r"does (not|n't) produce\s+h2s", text_lc) or re.search(
297
- r"no h2s production", text_lc
298
- ):
299
- _set_if_stronger(parsed, "H2S", "Negative")
300
-
301
-
302
- # ------------------------------------------------------------
303
- # Motility / Capsule / Spores
304
- # ------------------------------------------------------------
305
-
306
- def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
307
- # Motility
308
- # Basic: "motile" vs "non-motile"
309
- if (
310
- re.search(r"\bmotile\b", text_lc)
311
- and not re.search(r"\bnon[- ]?motile\b", text_lc)
312
- and "nonmotile" not in text_lc
313
- and "immotile" not in text_lc
314
- ):
315
- _set_if_stronger(parsed, "Motility", "Positive")
316
-
317
- if (
318
- "non-motile" in text_lc
319
- or "non motile" in text_lc
320
- or "nonmotile" in text_lc
321
- or "immotile" in text_lc
322
- ):
323
- _set_if_stronger(parsed, "Motility", "Negative")
324
-
325
- # Specific motility phrases: tumbling, swarming, corkscrew
326
- if (
327
- "tumbling motility" in text_lc
328
- or "swarming motility" in text_lc
329
- or "corkscrew motility" in text_lc
330
- or ("swarming" in text_lc and "non-swarming" not in text_lc)
331
- ):
332
- _set_if_stronger(parsed, "Motility", "Positive")
333
-
334
- # Capsule - include "capsule positive/negative"
335
- if (
336
- "capsulated" in text_lc
337
- or "encapsulated" in text_lc
338
- or "capsule present" in text_lc
339
- or re.search(r"capsule[ \-]?(positive|pos|\+)", text_lc)
340
- ):
341
- _set_if_stronger(parsed, "Capsule", "Positive")
342
-
343
- if (
344
- "non-capsulated" in text_lc
345
- or "no capsule" in text_lc
346
- or re.search(r"capsule[ \-]?(negative|neg|\-)", text_lc)
347
- ):
348
- _set_if_stronger(parsed, "Capsule", "Negative")
349
-
350
- # Spore formation:
351
- # Check negative phrases FIRST so they win over generic positive phrases.
352
- if (
353
- "non-spore-forming" in text_lc
354
- or "non spore forming" in text_lc
355
- or "nonspore-forming" in text_lc
356
- or "no spores" in text_lc
357
- ):
358
- _set_if_stronger(parsed, "Spore Formation", "Negative")
359
-
360
- if (
361
- "spore-forming" in text_lc
362
- or "spore forming" in text_lc
363
- or "forms spores" in text_lc
364
- ):
365
- _set_if_stronger(parsed, "Spore Formation", "Positive")
366
-
367
-
368
- # ------------------------------------------------------------
369
- # Oxygen requirement
370
- # ------------------------------------------------------------
371
-
372
- def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
373
- """
374
- Robust oxygen parsing:
375
- - Handle facultative first
376
- - Avoid "aerobic" accidentally matching inside "anaerobic"
377
- """
378
- # Facultative first
379
- if re.search(r"facultative(ly)? anaerob", text_lc):
380
- _set_if_stronger(parsed, "Oxygen Requirement", "Facultative Anaerobe")
381
-
382
- # Strict anaerobic (before aerobic, and with word boundary)
383
- if re.search(r"\bobligate anaerob", text_lc) or (
384
- re.search(r"\banaerobic\b", text_lc) and "facultative" not in text_lc
385
- ):
386
- _set_if_stronger(parsed, "Oxygen Requirement", "Anaerobic")
387
-
388
- # Now handle purely aerobic, avoiding "anaerobic"
389
- if re.search(r"\bobligate aerobe\b", text_lc) or (
390
- re.search(r"\baerobic\b", text_lc) and "anaerobic" not in text_lc
391
- ):
392
- _set_if_stronger(parsed, "Oxygen Requirement", "Aerobic")
393
-
394
- if "microaerophilic" in text_lc or "microaerophile" in text_lc:
395
- _set_if_stronger(parsed, "Oxygen Requirement", "Microaerophilic")
396
-
397
- if "capnophilic" in text_lc or "co2" in text_lc:
398
- _set_if_stronger(parsed, "Oxygen Requirement", "Capnophilic")
399
-
400
-
401
- # ------------------------------------------------------------
402
- # Growth temperature
403
- # ------------------------------------------------------------
404
-
405
- def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
406
- """
407
- Look for explicit temperatures like "grows at 37 °C" or ranges like "4–45 °C".
408
- We ALWAYS store as "low//high":
409
- - For ranges: "4//45"
410
- - For single values: "37//37"
411
- """
412
- # Ranges like "4-45 °C", "10–40 °C"
413
- range_pattern = re.compile(
414
- r"(\d+)\s*[-–/]\s*(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
415
- )
416
- m_range = range_pattern.search(text_lc)
417
- if m_range:
418
- low = m_range.group(1)
419
- high = m_range.group(2)
420
- _set_if_stronger(parsed, "Growth Temperature", f"{low}//{high}")
421
- return
422
-
423
- # Single temps like "grows at 37 c"
424
- single_pattern = re.compile(
425
- r"(grows|growth|optimum|optimal)\s+(?:at\s+)?(\d+)\s*(?:c|°c|degrees c|degrees celsius)"
426
- )
427
- m_single = single_pattern.search(text_lc)
428
- if m_single:
429
- temp = m_single.group(2)
430
- _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
431
- return
432
-
433
- # Simplified: "grows at 37" (no explicit °C)
434
- m_simple_num = re.search(r"grows at (\d+)\b", text_lc)
435
- if m_simple_num:
436
- temp = m_simple_num.group(1)
437
- _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
438
- return
439
-
440
- # Fallback: plain "37c" somewhere in the text
441
- m_plain = re.search(
442
- r"\b(\d+)\s*(?:c|°c|degrees c|degrees celsius)\b",
443
- text_lc,
444
- )
445
- if m_plain:
446
- temp = m_plain.group(1)
447
- _set_if_stronger(parsed, "Growth Temperature", f"{temp}//{temp}")
448
-
449
-
450
- # ------------------------------------------------------------
451
- # Media grown on (coarse mapping)
452
- # ------------------------------------------------------------
453
-
454
- MEDIA_KEYWORDS = {
455
- "Blood Agar": [
456
- "blood agar",
457
- "blood-agar",
458
- ],
459
- "MacConkey Agar": [
460
- "macconkey agar",
461
- "mac conkey agar",
462
- "macconkey",
463
- ],
464
- "Chocolate Agar": [
465
- "chocolate agar",
466
- "chocolate-agar",
467
- ],
468
- "Nutrient Agar": [
469
- "nutrient agar",
470
- "nutrient-agar",
471
- ],
472
- "XLD Agar": [
473
- "xld agar",
474
- ],
475
- "TCBS Agar": [
476
- "tcbs agar",
477
- "tcbs",
478
- ],
479
- "ALOA": [
480
- "aloa agar",
481
- "aloa",
482
- ],
483
- "BCYE Agar": [
484
- "bcye agar",
485
- "bcye",
486
- ],
487
- "MRS Agar": [
488
- "mrs agar",
489
- ],
490
- }
491
-
492
-
493
- def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
494
- found_media: List[str] = []
495
- for media_name, patterns in MEDIA_KEYWORDS.items():
496
- for p in patterns:
497
- if p in text_lc:
498
- if media_name not in found_media:
499
- found_media.append(media_name)
500
-
501
- if found_media:
502
- _set_if_stronger(parsed, "Media Grown On", "; ".join(found_media))
503
-
504
-
505
- # ------------------------------------------------------------
506
- # Sugar fermentation parsing
507
- # ------------------------------------------------------------
508
-
509
- def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
510
- """
511
- Handles patterns like:
512
- - "ferments glucose, mannitol and sucrose but not lactose"
513
- - "does not ferment lactose"
514
- - "non-lactose fermenter"
515
- - "glucose positive, mannitol negative"
516
- - global non-fermenter patterns
517
- """
518
-
519
- # 0) Simple "glucose positive / negative" style
520
- for sugar_key, field in SUGAR_FIELDS.items():
521
- m_simple = re.search(
522
- rf"{sugar_key}\s+(positive|negative|variable|pos|neg|\+|\-)",
523
- text_lc,
524
- )
525
- if m_simple:
526
- val = _value_from_pnv_context(m_simple.group(1))
527
- if val:
528
- _set_if_stronger(parsed, field, val)
529
-
530
- # 1) Pattern: "ferments X, Y and Z but not A, B"
531
- ferments_pattern = re.compile(r"ferments\s+([a-z0-9 ,;/&\-]+)")
532
- for m in ferments_pattern.finditer(text_lc):
533
- seg = m.group(1)
534
- # Split positive vs negative part on "but not"
535
- neg_split = re.split(r"\bbut not\b", seg, maxsplit=1)
536
- pos_part = neg_split[0]
537
- neg_part = neg_split[1] if len(neg_split) > 1 else ""
538
-
539
- # Positive sugars from pos_part
540
- for sugar_key, field in SUGAR_FIELDS.items():
541
- if re.search(rf"\b{sugar_key}\b", pos_part):
542
- _set_if_stronger(parsed, field, "Positive")
543
-
544
- # Negative sugars from neg_part
545
- for sugar_key, field in SUGAR_FIELDS.items():
546
- if re.search(rf"\b{sugar_key}\b", neg_part):
547
- _set_if_stronger(parsed, field, "Negative")
548
-
549
- # 2) "does not ferment X" / "doesn't ferment X"
550
- for sugar_key, field in SUGAR_FIELDS.items():
551
- if re.search(rf"does (not|n't) ferment {sugar_key}\b", text_lc):
552
- _set_if_stronger(parsed, field, "Negative")
553
-
554
- # 3) "non-lactose fermenter", "non lactose fermenter"
555
- for sugar_key, field in SUGAR_FIELDS.items():
556
- if re.search(rf"non[- ]{sugar_key} ferment(ing|er)?", text_lc):
557
- _set_if_stronger(parsed, field, "Negative")
558
-
559
- # 4) "X fermentation positive/negative"
560
- for sugar_key, field in SUGAR_FIELDS.items():
561
- # "glucose fermentation positive"
562
- m1 = re.search(
563
- rf"{sugar_key}\s+fermentation[ \-]?(positive|negative|variable|pos|neg|\+|\-)",
564
- text_lc,
565
- )
566
- if m1:
567
- val = _value_from_pnv_context(m1.group(1))
568
- if val:
569
- _set_if_stronger(parsed, field, val)
570
- continue
571
-
572
- # "positive for glucose fermentation"
573
- m2 = re.search(
574
- rf"(positive|negative|variable|pos|neg|\+|\-)\s+(for\s+)?{sugar_key}\s+fermentation",
575
- text_lc,
576
- )
577
- if m2:
578
- val = _value_from_pnv_context(m2.group(1))
579
- if val:
580
- _set_if_stronger(parsed, field, val)
581
- continue
582
-
583
- # 5) Global non-fermenter patterns
584
- if (
585
- re.search(r"does (not|n't) ferment (carbohydrates|sugars)", text_lc)
586
- or re.search(r"non[- ]ferment(er|ing|ative)", text_lc)
587
- ):
588
- for field in SUGAR_FIELDS.values():
589
- if field not in parsed or parsed[field] == UNKNOWN:
590
- _set_if_stronger(parsed, field, "Negative")
591
-
592
-
593
- # ------------------------------------------------------------
594
- # Colony morphology (coarse, optional)
595
- # ------------------------------------------------------------
596
-
597
- def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
598
- """
599
- Very coarse mapping for colony morphology. We try a couple of patterns:
600
- - "colonies are yellow, mucoid"
601
- - "colonies dry, white and irregular on nutrient agar"
602
- - "forming green colonies", "forms mucoid colonies"
603
- """
604
- # Pattern 1: "colonies are ..."
605
- m = re.search(r"colon(y|ies) (are|is)\s+([a-z0-9 ,;\-]+)", text_lc)
606
- if m:
607
- desc = m.group(3).strip()
608
- if desc:
609
- pretty = "; ".join(
610
- [s.strip().capitalize() for s in re.split(r"[;,]", desc) if s.strip()]
611
- )
612
- if pretty:
613
- _set_if_stronger(parsed, "Colony Morphology", pretty)
614
- return
615
-
616
- # Pattern 2: "colonies dry, white and irregular on nutrient agar"
617
- m2 = re.search(
618
- r"colonies\s+([a-z0-9 ,;\-]+?)(?:\s+on\b|\.|,)",
619
- text_lc,
620
- )
621
- if m2:
622
- desc = m2.group(1).strip()
623
- if desc:
624
- pretty = "; ".join(
625
- [s.strip().capitalize() for s in re.split(r"[;,]", desc) if s.strip()]
626
- )
627
- if pretty:
628
- _set_if_stronger(parsed, "Colony Morphology", pretty)
629
- return
630
-
631
- # Pattern 3: "forming green colonies", "forms mucoid colonies"
632
- m3 = re.search(
633
- r"(forming|forms|produces)\s+([a-z0-9 ,;\-]+?)\s+colonies",
634
- text_lc,
635
- )
636
- if m3:
637
- desc = m3.group(2).strip()
638
- if desc:
639
- pretty = "; ".join(
640
- [s.strip().capitalize() for s in re.split(r"[;,]", desc) if s.strip()]
641
- )
642
- if pretty:
643
- _set_if_stronger(parsed, "Colony Morphology", pretty)
644
-
645
-
646
- # ------------------------------------------------------------
647
- # PUBLIC API
648
- # ------------------------------------------------------------
649
-
650
- def parse_text_rules(text: str) -> Dict[str, Any]:
651
- """
652
- Main entry point.
653
-
654
- Parameters
655
- ----------
656
- text : str
657
- Free-text microbiology description.
658
-
659
- Returns
660
- -------
661
- dict
662
- {
663
- "parsed_fields": { field: value, ... },
664
- "source": "rule_parser",
665
- "raw": original_text,
666
- "error": optional_error_message
667
- }
668
- """
669
- original = text or ""
670
- text_clean = _clean_text(original)
671
- text_lc = text_clean.lower()
672
-
673
- parsed: Dict[str, str] = {}
674
-
675
- try:
676
- _parse_gram_and_shape(text_lc, parsed)
677
- _parse_haemolysis(text_lc, parsed)
678
- _parse_core_bool_tests(text_lc, parsed)
679
- _parse_motility_capsule_spores(text_lc, parsed)
680
- _parse_oxygen(text_lc, parsed)
681
- _parse_growth_temperature(text_lc, parsed)
682
- _parse_media(text_lc, parsed)
683
- _parse_sugars(text_lc, parsed)
684
- _parse_colony(text_lc, parsed)
685
-
686
- return {
687
- "parsed_fields": parsed,
688
- "source": "rule_parser",
689
- "raw": original,
690
- }
691
-
692
- except Exception as e:
693
- # Fail-safe: never crash the app, just report an error
694
- return {
695
- "parsed_fields": parsed,
696
- "source": "rule_parser",
697
- "raw": original,
698
- "error": f"{type(e).__name__}: {e}",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
699
  }
 
1
+ # engine/parser_rules.py
2
+ # ------------------------------------------------------------
3
+ # Rule-based core parser for microbiology descriptions.
4
+ #
5
+ # Stage 11F (Option A ranges + fixes):
6
+ # - Always store Growth Temperature as "low//high"
7
+ # single: 37 → "37//37"
8
+ # two temps in text: min//max (e.g. "4 °C but not 45 °C" → "4//45")
9
+ # - DNase robust parsing (DNase / DNase test, DNase activity, etc.)
10
+ # - Non–spore-forming → Spore Formation = Negative (regex + early return)
11
+ # - "non-H2S producing" → H2S = Negative
12
+ # - "aerobically" / "anaerobically" → Aerobic / Anaerobic
13
+ # - NaCl tolerance phrases improved
14
+ # - Colony morphology from "colonies dry, white and irregular on nutrient agar"
15
+ # ------------------------------------------------------------
16
+
17
+ from __future__ import annotations
18
+
19
+ import re
20
+ from typing import Dict, Any, List
21
+
22
+
23
+ UNKNOWN = "Unknown"
24
+
25
+ # ------------------------------------------------------------
26
+ # Core fields and sugar mapping
27
+ # ------------------------------------------------------------
28
+
29
+ # Sugar name → core DB column
30
+ SUGAR_FIELDS: Dict[str, str] = {
31
+ "glucose": "Glucose Fermentation",
32
+ "lactose": "Lactose Fermentation",
33
+ "sucrose": "Sucrose Fermentation",
34
+ "maltose": "Maltose Fermentation",
35
+ "mannitol": "Mannitol Fermentation",
36
+ "sorbitol": "Sorbitol Fermentation",
37
+ "xylose": "Xylose Fermentation",
38
+ "rhamnose": "Rhamnose Fermentation",
39
+ "arabinose": "Arabinose Fermentation",
40
+ "raffinose": "Raffinose Fermentation",
41
+ "trehalose": "Trehalose Fermentation",
42
+ "inositol": "Inositol Fermentation",
43
+ }
44
+
45
+ CORE_BOOL_FIELDS: Dict[str, List[str]] = {
46
+ # field: [keywords to recognise the test name]
47
+ "Catalase": ["catalase"],
48
+ "Oxidase": ["oxidase"],
49
+ "Indole": ["indole"],
50
+ "Urease": ["urease"],
51
+ "Citrate": ["citrate"],
52
+ # MR: include "mr"
53
+ "Methyl Red": ["methyl red", "mr test", "mr"],
54
+ "VP": ["voges-proskauer", "vp test", "vp"],
55
+ # H2S (includes H₂S → normalised to H2S in _clean_text)
56
+ "H2S": ["h2s", "hydrogen sulfide"],
57
+ # DNase: broaden patterns
58
+ "DNase": [
59
+ "dnase",
60
+ "dnase test",
61
+ "dnase activity",
62
+ "dnase production",
63
+ "dnaase",
64
+ "dna hydrolysis",
65
+ ],
66
+ "ONPG": ["onpg"],
67
+ "Coagulase": ["coagulase"],
68
+ "Lipase Test": ["lipase"],
69
+ "Nitrate Reduction": ["nitrate reduction", "nitrate"],
70
+ "NaCl Tolerant (>=6%)": ["6% nacl", "7% nacl", "nacl tolerant"],
71
+ "Lysine Decarboxylase": ["lysine decarboxylase", "lysine decarb"],
72
+ "Ornitihine Decarboxylase": ["ornithine decarboxylase", "ornithine decarb"],
73
+ "Arginine dihydrolase": ["arginine dihydrolase"],
74
+ "Gelatin Hydrolysis": ["gelatin hydrolysis", "gelatinase"],
75
+ # Esculin Hydrolysis: also match plain "esculin"
76
+ "Esculin Hydrolysis": ["esculin hydrolysis", "esculin"],
77
+ }
78
+
79
+ # ------------------------------------------------------------
80
+ # Generic helpers
81
+ # ------------------------------------------------------------
82
+
83
+ def _clean_text(text: str) -> str:
84
+ """
85
+ Normalise a few unicode oddities and collapse whitespace.
86
+ Also:
87
+ - strip degree symbols
88
+ - normalise subscript ₂ → 2 for H₂S
89
+ """
90
+ if not text:
91
+ return ""
92
+ s = text.replace("°", "").replace("º", "")
93
+ # normalise subscript 2 (H₂S → H2S)
94
+ s = s.replace("₂", "2")
95
+ # keep dashes as-is; regexes handle - and – explicitly
96
+ return " ".join(s.split())
97
+
98
+
99
+ def _norm(s: str) -> str:
100
+ return s.strip().lower()
101
+
102
+
103
+ def _set_if_stronger(parsed: Dict[str, str], field: str, value: str) -> None:
104
+ """
105
+ Write value to parsed[field] if:
106
+ - field not present, or
107
+ - we are replacing Unknown with a concrete value
108
+ """
109
+ if not value:
110
+ return
111
+ if field not in parsed or parsed[field] == UNKNOWN:
112
+ parsed[field] = value
113
+
114
+
115
+ def _value_from_pnv_context(segment: str) -> str | None:
116
+ """
117
+ Interpret a short phrase as Positive / Negative / Variable.
118
+ """
119
+ seg = _norm(segment)
120
+ if seg in ["positive", "pos", "+"]:
121
+ return "Positive"
122
+ if seg in ["negative", "neg", "-"]:
123
+ return "Negative"
124
+ if seg in ["variable", "var", "v"]:
125
+ return "Variable"
126
+ return None
127
+
128
+
129
+ # ------------------------------------------------------------
130
+ # Gram stain and shape
131
+ # ------------------------------------------------------------
132
+
133
+ def _parse_gram_and_shape(text_lc: str, parsed: Dict[str, str]) -> None:
134
+ # Gram stain
135
+ if "gram-positive" in text_lc or "gram positive" in text_lc:
136
+ _set_if_stronger(parsed, "Gram Stain", "Positive")
137
+ elif "gram-negative" in text_lc or "gram negative" in text_lc:
138
+ _set_if_stronger(parsed, "Gram Stain", "Negative")
139
+ elif "gram variable" in text_lc:
140
+ _set_if_stronger(parsed, "Gram Stain", "Variable")
141
+
142
+ # Shape
143
+ # Prefer "short rods" over generic rods
144
+ if "short rods" in text_lc:
145
+ _set_if_stronger(parsed, "Shape", "Short Rods")
146
+
147
+ # Cocci and variants (diplococci, tetracocci, etc.)
148
+ if re.search(r"\bcocci\b", text_lc):
149
+ _set_if_stronger(parsed, "Shape", "Cocci")
150
+ if re.search(r"\b(diplococci|tetracocci|streptococci|staphylococci)\b", text_lc):
151
+ _set_if_stronger(parsed, "Shape", "Cocci")
152
+
153
+ # Rods / bacilli
154
+ if re.search(r"\brods?\b", text_lc) or "bacilli" in text_lc:
155
+ _set_if_stronger(parsed, "Shape", "Rods")
156
+
157
+ # Spiral
158
+ if "spiral" in text_lc or "spirochete" in text_lc:
159
+ _set_if_stronger(parsed, "Shape", "Spiral")
160
+
161
+
162
+ # ------------------------------------------------------------
163
+ # Haemolysis
164
+ # ------------------------------------------------------------
165
+
166
+ def _parse_haemolysis(text_lc: str, parsed: Dict[str, str]) -> None:
167
+ """
168
+ Handle haemolysis phrasing:
169
+ - beta-haemolytic / beta hemolytic / beta-haemolysis / etc.
170
+ - alpha- / gamma- / non-haemolytic
171
+ """
172
+ # Beta
173
+ if re.search(r"beta[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
174
+ _set_if_stronger(parsed, "Haemolysis Type", "Beta")
175
+ _set_if_stronger(parsed, "Haemolysis", "Positive")
176
+
177
+ # Alpha
178
+ if re.search(r"alpha[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
179
+ _set_if_stronger(parsed, "Haemolysis Type", "Alpha")
180
+ _set_if_stronger(parsed, "Haemolysis", "Positive")
181
+
182
+ # Gamma / non-haemolytic
183
+ if re.search(r"gamma[- ]?(haemolytic|hemolytic|haemolysis|hemolysis)", text_lc):
184
+ _set_if_stronger(parsed, "Haemolysis Type", "Gamma")
185
+ _set_if_stronger(parsed, "Haemolysis", "Negative")
186
+ if (
187
+ "non-haemolytic" in text_lc
188
+ or "non hemolytic" in text_lc
189
+ or "non-hemolytic" in text_lc
190
+ ):
191
+ _set_if_stronger(parsed, "Haemolysis Type", "None")
192
+ _set_if_stronger(parsed, "Haemolysis", "Negative")
193
+
194
+ # Variable phrasing
195
+ if "variable haemolysis" in text_lc or "variable hemolysis" in text_lc:
196
+ _set_if_stronger(parsed, "Haemolysis Type", "Variable")
197
+ _set_if_stronger(parsed, "Haemolysis", "Variable")
198
+
199
+
200
+ # ------------------------------------------------------------
201
+ # Boolean test parser (core enzyme tests etc.)
202
+ # ------------------------------------------------------------
203
+
204
def _parse_core_bool_tests(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Fill in the boolean test fields (enzyme tests etc.) found in ``text_lc``.

    For each field in CORE_BOOL_FIELDS the keywords are tried in order with
    two phrasings, "<keyword> positive" and "positive for <keyword>"; the
    first phrasing that yields a value wins for that field.

    Also handles:
      - NaCl tolerance with % values (>= 6% counts as tolerant)
      - Nitrate reduction text ("reduces nitrate" / "does not reduce nitrate")
      - H2S production / non-production
      - DNase phrasing, including hyphenated and +/- forms

    Negations accept "does not", "does n't" and the contraction "doesn't".

    Args:
        text_lc: Lower-cased description text.
        parsed: Field dict updated in place through ``_set_if_stronger``.
    """
    pnv = r"(positive|negative|variable|pos|neg|\+|\-)"
    # "does not", "does n't" and "doesn't" (no space before the contraction).
    negation = r"does\s*(?:not|n't)"
    nacl_field = "NaCl Tolerant (>=6%)"
    percent_nacl = r"(\d+(?:\.\d+)?)\s*%?\s*nacl"

    for field, keywords in CORE_BOOL_FIELDS.items():
        # Per keyword: "... catalase positive" first, then "positive for catalase".
        candidates = [
            pattern
            for kw in map(re.escape, keywords)
            for pattern in (rf"{kw}[ \-]?{pnv}", rf"{pnv}\s+(?:for\s+)?{kw}")
        ]
        for pattern in candidates:
            match = re.search(pattern, text_lc)
            if match is None:
                continue
            val = _value_from_pnv_context(match.group(1))
            if val:
                _set_if_stronger(parsed, field, val)
                break

        # Special-case NaCl tolerance with explicit percentages
        if field == nacl_field:
            # e.g. "grows in 6.5% NaCl", "grows at 10% NaCl"; the lookbehind
            # keeps "no growth in 6.5% NaCl" from being read as tolerance.
            for m in re.finditer(
                rf"(?<!no )(?:grows|growth)\s+(?:in|at)\s*{percent_nacl}",
                text_lc,
            ):
                if float(m.group(1)) >= 6.0:
                    _set_if_stronger(parsed, field, "Positive")

            # e.g. "NaCl tolerant up to 10%"
            for m in re.finditer(
                r"nacl\s+tolerant\s+(?:to|up to)?\s*(\d+(?:\.\d+)?)\s*%?",
                text_lc,
            ):
                if float(m.group(1)) >= 6.0:
                    _set_if_stronger(parsed, field, "Positive")

    # Explicit negative phrasing: "does not grow in 7% NaCl",
    # "no growth at 6.5% NaCl"
    if re.search(
        rf"(?:{negation} grow|no growth)\s+(?:in|at)\s*{percent_nacl}",
        text_lc,
    ):
        _set_if_stronger(parsed, nacl_field, "Negative")

    # General "in 6.5% NaCl": assume tolerance only when the text contains no
    # negation at all (a deliberately coarse, text-wide guard).
    if re.search(rf"{negation}|no growth", text_lc) is None:
        for m in re.finditer(rf"\bin\s*{percent_nacl}", text_lc):
            if float(m.group(1)) >= 6.0:
                _set_if_stronger(parsed, nacl_field, "Positive")

    # Nitrate: "reduces nitrate" / "does not reduce nitrate"
    if re.search(r"reduces nitrate", text_lc):
        _set_if_stronger(parsed, "Nitrate Reduction", "Positive")
    if re.search(rf"{negation} reduce nitrate", text_lc):
        _set_if_stronger(parsed, "Nitrate Reduction", "Negative")

    # H2S: "produces H2S", "production of H2S", "does not produce H2S",
    # "no H2S production", "non-H2S producing"
    if re.search(r"(?:produces|production of)\s+h2s", text_lc):
        _set_if_stronger(parsed, "H2S", "Positive")
    if (
        re.search(rf"{negation} produce\s+h2s", text_lc)
        or re.search(r"no h2s production", text_lc)
        or re.search(r"non[- ]h2s producing", text_lc)
    ):
        _set_if_stronger(parsed, "H2S", "Negative")

    # --- DNase coverage ---
    # Word results may be joined by spaces or a hyphen ("dnase-positive").
    # Symbol results are matched without a trailing \b, because \b after
    # "+" or "-" would demand a following word character. A "-" directly
    # followed by a word other than "ve" is a hyphen, not a result
    # ("dnase-positive", "dnase-producing").
    dnase = r"\bdnase(?:\s+(?:test|activity|production))?"

    # Positive forms
    if re.search(rf"{dnase}(?:[\s\-]*(?:positive|pos)\b|\s*\+)", text_lc):
        _set_if_stronger(parsed, "DNase", "Positive")
    if re.search(rf"\b(?:positive|pos)\s+{dnase}\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Positive")

    # Negative forms
    if re.search(
        rf"{dnase}(?:[\s\-]*(?:negative|neg)\b|\s*\-(?:ve\b|(?![a-z0-9])))",
        text_lc,
    ):
        _set_if_stronger(parsed, "DNase", "Negative")
    # A symbol *before* "dnase" is not accepted: it usually belongs to the
    # preceding test ("catalase- dnase+").
    if re.search(rf"\b(?:negative|neg)\s+{dnase}\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Negative")

    # non-DNase-producing
    if re.search(r"\bnon[- ]?dnase[- ]?producing\b", text_lc):
        _set_if_stronger(parsed, "DNase", "Negative")
318
+
319
+
320
+ # ------------------------------------------------------------
321
+ # Motility / Capsule / Spores
322
+ # ------------------------------------------------------------
323
+
324
def _parse_motility_capsule_spores(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Parse motility, capsule and spore-formation statements.

    For capsule and spores the negated form is checked first and takes
    precedence, because the positive keywords ("capsulated",
    "spore-forming") are substrings of the negative ones
    ("non-capsulated", "non-spore-forming").

    Args:
        text_lc: Lower-cased description text.
        parsed: Field dict updated in place through ``_set_if_stronger``.
    """
    # --- Motility ---
    non_motile = re.search(r"non[- ]?motile|immotile", text_lc) is not None
    if not non_motile and re.search(r"\bmotile\b", text_lc):
        _set_if_stronger(parsed, "Motility", "Positive")
    if non_motile:
        _set_if_stronger(parsed, "Motility", "Negative")

    # Specific motility phrases: tumbling, swarming, corkscrew.
    # "non-swarming" / "no swarming" must not count as evidence of motility.
    swarming = (
        re.search(r"\bswarming\b", text_lc) is not None
        and re.search(r"\b(?:non[- ]?|no )swarming", text_lc) is None
    )
    if (
        "tumbling motility" in text_lc
        or "corkscrew motility" in text_lc
        or swarming
    ):
        _set_if_stronger(parsed, "Motility", "Positive")

    # --- Capsule (including "capsule positive/negative") ---
    # In the symbol form, a "-" followed by a word other than "ve" is a
    # hyphen ("capsule-positive"), not a negative result.
    capsule_absent = re.search(
        r"\b(?:non[- ]?|un)(?:en)?capsulated\b"
        r"|\bno capsule"
        r"|capsule(?:[ \-]?(?:negative|neg)|\s?\-(?:ve\b|(?![a-z0-9])))",
        text_lc,
    )
    capsule_present = (
        "capsulated" in text_lc  # also covers "encapsulated"
        or "capsule present" in text_lc
        or re.search(r"capsule[ \-]?(positive|pos|\+)", text_lc)
    )
    if capsule_absent:
        _set_if_stronger(parsed, "Capsule", "Negative")
    elif capsule_present:
        _set_if_stronger(parsed, "Capsule", "Positive")

    # --- Spore formation ---
    # Negative first; the positive branch is only reached when no negative
    # phrasing is present, so it can never overwrite a negative.
    spores_absent = (
        re.search(r"\bnon[-\s]?spor(?:e[-\s]?forming|ing|ulating)\b", text_lc)
        or "no spores" in text_lc
        or re.search(r"does\s*(?:not|n't) (?:form|produce) spores", text_lc)
    )
    if spores_absent:
        _set_if_stronger(parsed, "Spore Formation", "Negative")
    elif re.search(r"\bspore[-\s]?forming\b", text_lc) or "forms spores" in text_lc:
        _set_if_stronger(parsed, "Spore Formation", "Positive")
382
+
383
+
384
+ # ------------------------------------------------------------
385
+ # Oxygen requirement
386
+ # ------------------------------------------------------------
387
+
388
def _parse_oxygen(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Parse the oxygen requirement.

    Order of evaluation:
      1. facultative anaerobe
      2. anaerobic (obligate/strict forms, "anaerobic", "anaerobe",
         "anaerobically")
      3. aerobic (obligate/strict forms, "aerobic", "aerobe",
         "aerobically"), never matching inside an "anaerob..." word
      4. microaerophilic
      5. capnophilic (any mention of "capnophilic" or "co2")

    Every matching rule calls ``_set_if_stronger``; which value survives
    when several rules match is decided by that helper.

    Args:
        text_lc: Lower-cased description text.
        parsed: Field dict updated in place through ``_set_if_stronger``.
    """
    field = "Oxygen Requirement"

    # Facultative first
    if re.search(r"facultative(ly)? anaerob", text_lc):
        _set_if_stronger(parsed, field, "Facultative Anaerobe")

    # Strict anaerobic (before aerobic). Plain "anaerobic"/"anaerobe" is
    # ignored when the text mentions "facultative" anywhere.
    mentions_facultative = "facultative" in text_lc
    if (
        re.search(r"\b(?:obligate|strict)(?:ly)? anaerob", text_lc)
        or (
            not mentions_facultative
            and re.search(r"\banaerob(?:ic|es?)\b", text_lc)
        )
        or re.search(r"\banaerobically\b", text_lc)
    ):
        _set_if_stronger(parsed, field, "Anaerobic")

    # Purely aerobic. The \b anchors keep "aerob..." from matching inside
    # "anaerob..."; each plain form is also suppressed when its anaerobic
    # counterpart occurs anywhere in the text.
    if (
        re.search(r"\b(?:obligate|strict)(?:ly)? aerob", text_lc)
        or (re.search(r"\baerobic\b", text_lc) and "anaerobic" not in text_lc)
        or (re.search(r"\baerobes?\b", text_lc) and "anaerobe" not in text_lc)
        or (
            re.search(r"\baerobically\b", text_lc)
            and "anaerobically" not in text_lc
        )
    ):
        _set_if_stronger(parsed, field, "Aerobic")

    if "microaerophilic" in text_lc or "microaerophile" in text_lc:
        _set_if_stronger(parsed, field, "Microaerophilic")

    if "capnophilic" in text_lc or "co2" in text_lc:
        _set_if_stronger(parsed, field, "Capnophilic")
426
+
427
+
428
+ # ------------------------------------------------------------
429
+ # Growth temperature
430
+ # ------------------------------------------------------------
431
+
432
def _format_temperature(value: float) -> str:
    """Render a temperature without a trailing ".0" (37.0 -> "37", 36.5 -> "36.5")."""
    return str(int(value)) if value.is_integer() else str(value)


def _parse_growth_temperature(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Look for explicit temperatures like "grows at 37 °C" or ranges like "4–45 °C".

    The value is ALWAYS stored as "low//high":
      - true ranges: "4-45 °C", "10 to 40 °C"  -> "4//45", "10//40"
      - two or more temps in the text: min//max (Option A), e.g.
          "grows at 4 °C but not at 45 °C"   -> "4//45"
          "grows at 42 °C but not at 25 °C"  -> "25//42"
      - single temps: "37 °C" -> "37//37"
      - bare "grows at 37" (no unit) -> "37//37"

    The unit must be a whole token ("c", "°c", "degrees c",
    "degrees celsius"), so "37 colonies" or "10 cfu" are not temperatures.
    Decimal values are kept whole ("36.5 °C" is 36.5, not 5). A bare
    "grows at N" is ignored when N is a percentage ("grows at 6.5% NaCl").

    Args:
        text_lc: Lower-cased description text.
        parsed: Field dict updated in place through ``_set_if_stronger``.
    """
    # The lookbehind stops a match from starting in the middle of a number.
    number = r"(?<![\d.])(\d+(?:\.\d+)?)"
    unit = r"(?:°\s*c|degrees?\s+(?:celsius|c)|c)\b"

    def store(low: float, high: float) -> None:
        value = f"{_format_temperature(low)}//{_format_temperature(high)}"
        _set_if_stronger(parsed, "Growth Temperature", value)

    # 1) Explicit ranges like "4-45 °C", "10–40 °C" or "4 to 45 °C"
    m_range = re.search(
        rf"{number}\s*(?:[-–—/]|\bto\b)\s*(\d+(?:\.\d+)?)\s*{unit}",
        text_lc,
    )
    if m_range:
        low, high = sorted(float(g) for g in m_range.groups())
        store(low, high)
        return

    # 2) Option A: two or more explicit temps -> min//max
    temps = [float(t) for t in re.findall(rf"{number}\s*{unit}", text_lc)]
    if len(temps) >= 2:
        store(min(temps), max(temps))
        return

    # 3) Single temps like "grows at 37 c"
    m_single = re.search(
        rf"(?:grows|growth|optimum|optimal)\s+(?:at\s+)?{number}\s*{unit}",
        text_lc,
    )
    if m_single:
        temp = float(m_single.group(1))
        store(temp, temp)
        return

    # 4) Simplified: "grows at 37" (no explicit °C). The lookaheads reject
    #    a truncated decimal and percentages such as "grows at 6.5% nacl".
    m_bare = re.search(r"grows at (\d+(?:\.\d+)?)(?!\.?\d)\b(?!\s*%)", text_lc)
    if m_bare:
        temp = float(m_bare.group(1))
        store(temp, temp)
        return

    # 5) Fallback: a lone "37c" somewhere in the text
    if temps:
        store(temps[0], temps[0])
487
+
488
+
489
+ # ------------------------------------------------------------
490
+ # Media grown on (coarse mapping)
491
+ # ------------------------------------------------------------
492
+
493
# Canonical medium name -> lower-case phrases that identify it in free text.
# Insertion order is the order in which media are reported.
MEDIA_KEYWORDS: Dict[str, List[str]] = {
    "Blood Agar": ["blood agar", "blood-agar"],
    "MacConkey Agar": ["macconkey agar", "mac conkey agar", "macconkey"],
    "Chocolate Agar": ["chocolate agar", "chocolate-agar"],
    "Nutrient Agar": ["nutrient agar", "nutrient-agar"],
    "XLD Agar": ["xld agar"],
    "TCBS Agar": ["tcbs agar", "tcbs"],
    "ALOA": ["aloa agar", "aloa"],
    "BCYE Agar": ["bcye agar", "bcye"],
    "MRS Agar": ["mrs agar"],
}
530
+
531
+
532
def _parse_media(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Record which known culture media are mentioned in ``text_lc``.

    Each MEDIA_KEYWORDS phrase must occur as a whole word or phrase (an
    optional plural "s" is allowed). Plain substring matching produced
    false hits such as "aloa" inside "haloalkaliphilic". Matching media
    are stored under "Media Grown On", joined with "; " in MEDIA_KEYWORDS
    order.

    Args:
        text_lc: Lower-cased description text.
        parsed: Field dict updated in place through ``_set_if_stronger``.
    """
    found_media: List[str] = [
        media_name
        for media_name, phrases in MEDIA_KEYWORDS.items()
        if any(
            re.search(rf"\b{re.escape(phrase)}s?\b", text_lc)
            for phrase in phrases
        )
    ]

    if found_media:
        _set_if_stronger(parsed, "Media Grown On", "; ".join(found_media))
541
+
542
+
543
+ # ------------------------------------------------------------
544
+ # Sugar fermentation parsing
545
+ # ------------------------------------------------------------
546
+
547
def _parse_sugars(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Parse sugar fermentation statements into the SUGAR_FIELDS columns.

    Handles patterns like:
      - "glucose positive, mannitol negative", "lactose-negative"
      - "ferments glucose, mannitol and sucrose but not lactose"
      - "does not ferment lactose" / "doesn't ferment lactose"
      - "non-lactose fermenter"
      - "glucose fermentation positive" / "positive for glucose fermentation"
      - global non-fermenter phrases, which mark every still-unset sugar
        as Negative

    Sugar names are matched on a word boundary so that, for example,
    "galactose negative" does not set the lactose field.

    Args:
        text_lc: Lower-cased description text.
        parsed: Field dict updated in place through ``_set_if_stronger``.
    """
    pnv = r"(positive|negative|variable|pos|neg|\+|\-)"
    # "does not", "does n't" and "doesn't"
    negation = r"does\s*(?:not|n't)"
    sugars = [(re.escape(key), field) for key, field in SUGAR_FIELDS.items()]

    # 0) Simple "glucose positive / negative" style
    for sugar, field in sugars:
        m_simple = re.search(rf"\b{sugar}(?:\s+|-){pnv}", text_lc)
        if m_simple:
            val = _value_from_pnv_context(m_simple.group(1))
            if val:
                _set_if_stronger(parsed, field, val)

    # 1) "ferments X, Y and Z but not A, B"
    for m in re.finditer(r"ferments\s+([a-z0-9 ,;/&\-]+)", text_lc):
        # Everything before "but not" is fermented, everything after is not.
        pos_part, _, neg_part = m.group(1).partition("but not")
        for sugar, field in sugars:
            if re.search(rf"\b{sugar}\b", pos_part):
                _set_if_stronger(parsed, field, "Positive")
        for sugar, field in sugars:
            if re.search(rf"\b{sugar}\b", neg_part):
                _set_if_stronger(parsed, field, "Negative")

    # 2) "does not ferment X"
    for sugar, field in sugars:
        if re.search(rf"{negation} ferment {sugar}\b", text_lc):
            _set_if_stronger(parsed, field, "Negative")

    # 3) "non-lactose fermenter"
    for sugar, field in sugars:
        if re.search(rf"non[- ]{sugar} ferment(ing|er)?", text_lc):
            _set_if_stronger(parsed, field, "Negative")

    # 4) "X fermentation positive/negative", then "positive for X fermentation"
    for sugar, field in sugars:
        for pattern in (
            rf"\b{sugar}\s+fermentation[ \-]?{pnv}",
            rf"{pnv}\s+(?:for\s+)?{sugar}\s+fermentation",
        ):
            m = re.search(pattern, text_lc)
            if m is None:
                continue
            val = _value_from_pnv_context(m.group(1))
            if val:
                _set_if_stronger(parsed, field, val)
                break

    # 5) Global non-fermenter patterns ("non-fermenter", "nonfermentative",
    #    "does not ferment sugars"): only fill sugars nothing else has set.
    if (
        re.search(rf"{negation} ferment (carbohydrates|sugars)", text_lc)
        or re.search(r"non[- ]?ferment(er|ing|ative)", text_lc)
    ):
        for field in SUGAR_FIELDS.values():
            if field not in parsed or parsed[field] == UNKNOWN:
                _set_if_stronger(parsed, field, "Negative")
629
+
630
+
631
+ # ------------------------------------------------------------
632
+ # Colony morphology (coarse, optional)
633
+ # ------------------------------------------------------------
634
+
635
def _format_colony_description(desc: str) -> str:
    """Turn "dry, white and irregular" into "Dry; White and irregular"."""
    parts = (part.strip() for part in re.split(r"[;,]", desc))
    return "; ".join(part.capitalize() for part in parts if part)


def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
    """
    Very coarse mapping for colony morphology.

    Patterns are tried in order and the first one that yields a non-empty
    description is stored under "Colony Morphology":
      1. "colonies are yellow, mucoid"
      2. "colonies dry, white and irregular on nutrient agar"
      3. "forming green colonies", "forms mucoid colonies"

    For patterns 1 and 2 the description runs up to " on <medium>", the
    first character outside ``[a-z0-9 ,;-]`` (e.g. the full stop) or the
    end of the text; commas are kept, because the description is a
    comma-separated list. Pattern 2 does not fire on "colonies on ...",
    so "forming green colonies on blood agar" is left to pattern 3.

    Args:
        text_lc: Lower-cased description text.
        parsed: Field dict updated in place through ``_set_if_stronger``.
    """
    words = r"[a-z0-9 ,;\-]+?"
    stop = r"(?=\s+on\b|[^a-z0-9 ,;\-]|$)"
    patterns = (
        # Pattern 1: "colonies are ..."
        rf"colon(?:y|ies) (?:are|is)\s+({words}){stop}",
        # Pattern 2: "colonies dry, white and irregular on nutrient agar"
        rf"colonies\s+(?!on\b)({words}){stop}",
        # Pattern 3: "forming green colonies", "forms mucoid colonies"
        rf"(?:forming|forms|produces)\s+({words})\s+colonies",
    )

    for pattern in patterns:
        m = re.search(pattern, text_lc)
        if m is None:
            continue
        pretty = _format_colony_description(m.group(1))
        if pretty:
            _set_if_stronger(parsed, "Colony Morphology", pretty)
            return
682
+
683
+
684
+ # ------------------------------------------------------------
685
+ # PUBLIC API
686
+ # ------------------------------------------------------------
687
+
688
def parse_text_rules(text: str) -> Dict[str, Any]:
    """
    Main entry point: run every rule-based sub-parser over ``text``.

    Returns a dict with "parsed_fields", "source" ("rule_parser") and
    "raw" (the original text, or "" when ``text`` is falsy). If a
    sub-parser raises, the fields collected so far are still returned and
    an "error" entry of the form "<ExceptionType>: <message>" is added.
    """
    original = text or ""
    text_lc = _clean_text(original).lower()

    parsed: Dict[str, str] = {}
    result: Dict[str, Any] = {
        "parsed_fields": parsed,
        "source": "rule_parser",
        "raw": original,
    }

    try:
        # Sub-parsers run in a fixed order and all write into ``parsed``.
        _parse_gram_and_shape(text_lc, parsed)
        _parse_haemolysis(text_lc, parsed)
        _parse_core_bool_tests(text_lc, parsed)
        _parse_motility_capsule_spores(text_lc, parsed)
        _parse_oxygen(text_lc, parsed)
        _parse_growth_temperature(text_lc, parsed)
        _parse_media(text_lc, parsed)
        _parse_sugars(text_lc, parsed)
        _parse_colony(text_lc, parsed)
    except Exception as exc:
        # Fail-safe: never crash the app, just report an error
        result["error"] = f"{type(exc).__name__}: {exc}"

    return result