EphAsad committed on
Commit
0f6879b
·
verified ·
1 Parent(s): f16e112

Update engine/parser_rules.py

Browse files
Files changed (1) hide show
  1. engine/parser_rules.py +124 -53
engine/parser_rules.py CHANGED
@@ -1185,79 +1185,152 @@ def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
1185
  _set_if_stronger(parsed, "Colony Morphology", pretty)
1186
  return
1187
 
1188
- # ============================================================
1189
- # PARSER_PATCH_v1
1190
- # Extends missing behaviours without modifying core parser.
1191
- # This patch is applied AFTER all existing parsing logic.
1192
- #
1193
- # Adds:
1194
- # • Haemolysis ± without type ("haemolysis positive")
1195
- # • Motility ± phrases ("motility positive")
1196
- # • Spore Formation ± ("spore formation negative")
1197
- # • NaCl tolerance direct ± ("NaCl tolerant positive")
1198
- # • Growth Temperature parsing of:
1199
- # "20/40", "20//40", "20 / 40"
1200
- # • Multi-media extraction:
1201
- # "Media grown on: Blood Agar; MSA; MacConkey Agar"
1202
- # → merges with existing parsed value WITHOUT overwriting
1203
- # ============================================================
1204
-
1205
  def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) -> Dict[str, str]:
1206
  # ----------------------------------------------
1207
- # 1. Haemolysis: generic ± without type
1208
  # ----------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1209
  m_h = re.search(r"haemolysis\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
1210
  if m_h and "Haemolysis" not in parsed:
1211
- val = _value_from_pnv_context(m_h.group(1))
1212
  if val:
1213
  parsed["Haemolysis"] = val
 
 
1214
 
1215
- # ----------------------------------------------
1216
  # 2. Motility: generic ±
1217
- # ----------------------------------------------
1218
  m_mot = re.search(r"motility\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
1219
  if m_mot and "Motility" not in parsed:
1220
- val = _value_from_pnv_context(m_mot.group(1))
1221
  if val:
1222
  parsed["Motility"] = val
1223
 
1224
- # ----------------------------------------------
1225
  # 3. Spore formation ±
1226
- # ----------------------------------------------
1227
  m_sp = re.search(r"spore formation\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
1228
  if m_sp and parsed.get("Spore Formation", UNKNOWN) == UNKNOWN:
1229
- val = _value_from_pnv_context(m_sp.group(1))
1230
  if val:
1231
  parsed["Spore Formation"] = val
1232
 
1233
- # ----------------------------------------------
1234
- # 4. NaCl tolerance ± (simple)
1235
- # ----------------------------------------------
1236
- m_nacl = re.search(r"nacl(?:\s+tolerant)?\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
1237
- if m_nacl and parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:
1238
- val = _value_from_pnv_context(m_nacl.group(1))
1239
- if val:
1240
- parsed["NaCl Tolerant (>=6%)"] = val
1241
 
1242
- # ----------------------------------------------
1243
- # 5. Growth Temperature patterns like:
1244
- # "20/40", "20 / 40", "20//40"
1245
- # ----------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1246
  m_temp = re.search(r"\b(\d{1,3})\s*[/]{1,2}\s*(\d{1,3})\b", text_lc)
1247
- if m_temp:
1248
- low, high = m_temp.group(1), m_temp.group(2)
1249
- # Only set if no better structured temp already present
1250
- if parsed.get("Growth Temperature", UNKNOWN) == UNKNOWN:
1251
- parsed["Growth Temperature"] = f"{low}//{high}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1252
 
1253
- # ----------------------------------------------
1254
- # 6. Multi-media patch:
1255
- # extract all media separated by ',' or ';'
1256
- # preserve order from text
1257
- # merge with parsed["Media Grown On"]
1258
- # ----------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
1259
  if "media grown on" in text_lc or "grown on" in text_lc:
1260
- # capture segment after "grown on"
1261
  mm = re.search(r"(?:media\s+grown\s+on|grown\s+on)[: ]+([a-z0-9 ,;/\-]+)", text_lc)
1262
  if mm:
1263
  segment = mm.group(1)
@@ -1273,11 +1346,8 @@ def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) ->
1273
 
1274
  if detected_media:
1275
  existing = parsed.get("Media Grown On", "")
1276
- existing_list = []
1277
- if existing:
1278
- existing_list = [x.strip() for x in existing.split(";")]
1279
 
1280
- # Merge while preserving order from the text
1281
  merged = []
1282
  for m in existing_list:
1283
  if m not in merged:
@@ -1289,6 +1359,7 @@ def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) ->
1289
  parsed["Media Grown On"] = "; ".join(merged)
1290
 
1291
  return parsed
 
1292
  # ------------------------------------------------------------
1293
  # PUBLIC API
1294
  # ------------------------------------------------------------
 
1185
  _set_if_stronger(parsed, "Colony Morphology", pretty)
1186
  return
1187
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1188
  def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) -> Dict[str, str]:
1189
  # ----------------------------------------------
1190
+ # helper for P/N/V
1191
  # ----------------------------------------------
1192
+ def _pnv(x: str) -> Optional[str]:
1193
+ x = x.strip().lower()
1194
+ if x in {"positive", "pos", "+", "strongly positive", "weakly positive"}:
1195
+ return "Positive"
1196
+ if x in {"negative", "neg", "-", "no"}:
1197
+ return "Negative"
1198
+ if x in {"variable", "var", "mixed"}:
1199
+ return "Variable"
1200
+ return None
1201
+
1202
+ # ============================================================
1203
+ # NEW LOGIC: Haemolysis Type detection (alpha/beta/none)
1204
+ # ============================================================
1205
+
1206
+ # alpha
1207
+ m_alpha = re.search(r"(alpha|α)[-\s]*haemolysis", text_lc) or \
1208
+ re.search(r"haemolysis type[: ]*(alpha|α)", text_lc)
1209
+ if m_alpha:
1210
+ if parsed.get("Haemolysis", UNKNOWN) == UNKNOWN:
1211
+ parsed["Haemolysis"] = "Positive"
1212
+ if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN:
1213
+ parsed["Haemolysis Type"] = "Alpha"
1214
+
1215
+ # beta
1216
+ m_beta = re.search(r"(beta|β)[-\s]*haemolysis", text_lc) or \
1217
+ re.search(r"haemolysis type[: ]*(beta|β)", text_lc)
1218
+ if m_beta:
1219
+ if parsed.get("Haemolysis", UNKNOWN) == UNKNOWN:
1220
+ parsed["Haemolysis"] = "Positive"
1221
+ if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN:
1222
+ parsed["Haemolysis Type"] = "Beta"
1223
+
1224
+ # gamma / none
1225
+ m_gamma = re.search(r"(gamma|γ)[-\s]*haemolysis", text_lc)
1226
+ m_none = re.search(r"(no haemolysis|non[- ]haemolytic|no hemolysis|non[- ]hemolytic)", text_lc)
1227
+ if m_gamma or m_none:
1228
+ if parsed.get("Haemolysis", UNKNOWN) == UNKNOWN:
1229
+ parsed["Haemolysis"] = "Negative"
1230
+ if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN:
1231
+ parsed["Haemolysis Type"] = "None"
1232
+
1233
+ # ============================================================
1234
+ # ORIGINAL PATCH v1 LOGIC (fully preserved)
1235
+ # ============================================================
1236
+
1237
+ # 1. Haemolysis: generic ± without type
1238
  m_h = re.search(r"haemolysis\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
1239
  if m_h and "Haemolysis" not in parsed:
1240
+ val = _pnv(m_h.group(1))
1241
  if val:
1242
  parsed["Haemolysis"] = val
1243
+ if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN and val == "Positive":
1244
+ parsed["Haemolysis Type"] = "Unknown"
1245
 
 
1246
  # 2. Motility: generic ±
 
1247
  m_mot = re.search(r"motility\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
1248
  if m_mot and "Motility" not in parsed:
1249
+ val = _pnv(m_mot.group(1))
1250
  if val:
1251
  parsed["Motility"] = val
1252
 
 
1253
  # 3. Spore formation ±
 
1254
  m_sp = re.search(r"spore formation\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
1255
  if m_sp and parsed.get("Spore Formation", UNKNOWN) == UNKNOWN:
1256
+ val = _pnv(m_sp.group(1))
1257
  if val:
1258
  parsed["Spore Formation"] = val
1259
 
1260
+ # ============================================================
1261
+ # FIXED NaCl tolerant logic (patch upgrade)
1262
+ # ============================================================
1263
+ if parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:
 
 
 
 
1264
 
1265
+ # direct p/n/v
1266
+ m_nacl = re.search(
1267
+ r"(?:nacl\s*(?:tolerant|tolerance)?|growth\s+in\s+6\%[\s]*nacl)"
1268
+ r"\s*(positive|negative|variable|pos|neg|\+|\-)",
1269
+ text_lc
1270
+ )
1271
+ if m_nacl:
1272
+ val = _pnv(m_nacl.group(1))
1273
+ if val:
1274
+ parsed["NaCl Tolerant (>=6%)"] = val
1275
+
1276
+ # "no growth in 6% nacl"
1277
+ if parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:
1278
+ if re.search(r"no\s+growth\s+in\s+(?:>=)?\s*6\%?\s*nacl", text_lc):
1279
+ parsed["NaCl Tolerant (>=6%)"] = "Negative"
1280
+
1281
+ # "grows in 6% nacl"
1282
+ if parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:
1283
+ if re.search(r"grows?\s+in\s+(?:>=)?\s*6\%?\s*nacl", text_lc):
1284
+ parsed["NaCl Tolerant (>=6%)"] = "Positive"
1285
+
1286
+ # ============================================================
1287
+ # Growth Temperature patterns (20/40, 20//40, 20 / 40)
1288
+ # ============================================================
1289
  m_temp = re.search(r"\b(\d{1,3})\s*[/]{1,2}\s*(\d{1,3})\b", text_lc)
1290
+ if m_temp and parsed.get("Growth Temperature", UNKNOWN) == UNKNOWN:
1291
+ parsed["Growth Temperature"] = f"{m_temp.group(1)}//{m_temp.group(2)}"
1292
+
1293
+ # ============================================================
1294
+ # Colony Morphology STRICT LIST extraction
1295
+ # ============================================================
1296
+ COLONY_TRIGGERS = [
1297
+ "colony morphology",
1298
+ "colonies are",
1299
+ "colonies appear",
1300
+ "colonies look",
1301
+ "colony appearance",
1302
+ "colony characteristics",
1303
+ ]
1304
+ if any(t in text_lc for t in COLONY_TRIGGERS):
1305
+ m_col = re.search(
1306
+ r"(?:colony morphology|colonies are|colonies appear|colonies look|colony appearance|colony characteristics)"
1307
+ r"[: ]+([a-z0-9 ,;/\-]+)",
1308
+ text_lc
1309
+ )
1310
+ if m_col:
1311
+ segment = m_col.group(1)
1312
+ parts = [x.strip() for x in re.split(r"[;,/]", segment) if x.strip()]
1313
 
1314
+ clean_desc = [p.capitalize() for p in parts if len(p) > 1]
1315
+
1316
+ if clean_desc:
1317
+ existing = parsed.get("Colony Morphology", "")
1318
+ existing_list = [x.strip() for x in existing.split(";")] if existing else []
1319
+
1320
+ merged = []
1321
+ for x in existing_list:
1322
+ if x not in merged:
1323
+ merged.append(x)
1324
+ for x in clean_desc:
1325
+ if x not in merged:
1326
+ merged.append(x)
1327
+
1328
+ parsed["Colony Morphology"] = "; ".join(merged)
1329
+
1330
+ # ============================================================
1331
+ # ORIGINAL MULTI-MEDIA PATCH (unchanged)
1332
+ # ============================================================
1333
  if "media grown on" in text_lc or "grown on" in text_lc:
 
1334
  mm = re.search(r"(?:media\s+grown\s+on|grown\s+on)[: ]+([a-z0-9 ,;/\-]+)", text_lc)
1335
  if mm:
1336
  segment = mm.group(1)
 
1346
 
1347
  if detected_media:
1348
  existing = parsed.get("Media Grown On", "")
1349
+ existing_list = [x.strip() for x in existing.split(";")] if existing else []
 
 
1350
 
 
1351
  merged = []
1352
  for m in existing_list:
1353
  if m not in merged:
 
1359
  parsed["Media Grown On"] = "; ".join(merged)
1360
 
1361
  return parsed
1362
+
1363
  # ------------------------------------------------------------
1364
  # PUBLIC API
1365
  # ------------------------------------------------------------