Update engine/parser_rules.py
Browse files- engine/parser_rules.py +124 -53
engine/parser_rules.py
CHANGED
|
@@ -1185,79 +1185,152 @@ def _parse_colony(text_lc: str, parsed: Dict[str, str]) -> None:
|
|
| 1185 |
_set_if_stronger(parsed, "Colony Morphology", pretty)
|
| 1186 |
return
|
| 1187 |
|
| 1188 |
-
# ============================================================
|
| 1189 |
-
# PARSER_PATCH_v1
|
| 1190 |
-
# Extends missing behaviours without modifying core parser.
|
| 1191 |
-
# This patch is applied AFTER all existing parsing logic.
|
| 1192 |
-
#
|
| 1193 |
-
# Adds:
|
| 1194 |
-
# • Haemolysis ± without type ("haemolysis positive")
|
| 1195 |
-
# • Motility ± phrases ("motility positive")
|
| 1196 |
-
# • Spore Formation ± ("spore formation negative")
|
| 1197 |
-
# • NaCl tolerance direct ± ("NaCl tolerant positive")
|
| 1198 |
-
# • Growth Temperature parsing of:
|
| 1199 |
-
# "20/40", "20//40", "20 / 40"
|
| 1200 |
-
# • Multi-media extraction:
|
| 1201 |
-
# "Media grown on: Blood Agar; MSA; MacConkey Agar"
|
| 1202 |
-
# → merges with existing parsed value WITHOUT overwriting
|
| 1203 |
-
# ============================================================
|
| 1204 |
-
|
| 1205 |
def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) -> Dict[str, str]:
|
| 1206 |
# ----------------------------------------------
|
| 1207 |
-
#
|
| 1208 |
# ----------------------------------------------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1209 |
m_h = re.search(r"haemolysis\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
|
| 1210 |
if m_h and "Haemolysis" not in parsed:
|
| 1211 |
-
val =
|
| 1212 |
if val:
|
| 1213 |
parsed["Haemolysis"] = val
|
|
|
|
|
|
|
| 1214 |
|
| 1215 |
-
# ----------------------------------------------
|
| 1216 |
# 2. Motility: generic ±
|
| 1217 |
-
# ----------------------------------------------
|
| 1218 |
m_mot = re.search(r"motility\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
|
| 1219 |
if m_mot and "Motility" not in parsed:
|
| 1220 |
-
val =
|
| 1221 |
if val:
|
| 1222 |
parsed["Motility"] = val
|
| 1223 |
|
| 1224 |
-
# ----------------------------------------------
|
| 1225 |
# 3. Spore formation ±
|
| 1226 |
-
# ----------------------------------------------
|
| 1227 |
m_sp = re.search(r"spore formation\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
|
| 1228 |
if m_sp and parsed.get("Spore Formation", UNKNOWN) == UNKNOWN:
|
| 1229 |
-
val =
|
| 1230 |
if val:
|
| 1231 |
parsed["Spore Formation"] = val
|
| 1232 |
|
| 1233 |
-
#
|
| 1234 |
-
#
|
| 1235 |
-
#
|
| 1236 |
-
|
| 1237 |
-
if m_nacl and parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:
|
| 1238 |
-
val = _value_from_pnv_context(m_nacl.group(1))
|
| 1239 |
-
if val:
|
| 1240 |
-
parsed["NaCl Tolerant (>=6%)"] = val
|
| 1241 |
|
| 1242 |
-
|
| 1243 |
-
|
| 1244 |
-
|
| 1245 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1246 |
m_temp = re.search(r"\b(\d{1,3})\s*[/]{1,2}\s*(\d{1,3})\b", text_lc)
|
| 1247 |
-
if m_temp:
|
| 1248 |
-
|
| 1249 |
-
|
| 1250 |
-
|
| 1251 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1252 |
|
| 1253 |
-
|
| 1254 |
-
|
| 1255 |
-
|
| 1256 |
-
|
| 1257 |
-
|
| 1258 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1259 |
if "media grown on" in text_lc or "grown on" in text_lc:
|
| 1260 |
-
# capture segment after "grown on"
|
| 1261 |
mm = re.search(r"(?:media\s+grown\s+on|grown\s+on)[: ]+([a-z0-9 ,;/\-]+)", text_lc)
|
| 1262 |
if mm:
|
| 1263 |
segment = mm.group(1)
|
|
@@ -1273,11 +1346,8 @@ def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) ->
|
|
| 1273 |
|
| 1274 |
if detected_media:
|
| 1275 |
existing = parsed.get("Media Grown On", "")
|
| 1276 |
-
existing_list = []
|
| 1277 |
-
if existing:
|
| 1278 |
-
existing_list = [x.strip() for x in existing.split(";")]
|
| 1279 |
|
| 1280 |
-
# Merge while preserving order from the text
|
| 1281 |
merged = []
|
| 1282 |
for m in existing_list:
|
| 1283 |
if m not in merged:
|
|
@@ -1289,6 +1359,7 @@ def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) ->
|
|
| 1289 |
parsed["Media Grown On"] = "; ".join(merged)
|
| 1290 |
|
| 1291 |
return parsed
|
|
|
|
| 1292 |
# ------------------------------------------------------------
|
| 1293 |
# PUBLIC API
|
| 1294 |
# ------------------------------------------------------------
|
|
|
|
| 1185 |
_set_if_stronger(parsed, "Colony Morphology", pretty)
|
| 1186 |
return
|
| 1187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1188 |
def _apply_patches(original_text: str, text_lc: str, parsed: Dict[str, str]) -> Dict[str, str]:
|
| 1189 |
# ----------------------------------------------
|
| 1190 |
+
# helper for P/N/V
|
| 1191 |
# ----------------------------------------------
|
| 1192 |
+
def _pnv(x: str) -> Optional[str]:
|
| 1193 |
+
x = x.strip().lower()
|
| 1194 |
+
if x in {"positive", "pos", "+", "strongly positive", "weakly positive"}:
|
| 1195 |
+
return "Positive"
|
| 1196 |
+
if x in {"negative", "neg", "-", "no"}:
|
| 1197 |
+
return "Negative"
|
| 1198 |
+
if x in {"variable", "var", "mixed"}:
|
| 1199 |
+
return "Variable"
|
| 1200 |
+
return None
|
| 1201 |
+
|
| 1202 |
+
# ============================================================
|
| 1203 |
+
# NEW LOGIC: Haemolysis Type detection (alpha/beta/none)
|
| 1204 |
+
# ============================================================
|
| 1205 |
+
|
| 1206 |
+
# alpha
|
| 1207 |
+
m_alpha = re.search(r"(alpha|α)[-\s]*haemolysis", text_lc) or \
|
| 1208 |
+
re.search(r"haemolysis type[: ]*(alpha|α)", text_lc)
|
| 1209 |
+
if m_alpha:
|
| 1210 |
+
if parsed.get("Haemolysis", UNKNOWN) == UNKNOWN:
|
| 1211 |
+
parsed["Haemolysis"] = "Positive"
|
| 1212 |
+
if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN:
|
| 1213 |
+
parsed["Haemolysis Type"] = "Alpha"
|
| 1214 |
+
|
| 1215 |
+
# beta
|
| 1216 |
+
m_beta = re.search(r"(beta|β)[-\s]*haemolysis", text_lc) or \
|
| 1217 |
+
re.search(r"haemolysis type[: ]*(beta|β)", text_lc)
|
| 1218 |
+
if m_beta:
|
| 1219 |
+
if parsed.get("Haemolysis", UNKNOWN) == UNKNOWN:
|
| 1220 |
+
parsed["Haemolysis"] = "Positive"
|
| 1221 |
+
if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN:
|
| 1222 |
+
parsed["Haemolysis Type"] = "Beta"
|
| 1223 |
+
|
| 1224 |
+
# gamma / none
|
| 1225 |
+
m_gamma = re.search(r"(gamma|γ)[-\s]*haemolysis", text_lc)
|
| 1226 |
+
m_none = re.search(r"(no haemolysis|non[- ]haemolytic|no hemolysis|non[- ]hemolytic)", text_lc)
|
| 1227 |
+
if m_gamma or m_none:
|
| 1228 |
+
if parsed.get("Haemolysis", UNKNOWN) == UNKNOWN:
|
| 1229 |
+
parsed["Haemolysis"] = "Negative"
|
| 1230 |
+
if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN:
|
| 1231 |
+
parsed["Haemolysis Type"] = "None"
|
| 1232 |
+
|
| 1233 |
+
# ============================================================
|
| 1234 |
+
# ORIGINAL PATCH v1 LOGIC (fully preserved)
|
| 1235 |
+
# ============================================================
|
| 1236 |
+
|
| 1237 |
+
# 1. Haemolysis: generic ± without type
|
| 1238 |
m_h = re.search(r"haemolysis\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
|
| 1239 |
if m_h and "Haemolysis" not in parsed:
|
| 1240 |
+
val = _pnv(m_h.group(1))
|
| 1241 |
if val:
|
| 1242 |
parsed["Haemolysis"] = val
|
| 1243 |
+
if parsed.get("Haemolysis Type", UNKNOWN) == UNKNOWN and val == "Positive":
|
| 1244 |
+
parsed["Haemolysis Type"] = "Unknown"
|
| 1245 |
|
|
|
|
| 1246 |
# 2. Motility: generic ±
|
|
|
|
| 1247 |
m_mot = re.search(r"motility\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
|
| 1248 |
if m_mot and "Motility" not in parsed:
|
| 1249 |
+
val = _pnv(m_mot.group(1))
|
| 1250 |
if val:
|
| 1251 |
parsed["Motility"] = val
|
| 1252 |
|
|
|
|
| 1253 |
# 3. Spore formation ±
|
|
|
|
| 1254 |
m_sp = re.search(r"spore formation\s+(positive|negative|variable|pos|neg|\+|\-)", text_lc)
|
| 1255 |
if m_sp and parsed.get("Spore Formation", UNKNOWN) == UNKNOWN:
|
| 1256 |
+
val = _pnv(m_sp.group(1))
|
| 1257 |
if val:
|
| 1258 |
parsed["Spore Formation"] = val
|
| 1259 |
|
| 1260 |
+
# ============================================================
|
| 1261 |
+
# FIXED NaCl tolerant logic (patch upgrade)
|
| 1262 |
+
# ============================================================
|
| 1263 |
+
if parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1264 |
|
| 1265 |
+
# direct p/n/v
|
| 1266 |
+
m_nacl = re.search(
|
| 1267 |
+
r"(?:nacl\s*(?:tolerant|tolerance)?|growth\s+in\s+6\%[\s]*nacl)"
|
| 1268 |
+
r"\s*(positive|negative|variable|pos|neg|\+|\-)",
|
| 1269 |
+
text_lc
|
| 1270 |
+
)
|
| 1271 |
+
if m_nacl:
|
| 1272 |
+
val = _pnv(m_nacl.group(1))
|
| 1273 |
+
if val:
|
| 1274 |
+
parsed["NaCl Tolerant (>=6%)"] = val
|
| 1275 |
+
|
| 1276 |
+
# "no growth in 6% nacl"
|
| 1277 |
+
if parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:
|
| 1278 |
+
if re.search(r"no\s+growth\s+in\s+(?:>=)?\s*6\%?\s*nacl", text_lc):
|
| 1279 |
+
parsed["NaCl Tolerant (>=6%)"] = "Negative"
|
| 1280 |
+
|
| 1281 |
+
# "grows in 6% nacl"
|
| 1282 |
+
if parsed.get("NaCl Tolerant (>=6%)", UNKNOWN) == UNKNOWN:
|
| 1283 |
+
if re.search(r"grows?\s+in\s+(?:>=)?\s*6\%?\s*nacl", text_lc):
|
| 1284 |
+
parsed["NaCl Tolerant (>=6%)"] = "Positive"
|
| 1285 |
+
|
| 1286 |
+
# ============================================================
|
| 1287 |
+
# Growth Temperature patterns (20/40, 20//40, 20 / 40)
|
| 1288 |
+
# ============================================================
|
| 1289 |
m_temp = re.search(r"\b(\d{1,3})\s*[/]{1,2}\s*(\d{1,3})\b", text_lc)
|
| 1290 |
+
if m_temp and parsed.get("Growth Temperature", UNKNOWN) == UNKNOWN:
|
| 1291 |
+
parsed["Growth Temperature"] = f"{m_temp.group(1)}//{m_temp.group(2)}"
|
| 1292 |
+
|
| 1293 |
+
# ============================================================
|
| 1294 |
+
# Colony Morphology STRICT LIST extraction
|
| 1295 |
+
# ============================================================
|
| 1296 |
+
COLONY_TRIGGERS = [
|
| 1297 |
+
"colony morphology",
|
| 1298 |
+
"colonies are",
|
| 1299 |
+
"colonies appear",
|
| 1300 |
+
"colonies look",
|
| 1301 |
+
"colony appearance",
|
| 1302 |
+
"colony characteristics",
|
| 1303 |
+
]
|
| 1304 |
+
if any(t in text_lc for t in COLONY_TRIGGERS):
|
| 1305 |
+
m_col = re.search(
|
| 1306 |
+
r"(?:colony morphology|colonies are|colonies appear|colonies look|colony appearance|colony characteristics)"
|
| 1307 |
+
r"[: ]+([a-z0-9 ,;/\-]+)",
|
| 1308 |
+
text_lc
|
| 1309 |
+
)
|
| 1310 |
+
if m_col:
|
| 1311 |
+
segment = m_col.group(1)
|
| 1312 |
+
parts = [x.strip() for x in re.split(r"[;,/]", segment) if x.strip()]
|
| 1313 |
|
| 1314 |
+
clean_desc = [p.capitalize() for p in parts if len(p) > 1]
|
| 1315 |
+
|
| 1316 |
+
if clean_desc:
|
| 1317 |
+
existing = parsed.get("Colony Morphology", "")
|
| 1318 |
+
existing_list = [x.strip() for x in existing.split(";")] if existing else []
|
| 1319 |
+
|
| 1320 |
+
merged = []
|
| 1321 |
+
for x in existing_list:
|
| 1322 |
+
if x not in merged:
|
| 1323 |
+
merged.append(x)
|
| 1324 |
+
for x in clean_desc:
|
| 1325 |
+
if x not in merged:
|
| 1326 |
+
merged.append(x)
|
| 1327 |
+
|
| 1328 |
+
parsed["Colony Morphology"] = "; ".join(merged)
|
| 1329 |
+
|
| 1330 |
+
# ============================================================
|
| 1331 |
+
# ORIGINAL MULTI-MEDIA PATCH (unchanged)
|
| 1332 |
+
# ============================================================
|
| 1333 |
if "media grown on" in text_lc or "grown on" in text_lc:
|
|
|
|
| 1334 |
mm = re.search(r"(?:media\s+grown\s+on|grown\s+on)[: ]+([a-z0-9 ,;/\-]+)", text_lc)
|
| 1335 |
if mm:
|
| 1336 |
segment = mm.group(1)
|
|
|
|
| 1346 |
|
| 1347 |
if detected_media:
|
| 1348 |
existing = parsed.get("Media Grown On", "")
|
| 1349 |
+
existing_list = [x.strip() for x in existing.split(";")] if existing else []
|
|
|
|
|
|
|
| 1350 |
|
|
|
|
| 1351 |
merged = []
|
| 1352 |
for m in existing_list:
|
| 1353 |
if m not in merged:
|
|
|
|
| 1359 |
parsed["Media Grown On"] = "; ".join(merged)
|
| 1360 |
|
| 1361 |
return parsed
|
| 1362 |
+
|
| 1363 |
# ------------------------------------------------------------
|
| 1364 |
# PUBLIC API
|
| 1365 |
# ------------------------------------------------------------
|