EphAsad commited on
Commit
1168cd6
·
verified ·
1 Parent(s): 6d84739

Upload 23 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/bacteria_db.xlsx filter=lfs diff=lfs merge=lfs -text
data/alias_maps.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "field_aliases": {
3
+ "Dnase": "DNase",
4
+ "CAMP Test": "CAMP",
5
+ "Optochin Sensitivity": "Optochin",
6
+ "Bile Solubility Test": "Bile Solubility",
7
+ "Hippurate": "Hippurate Hydrolysis",
8
+ "PYR Test": "PYR"
9
+ },
10
+ "media_aliases": {
11
+ "mac": "MacConkey Agar",
12
+ "macconkey": "MacConkey Agar",
13
+ "msa": "Mannitol Salt Agar",
14
+ "bap": "Blood Agar",
15
+ "choc": "Chocolate Agar",
16
+ "chocolate": "Chocolate Agar",
17
+ "cled": "CLED Agar"
18
+ },
19
+ "value_aliases_pnv": {
20
+ "+": "Positive",
21
+ "pos": "Positive",
22
+ "positive": "Positive",
23
+ "-": "Negative",
24
+ "neg": "Negative",
25
+ "negative": "Negative",
26
+ "variable": "Variable",
27
+ "var": "Variable"
28
+ },
29
+ "Motility": {
30
+ "positive": "positive"
31
+ }
32
+ }
data/bacteria_db.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c19c78ad2851aaa77f55f8f191748defa5e4a0654a11c9e5132f5b086d3c4543
3
+ size 2687947
data/extended_schema.json ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "CAMP": {
3
+ "value_type": "enum_PNV",
4
+ "status": "experimental",
5
+ "aliases": [
6
+ "CAMP Test"
7
+ ]
8
+ },
9
+ "Hippurate Hydrolysis": {
10
+ "value_type": "enum_PNV",
11
+ "status": "experimental",
12
+ "aliases": []
13
+ },
14
+ "PYR": {
15
+ "value_type": "enum_PNV",
16
+ "status": "experimental",
17
+ "aliases": []
18
+ },
19
+ "Optochin": {
20
+ "value_type": "enum_PNV",
21
+ "status": "experimental",
22
+ "aliases": [
23
+ "Optochin Sensitivity"
24
+ ]
25
+ },
26
+ "Bile Solubility": {
27
+ "value_type": "enum_PNV",
28
+ "status": "experimental",
29
+ "aliases": []
30
+ },
31
+ "Novobiocin": {
32
+ "value_type": "enum_PNV",
33
+ "status": "experimental",
34
+ "aliases": []
35
+ },
36
+ "Bile Resistance": {
37
+ "value_type": "enum_PNV",
38
+ "status": "experimental",
39
+ "aliases": []
40
+ },
41
+ "Lipase": {
42
+ "value_type": "enum_PNV",
43
+ "status": "experimental",
44
+ "aliases": []
45
+ },
46
+ "Lecithinase": {
47
+ "value_type": "enum_PNV",
48
+ "status": "experimental",
49
+ "aliases": []
50
+ },
51
+ "Odour": {
52
+ "value_type": "enum_PNV",
53
+ "status": "experimental",
54
+ "aliases": []
55
+ },
56
+ "Growth Factors": {
57
+ "value_type": "enum_PNV",
58
+ "status": "experimental",
59
+ "aliases": []
60
+ },
61
+ "Fructose Fermentation": {
62
+ "value_type": "enum_PNV",
63
+ "status": "experimental",
64
+ "aliases": []
65
+ },
66
+ "Glucose Oxidation": {
67
+ "value_type": "enum_PNV",
68
+ "status": "experimental",
69
+ "aliases": []
70
+ },
71
+ "Glycerol Fermentation": {
72
+ "value_type": "enum_PNV",
73
+ "status": "experimental",
74
+ "aliases": []
75
+ },
76
+ "Fermentation Products": {
77
+ "value_type": "enum_PNV",
78
+ "status": "experimental",
79
+ "aliases": []
80
+ },
81
+ "Cellobiose Fermentation": {
82
+ "value_type": "enum_PNV",
83
+ "status": "experimental",
84
+ "aliases": []
85
+ },
86
+ "pH Range": {
87
+ "value_type": "enum_PNV",
88
+ "status": "experimental",
89
+ "aliases": []
90
+ },
91
+ "Iron Oxidation": {
92
+ "value_type": "enum_PNV",
93
+ "status": "experimental",
94
+ "aliases": []
95
+ },
96
+ "NaCl Tolerant (>=15%)": {
97
+ "value_type": "enum_PNV",
98
+ "status": "experimental",
99
+ "aliases": []
100
+ },
101
+ "Temperature Dependence": {
102
+ "value_type": "enum_PNV",
103
+ "status": "experimental",
104
+ "aliases": []
105
+ },
106
+ "Sulfur Utilization": {
107
+ "value_type": "enum_PNV",
108
+ "status": "experimental",
109
+ "aliases": []
110
+ },
111
+ "Acid Fast": {
112
+ "value_type": "enum_PNV",
113
+ "status": "experimental",
114
+ "aliases": []
115
+ },
116
+ "Casein Hydrolysis": {
117
+ "value_type": "enum_PNV",
118
+ "status": "experimental",
119
+ "aliases": []
120
+ },
121
+ "Tyrosine Hydrolysis": {
122
+ "value_type": "enum_PNV",
123
+ "status": "experimental",
124
+ "aliases": []
125
+ },
126
+ "Mannose Fermentation": {
127
+ "value_type": "enum_PNV",
128
+ "status": "experimental",
129
+ "aliases": []
130
+ },
131
+ "Gas Production": {
132
+ "value_type": "enum_PNV",
133
+ "status": "experimental",
134
+ "aliases": []
135
+ },
136
+ "Inulin Fermentation": {
137
+ "value_type": "enum_PNV",
138
+ "status": "experimental",
139
+ "aliases": []
140
+ },
141
+ "Other Products": {
142
+ "value_type": "enum_PNV",
143
+ "status": "experimental",
144
+ "aliases": []
145
+ },
146
+ "Antibiotic Resistance": {
147
+ "value_type": "enum_PNV",
148
+ "status": "experimental",
149
+ "aliases": []
150
+ },
151
+ "Metabolic Product": {
152
+ "value_type": "enum_PNV",
153
+ "status": "experimental",
154
+ "aliases": []
155
+ },
156
+ "Bacitracin": {
157
+ "value_type": "enum_PNV",
158
+ "status": "experimental",
159
+ "aliases": []
160
+ }
161
+ }
data/signals_catalog.json ADDED
@@ -0,0 +1,1012 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Staphylococcus": {
3
+ "Indole": {
4
+ "Positive": 0,
5
+ "Negative": 18,
6
+ "Variable": 0,
7
+ "_n": 18
8
+ },
9
+ "Novobiocin": {
10
+ "Positive": 4,
11
+ "Negative": 2,
12
+ "Variable": 0,
13
+ "_n": 6
14
+ },
15
+ "Antibiotic Resistance": {
16
+ "Positive": 0,
17
+ "Negative": 0,
18
+ "Variable": 0,
19
+ "_n": 0
20
+ },
21
+ "CAMP": {
22
+ "Positive": 0,
23
+ "Negative": 6,
24
+ "Variable": 0,
25
+ "_n": 6
26
+ },
27
+ "PYR": {
28
+ "Positive": 0,
29
+ "Negative": 6,
30
+ "Variable": 0,
31
+ "_n": 6
32
+ },
33
+ "Optochin": {
34
+ "Positive": 0,
35
+ "Negative": 4,
36
+ "Variable": 0,
37
+ "_n": 4
38
+ },
39
+ "Bacitracin": {
40
+ "Positive": 0,
41
+ "Negative": 2,
42
+ "Variable": 0,
43
+ "_n": 2
44
+ }
45
+ },
46
+ "Salmonella": {
47
+ "Indole": {
48
+ "Positive": 0,
49
+ "Negative": 18,
50
+ "Variable": 0,
51
+ "_n": 18
52
+ }
53
+ },
54
+ "Enterobacter": {
55
+ "Indole": {
56
+ "Positive": 0,
57
+ "Negative": 30,
58
+ "Variable": 0,
59
+ "_n": 30
60
+ }
61
+ },
62
+ "Pseudomonas": {
63
+ "Indole": {
64
+ "Positive": 0,
65
+ "Negative": 18,
66
+ "Variable": 0,
67
+ "_n": 18
68
+ }
69
+ },
70
+ "Bacillus": {
71
+ "Indole": {
72
+ "Positive": 0,
73
+ "Negative": 42,
74
+ "Variable": 0,
75
+ "_n": 42
76
+ },
77
+ "Lecithinase": {
78
+ "Positive": 6,
79
+ "Negative": 0,
80
+ "Variable": 0,
81
+ "_n": 6
82
+ }
83
+ },
84
+ "Shigella": {
85
+ "Indole": {
86
+ "Positive": 6,
87
+ "Negative": 6,
88
+ "Variable": 0,
89
+ "_n": 12
90
+ }
91
+ },
92
+ "Escherichia": {
93
+ "Indole": {
94
+ "Positive": 12,
95
+ "Negative": 0,
96
+ "Variable": 0,
97
+ "_n": 12
98
+ }
99
+ },
100
+ "Klebsiella": {
101
+ "Indole": {
102
+ "Positive": 6,
103
+ "Negative": 12,
104
+ "Variable": 0,
105
+ "_n": 18
106
+ }
107
+ },
108
+ "Proteus": {
109
+ "Indole": {
110
+ "Positive": 6,
111
+ "Negative": 6,
112
+ "Variable": 0,
113
+ "_n": 12
114
+ }
115
+ },
116
+ "Clostridium": {
117
+ "Indole": {
118
+ "Positive": 18,
119
+ "Negative": 36,
120
+ "Variable": 0,
121
+ "_n": 54
122
+ },
123
+ "Lipase": {
124
+ "Positive": 6,
125
+ "Negative": 0,
126
+ "Variable": 0,
127
+ "_n": 6
128
+ },
129
+ "Lecithinase": {
130
+ "Positive": 6,
131
+ "Negative": 0,
132
+ "Variable": 0,
133
+ "_n": 6
134
+ },
135
+ "Odour": {
136
+ "Positive": 0,
137
+ "Negative": 0,
138
+ "Variable": 0,
139
+ "_n": 0
140
+ },
141
+ "Fructose Fermentation": {
142
+ "Positive": 6,
143
+ "Negative": 0,
144
+ "Variable": 0,
145
+ "_n": 6
146
+ }
147
+ },
148
+ "Bacteroides": {
149
+ "Indole": {
150
+ "Positive": 6,
151
+ "Negative": 0,
152
+ "Variable": 6,
153
+ "_n": 12
154
+ },
155
+ "Bile Resistance": {
156
+ "Positive": 6,
157
+ "Negative": 0,
158
+ "Variable": 0,
159
+ "_n": 6
160
+ }
161
+ },
162
+ "Streptococcus": {
163
+ "CAMP": {
164
+ "Positive": 20,
165
+ "Negative": 4,
166
+ "Variable": 0,
167
+ "_n": 24
168
+ },
169
+ "Hippurate Hydrolysis": {
170
+ "Positive": 12,
171
+ "Negative": 0,
172
+ "Variable": 0,
173
+ "_n": 12
174
+ },
175
+ "PYR": {
176
+ "Positive": 2,
177
+ "Negative": 16,
178
+ "Variable": 0,
179
+ "_n": 18
180
+ },
181
+ "Optochin": {
182
+ "Positive": 8,
183
+ "Negative": 4,
184
+ "Variable": 0,
185
+ "_n": 12
186
+ },
187
+ "Bile Solubility": {
188
+ "Positive": 14,
189
+ "Negative": 0,
190
+ "Variable": 0,
191
+ "_n": 14
192
+ },
193
+ "Inulin Fermentation": {
194
+ "Positive": 6,
195
+ "Negative": 0,
196
+ "Variable": 0,
197
+ "_n": 6
198
+ },
199
+ "Metabolic Product": {
200
+ "Positive": 0,
201
+ "Negative": 0,
202
+ "Variable": 0,
203
+ "_n": 0
204
+ },
205
+ "Bacitracin": {
206
+ "Positive": 2,
207
+ "Negative": 2,
208
+ "Variable": 0,
209
+ "_n": 4
210
+ }
211
+ },
212
+ "Aeromonas": {
213
+ "Indole": {
214
+ "Positive": 48,
215
+ "Negative": 0,
216
+ "Variable": 0,
217
+ "_n": 48
218
+ },
219
+ "Gas Production": {
220
+ "Positive": 6,
221
+ "Negative": 0,
222
+ "Variable": 0,
223
+ "_n": 6
224
+ }
225
+ },
226
+ "Yersinia": {
227
+ "Indole": {
228
+ "Positive": 6,
229
+ "Negative": 0,
230
+ "Variable": 18,
231
+ "_n": 24
232
+ }
233
+ },
234
+ "Morganella": {
235
+ "Indole": {
236
+ "Positive": 30,
237
+ "Negative": 0,
238
+ "Variable": 0,
239
+ "_n": 30
240
+ }
241
+ },
242
+ "Providencia": {
243
+ "Indole": {
244
+ "Positive": 30,
245
+ "Negative": 6,
246
+ "Variable": 0,
247
+ "_n": 36
248
+ }
249
+ },
250
+ "Pasteurella": {
251
+ "Indole": {
252
+ "Positive": 30,
253
+ "Negative": 0,
254
+ "Variable": 0,
255
+ "_n": 30
256
+ }
257
+ },
258
+ "Citrobacter": {
259
+ "Indole": {
260
+ "Positive": 6,
261
+ "Negative": 0,
262
+ "Variable": 12,
263
+ "_n": 18
264
+ }
265
+ },
266
+ "Campylobacter": {
267
+ "Indole": {
268
+ "Positive": 0,
269
+ "Negative": 18,
270
+ "Variable": 0,
271
+ "_n": 18
272
+ }
273
+ },
274
+ "Vibrio": {
275
+ "Indole": {
276
+ "Positive": 42,
277
+ "Negative": 0,
278
+ "Variable": 0,
279
+ "_n": 42
280
+ }
281
+ },
282
+ "Burkholderia": {
283
+ "Indole": {
284
+ "Positive": 0,
285
+ "Negative": 30,
286
+ "Variable": 0,
287
+ "_n": 30
288
+ },
289
+ "Odour": {
290
+ "Positive": 0,
291
+ "Negative": 0,
292
+ "Variable": 0,
293
+ "_n": 0
294
+ },
295
+ "Glucose Oxidation": {
296
+ "Positive": 6,
297
+ "Negative": 0,
298
+ "Variable": 0,
299
+ "_n": 6
300
+ }
301
+ },
302
+ "Legionella": {
303
+ "Indole": {
304
+ "Positive": 0,
305
+ "Negative": 6,
306
+ "Variable": 0,
307
+ "_n": 6
308
+ }
309
+ },
310
+ "Helicobacter": {
311
+ "Indole": {
312
+ "Positive": 0,
313
+ "Negative": 6,
314
+ "Variable": 0,
315
+ "_n": 6
316
+ }
317
+ },
318
+ "Leptospira": {
319
+ "Indole": {
320
+ "Positive": 0,
321
+ "Negative": 6,
322
+ "Variable": 0,
323
+ "_n": 6
324
+ }
325
+ },
326
+ "Serratia": {
327
+ "Indole": {
328
+ "Positive": 0,
329
+ "Negative": 30,
330
+ "Variable": 0,
331
+ "_n": 30
332
+ },
333
+ "Temperature Dependence": {
334
+ "Positive": 0,
335
+ "Negative": 0,
336
+ "Variable": 0,
337
+ "_n": 0
338
+ }
339
+ },
340
+ "Alcaligenes": {
341
+ "Odour": {
342
+ "Positive": 0,
343
+ "Negative": 0,
344
+ "Variable": 0,
345
+ "_n": 0
346
+ },
347
+ "Indole": {
348
+ "Positive": 0,
349
+ "Negative": 12,
350
+ "Variable": 0,
351
+ "_n": 12
352
+ }
353
+ },
354
+ "Shewanella": {
355
+ "Indole": {
356
+ "Positive": 0,
357
+ "Negative": 24,
358
+ "Variable": 0,
359
+ "_n": 24
360
+ }
361
+ },
362
+ "Acinetobacter": {
363
+ "Indole": {
364
+ "Positive": 0,
365
+ "Negative": 36,
366
+ "Variable": 0,
367
+ "_n": 36
368
+ }
369
+ },
370
+ "Haemophilus": {
371
+ "Growth Factors": {
372
+ "Positive": 0,
373
+ "Negative": 0,
374
+ "Variable": 0,
375
+ "_n": 0
376
+ },
377
+ "Indole": {
378
+ "Positive": 6,
379
+ "Negative": 0,
380
+ "Variable": 12,
381
+ "_n": 18
382
+ }
383
+ },
384
+ "Micrococcus": {
385
+ "Glucose Oxidation": {
386
+ "Positive": 12,
387
+ "Negative": 0,
388
+ "Variable": 0,
389
+ "_n": 12
390
+ },
391
+ "Indole": {
392
+ "Positive": 0,
393
+ "Negative": 18,
394
+ "Variable": 0,
395
+ "_n": 18
396
+ }
397
+ },
398
+ "Edwardsiella": {
399
+ "Indole": {
400
+ "Positive": 18,
401
+ "Negative": 6,
402
+ "Variable": 6,
403
+ "_n": 30
404
+ }
405
+ },
406
+ "Chromobacterium": {
407
+ "Indole": {
408
+ "Positive": 0,
409
+ "Negative": 12,
410
+ "Variable": 12,
411
+ "_n": 24
412
+ }
413
+ },
414
+ "Lactobacillus": {
415
+ "Indole": {
416
+ "Positive": 0,
417
+ "Negative": 18,
418
+ "Variable": 0,
419
+ "_n": 18
420
+ },
421
+ "pH Range": {
422
+ "Positive": 0,
423
+ "Negative": 0,
424
+ "Variable": 0,
425
+ "_n": 0
426
+ },
427
+ "Fermentation Product": {
428
+ "Positive": 0,
429
+ "Negative": 0,
430
+ "Variable": 0,
431
+ "_n": 0
432
+ }
433
+ },
434
+ "Corynebacterium": {
435
+ "Indole": {
436
+ "Positive": 0,
437
+ "Negative": 18,
438
+ "Variable": 0,
439
+ "_n": 18
440
+ }
441
+ },
442
+ "Nocardia": {
443
+ "Indole": {
444
+ "Positive": 0,
445
+ "Negative": 18,
446
+ "Variable": 0,
447
+ "_n": 18
448
+ },
449
+ "Acid Fast": {
450
+ "Positive": 0,
451
+ "Negative": 0,
452
+ "Variable": 0,
453
+ "_n": 0
454
+ },
455
+ "Casein Hydrolysis": {
456
+ "Positive": 6,
457
+ "Negative": 0,
458
+ "Variable": 0,
459
+ "_n": 6
460
+ },
461
+ "Tyrosine Hydrolysis": {
462
+ "Positive": 6,
463
+ "Negative": 0,
464
+ "Variable": 0,
465
+ "_n": 6
466
+ }
467
+ },
468
+ "Propionibacterium": {
469
+ "Indole": {
470
+ "Positive": 18,
471
+ "Negative": 0,
472
+ "Variable": 0,
473
+ "_n": 18
474
+ },
475
+ "Glycerol Fermentation": {
476
+ "Positive": 6,
477
+ "Negative": 0,
478
+ "Variable": 0,
479
+ "_n": 6
480
+ },
481
+ "Mannose Fermentation": {
482
+ "Positive": 6,
483
+ "Negative": 0,
484
+ "Variable": 0,
485
+ "_n": 6
486
+ },
487
+ "Other Products": {
488
+ "Positive": 0,
489
+ "Negative": 0,
490
+ "Variable": 0,
491
+ "_n": 0
492
+ }
493
+ },
494
+ "Peptostreptococcus": {
495
+ "Indole": {
496
+ "Positive": 0,
497
+ "Negative": 12,
498
+ "Variable": 0,
499
+ "_n": 12
500
+ }
501
+ },
502
+ "Veillonella": {
503
+ "Indole": {
504
+ "Positive": 0,
505
+ "Negative": 6,
506
+ "Variable": 0,
507
+ "_n": 6
508
+ }
509
+ },
510
+ "Fusobacterium": {
511
+ "Odour": {
512
+ "Positive": 0,
513
+ "Negative": 0,
514
+ "Variable": 0,
515
+ "_n": 0
516
+ },
517
+ "Indole": {
518
+ "Positive": 12,
519
+ "Negative": 0,
520
+ "Variable": 0,
521
+ "_n": 12
522
+ }
523
+ },
524
+ "Eubacterium": {
525
+ "Fermentation Products": {
526
+ "Positive": 0,
527
+ "Negative": 0,
528
+ "Variable": 0,
529
+ "_n": 0
530
+ },
531
+ "Cellobiose Fermentation": {
532
+ "Positive": 6,
533
+ "Negative": 0,
534
+ "Variable": 0,
535
+ "_n": 6
536
+ },
537
+ "Indole": {
538
+ "Positive": 0,
539
+ "Negative": 6,
540
+ "Variable": 0,
541
+ "_n": 6
542
+ }
543
+ },
544
+ "Halomonas": {
545
+ "Indole": {
546
+ "Positive": 0,
547
+ "Negative": 18,
548
+ "Variable": 0,
549
+ "_n": 18
550
+ },
551
+ "NaCl Tolerant (>=10%)": {
552
+ "Positive": 6,
553
+ "Negative": 0,
554
+ "Variable": 0,
555
+ "_n": 6
556
+ }
557
+ },
558
+ "Psychrobacter": {
559
+ "Indole": {
560
+ "Positive": 0,
561
+ "Negative": 12,
562
+ "Variable": 0,
563
+ "_n": 12
564
+ }
565
+ },
566
+ "Deinococcus": {
567
+ "Indole": {
568
+ "Positive": 0,
569
+ "Negative": 6,
570
+ "Variable": 0,
571
+ "_n": 6
572
+ }
573
+ },
574
+ "Thermus": {
575
+ "Indole": {
576
+ "Positive": 0,
577
+ "Negative": 12,
578
+ "Variable": 0,
579
+ "_n": 12
580
+ }
581
+ },
582
+ "Acidithiobacillus": {
583
+ "pH Range": {
584
+ "Positive": 0,
585
+ "Negative": 0,
586
+ "Variable": 0,
587
+ "_n": 0
588
+ },
589
+ "Indole": {
590
+ "Positive": 0,
591
+ "Negative": 6,
592
+ "Variable": 0,
593
+ "_n": 6
594
+ },
595
+ "Iron Oxidation": {
596
+ "Positive": 6,
597
+ "Negative": 0,
598
+ "Variable": 0,
599
+ "_n": 6
600
+ }
601
+ },
602
+ "Mycoplasma": {
603
+ "Arginine": {
604
+ "Positive": 6,
605
+ "Negative": 0,
606
+ "Variable": 0,
607
+ "_n": 6
608
+ },
609
+ "Arginine Hydrolysis": {
610
+ "Positive": 6,
611
+ "Negative": 0,
612
+ "Variable": 0,
613
+ "_n": 6
614
+ }
615
+ },
616
+ "Bordetella": {
617
+ "Growth Factors": {
618
+ "Positive": 0,
619
+ "Negative": 0,
620
+ "Variable": 0,
621
+ "_n": 0
622
+ },
623
+ "Indole": {
624
+ "Positive": 0,
625
+ "Negative": 6,
626
+ "Variable": 0,
627
+ "_n": 6
628
+ }
629
+ },
630
+ "Stenotrophomonas": {
631
+ "Indole": {
632
+ "Positive": 0,
633
+ "Negative": 24,
634
+ "Variable": 0,
635
+ "_n": 24
636
+ }
637
+ },
638
+ "Ralstonia": {
639
+ "Indole": {
640
+ "Positive": 0,
641
+ "Negative": 12,
642
+ "Variable": 0,
643
+ "_n": 12
644
+ }
645
+ },
646
+ "Achromobacter": {
647
+ "Indole": {
648
+ "Positive": 0,
649
+ "Negative": 6,
650
+ "Variable": 0,
651
+ "_n": 6
652
+ }
653
+ },
654
+ "Brucella": {
655
+ "Indole": {
656
+ "Positive": 0,
657
+ "Negative": 12,
658
+ "Variable": 0,
659
+ "_n": 12
660
+ }
661
+ },
662
+ "Brevundimonas": {
663
+ "Indole": {
664
+ "Positive": 0,
665
+ "Negative": 12,
666
+ "Variable": 0,
667
+ "_n": 12
668
+ }
669
+ },
670
+ "Arthrobacter": {
671
+ "Indole": {
672
+ "Positive": 0,
673
+ "Negative": 6,
674
+ "Variable": 0,
675
+ "_n": 6
676
+ },
677
+ "Glucose Oxidation": {
678
+ "Positive": 6,
679
+ "Negative": 0,
680
+ "Variable": 0,
681
+ "_n": 6
682
+ }
683
+ },
684
+ "Cytophaga": {
685
+ "Indole": {
686
+ "Positive": 0,
687
+ "Negative": 6,
688
+ "Variable": 0,
689
+ "_n": 6
690
+ }
691
+ },
692
+ "Flavobacterium": {
693
+ "Indole": {
694
+ "Positive": 0,
695
+ "Negative": 12,
696
+ "Variable": 0,
697
+ "_n": 12
698
+ }
699
+ },
700
+ "Oerskovia": {
701
+ "Indole": {
702
+ "Positive": 0,
703
+ "Negative": 6,
704
+ "Variable": 0,
705
+ "_n": 6
706
+ }
707
+ },
708
+ "Sphingomonas": {
709
+ "Indole": {
710
+ "Positive": 0,
711
+ "Negative": 12,
712
+ "Variable": 0,
713
+ "_n": 12
714
+ },
715
+ "Glucose Oxidation": {
716
+ "Positive": 6,
717
+ "Negative": 0,
718
+ "Variable": 0,
719
+ "_n": 6
720
+ }
721
+ },
722
+ "Comamonas": {
723
+ "Indole": {
724
+ "Positive": 0,
725
+ "Negative": 12,
726
+ "Variable": 0,
727
+ "_n": 12
728
+ }
729
+ },
730
+ "Halobacterium": {
731
+ "NaCl Tolerant (>=15%)": {
732
+ "Positive": 6,
733
+ "Negative": 0,
734
+ "Variable": 0,
735
+ "_n": 6
736
+ },
737
+ "Indole": {
738
+ "Positive": 0,
739
+ "Negative": 6,
740
+ "Variable": 0,
741
+ "_n": 6
742
+ }
743
+ },
744
+ "Thermococcus": {
745
+ "Sulfur Utilization": {
746
+ "Positive": 6,
747
+ "Negative": 0,
748
+ "Variable": 0,
749
+ "_n": 6
750
+ }
751
+ },
752
+ "Actinomyces": {
753
+ "Indole": {
754
+ "Positive": 0,
755
+ "Negative": 12,
756
+ "Variable": 0,
757
+ "_n": 12
758
+ }
759
+ },
760
+ "Elizabethkingia": {
761
+ "Indole": {
762
+ "Positive": 6,
763
+ "Negative": 0,
764
+ "Variable": 0,
765
+ "_n": 6
766
+ }
767
+ },
768
+ "Hafnia": {
769
+ "Indole": {
770
+ "Positive": 0,
771
+ "Negative": 6,
772
+ "Variable": 0,
773
+ "_n": 6
774
+ }
775
+ },
776
+ "Photobacterium": {
777
+ "Indole": {
778
+ "Positive": 12,
779
+ "Negative": 0,
780
+ "Variable": 0,
781
+ "_n": 12
782
+ }
783
+ },
784
+ "Pantoea": {
785
+ "Indole": {
786
+ "Positive": 0,
787
+ "Negative": 6,
788
+ "Variable": 0,
789
+ "_n": 6
790
+ }
791
+ },
792
+ "Raoultella": {
793
+ "Indole": {
794
+ "Positive": 0,
795
+ "Negative": 0,
796
+ "Variable": 6,
797
+ "_n": 6
798
+ }
799
+ },
800
+ "Ochrobactrum": {
801
+ "Indole": {
802
+ "Positive": 0,
803
+ "Negative": 6,
804
+ "Variable": 0,
805
+ "_n": 6
806
+ }
807
+ },
808
+ "Roseomonas": {
809
+ "Indole": {
810
+ "Positive": 0,
811
+ "Negative": 6,
812
+ "Variable": 0,
813
+ "_n": 6
814
+ }
815
+ },
816
+ "Actinobacillus": {
817
+ "Indole": {
818
+ "Positive": 0,
819
+ "Negative": 6,
820
+ "Variable": 0,
821
+ "_n": 6
822
+ }
823
+ },
824
+ "Gemella": {
825
+ "Indole": {
826
+ "Positive": 0,
827
+ "Negative": 12,
828
+ "Variable": 0,
829
+ "_n": 12
830
+ }
831
+ },
832
+ "Rothia": {
833
+ "Indole": {
834
+ "Positive": 0,
835
+ "Negative": 12,
836
+ "Variable": 0,
837
+ "_n": 12
838
+ }
839
+ },
840
+ "Listeria": {
841
+ "Indole": {
842
+ "Positive": 0,
843
+ "Negative": 6,
844
+ "Variable": 0,
845
+ "_n": 6
846
+ },
847
+ "CAMP": {
848
+ "Positive": 2,
849
+ "Negative": 0,
850
+ "Variable": 0,
851
+ "_n": 2
852
+ }
853
+ },
854
+ "Carnobacterium": {
855
+ "Indole": {
856
+ "Positive": 0,
857
+ "Negative": 6,
858
+ "Variable": 0,
859
+ "_n": 6
860
+ }
861
+ },
862
+ "Plesiomonas": {
863
+ "Indole": {
864
+ "Positive": 6,
865
+ "Negative": 0,
866
+ "Variable": 0,
867
+ "_n": 6
868
+ }
869
+ },
870
+ "Janthinobacterium": {
871
+ "Indole": {
872
+ "Positive": 0,
873
+ "Negative": 6,
874
+ "Variable": 0,
875
+ "_n": 6
876
+ }
877
+ },
878
+ "Paenibacillus": {
879
+ "Indole": {
880
+ "Positive": 0,
881
+ "Negative": 6,
882
+ "Variable": 0,
883
+ "_n": 6
884
+ }
885
+ },
886
+ "Moraxella": {
887
+ "Indole": {
888
+ "Positive": 0,
889
+ "Negative": 6,
890
+ "Variable": 0,
891
+ "_n": 6
892
+ }
893
+ },
894
+ "Aerococcus": {
895
+ "Indole": {
896
+ "Positive": 0,
897
+ "Negative": 6,
898
+ "Variable": 0,
899
+ "_n": 6
900
+ }
901
+ },
902
+ "Kocuria": {
903
+ "Indole": {
904
+ "Positive": 0,
905
+ "Negative": 6,
906
+ "Variable": 0,
907
+ "_n": 6
908
+ }
909
+ },
910
+ "Leuconostoc": {
911
+ "Indole": {
912
+ "Positive": 0,
913
+ "Negative": 6,
914
+ "Variable": 0,
915
+ "_n": 6
916
+ },
917
+ "Gas Production": {
918
+ "Positive": 6,
919
+ "Negative": 0,
920
+ "Variable": 0,
921
+ "_n": 6
922
+ },
923
+ "Fructose Fermentation": {
924
+ "Positive": 6,
925
+ "Negative": 0,
926
+ "Variable": 0,
927
+ "_n": 6
928
+ }
929
+ },
930
+ "Rhodococcus": {
931
+ "Indole": {
932
+ "Positive": 0,
933
+ "Negative": 6,
934
+ "Variable": 0,
935
+ "_n": 6
936
+ }
937
+ },
938
+ "Francisella": {
939
+ "Indole": {
940
+ "Positive": 0,
941
+ "Negative": 12,
942
+ "Variable": 0,
943
+ "_n": 12
944
+ }
945
+ },
946
+ "Erysipelothrix": {
947
+ "Indole": {
948
+ "Positive": 0,
949
+ "Negative": 6,
950
+ "Variable": 0,
951
+ "_n": 6
952
+ },
953
+ "Fructose Fermentation": {
954
+ "Positive": 6,
955
+ "Negative": 0,
956
+ "Variable": 0,
957
+ "_n": 6
958
+ }
959
+ },
960
+ "Arcanobacterium": {
961
+ "Indole": {
962
+ "Positive": 0,
963
+ "Negative": 6,
964
+ "Variable": 0,
965
+ "_n": 6
966
+ }
967
+ },
968
+ "Porphyromonas": {
969
+ "Indole": {
970
+ "Positive": 6,
971
+ "Negative": 0,
972
+ "Variable": 0,
973
+ "_n": 6
974
+ }
975
+ },
976
+ "Prevotella": {
977
+ "Indole": {
978
+ "Positive": 6,
979
+ "Negative": 0,
980
+ "Variable": 0,
981
+ "_n": 6
982
+ }
983
+ },
984
+ "Microbacterium": {
985
+ "Indole": {
986
+ "Positive": 0,
987
+ "Negative": 6,
988
+ "Variable": 0,
989
+ "_n": 6
990
+ }
991
+ },
992
+ "Enterococcus": {
993
+ "PYR": {
994
+ "Positive": 2,
995
+ "Negative": 0,
996
+ "Variable": 0,
997
+ "_n": 2
998
+ },
999
+ "Optochin": {
1000
+ "Positive": 0,
1001
+ "Negative": 2,
1002
+ "Variable": 0,
1003
+ "_n": 2
1004
+ },
1005
+ "Novobiocin": {
1006
+ "Positive": 0,
1007
+ "Negative": 2,
1008
+ "Variable": 0,
1009
+ "_n": 2
1010
+ }
1011
+ }
1012
+ }
engine/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # engine/__init__.py
2
+ # Makes 'engine' a package and re-exports the identifier for convenience if you want.
3
+ from .bacteria_identifier import BacteriaIdentifier
4
+
engine/bacteria_identifier.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/bacteria_identifier.py
2
+ # ------------------------------------------------------------
3
+ # Core identification engine + blended scoring with extended signals.
4
+
5
+ import os
6
+ import json
7
+ import re
8
+ import random
9
+ from typing import Dict, List, Optional, Tuple
10
+
11
+ import pandas as pd
12
+
13
+ from engine.extended_reasoner import score_genera_from_extended
14
+
15
+ DATA_DIR = "data"
16
+ EXT_SCHEMA_PATH = os.path.join(DATA_DIR, "extended_schema.json")
17
+
18
+
19
+ # -----------------------------
20
+ # Helper Function
21
+ # -----------------------------
22
+ def join_with_and(items):
23
+ """Join list into a readable string, using commas and 'and' before last item."""
24
+ if not items:
25
+ return ""
26
+ if len(items) == 1:
27
+ return items[0]
28
+ return ", ".join(items[:-1]) + " and " + items[-1]
29
+
30
+
31
+ # -----------------------------
32
+ # Identification Result Class
33
+ # -----------------------------
34
+ class IdentificationResult:
35
+ """
36
+ Stores data about a single bacterial genus result and generates reasoning text.
37
+ Now includes optional extended-likelihood and blended confidence.
38
+ """
39
+ def __init__(
40
+ self,
41
+ genus: str,
42
+ total_score: int,
43
+ matched_fields: List[str],
44
+ mismatched_fields: List[str],
45
+ reasoning_factors: Dict[str, str],
46
+ total_fields_evaluated: int,
47
+ total_fields_possible: int,
48
+ extra_notes: str = "",
49
+ extended_likelihood: Optional[float] = None,
50
+ extended_explanation: str = "",
51
+ ):
52
+ self.genus = genus
53
+ self.total_score = total_score
54
+ self.matched_fields = matched_fields
55
+ self.mismatched_fields = mismatched_fields
56
+ self.reasoning_factors = reasoning_factors
57
+ self.total_fields_evaluated = total_fields_evaluated
58
+ self.total_fields_possible = total_fields_possible
59
+ self.extra_notes = extra_notes
60
+
61
+ # Extended reasoning
62
+ self.extended_likelihood = extended_likelihood # 0–1, or None if no extended data
63
+ self.extended_explanation = extended_explanation
64
+
65
+ # -----------------------------
66
+ # Confidence Calculations
67
+ # -----------------------------
68
+ def confidence_percent(self) -> int:
69
+ """Confidence based only on tests the user entered."""
70
+ if self.total_fields_evaluated == 0:
71
+ return 0
72
+ return max(
73
+ 0,
74
+ min(100, int((self.total_score / self.total_fields_evaluated) * 100)),
75
+ )
76
+
77
+ def true_confidence(self) -> int:
78
+ """Confidence based on *all* possible tests (complete database fields)."""
79
+ if self.total_fields_possible == 0:
80
+ return 0
81
+ return max(
82
+ 0,
83
+ min(100, int((self.total_score / self.total_fields_possible) * 100)),
84
+ )
85
+
86
+ def blended_confidence_raw(self, weight_core: float = 0.7, weight_ext: float = 0.3) -> float:
87
+ """
88
+ Blended confidence:
89
+ core = core-confidence (0–1)
90
+ ext = extended likelihood (0–1, if available)
91
+ If no extended likelihood, return core.
92
+ """
93
+ core = self.confidence_percent() / 100.0
94
+ if self.extended_likelihood is None:
95
+ return core
96
+ return weight_core * core + weight_ext * self.extended_likelihood
97
+
98
+ def blended_confidence_percent(self, weight_core: float = 0.7, weight_ext: float = 0.3) -> int:
99
+ return int(round(self.blended_confidence_raw(weight_core, weight_ext) * 100))
100
+
101
+ # -----------------------------
102
+ # Reasoning Paragraph Generator
103
+ # -----------------------------
104
+ def reasoning_paragraph(self, ranked_results=None) -> str:
105
+ """Generate detailed reasoning paragraph with comparison to other genera."""
106
+ if not self.matched_fields:
107
+ return "No significant biochemical or morphological matches were found."
108
+
109
+ intro = random.choice(
110
+ [
111
+ "Based on the observed biochemical and morphological traits,",
112
+ "According to the provided test results,",
113
+ "From the available laboratory findings,",
114
+ "Considering the entered reactions and colony traits,",
115
+ ]
116
+ )
117
+
118
+ # Key descriptive highlights
119
+ highlights = []
120
+ if "Gram Stain" in self.matched_fields:
121
+ highlights.append(
122
+ f"it is **Gram {self.reasoning_factors.get('Gram Stain', '').lower()}**"
123
+ )
124
+ if "Shape" in self.matched_fields:
125
+ highlights.append(
126
+ f"with a **{self.reasoning_factors.get('Shape', '').lower()}** morphology"
127
+ )
128
+ if "Catalase" in self.matched_fields:
129
+ highlights.append(
130
+ f"and **catalase {self.reasoning_factors.get('Catalase', '').lower()}** activity"
131
+ )
132
+ if "Oxidase" in self.matched_fields:
133
+ highlights.append(
134
+ f"and **oxidase {self.reasoning_factors.get('Oxidase', '').lower()}** reaction"
135
+ )
136
+ if "Oxygen Requirement" in self.matched_fields:
137
+ highlights.append(
138
+ f"which prefers **{self.reasoning_factors.get('Oxygen Requirement', '').lower()}** conditions"
139
+ )
140
+
141
+ # Join highlights grammatically
142
+ summary = (
143
+ ", ".join(highlights[:-1]) + " and " + highlights[-1]
144
+ if len(highlights) > 1
145
+ else "".join(highlights)
146
+ )
147
+
148
+ # Confidence text (core)
149
+ core_conf = self.confidence_percent()
150
+ confidence_text = (
151
+ "The confidence in this identification based on the entered tests is high."
152
+ if core_conf >= 70
153
+ else "The confidence in this identification based on the entered tests is moderate."
154
+ )
155
+
156
+ # Comparative reasoning vs other close results
157
+ comparison = ""
158
+ if ranked_results and len(ranked_results) > 1:
159
+ close_others = ranked_results[1:3]
160
+ other_names = [r.genus for r in close_others]
161
+ if other_names:
162
+ if self.total_score >= close_others[0].total_score:
163
+ comparison = (
164
+ f" It is **more likely** than {join_with_and(other_names)} "
165
+ f"based on stronger alignment in {join_with_and(self.matched_fields[:3])}."
166
+ )
167
+ else:
168
+ comparison = (
169
+ f" It is **less likely** than {join_with_and(other_names)} "
170
+ f"due to differences in {join_with_and(self.mismatched_fields[:3])}."
171
+ )
172
+
173
+ return f"{intro} {summary}, the isolate most closely resembles **{self.genus}**. {confidence_text}{comparison}"
174
+
175
+
176
+ # -----------------------------
177
+ # Bacteria Identifier Engine
178
+ # -----------------------------
179
class BacteriaIdentifier:
    """
    Main engine to match bacterial genus based on biochemical & morphological data.

    Includes:
    - Core rule-based matching vs bacteria_db.xlsx
    - Optional blending with extended signals (signals_catalog.json)
    """

    def __init__(self, db: pd.DataFrame):
        # Blank DB cells are treated as "no data" rather than NaN.
        self.db = db.fillna("")
        self.extended_fields = self._load_extended_fields()

    def _load_extended_fields(self) -> List[str]:
        """Return the extended test names from extended_schema.json ([] on any failure)."""
        if not os.path.exists(EXT_SCHEMA_PATH):
            return []
        try:
            with open(EXT_SCHEMA_PATH, "r", encoding="utf-8") as f:
                schema = json.load(f)
            return list(schema.keys())
        except Exception:
            # Malformed/unreadable schema: degrade gracefully to core-only mode.
            return []

    # -----------------------------
    # Field Comparison Logic
    # -----------------------------
    def compare_field(self, db_val, user_val, field_name: str) -> int:
        """
        Compare one test field between database and user input.

        Returns:
            1    match
            -1   mismatch
            0    no information (empty / unknown / variable)
            -999 hard exclusion (mismatch on a defining trait)
        """
        if not user_val or str(user_val).strip() == "" or str(user_val).lower() == "unknown":
            return 0  # Skip empty or unknown

        db_val = str(db_val).strip().lower()
        user_val = str(user_val).strip().lower()
        # A mismatch on these defining traits rules the genus out entirely.
        hard_exclusions = ("Gram Stain", "Shape", "Spore Formation")

        # Split entries by separators for multi-value matches (e.g. "a;b" or "a/b").
        db_options = [x.strip() for x in re.split(r"[;/]", db_val) if x.strip()]
        user_options = [x.strip() for x in re.split(r"[;/]", user_val) if x.strip()]

        # "Variable" carries no discriminating information on either side.
        if "variable" in db_options or "variable" in user_options:
            return 0

        # Special handling for Growth Temperature stored as "low//high".
        if field_name == "Growth Temperature":
            try:
                if "//" in db_val:
                    low, high = [float(x) for x in db_val.split("//")]
                    temp = float(user_val)
                    return 1 if low <= temp <= high else -1
            except Exception:
                return 0

        # Flexible match: partial substring overlap counts as a match.
        match_found = any(
            any(u in db_opt or db_opt in u for db_opt in db_options) for u in user_options
        )

        if match_found:
            return 1
        if field_name in hard_exclusions:
            return -999  # Hard exclusion
        return -1

    # -----------------------------
    # Suggest Next Tests
    # -----------------------------
    def suggest_next_tests(self, top_results: "List[IdentificationResult]") -> "List[str]":
        """
        Suggest up to 3 tests that best differentiate the top matches.

        A test is discriminating when the database records different values
        for it across the top candidate genera.  (The previous implementation
        compared matched/mismatched *field names*, a set that did not depend
        on the column being inspected, so the suggestions were meaningless.)
        """
        if len(top_results) < 2:
            return []

        top_genera = [r.genus for r in top_results[:3]]
        subset = self.db[self.db["Genus"].isin(top_genera)]

        varying_fields: List[str] = []
        for field in self.db.columns:
            if field in ("Genus", "Extra Notes", "Colony Morphology"):
                continue
            values = {str(v).strip().lower() for v in subset[field]}
            values.discard("")  # ignore missing data
            if len(values) > 1:
                varying_fields.append(field)

        random.shuffle(varying_fields)
        return varying_fields[:3]

    # -----------------------------
    # Extended Input Extraction
    # -----------------------------
    def _extract_extended_input(self, user_input: Dict[str, str]) -> Dict[str, str]:
        """
        Extract extended tests (those in extended_schema.json but not part of the core db).
        Only keep Positive/Negative/Variable (ignore Unknown/empty).
        """
        ext_in: Dict[str, str] = {}
        for field in self.extended_fields:
            val = user_input.get(field, "Unknown")
            if isinstance(val, str) and val.lower() in ("positive", "negative", "variable"):
                ext_in[field] = val.capitalize()
        return ext_in

    # -----------------------------
    # Main Identification Routine
    # -----------------------------
    def identify(self, user_input: Dict[str, str]) -> "List[IdentificationResult]":
        """Compare user input to database and rank possible genera with blended scoring."""
        results: List[IdentificationResult] = []
        total_fields_possible = len([c for c in self.db.columns if c != "Genus"])

        # 1) Core scoring loop against bacteria_db.xlsx
        for _, row in self.db.iterrows():
            genus = row["Genus"]
            total_score = 0
            matched_fields: List[str] = []
            mismatched_fields: List[str] = []
            reasoning_factors: Dict[str, str] = {}
            total_fields_evaluated = 0

            for field in self.db.columns:
                if field == "Genus":
                    continue

                db_val = row[field]
                user_val = user_input.get(field, "")
                score = self.compare_field(db_val, user_val, field)

                # Count only real inputs for relative confidence
                if user_val and str(user_val).lower() != "unknown":
                    total_fields_evaluated += 1

                if score == -999:
                    total_score = -999
                    break  # Hard exclusion ends comparison
                elif score == 1:
                    total_score += 1
                    matched_fields.append(field)
                    reasoning_factors[field] = user_val
                elif score == -1:
                    total_score -= 1
                    mismatched_fields.append(field)

            # Append valid genus result (hard-excluded genera are dropped)
            if total_score > -999:
                extra_notes = row.get("Extra Notes", "")
                results.append(
                    IdentificationResult(
                        genus=genus,
                        total_score=total_score,
                        matched_fields=matched_fields,
                        mismatched_fields=mismatched_fields,
                        reasoning_factors=reasoning_factors,
                        total_fields_evaluated=total_fields_evaluated,
                        total_fields_possible=total_fields_possible,
                        extra_notes=extra_notes,
                    )
                )

        if not results:
            return []

        # 2) Suggest next tests for top core results
        # NOTE(review): suggestions are computed before sorting, so "top"
        # here is DB order, not score order — preserved from the original;
        # confirm whether post-sort suggestion is intended.
        top_suggestions = self.suggest_next_tests(results)
        for r in results[:3]:
            r.reasoning_factors["next_tests"] = ", ".join(top_suggestions)

        # 3) Extended likelihoods (if user provided extended tests)
        ext_input = self._extract_extended_input(user_input)
        ext_scores: Dict[str, float] = {}
        ext_explanation = ""

        if ext_input:
            ranked, ext_explanation = score_genera_from_extended(ext_input)
            ext_scores = dict(ranked)

        # Attach extended scores/explanations to each result
        if ext_scores:
            for r in results:
                # Genus absent from the signals catalog → neutral (no info).
                r.extended_likelihood = ext_scores.get(r.genus)
                r.extended_explanation = ext_explanation
        else:
            for r in results:
                r.extended_likelihood = None
                r.extended_explanation = ""

        # 4) Sort: blended confidence when extended data is present,
        #    otherwise fall back to the core total_score.
        if any(r.extended_likelihood is not None for r in results):
            results.sort(key=lambda x: x.blended_confidence_raw(), reverse=True)
        else:
            results.sort(key=lambda x: x.total_score, reverse=True)

        # Return top 10
        return results[:10]
engine/extended_reasoner.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/extended_reasoner.py
2
+ # ------------------------------------------------------------
3
+ # Compute per-genus likelihoods from extended tests using signals_catalog.json
4
+
5
+ import json, os, math
6
+ from typing import Dict, List, Tuple
7
+
8
# Per-genus statistics for extended tests (counts of Positive/Negative/Variable).
SIGNALS_PATH = os.path.join("data", "signals_catalog.json")
# The only outcome values that contribute to scoring.
PNV = ("Positive", "Negative", "Variable")
10
+
11
+ def _load_json(path: str, default):
12
+ if not os.path.exists(path):
13
+ return default
14
+ with open(path, "r", encoding="utf-8") as f:
15
+ try:
16
+ return json.load(f)
17
+ except Exception:
18
+ return default
19
+
20
+ def _log(x: float) -> float:
21
+ # guard tiny values
22
+ return math.log(max(x, 1e-12))
23
+
24
def score_genera_from_extended(parsed_ext: Dict[str, str], alpha: float = 1.0) -> Tuple[List[Tuple[str, float]], str]:
    """
    Score every genus in the signals catalog against the user's extended tests.

    parsed_ext: dict of {ExtendedTestName: 'Positive'|'Negative'|'Variable'}
    alpha: Laplace smoothing factor
    Returns: ([(genus, score)], explanation_str)
    """
    signals = _load_json(SIGNALS_PATH, {})
    if not parsed_ext or not signals:
        return [], "No extended tests or signals available."

    genera = list(signals.keys())
    if not genera:
        return [], "No genera in signals catalog."

    # Accumulate log-likelihoods per genus over the provided tests.
    scores: Dict[str, float] = {g: 0.0 for g in genera}
    contributions: Dict[str, List[str]] = {g: [] for g in genera}

    for test, val in parsed_ext.items():
        if val not in PNV:
            continue
        for g in genera:
            stats = signals.get(g, {}).get(test)
            if stats:
                pos = stats.get("Positive", 0)
                neg = stats.get("Negative", 0)
                var = stats.get("Variable", 0)
                n = stats.get("_n", pos + neg + var)
                if n > 0:
                    count = {"Positive": pos, "Negative": neg, "Variable": var}[val]
                    prob = (count + alpha) / (n + 3.0 * alpha)
                else:
                    # no observations recorded → uniform over P/N/V
                    prob = alpha / (3.0 * alpha)
            else:
                # test unseen for this genus → uniform over P/N/V
                prob = alpha / (3.0 * alpha)

            scores[g] += _log(prob)
            contributions[g].append(f"{test}={val}→{prob:.3f}")

    # Softmax-normalise the log scores for readability.
    max_log = max(scores.values())
    exp_scores = {g: math.exp(s - max_log) for g, s in scores.items()}
    z = sum(exp_scores.values())
    final = sorted(
        [(g, (exp_scores[g] / z) if z > 0 else 0.0) for g in genera],
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Short human-readable explanation of the top five contributors.
    top_rows = [f"{g}: {sc:.3f} | {'; '.join(contributions[g][:3])}" for g, sc in final[:5]]
    explain = "Extended-test likelihoods (top 5):\n" + "\n".join(top_rows) if top_rows else "No contributions."
    return final, explain
engine/parser_ext.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/parser_ext.py
2
+ # ------------------------------------------------------------
3
+ # Data-driven parser for extended tests (not in core schema).
4
+ # Uses:
5
+ # - data/extended_schema.json
6
+ # - data/alias_maps.json
7
+ #
8
+ # Automatically extracts extended tests such as:
9
+ # CAMP, PYR, Optochin, Novobiocin, Bacitracin, Bile Solubility, Hippurate, etc.
10
+ #
11
+ # Core tests (Gram, Catalase, DNase, Indole, etc.) are EXCLUDED.
12
+
13
+ import json
14
+ import os
15
+ import re
16
+ from typing import Dict, List
17
+
18
# Data files live alongside the app under ./data (paths are cwd-relative).
DATA_DIR = "data"
EXT_SCHEMA_PATH = os.path.join(DATA_DIR, "extended_schema.json")  # extended test definitions
ALIAS_MAPS_PATH = os.path.join(DATA_DIR, "alias_maps.json")       # field/value alias tables
21
+
22
# -------------------------------------------------------------------------
# Hardcoded core test fields (NEVER to be parsed as extended)
# -------------------------------------------------------------------------
# These entries must mirror the column names of the core database exactly.
CORE_FIELDS = {
    "Genus", "Species",
    "Gram Stain", "Shape", "Colony Morphology", "Haemolysis", "Haemolysis Type",
    "Motility", "Capsule", "Spore Formation", "Growth Temperature", "Oxygen Requirement",
    "Media Grown On",
    "Catalase", "Oxidase", "Coagulase", "DNase", "Urease", "Citrate", "Methyl Red", "VP",
    "H2S", "ONPG", "Nitrate Reduction", "Lipase Test", "NaCl Tolerant (>=6%)",
    # NOTE(review): "Ornitihine" looks misspelled, but it must match the DB
    # column name byte-for-byte — confirm against bacteria_db.xlsx before renaming.
    "Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase",
    "Gelatin Hydrolysis", "Esculin Hydrolysis",
    "Glucose Fermentation", "Lactose Fermentation", "Sucrose Fermentation",
    "Mannitol Fermentation", "Sorbitol Fermentation", "Maltose Fermentation",
    "Xylose Fermentation", "Rhamnose Fermentation", "Arabinose Fermentation",
    "Raffinose Fermentation", "Trehalose Fermentation", "Inositol Fermentation"
}

# -------------------------------------------------------------------------
# Positive / Negative / Variable mapping
# -------------------------------------------------------------------------
# Keys are lowercase tokens as they appear in free text.
PNV_MAP = {
    "+": "Positive", "positive": "Positive", "pos": "Positive",
    "-": "Negative", "negative": "Negative", "neg": "Negative",
    "variable": "Variable", "var": "Variable"
}

# -------------------------------------------------------------------------
# Sensitivity/Resistance mapping for disk diffusion tests
# (e.g., optochin, novobiocin, bacitracin)
# -------------------------------------------------------------------------
# "sensitive"/"susceptible" are recorded as Positive, "resistant"/"insensitive" as Negative.
SENS_MAP = {
    "sensitive": "Positive",
    "susceptible": "Positive",
    "resistant": "Negative",
    "insensitive": "Negative"
}
59
+
60
+ # -------------------------------------------------------------------------
61
+ # JSON loaders
62
+ # -------------------------------------------------------------------------
63
+ def _load_json(path: str, default):
64
+ if not os.path.exists(path):
65
+ return default
66
+ try:
67
+ with open(path, "r", encoding="utf-8") as f:
68
+ return json.load(f)
69
+ except Exception:
70
+ return default
71
+
72
+ # -------------------------------------------------------------------------
73
+ # Canonical value mapping (+, -, variable, resistant, sensitive)
74
+ # -------------------------------------------------------------------------
75
def _canon_value(token: str) -> str:
    """Map a raw token (+/-, pos/neg, sensitive/resistant, ...) onto its canonical P/N form."""
    if token is None:
        return "Unknown"
    stripped = token.strip()
    key = stripped.lower()
    mapped = PNV_MAP.get(key)
    if mapped is None:
        mapped = SENS_MAP.get(key)
    # Unrecognised tokens pass through stripped but otherwise untouched.
    return mapped if mapped is not None else stripped
84
+
85
+ # -------------------------------------------------------------------------
86
+ # Gather all alias names for a field
87
+ # -------------------------------------------------------------------------
88
+ def _aliases_for(field: str, field_aliases: Dict[str, str]) -> List[str]:
89
+ """
90
+ Returns all known aliases for this test, including the canonical name.
91
+ Ordered longest→shortest to avoid partial matches.
92
+ """
93
+ aliases = {field}
94
+ for k, v in field_aliases.items():
95
+ if v.lower() == field.lower():
96
+ aliases.add(k)
97
+ return sorted(aliases, key=len, reverse=True)
98
+
99
+ # -------------------------------------------------------------------------
100
+ # Main Extended Parser
101
+ # -------------------------------------------------------------------------
102
def parse_text_extended(text: str) -> Dict[str, Dict]:
    """
    Parse ONLY tests listed in extended_schema.json.
    Excludes all core tests completely.
    Returns:
        {
          "parsed_fields": { TestName: "Positive"/"Negative"/"Variable" },
          "source": "extended_parser"
        }
    """
    ext_schema = _load_json(EXT_SCHEMA_PATH, {})
    alias_maps = _load_json(ALIAS_MAPS_PATH, {"field_aliases": {}, "value_aliases_pnv": {}})
    field_aliases = alias_maps.get("field_aliases", {})

    haystack = text or ""
    found: Dict[str, str] = {}

    # For each extended test, scan the text for any alias followed by a result token.
    for canon_field in ext_schema:
        # Safety: never allow the extended parser to treat core tests as extended.
        if canon_field in CORE_FIELDS:
            continue

        for alias in _aliases_for(canon_field, field_aliases):
            # Match: <alias> .... (positive|negative|variable|+|-|sensitive|resistant)
            pattern = (
                rf"\b{re.escape(alias)}\b"
                r"[^.\n]{0,80}?"  # lookahead window
                r"\b(positive|negative|variable|\+|\-|susceptible|sensitive|resistant)\b"
            )
            hit = re.search(pattern, haystack, re.IGNORECASE)
            if hit is not None:
                found[canon_field] = _canon_value(hit.group(1))
                break  # longest alias wins; stop scanning this field

    # Defensive cleanup: drop any core field that slipped through.
    for forbidden in [k for k in found if k in CORE_FIELDS]:
        del found[forbidden]

    return {
        "parsed_fields": found,
        "source": "extended_parser"
    }
engine/parser_fusion.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/parser_fusion.py
2
+ # ------------------------------------------------------------
3
+ # Tri-fusion parser:
4
+ # - Rule parser (parser_rules)
5
+ # - Extended parser (parser_ext)
6
+ # - LLM parser (parser_llm / Cloudflare)
7
+ #
8
+ # Combines all three into a single fused field set, with a simple
9
+ # precedence rule:
10
+ # extended > rules > llm > Unknown
11
+ #
12
+ # Returns:
13
+ # {
14
+ # "fused_fields": { ... },
15
+ # "sources": { field_name: "extended" | "rules" | "llm_cf" | "none" },
16
+ # "components": {
17
+ # "rules": <full rule parser output>,
18
+ # "extended": <full extended parser output>,
19
+ # "llm": <full llm parser output>
20
+ # }
21
+ # }
22
+
23
+ import json
24
+ import os
25
+ from typing import Dict, Any
26
+
27
+ from engine.parser_rules import parse_text_rules
28
+ from engine.parser_ext import parse_text_extended, CORE_FIELDS
29
+ from engine.parser_llm import parse_text_llm
30
+
31
# Load extended schema so we know all possible fields
EXT_SCHEMA_PATH = "data/extended_schema.json"
try:
    with open(EXT_SCHEMA_PATH, "r", encoding="utf-8") as f:
        EXT_SCHEMA = json.load(f)
except Exception:
    # Missing or malformed schema: fall back to core fields only.
    EXT_SCHEMA = {}

# Union of core fields and every extended test, in stable sorted order.
ALL_FIELDS = sorted(set(list(CORE_FIELDS) + list(EXT_SCHEMA.keys())))
40
+
41
+
42
+ def _is_known(val: Any) -> bool:
43
+ """
44
+ Decide if a value is 'real' (we should use it) or effectively Unknown/empty.
45
+ """
46
+ if val is None:
47
+ return False
48
+ if isinstance(val, str):
49
+ v = val.strip()
50
+ if not v:
51
+ return False
52
+ if v.lower() == "unknown":
53
+ return False
54
+ return True
55
+
56
+
57
def parse_text_fused(text: str) -> Dict[str, Any]:
    """
    Run all three parsers and fuse their outputs.
    Precedence: extended > rules > llm > Unknown.
    """
    source_text = text or ""

    # --- Run component parsers ---
    rules_out = parse_text_rules(source_text)
    ext_out = parse_text_extended(source_text)
    llm_out = parse_text_llm(source_text)

    # Parser outputs ranked by trust, highest first.
    ranked = [
        ("extended", ext_out.get("parsed_fields", {}) or {}),
        ("rules", rules_out.get("parsed_fields", {}) or {}),
        ("llm_cf", llm_out.get("parsed_fields", {}) or {}),
    ]

    fused: Dict[str, Any] = {}
    sources: Dict[str, str] = {}

    for field in ALL_FIELDS:
        # Default when no parser produced a usable value.
        fused[field] = "Unknown"
        sources[field] = "none"
        for label, fields in ranked:
            candidate = fields.get(field)
            if _is_known(candidate):
                fused[field] = candidate
                sources[field] = label
                break

    return {
        "fused_fields": fused,
        "sources": sources,
        "components": {
            "rules": rules_out,
            "extended": ext_out,
            "llm": llm_out,
        },
    }
engine/parser_llm.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/parser_llm.py
2
+ # ------------------------------------------------------------
3
+ # LLM-based parser using local Phi-2 model via HuggingFace.
4
+ # ------------------------------------------------------------
5
+
6
+ import json
7
+ import re
8
+ import torch
9
+ import streamlit as st
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer
11
+
12
+ from engine.parser_ext import CORE_FIELDS
13
+
14
EXT_SCHEMA_PATH = "data/extended_schema.json"
try:
    with open(EXT_SCHEMA_PATH, "r", encoding="utf-8") as f:
        EXT_SCHEMA = json.load(f)
# Fix: a bare ``except:`` also traps KeyboardInterrupt/SystemExit;
# narrowed to the failures that can actually occur here.
except (OSError, json.JSONDecodeError):
    EXT_SCHEMA = {}

# Union of core fields and every extended test, in stable sorted order.
ALL_FIELDS = sorted(set(list(CORE_FIELDS) + list(EXT_SCHEMA.keys())))
22
+
23
+
24
@st.cache_resource(show_spinner=True)
def load_phi2_model():
    """Load Phi-2 locally (CPU mode). Cached for entire session."""
    model_name = "microsoft/phi-2"

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # CPU-friendly dtype
        trust_remote_code=True,
    )
    model.eval()  # inference mode

    return tokenizer, model
38
+
39
+
40
# Extraction prompt for the LLM pass.  {FIELD_LIST} and {TEXT} are filled in
# via str.format by parse_text_llm; the model is told to emit raw JSON only.
PROMPT_TEMPLATE = """
You are an expert clinical microbiology assistant.

Extract ALL microbiology test results from the text and return a STRICT JSON object.

RULES:
- Use ONLY these fields:
{FIELD_LIST}
- Allowed values:
"Positive", "Negative", "Variable", "Unknown",
OR literal strings for temperatures (e.g. "37//40").
- If a test is not mentioned: set "Unknown".
- DO NOT create new fields or hallucinate.
- DO NOT output explanations.
- DO NOT wrap JSON in markdown code fences.
- Output ONLY a raw JSON object.

Text:
---
{TEXT}
---

JSON:
"""
64
+
65
+
66
def salvage_json(raw: str):
    """Attempt to clean and parse 'almost JSON' returned by model."""
    text = raw.strip()

    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if first_brace == -1 or last_brace == -1 or last_brace <= first_brace:
        raise ValueError("No valid JSON object braces found.")

    candidate = text[first_brace : last_brace + 1]
    # Strip trailing commas before } or ] — a common LLM artefact.
    candidate = re.sub(r",\s*([}\]])", r"\1", candidate)

    return json.loads(candidate)
79
+
80
+
81
def normalise_value(val):
    """Coerce a model-returned value onto Positive/Negative/Variable; pass others through."""
    if val is None:
        return "Unknown"
    text = str(val).strip()
    canonical = {
        "positive": "Positive", "+": "Positive", "pos": "Positive",
        "negative": "Negative", "-": "Negative", "neg": "Negative",
        "variable": "Variable", "var": "Variable",
    }
    # Unmatched values (e.g. "37//40") are returned stripped but unchanged.
    return canonical.get(text.lower(), text)
93
+
94
+
95
def parse_text_llm(text: str):
    """
    Run the cached Phi-2 model over *text* and extract structured fields.

    Returns {"parsed_fields": {...}, "source": "llm_phi2", "raw": ...} on
    success, or {"parsed_fields": {}, "error": ..., "raw": ...} when the
    model output cannot be parsed (even after salvage) as JSON.
    """
    tokenizer, model = load_phi2_model()

    prompt = PROMPT_TEMPLATE.format(
        FIELD_LIST=", ".join(ALL_FIELDS),
        TEXT=text,
    )

    encoded = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        generated = model.generate(
            input_ids=encoded["input_ids"],
            max_new_tokens=500,
            temperature=0.0,
            do_sample=False,
        )

    # Drop the echoed prompt; keep only the model's continuation.
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    raw = decoded[len(prompt):].strip()

    try:
        parsed = json.loads(raw)
    except Exception:
        try:
            parsed = salvage_json(raw)
        except Exception:
            return {
                "parsed_fields": {},
                "error": "Invalid JSON returned by model",
                "raw": raw,
            }

    # Normalise every schema field; anything the model omitted becomes Unknown.
    cleaned = {field: normalise_value(parsed.get(field, "Unknown")) for field in ALL_FIELDS}

    return {
        "parsed_fields": cleaned,
        "source": "llm_phi2",
        "raw": raw,
    }
engine/parser_rules.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/parser_rules.py
2
+ # ------------------------------------------------------------
3
+ # Deterministic rule-based parser for microbiology text
4
+ # Loads alias_maps.json and applies synonyms learned in Stage 10B
5
+ # ------------------------------------------------------------
6
+
7
+ import re
8
+ import json
9
+ import os
10
+
11
ALIAS_PATH = "data/alias_maps.json"

# ------------------------------------------------------------
# Load alias maps (if present)
# ------------------------------------------------------------
def load_alias_maps():
    """
    Return the alias-map dict from data/alias_maps.json, or {} when the
    file is absent, unreadable, or not valid JSON.

    Fix: the original used a bare ``except:``, which also swallows
    KeyboardInterrupt/SystemExit; narrowed to the failures that can
    actually occur here (I/O and JSON decoding errors).
    """
    if os.path.exists(ALIAS_PATH):
        try:
            with open(ALIAS_PATH, "r", encoding="utf-8") as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError):
            return {}
    return {}

# Loaded once at import time; the file is small and static.
ALIAS_MAPS = load_alias_maps()
26
+
27
+ # ------------------------------------------------------------
28
+ # Utility normalization
29
+ # ------------------------------------------------------------
30
def norm(text: str) -> str:
    """Lowercase-and-strip helper; any falsy input normalises to ''."""
    return str(text).strip().lower() if text else ""
34
+
35
+
36
+ # Apply alias mapping per field
37
# Apply alias mapping per field
def apply_alias(field: str, value: str) -> str:
    """
    Map *value* through the per-field alias table in ALIAS_MAPS, if one exists.

    Fix: alias_maps.json stores field keys in their original case
    (e.g. "Motility"), but the lookup key was lowercased by norm(), so a
    per-field table was never found.  Keys are now compared
    case-insensitively; unmapped values pass through unchanged.
    """
    f = norm(field)
    v = norm(value)
    for key, mapping in ALIAS_MAPS.items():
        if norm(key) == f and isinstance(mapping, dict):
            return mapping.get(v, value)
    return value
44
+
45
+
46
+ # ------------------------------------------------------------
47
+ # Main rule parser
48
+ # ------------------------------------------------------------
49
def parse_text_rules(text: str) -> dict:
    """
    Extracts structured microbiology fields from text using
    deterministic regex rules + alias mapping.

    Returns {"parsed_fields": {...}, "raw_text": text}; tests that are
    not mentioned are simply absent from "parsed_fields".
    """

    if not text:
        return {"parsed_fields": {}, "raw_text": text}

    t = text.lower()
    parsed = {}

    # ------------------------------------------------------------
    # Gram stain / morphology
    # ------------------------------------------------------------
    if "gram-positive" in t or "gram positive" in t:
        parsed["Gram Stain"] = "Positive"
    elif "gram-negative" in t or "gram negative" in t:
        parsed["Gram Stain"] = "Negative"

    if "cocci" in t:
        parsed["Shape"] = "Cocci"
    elif "bacilli" in t or "rods" in t or "rod" in t:
        parsed["Shape"] = "Rods"

    # ------------------------------------------------------------
    # Enzyme tests
    # ------------------------------------------------------------
    if "catalase positive" in t:
        parsed["Catalase"] = "Positive"
    elif "catalase negative" in t:
        parsed["Catalase"] = "Negative"

    if "oxidase positive" in t:
        parsed["Oxidase"] = "Positive"
    elif "oxidase negative" in t:
        parsed["Oxidase"] = "Negative"

    if "coagulase positive" in t:
        parsed["Coagulase"] = "Positive"
    elif "coagulase negative" in t:
        parsed["Coagulase"] = "Negative"

    if "dnase positive" in t or "dnase+" in t:
        parsed["DNase"] = "Positive"
    elif "dnase negative" in t:
        parsed["DNase"] = "Negative"

    if "urease positive" in t:
        parsed["Urease"] = "Positive"
    elif "urease negative" in t:
        parsed["Urease"] = "Negative"
    elif "urease variable" in t:
        parsed["Urease"] = "Variable"

    # ------------------------------------------------------------
    # Indole, Citrate, VP, MR
    # ------------------------------------------------------------
    if "indole positive" in t:
        parsed["Indole"] = "Positive"
    elif "indole negative" in t:
        parsed["Indole"] = "Negative"

    if "citrate positive" in t:
        parsed["Citrate"] = "Positive"
    elif "citrate negative" in t:
        parsed["Citrate"] = "Negative"

    # Fix: the negative branches now accept the long-form names too,
    # mirroring the positive branches ("voges-proskauer", "methyl red").
    if "vp positive" in t or "voges-proskauer positive" in t:
        parsed["VP"] = "Positive"
    elif "vp negative" in t or "voges-proskauer negative" in t:
        parsed["VP"] = "Negative"

    if "mr positive" in t or "methyl red positive" in t:
        parsed["Methyl Red"] = "Positive"
    elif "mr negative" in t or "methyl red negative" in t:
        parsed["Methyl Red"] = "Negative"

    # ------------------------------------------------------------
    # Fermentation tests
    # ------------------------------------------------------------
    FERMENTS = {
        "glucose": "Glucose Fermentation",
        "lactose": "Lactose Fermentation",
        "sucrose": "Sucrose Fermentation",
        "mannitol": "Mannitol Fermentation",
    }

    for sugar, field in FERMENTS.items():
        if f"ferments {sugar}" in t or f"{sugar} fermentation positive" in t:
            parsed[field] = "Positive"
        if f"does not ferment {sugar}" in t or f"{sugar} fermentation negative" in t:
            parsed[field] = "Negative"

    # ------------------------------------------------------------
    # Haemolysis
    # Fix: alpha/gamma/non previously matched only the British spelling
    # ("-haemolytic") while beta matched both; US spellings added.
    # ------------------------------------------------------------
    if "beta-haemolytic" in t or "beta-hemolytic" in t:
        parsed["Haemolysis Type"] = "Beta"
        parsed["Haemolysis"] = "Positive"
    elif "alpha-haemolytic" in t or "alpha-hemolytic" in t:
        parsed["Haemolysis Type"] = "Alpha"
        parsed["Haemolysis"] = "Positive"
    elif ("gamma-haemolytic" in t or "gamma-hemolytic" in t
          or "non-haemolytic" in t or "non-hemolytic" in t):
        parsed["Haemolysis Type"] = "Gamma"
        parsed["Haemolysis"] = "Negative"

    # ------------------------------------------------------------
    # Media
    # ------------------------------------------------------------
    if "blood agar" in t:
        parsed["Media Grown On"] = "Blood Agar"
    elif "macconkey agar" in t:
        parsed["Media Grown On"] = "MacConkey Agar"
    elif "chocolate agar" in t:
        parsed["Media Grown On"] = "Chocolate Agar"

    # ------------------------------------------------------------
    # Growth temperature extraction: "grows at 37" → point range "37//37"
    # ------------------------------------------------------------
    match_temp = re.search(r"grows at (\d+)", t)
    if match_temp:
        temp = match_temp.group(1)
        parsed["Growth Temperature"] = f"{temp}//{temp}"

    # ------------------------------------------------------------
    # Apply alias mappings
    # ------------------------------------------------------------
    aliased = {field: apply_alias(field, value) for field, value in parsed.items()}

    return {
        "parsed_fields": aliased,
        "raw_text": text,
    }
engine/schema.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# engine/schema.py
# ------------------------------------------------------------
# Canonical field schema for bacteria records, plus helpers to
# normalize raw parser values and validate whole records.
# ------------------------------------------------------------
from typing import Dict, List, Any, Tuple

# Canonical enum values shared by most biochemical test fields.
POS_NEG_VAR = ["Positive", "Negative", "Variable"]
POS_NEG_VAR_UNKNOWN = ["Positive", "Negative", "Variable", "Unknown"]
UNKNOWN = "Unknown"
MULTI_SEPARATOR = ";"

ENUMS = {
    "Gram Stain": ["Positive", "Negative", "Variable"],
    "Shape": ["Cocci", "Rods", "Bacilli", "Spiral", "Short Rods"],
    "Haemolysis Type": ["None", "Beta", "Gamma", "Alpha"],
}

# Shorthand spellings accepted for Positive/Negative/Variable values.
# Consolidates the three hand-written if-chains previously inside
# normalize_value into a single lookup table.
_PNV_ALIASES = {
    "+": "Positive", "pos": "Positive", "positive": "Positive",
    "-": "Negative", "neg": "Negative", "negative": "Negative",
    "v": "Variable", "var": "Variable", "variable": "Variable",
}

SCHEMA: Dict[str, Dict[str, Any]] = {
    "Genus": {"type": "text", "required": True},
    "Species": {"type": "text", "required": False},

    "Gram Stain": {"type": "enum", "allowed": ENUMS["Gram Stain"]},
    "Shape": {"type": "enum", "allowed": ENUMS["Shape"]},
    "Colony Morphology": {"type": "multienum", "separator": MULTI_SEPARATOR},
    "Haemolysis": {"type": "enum", "allowed": POS_NEG_VAR},
    "Haemolysis Type": {"type": "multienum", "separator": MULTI_SEPARATOR, "allowed": ENUMS["Haemolysis Type"]},
    "Motility": {"type": "enum", "allowed": POS_NEG_VAR},
    "Capsule": {"type": "enum", "allowed": POS_NEG_VAR},
    "Spore Formation": {"type": "enum", "allowed": POS_NEG_VAR},

    "Growth Temperature": {"type": "range", "format": "low//high", "units": "°C"},
    "Oxygen Requirement": {"type": "text"},
    "Media Grown On": {"type": "multienum", "separator": MULTI_SEPARATOR},

    "Catalase": {"type": "enum", "allowed": POS_NEG_VAR},
    "Oxidase": {"type": "enum", "allowed": POS_NEG_VAR},
    "Indole": {"type": "enum", "allowed": POS_NEG_VAR},
    "Urease": {"type": "enum", "allowed": POS_NEG_VAR},
    "Citrate": {"type": "enum", "allowed": POS_NEG_VAR},
    "Methyl Red": {"type": "enum", "allowed": POS_NEG_VAR},
    "VP": {"type": "enum", "allowed": POS_NEG_VAR},
    "H2S": {"type": "enum", "allowed": POS_NEG_VAR},
    "DNase": {"type": "enum", "allowed": POS_NEG_VAR},
    "ONPG": {"type": "enum", "allowed": POS_NEG_VAR},
    "Coagulase": {"type": "enum", "allowed": POS_NEG_VAR},
    "Lipase Test": {"type": "enum", "allowed": POS_NEG_VAR},
    "Nitrate Reduction": {"type": "enum", "allowed": POS_NEG_VAR},

    "NaCl Tolerant (>=6%)": {"type": "enum", "allowed": POS_NEG_VAR},

    "Lysine Decarboxylase": {"type": "enum", "allowed": POS_NEG_VAR},
    # NOTE: "Ornitihine" is a typo for "Ornithine", but the key is kept
    # as-is because stored records and parsers reference this spelling.
    "Ornitihine Decarboxylase": {"type": "enum", "allowed": POS_NEG_VAR},
    "Arginine dihydrolase": {"type": "enum", "allowed": POS_NEG_VAR},

    "Gelatin Hydrolysis": {"type": "enum", "allowed": POS_NEG_VAR},
    "Esculin Hydrolysis": {"type": "enum", "allowed": POS_NEG_VAR},

    "Glucose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Lactose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Sucrose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Mannitol Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Sorbitol Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Maltose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Xylose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Rhamnose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Arabinose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Raffinose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Trehalose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Inositol Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},

    "Extra Notes": {"type": "text"},
}

# Stable display/iteration order for all schema fields.
FIELDS_ORDER: List[str] = list(SCHEMA.keys())

# Fields whose value is a separator-joined list of entries.
MULTI_FIELDS: List[str] = [
    k for k, v in SCHEMA.items() if v.get("type") == "multienum"
]

# Plain Positive/Negative/Variable enum fields.
PNV_FIELDS: List[str] = [
    k for k, v in SCHEMA.items()
    if v.get("type") == "enum" and v.get("allowed") == POS_NEG_VAR
]


def is_enum_field(field: str) -> bool:
    """True if *field* is declared as a single-valued enum."""
    return SCHEMA.get(field, {}).get("type") == "enum"


def is_multienum_field(field: str) -> bool:
    """True if *field* is declared as a multi-valued enum."""
    return SCHEMA.get(field, {}).get("type") == "multienum"


def is_range_field(field: str) -> bool:
    """True if *field* is declared as a 'low//high' range."""
    return SCHEMA.get(field, {}).get("type") == "range"


def normalize_value(field: str, value: str) -> str:
    """
    Normalize a raw parser value for *field* to its canonical form.

    - Empty/None/"unknown" inputs become UNKNOWN.
    - Enum fields are case-folded to the allowed spelling; the +/pos/neg
      shorthands are mapped via _PNV_ALIASES when the canonical value is
      allowed for that field.
    - Multienum fields are split on MULTI_SEPARATOR, each part normalized,
      and rejoined as " ; ".
    - Range fields have whitespace stripped (e.g. "20 // 30" -> "20//30").
    - Unrecognized values are returned unchanged for later auditing.
    """
    if value is None or str(value).strip() == "":
        return UNKNOWN
    v = str(value).strip()

    if v.lower() == "unknown":
        return UNKNOWN

    meta = SCHEMA.get(field, {})
    ftype = meta.get("type")

    if ftype == "enum":
        allowed = meta.get("allowed", [])
        # Exact (case-insensitive) match against the allowed spellings.
        for a in allowed:
            if v.lower() == a.lower():
                return a
        # Shorthand aliases (+/pos/neg/var ...), only if valid here.
        canonical = _PNV_ALIASES.get(v.lower())
        if canonical is not None:
            return canonical if canonical in allowed else v
        return v

    if ftype == "multienum":
        parts = [p.strip() for p in v.split(MULTI_SEPARATOR) if p.strip()]
        allowed = meta.get("allowed")
        normed = []
        for p in parts:
            if not allowed:
                normed.append(p)
            else:
                hit = next((a for a in allowed if a.lower() == p.lower()), None)
                normed.append(hit if hit else p)
        return f" {MULTI_SEPARATOR} ".join(normed) if normed else UNKNOWN

    if ftype == "range":
        # "20 // 30" -> "20//30"; numeric validation happens later.
        return v.replace(" ", "")

    return v


def validate_record(rec: Dict[str, Any]) -> Tuple[bool, List[str]]:
    """
    Validate *rec* against SCHEMA.

    Returns (ok, issues): ok is True when no issues were found; issues
    is a list of human-readable problem descriptions. Fields absent
    from *rec* are skipped; UNKNOWN is always accepted.
    """
    issues: List[str] = []
    for field in FIELDS_ORDER:
        meta = SCHEMA[field]
        if field not in rec:
            continue
        val = rec[field]

        if meta["type"] == "enum":
            allowed = meta.get("allowed", [])
            if str(val) not in allowed + [UNKNOWN]:
                issues.append(f"{field}: '{val}' not in {allowed + [UNKNOWN]}")

        elif meta["type"] == "multienum":
            if val == UNKNOWN:
                continue
            parts = [p.strip() for p in str(val).split(MULTI_SEPARATOR) if p.strip()]
            allowed = meta.get("allowed")
            if allowed:
                bad = [p for p in parts if p not in allowed]
                if bad:
                    issues.append(f"{field}: invalid values {bad}; allowed {allowed}")

        elif meta["type"] == "range":
            if val == UNKNOWN:
                continue
            txt = str(val).replace(" ", "")
            if "//" not in txt:
                issues.append(f"{field}: expected 'low//high' got '{val}'")
            else:
                try:
                    low, high = [float(x) for x in txt.split("//")]
                    if low > high:
                        issues.append(f"{field}: low {low} > high {high}")
                except Exception:
                    # Non-numeric bounds, or more than two '//' parts.
                    issues.append(f"{field}: non-numeric bounds '{val}'")

    ok = len(issues) == 0
    return ok, issues


def empty_record() -> Dict[str, str]:
    """Return a record with blank Genus/Species and UNKNOWN elsewhere."""
    return {
        f: "" if f in ("Genus", "Species") else UNKNOWN
        for f in SCHEMA
    }
engine/validator.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# engine/validator.py
# ---------------------------------
# Placeholder for logical validation layer

def validate_record(parsed: dict) -> dict:
    """
    Later: check for contradictions, invalid values,
    and normalize to schema.

    For now, only tags the record with a placeholder note and
    returns it (mutated in place).
    """
    notes = parsed.get("validation_notes")
    if notes is None:
        notes = []
        parsed["validation_notes"] = notes
    notes.append("Validator not yet implemented.")
    return parsed
engine/weights.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# engine/weights.py
# ---------------------------------
# Placeholder for field importance weighting

# No learned weights yet; populated in a future training stage.
DEFAULT_WEIGHTS = {}


def update_weights_from_gold(gold_results):
    """
    Future: adjust field importance weights
    based on gold test accuracy stats.

    Currently a no-op that returns None.
    """
    return None
training/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Marks the 'training' directory as a Python package
2
+
training/alias_trainer.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training/alias_trainer.py
2
+ # ------------------------------------------------------------
3
+ # Stage 10B - Alias Trainer
4
+ #
5
+ # Learns field/value synonyms from gold tests by comparing:
6
+ # - expected values (gold standard)
7
+ # - parsed values (rules + extended)
8
+ #
9
+ # Outputs:
10
+ # - Updated alias_maps.json
11
+ #
12
+ # This is the core intelligence that allows BactAI-D
13
+ # to understand variations in microbiology language.
14
+ # ------------------------------------------------------------
15
+
16
+ import json
17
+ import os
18
+ from collections import defaultdict
19
+
20
+ from engine.parser_rules import parse_text_rules
21
+ from engine.parser_ext import parse_text_extended
22
+
23
+
24
+ GOLD_PATH = "training/gold_tests.json"
25
+ ALIAS_PATH = "data/alias_maps.json"
26
+
27
+
28
def normalise(s):
    """Return the lower-cased, stripped string form of *s* ('' for None)."""
    return "" if s is None else str(s).strip().lower()
32
+
33
+
34
def learn_aliases():
    """
    Learn synonym mappings from gold tests.

    Compares gold-standard expected values against the output of the
    deterministic parsers (rules + extended). Mismatched parsed values
    seen at least twice are recorded in alias_maps.json as aliases
    pointing at the most common expected value for that field.

    Returns a summary dict with the updated aliases, or an error dict
    when the gold test file is missing.
    """
    if not os.path.exists(GOLD_PATH):
        return {"error": f"Gold tests missing: {GOLD_PATH}"}

    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gold = json.load(f)

    # Load the existing alias map, or start fresh.
    if os.path.exists(ALIAS_PATH):
        with open(ALIAS_PATH, "r", encoding="utf-8") as f:
            alias_maps = json.load(f)
    else:
        alias_maps = {}

    # field -> mismatched parsed value -> number of times seen
    suggestions = defaultdict(lambda: defaultdict(int))

    # ------------------------------------------------------------
    # Compare expected vs parsed for all tests
    # ------------------------------------------------------------
    for test in gold:
        text = test.get("input", "")
        expected = test.get("expected", {})

        rules = parse_text_rules(text).get("parsed_fields", {})
        ext = parse_text_extended(text).get("parsed_fields", {})

        # Merge deterministic parsers (extended overrides rules).
        merged = dict(rules)
        for k, v in ext.items():
            if v != "Unknown":
                merged[k] = v

        for field, exp_val in expected.items():
            exp_norm = normalise(exp_val)
            got_norm = normalise(merged.get(field, "Unknown"))

            # Skip correct matches and unknown/empty expectations.
            if exp_norm == got_norm or exp_norm in ["", "unknown"]:
                continue

            # Mismatched parsed value -> candidate alias.
            if got_norm not in ["", "unknown"]:
                suggestions[field][got_norm] += 1

    # ------------------------------------------------------------
    # Convert suggestions into alias mappings
    # ------------------------------------------------------------
    alias_updates = {}

    for field, values in suggestions.items():
        # Canonical target: the most common expected value for this field
        # across all gold tests. Hoisted out of the inner loop so it is
        # computed once per field (previously recomputed for every wrong
        # value). .get() guards tests that lack an "expected" key, which
        # would have raised KeyError before.
        field_values = [
            normalise(t.get("expected", {})[field])
            for t in gold
            if field in t.get("expected", {})
        ]
        canonical = max(set(field_values), key=field_values.count) if field_values else None
        if not canonical:
            continue

        for wrong_value, count in values.items():
            if count < 2:
                continue  # avoid noise from one-off mismatches

            # map wrong_value -> canonical expected value
            alias_maps.setdefault(field, {})[wrong_value] = canonical
            alias_updates[f"{field}:{wrong_value}"] = canonical

    # ------------------------------------------------------------
    # Save alias maps
    # ------------------------------------------------------------
    with open(ALIAS_PATH, "w", encoding="utf-8") as f:
        json.dump(alias_maps, f, indent=2)

    return {
        "ok": True,
        "updated_aliases": alias_updates,
        "total_updates": len(alias_updates),
        "alias_map_path": ALIAS_PATH,
    }
training/gold_tester.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training/gold_tester.py
2
+ # ----------------------------------------------------
3
+ # Enhanced tester: audits expected fields not in schema,
4
+ # adds DNase/Dnase alias and range-aware Growth Temperature matching.
5
+
6
+ import json, os, time, csv
7
+ from collections import Counter
8
+ from typing import Dict, List, Tuple
9
+ from engine.schema import SCHEMA, UNKNOWN, normalize_value, is_enum_field
10
+ from engine.parser_rules import parse_text_rules
11
+
12
+ REPORTS_DIR = "reports"
13
+ PROPOSALS_PATH = os.path.join("data", "extended_proposals.jsonl")
14
+ GOLD_PATH = os.path.join("training", "gold_tests.json")
15
+
16
+ # --- helpers ---
17
def load_gold() -> List[Dict]:
    """Load and return the gold-standard test cases from GOLD_PATH."""
    with open(GOLD_PATH, "r", encoding="utf-8") as fh:
        raw = fh.read()
    return json.loads(raw)
20
+
21
+ def _range_overlap(a: str, b: str) -> bool:
22
+ try:
23
+ la, ha = [float(x) for x in a.split("//")]
24
+ lb, hb = [float(x) for x in b.split("//")]
25
+ return not (ha < lb or hb < la)
26
+ except Exception:
27
+ return False
28
+
29
def compare_records(pred: Dict[str, str], exp: Dict[str, str]) -> Tuple[int, int, Dict[str, Tuple[str, str]]]:
    """
    Compare predicted fields against expected fields.

    Returns (correct, total, errors) where errors maps a field name to
    (predicted, expected). 'Growth Temperature' is matched by range
    overlap rather than exact string equality; everything else must
    match exactly.
    """
    correct = 0
    total = 0
    errors: Dict[str, Tuple[str, str]] = {}
    for field, want in exp.items():
        total += 1
        got = pred.get(field, UNKNOWN)
        if (field == "Growth Temperature"
                and got != UNKNOWN and want != UNKNOWN
                and _range_overlap(got, want)):
            correct += 1
            continue
        if got == want:
            correct += 1
        else:
            errors[field] = (got, want)
    return correct, total, errors
43
+
44
def append_proposal(record: Dict):
    """Append one proposal record as a JSON line to PROPOSALS_PATH."""
    os.makedirs(os.path.dirname(PROPOSALS_PATH), exist_ok=True)
    with open(PROPOSALS_PATH, "a", encoding="utf-8") as fh:
        print(json.dumps(record, ensure_ascii=False), file=fh)
48
+
49
+ # --- main ---
50
def run_gold_tests(mode: str = "rules") -> Dict:
    """
    Run every gold test through the rules parser and score the results.

    For each case this audits parser output against the schema (unknown
    fields/values are appended to the proposals file), flags expected
    fields that are not in the schema, and compares normalized
    predictions to the expected values (range-aware for Growth
    Temperature, via compare_records).

    Writes a JSON summary plus per-field and per-case CSVs under
    REPORTS_DIR and returns {"summary": ..., "paths": ...}.

    Fix: the redundant function-level `import csv` was removed — csv is
    already imported at module top.
    """
    tests = load_gold()
    ts = time.strftime("%Y%m%d_%H%M%S")

    per_field_counts, per_field_correct, per_field_cov = Counter(), Counter(), Counter()
    unknown_fields, unknown_values = Counter(), Counter()
    expected_unknowns = Counter()
    detailed_rows = []
    cases_with_misses = 0

    for case in tests:
        name, text, expected = case.get("name", ""), case.get("input", ""), case.get("expected", {})

        # Normalize expected key aliases (e.g. "Dnase" -> "DNase").
        expected = {("DNase" if k.lower() == "dnase" else k): v for k, v in expected.items()}

        out = parse_text_rules(text)
        parsed = out.get("parsed_fields", {})

        # Normalize parser output; audit fields/values outside the schema.
        normalized_pred = {}
        for field, val in parsed.items():
            if field not in SCHEMA:
                unknown_fields[field] += 1
                append_proposal({
                    "type": "unknown_field",
                    "field": field,
                    "value": val,
                    "case_name": name,
                    "timestamp": ts
                })
                continue
            normalized_pred[field] = normalize_value(field, val)
            if is_enum_field(field):
                allowed = SCHEMA[field].get("allowed", [])
                if normalized_pred[field] not in allowed + [UNKNOWN]:
                    unknown_values[(field, normalized_pred[field])] += 1
                    append_proposal({
                        "type": "unknown_value",
                        "field": field,
                        "value": normalized_pred[field],
                        "allowed": allowed,
                        "case_name": name,
                        "timestamp": ts
                    })

        # Audit expected fields not present in the schema.
        for ef in expected.keys():
            if ef not in SCHEMA:
                expected_unknowns[ef] += 1
                append_proposal({
                    "type": "expected_field_not_in_schema",
                    "field": ef,
                    "case_name": name,
                    "timestamp": ts
                })

        correct, total, errors = compare_records(normalized_pred, expected)
        if errors:
            cases_with_misses += 1

        # Per-field accuracy (no miss recorded) and coverage (non-Unknown
        # prediction present) tallies.
        for f in expected.keys():
            per_field_counts[f] += 1
            if f in normalized_pred and normalized_pred[f] != UNKNOWN:
                per_field_cov[f] += 1
            if f not in errors:
                per_field_correct[f] += 1

        detailed_rows.append({
            "name": name,
            "parsed": json.dumps(normalized_pred, ensure_ascii=False),
            "expected": json.dumps(expected, ensure_ascii=False),
            "correct_fields": correct,
            "total_fields": total
        })

    # --- aggregate metrics ---
    per_field_metrics = []
    for f, tot in per_field_counts.items():
        acc = per_field_correct[f] / tot if tot else 0.0
        cov = per_field_cov[f] / tot if tot else 0.0
        per_field_metrics.append({"field": f, "accuracy": round(acc, 4), "coverage": round(cov, 4), "n": tot})
    per_field_metrics.sort(key=lambda x: x["field"])

    micro_acc = sum(per_field_correct.values()) / sum(per_field_counts.values()) if per_field_counts else 0.0

    os.makedirs(REPORTS_DIR, exist_ok=True)
    report = {
        "mode": mode,
        "timestamp": ts,
        "num_tests": len(tests),
        "micro_accuracy": round(micro_acc, 4),
        "cases_with_misses": cases_with_misses,
        "per_field": per_field_metrics,
        "unknown_fields": dict(unknown_fields),
        "unknown_values": {f"{k[0]}::{k[1]}": v for k, v in unknown_values.items()},
        "expected_unknown_fields": dict(expected_unknowns),
        "proposals_path": PROPOSALS_PATH
    }
    json_path = os.path.join(REPORTS_DIR, f"gold_report_{mode}_{ts}.json")
    csv_fields = os.path.join(REPORTS_DIR, f"gold_fields_{mode}_{ts}.csv")
    csv_cases = os.path.join(REPORTS_DIR, f"gold_cases_{mode}_{ts}.csv")

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    with open(csv_fields, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["field", "accuracy", "coverage", "n"])
        w.writeheader()
        w.writerows(per_field_metrics)
    with open(csv_cases, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["name", "parsed", "expected", "correct_fields", "total_fields"])
        w.writeheader()
        w.writerows(detailed_rows)

    return {"summary": report, "paths": {"json_report": json_path, "csv_fields": csv_fields, "csv_cases": csv_cases}}
training/gold_tests.json ADDED
The diff for this file is too large to render. See raw diff
 
training/gold_trainer.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training/gold_trainer.py
2
+ # ------------------------------------------------------------
3
+ # Master training pipeline for:
4
+ # - Alias Trainer (Stage 10B)
5
+ # - Schema Expansion (Stage 10C)
6
+ # - Extended Signals (Stage 10C)
7
+ # ------------------------------------------------------------
8
+
9
+ from typing import Dict, Any
10
+
11
+ from training.alias_trainer import learn_aliases
12
+
13
# Try importing Stage 10C components, but don't crash if missing.
# Each block attempts the real import and, on any failure, defines a
# same-named stub that reports "ok": False so train_from_gold() can
# still run end-to-end.
try:
    from training.schema_expander import expand_schema
except Exception:
    # Fallback stub used when schema_expander is absent or broken.
    def expand_schema():
        return {
            "ok": False,
            "message": "schema_expander not implemented or import failed (Stage 10C).",
        }

try:
    from training.signal_trainer import train_signals
except Exception:
    # Fallback stub used when signal_trainer is absent or broken.
    def train_signals():
        return {
            "ok": False,
            "message": "signal_trainer not implemented or import failed (Stage 10C).",
        }
31
+
32
+
33
def train_from_gold() -> Dict[str, Any]:
    """
    Runs all training modules on gold tests.
    Currently:
      - Stage 10B: Alias Trainer
      - Stage 10C: Schema Expansion (stub)
      - Stage 10C: Extended Signals (stub)

    Returns a dict with one result entry per training module.
    """
    results: Dict[str, Any] = {}

    # Stage 10B - Alias Trainer
    results["alias_trainer"] = learn_aliases()

    # Stage 10C - Schema Expansion
    try:
        results["schema_expander"] = expand_schema()
    except Exception as exc:
        results["schema_expander"] = {
            "ok": False,
            "error": str(exc),
            "message": "Error while running schema_expander.",
        }

    # Stage 10C - Signals Trainer
    try:
        results["signals_trainer"] = train_signals()
    except Exception as exc:
        results["signals_trainer"] = {
            "ok": False,
            "error": str(exc),
            "message": "Error while running signal_trainer.",
        }

    return results
training/parser_eval.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training/parser_eval.py
2
+ # ------------------------------------------------------------
3
+ # Parser Evaluation (Stage 10A)
4
+ #
5
+ # This version ONLY evaluates:
6
+ # - Rule parser
7
+ # - Extended parser
8
+ #
9
+ # The LLM parser is intentionally disabled at this stage
10
+ # because alias maps and schema are not trained yet.
11
+ #
12
+ # This makes Stage 10A FAST and stable (< 3 seconds).
13
+ # ------------------------------------------------------------
14
+
15
+ import json
16
+ import os
17
+ from typing import Dict, Any
18
+
19
+ from engine.parser_rules import parse_text_rules
20
+ from engine.parser_ext import parse_text_extended
21
+
22
+
23
+ # Path to the gold tests
24
+ GOLD_PATH = "training/gold_tests.json"
25
+
26
+
27
def evaluate_single_test(test: Dict[str, Any]) -> Dict[str, Any]:
    """
    Evaluate one gold test with rules + extended parsers.

    Fields whose expected value is "Unknown" are neutral: they are
    excluded from both `correct` and `total`. (Previously they were
    counted in `total` but could never count as correct, so accuracy
    could never reach 1.0 on cases containing Unknown expectations.)
    """
    text = test.get("input", "")
    expected = test.get("expected", {})

    # Run deterministic parsers.
    rule_out = parse_text_rules(text).get("parsed_fields", {})
    ext_out = parse_text_extended(text).get("parsed_fields", {})

    # Merge rule + extended (extended overwrites rules).
    merged = dict(rule_out)
    for k, v in ext_out.items():
        if v != "Unknown":
            merged[k] = v

    correct = 0
    total = 0
    wrong = {}

    for field, exp_val in expected.items():
        if str(exp_val).lower() == "unknown":
            continue  # neutral: don't score Unknown expectations
        total += 1
        got = merged.get(field, "Unknown")
        # str() guards against non-string values in the gold data.
        if str(got).lower() == str(exp_val).lower():
            correct += 1
        else:
            wrong[field] = {"expected": exp_val, "got": got}

    return {
        "correct": correct,
        "total": total,
        "accuracy": correct / total if total else 0,
        "wrong": wrong,
        "merged": merged,
    }
62
+
63
+
64
def run_parser_eval(mode: str = "rules_extended") -> Dict[str, Any]:
    """
    Evaluate ALL gold tests using rules + extended parsing only.

    Returns an aggregate summary dict, or an error dict when the gold
    test file is missing.
    """
    if not os.path.exists(GOLD_PATH):
        return {"error": f"Gold test file not found at {GOLD_PATH}"}

    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gold = json.load(f)

    results = []
    wrong_cases = []
    total_correct = 0
    total_fields = 0

    for test in gold:
        outcome = evaluate_single_test(test)
        results.append(outcome)
        total_correct += outcome["correct"]
        total_fields += outcome["total"]

        if outcome["wrong"]:
            wrong_cases.append({
                "name": test.get("name", "Unnamed"),
                "wrong": outcome["wrong"],
                "parsed": outcome["merged"],
                "expected": test.get("expected", {}),
            })

    return {
        "mode": "rules+extended",
        "tests": len(gold),
        "total_correct": total_correct,
        "total_fields": total_fields,
        "overall_accuracy": total_correct / total_fields if total_fields else 0,
        "wrong_cases": wrong_cases,
    }
training/repo_sync_hf.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training/repo_sync_hf.py
2
+ # ------------------------------------------------------------
3
+ # Sync updated data files back to the SAME Hugging Face Space
4
+ # repo that the app is running from.
5
+ #
6
+ # Uses:
7
+ # HF_TOKEN -> a write token (set in Space secrets)
8
+ # HF_SPACE_REPO_ID -> e.g. "username/space-name"
9
+ #
10
+ # Call from app.py with:
11
+ # from training.repo_sync_hf import push_updates_to_hf
12
+ # result = push_updates_to_hf([...], commit_message="...")
13
+ # ------------------------------------------------------------
14
+
15
+ import os
16
+ from typing import List, Dict, Any
17
+
18
+ from huggingface_hub import HfApi, CommitOperationAdd
19
+
20
+
21
def push_updates_to_hf(
    paths: List[str],
    commit_message: str = "train: update extended schema, aliases, signals from gold tests",
) -> Dict[str, Any]:
    """
    Create a single commit on the current Space repo with the given files.
    Each path is used both as local path and path_in_repo.

    Requires the HF_SPACE_REPO_ID and HF_TOKEN environment variables.
    Missing local files are skipped and reported under the "skipped"
    key of the result (previously the comment claimed skips were
    recorded, but nothing actually recorded them).
    """
    repo_id = os.getenv("HF_SPACE_REPO_ID")
    token = os.getenv("HF_TOKEN")

    if not repo_id:
        return {
            "ok": False,
            "error": "Missing HF_SPACE_REPO_ID environment variable.",
            "uploaded": [],
            "skipped": [],
        }

    if not token:
        return {
            "ok": False,
            "error": "Missing HF_TOKEN environment variable.",
            "uploaded": [],
            "skipped": [],
        }

    api = HfApi()
    operations = []
    uploaded = []
    skipped = []

    for p in paths:
        if not os.path.exists(p):
            # Record skipped files instead of dropping them silently.
            skipped.append(p)
            continue

        operations.append(
            CommitOperationAdd(path_in_repo=p, path_or_fileobj=p)
        )
        uploaded.append(p)

    if not operations:
        return {
            "ok": False,
            "error": "No existing files to upload.",
            "uploaded": [],
            "skipped": skipped,
        }

    commit_info = api.create_commit(
        repo_id=repo_id,
        repo_type="space",
        operations=operations,
        commit_message=commit_message,
        token=token,
    )

    return {
        "ok": True,
        "uploaded": uploaded,
        "skipped": skipped,
        "repo_id": repo_id,
        "commit_message": commit_message,
        "commit_url": commit_info.commit_url,
    }
training/schema_expander.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# training/schema_expander.py
# ------------------------------------------------------------
# Placeholder for Stage 10C - Schema Expansion
#
# Schema expansion is not implemented yet; this stub keeps the
# alias trainer importable and running without error.
# ------------------------------------------------------------

def expand_schema():
    """Stage 10C stub: report that schema expansion is not implemented."""
    result = {
        "ok": True,
        "message": "Schema expander not implemented yet (Stage 10C placeholder)."
    }
    return result
training/signal_trainer.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# training/signal_trainer.py
# ------------------------------------------------------------
# Placeholder for Stage 10C - Extended Signals Trainer
#
# Real signal training arrives in Stage 10C; for now this stub
# exists only so imports succeed.
# ------------------------------------------------------------

def train_signals():
    """Stage 10C stub: report that signal training is not implemented."""
    result = {
        "ok": True,
        "message": "Signal trainer not implemented yet (Stage 10C placeholder)."
    }
    return result