Upload 23 files
Browse files- .gitattributes +1 -0
- data/alias_maps.json +32 -0
- data/bacteria_db.xlsx +3 -0
- data/extended_schema.json +161 -0
- data/signals_catalog.json +1012 -0
- engine/__init__.py +4 -0
- engine/bacteria_identifier.py +383 -0
- engine/extended_reasoner.py +79 -0
- engine/parser_ext.py +149 -0
- engine/parser_fusion.py +107 -0
- engine/parser_llm.py +137 -0
- engine/parser_rules.py +186 -0
- engine/schema.py +180 -0
- engine/validator.py +12 -0
- engine/weights.py +12 -0
- training/__init__.py +2 -0
- training/alias_trainer.py +126 -0
- training/gold_tester.py +169 -0
- training/gold_tests.json +0 -0
- training/gold_trainer.py +69 -0
- training/parser_eval.py +104 -0
- training/repo_sync_hf.py +82 -0
- training/schema_expander.py +13 -0
- training/signal_trainer.py +13 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/bacteria_db.xlsx filter=lfs diff=lfs merge=lfs -text
|
data/alias_maps.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"field_aliases": {
|
| 3 |
+
"Dnase": "DNase",
|
| 4 |
+
"CAMP Test": "CAMP",
|
| 5 |
+
"Optochin Sensitivity": "Optochin",
|
| 6 |
+
"Bile Solubility Test": "Bile Solubility",
|
| 7 |
+
"Hippurate": "Hippurate Hydrolysis",
|
| 8 |
+
"PYR Test": "PYR"
|
| 9 |
+
},
|
| 10 |
+
"media_aliases": {
|
| 11 |
+
"mac": "MacConkey Agar",
|
| 12 |
+
"macconkey": "MacConkey Agar",
|
| 13 |
+
"msa": "Mannitol Salt Agar",
|
| 14 |
+
"bap": "Blood Agar",
|
| 15 |
+
"choc": "Chocolate Agar",
|
| 16 |
+
"chocolate": "Chocolate Agar",
|
| 17 |
+
"cled": "CLED Agar"
|
| 18 |
+
},
|
| 19 |
+
"value_aliases_pnv": {
|
| 20 |
+
"+": "Positive",
|
| 21 |
+
"pos": "Positive",
|
| 22 |
+
"positive": "Positive",
|
| 23 |
+
"-": "Negative",
|
| 24 |
+
"neg": "Negative",
|
| 25 |
+
"negative": "Negative",
|
| 26 |
+
"variable": "Variable",
|
| 27 |
+
"var": "Variable"
|
| 28 |
+
},
|
| 29 |
+
"Motility": {
|
| 30 |
+
"positive": "positive"
|
| 31 |
+
}
|
| 32 |
+
}
|
data/bacteria_db.xlsx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c19c78ad2851aaa77f55f8f191748defa5e4a0654a11c9e5132f5b086d3c4543
|
| 3 |
+
size 2687947
|
data/extended_schema.json
ADDED
|
@@ -0,0 +1,161 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"CAMP": {
|
| 3 |
+
"value_type": "enum_PNV",
|
| 4 |
+
"status": "experimental",
|
| 5 |
+
"aliases": [
|
| 6 |
+
"CAMP Test"
|
| 7 |
+
]
|
| 8 |
+
},
|
| 9 |
+
"Hippurate Hydrolysis": {
|
| 10 |
+
"value_type": "enum_PNV",
|
| 11 |
+
"status": "experimental",
|
| 12 |
+
"aliases": []
|
| 13 |
+
},
|
| 14 |
+
"PYR": {
|
| 15 |
+
"value_type": "enum_PNV",
|
| 16 |
+
"status": "experimental",
|
| 17 |
+
"aliases": []
|
| 18 |
+
},
|
| 19 |
+
"Optochin": {
|
| 20 |
+
"value_type": "enum_PNV",
|
| 21 |
+
"status": "experimental",
|
| 22 |
+
"aliases": [
|
| 23 |
+
"Optochin Sensitivity"
|
| 24 |
+
]
|
| 25 |
+
},
|
| 26 |
+
"Bile Solubility": {
|
| 27 |
+
"value_type": "enum_PNV",
|
| 28 |
+
"status": "experimental",
|
| 29 |
+
"aliases": []
|
| 30 |
+
},
|
| 31 |
+
"Novobiocin": {
|
| 32 |
+
"value_type": "enum_PNV",
|
| 33 |
+
"status": "experimental",
|
| 34 |
+
"aliases": []
|
| 35 |
+
},
|
| 36 |
+
"Bile Resistance": {
|
| 37 |
+
"value_type": "enum_PNV",
|
| 38 |
+
"status": "experimental",
|
| 39 |
+
"aliases": []
|
| 40 |
+
},
|
| 41 |
+
"Lipase": {
|
| 42 |
+
"value_type": "enum_PNV",
|
| 43 |
+
"status": "experimental",
|
| 44 |
+
"aliases": []
|
| 45 |
+
},
|
| 46 |
+
"Lecithinase": {
|
| 47 |
+
"value_type": "enum_PNV",
|
| 48 |
+
"status": "experimental",
|
| 49 |
+
"aliases": []
|
| 50 |
+
},
|
| 51 |
+
"Odour": {
|
| 52 |
+
"value_type": "enum_PNV",
|
| 53 |
+
"status": "experimental",
|
| 54 |
+
"aliases": []
|
| 55 |
+
},
|
| 56 |
+
"Growth Factors": {
|
| 57 |
+
"value_type": "enum_PNV",
|
| 58 |
+
"status": "experimental",
|
| 59 |
+
"aliases": []
|
| 60 |
+
},
|
| 61 |
+
"Fructose Fermentation": {
|
| 62 |
+
"value_type": "enum_PNV",
|
| 63 |
+
"status": "experimental",
|
| 64 |
+
"aliases": []
|
| 65 |
+
},
|
| 66 |
+
"Glucose Oxidation": {
|
| 67 |
+
"value_type": "enum_PNV",
|
| 68 |
+
"status": "experimental",
|
| 69 |
+
"aliases": []
|
| 70 |
+
},
|
| 71 |
+
"Glycerol Fermentation": {
|
| 72 |
+
"value_type": "enum_PNV",
|
| 73 |
+
"status": "experimental",
|
| 74 |
+
"aliases": []
|
| 75 |
+
},
|
| 76 |
+
"Fermentation Products": {
|
| 77 |
+
"value_type": "enum_PNV",
|
| 78 |
+
"status": "experimental",
|
| 79 |
+
"aliases": []
|
| 80 |
+
},
|
| 81 |
+
"Cellobiose Fermentation": {
|
| 82 |
+
"value_type": "enum_PNV",
|
| 83 |
+
"status": "experimental",
|
| 84 |
+
"aliases": []
|
| 85 |
+
},
|
| 86 |
+
"pH Range": {
|
| 87 |
+
"value_type": "enum_PNV",
|
| 88 |
+
"status": "experimental",
|
| 89 |
+
"aliases": []
|
| 90 |
+
},
|
| 91 |
+
"Iron Oxidation": {
|
| 92 |
+
"value_type": "enum_PNV",
|
| 93 |
+
"status": "experimental",
|
| 94 |
+
"aliases": []
|
| 95 |
+
},
|
| 96 |
+
"NaCl Tolerant (>=15%)": {
|
| 97 |
+
"value_type": "enum_PNV",
|
| 98 |
+
"status": "experimental",
|
| 99 |
+
"aliases": []
|
| 100 |
+
},
|
| 101 |
+
"Temperature Dependence": {
|
| 102 |
+
"value_type": "enum_PNV",
|
| 103 |
+
"status": "experimental",
|
| 104 |
+
"aliases": []
|
| 105 |
+
},
|
| 106 |
+
"Sulfur Utilization": {
|
| 107 |
+
"value_type": "enum_PNV",
|
| 108 |
+
"status": "experimental",
|
| 109 |
+
"aliases": []
|
| 110 |
+
},
|
| 111 |
+
"Acid Fast": {
|
| 112 |
+
"value_type": "enum_PNV",
|
| 113 |
+
"status": "experimental",
|
| 114 |
+
"aliases": []
|
| 115 |
+
},
|
| 116 |
+
"Casein Hydrolysis": {
|
| 117 |
+
"value_type": "enum_PNV",
|
| 118 |
+
"status": "experimental",
|
| 119 |
+
"aliases": []
|
| 120 |
+
},
|
| 121 |
+
"Tyrosine Hydrolysis": {
|
| 122 |
+
"value_type": "enum_PNV",
|
| 123 |
+
"status": "experimental",
|
| 124 |
+
"aliases": []
|
| 125 |
+
},
|
| 126 |
+
"Mannose Fermentation": {
|
| 127 |
+
"value_type": "enum_PNV",
|
| 128 |
+
"status": "experimental",
|
| 129 |
+
"aliases": []
|
| 130 |
+
},
|
| 131 |
+
"Gas Production": {
|
| 132 |
+
"value_type": "enum_PNV",
|
| 133 |
+
"status": "experimental",
|
| 134 |
+
"aliases": []
|
| 135 |
+
},
|
| 136 |
+
"Inulin Fermentation": {
|
| 137 |
+
"value_type": "enum_PNV",
|
| 138 |
+
"status": "experimental",
|
| 139 |
+
"aliases": []
|
| 140 |
+
},
|
| 141 |
+
"Other Products": {
|
| 142 |
+
"value_type": "enum_PNV",
|
| 143 |
+
"status": "experimental",
|
| 144 |
+
"aliases": []
|
| 145 |
+
},
|
| 146 |
+
"Antibiotic Resistance": {
|
| 147 |
+
"value_type": "enum_PNV",
|
| 148 |
+
"status": "experimental",
|
| 149 |
+
"aliases": []
|
| 150 |
+
},
|
| 151 |
+
"Metabolic Product": {
|
| 152 |
+
"value_type": "enum_PNV",
|
| 153 |
+
"status": "experimental",
|
| 154 |
+
"aliases": []
|
| 155 |
+
},
|
| 156 |
+
"Bacitracin": {
|
| 157 |
+
"value_type": "enum_PNV",
|
| 158 |
+
"status": "experimental",
|
| 159 |
+
"aliases": []
|
| 160 |
+
}
|
| 161 |
+
}
|
data/signals_catalog.json
ADDED
|
@@ -0,0 +1,1012 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"Staphylococcus": {
|
| 3 |
+
"Indole": {
|
| 4 |
+
"Positive": 0,
|
| 5 |
+
"Negative": 18,
|
| 6 |
+
"Variable": 0,
|
| 7 |
+
"_n": 18
|
| 8 |
+
},
|
| 9 |
+
"Novobiocin": {
|
| 10 |
+
"Positive": 4,
|
| 11 |
+
"Negative": 2,
|
| 12 |
+
"Variable": 0,
|
| 13 |
+
"_n": 6
|
| 14 |
+
},
|
| 15 |
+
"Antibiotic Resistance": {
|
| 16 |
+
"Positive": 0,
|
| 17 |
+
"Negative": 0,
|
| 18 |
+
"Variable": 0,
|
| 19 |
+
"_n": 0
|
| 20 |
+
},
|
| 21 |
+
"CAMP": {
|
| 22 |
+
"Positive": 0,
|
| 23 |
+
"Negative": 6,
|
| 24 |
+
"Variable": 0,
|
| 25 |
+
"_n": 6
|
| 26 |
+
},
|
| 27 |
+
"PYR": {
|
| 28 |
+
"Positive": 0,
|
| 29 |
+
"Negative": 6,
|
| 30 |
+
"Variable": 0,
|
| 31 |
+
"_n": 6
|
| 32 |
+
},
|
| 33 |
+
"Optochin": {
|
| 34 |
+
"Positive": 0,
|
| 35 |
+
"Negative": 4,
|
| 36 |
+
"Variable": 0,
|
| 37 |
+
"_n": 4
|
| 38 |
+
},
|
| 39 |
+
"Bacitracin": {
|
| 40 |
+
"Positive": 0,
|
| 41 |
+
"Negative": 2,
|
| 42 |
+
"Variable": 0,
|
| 43 |
+
"_n": 2
|
| 44 |
+
}
|
| 45 |
+
},
|
| 46 |
+
"Salmonella": {
|
| 47 |
+
"Indole": {
|
| 48 |
+
"Positive": 0,
|
| 49 |
+
"Negative": 18,
|
| 50 |
+
"Variable": 0,
|
| 51 |
+
"_n": 18
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
"Enterobacter": {
|
| 55 |
+
"Indole": {
|
| 56 |
+
"Positive": 0,
|
| 57 |
+
"Negative": 30,
|
| 58 |
+
"Variable": 0,
|
| 59 |
+
"_n": 30
|
| 60 |
+
}
|
| 61 |
+
},
|
| 62 |
+
"Pseudomonas": {
|
| 63 |
+
"Indole": {
|
| 64 |
+
"Positive": 0,
|
| 65 |
+
"Negative": 18,
|
| 66 |
+
"Variable": 0,
|
| 67 |
+
"_n": 18
|
| 68 |
+
}
|
| 69 |
+
},
|
| 70 |
+
"Bacillus": {
|
| 71 |
+
"Indole": {
|
| 72 |
+
"Positive": 0,
|
| 73 |
+
"Negative": 42,
|
| 74 |
+
"Variable": 0,
|
| 75 |
+
"_n": 42
|
| 76 |
+
},
|
| 77 |
+
"Lecithinase": {
|
| 78 |
+
"Positive": 6,
|
| 79 |
+
"Negative": 0,
|
| 80 |
+
"Variable": 0,
|
| 81 |
+
"_n": 6
|
| 82 |
+
}
|
| 83 |
+
},
|
| 84 |
+
"Shigella": {
|
| 85 |
+
"Indole": {
|
| 86 |
+
"Positive": 6,
|
| 87 |
+
"Negative": 6,
|
| 88 |
+
"Variable": 0,
|
| 89 |
+
"_n": 12
|
| 90 |
+
}
|
| 91 |
+
},
|
| 92 |
+
"Escherichia": {
|
| 93 |
+
"Indole": {
|
| 94 |
+
"Positive": 12,
|
| 95 |
+
"Negative": 0,
|
| 96 |
+
"Variable": 0,
|
| 97 |
+
"_n": 12
|
| 98 |
+
}
|
| 99 |
+
},
|
| 100 |
+
"Klebsiella": {
|
| 101 |
+
"Indole": {
|
| 102 |
+
"Positive": 6,
|
| 103 |
+
"Negative": 12,
|
| 104 |
+
"Variable": 0,
|
| 105 |
+
"_n": 18
|
| 106 |
+
}
|
| 107 |
+
},
|
| 108 |
+
"Proteus": {
|
| 109 |
+
"Indole": {
|
| 110 |
+
"Positive": 6,
|
| 111 |
+
"Negative": 6,
|
| 112 |
+
"Variable": 0,
|
| 113 |
+
"_n": 12
|
| 114 |
+
}
|
| 115 |
+
},
|
| 116 |
+
"Clostridium": {
|
| 117 |
+
"Indole": {
|
| 118 |
+
"Positive": 18,
|
| 119 |
+
"Negative": 36,
|
| 120 |
+
"Variable": 0,
|
| 121 |
+
"_n": 54
|
| 122 |
+
},
|
| 123 |
+
"Lipase": {
|
| 124 |
+
"Positive": 6,
|
| 125 |
+
"Negative": 0,
|
| 126 |
+
"Variable": 0,
|
| 127 |
+
"_n": 6
|
| 128 |
+
},
|
| 129 |
+
"Lecithinase": {
|
| 130 |
+
"Positive": 6,
|
| 131 |
+
"Negative": 0,
|
| 132 |
+
"Variable": 0,
|
| 133 |
+
"_n": 6
|
| 134 |
+
},
|
| 135 |
+
"Odour": {
|
| 136 |
+
"Positive": 0,
|
| 137 |
+
"Negative": 0,
|
| 138 |
+
"Variable": 0,
|
| 139 |
+
"_n": 0
|
| 140 |
+
},
|
| 141 |
+
"Fructose Fermentation": {
|
| 142 |
+
"Positive": 6,
|
| 143 |
+
"Negative": 0,
|
| 144 |
+
"Variable": 0,
|
| 145 |
+
"_n": 6
|
| 146 |
+
}
|
| 147 |
+
},
|
| 148 |
+
"Bacteroides": {
|
| 149 |
+
"Indole": {
|
| 150 |
+
"Positive": 6,
|
| 151 |
+
"Negative": 0,
|
| 152 |
+
"Variable": 6,
|
| 153 |
+
"_n": 12
|
| 154 |
+
},
|
| 155 |
+
"Bile Resistance": {
|
| 156 |
+
"Positive": 6,
|
| 157 |
+
"Negative": 0,
|
| 158 |
+
"Variable": 0,
|
| 159 |
+
"_n": 6
|
| 160 |
+
}
|
| 161 |
+
},
|
| 162 |
+
"Streptococcus": {
|
| 163 |
+
"CAMP": {
|
| 164 |
+
"Positive": 20,
|
| 165 |
+
"Negative": 4,
|
| 166 |
+
"Variable": 0,
|
| 167 |
+
"_n": 24
|
| 168 |
+
},
|
| 169 |
+
"Hippurate Hydrolysis": {
|
| 170 |
+
"Positive": 12,
|
| 171 |
+
"Negative": 0,
|
| 172 |
+
"Variable": 0,
|
| 173 |
+
"_n": 12
|
| 174 |
+
},
|
| 175 |
+
"PYR": {
|
| 176 |
+
"Positive": 2,
|
| 177 |
+
"Negative": 16,
|
| 178 |
+
"Variable": 0,
|
| 179 |
+
"_n": 18
|
| 180 |
+
},
|
| 181 |
+
"Optochin": {
|
| 182 |
+
"Positive": 8,
|
| 183 |
+
"Negative": 4,
|
| 184 |
+
"Variable": 0,
|
| 185 |
+
"_n": 12
|
| 186 |
+
},
|
| 187 |
+
"Bile Solubility": {
|
| 188 |
+
"Positive": 14,
|
| 189 |
+
"Negative": 0,
|
| 190 |
+
"Variable": 0,
|
| 191 |
+
"_n": 14
|
| 192 |
+
},
|
| 193 |
+
"Inulin Fermentation": {
|
| 194 |
+
"Positive": 6,
|
| 195 |
+
"Negative": 0,
|
| 196 |
+
"Variable": 0,
|
| 197 |
+
"_n": 6
|
| 198 |
+
},
|
| 199 |
+
"Metabolic Product": {
|
| 200 |
+
"Positive": 0,
|
| 201 |
+
"Negative": 0,
|
| 202 |
+
"Variable": 0,
|
| 203 |
+
"_n": 0
|
| 204 |
+
},
|
| 205 |
+
"Bacitracin": {
|
| 206 |
+
"Positive": 2,
|
| 207 |
+
"Negative": 2,
|
| 208 |
+
"Variable": 0,
|
| 209 |
+
"_n": 4
|
| 210 |
+
}
|
| 211 |
+
},
|
| 212 |
+
"Aeromonas": {
|
| 213 |
+
"Indole": {
|
| 214 |
+
"Positive": 48,
|
| 215 |
+
"Negative": 0,
|
| 216 |
+
"Variable": 0,
|
| 217 |
+
"_n": 48
|
| 218 |
+
},
|
| 219 |
+
"Gas Production": {
|
| 220 |
+
"Positive": 6,
|
| 221 |
+
"Negative": 0,
|
| 222 |
+
"Variable": 0,
|
| 223 |
+
"_n": 6
|
| 224 |
+
}
|
| 225 |
+
},
|
| 226 |
+
"Yersinia": {
|
| 227 |
+
"Indole": {
|
| 228 |
+
"Positive": 6,
|
| 229 |
+
"Negative": 0,
|
| 230 |
+
"Variable": 18,
|
| 231 |
+
"_n": 24
|
| 232 |
+
}
|
| 233 |
+
},
|
| 234 |
+
"Morganella": {
|
| 235 |
+
"Indole": {
|
| 236 |
+
"Positive": 30,
|
| 237 |
+
"Negative": 0,
|
| 238 |
+
"Variable": 0,
|
| 239 |
+
"_n": 30
|
| 240 |
+
}
|
| 241 |
+
},
|
| 242 |
+
"Providencia": {
|
| 243 |
+
"Indole": {
|
| 244 |
+
"Positive": 30,
|
| 245 |
+
"Negative": 6,
|
| 246 |
+
"Variable": 0,
|
| 247 |
+
"_n": 36
|
| 248 |
+
}
|
| 249 |
+
},
|
| 250 |
+
"Pasteurella": {
|
| 251 |
+
"Indole": {
|
| 252 |
+
"Positive": 30,
|
| 253 |
+
"Negative": 0,
|
| 254 |
+
"Variable": 0,
|
| 255 |
+
"_n": 30
|
| 256 |
+
}
|
| 257 |
+
},
|
| 258 |
+
"Citrobacter": {
|
| 259 |
+
"Indole": {
|
| 260 |
+
"Positive": 6,
|
| 261 |
+
"Negative": 0,
|
| 262 |
+
"Variable": 12,
|
| 263 |
+
"_n": 18
|
| 264 |
+
}
|
| 265 |
+
},
|
| 266 |
+
"Campylobacter": {
|
| 267 |
+
"Indole": {
|
| 268 |
+
"Positive": 0,
|
| 269 |
+
"Negative": 18,
|
| 270 |
+
"Variable": 0,
|
| 271 |
+
"_n": 18
|
| 272 |
+
}
|
| 273 |
+
},
|
| 274 |
+
"Vibrio": {
|
| 275 |
+
"Indole": {
|
| 276 |
+
"Positive": 42,
|
| 277 |
+
"Negative": 0,
|
| 278 |
+
"Variable": 0,
|
| 279 |
+
"_n": 42
|
| 280 |
+
}
|
| 281 |
+
},
|
| 282 |
+
"Burkholderia": {
|
| 283 |
+
"Indole": {
|
| 284 |
+
"Positive": 0,
|
| 285 |
+
"Negative": 30,
|
| 286 |
+
"Variable": 0,
|
| 287 |
+
"_n": 30
|
| 288 |
+
},
|
| 289 |
+
"Odour": {
|
| 290 |
+
"Positive": 0,
|
| 291 |
+
"Negative": 0,
|
| 292 |
+
"Variable": 0,
|
| 293 |
+
"_n": 0
|
| 294 |
+
},
|
| 295 |
+
"Glucose Oxidation": {
|
| 296 |
+
"Positive": 6,
|
| 297 |
+
"Negative": 0,
|
| 298 |
+
"Variable": 0,
|
| 299 |
+
"_n": 6
|
| 300 |
+
}
|
| 301 |
+
},
|
| 302 |
+
"Legionella": {
|
| 303 |
+
"Indole": {
|
| 304 |
+
"Positive": 0,
|
| 305 |
+
"Negative": 6,
|
| 306 |
+
"Variable": 0,
|
| 307 |
+
"_n": 6
|
| 308 |
+
}
|
| 309 |
+
},
|
| 310 |
+
"Helicobacter": {
|
| 311 |
+
"Indole": {
|
| 312 |
+
"Positive": 0,
|
| 313 |
+
"Negative": 6,
|
| 314 |
+
"Variable": 0,
|
| 315 |
+
"_n": 6
|
| 316 |
+
}
|
| 317 |
+
},
|
| 318 |
+
"Leptospira": {
|
| 319 |
+
"Indole": {
|
| 320 |
+
"Positive": 0,
|
| 321 |
+
"Negative": 6,
|
| 322 |
+
"Variable": 0,
|
| 323 |
+
"_n": 6
|
| 324 |
+
}
|
| 325 |
+
},
|
| 326 |
+
"Serratia": {
|
| 327 |
+
"Indole": {
|
| 328 |
+
"Positive": 0,
|
| 329 |
+
"Negative": 30,
|
| 330 |
+
"Variable": 0,
|
| 331 |
+
"_n": 30
|
| 332 |
+
},
|
| 333 |
+
"Temperature Dependence": {
|
| 334 |
+
"Positive": 0,
|
| 335 |
+
"Negative": 0,
|
| 336 |
+
"Variable": 0,
|
| 337 |
+
"_n": 0
|
| 338 |
+
}
|
| 339 |
+
},
|
| 340 |
+
"Alcaligenes": {
|
| 341 |
+
"Odour": {
|
| 342 |
+
"Positive": 0,
|
| 343 |
+
"Negative": 0,
|
| 344 |
+
"Variable": 0,
|
| 345 |
+
"_n": 0
|
| 346 |
+
},
|
| 347 |
+
"Indole": {
|
| 348 |
+
"Positive": 0,
|
| 349 |
+
"Negative": 12,
|
| 350 |
+
"Variable": 0,
|
| 351 |
+
"_n": 12
|
| 352 |
+
}
|
| 353 |
+
},
|
| 354 |
+
"Shewanella": {
|
| 355 |
+
"Indole": {
|
| 356 |
+
"Positive": 0,
|
| 357 |
+
"Negative": 24,
|
| 358 |
+
"Variable": 0,
|
| 359 |
+
"_n": 24
|
| 360 |
+
}
|
| 361 |
+
},
|
| 362 |
+
"Acinetobacter": {
|
| 363 |
+
"Indole": {
|
| 364 |
+
"Positive": 0,
|
| 365 |
+
"Negative": 36,
|
| 366 |
+
"Variable": 0,
|
| 367 |
+
"_n": 36
|
| 368 |
+
}
|
| 369 |
+
},
|
| 370 |
+
"Haemophilus": {
|
| 371 |
+
"Growth Factors": {
|
| 372 |
+
"Positive": 0,
|
| 373 |
+
"Negative": 0,
|
| 374 |
+
"Variable": 0,
|
| 375 |
+
"_n": 0
|
| 376 |
+
},
|
| 377 |
+
"Indole": {
|
| 378 |
+
"Positive": 6,
|
| 379 |
+
"Negative": 0,
|
| 380 |
+
"Variable": 12,
|
| 381 |
+
"_n": 18
|
| 382 |
+
}
|
| 383 |
+
},
|
| 384 |
+
"Micrococcus": {
|
| 385 |
+
"Glucose Oxidation": {
|
| 386 |
+
"Positive": 12,
|
| 387 |
+
"Negative": 0,
|
| 388 |
+
"Variable": 0,
|
| 389 |
+
"_n": 12
|
| 390 |
+
},
|
| 391 |
+
"Indole": {
|
| 392 |
+
"Positive": 0,
|
| 393 |
+
"Negative": 18,
|
| 394 |
+
"Variable": 0,
|
| 395 |
+
"_n": 18
|
| 396 |
+
}
|
| 397 |
+
},
|
| 398 |
+
"Edwardsiella": {
|
| 399 |
+
"Indole": {
|
| 400 |
+
"Positive": 18,
|
| 401 |
+
"Negative": 6,
|
| 402 |
+
"Variable": 6,
|
| 403 |
+
"_n": 30
|
| 404 |
+
}
|
| 405 |
+
},
|
| 406 |
+
"Chromobacterium": {
|
| 407 |
+
"Indole": {
|
| 408 |
+
"Positive": 0,
|
| 409 |
+
"Negative": 12,
|
| 410 |
+
"Variable": 12,
|
| 411 |
+
"_n": 24
|
| 412 |
+
}
|
| 413 |
+
},
|
| 414 |
+
"Lactobacillus": {
|
| 415 |
+
"Indole": {
|
| 416 |
+
"Positive": 0,
|
| 417 |
+
"Negative": 18,
|
| 418 |
+
"Variable": 0,
|
| 419 |
+
"_n": 18
|
| 420 |
+
},
|
| 421 |
+
"pH Range": {
|
| 422 |
+
"Positive": 0,
|
| 423 |
+
"Negative": 0,
|
| 424 |
+
"Variable": 0,
|
| 425 |
+
"_n": 0
|
| 426 |
+
},
|
| 427 |
+
"Fermentation Product": {
|
| 428 |
+
"Positive": 0,
|
| 429 |
+
"Negative": 0,
|
| 430 |
+
"Variable": 0,
|
| 431 |
+
"_n": 0
|
| 432 |
+
}
|
| 433 |
+
},
|
| 434 |
+
"Corynebacterium": {
|
| 435 |
+
"Indole": {
|
| 436 |
+
"Positive": 0,
|
| 437 |
+
"Negative": 18,
|
| 438 |
+
"Variable": 0,
|
| 439 |
+
"_n": 18
|
| 440 |
+
}
|
| 441 |
+
},
|
| 442 |
+
"Nocardia": {
|
| 443 |
+
"Indole": {
|
| 444 |
+
"Positive": 0,
|
| 445 |
+
"Negative": 18,
|
| 446 |
+
"Variable": 0,
|
| 447 |
+
"_n": 18
|
| 448 |
+
},
|
| 449 |
+
"Acid Fast": {
|
| 450 |
+
"Positive": 0,
|
| 451 |
+
"Negative": 0,
|
| 452 |
+
"Variable": 0,
|
| 453 |
+
"_n": 0
|
| 454 |
+
},
|
| 455 |
+
"Casein Hydrolysis": {
|
| 456 |
+
"Positive": 6,
|
| 457 |
+
"Negative": 0,
|
| 458 |
+
"Variable": 0,
|
| 459 |
+
"_n": 6
|
| 460 |
+
},
|
| 461 |
+
"Tyrosine Hydrolysis": {
|
| 462 |
+
"Positive": 6,
|
| 463 |
+
"Negative": 0,
|
| 464 |
+
"Variable": 0,
|
| 465 |
+
"_n": 6
|
| 466 |
+
}
|
| 467 |
+
},
|
| 468 |
+
"Propionibacterium": {
|
| 469 |
+
"Indole": {
|
| 470 |
+
"Positive": 18,
|
| 471 |
+
"Negative": 0,
|
| 472 |
+
"Variable": 0,
|
| 473 |
+
"_n": 18
|
| 474 |
+
},
|
| 475 |
+
"Glycerol Fermentation": {
|
| 476 |
+
"Positive": 6,
|
| 477 |
+
"Negative": 0,
|
| 478 |
+
"Variable": 0,
|
| 479 |
+
"_n": 6
|
| 480 |
+
},
|
| 481 |
+
"Mannose Fermentation": {
|
| 482 |
+
"Positive": 6,
|
| 483 |
+
"Negative": 0,
|
| 484 |
+
"Variable": 0,
|
| 485 |
+
"_n": 6
|
| 486 |
+
},
|
| 487 |
+
"Other Products": {
|
| 488 |
+
"Positive": 0,
|
| 489 |
+
"Negative": 0,
|
| 490 |
+
"Variable": 0,
|
| 491 |
+
"_n": 0
|
| 492 |
+
}
|
| 493 |
+
},
|
| 494 |
+
"Peptostreptococcus": {
|
| 495 |
+
"Indole": {
|
| 496 |
+
"Positive": 0,
|
| 497 |
+
"Negative": 12,
|
| 498 |
+
"Variable": 0,
|
| 499 |
+
"_n": 12
|
| 500 |
+
}
|
| 501 |
+
},
|
| 502 |
+
"Veillonella": {
|
| 503 |
+
"Indole": {
|
| 504 |
+
"Positive": 0,
|
| 505 |
+
"Negative": 6,
|
| 506 |
+
"Variable": 0,
|
| 507 |
+
"_n": 6
|
| 508 |
+
}
|
| 509 |
+
},
|
| 510 |
+
"Fusobacterium": {
|
| 511 |
+
"Odour": {
|
| 512 |
+
"Positive": 0,
|
| 513 |
+
"Negative": 0,
|
| 514 |
+
"Variable": 0,
|
| 515 |
+
"_n": 0
|
| 516 |
+
},
|
| 517 |
+
"Indole": {
|
| 518 |
+
"Positive": 12,
|
| 519 |
+
"Negative": 0,
|
| 520 |
+
"Variable": 0,
|
| 521 |
+
"_n": 12
|
| 522 |
+
}
|
| 523 |
+
},
|
| 524 |
+
"Eubacterium": {
|
| 525 |
+
"Fermentation Products": {
|
| 526 |
+
"Positive": 0,
|
| 527 |
+
"Negative": 0,
|
| 528 |
+
"Variable": 0,
|
| 529 |
+
"_n": 0
|
| 530 |
+
},
|
| 531 |
+
"Cellobiose Fermentation": {
|
| 532 |
+
"Positive": 6,
|
| 533 |
+
"Negative": 0,
|
| 534 |
+
"Variable": 0,
|
| 535 |
+
"_n": 6
|
| 536 |
+
},
|
| 537 |
+
"Indole": {
|
| 538 |
+
"Positive": 0,
|
| 539 |
+
"Negative": 6,
|
| 540 |
+
"Variable": 0,
|
| 541 |
+
"_n": 6
|
| 542 |
+
}
|
| 543 |
+
},
|
| 544 |
+
"Halomonas": {
|
| 545 |
+
"Indole": {
|
| 546 |
+
"Positive": 0,
|
| 547 |
+
"Negative": 18,
|
| 548 |
+
"Variable": 0,
|
| 549 |
+
"_n": 18
|
| 550 |
+
},
|
| 551 |
+
"NaCl Tolerant (>=10%)": {
|
| 552 |
+
"Positive": 6,
|
| 553 |
+
"Negative": 0,
|
| 554 |
+
"Variable": 0,
|
| 555 |
+
"_n": 6
|
| 556 |
+
}
|
| 557 |
+
},
|
| 558 |
+
"Psychrobacter": {
|
| 559 |
+
"Indole": {
|
| 560 |
+
"Positive": 0,
|
| 561 |
+
"Negative": 12,
|
| 562 |
+
"Variable": 0,
|
| 563 |
+
"_n": 12
|
| 564 |
+
}
|
| 565 |
+
},
|
| 566 |
+
"Deinococcus": {
|
| 567 |
+
"Indole": {
|
| 568 |
+
"Positive": 0,
|
| 569 |
+
"Negative": 6,
|
| 570 |
+
"Variable": 0,
|
| 571 |
+
"_n": 6
|
| 572 |
+
}
|
| 573 |
+
},
|
| 574 |
+
"Thermus": {
|
| 575 |
+
"Indole": {
|
| 576 |
+
"Positive": 0,
|
| 577 |
+
"Negative": 12,
|
| 578 |
+
"Variable": 0,
|
| 579 |
+
"_n": 12
|
| 580 |
+
}
|
| 581 |
+
},
|
| 582 |
+
"Acidithiobacillus": {
|
| 583 |
+
"pH Range": {
|
| 584 |
+
"Positive": 0,
|
| 585 |
+
"Negative": 0,
|
| 586 |
+
"Variable": 0,
|
| 587 |
+
"_n": 0
|
| 588 |
+
},
|
| 589 |
+
"Indole": {
|
| 590 |
+
"Positive": 0,
|
| 591 |
+
"Negative": 6,
|
| 592 |
+
"Variable": 0,
|
| 593 |
+
"_n": 6
|
| 594 |
+
},
|
| 595 |
+
"Iron Oxidation": {
|
| 596 |
+
"Positive": 6,
|
| 597 |
+
"Negative": 0,
|
| 598 |
+
"Variable": 0,
|
| 599 |
+
"_n": 6
|
| 600 |
+
}
|
| 601 |
+
},
|
| 602 |
+
"Mycoplasma": {
|
| 603 |
+
"Arginine": {
|
| 604 |
+
"Positive": 6,
|
| 605 |
+
"Negative": 0,
|
| 606 |
+
"Variable": 0,
|
| 607 |
+
"_n": 6
|
| 608 |
+
},
|
| 609 |
+
"Arginine Hydrolysis": {
|
| 610 |
+
"Positive": 6,
|
| 611 |
+
"Negative": 0,
|
| 612 |
+
"Variable": 0,
|
| 613 |
+
"_n": 6
|
| 614 |
+
}
|
| 615 |
+
},
|
| 616 |
+
"Bordetella": {
|
| 617 |
+
"Growth Factors": {
|
| 618 |
+
"Positive": 0,
|
| 619 |
+
"Negative": 0,
|
| 620 |
+
"Variable": 0,
|
| 621 |
+
"_n": 0
|
| 622 |
+
},
|
| 623 |
+
"Indole": {
|
| 624 |
+
"Positive": 0,
|
| 625 |
+
"Negative": 6,
|
| 626 |
+
"Variable": 0,
|
| 627 |
+
"_n": 6
|
| 628 |
+
}
|
| 629 |
+
},
|
| 630 |
+
"Stenotrophomonas": {
|
| 631 |
+
"Indole": {
|
| 632 |
+
"Positive": 0,
|
| 633 |
+
"Negative": 24,
|
| 634 |
+
"Variable": 0,
|
| 635 |
+
"_n": 24
|
| 636 |
+
}
|
| 637 |
+
},
|
| 638 |
+
"Ralstonia": {
|
| 639 |
+
"Indole": {
|
| 640 |
+
"Positive": 0,
|
| 641 |
+
"Negative": 12,
|
| 642 |
+
"Variable": 0,
|
| 643 |
+
"_n": 12
|
| 644 |
+
}
|
| 645 |
+
},
|
| 646 |
+
"Achromobacter": {
|
| 647 |
+
"Indole": {
|
| 648 |
+
"Positive": 0,
|
| 649 |
+
"Negative": 6,
|
| 650 |
+
"Variable": 0,
|
| 651 |
+
"_n": 6
|
| 652 |
+
}
|
| 653 |
+
},
|
| 654 |
+
"Brucella": {
|
| 655 |
+
"Indole": {
|
| 656 |
+
"Positive": 0,
|
| 657 |
+
"Negative": 12,
|
| 658 |
+
"Variable": 0,
|
| 659 |
+
"_n": 12
|
| 660 |
+
}
|
| 661 |
+
},
|
| 662 |
+
"Brevundimonas": {
|
| 663 |
+
"Indole": {
|
| 664 |
+
"Positive": 0,
|
| 665 |
+
"Negative": 12,
|
| 666 |
+
"Variable": 0,
|
| 667 |
+
"_n": 12
|
| 668 |
+
}
|
| 669 |
+
},
|
| 670 |
+
"Arthrobacter": {
|
| 671 |
+
"Indole": {
|
| 672 |
+
"Positive": 0,
|
| 673 |
+
"Negative": 6,
|
| 674 |
+
"Variable": 0,
|
| 675 |
+
"_n": 6
|
| 676 |
+
},
|
| 677 |
+
"Glucose Oxidation": {
|
| 678 |
+
"Positive": 6,
|
| 679 |
+
"Negative": 0,
|
| 680 |
+
"Variable": 0,
|
| 681 |
+
"_n": 6
|
| 682 |
+
}
|
| 683 |
+
},
|
| 684 |
+
"Cytophaga": {
|
| 685 |
+
"Indole": {
|
| 686 |
+
"Positive": 0,
|
| 687 |
+
"Negative": 6,
|
| 688 |
+
"Variable": 0,
|
| 689 |
+
"_n": 6
|
| 690 |
+
}
|
| 691 |
+
},
|
| 692 |
+
"Flavobacterium": {
|
| 693 |
+
"Indole": {
|
| 694 |
+
"Positive": 0,
|
| 695 |
+
"Negative": 12,
|
| 696 |
+
"Variable": 0,
|
| 697 |
+
"_n": 12
|
| 698 |
+
}
|
| 699 |
+
},
|
| 700 |
+
"Oerskovia": {
|
| 701 |
+
"Indole": {
|
| 702 |
+
"Positive": 0,
|
| 703 |
+
"Negative": 6,
|
| 704 |
+
"Variable": 0,
|
| 705 |
+
"_n": 6
|
| 706 |
+
}
|
| 707 |
+
},
|
| 708 |
+
"Sphingomonas": {
|
| 709 |
+
"Indole": {
|
| 710 |
+
"Positive": 0,
|
| 711 |
+
"Negative": 12,
|
| 712 |
+
"Variable": 0,
|
| 713 |
+
"_n": 12
|
| 714 |
+
},
|
| 715 |
+
"Glucose Oxidation": {
|
| 716 |
+
"Positive": 6,
|
| 717 |
+
"Negative": 0,
|
| 718 |
+
"Variable": 0,
|
| 719 |
+
"_n": 6
|
| 720 |
+
}
|
| 721 |
+
},
|
| 722 |
+
"Comamonas": {
|
| 723 |
+
"Indole": {
|
| 724 |
+
"Positive": 0,
|
| 725 |
+
"Negative": 12,
|
| 726 |
+
"Variable": 0,
|
| 727 |
+
"_n": 12
|
| 728 |
+
}
|
| 729 |
+
},
|
| 730 |
+
"Halobacterium": {
|
| 731 |
+
"NaCl Tolerant (>=15%)": {
|
| 732 |
+
"Positive": 6,
|
| 733 |
+
"Negative": 0,
|
| 734 |
+
"Variable": 0,
|
| 735 |
+
"_n": 6
|
| 736 |
+
},
|
| 737 |
+
"Indole": {
|
| 738 |
+
"Positive": 0,
|
| 739 |
+
"Negative": 6,
|
| 740 |
+
"Variable": 0,
|
| 741 |
+
"_n": 6
|
| 742 |
+
}
|
| 743 |
+
},
|
| 744 |
+
"Thermococcus": {
|
| 745 |
+
"Sulfur Utilization": {
|
| 746 |
+
"Positive": 6,
|
| 747 |
+
"Negative": 0,
|
| 748 |
+
"Variable": 0,
|
| 749 |
+
"_n": 6
|
| 750 |
+
}
|
| 751 |
+
},
|
| 752 |
+
"Actinomyces": {
|
| 753 |
+
"Indole": {
|
| 754 |
+
"Positive": 0,
|
| 755 |
+
"Negative": 12,
|
| 756 |
+
"Variable": 0,
|
| 757 |
+
"_n": 12
|
| 758 |
+
}
|
| 759 |
+
},
|
| 760 |
+
"Elizabethkingia": {
|
| 761 |
+
"Indole": {
|
| 762 |
+
"Positive": 6,
|
| 763 |
+
"Negative": 0,
|
| 764 |
+
"Variable": 0,
|
| 765 |
+
"_n": 6
|
| 766 |
+
}
|
| 767 |
+
},
|
| 768 |
+
"Hafnia": {
|
| 769 |
+
"Indole": {
|
| 770 |
+
"Positive": 0,
|
| 771 |
+
"Negative": 6,
|
| 772 |
+
"Variable": 0,
|
| 773 |
+
"_n": 6
|
| 774 |
+
}
|
| 775 |
+
},
|
| 776 |
+
"Photobacterium": {
|
| 777 |
+
"Indole": {
|
| 778 |
+
"Positive": 12,
|
| 779 |
+
"Negative": 0,
|
| 780 |
+
"Variable": 0,
|
| 781 |
+
"_n": 12
|
| 782 |
+
}
|
| 783 |
+
},
|
| 784 |
+
"Pantoea": {
|
| 785 |
+
"Indole": {
|
| 786 |
+
"Positive": 0,
|
| 787 |
+
"Negative": 6,
|
| 788 |
+
"Variable": 0,
|
| 789 |
+
"_n": 6
|
| 790 |
+
}
|
| 791 |
+
},
|
| 792 |
+
"Raoultella": {
|
| 793 |
+
"Indole": {
|
| 794 |
+
"Positive": 0,
|
| 795 |
+
"Negative": 0,
|
| 796 |
+
"Variable": 6,
|
| 797 |
+
"_n": 6
|
| 798 |
+
}
|
| 799 |
+
},
|
| 800 |
+
"Ochrobactrum": {
|
| 801 |
+
"Indole": {
|
| 802 |
+
"Positive": 0,
|
| 803 |
+
"Negative": 6,
|
| 804 |
+
"Variable": 0,
|
| 805 |
+
"_n": 6
|
| 806 |
+
}
|
| 807 |
+
},
|
| 808 |
+
"Roseomonas": {
|
| 809 |
+
"Indole": {
|
| 810 |
+
"Positive": 0,
|
| 811 |
+
"Negative": 6,
|
| 812 |
+
"Variable": 0,
|
| 813 |
+
"_n": 6
|
| 814 |
+
}
|
| 815 |
+
},
|
| 816 |
+
"Actinobacillus": {
|
| 817 |
+
"Indole": {
|
| 818 |
+
"Positive": 0,
|
| 819 |
+
"Negative": 6,
|
| 820 |
+
"Variable": 0,
|
| 821 |
+
"_n": 6
|
| 822 |
+
}
|
| 823 |
+
},
|
| 824 |
+
"Gemella": {
|
| 825 |
+
"Indole": {
|
| 826 |
+
"Positive": 0,
|
| 827 |
+
"Negative": 12,
|
| 828 |
+
"Variable": 0,
|
| 829 |
+
"_n": 12
|
| 830 |
+
}
|
| 831 |
+
},
|
| 832 |
+
"Rothia": {
|
| 833 |
+
"Indole": {
|
| 834 |
+
"Positive": 0,
|
| 835 |
+
"Negative": 12,
|
| 836 |
+
"Variable": 0,
|
| 837 |
+
"_n": 12
|
| 838 |
+
}
|
| 839 |
+
},
|
| 840 |
+
"Listeria": {
|
| 841 |
+
"Indole": {
|
| 842 |
+
"Positive": 0,
|
| 843 |
+
"Negative": 6,
|
| 844 |
+
"Variable": 0,
|
| 845 |
+
"_n": 6
|
| 846 |
+
},
|
| 847 |
+
"CAMP": {
|
| 848 |
+
"Positive": 2,
|
| 849 |
+
"Negative": 0,
|
| 850 |
+
"Variable": 0,
|
| 851 |
+
"_n": 2
|
| 852 |
+
}
|
| 853 |
+
},
|
| 854 |
+
"Carnobacterium": {
|
| 855 |
+
"Indole": {
|
| 856 |
+
"Positive": 0,
|
| 857 |
+
"Negative": 6,
|
| 858 |
+
"Variable": 0,
|
| 859 |
+
"_n": 6
|
| 860 |
+
}
|
| 861 |
+
},
|
| 862 |
+
"Plesiomonas": {
|
| 863 |
+
"Indole": {
|
| 864 |
+
"Positive": 6,
|
| 865 |
+
"Negative": 0,
|
| 866 |
+
"Variable": 0,
|
| 867 |
+
"_n": 6
|
| 868 |
+
}
|
| 869 |
+
},
|
| 870 |
+
"Janthinobacterium": {
|
| 871 |
+
"Indole": {
|
| 872 |
+
"Positive": 0,
|
| 873 |
+
"Negative": 6,
|
| 874 |
+
"Variable": 0,
|
| 875 |
+
"_n": 6
|
| 876 |
+
}
|
| 877 |
+
},
|
| 878 |
+
"Paenibacillus": {
|
| 879 |
+
"Indole": {
|
| 880 |
+
"Positive": 0,
|
| 881 |
+
"Negative": 6,
|
| 882 |
+
"Variable": 0,
|
| 883 |
+
"_n": 6
|
| 884 |
+
}
|
| 885 |
+
},
|
| 886 |
+
"Moraxella": {
|
| 887 |
+
"Indole": {
|
| 888 |
+
"Positive": 0,
|
| 889 |
+
"Negative": 6,
|
| 890 |
+
"Variable": 0,
|
| 891 |
+
"_n": 6
|
| 892 |
+
}
|
| 893 |
+
},
|
| 894 |
+
"Aerococcus": {
|
| 895 |
+
"Indole": {
|
| 896 |
+
"Positive": 0,
|
| 897 |
+
"Negative": 6,
|
| 898 |
+
"Variable": 0,
|
| 899 |
+
"_n": 6
|
| 900 |
+
}
|
| 901 |
+
},
|
| 902 |
+
"Kocuria": {
|
| 903 |
+
"Indole": {
|
| 904 |
+
"Positive": 0,
|
| 905 |
+
"Negative": 6,
|
| 906 |
+
"Variable": 0,
|
| 907 |
+
"_n": 6
|
| 908 |
+
}
|
| 909 |
+
},
|
| 910 |
+
"Leuconostoc": {
|
| 911 |
+
"Indole": {
|
| 912 |
+
"Positive": 0,
|
| 913 |
+
"Negative": 6,
|
| 914 |
+
"Variable": 0,
|
| 915 |
+
"_n": 6
|
| 916 |
+
},
|
| 917 |
+
"Gas Production": {
|
| 918 |
+
"Positive": 6,
|
| 919 |
+
"Negative": 0,
|
| 920 |
+
"Variable": 0,
|
| 921 |
+
"_n": 6
|
| 922 |
+
},
|
| 923 |
+
"Fructose Fermentation": {
|
| 924 |
+
"Positive": 6,
|
| 925 |
+
"Negative": 0,
|
| 926 |
+
"Variable": 0,
|
| 927 |
+
"_n": 6
|
| 928 |
+
}
|
| 929 |
+
},
|
| 930 |
+
"Rhodococcus": {
|
| 931 |
+
"Indole": {
|
| 932 |
+
"Positive": 0,
|
| 933 |
+
"Negative": 6,
|
| 934 |
+
"Variable": 0,
|
| 935 |
+
"_n": 6
|
| 936 |
+
}
|
| 937 |
+
},
|
| 938 |
+
"Francisella": {
|
| 939 |
+
"Indole": {
|
| 940 |
+
"Positive": 0,
|
| 941 |
+
"Negative": 12,
|
| 942 |
+
"Variable": 0,
|
| 943 |
+
"_n": 12
|
| 944 |
+
}
|
| 945 |
+
},
|
| 946 |
+
"Erysipelothrix": {
|
| 947 |
+
"Indole": {
|
| 948 |
+
"Positive": 0,
|
| 949 |
+
"Negative": 6,
|
| 950 |
+
"Variable": 0,
|
| 951 |
+
"_n": 6
|
| 952 |
+
},
|
| 953 |
+
"Fructose Fermentation": {
|
| 954 |
+
"Positive": 6,
|
| 955 |
+
"Negative": 0,
|
| 956 |
+
"Variable": 0,
|
| 957 |
+
"_n": 6
|
| 958 |
+
}
|
| 959 |
+
},
|
| 960 |
+
"Arcanobacterium": {
|
| 961 |
+
"Indole": {
|
| 962 |
+
"Positive": 0,
|
| 963 |
+
"Negative": 6,
|
| 964 |
+
"Variable": 0,
|
| 965 |
+
"_n": 6
|
| 966 |
+
}
|
| 967 |
+
},
|
| 968 |
+
"Porphyromonas": {
|
| 969 |
+
"Indole": {
|
| 970 |
+
"Positive": 6,
|
| 971 |
+
"Negative": 0,
|
| 972 |
+
"Variable": 0,
|
| 973 |
+
"_n": 6
|
| 974 |
+
}
|
| 975 |
+
},
|
| 976 |
+
"Prevotella": {
|
| 977 |
+
"Indole": {
|
| 978 |
+
"Positive": 6,
|
| 979 |
+
"Negative": 0,
|
| 980 |
+
"Variable": 0,
|
| 981 |
+
"_n": 6
|
| 982 |
+
}
|
| 983 |
+
},
|
| 984 |
+
"Microbacterium": {
|
| 985 |
+
"Indole": {
|
| 986 |
+
"Positive": 0,
|
| 987 |
+
"Negative": 6,
|
| 988 |
+
"Variable": 0,
|
| 989 |
+
"_n": 6
|
| 990 |
+
}
|
| 991 |
+
},
|
| 992 |
+
"Enterococcus": {
|
| 993 |
+
"PYR": {
|
| 994 |
+
"Positive": 2,
|
| 995 |
+
"Negative": 0,
|
| 996 |
+
"Variable": 0,
|
| 997 |
+
"_n": 2
|
| 998 |
+
},
|
| 999 |
+
"Optochin": {
|
| 1000 |
+
"Positive": 0,
|
| 1001 |
+
"Negative": 2,
|
| 1002 |
+
"Variable": 0,
|
| 1003 |
+
"_n": 2
|
| 1004 |
+
},
|
| 1005 |
+
"Novobiocin": {
|
| 1006 |
+
"Positive": 0,
|
| 1007 |
+
"Negative": 2,
|
| 1008 |
+
"Variable": 0,
|
| 1009 |
+
"_n": 2
|
| 1010 |
+
}
|
| 1011 |
+
}
|
| 1012 |
+
}
|
engine/__init__.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# engine/__init__.py
|
| 2 |
+
# Makes 'engine' a package and re-exports the identifier for convenience if you want.
|
| 3 |
+
from .bacteria_identifier import BacteriaIdentifier
|
| 4 |
+
|
engine/bacteria_identifier.py
ADDED
|
@@ -0,0 +1,383 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# engine/bacteria_identifier.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Core identification engine + blended scoring with extended signals.
|
| 4 |
+
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
import re
|
| 8 |
+
import random
|
| 9 |
+
from typing import Dict, List, Optional, Tuple
|
| 10 |
+
|
| 11 |
+
import pandas as pd
|
| 12 |
+
|
| 13 |
+
from engine.extended_reasoner import score_genera_from_extended
|
| 14 |
+
|
| 15 |
+
DATA_DIR = "data"
|
| 16 |
+
EXT_SCHEMA_PATH = os.path.join(DATA_DIR, "extended_schema.json")
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# -----------------------------
|
| 20 |
+
# Helper Function
|
| 21 |
+
# -----------------------------
|
| 22 |
+
def join_with_and(items):
|
| 23 |
+
"""Join list into a readable string, using commas and 'and' before last item."""
|
| 24 |
+
if not items:
|
| 25 |
+
return ""
|
| 26 |
+
if len(items) == 1:
|
| 27 |
+
return items[0]
|
| 28 |
+
return ", ".join(items[:-1]) + " and " + items[-1]
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
# -----------------------------
|
| 32 |
+
# Identification Result Class
|
| 33 |
+
# -----------------------------
|
| 34 |
+
class IdentificationResult:
|
| 35 |
+
"""
|
| 36 |
+
Stores data about a single bacterial genus result and generates reasoning text.
|
| 37 |
+
Now includes optional extended-likelihood and blended confidence.
|
| 38 |
+
"""
|
| 39 |
+
def __init__(
|
| 40 |
+
self,
|
| 41 |
+
genus: str,
|
| 42 |
+
total_score: int,
|
| 43 |
+
matched_fields: List[str],
|
| 44 |
+
mismatched_fields: List[str],
|
| 45 |
+
reasoning_factors: Dict[str, str],
|
| 46 |
+
total_fields_evaluated: int,
|
| 47 |
+
total_fields_possible: int,
|
| 48 |
+
extra_notes: str = "",
|
| 49 |
+
extended_likelihood: Optional[float] = None,
|
| 50 |
+
extended_explanation: str = "",
|
| 51 |
+
):
|
| 52 |
+
self.genus = genus
|
| 53 |
+
self.total_score = total_score
|
| 54 |
+
self.matched_fields = matched_fields
|
| 55 |
+
self.mismatched_fields = mismatched_fields
|
| 56 |
+
self.reasoning_factors = reasoning_factors
|
| 57 |
+
self.total_fields_evaluated = total_fields_evaluated
|
| 58 |
+
self.total_fields_possible = total_fields_possible
|
| 59 |
+
self.extra_notes = extra_notes
|
| 60 |
+
|
| 61 |
+
# Extended reasoning
|
| 62 |
+
self.extended_likelihood = extended_likelihood # 0–1, or None if no extended data
|
| 63 |
+
self.extended_explanation = extended_explanation
|
| 64 |
+
|
| 65 |
+
# -----------------------------
|
| 66 |
+
# Confidence Calculations
|
| 67 |
+
# -----------------------------
|
| 68 |
+
def confidence_percent(self) -> int:
|
| 69 |
+
"""Confidence based only on tests the user entered."""
|
| 70 |
+
if self.total_fields_evaluated == 0:
|
| 71 |
+
return 0
|
| 72 |
+
return max(
|
| 73 |
+
0,
|
| 74 |
+
min(100, int((self.total_score / self.total_fields_evaluated) * 100)),
|
| 75 |
+
)
|
| 76 |
+
|
| 77 |
+
def true_confidence(self) -> int:
|
| 78 |
+
"""Confidence based on *all* possible tests (complete database fields)."""
|
| 79 |
+
if self.total_fields_possible == 0:
|
| 80 |
+
return 0
|
| 81 |
+
return max(
|
| 82 |
+
0,
|
| 83 |
+
min(100, int((self.total_score / self.total_fields_possible) * 100)),
|
| 84 |
+
)
|
| 85 |
+
|
| 86 |
+
def blended_confidence_raw(self, weight_core: float = 0.7, weight_ext: float = 0.3) -> float:
|
| 87 |
+
"""
|
| 88 |
+
Blended confidence:
|
| 89 |
+
core = core-confidence (0–1)
|
| 90 |
+
ext = extended likelihood (0–1, if available)
|
| 91 |
+
If no extended likelihood, return core.
|
| 92 |
+
"""
|
| 93 |
+
core = self.confidence_percent() / 100.0
|
| 94 |
+
if self.extended_likelihood is None:
|
| 95 |
+
return core
|
| 96 |
+
return weight_core * core + weight_ext * self.extended_likelihood
|
| 97 |
+
|
| 98 |
+
def blended_confidence_percent(self, weight_core: float = 0.7, weight_ext: float = 0.3) -> int:
|
| 99 |
+
return int(round(self.blended_confidence_raw(weight_core, weight_ext) * 100))
|
| 100 |
+
|
| 101 |
+
# -----------------------------
|
| 102 |
+
# Reasoning Paragraph Generator
|
| 103 |
+
# -----------------------------
|
| 104 |
+
def reasoning_paragraph(self, ranked_results=None) -> str:
|
| 105 |
+
"""Generate detailed reasoning paragraph with comparison to other genera."""
|
| 106 |
+
if not self.matched_fields:
|
| 107 |
+
return "No significant biochemical or morphological matches were found."
|
| 108 |
+
|
| 109 |
+
intro = random.choice(
|
| 110 |
+
[
|
| 111 |
+
"Based on the observed biochemical and morphological traits,",
|
| 112 |
+
"According to the provided test results,",
|
| 113 |
+
"From the available laboratory findings,",
|
| 114 |
+
"Considering the entered reactions and colony traits,",
|
| 115 |
+
]
|
| 116 |
+
)
|
| 117 |
+
|
| 118 |
+
# Key descriptive highlights
|
| 119 |
+
highlights = []
|
| 120 |
+
if "Gram Stain" in self.matched_fields:
|
| 121 |
+
highlights.append(
|
| 122 |
+
f"it is **Gram {self.reasoning_factors.get('Gram Stain', '').lower()}**"
|
| 123 |
+
)
|
| 124 |
+
if "Shape" in self.matched_fields:
|
| 125 |
+
highlights.append(
|
| 126 |
+
f"with a **{self.reasoning_factors.get('Shape', '').lower()}** morphology"
|
| 127 |
+
)
|
| 128 |
+
if "Catalase" in self.matched_fields:
|
| 129 |
+
highlights.append(
|
| 130 |
+
f"and **catalase {self.reasoning_factors.get('Catalase', '').lower()}** activity"
|
| 131 |
+
)
|
| 132 |
+
if "Oxidase" in self.matched_fields:
|
| 133 |
+
highlights.append(
|
| 134 |
+
f"and **oxidase {self.reasoning_factors.get('Oxidase', '').lower()}** reaction"
|
| 135 |
+
)
|
| 136 |
+
if "Oxygen Requirement" in self.matched_fields:
|
| 137 |
+
highlights.append(
|
| 138 |
+
f"which prefers **{self.reasoning_factors.get('Oxygen Requirement', '').lower()}** conditions"
|
| 139 |
+
)
|
| 140 |
+
|
| 141 |
+
# Join highlights grammatically
|
| 142 |
+
summary = (
|
| 143 |
+
", ".join(highlights[:-1]) + " and " + highlights[-1]
|
| 144 |
+
if len(highlights) > 1
|
| 145 |
+
else "".join(highlights)
|
| 146 |
+
)
|
| 147 |
+
|
| 148 |
+
# Confidence text (core)
|
| 149 |
+
core_conf = self.confidence_percent()
|
| 150 |
+
confidence_text = (
|
| 151 |
+
"The confidence in this identification based on the entered tests is high."
|
| 152 |
+
if core_conf >= 70
|
| 153 |
+
else "The confidence in this identification based on the entered tests is moderate."
|
| 154 |
+
)
|
| 155 |
+
|
| 156 |
+
# Comparative reasoning vs other close results
|
| 157 |
+
comparison = ""
|
| 158 |
+
if ranked_results and len(ranked_results) > 1:
|
| 159 |
+
close_others = ranked_results[1:3]
|
| 160 |
+
other_names = [r.genus for r in close_others]
|
| 161 |
+
if other_names:
|
| 162 |
+
if self.total_score >= close_others[0].total_score:
|
| 163 |
+
comparison = (
|
| 164 |
+
f" It is **more likely** than {join_with_and(other_names)} "
|
| 165 |
+
f"based on stronger alignment in {join_with_and(self.matched_fields[:3])}."
|
| 166 |
+
)
|
| 167 |
+
else:
|
| 168 |
+
comparison = (
|
| 169 |
+
f" It is **less likely** than {join_with_and(other_names)} "
|
| 170 |
+
f"due to differences in {join_with_and(self.mismatched_fields[:3])}."
|
| 171 |
+
)
|
| 172 |
+
|
| 173 |
+
return f"{intro} {summary}, the isolate most closely resembles **{self.genus}**. {confidence_text}{comparison}"
|
| 174 |
+
|
| 175 |
+
|
| 176 |
+
# -----------------------------
|
| 177 |
+
# Bacteria Identifier Engine
|
| 178 |
+
# -----------------------------
|
| 179 |
+
class BacteriaIdentifier:
|
| 180 |
+
"""
|
| 181 |
+
Main engine to match bacterial genus based on biochemical & morphological data.
|
| 182 |
+
Includes:
|
| 183 |
+
- Core rule-based matching vs bacteria_db.xlsx
|
| 184 |
+
- Optional blending with extended signals (signals_catalog.json)
|
| 185 |
+
"""
|
| 186 |
+
|
| 187 |
+
def __init__(self, db: pd.DataFrame):
|
| 188 |
+
self.db = db.fillna("")
|
| 189 |
+
self.extended_fields = self._load_extended_fields()
|
| 190 |
+
|
| 191 |
+
def _load_extended_fields(self) -> List[str]:
|
| 192 |
+
if not os.path.exists(EXT_SCHEMA_PATH):
|
| 193 |
+
return []
|
| 194 |
+
try:
|
| 195 |
+
with open(EXT_SCHEMA_PATH, "r", encoding="utf-8") as f:
|
| 196 |
+
schema = json.load(f)
|
| 197 |
+
return list(schema.keys())
|
| 198 |
+
except Exception:
|
| 199 |
+
return []
|
| 200 |
+
|
| 201 |
+
# -----------------------------
|
| 202 |
+
# Field Comparison Logic
|
| 203 |
+
# -----------------------------
|
| 204 |
+
def compare_field(self, db_val, user_val, field_name: str) -> int:
|
| 205 |
+
"""Compare one test field between database and user input."""
|
| 206 |
+
if not user_val or str(user_val).strip() == "" or str(user_val).lower() == "unknown":
|
| 207 |
+
return 0 # Skip empty or unknown
|
| 208 |
+
|
| 209 |
+
db_val = str(db_val).strip().lower()
|
| 210 |
+
user_val = str(user_val).strip().lower()
|
| 211 |
+
hard_exclusions = ["Gram Stain", "Shape", "Spore Formation"]
|
| 212 |
+
|
| 213 |
+
# Split entries by separators for multi-value matches
|
| 214 |
+
db_options = re.split(r"[;/]", db_val)
|
| 215 |
+
user_options = re.split(r"[;/]", user_val)
|
| 216 |
+
db_options = [x.strip() for x in db_options if x.strip()]
|
| 217 |
+
user_options = [x.strip() for x in user_options if x.strip()]
|
| 218 |
+
|
| 219 |
+
# Handle "variable" logic
|
| 220 |
+
if "variable" in db_options or "variable" in user_options:
|
| 221 |
+
return 0
|
| 222 |
+
|
| 223 |
+
# Special handling for Growth Temperature
|
| 224 |
+
if field_name == "Growth Temperature":
|
| 225 |
+
try:
|
| 226 |
+
if "//" in db_val:
|
| 227 |
+
low, high = [float(x) for x in db_val.split("//")]
|
| 228 |
+
temp = float(user_val)
|
| 229 |
+
return 1 if low <= temp <= high else -1
|
| 230 |
+
except Exception:
|
| 231 |
+
return 0
|
| 232 |
+
|
| 233 |
+
# Flexible match: partial overlap counts as match
|
| 234 |
+
match_found = any(
|
| 235 |
+
any(u in db_opt or db_opt in u for db_opt in db_options) for u in user_options
|
| 236 |
+
)
|
| 237 |
+
|
| 238 |
+
if match_found:
|
| 239 |
+
return 1
|
| 240 |
+
else:
|
| 241 |
+
if field_name in hard_exclusions:
|
| 242 |
+
return -999 # Hard exclusion
|
| 243 |
+
return -1
|
| 244 |
+
|
| 245 |
+
# -----------------------------
|
| 246 |
+
# Suggest Next Tests
|
| 247 |
+
# -----------------------------
|
| 248 |
+
def suggest_next_tests(self, top_results: List[IdentificationResult]) -> List[str]:
|
| 249 |
+
"""Suggest 3 tests that best differentiate top matches."""
|
| 250 |
+
if len(top_results) < 2:
|
| 251 |
+
return []
|
| 252 |
+
varying_fields = []
|
| 253 |
+
top3 = top_results[:3]
|
| 254 |
+
|
| 255 |
+
for field in self.db.columns:
|
| 256 |
+
if field in ["Genus", "Extra Notes", "Colony Morphology"]:
|
| 257 |
+
continue
|
| 258 |
+
|
| 259 |
+
field_values = set()
|
| 260 |
+
for r in top3:
|
| 261 |
+
field_values.update(r.matched_fields)
|
| 262 |
+
field_values.update(r.mismatched_fields)
|
| 263 |
+
|
| 264 |
+
if len(field_values) > 1:
|
| 265 |
+
varying_fields.append(field)
|
| 266 |
+
|
| 267 |
+
random.shuffle(varying_fields)
|
| 268 |
+
return varying_fields[:3]
|
| 269 |
+
|
| 270 |
+
# -----------------------------
|
| 271 |
+
# Extended Input Extraction
|
| 272 |
+
# -----------------------------
|
| 273 |
+
def _extract_extended_input(self, user_input: Dict[str, str]) -> Dict[str, str]:
|
| 274 |
+
"""
|
| 275 |
+
Extract extended tests (those in extended_schema.json but not part of the core db).
|
| 276 |
+
Only keep Positive/Negative/Variable (ignore Unknown/empty).
|
| 277 |
+
"""
|
| 278 |
+
ext_in = {}
|
| 279 |
+
for field in self.extended_fields:
|
| 280 |
+
val = user_input.get(field, "Unknown")
|
| 281 |
+
if isinstance(val, str) and val.lower() in ("positive", "negative", "variable"):
|
| 282 |
+
ext_in[field] = val.capitalize()
|
| 283 |
+
return ext_in
|
| 284 |
+
|
| 285 |
+
# -----------------------------
|
| 286 |
+
# Main Identification Routine
|
| 287 |
+
# -----------------------------
|
| 288 |
+
def identify(self, user_input: Dict[str, str]) -> List[IdentificationResult]:
|
| 289 |
+
"""Compare user input to database and rank possible genera with blended scoring."""
|
| 290 |
+
results: List[IdentificationResult] = []
|
| 291 |
+
total_fields_possible = len([c for c in self.db.columns if c != "Genus"])
|
| 292 |
+
|
| 293 |
+
# 1) Core scoring loop against bacteria_db.xlsx
|
| 294 |
+
for _, row in self.db.iterrows():
|
| 295 |
+
genus = row["Genus"]
|
| 296 |
+
total_score = 0
|
| 297 |
+
matched_fields: List[str] = []
|
| 298 |
+
mismatched_fields: List[str] = []
|
| 299 |
+
reasoning_factors: Dict[str, str] = {}
|
| 300 |
+
total_fields_evaluated = 0
|
| 301 |
+
|
| 302 |
+
for field in self.db.columns:
|
| 303 |
+
if field == "Genus":
|
| 304 |
+
continue
|
| 305 |
+
|
| 306 |
+
db_val = row[field]
|
| 307 |
+
user_val = user_input.get(field, "")
|
| 308 |
+
score = self.compare_field(db_val, user_val, field)
|
| 309 |
+
|
| 310 |
+
# Count only real inputs for relative confidence
|
| 311 |
+
if user_val and str(user_val).lower() != "unknown":
|
| 312 |
+
total_fields_evaluated += 1
|
| 313 |
+
|
| 314 |
+
if score == -999:
|
| 315 |
+
total_score = -999
|
| 316 |
+
break # Hard exclusion ends comparison
|
| 317 |
+
|
| 318 |
+
elif score == 1:
|
| 319 |
+
total_score += 1
|
| 320 |
+
matched_fields.append(field)
|
| 321 |
+
reasoning_factors[field] = user_val
|
| 322 |
+
|
| 323 |
+
elif score == -1:
|
| 324 |
+
total_score -= 1
|
| 325 |
+
mismatched_fields.append(field)
|
| 326 |
+
|
| 327 |
+
# Append valid genus result
|
| 328 |
+
if total_score > -999:
|
| 329 |
+
extra_notes = row.get("Extra Notes", "")
|
| 330 |
+
results.append(
|
| 331 |
+
IdentificationResult(
|
| 332 |
+
genus=genus,
|
| 333 |
+
total_score=total_score,
|
| 334 |
+
matched_fields=matched_fields,
|
| 335 |
+
mismatched_fields=mismatched_fields,
|
| 336 |
+
reasoning_factors=reasoning_factors,
|
| 337 |
+
total_fields_evaluated=total_fields_evaluated,
|
| 338 |
+
total_fields_possible=total_fields_possible,
|
| 339 |
+
extra_notes=extra_notes,
|
| 340 |
+
)
|
| 341 |
+
)
|
| 342 |
+
|
| 343 |
+
if not results:
|
| 344 |
+
return []
|
| 345 |
+
|
| 346 |
+
# 2) Suggest next tests for top core results
|
| 347 |
+
top_suggestions = self.suggest_next_tests(results)
|
| 348 |
+
for r in results[:3]:
|
| 349 |
+
r.reasoning_factors["next_tests"] = ", ".join(top_suggestions)
|
| 350 |
+
|
| 351 |
+
# 3) Extended likelihoods (if user provided extended tests)
|
| 352 |
+
ext_input = self._extract_extended_input(user_input)
|
| 353 |
+
ext_scores: Dict[str, float] = {}
|
| 354 |
+
ext_explanation = ""
|
| 355 |
+
|
| 356 |
+
if ext_input:
|
| 357 |
+
ranked, ext_explanation = score_genera_from_extended(ext_input)
|
| 358 |
+
ext_scores = {g: s for g, s in ranked}
|
| 359 |
+
|
| 360 |
+
# Attach extended scores/explanations to each result
|
| 361 |
+
if ext_scores:
|
| 362 |
+
for r in results:
|
| 363 |
+
if r.genus in ext_scores:
|
| 364 |
+
r.extended_likelihood = ext_scores[r.genus]
|
| 365 |
+
else:
|
| 366 |
+
# If genus not in signals, treat as neutral (no info)
|
| 367 |
+
r.extended_likelihood = None
|
| 368 |
+
r.extended_explanation = ext_explanation
|
| 369 |
+
else:
|
| 370 |
+
for r in results:
|
| 371 |
+
r.extended_likelihood = None
|
| 372 |
+
r.extended_explanation = ""
|
| 373 |
+
|
| 374 |
+
# 4) Sort results
|
| 375 |
+
if any(r.extended_likelihood is not None for r in results):
|
| 376 |
+
# Sort by blended confidence when extended data is present
|
| 377 |
+
results.sort(key=lambda x: x.blended_confidence_raw(), reverse=True)
|
| 378 |
+
else:
|
| 379 |
+
# Fallback to core total_score
|
| 380 |
+
results.sort(key=lambda x: x.total_score, reverse=True)
|
| 381 |
+
|
| 382 |
+
# Return top 10
|
| 383 |
+
return results[:10]
|
engine/extended_reasoner.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# engine/extended_reasoner.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Compute per-genus likelihoods from extended tests using signals_catalog.json
|
| 4 |
+
|
| 5 |
+
import json, os, math
|
| 6 |
+
from typing import Dict, List, Tuple
|
| 7 |
+
|
| 8 |
+
SIGNALS_PATH = os.path.join("data", "signals_catalog.json")
|
| 9 |
+
PNV = ("Positive", "Negative", "Variable")
|
| 10 |
+
|
| 11 |
+
def _load_json(path: str, default):
|
| 12 |
+
if not os.path.exists(path):
|
| 13 |
+
return default
|
| 14 |
+
with open(path, "r", encoding="utf-8") as f:
|
| 15 |
+
try:
|
| 16 |
+
return json.load(f)
|
| 17 |
+
except Exception:
|
| 18 |
+
return default
|
| 19 |
+
|
| 20 |
+
def _log(x: float) -> float:
|
| 21 |
+
# guard tiny values
|
| 22 |
+
return math.log(max(x, 1e-12))
|
| 23 |
+
|
| 24 |
+
def score_genera_from_extended(parsed_ext: Dict[str, str], alpha: float = 1.0) -> Tuple[List[Tuple[str, float]], str]:
|
| 25 |
+
"""
|
| 26 |
+
parsed_ext: dict of {ExtendedTestName: 'Positive'|'Negative'|'Variable'}
|
| 27 |
+
alpha: Laplace smoothing factor
|
| 28 |
+
Returns: ([(genus, score)], explanation_str)
|
| 29 |
+
"""
|
| 30 |
+
signals = _load_json(SIGNALS_PATH, {})
|
| 31 |
+
if not parsed_ext or not signals:
|
| 32 |
+
return [], "No extended tests or signals available."
|
| 33 |
+
|
| 34 |
+
# collect all genera
|
| 35 |
+
genera = list(signals.keys())
|
| 36 |
+
if not genera:
|
| 37 |
+
return [], "No genera in signals catalog."
|
| 38 |
+
|
| 39 |
+
# For each genus, accumulate log-likelihoods over provided tests
|
| 40 |
+
scores: Dict[str, float] = {g: 0.0 for g in genera}
|
| 41 |
+
contributions: Dict[str, List[str]] = {g: [] for g in genera}
|
| 42 |
+
|
| 43 |
+
for test, val in parsed_ext.items():
|
| 44 |
+
if val not in PNV:
|
| 45 |
+
continue
|
| 46 |
+
for g in genera:
|
| 47 |
+
stats = signals.get(g, {}).get(test, None)
|
| 48 |
+
if not stats:
|
| 49 |
+
# unseen test for this genus → uniform
|
| 50 |
+
denom = 3.0 * alpha
|
| 51 |
+
prob = alpha / denom
|
| 52 |
+
else:
|
| 53 |
+
pos = stats.get("Positive", 0)
|
| 54 |
+
neg = stats.get("Negative", 0)
|
| 55 |
+
var = stats.get("Variable", 0)
|
| 56 |
+
n = stats.get("_n", (pos + neg + var))
|
| 57 |
+
if n <= 0:
|
| 58 |
+
denom = 3.0 * alpha
|
| 59 |
+
prob = alpha / denom
|
| 60 |
+
else:
|
| 61 |
+
k = {"Positive": pos, "Negative": neg, "Variable": var}[val]
|
| 62 |
+
denom = n + 3.0 * alpha
|
| 63 |
+
prob = (k + alpha) / denom
|
| 64 |
+
|
| 65 |
+
scores[g] += _log(prob)
|
| 66 |
+
contributions[g].append(f"{test}={val}→{prob:.3f}")
|
| 67 |
+
|
| 68 |
+
# normalize scores (softmax) for readability
|
| 69 |
+
max_log = max(scores.values())
|
| 70 |
+
exp_scores = {g: math.exp(s - max_log) for g, s in scores.items()}
|
| 71 |
+
z = sum(exp_scores.values())
|
| 72 |
+
final = sorted([(g, (exp_scores[g] / z) if z > 0 else 0.0) for g in genera], key=lambda x: x[1], reverse=True)
|
| 73 |
+
|
| 74 |
+
# short explanation
|
| 75 |
+
top_rows = []
|
| 76 |
+
for g, sc in final[:5]:
|
| 77 |
+
top_rows.append(f"{g}: {sc:.3f} | {'; '.join(contributions[g][:3])}")
|
| 78 |
+
explain = "Extended-test likelihoods (top 5):\n" + "\n".join(top_rows) if top_rows else "No contributions."
|
| 79 |
+
return final, explain
|
engine/parser_ext.py
ADDED
|
@@ -0,0 +1,149 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# engine/parser_ext.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Data-driven parser for extended tests (not in core schema).
|
| 4 |
+
# Uses:
|
| 5 |
+
# - data/extended_schema.json
|
| 6 |
+
# - data/alias_maps.json
|
| 7 |
+
#
|
| 8 |
+
# Automatically extracts extended tests such as:
|
| 9 |
+
# CAMP, PYR, Optochin, Novobiocin, Bacitracin, Bile Solubility, Hippurate, etc.
|
| 10 |
+
#
|
| 11 |
+
# Core tests (Gram, Catalase, DNase, Indole, etc.) are EXCLUDED.
|
| 12 |
+
|
| 13 |
+
import json
|
| 14 |
+
import os
|
| 15 |
+
import re
|
| 16 |
+
from typing import Dict, List
|
| 17 |
+
|
| 18 |
+
DATA_DIR = "data"
|
| 19 |
+
EXT_SCHEMA_PATH = os.path.join(DATA_DIR, "extended_schema.json")
|
| 20 |
+
ALIAS_MAPS_PATH = os.path.join(DATA_DIR, "alias_maps.json")
|
| 21 |
+
|
| 22 |
+
# -------------------------------------------------------------------------
|
| 23 |
+
# Hardcoded core test fields (NEVER to be parsed as extended)
|
| 24 |
+
# -------------------------------------------------------------------------
|
| 25 |
+
CORE_FIELDS = {
|
| 26 |
+
"Genus", "Species",
|
| 27 |
+
"Gram Stain", "Shape", "Colony Morphology", "Haemolysis", "Haemolysis Type",
|
| 28 |
+
"Motility", "Capsule", "Spore Formation", "Growth Temperature", "Oxygen Requirement",
|
| 29 |
+
"Media Grown On",
|
| 30 |
+
"Catalase", "Oxidase", "Coagulase", "DNase", "Urease", "Citrate", "Methyl Red", "VP",
|
| 31 |
+
"H2S", "ONPG", "Nitrate Reduction", "Lipase Test", "NaCl Tolerant (>=6%)",
|
| 32 |
+
"Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase",
|
| 33 |
+
"Gelatin Hydrolysis", "Esculin Hydrolysis",
|
| 34 |
+
"Glucose Fermentation", "Lactose Fermentation", "Sucrose Fermentation",
|
| 35 |
+
"Mannitol Fermentation", "Sorbitol Fermentation", "Maltose Fermentation",
|
| 36 |
+
"Xylose Fermentation", "Rhamnose Fermentation", "Arabinose Fermentation",
|
| 37 |
+
"Raffinose Fermentation", "Trehalose Fermentation", "Inositol Fermentation"
|
| 38 |
+
}
|
| 39 |
+
|
| 40 |
+
# -------------------------------------------------------------------------
|
| 41 |
+
# Positive / Negative / Variable mapping
|
| 42 |
+
# -------------------------------------------------------------------------
|
| 43 |
+
PNV_MAP = {
|
| 44 |
+
"+": "Positive", "positive": "Positive", "pos": "Positive",
|
| 45 |
+
"-": "Negative", "negative": "Negative", "neg": "Negative",
|
| 46 |
+
"variable": "Variable", "var": "Variable"
|
| 47 |
+
}
|
| 48 |
+
|
| 49 |
+
# -------------------------------------------------------------------------
|
| 50 |
+
# Sensitivity/Resistance mapping for disk diffusion tests
|
| 51 |
+
# (e.g., optochin, novobiocin, bacitracin)
|
| 52 |
+
# -------------------------------------------------------------------------
|
| 53 |
+
SENS_MAP = {
|
| 54 |
+
"sensitive": "Positive",
|
| 55 |
+
"susceptible": "Positive",
|
| 56 |
+
"resistant": "Negative",
|
| 57 |
+
"insensitive": "Negative"
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
# -------------------------------------------------------------------------
|
| 61 |
+
# JSON loaders
|
| 62 |
+
# -------------------------------------------------------------------------
|
| 63 |
+
def _load_json(path: str, default):
    """Read and parse JSON from *path*.

    Falls back to *default* on any failure: missing file, unreadable
    file, or malformed JSON.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except Exception:
        return default
|
| 71 |
+
|
| 72 |
+
# -------------------------------------------------------------------------
|
| 73 |
+
# Canonical value mapping (+, -, variable, resistant, sensitive)
|
| 74 |
+
# -------------------------------------------------------------------------
|
| 75 |
+
def _canon_value(token: str) -> str:
    """Map a raw result token to its canonical label.

    Tries the Positive/Negative/Variable map first, then the
    sensitivity/resistance map; unrecognised tokens are returned
    stripped but otherwise unchanged. None -> "Unknown".
    """
    if token is None:
        return "Unknown"
    cleaned = token.strip()
    key = cleaned.lower()
    return PNV_MAP.get(key, SENS_MAP.get(key, cleaned))
|
| 84 |
+
|
| 85 |
+
# -------------------------------------------------------------------------
|
| 86 |
+
# Gather all alias names for a field
|
| 87 |
+
# -------------------------------------------------------------------------
|
| 88 |
+
def _aliases_for(field: str, field_aliases: Dict[str, str]) -> List[str]:
    """
    Returns all known aliases for this test, including the canonical name.
    Ordered longest→shortest to avoid partial matches.
    """
    target = field.lower()
    names = {field} | {
        alias for alias, canon in field_aliases.items()
        if canon.lower() == target
    }
    return sorted(names, key=len, reverse=True)
|
| 98 |
+
|
| 99 |
+
# -------------------------------------------------------------------------
|
| 100 |
+
# Main Extended Parser
|
| 101 |
+
# -------------------------------------------------------------------------
|
| 102 |
+
def parse_text_extended(text: str) -> Dict[str, Dict]:
    """
    Parse ONLY tests listed in extended_schema.json.
    Excludes all core tests completely.

    Returns:
        {
            "parsed_fields": { TestName: "Positive"/"Negative"/"Variable" },
            "source": "extended_parser"
        }
    """
    ext_schema = _load_json(EXT_SCHEMA_PATH, {})
    alias_maps = _load_json(ALIAS_MAPS_PATH, {"field_aliases": {}, "value_aliases_pnv": {}})
    field_aliases = alias_maps.get("field_aliases", {})

    t = text or ""
    out: Dict[str, str] = {}

    # LOOP: for each extended test, search text for aliases + result patterns
    for canon_field in ext_schema.keys():

        # Safety: never allow extended parser to treat core tests as extended.
        # (Because of this guard, no core field can ever enter `out`, so no
        # post-hoc cleanup pass is needed.)
        if canon_field in CORE_FIELDS:
            continue

        for alias in _aliases_for(canon_field, field_aliases):
            # Match: <alias> ... <result>, within ~80 chars of the same
            # sentence/line. Word results keep \b on both sides. BUG FIX:
            # '+' and '-' are not word characters, so the old \b\+\b / \b\-\b
            # alternatives could never match a standalone sign; they now get
            # their own alternative guarded by lookaround instead.
            regex = (
                rf"\b{re.escape(alias)}\b"
                r"[^.\n]{0,80}?"  # limited window, stop at sentence/line end
                r"(?:\b(positive|negative|variable|susceptible|sensitive|resistant)\b"
                r"|(?<![\w+-])([+-])(?![\w+-]))"
            )

            m = re.search(regex, t, re.IGNORECASE)
            if m:
                out[canon_field] = _canon_value(m.group(1) or m.group(2))
                break  # first alias hit wins for this field

    return {
        "parsed_fields": out,
        "source": "extended_parser"
    }
|
engine/parser_fusion.py
ADDED
|
@@ -0,0 +1,107 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# engine/parser_fusion.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Tri-fusion parser:
|
| 4 |
+
# - Rule parser (parser_rules)
|
| 5 |
+
# - Extended parser (parser_ext)
|
| 6 |
+
# - LLM parser (parser_llm / Cloudflare)
|
| 7 |
+
#
|
| 8 |
+
# Combines all three into a single fused field set, with a simple
|
| 9 |
+
# precedence rule:
|
| 10 |
+
# extended > rules > llm > Unknown
|
| 11 |
+
#
|
| 12 |
+
# Returns:
|
| 13 |
+
# {
|
| 14 |
+
# "fused_fields": { ... },
|
| 15 |
+
# "sources": { field_name: "extended" | "rules" | "llm_cf" | "none" },
|
| 16 |
+
# "components": {
|
| 17 |
+
# "rules": <full rule parser output>,
|
| 18 |
+
# "extended": <full extended parser output>,
|
| 19 |
+
# "llm": <full llm parser output>
|
| 20 |
+
# }
|
| 21 |
+
# }
|
| 22 |
+
|
| 23 |
+
import json
|
| 24 |
+
import os
|
| 25 |
+
from typing import Dict, Any
|
| 26 |
+
|
| 27 |
+
from engine.parser_rules import parse_text_rules
|
| 28 |
+
from engine.parser_ext import parse_text_extended, CORE_FIELDS
|
| 29 |
+
from engine.parser_llm import parse_text_llm
|
| 30 |
+
|
| 31 |
+
# Load extended schema so we know all possible fields
|
| 32 |
+
EXT_SCHEMA_PATH = "data/extended_schema.json"
try:
    with open(EXT_SCHEMA_PATH, "r", encoding="utf-8") as f:
        EXT_SCHEMA = json.load(f)
except Exception:
    # Missing/corrupt schema file degrades gracefully to core fields only.
    EXT_SCHEMA = {}

# Every field any parser can emit (core + extended), sorted so that
# parse_text_fused iterates fields in a deterministic order.
ALL_FIELDS = sorted(set(list(CORE_FIELDS) + list(EXT_SCHEMA.keys())))
|
| 40 |
+
|
| 41 |
+
|
| 42 |
+
def _is_known(val: Any) -> bool:
    """
    Decide if a value is 'real' (we should use it) or effectively Unknown/empty.

    None, blank/whitespace strings, and the literal 'unknown' (any case)
    all count as not known; every other value counts as known.
    """
    if val is None:
        return False
    if not isinstance(val, str):
        return True
    stripped = val.strip()
    return bool(stripped) and stripped.lower() != "unknown"
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
def parse_text_fused(text: str) -> Dict[str, Any]:
    """
    Run all three parsers and fuse their outputs.
    Precedence: extended > rules > llm > Unknown.
    """
    safe_text = text or ""

    # --- Run component parsers ---
    rules_out = parse_text_rules(safe_text)
    ext_out = parse_text_extended(safe_text)
    llm_out = parse_text_llm(safe_text)

    # Candidate sources in precedence order (highest priority first).
    ranked_sources = [
        ("extended", ext_out.get("parsed_fields") or {}),
        ("rules", rules_out.get("parsed_fields") or {}),
        ("llm_cf", llm_out.get("parsed_fields") or {}),
    ]

    fused: Dict[str, Any] = {}
    sources: Dict[str, str] = {}

    for field in ALL_FIELDS:
        for src_name, fields in ranked_sources:
            candidate = fields.get(field)
            if _is_known(candidate):
                fused[field] = candidate
                sources[field] = src_name
                break
        else:
            # No parser produced a usable value for this field.
            fused[field] = "Unknown"
            sources[field] = "none"

    return {
        "fused_fields": fused,
        "sources": sources,
        "components": {
            "rules": rules_out,
            "extended": ext_out,
            "llm": llm_out,
        },
    }
|
engine/parser_llm.py
ADDED
|
@@ -0,0 +1,137 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# engine/parser_llm.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# LLM-based parser using local Phi-2 model via HuggingFace.
|
| 4 |
+
# ------------------------------------------------------------
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import re
|
| 8 |
+
import torch
|
| 9 |
+
import streamlit as st
|
| 10 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 11 |
+
|
| 12 |
+
from engine.parser_ext import CORE_FIELDS
|
| 13 |
+
|
| 14 |
+
EXT_SCHEMA_PATH = "data/extended_schema.json"
try:
    with open(EXT_SCHEMA_PATH, "r", encoding="utf-8") as f:
        EXT_SCHEMA = json.load(f)
except (OSError, json.JSONDecodeError):
    # Narrowed from a bare `except:`, which would also swallow
    # KeyboardInterrupt/SystemExit. Only a missing/unreadable/corrupt
    # schema file should fall back to the empty schema.
    EXT_SCHEMA = {}

# Union of core + extended test names, sorted for a stable prompt field list.
ALL_FIELDS = sorted(set(list(CORE_FIELDS) + list(EXT_SCHEMA.keys())))
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@st.cache_resource(show_spinner=True)
def load_phi2_model():
    """Load Phi-2 locally (CPU mode). Cached for entire session.

    Returns:
        (tokenizer, model): the HuggingFace tokenizer and the model,
        already switched to eval mode.
    """
    name = "microsoft/phi-2"

    # trust_remote_code lets HF load any custom model code shipped in the
    # repo -- presumably required for this checkpoint; confirm on upgrade.
    tokenizer = AutoTokenizer.from_pretrained(name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        name,
        torch_dtype=torch.float32,  # full precision for CPU inference
        trust_remote_code=True,
    )

    model.eval()  # inference mode (disables dropout etc.)
    return tokenizer, model
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
PROMPT_TEMPLATE = """
|
| 41 |
+
You are an expert clinical microbiology assistant.
|
| 42 |
+
|
| 43 |
+
Extract ALL microbiology test results from the text and return a STRICT JSON object.
|
| 44 |
+
|
| 45 |
+
RULES:
|
| 46 |
+
- Use ONLY these fields:
|
| 47 |
+
{FIELD_LIST}
|
| 48 |
+
- Allowed values:
|
| 49 |
+
"Positive", "Negative", "Variable", "Unknown",
|
| 50 |
+
OR literal strings for temperatures (e.g. "37//40").
|
| 51 |
+
- If a test is not mentioned: set "Unknown".
|
| 52 |
+
- DO NOT create new fields or hallucinate.
|
| 53 |
+
- DO NOT output explanations.
|
| 54 |
+
- DO NOT wrap JSON in markdown code fences.
|
| 55 |
+
- Output ONLY a raw JSON object.
|
| 56 |
+
|
| 57 |
+
Text:
|
| 58 |
+
---
|
| 59 |
+
{TEXT}
|
| 60 |
+
---
|
| 61 |
+
|
| 62 |
+
JSON:
|
| 63 |
+
"""
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
def salvage_json(raw: str):
    """Attempt to clean and parse 'almost JSON' returned by model.

    Keeps only the outermost {...} span (anything outside is model
    chatter), strips trailing commas, then parses. Raises ValueError
    when no brace-delimited object can be found.
    """
    text = raw.strip()

    first = text.find("{")
    last = text.rfind("}")
    if first == -1 or last == -1 or last <= first:
        raise ValueError("No valid JSON object braces found.")

    candidate = text[first:last + 1]
    # Remove trailing commas before a closing brace/bracket (invalid JSON).
    candidate = re.sub(r",\s*([}\]])", r"\1", candidate)

    return json.loads(candidate)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def normalise_value(val):
    """Coerce a model-returned value onto the canonical result vocabulary.

    None -> "Unknown"; +/-/pos/neg/var shorthands map to
    Positive/Negative/Variable; anything else (e.g. temperature ranges
    like "37//40") passes through stripped.
    """
    if val is None:
        return "Unknown"
    text = str(val).strip()
    canonical = {
        "positive": "Positive", "+": "Positive", "pos": "Positive",
        "negative": "Negative", "-": "Negative", "neg": "Negative",
        "variable": "Variable", "var": "Variable",
    }
    return canonical.get(text.lower(), text)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def parse_text_llm(text: str):
    """
    Parse microbiology text with the local Phi-2 model.

    Prompts the model for a strict JSON object over ALL_FIELDS, then
    normalises every field value. Returns:
        {"parsed_fields": {field: value}, "source": "llm_phi2", "raw": ...}
    or, when the output cannot be parsed as JSON even after salvage:
        {"parsed_fields": {}, "error": ..., "raw": ...}
    """
    tokenizer, model = load_phi2_model()

    prompt = PROMPT_TEMPLATE.format(
        FIELD_LIST=", ".join(ALL_FIELDS),
        TEXT=text,
    )

    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"]

    # Greedy decoding (do_sample=False) for deterministic extraction.
    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=500,
            temperature=0.0,
            do_sample=False,
        )

    # generate() returns prompt + continuation; keep only the continuation.
    # NOTE(review): assumes decode() reproduces the prompt text exactly so
    # slicing by len(prompt) is safe -- verify for this tokenizer.
    full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    raw = full_text[len(prompt):].strip()

    try:
        parsed = json.loads(raw)
    except Exception:
        # Model output is often 'almost JSON'; try to salvage before failing.
        try:
            parsed = salvage_json(raw)
        except Exception:
            return {
                "parsed_fields": {},
                "error": "Invalid JSON returned by model",
                "raw": raw,
            }

    # Project onto the known field set: missing fields become "Unknown",
    # hallucinated extra fields are dropped.
    cleaned = {}
    for f in ALL_FIELDS:
        cleaned[f] = normalise_value(parsed.get(f, "Unknown"))

    return {
        "parsed_fields": cleaned,
        "source": "llm_phi2",
        "raw": raw,
    }
|
engine/parser_rules.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# engine/parser_rules.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Deterministic rule-based parser for microbiology text
|
| 4 |
+
# Loads alias_maps.json and applies synonyms learned in Stage 10B
|
| 5 |
+
# ------------------------------------------------------------
|
| 6 |
+
|
| 7 |
+
import re
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
ALIAS_PATH = "data/alias_maps.json"
|
| 12 |
+
|
| 13 |
+
# ------------------------------------------------------------
|
| 14 |
+
# Load alias maps (if present)
|
| 15 |
+
# ------------------------------------------------------------
|
| 16 |
+
def load_alias_maps():
    """
    Load the learned alias mappings from ALIAS_PATH.

    Returns:
        dict: the parsed alias maps, or {} when the file is absent,
        unreadable, or contains invalid JSON.
    """
    if not os.path.exists(ALIAS_PATH):
        return {}
    try:
        with open(ALIAS_PATH, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, json.JSONDecodeError):
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt/SystemExit.
        return {}
|
| 24 |
+
|
| 25 |
+
ALIAS_MAPS = load_alias_maps()
|
| 26 |
+
|
| 27 |
+
# ------------------------------------------------------------
|
| 28 |
+
# Utility normalization
|
| 29 |
+
# ------------------------------------------------------------
|
| 30 |
+
def norm(text: str) -> str:
    """Lower-cased, whitespace-trimmed form of *text*; '' for falsy input."""
    return str(text).strip().lower() if text else ""
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# Apply alias mapping per field
|
| 37 |
+
def apply_alias(field: str, value: str) -> str:
    """Translate *value* through the learned alias table for *field*.

    Lookup is done on normalised (lower-cased, trimmed) keys; when no
    mapping exists the original value is returned unchanged.
    """
    table = ALIAS_MAPS.get(norm(field))
    if table is None:
        return value
    return table.get(norm(value), value)
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# ------------------------------------------------------------
|
| 47 |
+
# Main rule parser
|
| 48 |
+
# ------------------------------------------------------------
|
| 49 |
+
def parse_text_rules(text: str) -> dict:
    """
    Extracts structured microbiology fields from text using
    deterministic regex rules + alias mapping.

    Returns:
        {"parsed_fields": {field: value}, "raw_text": text}
    """

    if not text:
        return {"parsed_fields": {}, "raw_text": text}

    t = text.lower()
    parsed = {}

    # ------------------------------------------------------------
    # Gram stain / morphology
    # ------------------------------------------------------------
    if "gram-positive" in t or "gram positive" in t:
        parsed["Gram Stain"] = "Positive"
    elif "gram-negative" in t or "gram negative" in t:
        parsed["Gram Stain"] = "Negative"

    if "cocci" in t:
        parsed["Shape"] = "Cocci"
    elif "bacilli" in t or "rods" in t or "rod" in t:
        parsed["Shape"] = "Rods"

    # ------------------------------------------------------------
    # Enzyme tests
    # ------------------------------------------------------------
    if "catalase positive" in t:
        parsed["Catalase"] = "Positive"
    elif "catalase negative" in t:
        parsed["Catalase"] = "Negative"

    if "oxidase positive" in t:
        parsed["Oxidase"] = "Positive"
    elif "oxidase negative" in t:
        parsed["Oxidase"] = "Negative"

    if "coagulase positive" in t:
        parsed["Coagulase"] = "Positive"
    elif "coagulase negative" in t:
        parsed["Coagulase"] = "Negative"

    if "dnase positive" in t or "dnase+" in t:
        parsed["DNase"] = "Positive"
    elif "dnase negative" in t:
        parsed["DNase"] = "Negative"

    if "urease positive" in t:
        parsed["Urease"] = "Positive"
    elif "urease negative" in t:
        parsed["Urease"] = "Negative"
    elif "urease variable" in t:
        parsed["Urease"] = "Variable"

    # ------------------------------------------------------------
    # Indole, Citrate, VP, MR
    # ------------------------------------------------------------
    if "indole positive" in t:
        parsed["Indole"] = "Positive"
    elif "indole negative" in t:
        parsed["Indole"] = "Negative"

    if "citrate positive" in t:
        parsed["Citrate"] = "Positive"
    elif "citrate negative" in t:
        parsed["Citrate"] = "Negative"

    # FIX: the negative branches now accept the long-form names too,
    # matching what the positive branches already accepted.
    if "vp positive" in t or "voges-proskauer positive" in t:
        parsed["VP"] = "Positive"
    elif "vp negative" in t or "voges-proskauer negative" in t:
        parsed["VP"] = "Negative"

    if "mr positive" in t or "methyl red positive" in t:
        parsed["Methyl Red"] = "Positive"
    elif "mr negative" in t or "methyl red negative" in t:
        parsed["Methyl Red"] = "Negative"

    # ------------------------------------------------------------
    # Fermentation tests
    # ------------------------------------------------------------
    FERMENTS = {
        "glucose": "Glucose Fermentation",
        "lactose": "Lactose Fermentation",
        "sucrose": "Sucrose Fermentation",
        "mannitol": "Mannitol Fermentation",
    }

    for sugar, field in FERMENTS.items():
        if f"ferments {sugar}" in t or f"{sugar} fermentation positive" in t:
            parsed[field] = "Positive"
        # Negative check runs second on purpose: "does not ferment X"
        # must win over any accidental positive substring hit.
        if f"does not ferment {sugar}" in t or f"{sugar} fermentation negative" in t:
            parsed[field] = "Negative"

    # ------------------------------------------------------------
    # Haemolysis
    # FIX: previously only the beta branch accepted the US '-hemolytic'
    # spelling; alpha/gamma/non- now accept both UK and US spellings.
    # ------------------------------------------------------------
    if "beta-haemolytic" in t or "beta-hemolytic" in t:
        parsed["Haemolysis Type"] = "Beta"
        parsed["Haemolysis"] = "Positive"
    elif "alpha-haemolytic" in t or "alpha-hemolytic" in t:
        parsed["Haemolysis Type"] = "Alpha"
        parsed["Haemolysis"] = "Positive"
    elif ("gamma-haemolytic" in t or "gamma-hemolytic" in t
          or "non-haemolytic" in t or "non-hemolytic" in t):
        parsed["Haemolysis Type"] = "Gamma"
        parsed["Haemolysis"] = "Negative"

    # ------------------------------------------------------------
    # Media
    # ------------------------------------------------------------
    if "blood agar" in t:
        parsed["Media Grown On"] = "Blood Agar"
    elif "macconkey agar" in t:
        parsed["Media Grown On"] = "MacConkey Agar"
    elif "chocolate agar" in t:
        parsed["Media Grown On"] = "Chocolate Agar"

    # ------------------------------------------------------------
    # Growth temperature extraction ("grows at 37" -> "37//37")
    # ------------------------------------------------------------
    match_temp = re.search(r"grows at (\d+)", t)
    if match_temp:
        temp = match_temp.group(1)
        parsed["Growth Temperature"] = f"{temp}//{temp}"

    # ------------------------------------------------------------
    # Apply alias mappings learned in Stage 10B
    # ------------------------------------------------------------
    aliased = {}
    for field, value in parsed.items():
        aliased[field] = apply_alias(field, value)

    return {
        "parsed_fields": aliased,
        "raw_text": text,
    }
|
engine/schema.py
ADDED
|
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# engine/schema.py
|
| 2 |
+
from typing import Dict, List, Any, Tuple
|
| 3 |
+
|
| 4 |
+
POS_NEG_VAR = ["Positive", "Negative", "Variable"]
|
| 5 |
+
POS_NEG_VAR_UNKNOWN = ["Positive", "Negative", "Variable", "Unknown"]
|
| 6 |
+
UNKNOWN = "Unknown"
|
| 7 |
+
MULTI_SEPARATOR = ";"
|
| 8 |
+
|
| 9 |
+
ENUMS = {
|
| 10 |
+
"Gram Stain": ["Positive", "Negative", "Variable"],
|
| 11 |
+
"Shape": ["Cocci", "Rods", "Bacilli", "Spiral", "Short Rods"],
|
| 12 |
+
"Haemolysis Type": ["None", "Beta", "Gamma", "Alpha"],
|
| 13 |
+
}
|
| 14 |
+
|
| 15 |
+
SCHEMA: Dict[str, Dict[str, Any]] = {
|
| 16 |
+
"Genus": {"type": "text", "required": True},
|
| 17 |
+
"Species": {"type": "text", "required": False},
|
| 18 |
+
|
| 19 |
+
"Gram Stain": {"type": "enum", "allowed": ENUMS["Gram Stain"]},
|
| 20 |
+
"Shape": {"type": "enum", "allowed": ENUMS["Shape"]},
|
| 21 |
+
"Colony Morphology": {"type": "multienum", "separator": MULTI_SEPARATOR},
|
| 22 |
+
"Haemolysis": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 23 |
+
"Haemolysis Type": {"type": "multienum", "separator": MULTI_SEPARATOR, "allowed": ENUMS["Haemolysis Type"]},
|
| 24 |
+
"Motility": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 25 |
+
"Capsule": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 26 |
+
"Spore Formation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 27 |
+
|
| 28 |
+
"Growth Temperature": {"type": "range", "format": "low//high", "units": "°C"},
|
| 29 |
+
"Oxygen Requirement": {"type": "text"},
|
| 30 |
+
"Media Grown On": {"type": "multienum", "separator": MULTI_SEPARATOR},
|
| 31 |
+
|
| 32 |
+
"Catalase": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 33 |
+
"Oxidase": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 34 |
+
"Indole": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 35 |
+
"Urease": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 36 |
+
"Citrate": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 37 |
+
"Methyl Red": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 38 |
+
"VP": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 39 |
+
"H2S": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 40 |
+
"DNase": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 41 |
+
"ONPG": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 42 |
+
"Coagulase": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 43 |
+
"Lipase Test": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 44 |
+
"Nitrate Reduction": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 45 |
+
|
| 46 |
+
"NaCl Tolerant (>=6%)": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 47 |
+
|
| 48 |
+
"Lysine Decarboxylase": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 49 |
+
"Ornitihine Decarboxylase": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 50 |
+
"Arginine dihydrolase": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 51 |
+
|
| 52 |
+
"Gelatin Hydrolysis": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 53 |
+
"Esculin Hydrolysis": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 54 |
+
|
| 55 |
+
"Glucose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 56 |
+
"Lactose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 57 |
+
"Sucrose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 58 |
+
"Mannitol Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 59 |
+
"Sorbitol Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 60 |
+
"Maltose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 61 |
+
"Xylose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 62 |
+
"Rhamnose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 63 |
+
"Arabinose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 64 |
+
"Raffinose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 65 |
+
"Trehalose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 66 |
+
"Inositol Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
|
| 67 |
+
|
| 68 |
+
"Extra Notes": {"type": "text"},
|
| 69 |
+
}
|
| 70 |
+
|
| 71 |
+
FIELDS_ORDER: List[str] = list(SCHEMA.keys())
|
| 72 |
+
|
| 73 |
+
MULTI_FIELDS: List[str] = [
|
| 74 |
+
k for k, v in SCHEMA.items() if v.get("type") == "multienum"
|
| 75 |
+
]
|
| 76 |
+
|
| 77 |
+
PNV_FIELDS: List[str] = [
|
| 78 |
+
k for k, v in SCHEMA.items()
|
| 79 |
+
if v.get("type") == "enum" and v.get("allowed") == POS_NEG_VAR
|
| 80 |
+
]
|
| 81 |
+
|
| 82 |
+
def is_enum_field(field: str) -> bool:
    """True when *field* is declared in SCHEMA with type 'enum'."""
    spec = SCHEMA.get(field, {})
    return spec.get("type") == "enum"
|
| 84 |
+
|
| 85 |
+
def is_multienum_field(field: str) -> bool:
    """True when *field* is declared in SCHEMA with type 'multienum'."""
    spec = SCHEMA.get(field, {})
    return spec.get("type") == "multienum"
|
| 87 |
+
|
| 88 |
+
def is_range_field(field: str) -> bool:
    """True when *field* is declared in SCHEMA with type 'range'."""
    spec = SCHEMA.get(field, {})
    return spec.get("type") == "range"
|
| 90 |
+
|
| 91 |
+
def normalize_value(field: str, value: str) -> str:
    """
    Canonicalise *value* for *field* according to its SCHEMA spec.

    - None / blank / 'unknown' (any case) -> UNKNOWN.
    - enum: case-insensitive match against allowed values, then the
      +/-/pos/neg/var shorthands; unrecognised values pass through.
    - multienum: normalise each MULTI_SEPARATOR-joined part against the
      allowed list (when one exists) and re-join.
    - range: only strips spaces; format validation happens in
      validate_record.
    """
    if value is None or str(value).strip() == "":
        return UNKNOWN
    v = str(value).strip()

    if v.lower() == "unknown":
        return UNKNOWN

    meta = SCHEMA.get(field, {})
    ftype = meta.get("type")

    if ftype == "enum":
        allowed = meta.get("allowed", [])
        # First: exact case-insensitive match, returning the canonical casing.
        for a in allowed:
            if v.lower() == a.lower():
                return a
        # Then: common shorthands, accepted only when the canonical label
        # is actually allowed for this field.
        if v.lower() in ["+", "positive", "pos"]:
            return "Positive" if "Positive" in allowed else v
        if v.lower() in ["-", "negative", "neg"]:
            return "Negative" if "Negative" in allowed else v
        if v.lower() in ["variable", "var", "v"]:
            return "Variable" if "Variable" in allowed else v
        return v

    if ftype == "multienum":
        parts = [p.strip() for p in v.split(MULTI_SEPARATOR) if p.strip()]
        allowed = meta.get("allowed")
        normed = []
        for p in parts:
            if not allowed:
                normed.append(p)
            else:
                # Case-insensitive lookup; keep the raw token when it is not
                # allowed (validate_record will flag it later).
                hit = next((a for a in allowed if a.lower() == p.lower()), None)
                normed.append(hit if hit else p)
        return f" {MULTI_SEPARATOR} ".join(normed) if normed else UNKNOWN

    if ftype == "range":
        txt = v.replace(" ", "")
        return txt

    return v
|
| 132 |
+
|
| 133 |
+
def validate_record(rec: Dict[str, Any]) -> Tuple[bool, List[str]]:
    """
    Check every schema field present in *rec* against its SCHEMA spec.

    Fields absent from the record are skipped, and UNKNOWN is always
    accepted. Returns (ok, issues); ok is True iff no issues collected.
    """
    issues: List[str] = []
    for field in FIELDS_ORDER:
        meta = SCHEMA[field]
        if field not in rec:
            continue
        val = rec[field]

        if meta["type"] == "enum":
            # Exact (case-sensitive) membership: normalize_value is expected
            # to have canonicalised casing before validation.
            allowed = meta.get("allowed", [])
            if str(val) not in allowed + [UNKNOWN]:
                issues.append(f"{field}: '{val}' not in {allowed + [UNKNOWN]}")

        elif meta["type"] == "multienum":
            if val == UNKNOWN:
                continue
            # Values are stored joined by MULTI_SEPARATOR; validate each part.
            parts = [p.strip() for p in str(val).split(MULTI_SEPARATOR) if p.strip()]
            allowed = meta.get("allowed")
            if allowed:
                bad = [p for p in parts if p not in allowed]
                if bad:
                    issues.append(f"{field}: invalid values {bad}; allowed {allowed}")

        elif meta["type"] == "range":
            if val == UNKNOWN:
                continue
            txt = str(val).replace(" ", "")
            if "//" not in txt:
                issues.append(f"{field}: expected 'low//high' got '{val}'")
            else:
                try:
                    # More than one '//' also lands in the except branch via
                    # the unpacking ValueError.
                    low, high = [float(x) for x in txt.split("//")]
                    if low > high:
                        issues.append(f"{field}: low {low} > high {high}")
                except Exception:
                    issues.append(f"{field}: non-numeric bounds '{val}'")

    ok = len(issues) == 0
    return ok, issues
|
| 172 |
+
|
| 173 |
+
def empty_record() -> Dict[str, str]:
    """Build a blank record: empty identity fields, UNKNOWN for all tests."""
    identity = ("Genus", "Species")
    return {name: ("" if name in identity else UNKNOWN) for name in SCHEMA}
|
engine/validator.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# engine/validator.py
|
| 2 |
+
# ---------------------------------
|
| 3 |
+
# Placeholder for logical validation layer
|
| 4 |
+
|
| 5 |
+
def validate_record(parsed: dict) -> dict:
    """
    Later: check for contradictions, invalid values,
    and normalize to schema.

    For now, appends a placeholder note to the record's
    'validation_notes' list (creating it if absent) and returns
    the same dict, mutated in place.
    """
    notes = parsed.setdefault("validation_notes", [])
    notes.append("Validator not yet implemented.")
    return parsed
|
engine/weights.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# engine/weights.py
|
| 2 |
+
# ---------------------------------
|
| 3 |
+
# Placeholder for field importance weighting
|
| 4 |
+
|
| 5 |
+
DEFAULT_WEIGHTS = {}
|
| 6 |
+
|
| 7 |
+
def update_weights_from_gold(gold_results):
    """
    Future: adjust field importance weights
    based on gold test accuracy stats.

    Placeholder — currently a no-op that ignores its argument.
    """
    pass
|
training/__init__.py
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Marks the 'training' directory as a Python package
|
| 2 |
+
|
training/alias_trainer.py
ADDED
|
@@ -0,0 +1,126 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# training/alias_trainer.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Stage 10B - Alias Trainer
|
| 4 |
+
#
|
| 5 |
+
# Learns field/value synonyms from gold tests by comparing:
|
| 6 |
+
# - expected values (gold standard)
|
| 7 |
+
# - parsed values (rules + extended)
|
| 8 |
+
#
|
| 9 |
+
# Outputs:
|
| 10 |
+
# - Updated alias_maps.json
|
| 11 |
+
#
|
| 12 |
+
# This is the core intelligence that allows BactAI-D
|
| 13 |
+
# to understand variations in microbiology language.
|
| 14 |
+
# ------------------------------------------------------------
|
| 15 |
+
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
from collections import defaultdict
|
| 19 |
+
|
| 20 |
+
from engine.parser_rules import parse_text_rules
|
| 21 |
+
from engine.parser_ext import parse_text_extended
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
GOLD_PATH = "training/gold_tests.json"
|
| 25 |
+
ALIAS_PATH = "data/alias_maps.json"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def normalise(s):
    """Return the lower-cased, whitespace-stripped string form of *s*.

    None is treated as an empty string.
    """
    return "" if s is None else str(s).strip().lower()
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def learn_aliases():
    """
    Learn synonym mappings from gold tests.

    Compares the deterministic parsers' output against the expected gold
    values; repeated mismatches (seen at least twice) become alias
    candidates that map the parser's value to the field's canonical value
    (the most common expected value for that field). Results are merged
    into alias_maps.json.

    Returns a summary dict with the new mappings, or {"error": ...} when
    the gold tests file is missing.
    """
    if not os.path.exists(GOLD_PATH):
        return {"error": f"Gold tests missing: {GOLD_PATH}"}

    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gold = json.load(f)

    # Load or create alias map
    if os.path.exists(ALIAS_PATH):
        with open(ALIAS_PATH, "r", encoding="utf-8") as f:
            alias_maps = json.load(f)
    else:
        alias_maps = {}

    # Track suggestions: field -> mismatched parser value -> occurrence count
    suggestions = defaultdict(lambda: defaultdict(int))

    # ------------------------------------------------------------
    # Compare expected vs parsed for all tests
    # ------------------------------------------------------------
    for test in gold:
        text = test.get("input", "")
        expected = test.get("expected", {})

        rules = parse_text_rules(text).get("parsed_fields", {})
        ext = parse_text_extended(text).get("parsed_fields", {})

        # merge deterministic parsers (extended wins when it has a value)
        merged = dict(rules)
        for k, v in ext.items():
            if v != "Unknown":
                merged[k] = v

        # now compare with expected
        for field, exp_val in expected.items():
            exp_norm = normalise(exp_val)
            got_norm = normalise(merged.get(field, "Unknown"))

            # Skip correct matches
            if exp_norm == got_norm:
                continue

            # Skip unknown expected
            if exp_norm in ["", "unknown"]:
                continue

            # Mismatched -> candidate alias
            if got_norm not in ["", "unknown"]:
                suggestions[field][got_norm] += 1

    # ------------------------------------------------------------
    # Convert suggestions into alias mappings
    # ------------------------------------------------------------
    alias_updates = {}

    for field, values in suggestions.items():
        # Canonical target = most common expected value for this field.
        # Hoisted out of the per-value loop: it depends only on `field`
        # (the old code recomputed it for every mismatched value).
        # .get() guards against gold tests that lack an "expected" key,
        # which previously raised KeyError here.
        field_values = [
            normalise(t["expected"][field])
            for t in gold
            if field in t.get("expected", {})
        ]
        canonical = max(set(field_values), key=field_values.count) if field_values else None

        for wrong_value, count in values.items():
            if count < 2:
                continue  # avoid noise from one-off mismatches

            # add/update alias bucket for this field
            if field not in alias_maps:
                alias_maps[field] = {}

            if canonical:
                alias_maps[field][wrong_value] = canonical
                alias_updates[f"{field}:{wrong_value}"] = canonical

    # ------------------------------------------------------------
    # Save alias maps
    # ------------------------------------------------------------
    with open(ALIAS_PATH, "w", encoding="utf-8") as f:
        json.dump(alias_maps, f, indent=2)

    return {
        "ok": True,
        "updated_aliases": alias_updates,
        "total_updates": len(alias_updates),
        "alias_map_path": ALIAS_PATH,
    }
|
training/gold_tester.py
ADDED
|
@@ -0,0 +1,169 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# training/gold_tester.py
|
| 2 |
+
# ----------------------------------------------------
|
| 3 |
+
# Enhanced tester: audits expected fields not in schema,
|
| 4 |
+
# adds DNase/Dnase alias and range-aware Growth Temperature matching.
|
| 5 |
+
|
| 6 |
+
import json, os, time, csv
|
| 7 |
+
from collections import Counter
|
| 8 |
+
from typing import Dict, List, Tuple
|
| 9 |
+
from engine.schema import SCHEMA, UNKNOWN, normalize_value, is_enum_field
|
| 10 |
+
from engine.parser_rules import parse_text_rules
|
| 11 |
+
|
| 12 |
+
REPORTS_DIR = "reports"
|
| 13 |
+
PROPOSALS_PATH = os.path.join("data", "extended_proposals.jsonl")
|
| 14 |
+
GOLD_PATH = os.path.join("training", "gold_tests.json")
|
| 15 |
+
|
| 16 |
+
# --- helpers ---
|
| 17 |
+
def load_gold() -> List[Dict]:
    """Read and return the list of gold test cases from GOLD_PATH."""
    with open(GOLD_PATH, "r", encoding="utf-8") as handle:
        cases = json.load(handle)
    return cases
|
| 20 |
+
|
| 21 |
+
def _range_overlap(a: str, b: str) -> bool:
|
| 22 |
+
try:
|
| 23 |
+
la, ha = [float(x) for x in a.split("//")]
|
| 24 |
+
lb, hb = [float(x) for x in b.split("//")]
|
| 25 |
+
return not (ha < lb or hb < la)
|
| 26 |
+
except Exception:
|
| 27 |
+
return False
|
| 28 |
+
|
| 29 |
+
def compare_records(pred: Dict[str, str], exp: Dict[str, str]) -> Tuple[int, int, Dict[str, Tuple[str, str]]]:
    """Compare a predicted record against an expected one, field by field.

    Returns (correct, total, errors) where `errors` maps each mismatched
    field to (predicted, expected). "Growth Temperature" counts as correct
    when the two 'low//high' ranges merely overlap; every other field
    requires exact equality. Missing predictions default to UNKNOWN.
    """
    errors: Dict[str, Tuple[str, str]] = {}
    correct = 0
    for field, expected_value in exp.items():
        predicted = pred.get(field, UNKNOWN)
        if (field == "Growth Temperature"
                and predicted != UNKNOWN
                and expected_value != UNKNOWN
                and _range_overlap(predicted, expected_value)):
            correct += 1
            continue
        if predicted == expected_value:
            correct += 1
        else:
            errors[field] = (predicted, expected_value)
    return correct, len(exp), errors
|
| 43 |
+
|
| 44 |
+
def append_proposal(record: Dict):
    """Append one proposal record as a JSON line to PROPOSALS_PATH.

    Creates the parent directory on first use.
    """
    os.makedirs(os.path.dirname(PROPOSALS_PATH), exist_ok=True)
    line = json.dumps(record, ensure_ascii=False)
    with open(PROPOSALS_PATH, "a", encoding="utf-8") as sink:
        sink.write(line + "\n")
|
| 48 |
+
|
| 49 |
+
# --- main ---
|
| 50 |
+
def run_gold_tests(mode: str = "rules") -> Dict:
    """
    Run every gold test through the rule parser and score the results.

    For each case: normalizes expected key aliases (Dnase -> DNase),
    normalizes the parser output against the schema, logs unknown
    fields/values and out-of-schema expected fields as proposals, and
    scores with compare_records (range-aware for Growth Temperature).

    Writes a JSON report plus per-field and per-case CSVs under
    REPORTS_DIR and returns {"summary": report, "paths": {...}}.
    """
    tests = load_gold()
    ts = time.strftime("%Y%m%d_%H%M%S")

    per_field_counts, per_field_correct, per_field_cov = Counter(), Counter(), Counter()
    unknown_fields, unknown_values = Counter(), Counter()
    expected_unknowns = Counter()
    detailed_rows = []
    cases_with_misses = 0

    for case in tests:
        name, text, expected = case.get("name", ""), case.get("input", ""), case.get("expected", {})

        # normalize expected key aliases (e.g. "Dnase" -> "DNase")
        expected_norm = {}
        for k, v in expected.items():
            k2 = "DNase" if k.lower() == "dnase" else k
            expected_norm[k2] = v
        expected = expected_norm

        out = parse_text_rules(text)
        parsed = out.get("parsed_fields", {})

        # normalize parser output; audit fields/values outside the schema
        normalized_pred = {}
        for field, val in parsed.items():
            if field not in SCHEMA:
                unknown_fields[field] += 1
                append_proposal({
                    "type": "unknown_field",
                    "field": field,
                    "value": val,
                    "case_name": name,
                    "timestamp": ts
                })
                continue
            normalized_pred[field] = normalize_value(field, val)
            if is_enum_field(field):
                allowed = SCHEMA[field].get("allowed", [])
                if normalized_pred[field] not in allowed + [UNKNOWN]:
                    unknown_values[(field, normalized_pred[field])] += 1
                    append_proposal({
                        "type": "unknown_value",
                        "field": field,
                        "value": normalized_pred[field],
                        "allowed": allowed,
                        "case_name": name,
                        "timestamp": ts
                    })

        # audit expected fields not in schema
        for ef in expected.keys():
            if ef not in SCHEMA:
                expected_unknowns[ef] += 1
                append_proposal({
                    "type": "expected_field_not_in_schema",
                    "field": ef,
                    "case_name": name,
                    "timestamp": ts
                })

        correct, total, errors = compare_records(normalized_pred, expected)
        if errors:
            cases_with_misses += 1

        # per-field accuracy and coverage tallies
        for f in expected.keys():
            per_field_counts[f] += 1
            if f in normalized_pred and normalized_pred[f] != UNKNOWN:
                per_field_cov[f] += 1
            if f not in errors:
                per_field_correct[f] += 1

        detailed_rows.append({
            "name": name,
            "parsed": json.dumps(normalized_pred, ensure_ascii=False),
            "expected": json.dumps(expected, ensure_ascii=False),
            "correct_fields": correct,
            "total_fields": total
        })

    # --- aggregate metrics ---
    per_field_metrics = []
    for f, tot in per_field_counts.items():
        acc = per_field_correct[f] / tot if tot else 0.0
        cov = per_field_cov[f] / tot if tot else 0.0
        per_field_metrics.append({"field": f, "accuracy": round(acc, 4), "coverage": round(cov, 4), "n": tot})
    per_field_metrics.sort(key=lambda x: x["field"])

    micro_acc = sum(per_field_correct.values()) / sum(per_field_counts.values()) if per_field_counts else 0.0

    os.makedirs(REPORTS_DIR, exist_ok=True)
    report = {
        "mode": mode,
        "timestamp": ts,
        "num_tests": len(tests),
        "micro_accuracy": round(micro_acc, 4),
        "cases_with_misses": cases_with_misses,
        "per_field": per_field_metrics,
        "unknown_fields": dict(unknown_fields),
        "unknown_values": {f"{k[0]}::{k[1]}": v for k, v in unknown_values.items()},
        "expected_unknown_fields": dict(expected_unknowns),
        "proposals_path": PROPOSALS_PATH
    }
    json_path = os.path.join(REPORTS_DIR, f"gold_report_{mode}_{ts}.json")
    csv_fields = os.path.join(REPORTS_DIR, f"gold_fields_{mode}_{ts}.csv")
    csv_cases = os.path.join(REPORTS_DIR, f"gold_cases_{mode}_{ts}.csv")

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    # fix: dropped the redundant function-local `import csv` that shadowed
    # the module-level import.
    with open(csv_fields, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["field", "accuracy", "coverage", "n"])
        w.writeheader()
        w.writerows(per_field_metrics)
    with open(csv_cases, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["name", "parsed", "expected", "correct_fields", "total_fields"])
        w.writeheader()
        w.writerows(detailed_rows)

    return {"summary": report, "paths": {"json_report": json_path, "csv_fields": csv_fields, "csv_cases": csv_cases}}
|
training/gold_tests.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training/gold_trainer.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# training/gold_trainer.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Master training pipeline for:
|
| 4 |
+
# - Alias Trainer (Stage 10B)
|
| 5 |
+
# - Schema Expansion (Stage 10C)
|
| 6 |
+
# - Extended Signals (Stage 10C)
|
| 7 |
+
# ------------------------------------------------------------
|
| 8 |
+
|
| 9 |
+
from typing import Dict, Any
|
| 10 |
+
|
| 11 |
+
from training.alias_trainer import learn_aliases
|
| 12 |
+
|
| 13 |
+
# Try importing Stage 10C components, but don't crash if missing
try:
    from training.schema_expander import expand_schema
except Exception:
    # Fallback stub: lets train_from_gold() run end-to-end and report
    # the missing component instead of failing at import time.
    def expand_schema():
        return {
            "ok": False,
            "message": "schema_expander not implemented or import failed (Stage 10C).",
        }

try:
    from training.signal_trainer import train_signals
except Exception:
    # Fallback stub mirroring the expand_schema fallback above.
    def train_signals():
        return {
            "ok": False,
            "message": "signal_trainer not implemented or import failed (Stage 10C).",
        }
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def train_from_gold() -> Dict[str, Any]:
    """
    Runs all training modules on gold tests.
    Currently:
      - Stage 10B: Alias Trainer
      - Stage 10C: Schema Expansion (stub)
      - Stage 10C: Extended Signals (stub)

    Each Stage 10C step is wrapped so a failure is reported in the
    result dict rather than raised.
    """
    results: Dict[str, Any] = {}

    # Stage 10B - Alias Trainer (not guarded, matching original behaviour)
    results["alias_trainer"] = learn_aliases()

    def _guarded(step, failure_note):
        # Run one optional training step; convert any exception into a
        # structured failure record.
        try:
            return step()
        except Exception as exc:
            return {
                "ok": False,
                "error": str(exc),
                "message": failure_note,
            }

    # Stage 10C - Schema Expansion
    results["schema_expander"] = _guarded(
        expand_schema, "Error while running schema_expander."
    )
    # Stage 10C - Signals Trainer
    results["signals_trainer"] = _guarded(
        train_signals, "Error while running signal_trainer."
    )

    return results
|
training/parser_eval.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# training/parser_eval.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Parser Evaluation (Stage 10A)
|
| 4 |
+
#
|
| 5 |
+
# This version ONLY evaluates:
|
| 6 |
+
# - Rule parser
|
| 7 |
+
# - Extended parser
|
| 8 |
+
#
|
| 9 |
+
# The LLM parser is intentionally disabled at this stage
|
| 10 |
+
# because alias maps and schema are not trained yet.
|
| 11 |
+
#
|
| 12 |
+
# This makes Stage 10A FAST and stable (< 3 seconds).
|
| 13 |
+
# ------------------------------------------------------------
|
| 14 |
+
|
| 15 |
+
import json
|
| 16 |
+
import os
|
| 17 |
+
from typing import Dict, Any
|
| 18 |
+
|
| 19 |
+
from engine.parser_rules import parse_text_rules
|
| 20 |
+
from engine.parser_ext import parse_text_extended
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
# Path to the gold tests
|
| 24 |
+
GOLD_PATH = "training/gold_tests.json"
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def evaluate_single_test(test: Dict[str, Any]) -> Dict[str, Any]:
    """
    Evaluate one gold test with rules + extended parsers.

    Fields whose expected value is "Unknown" are neutral: when the parser
    also reports Unknown they are excluded from BOTH `correct` and
    `total`. (The old code added 0 to `correct` but still counted the
    field in `total`, silently deflating accuracy for matched Unknowns.)

    Returns {"correct", "total", "accuracy", "wrong", "merged"}.
    """
    text = test.get("input", "")
    expected = test.get("expected", {})

    # Run deterministic parsers
    rule_out = parse_text_rules(text).get("parsed_fields", {})
    ext_out = parse_text_extended(text).get("parsed_fields", {})

    # Merge rule + extended (extended overwrites rules when it has a value)
    merged = dict(rule_out)
    for k, v in ext_out.items():
        if v != "Unknown":
            merged[k] = v

    total = 0
    correct = 0
    wrong = {}

    for field, exp_val in expected.items():
        got = merged.get(field, "Unknown")
        # str() guards against non-string values in the gold file
        if str(got).lower() == str(exp_val).lower():
            if str(exp_val).lower() == "unknown":
                continue  # neutral: a matched Unknown counts for neither side
            correct += 1
            total += 1
        else:
            wrong[field] = {"expected": exp_val, "got": got}
            total += 1

    return {
        "correct": correct,
        "total": total,
        "accuracy": correct / total if total else 0,
        "wrong": wrong,
        "merged": merged,
    }
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
def run_parser_eval(mode: str = "rules_extended") -> Dict[str, Any]:
    """
    Evaluate ALL gold tests using rules + extended parsing only.

    Returns an overall accuracy summary plus details of every case with
    at least one wrong field, or {"error": ...} when the gold file is
    missing. (Fix: removed a `results` list that accumulated every
    per-test output but was never used — dead work and wasted memory.)
    """
    if not os.path.exists(GOLD_PATH):
        return {"error": f"Gold test file not found at {GOLD_PATH}"}

    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gold = json.load(f)

    wrong_cases = []
    total_correct = 0
    total_fields = 0

    for test in gold:
        out = evaluate_single_test(test)

        total_correct += out["correct"]
        total_fields += out["total"]

        if out["wrong"]:
            wrong_cases.append({
                "name": test.get("name", "Unnamed"),
                "wrong": out["wrong"],
                "parsed": out["merged"],
                "expected": test.get("expected", {})
            })

    summary = {
        "mode": "rules+extended",
        "tests": len(gold),
        "total_correct": total_correct,
        "total_fields": total_fields,
        "overall_accuracy": total_correct / total_fields if total_fields else 0,
        "wrong_cases": wrong_cases,
    }

    return summary
|
training/repo_sync_hf.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# training/repo_sync_hf.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Sync updated data files back to the SAME Hugging Face Space
|
| 4 |
+
# repo that the app is running from.
|
| 5 |
+
#
|
| 6 |
+
# Uses:
|
| 7 |
+
# HF_TOKEN -> a write token (set in Space secrets)
|
| 8 |
+
# HF_SPACE_REPO_ID -> e.g. "username/space-name"
|
| 9 |
+
#
|
| 10 |
+
# Call from app.py with:
|
| 11 |
+
# from training.repo_sync_hf import push_updates_to_hf
|
| 12 |
+
# result = push_updates_to_hf([...], commit_message="...")
|
| 13 |
+
# ------------------------------------------------------------
|
| 14 |
+
|
| 15 |
+
import os
|
| 16 |
+
from typing import List, Dict, Any
|
| 17 |
+
|
| 18 |
+
from huggingface_hub import HfApi, CommitOperationAdd
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def push_updates_to_hf(
    paths: List[str],
    commit_message: str = "train: update extended schema, aliases, signals from gold tests",
) -> Dict[str, Any]:
    """
    Create a single commit on the current Space repo with the given files.
    Each path is used both as local path and path_in_repo.

    Requires HF_SPACE_REPO_ID and HF_TOKEN environment variables. Missing
    local files are skipped and now actually reported under "skipped"
    (the old code's comment promised to record them but never did).
    """
    repo_id = os.getenv("HF_SPACE_REPO_ID")
    token = os.getenv("HF_TOKEN")

    if not repo_id:
        return {
            "ok": False,
            "error": "Missing HF_SPACE_REPO_ID environment variable.",
            "uploaded": [],
        }

    if not token:
        return {
            "ok": False,
            "error": "Missing HF_TOKEN environment variable.",
            "uploaded": [],
        }

    api = HfApi()
    operations = []
    uploaded = []
    skipped = []  # fix: record skipped files as the original comment intended

    for p in paths:
        if not os.path.exists(p):
            skipped.append(p)
            continue

        operations.append(
            CommitOperationAdd(path_in_repo=p, path_or_fileobj=p)
        )
        uploaded.append(p)

    if not operations:
        return {
            "ok": False,
            "error": "No existing files to upload.",
            "uploaded": [],
            "skipped": skipped,
        }

    commit_info = api.create_commit(
        repo_id=repo_id,
        repo_type="space",
        operations=operations,
        commit_message=commit_message,
        token=token,
    )

    return {
        "ok": True,
        "uploaded": uploaded,
        "skipped": skipped,
        "repo_id": repo_id,
        "commit_message": commit_message,
        "commit_url": commit_info.commit_url,
    }
|
training/schema_expander.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# training/schema_expander.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Placeholder for Stage 10C - Schema Expansion
|
| 4 |
+
#
|
| 5 |
+
# At this stage we haven't implemented schema expansion yet,
|
| 6 |
+
# so this stub lets the alias trainer run without error.
|
| 7 |
+
# ------------------------------------------------------------
|
| 8 |
+
|
| 9 |
+
def expand_schema():
    """Stage 10C placeholder: report success without touching the schema."""
    result = dict(
        ok=True,
        message="Schema expander not implemented yet (Stage 10C placeholder).",
    )
    return result
|
training/signal_trainer.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# training/signal_trainer.py
|
| 2 |
+
# ------------------------------------------------------------
|
| 3 |
+
# Placeholder for Stage 10C - Extended Signals Trainer
|
| 4 |
+
#
|
| 5 |
+
# This will be implemented in Stage 10C.
|
| 6 |
+
# For now, it must exist so imports succeed.
|
| 7 |
+
# ------------------------------------------------------------
|
| 8 |
+
|
| 9 |
+
def train_signals():
    """Stage 10C placeholder: exists so imports succeed; trains nothing."""
    result = dict(
        ok=True,
        message="Signal trainer not implemented yet (Stage 10C placeholder).",
    )
    return result
|