EphAsad commited on
Commit
1168cd6
·
verified ·
1 Parent(s): 6d84739

Upload 23 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/bacteria_db.xlsx filter=lfs diff=lfs merge=lfs -text
data/alias_maps.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "field_aliases": {
3
+ "Dnase": "DNase",
4
+ "CAMP Test": "CAMP",
5
+ "Optochin Sensitivity": "Optochin",
6
+ "Bile Solubility Test": "Bile Solubility",
7
+ "Hippurate": "Hippurate Hydrolysis",
8
+ "PYR Test": "PYR"
9
+ },
10
+ "media_aliases": {
11
+ "mac": "MacConkey Agar",
12
+ "macconkey": "MacConkey Agar",
13
+ "msa": "Mannitol Salt Agar",
14
+ "bap": "Blood Agar",
15
+ "choc": "Chocolate Agar",
16
+ "chocolate": "Chocolate Agar",
17
+ "cled": "CLED Agar"
18
+ },
19
+ "value_aliases_pnv": {
20
+ "+": "Positive",
21
+ "pos": "Positive",
22
+ "positive": "Positive",
23
+ "-": "Negative",
24
+ "neg": "Negative",
25
+ "negative": "Negative",
26
+ "variable": "Variable",
27
+ "var": "Variable"
28
+ },
29
+ "Motility": {
30
+ "positive": "positive"
31
+ }
32
+ }
data/bacteria_db.xlsx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c19c78ad2851aaa77f55f8f191748defa5e4a0654a11c9e5132f5b086d3c4543
3
+ size 2687947
data/extended_schema.json ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "CAMP": {
3
+ "value_type": "enum_PNV",
4
+ "status": "experimental",
5
+ "aliases": [
6
+ "CAMP Test"
7
+ ]
8
+ },
9
+ "Hippurate Hydrolysis": {
10
+ "value_type": "enum_PNV",
11
+ "status": "experimental",
12
+ "aliases": []
13
+ },
14
+ "PYR": {
15
+ "value_type": "enum_PNV",
16
+ "status": "experimental",
17
+ "aliases": []
18
+ },
19
+ "Optochin": {
20
+ "value_type": "enum_PNV",
21
+ "status": "experimental",
22
+ "aliases": [
23
+ "Optochin Sensitivity"
24
+ ]
25
+ },
26
+ "Bile Solubility": {
27
+ "value_type": "enum_PNV",
28
+ "status": "experimental",
29
+ "aliases": []
30
+ },
31
+ "Novobiocin": {
32
+ "value_type": "enum_PNV",
33
+ "status": "experimental",
34
+ "aliases": []
35
+ },
36
+ "Bile Resistance": {
37
+ "value_type": "enum_PNV",
38
+ "status": "experimental",
39
+ "aliases": []
40
+ },
41
+ "Lipase": {
42
+ "value_type": "enum_PNV",
43
+ "status": "experimental",
44
+ "aliases": []
45
+ },
46
+ "Lecithinase": {
47
+ "value_type": "enum_PNV",
48
+ "status": "experimental",
49
+ "aliases": []
50
+ },
51
+ "Odour": {
52
+ "value_type": "enum_PNV",
53
+ "status": "experimental",
54
+ "aliases": []
55
+ },
56
+ "Growth Factors": {
57
+ "value_type": "enum_PNV",
58
+ "status": "experimental",
59
+ "aliases": []
60
+ },
61
+ "Fructose Fermentation": {
62
+ "value_type": "enum_PNV",
63
+ "status": "experimental",
64
+ "aliases": []
65
+ },
66
+ "Glucose Oxidation": {
67
+ "value_type": "enum_PNV",
68
+ "status": "experimental",
69
+ "aliases": []
70
+ },
71
+ "Glycerol Fermentation": {
72
+ "value_type": "enum_PNV",
73
+ "status": "experimental",
74
+ "aliases": []
75
+ },
76
+ "Fermentation Products": {
77
+ "value_type": "enum_PNV",
78
+ "status": "experimental",
79
+ "aliases": []
80
+ },
81
+ "Cellobiose Fermentation": {
82
+ "value_type": "enum_PNV",
83
+ "status": "experimental",
84
+ "aliases": []
85
+ },
86
+ "pH Range": {
87
+ "value_type": "enum_PNV",
88
+ "status": "experimental",
89
+ "aliases": []
90
+ },
91
+ "Iron Oxidation": {
92
+ "value_type": "enum_PNV",
93
+ "status": "experimental",
94
+ "aliases": []
95
+ },
96
+ "NaCl Tolerant (>=15%)": {
97
+ "value_type": "enum_PNV",
98
+ "status": "experimental",
99
+ "aliases": []
100
+ },
101
+ "Temperature Dependence": {
102
+ "value_type": "enum_PNV",
103
+ "status": "experimental",
104
+ "aliases": []
105
+ },
106
+ "Sulfur Utilization": {
107
+ "value_type": "enum_PNV",
108
+ "status": "experimental",
109
+ "aliases": []
110
+ },
111
+ "Acid Fast": {
112
+ "value_type": "enum_PNV",
113
+ "status": "experimental",
114
+ "aliases": []
115
+ },
116
+ "Casein Hydrolysis": {
117
+ "value_type": "enum_PNV",
118
+ "status": "experimental",
119
+ "aliases": []
120
+ },
121
+ "Tyrosine Hydrolysis": {
122
+ "value_type": "enum_PNV",
123
+ "status": "experimental",
124
+ "aliases": []
125
+ },
126
+ "Mannose Fermentation": {
127
+ "value_type": "enum_PNV",
128
+ "status": "experimental",
129
+ "aliases": []
130
+ },
131
+ "Gas Production": {
132
+ "value_type": "enum_PNV",
133
+ "status": "experimental",
134
+ "aliases": []
135
+ },
136
+ "Inulin Fermentation": {
137
+ "value_type": "enum_PNV",
138
+ "status": "experimental",
139
+ "aliases": []
140
+ },
141
+ "Other Products": {
142
+ "value_type": "enum_PNV",
143
+ "status": "experimental",
144
+ "aliases": []
145
+ },
146
+ "Antibiotic Resistance": {
147
+ "value_type": "enum_PNV",
148
+ "status": "experimental",
149
+ "aliases": []
150
+ },
151
+ "Metabolic Product": {
152
+ "value_type": "enum_PNV",
153
+ "status": "experimental",
154
+ "aliases": []
155
+ },
156
+ "Bacitracin": {
157
+ "value_type": "enum_PNV",
158
+ "status": "experimental",
159
+ "aliases": []
160
+ }
161
+ }
data/signals_catalog.json ADDED
@@ -0,0 +1,1012 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Staphylococcus": {
3
+ "Indole": {
4
+ "Positive": 0,
5
+ "Negative": 18,
6
+ "Variable": 0,
7
+ "_n": 18
8
+ },
9
+ "Novobiocin": {
10
+ "Positive": 4,
11
+ "Negative": 2,
12
+ "Variable": 0,
13
+ "_n": 6
14
+ },
15
+ "Antibiotic Resistance": {
16
+ "Positive": 0,
17
+ "Negative": 0,
18
+ "Variable": 0,
19
+ "_n": 0
20
+ },
21
+ "CAMP": {
22
+ "Positive": 0,
23
+ "Negative": 6,
24
+ "Variable": 0,
25
+ "_n": 6
26
+ },
27
+ "PYR": {
28
+ "Positive": 0,
29
+ "Negative": 6,
30
+ "Variable": 0,
31
+ "_n": 6
32
+ },
33
+ "Optochin": {
34
+ "Positive": 0,
35
+ "Negative": 4,
36
+ "Variable": 0,
37
+ "_n": 4
38
+ },
39
+ "Bacitracin": {
40
+ "Positive": 0,
41
+ "Negative": 2,
42
+ "Variable": 0,
43
+ "_n": 2
44
+ }
45
+ },
46
+ "Salmonella": {
47
+ "Indole": {
48
+ "Positive": 0,
49
+ "Negative": 18,
50
+ "Variable": 0,
51
+ "_n": 18
52
+ }
53
+ },
54
+ "Enterobacter": {
55
+ "Indole": {
56
+ "Positive": 0,
57
+ "Negative": 30,
58
+ "Variable": 0,
59
+ "_n": 30
60
+ }
61
+ },
62
+ "Pseudomonas": {
63
+ "Indole": {
64
+ "Positive": 0,
65
+ "Negative": 18,
66
+ "Variable": 0,
67
+ "_n": 18
68
+ }
69
+ },
70
+ "Bacillus": {
71
+ "Indole": {
72
+ "Positive": 0,
73
+ "Negative": 42,
74
+ "Variable": 0,
75
+ "_n": 42
76
+ },
77
+ "Lecithinase": {
78
+ "Positive": 6,
79
+ "Negative": 0,
80
+ "Variable": 0,
81
+ "_n": 6
82
+ }
83
+ },
84
+ "Shigella": {
85
+ "Indole": {
86
+ "Positive": 6,
87
+ "Negative": 6,
88
+ "Variable": 0,
89
+ "_n": 12
90
+ }
91
+ },
92
+ "Escherichia": {
93
+ "Indole": {
94
+ "Positive": 12,
95
+ "Negative": 0,
96
+ "Variable": 0,
97
+ "_n": 12
98
+ }
99
+ },
100
+ "Klebsiella": {
101
+ "Indole": {
102
+ "Positive": 6,
103
+ "Negative": 12,
104
+ "Variable": 0,
105
+ "_n": 18
106
+ }
107
+ },
108
+ "Proteus": {
109
+ "Indole": {
110
+ "Positive": 6,
111
+ "Negative": 6,
112
+ "Variable": 0,
113
+ "_n": 12
114
+ }
115
+ },
116
+ "Clostridium": {
117
+ "Indole": {
118
+ "Positive": 18,
119
+ "Negative": 36,
120
+ "Variable": 0,
121
+ "_n": 54
122
+ },
123
+ "Lipase": {
124
+ "Positive": 6,
125
+ "Negative": 0,
126
+ "Variable": 0,
127
+ "_n": 6
128
+ },
129
+ "Lecithinase": {
130
+ "Positive": 6,
131
+ "Negative": 0,
132
+ "Variable": 0,
133
+ "_n": 6
134
+ },
135
+ "Odour": {
136
+ "Positive": 0,
137
+ "Negative": 0,
138
+ "Variable": 0,
139
+ "_n": 0
140
+ },
141
+ "Fructose Fermentation": {
142
+ "Positive": 6,
143
+ "Negative": 0,
144
+ "Variable": 0,
145
+ "_n": 6
146
+ }
147
+ },
148
+ "Bacteroides": {
149
+ "Indole": {
150
+ "Positive": 6,
151
+ "Negative": 0,
152
+ "Variable": 6,
153
+ "_n": 12
154
+ },
155
+ "Bile Resistance": {
156
+ "Positive": 6,
157
+ "Negative": 0,
158
+ "Variable": 0,
159
+ "_n": 6
160
+ }
161
+ },
162
+ "Streptococcus": {
163
+ "CAMP": {
164
+ "Positive": 20,
165
+ "Negative": 4,
166
+ "Variable": 0,
167
+ "_n": 24
168
+ },
169
+ "Hippurate Hydrolysis": {
170
+ "Positive": 12,
171
+ "Negative": 0,
172
+ "Variable": 0,
173
+ "_n": 12
174
+ },
175
+ "PYR": {
176
+ "Positive": 2,
177
+ "Negative": 16,
178
+ "Variable": 0,
179
+ "_n": 18
180
+ },
181
+ "Optochin": {
182
+ "Positive": 8,
183
+ "Negative": 4,
184
+ "Variable": 0,
185
+ "_n": 12
186
+ },
187
+ "Bile Solubility": {
188
+ "Positive": 14,
189
+ "Negative": 0,
190
+ "Variable": 0,
191
+ "_n": 14
192
+ },
193
+ "Inulin Fermentation": {
194
+ "Positive": 6,
195
+ "Negative": 0,
196
+ "Variable": 0,
197
+ "_n": 6
198
+ },
199
+ "Metabolic Product": {
200
+ "Positive": 0,
201
+ "Negative": 0,
202
+ "Variable": 0,
203
+ "_n": 0
204
+ },
205
+ "Bacitracin": {
206
+ "Positive": 2,
207
+ "Negative": 2,
208
+ "Variable": 0,
209
+ "_n": 4
210
+ }
211
+ },
212
+ "Aeromonas": {
213
+ "Indole": {
214
+ "Positive": 48,
215
+ "Negative": 0,
216
+ "Variable": 0,
217
+ "_n": 48
218
+ },
219
+ "Gas Production": {
220
+ "Positive": 6,
221
+ "Negative": 0,
222
+ "Variable": 0,
223
+ "_n": 6
224
+ }
225
+ },
226
+ "Yersinia": {
227
+ "Indole": {
228
+ "Positive": 6,
229
+ "Negative": 0,
230
+ "Variable": 18,
231
+ "_n": 24
232
+ }
233
+ },
234
+ "Morganella": {
235
+ "Indole": {
236
+ "Positive": 30,
237
+ "Negative": 0,
238
+ "Variable": 0,
239
+ "_n": 30
240
+ }
241
+ },
242
+ "Providencia": {
243
+ "Indole": {
244
+ "Positive": 30,
245
+ "Negative": 6,
246
+ "Variable": 0,
247
+ "_n": 36
248
+ }
249
+ },
250
+ "Pasteurella": {
251
+ "Indole": {
252
+ "Positive": 30,
253
+ "Negative": 0,
254
+ "Variable": 0,
255
+ "_n": 30
256
+ }
257
+ },
258
+ "Citrobacter": {
259
+ "Indole": {
260
+ "Positive": 6,
261
+ "Negative": 0,
262
+ "Variable": 12,
263
+ "_n": 18
264
+ }
265
+ },
266
+ "Campylobacter": {
267
+ "Indole": {
268
+ "Positive": 0,
269
+ "Negative": 18,
270
+ "Variable": 0,
271
+ "_n": 18
272
+ }
273
+ },
274
+ "Vibrio": {
275
+ "Indole": {
276
+ "Positive": 42,
277
+ "Negative": 0,
278
+ "Variable": 0,
279
+ "_n": 42
280
+ }
281
+ },
282
+ "Burkholderia": {
283
+ "Indole": {
284
+ "Positive": 0,
285
+ "Negative": 30,
286
+ "Variable": 0,
287
+ "_n": 30
288
+ },
289
+ "Odour": {
290
+ "Positive": 0,
291
+ "Negative": 0,
292
+ "Variable": 0,
293
+ "_n": 0
294
+ },
295
+ "Glucose Oxidation": {
296
+ "Positive": 6,
297
+ "Negative": 0,
298
+ "Variable": 0,
299
+ "_n": 6
300
+ }
301
+ },
302
+ "Legionella": {
303
+ "Indole": {
304
+ "Positive": 0,
305
+ "Negative": 6,
306
+ "Variable": 0,
307
+ "_n": 6
308
+ }
309
+ },
310
+ "Helicobacter": {
311
+ "Indole": {
312
+ "Positive": 0,
313
+ "Negative": 6,
314
+ "Variable": 0,
315
+ "_n": 6
316
+ }
317
+ },
318
+ "Leptospira": {
319
+ "Indole": {
320
+ "Positive": 0,
321
+ "Negative": 6,
322
+ "Variable": 0,
323
+ "_n": 6
324
+ }
325
+ },
326
+ "Serratia": {
327
+ "Indole": {
328
+ "Positive": 0,
329
+ "Negative": 30,
330
+ "Variable": 0,
331
+ "_n": 30
332
+ },
333
+ "Temperature Dependence": {
334
+ "Positive": 0,
335
+ "Negative": 0,
336
+ "Variable": 0,
337
+ "_n": 0
338
+ }
339
+ },
340
+ "Alcaligenes": {
341
+ "Odour": {
342
+ "Positive": 0,
343
+ "Negative": 0,
344
+ "Variable": 0,
345
+ "_n": 0
346
+ },
347
+ "Indole": {
348
+ "Positive": 0,
349
+ "Negative": 12,
350
+ "Variable": 0,
351
+ "_n": 12
352
+ }
353
+ },
354
+ "Shewanella": {
355
+ "Indole": {
356
+ "Positive": 0,
357
+ "Negative": 24,
358
+ "Variable": 0,
359
+ "_n": 24
360
+ }
361
+ },
362
+ "Acinetobacter": {
363
+ "Indole": {
364
+ "Positive": 0,
365
+ "Negative": 36,
366
+ "Variable": 0,
367
+ "_n": 36
368
+ }
369
+ },
370
+ "Haemophilus": {
371
+ "Growth Factors": {
372
+ "Positive": 0,
373
+ "Negative": 0,
374
+ "Variable": 0,
375
+ "_n": 0
376
+ },
377
+ "Indole": {
378
+ "Positive": 6,
379
+ "Negative": 0,
380
+ "Variable": 12,
381
+ "_n": 18
382
+ }
383
+ },
384
+ "Micrococcus": {
385
+ "Glucose Oxidation": {
386
+ "Positive": 12,
387
+ "Negative": 0,
388
+ "Variable": 0,
389
+ "_n": 12
390
+ },
391
+ "Indole": {
392
+ "Positive": 0,
393
+ "Negative": 18,
394
+ "Variable": 0,
395
+ "_n": 18
396
+ }
397
+ },
398
+ "Edwardsiella": {
399
+ "Indole": {
400
+ "Positive": 18,
401
+ "Negative": 6,
402
+ "Variable": 6,
403
+ "_n": 30
404
+ }
405
+ },
406
+ "Chromobacterium": {
407
+ "Indole": {
408
+ "Positive": 0,
409
+ "Negative": 12,
410
+ "Variable": 12,
411
+ "_n": 24
412
+ }
413
+ },
414
+ "Lactobacillus": {
415
+ "Indole": {
416
+ "Positive": 0,
417
+ "Negative": 18,
418
+ "Variable": 0,
419
+ "_n": 18
420
+ },
421
+ "pH Range": {
422
+ "Positive": 0,
423
+ "Negative": 0,
424
+ "Variable": 0,
425
+ "_n": 0
426
+ },
427
+ "Fermentation Product": {
428
+ "Positive": 0,
429
+ "Negative": 0,
430
+ "Variable": 0,
431
+ "_n": 0
432
+ }
433
+ },
434
+ "Corynebacterium": {
435
+ "Indole": {
436
+ "Positive": 0,
437
+ "Negative": 18,
438
+ "Variable": 0,
439
+ "_n": 18
440
+ }
441
+ },
442
+ "Nocardia": {
443
+ "Indole": {
444
+ "Positive": 0,
445
+ "Negative": 18,
446
+ "Variable": 0,
447
+ "_n": 18
448
+ },
449
+ "Acid Fast": {
450
+ "Positive": 0,
451
+ "Negative": 0,
452
+ "Variable": 0,
453
+ "_n": 0
454
+ },
455
+ "Casein Hydrolysis": {
456
+ "Positive": 6,
457
+ "Negative": 0,
458
+ "Variable": 0,
459
+ "_n": 6
460
+ },
461
+ "Tyrosine Hydrolysis": {
462
+ "Positive": 6,
463
+ "Negative": 0,
464
+ "Variable": 0,
465
+ "_n": 6
466
+ }
467
+ },
468
+ "Propionibacterium": {
469
+ "Indole": {
470
+ "Positive": 18,
471
+ "Negative": 0,
472
+ "Variable": 0,
473
+ "_n": 18
474
+ },
475
+ "Glycerol Fermentation": {
476
+ "Positive": 6,
477
+ "Negative": 0,
478
+ "Variable": 0,
479
+ "_n": 6
480
+ },
481
+ "Mannose Fermentation": {
482
+ "Positive": 6,
483
+ "Negative": 0,
484
+ "Variable": 0,
485
+ "_n": 6
486
+ },
487
+ "Other Products": {
488
+ "Positive": 0,
489
+ "Negative": 0,
490
+ "Variable": 0,
491
+ "_n": 0
492
+ }
493
+ },
494
+ "Peptostreptococcus": {
495
+ "Indole": {
496
+ "Positive": 0,
497
+ "Negative": 12,
498
+ "Variable": 0,
499
+ "_n": 12
500
+ }
501
+ },
502
+ "Veillonella": {
503
+ "Indole": {
504
+ "Positive": 0,
505
+ "Negative": 6,
506
+ "Variable": 0,
507
+ "_n": 6
508
+ }
509
+ },
510
+ "Fusobacterium": {
511
+ "Odour": {
512
+ "Positive": 0,
513
+ "Negative": 0,
514
+ "Variable": 0,
515
+ "_n": 0
516
+ },
517
+ "Indole": {
518
+ "Positive": 12,
519
+ "Negative": 0,
520
+ "Variable": 0,
521
+ "_n": 12
522
+ }
523
+ },
524
+ "Eubacterium": {
525
+ "Fermentation Products": {
526
+ "Positive": 0,
527
+ "Negative": 0,
528
+ "Variable": 0,
529
+ "_n": 0
530
+ },
531
+ "Cellobiose Fermentation": {
532
+ "Positive": 6,
533
+ "Negative": 0,
534
+ "Variable": 0,
535
+ "_n": 6
536
+ },
537
+ "Indole": {
538
+ "Positive": 0,
539
+ "Negative": 6,
540
+ "Variable": 0,
541
+ "_n": 6
542
+ }
543
+ },
544
+ "Halomonas": {
545
+ "Indole": {
546
+ "Positive": 0,
547
+ "Negative": 18,
548
+ "Variable": 0,
549
+ "_n": 18
550
+ },
551
+ "NaCl Tolerant (>=10%)": {
552
+ "Positive": 6,
553
+ "Negative": 0,
554
+ "Variable": 0,
555
+ "_n": 6
556
+ }
557
+ },
558
+ "Psychrobacter": {
559
+ "Indole": {
560
+ "Positive": 0,
561
+ "Negative": 12,
562
+ "Variable": 0,
563
+ "_n": 12
564
+ }
565
+ },
566
+ "Deinococcus": {
567
+ "Indole": {
568
+ "Positive": 0,
569
+ "Negative": 6,
570
+ "Variable": 0,
571
+ "_n": 6
572
+ }
573
+ },
574
+ "Thermus": {
575
+ "Indole": {
576
+ "Positive": 0,
577
+ "Negative": 12,
578
+ "Variable": 0,
579
+ "_n": 12
580
+ }
581
+ },
582
+ "Acidithiobacillus": {
583
+ "pH Range": {
584
+ "Positive": 0,
585
+ "Negative": 0,
586
+ "Variable": 0,
587
+ "_n": 0
588
+ },
589
+ "Indole": {
590
+ "Positive": 0,
591
+ "Negative": 6,
592
+ "Variable": 0,
593
+ "_n": 6
594
+ },
595
+ "Iron Oxidation": {
596
+ "Positive": 6,
597
+ "Negative": 0,
598
+ "Variable": 0,
599
+ "_n": 6
600
+ }
601
+ },
602
+ "Mycoplasma": {
603
+ "Arginine": {
604
+ "Positive": 6,
605
+ "Negative": 0,
606
+ "Variable": 0,
607
+ "_n": 6
608
+ },
609
+ "Arginine Hydrolysis": {
610
+ "Positive": 6,
611
+ "Negative": 0,
612
+ "Variable": 0,
613
+ "_n": 6
614
+ }
615
+ },
616
+ "Bordetella": {
617
+ "Growth Factors": {
618
+ "Positive": 0,
619
+ "Negative": 0,
620
+ "Variable": 0,
621
+ "_n": 0
622
+ },
623
+ "Indole": {
624
+ "Positive": 0,
625
+ "Negative": 6,
626
+ "Variable": 0,
627
+ "_n": 6
628
+ }
629
+ },
630
+ "Stenotrophomonas": {
631
+ "Indole": {
632
+ "Positive": 0,
633
+ "Negative": 24,
634
+ "Variable": 0,
635
+ "_n": 24
636
+ }
637
+ },
638
+ "Ralstonia": {
639
+ "Indole": {
640
+ "Positive": 0,
641
+ "Negative": 12,
642
+ "Variable": 0,
643
+ "_n": 12
644
+ }
645
+ },
646
+ "Achromobacter": {
647
+ "Indole": {
648
+ "Positive": 0,
649
+ "Negative": 6,
650
+ "Variable": 0,
651
+ "_n": 6
652
+ }
653
+ },
654
+ "Brucella": {
655
+ "Indole": {
656
+ "Positive": 0,
657
+ "Negative": 12,
658
+ "Variable": 0,
659
+ "_n": 12
660
+ }
661
+ },
662
+ "Brevundimonas": {
663
+ "Indole": {
664
+ "Positive": 0,
665
+ "Negative": 12,
666
+ "Variable": 0,
667
+ "_n": 12
668
+ }
669
+ },
670
+ "Arthrobacter": {
671
+ "Indole": {
672
+ "Positive": 0,
673
+ "Negative": 6,
674
+ "Variable": 0,
675
+ "_n": 6
676
+ },
677
+ "Glucose Oxidation": {
678
+ "Positive": 6,
679
+ "Negative": 0,
680
+ "Variable": 0,
681
+ "_n": 6
682
+ }
683
+ },
684
+ "Cytophaga": {
685
+ "Indole": {
686
+ "Positive": 0,
687
+ "Negative": 6,
688
+ "Variable": 0,
689
+ "_n": 6
690
+ }
691
+ },
692
+ "Flavobacterium": {
693
+ "Indole": {
694
+ "Positive": 0,
695
+ "Negative": 12,
696
+ "Variable": 0,
697
+ "_n": 12
698
+ }
699
+ },
700
+ "Oerskovia": {
701
+ "Indole": {
702
+ "Positive": 0,
703
+ "Negative": 6,
704
+ "Variable": 0,
705
+ "_n": 6
706
+ }
707
+ },
708
+ "Sphingomonas": {
709
+ "Indole": {
710
+ "Positive": 0,
711
+ "Negative": 12,
712
+ "Variable": 0,
713
+ "_n": 12
714
+ },
715
+ "Glucose Oxidation": {
716
+ "Positive": 6,
717
+ "Negative": 0,
718
+ "Variable": 0,
719
+ "_n": 6
720
+ }
721
+ },
722
+ "Comamonas": {
723
+ "Indole": {
724
+ "Positive": 0,
725
+ "Negative": 12,
726
+ "Variable": 0,
727
+ "_n": 12
728
+ }
729
+ },
730
+ "Halobacterium": {
731
+ "NaCl Tolerant (>=15%)": {
732
+ "Positive": 6,
733
+ "Negative": 0,
734
+ "Variable": 0,
735
+ "_n": 6
736
+ },
737
+ "Indole": {
738
+ "Positive": 0,
739
+ "Negative": 6,
740
+ "Variable": 0,
741
+ "_n": 6
742
+ }
743
+ },
744
+ "Thermococcus": {
745
+ "Sulfur Utilization": {
746
+ "Positive": 6,
747
+ "Negative": 0,
748
+ "Variable": 0,
749
+ "_n": 6
750
+ }
751
+ },
752
+ "Actinomyces": {
753
+ "Indole": {
754
+ "Positive": 0,
755
+ "Negative": 12,
756
+ "Variable": 0,
757
+ "_n": 12
758
+ }
759
+ },
760
+ "Elizabethkingia": {
761
+ "Indole": {
762
+ "Positive": 6,
763
+ "Negative": 0,
764
+ "Variable": 0,
765
+ "_n": 6
766
+ }
767
+ },
768
+ "Hafnia": {
769
+ "Indole": {
770
+ "Positive": 0,
771
+ "Negative": 6,
772
+ "Variable": 0,
773
+ "_n": 6
774
+ }
775
+ },
776
+ "Photobacterium": {
777
+ "Indole": {
778
+ "Positive": 12,
779
+ "Negative": 0,
780
+ "Variable": 0,
781
+ "_n": 12
782
+ }
783
+ },
784
+ "Pantoea": {
785
+ "Indole": {
786
+ "Positive": 0,
787
+ "Negative": 6,
788
+ "Variable": 0,
789
+ "_n": 6
790
+ }
791
+ },
792
+ "Raoultella": {
793
+ "Indole": {
794
+ "Positive": 0,
795
+ "Negative": 0,
796
+ "Variable": 6,
797
+ "_n": 6
798
+ }
799
+ },
800
+ "Ochrobactrum": {
801
+ "Indole": {
802
+ "Positive": 0,
803
+ "Negative": 6,
804
+ "Variable": 0,
805
+ "_n": 6
806
+ }
807
+ },
808
+ "Roseomonas": {
809
+ "Indole": {
810
+ "Positive": 0,
811
+ "Negative": 6,
812
+ "Variable": 0,
813
+ "_n": 6
814
+ }
815
+ },
816
+ "Actinobacillus": {
817
+ "Indole": {
818
+ "Positive": 0,
819
+ "Negative": 6,
820
+ "Variable": 0,
821
+ "_n": 6
822
+ }
823
+ },
824
+ "Gemella": {
825
+ "Indole": {
826
+ "Positive": 0,
827
+ "Negative": 12,
828
+ "Variable": 0,
829
+ "_n": 12
830
+ }
831
+ },
832
+ "Rothia": {
833
+ "Indole": {
834
+ "Positive": 0,
835
+ "Negative": 12,
836
+ "Variable": 0,
837
+ "_n": 12
838
+ }
839
+ },
840
+ "Listeria": {
841
+ "Indole": {
842
+ "Positive": 0,
843
+ "Negative": 6,
844
+ "Variable": 0,
845
+ "_n": 6
846
+ },
847
+ "CAMP": {
848
+ "Positive": 2,
849
+ "Negative": 0,
850
+ "Variable": 0,
851
+ "_n": 2
852
+ }
853
+ },
854
+ "Carnobacterium": {
855
+ "Indole": {
856
+ "Positive": 0,
857
+ "Negative": 6,
858
+ "Variable": 0,
859
+ "_n": 6
860
+ }
861
+ },
862
+ "Plesiomonas": {
863
+ "Indole": {
864
+ "Positive": 6,
865
+ "Negative": 0,
866
+ "Variable": 0,
867
+ "_n": 6
868
+ }
869
+ },
870
+ "Janthinobacterium": {
871
+ "Indole": {
872
+ "Positive": 0,
873
+ "Negative": 6,
874
+ "Variable": 0,
875
+ "_n": 6
876
+ }
877
+ },
878
+ "Paenibacillus": {
879
+ "Indole": {
880
+ "Positive": 0,
881
+ "Negative": 6,
882
+ "Variable": 0,
883
+ "_n": 6
884
+ }
885
+ },
886
+ "Moraxella": {
887
+ "Indole": {
888
+ "Positive": 0,
889
+ "Negative": 6,
890
+ "Variable": 0,
891
+ "_n": 6
892
+ }
893
+ },
894
+ "Aerococcus": {
895
+ "Indole": {
896
+ "Positive": 0,
897
+ "Negative": 6,
898
+ "Variable": 0,
899
+ "_n": 6
900
+ }
901
+ },
902
+ "Kocuria": {
903
+ "Indole": {
904
+ "Positive": 0,
905
+ "Negative": 6,
906
+ "Variable": 0,
907
+ "_n": 6
908
+ }
909
+ },
910
+ "Leuconostoc": {
911
+ "Indole": {
912
+ "Positive": 0,
913
+ "Negative": 6,
914
+ "Variable": 0,
915
+ "_n": 6
916
+ },
917
+ "Gas Production": {
918
+ "Positive": 6,
919
+ "Negative": 0,
920
+ "Variable": 0,
921
+ "_n": 6
922
+ },
923
+ "Fructose Fermentation": {
924
+ "Positive": 6,
925
+ "Negative": 0,
926
+ "Variable": 0,
927
+ "_n": 6
928
+ }
929
+ },
930
+ "Rhodococcus": {
931
+ "Indole": {
932
+ "Positive": 0,
933
+ "Negative": 6,
934
+ "Variable": 0,
935
+ "_n": 6
936
+ }
937
+ },
938
+ "Francisella": {
939
+ "Indole": {
940
+ "Positive": 0,
941
+ "Negative": 12,
942
+ "Variable": 0,
943
+ "_n": 12
944
+ }
945
+ },
946
+ "Erysipelothrix": {
947
+ "Indole": {
948
+ "Positive": 0,
949
+ "Negative": 6,
950
+ "Variable": 0,
951
+ "_n": 6
952
+ },
953
+ "Fructose Fermentation": {
954
+ "Positive": 6,
955
+ "Negative": 0,
956
+ "Variable": 0,
957
+ "_n": 6
958
+ }
959
+ },
960
+ "Arcanobacterium": {
961
+ "Indole": {
962
+ "Positive": 0,
963
+ "Negative": 6,
964
+ "Variable": 0,
965
+ "_n": 6
966
+ }
967
+ },
968
+ "Porphyromonas": {
969
+ "Indole": {
970
+ "Positive": 6,
971
+ "Negative": 0,
972
+ "Variable": 0,
973
+ "_n": 6
974
+ }
975
+ },
976
+ "Prevotella": {
977
+ "Indole": {
978
+ "Positive": 6,
979
+ "Negative": 0,
980
+ "Variable": 0,
981
+ "_n": 6
982
+ }
983
+ },
984
+ "Microbacterium": {
985
+ "Indole": {
986
+ "Positive": 0,
987
+ "Negative": 6,
988
+ "Variable": 0,
989
+ "_n": 6
990
+ }
991
+ },
992
+ "Enterococcus": {
993
+ "PYR": {
994
+ "Positive": 2,
995
+ "Negative": 0,
996
+ "Variable": 0,
997
+ "_n": 2
998
+ },
999
+ "Optochin": {
1000
+ "Positive": 0,
1001
+ "Negative": 2,
1002
+ "Variable": 0,
1003
+ "_n": 2
1004
+ },
1005
+ "Novobiocin": {
1006
+ "Positive": 0,
1007
+ "Negative": 2,
1008
+ "Variable": 0,
1009
+ "_n": 2
1010
+ }
1011
+ }
1012
+ }
engine/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ # engine/__init__.py
2
+ # Makes 'engine' a package and re-exports the identifier for convenience if you want.
3
+ from .bacteria_identifier import BacteriaIdentifier
4
+
engine/bacteria_identifier.py ADDED
@@ -0,0 +1,383 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/bacteria_identifier.py
2
+ # ------------------------------------------------------------
3
+ # Core identification engine + blended scoring with extended signals.
4
+
5
+ import os
6
+ import json
7
+ import re
8
+ import random
9
+ from typing import Dict, List, Optional, Tuple
10
+
11
+ import pandas as pd
12
+
13
+ from engine.extended_reasoner import score_genera_from_extended
14
+
15
+ DATA_DIR = "data"
16
+ EXT_SCHEMA_PATH = os.path.join(DATA_DIR, "extended_schema.json")
17
+
18
+
19
+ # -----------------------------
20
+ # Helper Function
21
+ # -----------------------------
22
+ def join_with_and(items):
23
+ """Join list into a readable string, using commas and 'and' before last item."""
24
+ if not items:
25
+ return ""
26
+ if len(items) == 1:
27
+ return items[0]
28
+ return ", ".join(items[:-1]) + " and " + items[-1]
29
+
30
+
31
+ # -----------------------------
32
+ # Identification Result Class
33
+ # -----------------------------
34
+ class IdentificationResult:
35
+ """
36
+ Stores data about a single bacterial genus result and generates reasoning text.
37
+ Now includes optional extended-likelihood and blended confidence.
38
+ """
39
+ def __init__(
40
+ self,
41
+ genus: str,
42
+ total_score: int,
43
+ matched_fields: List[str],
44
+ mismatched_fields: List[str],
45
+ reasoning_factors: Dict[str, str],
46
+ total_fields_evaluated: int,
47
+ total_fields_possible: int,
48
+ extra_notes: str = "",
49
+ extended_likelihood: Optional[float] = None,
50
+ extended_explanation: str = "",
51
+ ):
52
+ self.genus = genus
53
+ self.total_score = total_score
54
+ self.matched_fields = matched_fields
55
+ self.mismatched_fields = mismatched_fields
56
+ self.reasoning_factors = reasoning_factors
57
+ self.total_fields_evaluated = total_fields_evaluated
58
+ self.total_fields_possible = total_fields_possible
59
+ self.extra_notes = extra_notes
60
+
61
+ # Extended reasoning
62
+ self.extended_likelihood = extended_likelihood # 0–1, or None if no extended data
63
+ self.extended_explanation = extended_explanation
64
+
65
+ # -----------------------------
66
+ # Confidence Calculations
67
+ # -----------------------------
68
+ def confidence_percent(self) -> int:
69
+ """Confidence based only on tests the user entered."""
70
+ if self.total_fields_evaluated == 0:
71
+ return 0
72
+ return max(
73
+ 0,
74
+ min(100, int((self.total_score / self.total_fields_evaluated) * 100)),
75
+ )
76
+
77
+ def true_confidence(self) -> int:
78
+ """Confidence based on *all* possible tests (complete database fields)."""
79
+ if self.total_fields_possible == 0:
80
+ return 0
81
+ return max(
82
+ 0,
83
+ min(100, int((self.total_score / self.total_fields_possible) * 100)),
84
+ )
85
+
86
+ def blended_confidence_raw(self, weight_core: float = 0.7, weight_ext: float = 0.3) -> float:
87
+ """
88
+ Blended confidence:
89
+ core = core-confidence (0–1)
90
+ ext = extended likelihood (0–1, if available)
91
+ If no extended likelihood, return core.
92
+ """
93
+ core = self.confidence_percent() / 100.0
94
+ if self.extended_likelihood is None:
95
+ return core
96
+ return weight_core * core + weight_ext * self.extended_likelihood
97
+
98
+ def blended_confidence_percent(self, weight_core: float = 0.7, weight_ext: float = 0.3) -> int:
99
+ return int(round(self.blended_confidence_raw(weight_core, weight_ext) * 100))
100
+
101
+ # -----------------------------
102
+ # Reasoning Paragraph Generator
103
+ # -----------------------------
104
+ def reasoning_paragraph(self, ranked_results=None) -> str:
105
+ """Generate detailed reasoning paragraph with comparison to other genera."""
106
+ if not self.matched_fields:
107
+ return "No significant biochemical or morphological matches were found."
108
+
109
+ intro = random.choice(
110
+ [
111
+ "Based on the observed biochemical and morphological traits,",
112
+ "According to the provided test results,",
113
+ "From the available laboratory findings,",
114
+ "Considering the entered reactions and colony traits,",
115
+ ]
116
+ )
117
+
118
+ # Key descriptive highlights
119
+ highlights = []
120
+ if "Gram Stain" in self.matched_fields:
121
+ highlights.append(
122
+ f"it is **Gram {self.reasoning_factors.get('Gram Stain', '').lower()}**"
123
+ )
124
+ if "Shape" in self.matched_fields:
125
+ highlights.append(
126
+ f"with a **{self.reasoning_factors.get('Shape', '').lower()}** morphology"
127
+ )
128
+ if "Catalase" in self.matched_fields:
129
+ highlights.append(
130
+ f"and **catalase {self.reasoning_factors.get('Catalase', '').lower()}** activity"
131
+ )
132
+ if "Oxidase" in self.matched_fields:
133
+ highlights.append(
134
+ f"and **oxidase {self.reasoning_factors.get('Oxidase', '').lower()}** reaction"
135
+ )
136
+ if "Oxygen Requirement" in self.matched_fields:
137
+ highlights.append(
138
+ f"which prefers **{self.reasoning_factors.get('Oxygen Requirement', '').lower()}** conditions"
139
+ )
140
+
141
+ # Join highlights grammatically
142
+ summary = (
143
+ ", ".join(highlights[:-1]) + " and " + highlights[-1]
144
+ if len(highlights) > 1
145
+ else "".join(highlights)
146
+ )
147
+
148
+ # Confidence text (core)
149
+ core_conf = self.confidence_percent()
150
+ confidence_text = (
151
+ "The confidence in this identification based on the entered tests is high."
152
+ if core_conf >= 70
153
+ else "The confidence in this identification based on the entered tests is moderate."
154
+ )
155
+
156
+ # Comparative reasoning vs other close results
157
+ comparison = ""
158
+ if ranked_results and len(ranked_results) > 1:
159
+ close_others = ranked_results[1:3]
160
+ other_names = [r.genus for r in close_others]
161
+ if other_names:
162
+ if self.total_score >= close_others[0].total_score:
163
+ comparison = (
164
+ f" It is **more likely** than {join_with_and(other_names)} "
165
+ f"based on stronger alignment in {join_with_and(self.matched_fields[:3])}."
166
+ )
167
+ else:
168
+ comparison = (
169
+ f" It is **less likely** than {join_with_and(other_names)} "
170
+ f"due to differences in {join_with_and(self.mismatched_fields[:3])}."
171
+ )
172
+
173
+ return f"{intro} {summary}, the isolate most closely resembles **{self.genus}**. {confidence_text}{comparison}"
174
+
175
+
176
+ # -----------------------------
177
+ # Bacteria Identifier Engine
178
+ # -----------------------------
179
class BacteriaIdentifier:
    """
    Main engine to match bacterial genus based on biochemical & morphological data.

    Includes:
    - Core rule-based matching vs bacteria_db.xlsx
    - Optional blending with extended signals (signals_catalog.json)
    """

    def __init__(self, db: pd.DataFrame):
        # Blank DB cells are treated as "no data" rather than NaN.
        self.db = db.fillna("")
        self.extended_fields = self._load_extended_fields()

    def _load_extended_fields(self) -> List[str]:
        """Return the extended test names from extended_schema.json ([] on any failure)."""
        if not os.path.exists(EXT_SCHEMA_PATH):
            return []
        try:
            with open(EXT_SCHEMA_PATH, "r", encoding="utf-8") as f:
                schema = json.load(f)
            return list(schema.keys())
        except Exception:
            # Malformed/unreadable schema: degrade gracefully to core-only mode.
            return []

    # -----------------------------
    # Field Comparison Logic
    # -----------------------------
    def compare_field(self, db_val, user_val, field_name: str) -> int:
        """
        Compare one test field between database and user input.

        Returns:
            1    match
            -1   mismatch
            0    no information (empty / unknown / variable)
            -999 hard exclusion (mismatch on a defining trait)
        """
        if not user_val or str(user_val).strip() == "" or str(user_val).lower() == "unknown":
            return 0  # Skip empty or unknown

        db_val = str(db_val).strip().lower()
        user_val = str(user_val).strip().lower()
        # A mismatch on these defining traits rules the genus out entirely.
        hard_exclusions = ("Gram Stain", "Shape", "Spore Formation")

        # Split entries by separators for multi-value matches (e.g. "a;b" or "a/b").
        db_options = [x.strip() for x in re.split(r"[;/]", db_val) if x.strip()]
        user_options = [x.strip() for x in re.split(r"[;/]", user_val) if x.strip()]

        # "Variable" carries no discriminating information on either side.
        if "variable" in db_options or "variable" in user_options:
            return 0

        # Special handling for Growth Temperature stored as "low//high".
        if field_name == "Growth Temperature":
            try:
                if "//" in db_val:
                    low, high = [float(x) for x in db_val.split("//")]
                    temp = float(user_val)
                    return 1 if low <= temp <= high else -1
            except Exception:
                return 0

        # Flexible match: partial substring overlap counts as a match.
        match_found = any(
            any(u in db_opt or db_opt in u for db_opt in db_options) for u in user_options
        )

        if match_found:
            return 1
        if field_name in hard_exclusions:
            return -999  # Hard exclusion
        return -1

    # -----------------------------
    # Suggest Next Tests
    # -----------------------------
    def suggest_next_tests(self, top_results: "List[IdentificationResult]") -> "List[str]":
        """
        Suggest up to 3 tests that best differentiate the top matches.

        A test is discriminating when the database records different values
        for it across the top candidate genera.  (The previous implementation
        compared matched/mismatched *field names*, a set that did not depend
        on the column being inspected, so the suggestions were meaningless.)
        """
        if len(top_results) < 2:
            return []

        top_genera = [r.genus for r in top_results[:3]]
        subset = self.db[self.db["Genus"].isin(top_genera)]

        varying_fields: List[str] = []
        for field in self.db.columns:
            if field in ("Genus", "Extra Notes", "Colony Morphology"):
                continue
            values = {str(v).strip().lower() for v in subset[field]}
            values.discard("")  # ignore missing data
            if len(values) > 1:
                varying_fields.append(field)

        random.shuffle(varying_fields)
        return varying_fields[:3]

    # -----------------------------
    # Extended Input Extraction
    # -----------------------------
    def _extract_extended_input(self, user_input: Dict[str, str]) -> Dict[str, str]:
        """
        Extract extended tests (those in extended_schema.json but not part of the core db).
        Only keep Positive/Negative/Variable (ignore Unknown/empty).
        """
        ext_in: Dict[str, str] = {}
        for field in self.extended_fields:
            val = user_input.get(field, "Unknown")
            if isinstance(val, str) and val.lower() in ("positive", "negative", "variable"):
                ext_in[field] = val.capitalize()
        return ext_in

    # -----------------------------
    # Main Identification Routine
    # -----------------------------
    def identify(self, user_input: Dict[str, str]) -> "List[IdentificationResult]":
        """Compare user input to database and rank possible genera with blended scoring."""
        results: List[IdentificationResult] = []
        total_fields_possible = len([c for c in self.db.columns if c != "Genus"])

        # 1) Core scoring loop against bacteria_db.xlsx
        for _, row in self.db.iterrows():
            genus = row["Genus"]
            total_score = 0
            matched_fields: List[str] = []
            mismatched_fields: List[str] = []
            reasoning_factors: Dict[str, str] = {}
            total_fields_evaluated = 0

            for field in self.db.columns:
                if field == "Genus":
                    continue

                db_val = row[field]
                user_val = user_input.get(field, "")
                score = self.compare_field(db_val, user_val, field)

                # Count only real inputs for relative confidence
                if user_val and str(user_val).lower() != "unknown":
                    total_fields_evaluated += 1

                if score == -999:
                    total_score = -999
                    break  # Hard exclusion ends comparison
                elif score == 1:
                    total_score += 1
                    matched_fields.append(field)
                    reasoning_factors[field] = user_val
                elif score == -1:
                    total_score -= 1
                    mismatched_fields.append(field)

            # Append valid genus result (hard-excluded genera are dropped)
            if total_score > -999:
                extra_notes = row.get("Extra Notes", "")
                results.append(
                    IdentificationResult(
                        genus=genus,
                        total_score=total_score,
                        matched_fields=matched_fields,
                        mismatched_fields=mismatched_fields,
                        reasoning_factors=reasoning_factors,
                        total_fields_evaluated=total_fields_evaluated,
                        total_fields_possible=total_fields_possible,
                        extra_notes=extra_notes,
                    )
                )

        if not results:
            return []

        # 2) Suggest next tests for top core results
        # NOTE(review): suggestions are computed before sorting, so "top"
        # here is DB order, not score order — preserved from the original;
        # confirm whether post-sort suggestion is intended.
        top_suggestions = self.suggest_next_tests(results)
        for r in results[:3]:
            r.reasoning_factors["next_tests"] = ", ".join(top_suggestions)

        # 3) Extended likelihoods (if user provided extended tests)
        ext_input = self._extract_extended_input(user_input)
        ext_scores: Dict[str, float] = {}
        ext_explanation = ""

        if ext_input:
            ranked, ext_explanation = score_genera_from_extended(ext_input)
            ext_scores = dict(ranked)

        # Attach extended scores/explanations to each result
        if ext_scores:
            for r in results:
                # Genus absent from the signals catalog → neutral (no info).
                r.extended_likelihood = ext_scores.get(r.genus)
                r.extended_explanation = ext_explanation
        else:
            for r in results:
                r.extended_likelihood = None
                r.extended_explanation = ""

        # 4) Sort: blended confidence when extended data is present,
        #    otherwise fall back to the core total_score.
        if any(r.extended_likelihood is not None for r in results):
            results.sort(key=lambda x: x.blended_confidence_raw(), reverse=True)
        else:
            results.sort(key=lambda x: x.total_score, reverse=True)

        # Return top 10
        return results[:10]
engine/extended_reasoner.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/extended_reasoner.py
2
+ # ------------------------------------------------------------
3
+ # Compute per-genus likelihoods from extended tests using signals_catalog.json
4
+
5
+ import json, os, math
6
+ from typing import Dict, List, Tuple
7
+
8
# Per-genus statistics for extended tests (counts of Positive/Negative/Variable).
SIGNALS_PATH = os.path.join("data", "signals_catalog.json")
# The only outcome values that contribute to scoring.
PNV = ("Positive", "Negative", "Variable")
10
+
11
+ def _load_json(path: str, default):
12
+ if not os.path.exists(path):
13
+ return default
14
+ with open(path, "r", encoding="utf-8") as f:
15
+ try:
16
+ return json.load(f)
17
+ except Exception:
18
+ return default
19
+
20
+ def _log(x: float) -> float:
21
+ # guard tiny values
22
+ return math.log(max(x, 1e-12))
23
+
24
def score_genera_from_extended(parsed_ext: Dict[str, str], alpha: float = 1.0) -> Tuple[List[Tuple[str, float]], str]:
    """
    Score every genus in the signals catalog against the user's extended tests.

    parsed_ext: dict of {ExtendedTestName: 'Positive'|'Negative'|'Variable'}
    alpha: Laplace smoothing factor
    Returns: ([(genus, score)], explanation_str)
    """
    signals = _load_json(SIGNALS_PATH, {})
    if not parsed_ext or not signals:
        return [], "No extended tests or signals available."

    genera = list(signals.keys())
    if not genera:
        return [], "No genera in signals catalog."

    # Accumulate log-likelihoods per genus over the provided tests.
    scores: Dict[str, float] = {g: 0.0 for g in genera}
    contributions: Dict[str, List[str]] = {g: [] for g in genera}

    for test, val in parsed_ext.items():
        if val not in PNV:
            continue
        for g in genera:
            stats = signals.get(g, {}).get(test)
            if stats:
                pos = stats.get("Positive", 0)
                neg = stats.get("Negative", 0)
                var = stats.get("Variable", 0)
                n = stats.get("_n", pos + neg + var)
                if n > 0:
                    count = {"Positive": pos, "Negative": neg, "Variable": var}[val]
                    prob = (count + alpha) / (n + 3.0 * alpha)
                else:
                    # no observations recorded → uniform over P/N/V
                    prob = alpha / (3.0 * alpha)
            else:
                # test unseen for this genus → uniform over P/N/V
                prob = alpha / (3.0 * alpha)

            scores[g] += _log(prob)
            contributions[g].append(f"{test}={val}→{prob:.3f}")

    # Softmax-normalise the log scores for readability.
    max_log = max(scores.values())
    exp_scores = {g: math.exp(s - max_log) for g, s in scores.items()}
    z = sum(exp_scores.values())
    final = sorted(
        [(g, (exp_scores[g] / z) if z > 0 else 0.0) for g in genera],
        key=lambda pair: pair[1],
        reverse=True,
    )

    # Short human-readable explanation of the top five contributors.
    top_rows = [f"{g}: {sc:.3f} | {'; '.join(contributions[g][:3])}" for g, sc in final[:5]]
    explain = "Extended-test likelihoods (top 5):\n" + "\n".join(top_rows) if top_rows else "No contributions."
    return final, explain
engine/parser_ext.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/parser_ext.py
2
+ # ------------------------------------------------------------
3
+ # Data-driven parser for extended tests (not in core schema).
4
+ # Uses:
5
+ # - data/extended_schema.json
6
+ # - data/alias_maps.json
7
+ #
8
+ # Automatically extracts extended tests such as:
9
+ # CAMP, PYR, Optochin, Novobiocin, Bacitracin, Bile Solubility, Hippurate, etc.
10
+ #
11
+ # Core tests (Gram, Catalase, DNase, Indole, etc.) are EXCLUDED.
12
+
13
+ import json
14
+ import os
15
+ import re
16
+ from typing import Dict, List
17
+
18
# Data files live alongside the app under ./data (paths are cwd-relative).
DATA_DIR = "data"
EXT_SCHEMA_PATH = os.path.join(DATA_DIR, "extended_schema.json")  # extended test definitions
ALIAS_MAPS_PATH = os.path.join(DATA_DIR, "alias_maps.json")       # field/value alias tables
21
+
22
# -------------------------------------------------------------------------
# Hardcoded core test fields (NEVER to be parsed as extended)
# -------------------------------------------------------------------------
# These entries must mirror the column names of the core database exactly.
CORE_FIELDS = {
    "Genus", "Species",
    "Gram Stain", "Shape", "Colony Morphology", "Haemolysis", "Haemolysis Type",
    "Motility", "Capsule", "Spore Formation", "Growth Temperature", "Oxygen Requirement",
    "Media Grown On",
    "Catalase", "Oxidase", "Coagulase", "DNase", "Urease", "Citrate", "Methyl Red", "VP",
    "H2S", "ONPG", "Nitrate Reduction", "Lipase Test", "NaCl Tolerant (>=6%)",
    # NOTE(review): "Ornitihine" looks misspelled, but it must match the DB
    # column name byte-for-byte — confirm against bacteria_db.xlsx before renaming.
    "Lysine Decarboxylase", "Ornitihine Decarboxylase", "Arginine dihydrolase",
    "Gelatin Hydrolysis", "Esculin Hydrolysis",
    "Glucose Fermentation", "Lactose Fermentation", "Sucrose Fermentation",
    "Mannitol Fermentation", "Sorbitol Fermentation", "Maltose Fermentation",
    "Xylose Fermentation", "Rhamnose Fermentation", "Arabinose Fermentation",
    "Raffinose Fermentation", "Trehalose Fermentation", "Inositol Fermentation"
}

# -------------------------------------------------------------------------
# Positive / Negative / Variable mapping
# -------------------------------------------------------------------------
# Keys are lowercase tokens as they appear in free text.
PNV_MAP = {
    "+": "Positive", "positive": "Positive", "pos": "Positive",
    "-": "Negative", "negative": "Negative", "neg": "Negative",
    "variable": "Variable", "var": "Variable"
}

# -------------------------------------------------------------------------
# Sensitivity/Resistance mapping for disk diffusion tests
# (e.g., optochin, novobiocin, bacitracin)
# -------------------------------------------------------------------------
# "sensitive"/"susceptible" are recorded as Positive, "resistant"/"insensitive" as Negative.
SENS_MAP = {
    "sensitive": "Positive",
    "susceptible": "Positive",
    "resistant": "Negative",
    "insensitive": "Negative"
}
59
+
60
+ # -------------------------------------------------------------------------
61
+ # JSON loaders
62
+ # -------------------------------------------------------------------------
63
+ def _load_json(path: str, default):
64
+ if not os.path.exists(path):
65
+ return default
66
+ try:
67
+ with open(path, "r", encoding="utf-8") as f:
68
+ return json.load(f)
69
+ except Exception:
70
+ return default
71
+
72
+ # -------------------------------------------------------------------------
73
+ # Canonical value mapping (+, -, variable, resistant, sensitive)
74
+ # -------------------------------------------------------------------------
75
def _canon_value(token: str) -> str:
    """Map a raw token (+/-, pos/neg, sensitive/resistant, ...) onto its canonical P/N form."""
    if token is None:
        return "Unknown"
    stripped = token.strip()
    key = stripped.lower()
    mapped = PNV_MAP.get(key)
    if mapped is None:
        mapped = SENS_MAP.get(key)
    # Unrecognised tokens pass through stripped but otherwise untouched.
    return mapped if mapped is not None else stripped
84
+
85
+ # -------------------------------------------------------------------------
86
+ # Gather all alias names for a field
87
+ # -------------------------------------------------------------------------
88
+ def _aliases_for(field: str, field_aliases: Dict[str, str]) -> List[str]:
89
+ """
90
+ Returns all known aliases for this test, including the canonical name.
91
+ Ordered longest→shortest to avoid partial matches.
92
+ """
93
+ aliases = {field}
94
+ for k, v in field_aliases.items():
95
+ if v.lower() == field.lower():
96
+ aliases.add(k)
97
+ return sorted(aliases, key=len, reverse=True)
98
+
99
+ # -------------------------------------------------------------------------
100
+ # Main Extended Parser
101
+ # -------------------------------------------------------------------------
102
def parse_text_extended(text: str) -> Dict[str, Dict]:
    """
    Parse ONLY tests listed in extended_schema.json.
    Excludes all core tests completely.
    Returns:
        {
          "parsed_fields": { TestName: "Positive"/"Negative"/"Variable" },
          "source": "extended_parser"
        }
    """
    ext_schema = _load_json(EXT_SCHEMA_PATH, {})
    alias_maps = _load_json(ALIAS_MAPS_PATH, {"field_aliases": {}, "value_aliases_pnv": {}})
    field_aliases = alias_maps.get("field_aliases", {})

    haystack = text or ""
    found: Dict[str, str] = {}

    # For each extended test, scan the text for any alias followed by a result token.
    for canon_field in ext_schema:
        # Safety: never allow the extended parser to treat core tests as extended.
        if canon_field in CORE_FIELDS:
            continue

        for alias in _aliases_for(canon_field, field_aliases):
            # Match: <alias> .... (positive|negative|variable|+|-|sensitive|resistant)
            pattern = (
                rf"\b{re.escape(alias)}\b"
                r"[^.\n]{0,80}?"  # lookahead window
                r"\b(positive|negative|variable|\+|\-|susceptible|sensitive|resistant)\b"
            )
            hit = re.search(pattern, haystack, re.IGNORECASE)
            if hit is not None:
                found[canon_field] = _canon_value(hit.group(1))
                break  # longest alias wins; stop scanning this field

    # Defensive cleanup: drop any core field that slipped through.
    for forbidden in [k for k in found if k in CORE_FIELDS]:
        del found[forbidden]

    return {
        "parsed_fields": found,
        "source": "extended_parser"
    }
engine/parser_fusion.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/parser_fusion.py
2
+ # ------------------------------------------------------------
3
+ # Tri-fusion parser:
4
+ # - Rule parser (parser_rules)
5
+ # - Extended parser (parser_ext)
6
+ # - LLM parser (parser_llm / Cloudflare)
7
+ #
8
+ # Combines all three into a single fused field set, with a simple
9
+ # precedence rule:
10
+ # extended > rules > llm > Unknown
11
+ #
12
+ # Returns:
13
+ # {
14
+ # "fused_fields": { ... },
15
+ # "sources": { field_name: "extended" | "rules" | "llm_cf" | "none" },
16
+ # "components": {
17
+ # "rules": <full rule parser output>,
18
+ # "extended": <full extended parser output>,
19
+ # "llm": <full llm parser output>
20
+ # }
21
+ # }
22
+
23
+ import json
24
+ import os
25
+ from typing import Dict, Any
26
+
27
+ from engine.parser_rules import parse_text_rules
28
+ from engine.parser_ext import parse_text_extended, CORE_FIELDS
29
+ from engine.parser_llm import parse_text_llm
30
+
31
# Load extended schema so we know all possible fields
EXT_SCHEMA_PATH = "data/extended_schema.json"
try:
    with open(EXT_SCHEMA_PATH, "r", encoding="utf-8") as f:
        EXT_SCHEMA = json.load(f)
except Exception:
    # Missing or malformed schema: fall back to core fields only.
    EXT_SCHEMA = {}

# Union of core fields and every extended test, in stable sorted order.
ALL_FIELDS = sorted(set(list(CORE_FIELDS) + list(EXT_SCHEMA.keys())))
40
+
41
+
42
+ def _is_known(val: Any) -> bool:
43
+ """
44
+ Decide if a value is 'real' (we should use it) or effectively Unknown/empty.
45
+ """
46
+ if val is None:
47
+ return False
48
+ if isinstance(val, str):
49
+ v = val.strip()
50
+ if not v:
51
+ return False
52
+ if v.lower() == "unknown":
53
+ return False
54
+ return True
55
+
56
+
57
def parse_text_fused(text: str) -> Dict[str, Any]:
    """
    Run all three parsers and fuse their outputs.
    Precedence: extended > rules > llm > Unknown.
    """
    source_text = text or ""

    # --- Run component parsers ---
    rules_out = parse_text_rules(source_text)
    ext_out = parse_text_extended(source_text)
    llm_out = parse_text_llm(source_text)

    # Parser outputs ranked by trust, highest first.
    ranked = [
        ("extended", ext_out.get("parsed_fields", {}) or {}),
        ("rules", rules_out.get("parsed_fields", {}) or {}),
        ("llm_cf", llm_out.get("parsed_fields", {}) or {}),
    ]

    fused: Dict[str, Any] = {}
    sources: Dict[str, str] = {}

    for field in ALL_FIELDS:
        # Default when no parser produced a usable value.
        fused[field] = "Unknown"
        sources[field] = "none"
        for label, fields in ranked:
            candidate = fields.get(field)
            if _is_known(candidate):
                fused[field] = candidate
                sources[field] = label
                break

    return {
        "fused_fields": fused,
        "sources": sources,
        "components": {
            "rules": rules_out,
            "extended": ext_out,
            "llm": llm_out,
        },
    }
engine/parser_llm.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/parser_llm.py
2
+ # ------------------------------------------------------------
3
+ # LLM-based parser using local Phi-2 model via HuggingFace.
4
+ # ------------------------------------------------------------
5
+
6
+ import json
7
+ import re
8
+ import torch
9
+ import streamlit as st
10
+ from transformers import AutoModelForCausalLM, AutoTokenizer
11
+
12
+ from engine.parser_ext import CORE_FIELDS
13
+
14
EXT_SCHEMA_PATH = "data/extended_schema.json"
try:
    with open(EXT_SCHEMA_PATH, "r", encoding="utf-8") as f:
        EXT_SCHEMA = json.load(f)
# Fix: a bare ``except:`` also traps KeyboardInterrupt/SystemExit;
# narrowed to the failures that can actually occur here.
except (OSError, json.JSONDecodeError):
    EXT_SCHEMA = {}

# Union of core fields and every extended test, in stable sorted order.
ALL_FIELDS = sorted(set(list(CORE_FIELDS) + list(EXT_SCHEMA.keys())))
22
+
23
+
24
@st.cache_resource(show_spinner=True)
def load_phi2_model():
    """Load Phi-2 locally (CPU mode). Cached for entire session."""
    model_name = "microsoft/phi-2"

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float32,  # CPU-friendly dtype
        trust_remote_code=True,
    )
    model.eval()  # inference mode

    return tokenizer, model
38
+
39
+
40
# Extraction prompt for the LLM pass.  {FIELD_LIST} and {TEXT} are filled in
# via str.format by parse_text_llm; the model is told to emit raw JSON only.
PROMPT_TEMPLATE = """
You are an expert clinical microbiology assistant.

Extract ALL microbiology test results from the text and return a STRICT JSON object.

RULES:
- Use ONLY these fields:
{FIELD_LIST}
- Allowed values:
"Positive", "Negative", "Variable", "Unknown",
OR literal strings for temperatures (e.g. "37//40").
- If a test is not mentioned: set "Unknown".
- DO NOT create new fields or hallucinate.
- DO NOT output explanations.
- DO NOT wrap JSON in markdown code fences.
- Output ONLY a raw JSON object.

Text:
---
{TEXT}
---

JSON:
"""
64
+
65
+
66
def salvage_json(raw: str):
    """Attempt to clean and parse 'almost JSON' returned by model."""
    text = raw.strip()

    first_brace = text.find("{")
    last_brace = text.rfind("}")
    if first_brace == -1 or last_brace == -1 or last_brace <= first_brace:
        raise ValueError("No valid JSON object braces found.")

    candidate = text[first_brace : last_brace + 1]
    # Strip trailing commas before } or ] — a common LLM artefact.
    candidate = re.sub(r",\s*([}\]])", r"\1", candidate)

    return json.loads(candidate)
79
+
80
+
81
def normalise_value(val):
    """Coerce a model-returned value onto Positive/Negative/Variable; pass others through."""
    if val is None:
        return "Unknown"
    text = str(val).strip()
    canonical = {
        "positive": "Positive", "+": "Positive", "pos": "Positive",
        "negative": "Negative", "-": "Negative", "neg": "Negative",
        "variable": "Variable", "var": "Variable",
    }
    # Unmatched values (e.g. "37//40") are returned stripped but unchanged.
    return canonical.get(text.lower(), text)
93
+
94
+
95
def parse_text_llm(text: str):
    """
    Run the cached Phi-2 model over *text* and extract structured fields.

    Returns {"parsed_fields": {...}, "source": "llm_phi2", "raw": ...} on
    success, or {"parsed_fields": {}, "error": ..., "raw": ...} when the
    model output cannot be parsed (even after salvage) as JSON.
    """
    tokenizer, model = load_phi2_model()

    prompt = PROMPT_TEMPLATE.format(
        FIELD_LIST=", ".join(ALL_FIELDS),
        TEXT=text,
    )

    encoded = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        generated = model.generate(
            input_ids=encoded["input_ids"],
            max_new_tokens=500,
            temperature=0.0,
            do_sample=False,
        )

    # Drop the echoed prompt; keep only the model's continuation.
    decoded = tokenizer.decode(generated[0], skip_special_tokens=True)
    raw = decoded[len(prompt):].strip()

    try:
        parsed = json.loads(raw)
    except Exception:
        try:
            parsed = salvage_json(raw)
        except Exception:
            return {
                "parsed_fields": {},
                "error": "Invalid JSON returned by model",
                "raw": raw,
            }

    # Normalise every schema field; anything the model omitted becomes Unknown.
    cleaned = {field: normalise_value(parsed.get(field, "Unknown")) for field in ALL_FIELDS}

    return {
        "parsed_fields": cleaned,
        "source": "llm_phi2",
        "raw": raw,
    }
engine/parser_rules.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # engine/parser_rules.py
2
+ # ------------------------------------------------------------
3
+ # Deterministic rule-based parser for microbiology text
4
+ # Loads alias_maps.json and applies synonyms learned in Stage 10B
5
+ # ------------------------------------------------------------
6
+
7
+ import re
8
+ import json
9
+ import os
10
+
11
ALIAS_PATH = "data/alias_maps.json"

# ------------------------------------------------------------
# Load alias maps (if present)
# ------------------------------------------------------------
def load_alias_maps():
    """
    Return the alias-map dict from data/alias_maps.json, or {} when the
    file is absent, unreadable, or not valid JSON.

    Fix: the original used a bare ``except:``, which also swallows
    KeyboardInterrupt/SystemExit; narrowed to the failures that can
    actually occur here (I/O and JSON decoding errors).
    """
    if os.path.exists(ALIAS_PATH):
        try:
            with open(ALIAS_PATH, "r", encoding="utf-8") as f:
                return json.load(f)
        except (OSError, json.JSONDecodeError):
            return {}
    return {}

# Loaded once at import time; the file is small and static.
ALIAS_MAPS = load_alias_maps()
26
+
27
+ # ------------------------------------------------------------
28
+ # Utility normalization
29
+ # ------------------------------------------------------------
30
def norm(text: str) -> str:
    """Lowercase-and-strip helper; any falsy input normalises to ''."""
    return str(text).strip().lower() if text else ""
34
+
35
+
36
+ # Apply alias mapping per field
37
# Apply alias mapping per field
def apply_alias(field: str, value: str) -> str:
    """
    Map *value* through the per-field alias table in ALIAS_MAPS, if one exists.

    Fix: alias_maps.json stores field keys in their original case
    (e.g. "Motility"), but the lookup key was lowercased by norm(), so a
    per-field table was never found.  Keys are now compared
    case-insensitively; unmapped values pass through unchanged.
    """
    f = norm(field)
    v = norm(value)
    for key, mapping in ALIAS_MAPS.items():
        if norm(key) == f and isinstance(mapping, dict):
            return mapping.get(v, value)
    return value
44
+
45
+
46
+ # ------------------------------------------------------------
47
+ # Main rule parser
48
+ # ------------------------------------------------------------
49
def parse_text_rules(text: str) -> dict:
    """
    Extracts structured microbiology fields from text using
    deterministic regex rules + alias mapping.

    Returns {"parsed_fields": {...}, "raw_text": text}; tests that are
    not mentioned are simply absent from "parsed_fields".
    """

    if not text:
        return {"parsed_fields": {}, "raw_text": text}

    t = text.lower()
    parsed = {}

    # ------------------------------------------------------------
    # Gram stain / morphology
    # ------------------------------------------------------------
    if "gram-positive" in t or "gram positive" in t:
        parsed["Gram Stain"] = "Positive"
    elif "gram-negative" in t or "gram negative" in t:
        parsed["Gram Stain"] = "Negative"

    if "cocci" in t:
        parsed["Shape"] = "Cocci"
    elif "bacilli" in t or "rods" in t or "rod" in t:
        parsed["Shape"] = "Rods"

    # ------------------------------------------------------------
    # Enzyme tests
    # ------------------------------------------------------------
    if "catalase positive" in t:
        parsed["Catalase"] = "Positive"
    elif "catalase negative" in t:
        parsed["Catalase"] = "Negative"

    if "oxidase positive" in t:
        parsed["Oxidase"] = "Positive"
    elif "oxidase negative" in t:
        parsed["Oxidase"] = "Negative"

    if "coagulase positive" in t:
        parsed["Coagulase"] = "Positive"
    elif "coagulase negative" in t:
        parsed["Coagulase"] = "Negative"

    if "dnase positive" in t or "dnase+" in t:
        parsed["DNase"] = "Positive"
    elif "dnase negative" in t:
        parsed["DNase"] = "Negative"

    if "urease positive" in t:
        parsed["Urease"] = "Positive"
    elif "urease negative" in t:
        parsed["Urease"] = "Negative"
    elif "urease variable" in t:
        parsed["Urease"] = "Variable"

    # ------------------------------------------------------------
    # Indole, Citrate, VP, MR
    # ------------------------------------------------------------
    if "indole positive" in t:
        parsed["Indole"] = "Positive"
    elif "indole negative" in t:
        parsed["Indole"] = "Negative"

    if "citrate positive" in t:
        parsed["Citrate"] = "Positive"
    elif "citrate negative" in t:
        parsed["Citrate"] = "Negative"

    # Fix: the negative branches now accept the long-form names too,
    # mirroring the positive branches ("voges-proskauer", "methyl red").
    if "vp positive" in t or "voges-proskauer positive" in t:
        parsed["VP"] = "Positive"
    elif "vp negative" in t or "voges-proskauer negative" in t:
        parsed["VP"] = "Negative"

    if "mr positive" in t or "methyl red positive" in t:
        parsed["Methyl Red"] = "Positive"
    elif "mr negative" in t or "methyl red negative" in t:
        parsed["Methyl Red"] = "Negative"

    # ------------------------------------------------------------
    # Fermentation tests
    # ------------------------------------------------------------
    FERMENTS = {
        "glucose": "Glucose Fermentation",
        "lactose": "Lactose Fermentation",
        "sucrose": "Sucrose Fermentation",
        "mannitol": "Mannitol Fermentation",
    }

    for sugar, field in FERMENTS.items():
        if f"ferments {sugar}" in t or f"{sugar} fermentation positive" in t:
            parsed[field] = "Positive"
        if f"does not ferment {sugar}" in t or f"{sugar} fermentation negative" in t:
            parsed[field] = "Negative"

    # ------------------------------------------------------------
    # Haemolysis
    # Fix: alpha/gamma/non previously matched only the British spelling
    # ("-haemolytic") while beta matched both; US spellings added.
    # ------------------------------------------------------------
    if "beta-haemolytic" in t or "beta-hemolytic" in t:
        parsed["Haemolysis Type"] = "Beta"
        parsed["Haemolysis"] = "Positive"
    elif "alpha-haemolytic" in t or "alpha-hemolytic" in t:
        parsed["Haemolysis Type"] = "Alpha"
        parsed["Haemolysis"] = "Positive"
    elif ("gamma-haemolytic" in t or "gamma-hemolytic" in t
          or "non-haemolytic" in t or "non-hemolytic" in t):
        parsed["Haemolysis Type"] = "Gamma"
        parsed["Haemolysis"] = "Negative"

    # ------------------------------------------------------------
    # Media
    # ------------------------------------------------------------
    if "blood agar" in t:
        parsed["Media Grown On"] = "Blood Agar"
    elif "macconkey agar" in t:
        parsed["Media Grown On"] = "MacConkey Agar"
    elif "chocolate agar" in t:
        parsed["Media Grown On"] = "Chocolate Agar"

    # ------------------------------------------------------------
    # Growth temperature extraction: "grows at 37" → point range "37//37"
    # ------------------------------------------------------------
    match_temp = re.search(r"grows at (\d+)", t)
    if match_temp:
        temp = match_temp.group(1)
        parsed["Growth Temperature"] = f"{temp}//{temp}"

    # ------------------------------------------------------------
    # Apply alias mappings
    # ------------------------------------------------------------
    aliased = {field: apply_alias(field, value) for field, value in parsed.items()}

    return {
        "parsed_fields": aliased,
        "raw_text": text,
    }
engine/schema.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# engine/schema.py
# ------------------------------------------------------------
# Canonical field schema for bacteria records, plus helpers to
# normalize raw parser values and validate whole records.
# ------------------------------------------------------------
from typing import Dict, List, Any, Tuple

# Canonical enum values shared by most biochemical test fields.
POS_NEG_VAR = ["Positive", "Negative", "Variable"]
POS_NEG_VAR_UNKNOWN = ["Positive", "Negative", "Variable", "Unknown"]
UNKNOWN = "Unknown"
MULTI_SEPARATOR = ";"

ENUMS = {
    "Gram Stain": ["Positive", "Negative", "Variable"],
    "Shape": ["Cocci", "Rods", "Bacilli", "Spiral", "Short Rods"],
    "Haemolysis Type": ["None", "Beta", "Gamma", "Alpha"],
}

# Shorthand spellings accepted for Positive/Negative/Variable values.
# Consolidates the three hand-written if-chains previously inside
# normalize_value into a single lookup table.
_PNV_ALIASES = {
    "+": "Positive", "pos": "Positive", "positive": "Positive",
    "-": "Negative", "neg": "Negative", "negative": "Negative",
    "v": "Variable", "var": "Variable", "variable": "Variable",
}

SCHEMA: Dict[str, Dict[str, Any]] = {
    "Genus": {"type": "text", "required": True},
    "Species": {"type": "text", "required": False},

    "Gram Stain": {"type": "enum", "allowed": ENUMS["Gram Stain"]},
    "Shape": {"type": "enum", "allowed": ENUMS["Shape"]},
    "Colony Morphology": {"type": "multienum", "separator": MULTI_SEPARATOR},
    "Haemolysis": {"type": "enum", "allowed": POS_NEG_VAR},
    "Haemolysis Type": {"type": "multienum", "separator": MULTI_SEPARATOR, "allowed": ENUMS["Haemolysis Type"]},
    "Motility": {"type": "enum", "allowed": POS_NEG_VAR},
    "Capsule": {"type": "enum", "allowed": POS_NEG_VAR},
    "Spore Formation": {"type": "enum", "allowed": POS_NEG_VAR},

    "Growth Temperature": {"type": "range", "format": "low//high", "units": "°C"},
    "Oxygen Requirement": {"type": "text"},
    "Media Grown On": {"type": "multienum", "separator": MULTI_SEPARATOR},

    "Catalase": {"type": "enum", "allowed": POS_NEG_VAR},
    "Oxidase": {"type": "enum", "allowed": POS_NEG_VAR},
    "Indole": {"type": "enum", "allowed": POS_NEG_VAR},
    "Urease": {"type": "enum", "allowed": POS_NEG_VAR},
    "Citrate": {"type": "enum", "allowed": POS_NEG_VAR},
    "Methyl Red": {"type": "enum", "allowed": POS_NEG_VAR},
    "VP": {"type": "enum", "allowed": POS_NEG_VAR},
    "H2S": {"type": "enum", "allowed": POS_NEG_VAR},
    "DNase": {"type": "enum", "allowed": POS_NEG_VAR},
    "ONPG": {"type": "enum", "allowed": POS_NEG_VAR},
    "Coagulase": {"type": "enum", "allowed": POS_NEG_VAR},
    "Lipase Test": {"type": "enum", "allowed": POS_NEG_VAR},
    "Nitrate Reduction": {"type": "enum", "allowed": POS_NEG_VAR},

    "NaCl Tolerant (>=6%)": {"type": "enum", "allowed": POS_NEG_VAR},

    "Lysine Decarboxylase": {"type": "enum", "allowed": POS_NEG_VAR},
    # NOTE: "Ornitihine" is a typo for "Ornithine", but the key is kept
    # as-is because stored records and parsers reference this spelling.
    "Ornitihine Decarboxylase": {"type": "enum", "allowed": POS_NEG_VAR},
    "Arginine dihydrolase": {"type": "enum", "allowed": POS_NEG_VAR},

    "Gelatin Hydrolysis": {"type": "enum", "allowed": POS_NEG_VAR},
    "Esculin Hydrolysis": {"type": "enum", "allowed": POS_NEG_VAR},

    "Glucose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Lactose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Sucrose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Mannitol Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Sorbitol Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Maltose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Xylose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Rhamnose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Arabinose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Raffinose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Trehalose Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},
    "Inositol Fermentation": {"type": "enum", "allowed": POS_NEG_VAR},

    "Extra Notes": {"type": "text"},
}

# Stable display/iteration order for all schema fields.
FIELDS_ORDER: List[str] = list(SCHEMA.keys())

# Fields whose value is a separator-joined list of entries.
MULTI_FIELDS: List[str] = [
    k for k, v in SCHEMA.items() if v.get("type") == "multienum"
]

# Plain Positive/Negative/Variable enum fields.
PNV_FIELDS: List[str] = [
    k for k, v in SCHEMA.items()
    if v.get("type") == "enum" and v.get("allowed") == POS_NEG_VAR
]


def is_enum_field(field: str) -> bool:
    """True if *field* is declared as a single-valued enum."""
    return SCHEMA.get(field, {}).get("type") == "enum"


def is_multienum_field(field: str) -> bool:
    """True if *field* is declared as a multi-valued enum."""
    return SCHEMA.get(field, {}).get("type") == "multienum"


def is_range_field(field: str) -> bool:
    """True if *field* is declared as a 'low//high' range."""
    return SCHEMA.get(field, {}).get("type") == "range"


def normalize_value(field: str, value: str) -> str:
    """
    Normalize a raw parser value for *field* to its canonical form.

    - Empty/None/"unknown" inputs become UNKNOWN.
    - Enum fields are case-folded to the allowed spelling; the +/pos/neg
      shorthands are mapped via _PNV_ALIASES when the canonical value is
      allowed for that field.
    - Multienum fields are split on MULTI_SEPARATOR, each part normalized,
      and rejoined as " ; ".
    - Range fields have whitespace stripped (e.g. "20 // 30" -> "20//30").
    - Unrecognized values are returned unchanged for later auditing.
    """
    if value is None or str(value).strip() == "":
        return UNKNOWN
    v = str(value).strip()

    if v.lower() == "unknown":
        return UNKNOWN

    meta = SCHEMA.get(field, {})
    ftype = meta.get("type")

    if ftype == "enum":
        allowed = meta.get("allowed", [])
        # Exact (case-insensitive) match against the allowed spellings.
        for a in allowed:
            if v.lower() == a.lower():
                return a
        # Shorthand aliases (+/pos/neg/var ...), only if valid here.
        canonical = _PNV_ALIASES.get(v.lower())
        if canonical is not None:
            return canonical if canonical in allowed else v
        return v

    if ftype == "multienum":
        parts = [p.strip() for p in v.split(MULTI_SEPARATOR) if p.strip()]
        allowed = meta.get("allowed")
        normed = []
        for p in parts:
            if not allowed:
                normed.append(p)
            else:
                hit = next((a for a in allowed if a.lower() == p.lower()), None)
                normed.append(hit if hit else p)
        return f" {MULTI_SEPARATOR} ".join(normed) if normed else UNKNOWN

    if ftype == "range":
        # "20 // 30" -> "20//30"; numeric validation happens later.
        return v.replace(" ", "")

    return v


def validate_record(rec: Dict[str, Any]) -> Tuple[bool, List[str]]:
    """
    Validate *rec* against SCHEMA.

    Returns (ok, issues): ok is True when no issues were found; issues
    is a list of human-readable problem descriptions. Fields absent
    from *rec* are skipped; UNKNOWN is always accepted.
    """
    issues: List[str] = []
    for field in FIELDS_ORDER:
        meta = SCHEMA[field]
        if field not in rec:
            continue
        val = rec[field]

        if meta["type"] == "enum":
            allowed = meta.get("allowed", [])
            if str(val) not in allowed + [UNKNOWN]:
                issues.append(f"{field}: '{val}' not in {allowed + [UNKNOWN]}")

        elif meta["type"] == "multienum":
            if val == UNKNOWN:
                continue
            parts = [p.strip() for p in str(val).split(MULTI_SEPARATOR) if p.strip()]
            allowed = meta.get("allowed")
            if allowed:
                bad = [p for p in parts if p not in allowed]
                if bad:
                    issues.append(f"{field}: invalid values {bad}; allowed {allowed}")

        elif meta["type"] == "range":
            if val == UNKNOWN:
                continue
            txt = str(val).replace(" ", "")
            if "//" not in txt:
                issues.append(f"{field}: expected 'low//high' got '{val}'")
            else:
                try:
                    low, high = [float(x) for x in txt.split("//")]
                    if low > high:
                        issues.append(f"{field}: low {low} > high {high}")
                except Exception:
                    # Non-numeric bounds, or more than two '//' parts.
                    issues.append(f"{field}: non-numeric bounds '{val}'")

    ok = len(issues) == 0
    return ok, issues


def empty_record() -> Dict[str, str]:
    """Return a record with blank Genus/Species and UNKNOWN elsewhere."""
    return {
        f: "" if f in ("Genus", "Species") else UNKNOWN
        for f in SCHEMA
    }
engine/validator.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# engine/validator.py
# ---------------------------------
# Placeholder for logical validation layer

def validate_record(parsed: dict) -> dict:
    """
    Later: check for contradictions, invalid values,
    and normalize to schema.

    For now, only tags the record with a placeholder note and
    returns it (mutated in place).
    """
    notes = parsed.get("validation_notes")
    if notes is None:
        notes = []
        parsed["validation_notes"] = notes
    notes.append("Validator not yet implemented.")
    return parsed
engine/weights.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# engine/weights.py
# ---------------------------------
# Placeholder for field importance weighting

# No learned weights yet; populated in a future training stage.
DEFAULT_WEIGHTS = {}


def update_weights_from_gold(gold_results):
    """
    Future: adjust field importance weights
    based on gold test accuracy stats.

    Currently a no-op that returns None.
    """
    return None
training/__init__.py ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Marks the 'training' directory as a Python package
2
+
training/alias_trainer.py ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training/alias_trainer.py
2
+ # ------------------------------------------------------------
3
+ # Stage 10B - Alias Trainer
4
+ #
5
+ # Learns field/value synonyms from gold tests by comparing:
6
+ # - expected values (gold standard)
7
+ # - parsed values (rules + extended)
8
+ #
9
+ # Outputs:
10
+ # - Updated alias_maps.json
11
+ #
12
+ # This is the core intelligence that allows BactAI-D
13
+ # to understand variations in microbiology language.
14
+ # ------------------------------------------------------------
15
+
16
+ import json
17
+ import os
18
+ from collections import defaultdict
19
+
20
+ from engine.parser_rules import parse_text_rules
21
+ from engine.parser_ext import parse_text_extended
22
+
23
+
24
+ GOLD_PATH = "training/gold_tests.json"
25
+ ALIAS_PATH = "data/alias_maps.json"
26
+
27
+
28
def normalise(s):
    """Return the lower-cased, stripped string form of *s* ('' for None)."""
    return "" if s is None else str(s).strip().lower()
32
+
33
+
34
def learn_aliases():
    """
    Learn synonym mappings from gold tests.

    Compares gold-standard expected values against the output of the
    deterministic parsers (rules + extended). Mismatched parsed values
    seen at least twice are recorded in alias_maps.json as aliases
    pointing at the most common expected value for that field.

    Returns a summary dict with the updated aliases, or an error dict
    when the gold test file is missing.
    """
    if not os.path.exists(GOLD_PATH):
        return {"error": f"Gold tests missing: {GOLD_PATH}"}

    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gold = json.load(f)

    # Load the existing alias map, or start fresh.
    if os.path.exists(ALIAS_PATH):
        with open(ALIAS_PATH, "r", encoding="utf-8") as f:
            alias_maps = json.load(f)
    else:
        alias_maps = {}

    # field -> mismatched parsed value -> number of times seen
    suggestions = defaultdict(lambda: defaultdict(int))

    # ------------------------------------------------------------
    # Compare expected vs parsed for all tests
    # ------------------------------------------------------------
    for test in gold:
        text = test.get("input", "")
        expected = test.get("expected", {})

        rules = parse_text_rules(text).get("parsed_fields", {})
        ext = parse_text_extended(text).get("parsed_fields", {})

        # Merge deterministic parsers (extended overrides rules).
        merged = dict(rules)
        for k, v in ext.items():
            if v != "Unknown":
                merged[k] = v

        for field, exp_val in expected.items():
            exp_norm = normalise(exp_val)
            got_norm = normalise(merged.get(field, "Unknown"))

            # Skip correct matches and unknown/empty expectations.
            if exp_norm == got_norm or exp_norm in ["", "unknown"]:
                continue

            # Mismatched parsed value -> candidate alias.
            if got_norm not in ["", "unknown"]:
                suggestions[field][got_norm] += 1

    # ------------------------------------------------------------
    # Convert suggestions into alias mappings
    # ------------------------------------------------------------
    alias_updates = {}

    for field, values in suggestions.items():
        # Canonical target: the most common expected value for this field
        # across all gold tests. Hoisted out of the inner loop so it is
        # computed once per field (previously recomputed for every wrong
        # value). .get() guards tests that lack an "expected" key, which
        # would have raised KeyError before.
        field_values = [
            normalise(t.get("expected", {})[field])
            for t in gold
            if field in t.get("expected", {})
        ]
        canonical = max(set(field_values), key=field_values.count) if field_values else None
        if not canonical:
            continue

        for wrong_value, count in values.items():
            if count < 2:
                continue  # avoid noise from one-off mismatches

            # map wrong_value -> canonical expected value
            alias_maps.setdefault(field, {})[wrong_value] = canonical
            alias_updates[f"{field}:{wrong_value}"] = canonical

    # ------------------------------------------------------------
    # Save alias maps
    # ------------------------------------------------------------
    with open(ALIAS_PATH, "w", encoding="utf-8") as f:
        json.dump(alias_maps, f, indent=2)

    return {
        "ok": True,
        "updated_aliases": alias_updates,
        "total_updates": len(alias_updates),
        "alias_map_path": ALIAS_PATH,
    }
training/gold_tester.py ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training/gold_tester.py
2
+ # ----------------------------------------------------
3
+ # Enhanced tester: audits expected fields not in schema,
4
+ # adds DNase/Dnase alias and range-aware Growth Temperature matching.
5
+
6
+ import json, os, time, csv
7
+ from collections import Counter
8
+ from typing import Dict, List, Tuple
9
+ from engine.schema import SCHEMA, UNKNOWN, normalize_value, is_enum_field
10
+ from engine.parser_rules import parse_text_rules
11
+
12
+ REPORTS_DIR = "reports"
13
+ PROPOSALS_PATH = os.path.join("data", "extended_proposals.jsonl")
14
+ GOLD_PATH = os.path.join("training", "gold_tests.json")
15
+
16
+ # --- helpers ---
17
def load_gold() -> List[Dict]:
    """Load and return the gold-standard test cases from GOLD_PATH."""
    with open(GOLD_PATH, "r", encoding="utf-8") as fh:
        raw = fh.read()
    return json.loads(raw)
20
+
21
+ def _range_overlap(a: str, b: str) -> bool:
22
+ try:
23
+ la, ha = [float(x) for x in a.split("//")]
24
+ lb, hb = [float(x) for x in b.split("//")]
25
+ return not (ha < lb or hb < la)
26
+ except Exception:
27
+ return False
28
+
29
def compare_records(pred: Dict[str, str], exp: Dict[str, str]) -> Tuple[int, int, Dict[str, Tuple[str, str]]]:
    """
    Compare predicted fields against expected fields.

    Returns (correct, total, errors) where errors maps a field name to
    (predicted, expected). 'Growth Temperature' is matched by range
    overlap rather than exact string equality; everything else must
    match exactly.
    """
    correct = 0
    total = 0
    errors: Dict[str, Tuple[str, str]] = {}
    for field, want in exp.items():
        total += 1
        got = pred.get(field, UNKNOWN)
        if (field == "Growth Temperature"
                and got != UNKNOWN and want != UNKNOWN
                and _range_overlap(got, want)):
            correct += 1
            continue
        if got == want:
            correct += 1
        else:
            errors[field] = (got, want)
    return correct, total, errors
43
+
44
def append_proposal(record: Dict):
    """Append one proposal record as a JSON line to PROPOSALS_PATH."""
    os.makedirs(os.path.dirname(PROPOSALS_PATH), exist_ok=True)
    with open(PROPOSALS_PATH, "a", encoding="utf-8") as fh:
        print(json.dumps(record, ensure_ascii=False), file=fh)
48
+
49
+ # --- main ---
50
def run_gold_tests(mode: str = "rules") -> Dict:
    """
    Run every gold test through the rules parser and score the results.

    For each case this audits parser output against the schema (unknown
    fields/values are appended to the proposals file), flags expected
    fields that are not in the schema, and compares normalized
    predictions to the expected values (range-aware for Growth
    Temperature, via compare_records).

    Writes a JSON summary plus per-field and per-case CSVs under
    REPORTS_DIR and returns {"summary": ..., "paths": ...}.

    Fix: the redundant function-level `import csv` was removed — csv is
    already imported at module top.
    """
    tests = load_gold()
    ts = time.strftime("%Y%m%d_%H%M%S")

    per_field_counts, per_field_correct, per_field_cov = Counter(), Counter(), Counter()
    unknown_fields, unknown_values = Counter(), Counter()
    expected_unknowns = Counter()
    detailed_rows = []
    cases_with_misses = 0

    for case in tests:
        name, text, expected = case.get("name", ""), case.get("input", ""), case.get("expected", {})

        # Normalize expected key aliases (e.g. "Dnase" -> "DNase").
        expected = {("DNase" if k.lower() == "dnase" else k): v for k, v in expected.items()}

        out = parse_text_rules(text)
        parsed = out.get("parsed_fields", {})

        # Normalize parser output; audit fields/values outside the schema.
        normalized_pred = {}
        for field, val in parsed.items():
            if field not in SCHEMA:
                unknown_fields[field] += 1
                append_proposal({
                    "type": "unknown_field",
                    "field": field,
                    "value": val,
                    "case_name": name,
                    "timestamp": ts
                })
                continue
            normalized_pred[field] = normalize_value(field, val)
            if is_enum_field(field):
                allowed = SCHEMA[field].get("allowed", [])
                if normalized_pred[field] not in allowed + [UNKNOWN]:
                    unknown_values[(field, normalized_pred[field])] += 1
                    append_proposal({
                        "type": "unknown_value",
                        "field": field,
                        "value": normalized_pred[field],
                        "allowed": allowed,
                        "case_name": name,
                        "timestamp": ts
                    })

        # Audit expected fields not present in the schema.
        for ef in expected.keys():
            if ef not in SCHEMA:
                expected_unknowns[ef] += 1
                append_proposal({
                    "type": "expected_field_not_in_schema",
                    "field": ef,
                    "case_name": name,
                    "timestamp": ts
                })

        correct, total, errors = compare_records(normalized_pred, expected)
        if errors:
            cases_with_misses += 1

        # Per-field accuracy (no miss recorded) and coverage (non-Unknown
        # prediction present) tallies.
        for f in expected.keys():
            per_field_counts[f] += 1
            if f in normalized_pred and normalized_pred[f] != UNKNOWN:
                per_field_cov[f] += 1
            if f not in errors:
                per_field_correct[f] += 1

        detailed_rows.append({
            "name": name,
            "parsed": json.dumps(normalized_pred, ensure_ascii=False),
            "expected": json.dumps(expected, ensure_ascii=False),
            "correct_fields": correct,
            "total_fields": total
        })

    # --- aggregate metrics ---
    per_field_metrics = []
    for f, tot in per_field_counts.items():
        acc = per_field_correct[f] / tot if tot else 0.0
        cov = per_field_cov[f] / tot if tot else 0.0
        per_field_metrics.append({"field": f, "accuracy": round(acc, 4), "coverage": round(cov, 4), "n": tot})
    per_field_metrics.sort(key=lambda x: x["field"])

    micro_acc = sum(per_field_correct.values()) / sum(per_field_counts.values()) if per_field_counts else 0.0

    os.makedirs(REPORTS_DIR, exist_ok=True)
    report = {
        "mode": mode,
        "timestamp": ts,
        "num_tests": len(tests),
        "micro_accuracy": round(micro_acc, 4),
        "cases_with_misses": cases_with_misses,
        "per_field": per_field_metrics,
        "unknown_fields": dict(unknown_fields),
        "unknown_values": {f"{k[0]}::{k[1]}": v for k, v in unknown_values.items()},
        "expected_unknown_fields": dict(expected_unknowns),
        "proposals_path": PROPOSALS_PATH
    }
    json_path = os.path.join(REPORTS_DIR, f"gold_report_{mode}_{ts}.json")
    csv_fields = os.path.join(REPORTS_DIR, f"gold_fields_{mode}_{ts}.csv")
    csv_cases = os.path.join(REPORTS_DIR, f"gold_cases_{mode}_{ts}.csv")

    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2, ensure_ascii=False)
    with open(csv_fields, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["field", "accuracy", "coverage", "n"])
        w.writeheader()
        w.writerows(per_field_metrics)
    with open(csv_cases, "w", newline="", encoding="utf-8") as f:
        w = csv.DictWriter(f, fieldnames=["name", "parsed", "expected", "correct_fields", "total_fields"])
        w.writeheader()
        w.writerows(detailed_rows)

    return {"summary": report, "paths": {"json_report": json_path, "csv_fields": csv_fields, "csv_cases": csv_cases}}
training/gold_tests.json ADDED
The diff for this file is too large to render. See raw diff
 
training/gold_trainer.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training/gold_trainer.py
2
+ # ------------------------------------------------------------
3
+ # Master training pipeline for:
4
+ # - Alias Trainer (Stage 10B)
5
+ # - Schema Expansion (Stage 10C)
6
+ # - Extended Signals (Stage 10C)
7
+ # ------------------------------------------------------------
8
+
9
+ from typing import Dict, Any
10
+
11
+ from training.alias_trainer import learn_aliases
12
+
13
# Try importing Stage 10C components, but don't crash if missing.
# Each block attempts the real import and, on any failure, defines a
# same-named stub that reports "ok": False so train_from_gold() can
# still run end-to-end.
try:
    from training.schema_expander import expand_schema
except Exception:
    # Fallback stub used when schema_expander is absent or broken.
    def expand_schema():
        return {
            "ok": False,
            "message": "schema_expander not implemented or import failed (Stage 10C).",
        }

try:
    from training.signal_trainer import train_signals
except Exception:
    # Fallback stub used when signal_trainer is absent or broken.
    def train_signals():
        return {
            "ok": False,
            "message": "signal_trainer not implemented or import failed (Stage 10C).",
        }
31
+
32
+
33
def train_from_gold() -> Dict[str, Any]:
    """
    Runs all training modules on gold tests.
    Currently:
      - Stage 10B: Alias Trainer
      - Stage 10C: Schema Expansion (stub)
      - Stage 10C: Extended Signals (stub)

    Returns a dict with one result entry per training module.
    """
    results: Dict[str, Any] = {}

    # Stage 10B - Alias Trainer
    results["alias_trainer"] = learn_aliases()

    # Stage 10C - Schema Expansion
    try:
        results["schema_expander"] = expand_schema()
    except Exception as exc:
        results["schema_expander"] = {
            "ok": False,
            "error": str(exc),
            "message": "Error while running schema_expander.",
        }

    # Stage 10C - Signals Trainer
    try:
        results["signals_trainer"] = train_signals()
    except Exception as exc:
        results["signals_trainer"] = {
            "ok": False,
            "error": str(exc),
            "message": "Error while running signal_trainer.",
        }

    return results
training/parser_eval.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training/parser_eval.py
2
+ # ------------------------------------------------------------
3
+ # Parser Evaluation (Stage 10A)
4
+ #
5
+ # This version ONLY evaluates:
6
+ # - Rule parser
7
+ # - Extended parser
8
+ #
9
+ # The LLM parser is intentionally disabled at this stage
10
+ # because alias maps and schema are not trained yet.
11
+ #
12
+ # This makes Stage 10A FAST and stable (< 3 seconds).
13
+ # ------------------------------------------------------------
14
+
15
+ import json
16
+ import os
17
+ from typing import Dict, Any
18
+
19
+ from engine.parser_rules import parse_text_rules
20
+ from engine.parser_ext import parse_text_extended
21
+
22
+
23
+ # Path to the gold tests
24
+ GOLD_PATH = "training/gold_tests.json"
25
+
26
+
27
def evaluate_single_test(test: Dict[str, Any]) -> Dict[str, Any]:
    """
    Evaluate one gold test with rules + extended parsers.

    Fields whose expected value is "Unknown" are neutral: they are
    excluded from both `correct` and `total`. (Previously they were
    counted in `total` but could never count as correct, so accuracy
    could never reach 1.0 on cases containing Unknown expectations.)
    """
    text = test.get("input", "")
    expected = test.get("expected", {})

    # Run deterministic parsers.
    rule_out = parse_text_rules(text).get("parsed_fields", {})
    ext_out = parse_text_extended(text).get("parsed_fields", {})

    # Merge rule + extended (extended overwrites rules).
    merged = dict(rule_out)
    for k, v in ext_out.items():
        if v != "Unknown":
            merged[k] = v

    correct = 0
    total = 0
    wrong = {}

    for field, exp_val in expected.items():
        if str(exp_val).lower() == "unknown":
            continue  # neutral: don't score Unknown expectations
        total += 1
        got = merged.get(field, "Unknown")
        # str() guards against non-string values in the gold data.
        if str(got).lower() == str(exp_val).lower():
            correct += 1
        else:
            wrong[field] = {"expected": exp_val, "got": got}

    return {
        "correct": correct,
        "total": total,
        "accuracy": correct / total if total else 0,
        "wrong": wrong,
        "merged": merged,
    }
62
+
63
+
64
def run_parser_eval(mode: str = "rules_extended") -> Dict[str, Any]:
    """
    Evaluate ALL gold tests using rules + extended parsing only.

    Returns an aggregate summary dict, or an error dict when the gold
    test file is missing.
    """
    if not os.path.exists(GOLD_PATH):
        return {"error": f"Gold test file not found at {GOLD_PATH}"}

    with open(GOLD_PATH, "r", encoding="utf-8") as f:
        gold = json.load(f)

    results = []
    wrong_cases = []
    total_correct = 0
    total_fields = 0

    for test in gold:
        outcome = evaluate_single_test(test)
        results.append(outcome)
        total_correct += outcome["correct"]
        total_fields += outcome["total"]

        if outcome["wrong"]:
            wrong_cases.append({
                "name": test.get("name", "Unnamed"),
                "wrong": outcome["wrong"],
                "parsed": outcome["merged"],
                "expected": test.get("expected", {}),
            })

    return {
        "mode": "rules+extended",
        "tests": len(gold),
        "total_correct": total_correct,
        "total_fields": total_fields,
        "overall_accuracy": total_correct / total_fields if total_fields else 0,
        "wrong_cases": wrong_cases,
    }
training/repo_sync_hf.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # training/repo_sync_hf.py
2
+ # ------------------------------------------------------------
3
+ # Sync updated data files back to the SAME Hugging Face Space
4
+ # repo that the app is running from.
5
+ #
6
+ # Uses:
7
+ # HF_TOKEN -> a write token (set in Space secrets)
8
+ # HF_SPACE_REPO_ID -> e.g. "username/space-name"
9
+ #
10
+ # Call from app.py with:
11
+ # from training.repo_sync_hf import push_updates_to_hf
12
+ # result = push_updates_to_hf([...], commit_message="...")
13
+ # ------------------------------------------------------------
14
+
15
+ import os
16
+ from typing import List, Dict, Any
17
+
18
+ from huggingface_hub import HfApi, CommitOperationAdd
19
+
20
+
21
def push_updates_to_hf(
    paths: List[str],
    commit_message: str = "train: update extended schema, aliases, signals from gold tests",
) -> Dict[str, Any]:
    """
    Create a single commit on the current Space repo with the given files.
    Each path is used both as local path and path_in_repo.

    Requires the HF_SPACE_REPO_ID and HF_TOKEN environment variables.
    Missing local files are skipped and reported under the "skipped"
    key of the result (previously the comment claimed skips were
    recorded, but nothing actually recorded them).
    """
    repo_id = os.getenv("HF_SPACE_REPO_ID")
    token = os.getenv("HF_TOKEN")

    if not repo_id:
        return {
            "ok": False,
            "error": "Missing HF_SPACE_REPO_ID environment variable.",
            "uploaded": [],
            "skipped": [],
        }

    if not token:
        return {
            "ok": False,
            "error": "Missing HF_TOKEN environment variable.",
            "uploaded": [],
            "skipped": [],
        }

    api = HfApi()
    operations = []
    uploaded = []
    skipped = []

    for p in paths:
        if not os.path.exists(p):
            # Record skipped files instead of dropping them silently.
            skipped.append(p)
            continue

        operations.append(
            CommitOperationAdd(path_in_repo=p, path_or_fileobj=p)
        )
        uploaded.append(p)

    if not operations:
        return {
            "ok": False,
            "error": "No existing files to upload.",
            "uploaded": [],
            "skipped": skipped,
        }

    commit_info = api.create_commit(
        repo_id=repo_id,
        repo_type="space",
        operations=operations,
        commit_message=commit_message,
        token=token,
    )

    return {
        "ok": True,
        "uploaded": uploaded,
        "skipped": skipped,
        "repo_id": repo_id,
        "commit_message": commit_message,
        "commit_url": commit_info.commit_url,
    }
training/schema_expander.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# training/schema_expander.py
# ------------------------------------------------------------
# Placeholder for Stage 10C - Schema Expansion
#
# Schema expansion is not implemented yet; this stub keeps the
# alias trainer importable and running without error.
# ------------------------------------------------------------

def expand_schema():
    """Stage 10C stub: report that schema expansion is not implemented."""
    result = {
        "ok": True,
        "message": "Schema expander not implemented yet (Stage 10C placeholder)."
    }
    return result
training/signal_trainer.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# training/signal_trainer.py
# ------------------------------------------------------------
# Placeholder for Stage 10C - Extended Signals Trainer
#
# Real signal training arrives in Stage 10C; for now this stub
# exists only so imports succeed.
# ------------------------------------------------------------

def train_signals():
    """Stage 10C stub: report that signal training is not implemented."""
    result = {
        "ok": True,
        "message": "Signal trainer not implemented yet (Stage 10C placeholder)."
    }
    return result