EphAsad committed on
Commit
3f68bef
·
verified ·
1 Parent(s): 73b0e20

Update engine/bacteria_identifier.py

Browse files
Files changed (1) hide show
  1. engine/bacteria_identifier.py +381 -383
engine/bacteria_identifier.py CHANGED
@@ -1,383 +1,381 @@
1
- # engine/bacteria_identifier.py
2
- # ------------------------------------------------------------
3
- # Core identification engine + blended scoring with extended signals.
4
-
5
- import os
6
- import json
7
- import re
8
- import random
9
- from typing import Dict, List, Optional, Tuple
10
-
11
- import pandas as pd
12
-
13
- from engine.extended_reasoner import score_genera_from_extended
14
-
15
- DATA_DIR = "data"
16
- EXT_SCHEMA_PATH = os.path.join(DATA_DIR, "extended_schema.json")
17
-
18
-
19
- # -----------------------------
20
- # Helper Function
21
- # -----------------------------
22
- def join_with_and(items):
23
- """Join list into a readable string, using commas and 'and' before last item."""
24
- if not items:
25
- return ""
26
- if len(items) == 1:
27
- return items[0]
28
- return ", ".join(items[:-1]) + " and " + items[-1]
29
-
30
-
31
- # -----------------------------
32
- # Identification Result Class
33
- # -----------------------------
34
- class IdentificationResult:
35
- """
36
- Stores data about a single bacterial genus result and generates reasoning text.
37
- Now includes optional extended-likelihood and blended confidence.
38
- """
39
- def __init__(
40
- self,
41
- genus: str,
42
- total_score: int,
43
- matched_fields: List[str],
44
- mismatched_fields: List[str],
45
- reasoning_factors: Dict[str, str],
46
- total_fields_evaluated: int,
47
- total_fields_possible: int,
48
- extra_notes: str = "",
49
- extended_likelihood: Optional[float] = None,
50
- extended_explanation: str = "",
51
- ):
52
- self.genus = genus
53
- self.total_score = total_score
54
- self.matched_fields = matched_fields
55
- self.mismatched_fields = mismatched_fields
56
- self.reasoning_factors = reasoning_factors
57
- self.total_fields_evaluated = total_fields_evaluated
58
- self.total_fields_possible = total_fields_possible
59
- self.extra_notes = extra_notes
60
-
61
- # Extended reasoning
62
- self.extended_likelihood = extended_likelihood # 0–1, or None if no extended data
63
- self.extended_explanation = extended_explanation
64
-
65
- # -----------------------------
66
- # Confidence Calculations
67
- # -----------------------------
68
- def confidence_percent(self) -> int:
69
- """Confidence based only on tests the user entered."""
70
- if self.total_fields_evaluated == 0:
71
- return 0
72
- return max(
73
- 0,
74
- min(100, int((self.total_score / self.total_fields_evaluated) * 100)),
75
- )
76
-
77
- def true_confidence(self) -> int:
78
- """Confidence based on *all* possible tests (complete database fields)."""
79
- if self.total_fields_possible == 0:
80
- return 0
81
- return max(
82
- 0,
83
- min(100, int((self.total_score / self.total_fields_possible) * 100)),
84
- )
85
-
86
- def blended_confidence_raw(self, weight_core: float = 0.7, weight_ext: float = 0.3) -> float:
87
- """
88
- Blended confidence:
89
- core = core-confidence (0–1)
90
- ext = extended likelihood (0–1, if available)
91
- If no extended likelihood, return core.
92
- """
93
- core = self.confidence_percent() / 100.0
94
- if self.extended_likelihood is None:
95
- return core
96
- return weight_core * core + weight_ext * self.extended_likelihood
97
-
98
- def blended_confidence_percent(self, weight_core: float = 0.7, weight_ext: float = 0.3) -> int:
99
- return int(round(self.blended_confidence_raw(weight_core, weight_ext) * 100))
100
-
101
- # -----------------------------
102
- # Reasoning Paragraph Generator
103
- # -----------------------------
104
- def reasoning_paragraph(self, ranked_results=None) -> str:
105
- """Generate detailed reasoning paragraph with comparison to other genera."""
106
- if not self.matched_fields:
107
- return "No significant biochemical or morphological matches were found."
108
-
109
- intro = random.choice(
110
- [
111
- "Based on the observed biochemical and morphological traits,",
112
- "According to the provided test results,",
113
- "From the available laboratory findings,",
114
- "Considering the entered reactions and colony traits,",
115
- ]
116
- )
117
-
118
- # Key descriptive highlights
119
- highlights = []
120
- if "Gram Stain" in self.matched_fields:
121
- highlights.append(
122
- f"it is **Gram {self.reasoning_factors.get('Gram Stain', '').lower()}**"
123
- )
124
- if "Shape" in self.matched_fields:
125
- highlights.append(
126
- f"with a **{self.reasoning_factors.get('Shape', '').lower()}** morphology"
127
- )
128
- if "Catalase" in self.matched_fields:
129
- highlights.append(
130
- f"and **catalase {self.reasoning_factors.get('Catalase', '').lower()}** activity"
131
- )
132
- if "Oxidase" in self.matched_fields:
133
- highlights.append(
134
- f"and **oxidase {self.reasoning_factors.get('Oxidase', '').lower()}** reaction"
135
- )
136
- if "Oxygen Requirement" in self.matched_fields:
137
- highlights.append(
138
- f"which prefers **{self.reasoning_factors.get('Oxygen Requirement', '').lower()}** conditions"
139
- )
140
-
141
- # Join highlights grammatically
142
- summary = (
143
- ", ".join(highlights[:-1]) + " and " + highlights[-1]
144
- if len(highlights) > 1
145
- else "".join(highlights)
146
- )
147
-
148
- # Confidence text (core)
149
- core_conf = self.confidence_percent()
150
- confidence_text = (
151
- "The confidence in this identification based on the entered tests is high."
152
- if core_conf >= 70
153
- else "The confidence in this identification based on the entered tests is moderate."
154
- )
155
-
156
- # Comparative reasoning vs other close results
157
- comparison = ""
158
- if ranked_results and len(ranked_results) > 1:
159
- close_others = ranked_results[1:3]
160
- other_names = [r.genus for r in close_others]
161
- if other_names:
162
- if self.total_score >= close_others[0].total_score:
163
- comparison = (
164
- f" It is **more likely** than {join_with_and(other_names)} "
165
- f"based on stronger alignment in {join_with_and(self.matched_fields[:3])}."
166
- )
167
- else:
168
- comparison = (
169
- f" It is **less likely** than {join_with_and(other_names)} "
170
- f"due to differences in {join_with_and(self.mismatched_fields[:3])}."
171
- )
172
-
173
- return f"{intro} {summary}, the isolate most closely resembles **{self.genus}**. {confidence_text}{comparison}"
174
-
175
-
176
- # -----------------------------
177
- # Bacteria Identifier Engine
178
- # -----------------------------
179
- class BacteriaIdentifier:
180
- """
181
- Main engine to match bacterial genus based on biochemical & morphological data.
182
- Includes:
183
- - Core rule-based matching vs bacteria_db.xlsx
184
- - Optional blending with extended signals (signals_catalog.json)
185
- """
186
-
187
- def __init__(self, db: pd.DataFrame):
188
- self.db = db.fillna("")
189
- self.extended_fields = self._load_extended_fields()
190
-
191
- def _load_extended_fields(self) -> List[str]:
192
- if not os.path.exists(EXT_SCHEMA_PATH):
193
- return []
194
- try:
195
- with open(EXT_SCHEMA_PATH, "r", encoding="utf-8") as f:
196
- schema = json.load(f)
197
- return list(schema.keys())
198
- except Exception:
199
- return []
200
-
201
- # -----------------------------
202
- # Field Comparison Logic
203
- # -----------------------------
204
- def compare_field(self, db_val, user_val, field_name: str) -> int:
205
- """Compare one test field between database and user input."""
206
- if not user_val or str(user_val).strip() == "" or str(user_val).lower() == "unknown":
207
- return 0 # Skip empty or unknown
208
-
209
- db_val = str(db_val).strip().lower()
210
- user_val = str(user_val).strip().lower()
211
- hard_exclusions = ["Gram Stain", "Shape", "Spore Formation"]
212
-
213
- # Split entries by separators for multi-value matches
214
- db_options = re.split(r"[;/]", db_val)
215
- user_options = re.split(r"[;/]", user_val)
216
- db_options = [x.strip() for x in db_options if x.strip()]
217
- user_options = [x.strip() for x in user_options if x.strip()]
218
-
219
- # Handle "variable" logic
220
- if "variable" in db_options or "variable" in user_options:
221
- return 0
222
-
223
- # Special handling for Growth Temperature
224
- if field_name == "Growth Temperature":
225
- try:
226
- if "//" in db_val:
227
- low, high = [float(x) for x in db_val.split("//")]
228
- temp = float(user_val)
229
- return 1 if low <= temp <= high else -1
230
- except Exception:
231
- return 0
232
-
233
- # Flexible match: partial overlap counts as match
234
- match_found = any(
235
- any(u in db_opt or db_opt in u for db_opt in db_options) for u in user_options
236
- )
237
-
238
- if match_found:
239
- return 1
240
- else:
241
- if field_name in hard_exclusions:
242
- return -999 # Hard exclusion
243
- return -1
244
-
245
- # -----------------------------
246
- # Suggest Next Tests
247
- # -----------------------------
248
- def suggest_next_tests(self, top_results: List[IdentificationResult]) -> List[str]:
249
- """Suggest 3 tests that best differentiate top matches."""
250
- if len(top_results) < 2:
251
- return []
252
- varying_fields = []
253
- top3 = top_results[:3]
254
-
255
- for field in self.db.columns:
256
- if field in ["Genus", "Extra Notes", "Colony Morphology"]:
257
- continue
258
-
259
- field_values = set()
260
- for r in top3:
261
- field_values.update(r.matched_fields)
262
- field_values.update(r.mismatched_fields)
263
-
264
- if len(field_values) > 1:
265
- varying_fields.append(field)
266
-
267
- random.shuffle(varying_fields)
268
- return varying_fields[:3]
269
-
270
- # -----------------------------
271
- # Extended Input Extraction
272
- # -----------------------------
273
- def _extract_extended_input(self, user_input: Dict[str, str]) -> Dict[str, str]:
274
- """
275
- Extract extended tests (those in extended_schema.json but not part of the core db).
276
- Only keep Positive/Negative/Variable (ignore Unknown/empty).
277
- """
278
- ext_in = {}
279
- for field in self.extended_fields:
280
- val = user_input.get(field, "Unknown")
281
- if isinstance(val, str) and val.lower() in ("positive", "negative", "variable"):
282
- ext_in[field] = val.capitalize()
283
- return ext_in
284
-
285
- # -----------------------------
286
- # Main Identification Routine
287
- # -----------------------------
288
- def identify(self, user_input: Dict[str, str]) -> List[IdentificationResult]:
289
- """Compare user input to database and rank possible genera with blended scoring."""
290
- results: List[IdentificationResult] = []
291
- total_fields_possible = len([c for c in self.db.columns if c != "Genus"])
292
-
293
- # 1) Core scoring loop against bacteria_db.xlsx
294
- for _, row in self.db.iterrows():
295
- genus = row["Genus"]
296
- total_score = 0
297
- matched_fields: List[str] = []
298
- mismatched_fields: List[str] = []
299
- reasoning_factors: Dict[str, str] = {}
300
- total_fields_evaluated = 0
301
-
302
- for field in self.db.columns:
303
- if field == "Genus":
304
- continue
305
-
306
- db_val = row[field]
307
- user_val = user_input.get(field, "")
308
- score = self.compare_field(db_val, user_val, field)
309
-
310
- # Count only real inputs for relative confidence
311
- if user_val and str(user_val).lower() != "unknown":
312
- total_fields_evaluated += 1
313
-
314
- if score == -999:
315
- total_score = -999
316
- break # Hard exclusion ends comparison
317
-
318
- elif score == 1:
319
- total_score += 1
320
- matched_fields.append(field)
321
- reasoning_factors[field] = user_val
322
-
323
- elif score == -1:
324
- total_score -= 1
325
- mismatched_fields.append(field)
326
-
327
- # Append valid genus result
328
- if total_score > -999:
329
- extra_notes = row.get("Extra Notes", "")
330
- results.append(
331
- IdentificationResult(
332
- genus=genus,
333
- total_score=total_score,
334
- matched_fields=matched_fields,
335
- mismatched_fields=mismatched_fields,
336
- reasoning_factors=reasoning_factors,
337
- total_fields_evaluated=total_fields_evaluated,
338
- total_fields_possible=total_fields_possible,
339
- extra_notes=extra_notes,
340
- )
341
- )
342
-
343
- if not results:
344
- return []
345
-
346
- # 2) Suggest next tests for top core results
347
- top_suggestions = self.suggest_next_tests(results)
348
- for r in results[:3]:
349
- r.reasoning_factors["next_tests"] = ", ".join(top_suggestions)
350
-
351
- # 3) Extended likelihoods (if user provided extended tests)
352
- ext_input = self._extract_extended_input(user_input)
353
- ext_scores: Dict[str, float] = {}
354
- ext_explanation = ""
355
-
356
- if ext_input:
357
- ranked, ext_explanation = score_genera_from_extended(ext_input)
358
- ext_scores = {g: s for g, s in ranked}
359
-
360
- # Attach extended scores/explanations to each result
361
- if ext_scores:
362
- for r in results:
363
- if r.genus in ext_scores:
364
- r.extended_likelihood = ext_scores[r.genus]
365
- else:
366
- # If genus not in signals, treat as neutral (no info)
367
- r.extended_likelihood = None
368
- r.extended_explanation = ext_explanation
369
- else:
370
- for r in results:
371
- r.extended_likelihood = None
372
- r.extended_explanation = ""
373
-
374
- # 4) Sort results
375
- if any(r.extended_likelihood is not None for r in results):
376
- # Sort by blended confidence when extended data is present
377
- results.sort(key=lambda x: x.blended_confidence_raw(), reverse=True)
378
- else:
379
- # Fallback to core total_score
380
- results.sort(key=lambda x: x.total_score, reverse=True)
381
-
382
- # Return top 10
383
- return results[:10]
 
1
+ # engine/bacteria_identifier.py
2
+ # ------------------------------------------------------------
3
+ # Core BactAI-D identification engine.
4
+ # - Scores genera from Excel DB (core phenotype fields)
5
+ # - Integrates optional extended-test reasoning
6
+ # - Provides blended confidence and narrative reasoning
7
+ # ------------------------------------------------------------
8
+
9
+ from __future__ import annotations
10
+
11
+ import re
12
+ from dataclasses import dataclass, field
13
+ from typing import Dict, List, Any, Optional, Tuple
14
+
15
+ import pandas as pd
16
+
17
+ try:
18
+ from engine.extended_reasoner import score_genera_from_extended
19
+ HAS_EXTENDED_REASONER = True
20
+ except Exception:
21
+ HAS_EXTENDED_REASONER = False
22
+
23
+
24
+ # ------------------------------------------------------------
25
+ # Helper
26
+ # ------------------------------------------------------------
27
+
28
def join_with_and(items: List[str]) -> str:
    """Join *items* into a readable list, e.g. ['a', 'b', 'c'] -> 'a, b and c'."""
    if len(items) == 0:
        return ""
    *head, tail = items
    if not head:
        return tail
    return f"{', '.join(head)} and {tail}"
34
+
35
+
36
+ # ------------------------------------------------------------
37
+ # Identification Result
38
+ # ------------------------------------------------------------
39
+
40
@dataclass
class IdentificationResult:
    """
    Scoring outcome for one candidate genus.

    Combines the core rule-based score (from the database fields) with an
    optional extended-test likelihood, and renders a narrative explanation.

    Attributes:
        genus: Candidate genus name.
        total_score: Net core score (+1 per matched field, -1 per mismatch).
        matched_fields: Field names whose values agreed with the user input.
        mismatched_fields: Field names whose values disagreed.
        reasoning_factors: Field -> user value for matched fields; may also
            carry auxiliary entries such as "next_tests".
        total_fields_evaluated: Count of non-empty, non-"Unknown" user inputs.
        total_fields_possible: Count of all scoreable database fields.
        extra_notes: Free-text notes copied from the database row.
        extended_score: Extended-reasoner likelihood in 0.0-1.0;
            0.0 means "no extended signal".
        extended_explanation: Human-readable explanation of the extended score.
    """

    genus: str
    total_score: int
    matched_fields: List[str] = field(default_factory=list)
    mismatched_fields: List[str] = field(default_factory=list)
    reasoning_factors: Dict[str, Any] = field(default_factory=dict)
    total_fields_evaluated: int = 0
    total_fields_possible: int = 0
    extra_notes: str = ""
    extended_score: float = 0.0  # 0.0-1.0
    extended_explanation: str = ""

    # ---------- Confidence metrics ----------

    def confidence_percent(self) -> int:
        """Confidence (0-100) relative to the tests the user actually entered."""
        if self.total_fields_evaluated <= 0:
            return 0
        pct = (self.total_score / self.total_fields_evaluated) * 100
        return max(0, min(100, int(round(pct))))

    def true_confidence(self) -> int:
        """Confidence (0-100) relative to every scoreable field in the database."""
        if self.total_fields_possible <= 0:
            return 0
        pct = (self.total_score / self.total_fields_possible) * 100
        return max(0, min(100, int(round(pct))))

    def blended_confidence_percent(self) -> int:
        """
        Blend core confidence with the extended signal (70% core, 30% extended).

        Returns the core confidence unchanged when no extended signal is
        present (extended_score <= 0).
        """
        core = self.confidence_percent()
        if self.extended_score <= 0:
            return core

        ext_pct = max(0.0, min(1.0, float(self.extended_score))) * 100.0
        blended = 0.7 * core + 0.3 * ext_pct
        return max(0, min(100, int(round(blended))))

    # ---------- Reasoning text ----------

    def reasoning_paragraph(self, ranked_results: Optional[List["IdentificationResult"]] = None) -> str:
        """
        Generate a narrative explanation from the core matches.

        Args:
            ranked_results: Full ranked result list (self first) used to add a
                comparative sentence against the next-closest genera.

        Returns:
            A markdown-flavored paragraph describing the identification.
        """
        if not self.matched_fields and not self.reasoning_factors:
            return "No significant biochemical or morphological matches were found."

        import random  # local import: only needed for narrative variety

        intro = random.choice(
            [
                "Based on the observed biochemical and morphological traits,",
                "According to the provided test results,",
                "From the available laboratory findings,",
                "Considering the entered reactions and colony characteristics,",
            ]
        )

        # Build key descriptive highlights from the well-known core fields.
        highlights = []

        gram = self.reasoning_factors.get("Gram Stain")
        if gram:
            highlights.append(f"it is **Gram {str(gram).lower()}**")

        shape = self.reasoning_factors.get("Shape")
        if shape:
            highlights.append(f"with a **{str(shape).lower()}** morphology")

        catalase = self.reasoning_factors.get("Catalase")
        if catalase:
            highlights.append(f"and **catalase {str(catalase).lower()}** activity")

        oxidase = self.reasoning_factors.get("Oxidase")
        if oxidase:
            highlights.append(f"and **oxidase {str(oxidase).lower()}** reaction")

        oxy = self.reasoning_factors.get("Oxygen Requirement")
        if oxy:
            highlights.append(f"which prefers **{str(oxy).lower()}** conditions")

        if len(highlights) > 1:
            summary = ", ".join(highlights[:-1]) + " and " + highlights[-1]
        elif highlights:
            summary = highlights[0]
        else:
            summary = ""

        # Confidence text (core)
        core_conf = self.confidence_percent()
        if core_conf >= 70:
            confidence_text = "The confidence in this identification is high."
        elif core_conf >= 40:
            confidence_text = "The confidence in this identification is moderate."
        else:
            confidence_text = "The confidence in this identification is low."

        # Comparison vs other top results
        comparison = ""
        if ranked_results and len(ranked_results) > 1:
            close_others = ranked_results[1:3]
            other_names = [r.genus for r in close_others]
            if other_names:
                if self.total_score >= close_others[0].total_score:
                    comparison = (
                        f" It is **more likely** than {join_with_and(other_names)} "
                        f"based on stronger alignment in {join_with_and(self.matched_fields[:3])}."
                    )
                else:
                    comparison = (
                        f" It is **less likely** than {join_with_and(other_names)} "
                        f"due to differences in {join_with_and(self.mismatched_fields[:3])}."
                    )

        # Bug fix: when no highlight field matched, the old code emitted
        # "<intro> , the isolate ..." with a stray comma and double space.
        if summary:
            lead = f"{intro} {summary}, the isolate most closely resembles **{self.genus}**."
        else:
            lead = f"{intro} the isolate most closely resembles **{self.genus}**."
        return f"{lead} {confidence_text}{comparison}"
154
+
155
+
156
+ # ------------------------------------------------------------
157
+ # Bacteria Identifier
158
+ # ------------------------------------------------------------
159
+
160
class BacteriaIdentifier:
    """
    Main engine to match bacterial genus based on biochemical & morphological data.

    Scores each genus row of the database against the user-entered phenotype
    fields, optionally blends in extended-test likelihoods from
    ``engine.extended_reasoner``, and returns the ranked candidates.
    """

    def __init__(self, db: pd.DataFrame):
        # Blank out NaNs so every cell compares as a plain string.
        self.db: pd.DataFrame = db.fillna("")
        self.db_columns = list(self.db.columns)

    # ---------- Field comparison ----------

    def compare_field(self, db_val: Any, user_val: Any, field_name: str) -> int:
        """
        Compare one test field between database and user input.

        Returns:
            +1   match
            -1   mismatch
             0   unknown / empty / "variable" (ignored)
            -999 hard exclusion (fundamental trait mismatch; stop this genus)
        """
        if user_val is None:
            return 0

        user_str = str(user_val).strip()
        if user_str == "" or user_str.lower() == "unknown":
            return 0  # ignore unknown/empty

        db_str = str(db_val).strip()

        hard_exclusions = {"Gram Stain", "Shape", "Spore Formation"}

        # Multi-valued cells may be separated by ; / or ,
        db_options = [p.strip().lower() for p in re.split(r"[;/,]", db_str) if p.strip()]
        user_options = [p.strip().lower() for p in re.split(r"[;/,]", user_str) if p.strip()]

        # "variable" on either side carries no discriminating information.
        if "variable" in db_options or "variable" in user_options:
            return 0

        # Growth Temperature is stored as a "low//high" range; the user may
        # give a single number or a degenerate "t//t" range (lower bound used).
        if field_name == "Growth Temperature":
            try:
                if "//" in db_str:
                    low_s, high_s = db_str.split("//", 1)
                    low = float(low_s)
                    high = float(high_s)
                    if "//" in user_str:
                        ut = float(user_str.split("//", 1)[0])
                    else:
                        ut = float(user_str)
                    return 1 if low <= ut <= high else -1
            except Exception:
                return 0  # unparsable temperature: treat as no information

        # Flexible match: exact equality or substring overlap in either direction.
        match_found = any(
            u == d or u in d or d in u
            for u in user_options
            for d in db_options
        )
        if match_found:
            return 1

        # A mismatch on a fundamental trait rules the genus out entirely.
        if field_name in hard_exclusions:
            return -999
        return -1

    # ---------- Next-test suggestions ----------

    def suggest_next_tests(
        self,
        top_results: List[IdentificationResult],
        user_input: Dict[str, Any],
        max_tests: int = 3,
    ) -> List[str]:
        """
        Suggest up to *max_tests* fields that (a) the user has not already
        answered with a known value and (b) take differing values among the
        top-ranked genera — i.e. the tests most likely to discriminate them.
        """
        if not top_results:
            return []

        genus_col = self.db.get("Genus")
        if genus_col is None:
            return []

        # Restrict the DB to the top candidates ONCE, instead of rescanning
        # every row for every field (previously O(fields x rows)).
        top_names = {r.genus for r in top_results[:5]}
        subset = self.db[genus_col.isin(top_names)]

        varying_fields: List[str] = []
        for field_name in self.db_columns:
            if field_name == "Genus":
                continue

            # Skip fields the user already filled with a known value.
            u_val = user_input.get(field_name, "")
            if isinstance(u_val, str) and u_val.lower() not in {"", "unknown"}:
                continue

            # A field is discriminating if the top genera disagree on it.
            values_for_field = {
                v
                for v in (str(x).strip().lower() for x in subset[field_name])
                if v
            }
            if len(values_for_field) > 1:
                varying_fields.append(field_name)

        # Deterministic: take the first few in column order.
        return varying_fields[:max_tests]

    # ---------- Main identification routine ----------

    def identify(self, user_input: Dict[str, Any]) -> List[IdentificationResult]:
        """
        Compare user input to the database and rank candidate genera.

        Extended-test likelihoods (when the extended reasoner is importable)
        are attached to each result AND used for ranking via the blended
        confidence; otherwise ranking falls back to the core score alone.
        """
        results: List[IdentificationResult] = []
        total_fields_possible = len([c for c in self.db_columns if c != "Genus"])

        # Pre-compute extended scores once for the whole call.
        extended_scores: Dict[str, float] = {}
        extended_explanation: str = ""

        if HAS_EXTENDED_REASONER:
            try:
                ranked_ext, explanation = score_genera_from_extended(user_input)
                extended_explanation = explanation or ""
                extended_scores = {str(g): float(s) for g, s in ranked_ext}
            except Exception:
                # Extended reasoning is best-effort; never break core scoring.
                extended_scores = {}
                extended_explanation = ""

        for _, row in self.db.iterrows():
            genus = str(row.get("Genus", "")).strip()
            if not genus:
                continue

            total_score = 0
            matched_fields: List[str] = []
            mismatched_fields: List[str] = []
            reasoning_factors: Dict[str, Any] = {}
            total_fields_evaluated = 0
            hard_excluded = False

            for field_name in self.db_columns:
                if field_name == "Genus":
                    continue

                user_val = user_input.get(field_name, "")
                score = self.compare_field(row.get(field_name, ""), user_val, field_name)

                # Count only real (non-empty, non-"Unknown") inputs for confidence.
                if (
                    user_val is not None
                    and str(user_val).strip() != ""
                    and str(user_val).strip().lower() != "unknown"
                ):
                    total_fields_evaluated += 1

                if score == -999:
                    hard_excluded = True
                    break
                if score == 1:
                    total_score += 1
                    matched_fields.append(field_name)
                    reasoning_factors[field_name] = user_val
                elif score == -1:
                    total_score -= 1
                    mismatched_fields.append(field_name)

            if hard_excluded:
                continue  # fundamental trait mismatch: drop this genus entirely

            extra_notes = str(row.get("Extra Notes", "")).strip() if "Extra Notes" in row else ""

            r = IdentificationResult(
                genus=genus,
                total_score=total_score,
                matched_fields=matched_fields,
                mismatched_fields=mismatched_fields,
                reasoning_factors=reasoning_factors,
                total_fields_evaluated=total_fields_evaluated,
                total_fields_possible=total_fields_possible,
                extra_notes=extra_notes,
            )

            if genus in extended_scores:
                r.extended_score = extended_scores[genus]
                r.extended_explanation = extended_explanation

            results.append(r)

        # Bug fix: extended scores were attached but IGNORED during ranking.
        # Rank by blended confidence when any extended signal exists, with the
        # core score as a tiebreaker; otherwise rank by core score alone.
        if any(r.extended_score > 0 for r in results):
            results.sort(
                key=lambda r: (r.blended_confidence_percent(), r.total_score),
                reverse=True,
            )
        else:
            results.sort(key=lambda r: r.total_score, reverse=True)

        # Suggest next tests for the top few candidates.
        if results:
            next_tests = self.suggest_next_tests(results[:5], user_input)
            next_tests_str = ", ".join(next_tests) if next_tests else ""
            for r in results[:5]:
                r.reasoning_factors["next_tests"] = next_tests_str

        # Return top 10
        return results[:10]