EphAsad committed on
Commit
83b046c
·
verified ·
1 Parent(s): 7132230

Update engine/parser_fusion.py

Browse files
Files changed (1) hide show
  1. engine/parser_fusion.py +263 -367
engine/parser_fusion.py CHANGED
@@ -1,436 +1,332 @@
1
- # engine/parser_llm.py
2
  # ------------------------------------------------------------
3
- # Local LLM parser for BactAI-D (Flan-T5, CPU-friendly)
4
- # Third parser head: repair & recovery
5
  #
6
- # Drop-in patched version:
7
- # - Few-shot examples increased to 15 (configurable via env)
8
- # - Field alias mapping (prevents silent field drops)
9
- # - Non-greedy JSON extraction (prevents regex over-capture)
10
- # - Improved P/N/V normalization (Flan phrasing coverage)
11
- # - Prompt refined for "extract/clarify" (reduces Unknown collapse)
12
- # - Debug prints (toggle via env var)
13
- # - Sugar logic scaffold preserved
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  # ------------------------------------------------------------
15
 
16
  from __future__ import annotations
17
 
18
  import json
19
  import os
20
- import random
21
- import re
22
- from typing import Dict, Any, List, Optional
23
-
24
- import torch
25
- from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
26
-
27
- # ------------------------------------------------------------
28
- # Model configuration
29
- # ------------------------------------------------------------
30
-
31
- DEFAULT_MODEL = os.getenv(
32
-     "BACTAI_LLM_PARSER_MODEL",
33
-     "google/flan-t5-base",
34
- )
35
-
36
- # You asked to raise snapshots to 15
37
- MAX_FEWSHOT_EXAMPLES = int(os.getenv("BACTAI_LLM_FEWSHOT", "25"))
38
 
39
- MAX_NEW_TOKENS = int(os.getenv("BACTAI_LLM_MAX_NEW_TOKENS", "128"))
 
40
 
41
- # Debug visibility (prints raw model output + parsed dict)
42
- DEBUG_LLM = os.getenv("BACTAI_LLM_DEBUG", "1").strip().lower() in {"1", "true", "yes", "y", "on"}
 
 
 
 
 
43
 
44
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
45
 
46
- _tokenizer: Optional[AutoTokenizer] = None
47
- _model: Optional[AutoModelForSeq2SeqLM] = None
48
- _GOLD_EXAMPLES: Optional[List[Dict[str, Any]]] = None
49
-
50
- # ------------------------------------------------------------
51
- # Allowed fields
52
- # ------------------------------------------------------------
53
-
54
- ALL_FIELDS: List[str] = [
55
-     "Gram Stain",
56
-     "Shape",
57
-     "Motility",
58
-     "Capsule",
59
-     "Spore Formation",
60
-     "Haemolysis",
61
-     "Haemolysis Type",
62
-     "Media Grown On",
63
-     "Colony Morphology",
64
-     "Oxygen Requirement",
65
-     "Growth Temperature",
66
-     "Catalase",
67
-     "Oxidase",
68
-     "Indole",
69
-     "Urease",
70
-     "Citrate",
71
-     "Methyl Red",
72
-     "VP",
73
-     "H2S",
74
-     "DNase",
75
-     "ONPG",
76
-     "Coagulase",
77
-     "Gelatin Hydrolysis",
78
-     "Esculin Hydrolysis",
79
-     "Nitrate Reduction",
80
-     "NaCl Tolerant (>=6%)",
81
-     "Lipase Test",
82
-     "Lysine Decarboxylase",
83
-     "Ornithine Decarboxylase",
84
-     "Ornitihine Decarboxylase",
85
-     "Arginine dihydrolase",
86
-     "Glucose Fermentation",
87
-     "Lactose Fermentation",
88
-     "Sucrose Fermentation",
89
-     "Maltose Fermentation",
90
-     "Mannitol Fermentation",
91
-     "Sorbitol Fermentation",
92
-     "Xylose Fermentation",
93
-     "Rhamnose Fermentation",
94
-     "Arabinose Fermentation",
95
-     "Raffinose Fermentation",
96
-     "Trehalose Fermentation",
97
-     "Inositol Fermentation",
98
-     "Gas Production",
99
-     "TSI Pattern",
100
-     "Colony Pattern",
101
-     "Pigment",
102
-     "Motility Type",
103
-     "Odor",
104
- ]
105
-
106
- SUGAR_FIELDS = [
107
-     "Glucose Fermentation",
108
-     "Lactose Fermentation",
109
-     "Sucrose Fermentation",
110
-     "Maltose Fermentation",
111
-     "Mannitol Fermentation",
112
-     "Sorbitol Fermentation",
113
-     "Xylose Fermentation",
114
-     "Rhamnose Fermentation",
115
-     "Arabinose Fermentation",
116
-     "Raffinose Fermentation",
117
-     "Trehalose Fermentation",
118
-     "Inositol Fermentation",
119
- ]
120
-
121
- PNV_FIELDS = set(
122
-     f for f in ALL_FIELDS
123
-     if f not in {
124
-         "Media Grown On",
125
-         "Colony Morphology",
126
-         "Growth Temperature",
127
-         "Gram Stain",
128
-         "Shape",
129
-         "Oxygen Requirement",
130
-         "Haemolysis Type",
131
-     }
132
- )
133
-
134
- # ------------------------------------------------------------
135
- # Field alias mapping (CRITICAL)
136
- # ------------------------------------------------------------
137
 
138
- FIELD_ALIASES: Dict[str, str] = {
139
-     # Gram
140
-     "Gram": "Gram Stain",
141
-     "Gram stain": "Gram Stain",
142
-     "Gram Stain Result": "Gram Stain",
143
-
144
-     # Salt tolerance
145
-     "NaCl tolerance": "NaCl Tolerant (>=6%)",
146
-     "NaCl Tolerant": "NaCl Tolerant (>=6%)",
147
-     "Salt tolerance": "NaCl Tolerant (>=6%)",
148
-     "Salt tolerant": "NaCl Tolerant (>=6%)",
149
-     "6.5% NaCl": "NaCl Tolerant (>=6%)",
150
-     "6% NaCl": "NaCl Tolerant (>=6%)",
151
-
152
-     # Temperature
153
-     "Growth temp": "Growth Temperature",
154
-     "Growth temperature": "Growth Temperature",
155
-     "Temperature growth": "Growth Temperature",
156
-
157
-     # Tests
158
-     "Catalase test": "Catalase",
159
-     "Oxidase test": "Oxidase",
160
-     "Indole test": "Indole",
161
-     "Urease test": "Urease",
162
-     "Citrate test": "Citrate",
163
-
164
-     # Sugars (common lowercase variants)
165
-     "Glucose fermentation": "Glucose Fermentation",
166
-     "Lactose fermentation": "Lactose Fermentation",
167
-     "Sucrose fermentation": "Sucrose Fermentation",
168
-     "Maltose fermentation": "Maltose Fermentation",
169
-     "Mannitol fermentation": "Mannitol Fermentation",
170
-     "Sorbitol fermentation": "Sorbitol Fermentation",
171
-     "Xylose fermentation": "Xylose Fermentation",
172
-     "Rhamnose fermentation": "Rhamnose Fermentation",
173
-     "Arabinose fermentation": "Arabinose Fermentation",
174
-     "Raffinose fermentation": "Raffinose Fermentation",
175
-     "Trehalose fermentation": "Trehalose Fermentation",
176
-     "Inositol fermentation": "Inositol Fermentation",
177
- }
178
 
179
  # ------------------------------------------------------------
180
- # Normalisation helpers
181
  # ------------------------------------------------------------
182
 
183
- def _norm_str(s: Any) -> str:
184
-     return str(s).strip() if s is not None else ""
185
-
186
- def _normalise_pnv_value(raw: Any) -> str:
187
      """
188
-     Expanded Flan-friendly normalization.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
      """
190
-     s = _norm_str(raw).lower()
191
-     if not s:
192
-         return "Unknown"
193
-
194
-     if any(x in s for x in {"positive", "pos", "+", "yes", "present", "detected", "reactive"}):
195
-         return "Positive"
196
-
197
-     if any(x in s for x in {"negative", "neg", "-", "no", "none", "absent", "not detected", "no growth"}):
198
-         return "Negative"
199
-
200
-     if "variable" in s or "mixed" in s or "inconsistent" in s:
201
-         return "Variable"
202
-
203
-     return "Unknown"
204
 
205
- def _normalise_gram(raw: Any) -> str:
206
-     s = _norm_str(raw).lower()
207
-     if "positive" in s:
208
-         return "Positive"
209
-     if "negative" in s:
210
-         return "Negative"
211
-     if "variable" in s:
212
-         return "Variable"
213
-     return "Unknown"
214
 
215
- def _merge_ornithine_variants(fields: Dict[str, str]) -> Dict[str, str]:
216
-     v = fields.get("Ornithine Decarboxylase") or fields.get("Ornitihine Decarboxylase")
217
-     if v and v != "Unknown":
218
-         fields["Ornithine Decarboxylase"] = v
219
-         fields["Ornitihine Decarboxylase"] = v
220
-     return fields
221
 
222
- # ------------------------------------------------------------
223
- # Sugar logic (RESTORED)
224
- # ------------------------------------------------------------
225
 
226
# Phrases indicating the organism ferments no sugars at all.
_NON_FERMENTER_PATTERNS = re.compile(
    r"\b(non[-\s]?fermenter|non[-\s]?fermentative|asaccharolytic|"
    r"does not ferment (sugars|carbohydrates)|no carbohydrate fermentation)\b",
    re.IGNORECASE,
)

def _apply_global_sugar_logic(fields: Dict[str, str], original_text: str) -> Dict[str, str]:
    """Mark every sugar field Negative when the text declares a global
    non-fermenter phenotype, unless a field was explicitly set to
    Positive or Variable. Mutates and returns *fields*.
    """
    if _NON_FERMENTER_PATTERNS.search(original_text) is None:
        return fields

    for sugar in SUGAR_FIELDS:
        # Explicit per-sugar results override the global default.
        if fields.get(sugar) not in ("Positive", "Variable"):
            fields[sugar] = "Negative"
    return fields
247
 
248
- # ------------------------------------------------------------
249
- # Gold examples
250
- # ------------------------------------------------------------
251
 
252
- def _get_project_root() -> str:
253
-     return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
254
 
255
def _load_gold_examples() -> List[Dict[str, Any]]:
    """Load and memoise few-shot gold examples from data/llm_gold_examples.json.

    Any read/parse failure, or a file whose top-level value is not a list,
    yields an empty list (the parser then runs without few-shot examples).
    """
    global _GOLD_EXAMPLES
    if _GOLD_EXAMPLES is None:
        path = os.path.join(_get_project_root(), "data", "llm_gold_examples.json")
        try:
            with open(path, "r", encoding="utf-8") as fh:
                payload = json.load(fh)
        except Exception:
            payload = None
        _GOLD_EXAMPLES = payload if isinstance(payload, list) else []
    return _GOLD_EXAMPLES
269
-
270
- # ------------------------------------------------------------
271
- # Prompt
272
- # ------------------------------------------------------------
273
 
274
- PROMPT_HEADER = """
275
- You are a microbiology expert assisting an automated phenotype parser.
276
 
277
- Your task is to EXTRACT OR CLARIFY phenotypic and biochemical test results
278
- from the input text.
 
279
 
280
- Rules:
281
- - Return ONLY valid JSON
282
- - Do NOT invent results
283
- - If a result is unclear or not stated, use "Unknown"
284
- - Prefer explicit statements over assumptions
285
 
286
- Output format:
287
- {
288
-   "parsed_fields": {
289
-     "Field Name": "Value",
290
-     ...
291
-   }
292
- }
293
- """
294
 
295
- PROMPT_FOOTER = """
296
- Now process the following phenotype description.
297
 
298
- Input:
299
- \"\"\"<<PHENOTYPE>>\"\"\"
300
 
301
- Return ONLY the JSON object.
302
- """
 
 
 
303
 
304
def _build_prompt(text: str) -> str:
    """Assemble the full prompt: header, up to MAX_FEWSHOT_EXAMPLES sampled
    gold examples, and the footer with *text* substituted in.
    """
    gold = _load_gold_examples()
    k = min(MAX_FEWSHOT_EXAMPLES, len(gold))
    chosen = random.sample(gold, k) if k > 0 else []

    parts: List[str] = [PROMPT_HEADER]
    for example in chosen:
        inp = _norm_str(example.get("input", ""))
        expected = example.get("expected", {})
        if not isinstance(expected, dict):
            expected = {}
        rendered = json.dumps({"parsed_fields": expected}, ensure_ascii=False)
        parts.append(f'Input:\n"""{inp}"""\nOutput:\n{rendered}\n')

    parts.append(PROMPT_FOOTER.replace("<<PHENOTYPE>>", text))
    return "\n".join(parts)
323
 
324
- # ------------------------------------------------------------
325
- # Model loader
326
- # ------------------------------------------------------------
327
 
328
def _load_model() -> None:
    """Lazily initialise the tokenizer/model singletons (idempotent).

    Downloads DEFAULT_MODEL on first call, moves the model to DEVICE and
    switches it to eval mode.
    """
    global _model, _tokenizer
    if _tokenizer is None or _model is None:
        _tokenizer = AutoTokenizer.from_pretrained(DEFAULT_MODEL)
        model = AutoModelForSeq2SeqLM.from_pretrained(DEFAULT_MODEL).to(DEVICE)
        model.eval()
        _model = model
335
 
336
  # ------------------------------------------------------------
337
- # JSON extraction (non-greedy)
338
  # ------------------------------------------------------------
339
 
340
- _JSON_OBJECT_RE = re.compile(r"\{[\s\S]*?\}")
341
-
342
- def _extract_first_json_object(text: str) -> Dict[str, Any]:
343
      """
344
-     Extract the first JSON object from model output (non-greedy).
345
      """
346
-     m = _JSON_OBJECT_RE.search(text)
347
-     if not m:
348
-         return {}
349
-     try:
350
-         return json.loads(m.group(0))
351
-     except Exception:
352
-         return {}
 
353
 
354
def _apply_field_aliases(fields_raw: Dict[str, Any]) -> Dict[str, Any]:
    """Rewrite keys through FIELD_ALIASES so variant spellings land on
    canonical field names; values pass through untouched.

    Keys that strip to the empty string are dropped. Later keys mapping to
    the same canonical name overwrite earlier ones.
    """
    remapped: Dict[str, Any] = {}
    for raw_key, value in fields_raw.items():
        key = str(raw_key).strip() if raw_key is not None else ""
        if key:
            remapped[FIELD_ALIASES.get(key, key)] = value
    return remapped
366
 
367
- # ------------------------------------------------------------
368
- # PUBLIC API
369
- # ------------------------------------------------------------
370
 
371
def parse_llm(text: str) -> Dict[str, Any]:
    """
    Run the Flan-T5 parser over a phenotype description.

    Pipeline: build few-shot prompt → greedy generation → extract first JSON
    object → alias keys → normalise per-field values → post-process
    (ornithine merge, global sugar logic).

    Parameters
    ----------
    text : str
        Free-text phenotype description; empty/whitespace input short-circuits.

    Returns
    -------
    Dict[str, Any]
        {"parsed_fields": {field: value}, "source": "llm_parser", "raw": text}.
    """
    original = text or ""
    if not original.strip():
        # Nothing to parse — skip model load entirely.
        return {"parsed_fields": {}, "source": "llm_parser", "raw": original}

    _load_model()
    assert _tokenizer is not None and _model is not None

    prompt = _build_prompt(original)

    # NOTE: Flan-T5 has a relatively small input length; truncation may occur.
    inputs = _tokenizer(prompt, return_tensors="pt", truncation=True).to(DEVICE)

    with torch.no_grad():
        # Greedy decoding (do_sample=False) for deterministic output;
        # temperature is ignored when sampling is off.
        output = _model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            temperature=0.0,
        )

    decoded = _tokenizer.decode(output[0], skip_special_tokens=True)

    if DEBUG_LLM:
        print("=== LLM RAW OUTPUT ===")
        print(decoded)
        print("======================")

    # Pull the first JSON object out of the free-form generation.
    parsed_obj = _extract_first_json_object(decoded)
    fields_raw = parsed_obj.get("parsed_fields", {}) if isinstance(parsed_obj, dict) else {}
    if not isinstance(fields_raw, dict):
        fields_raw = {}

    # Apply alias mapping so we don't silently drop values
    fields_raw = _apply_field_aliases(fields_raw)

    if DEBUG_LLM:
        print("=== LLM PARSED_FIELDS (RAW) ===")
        try:
            print(json.dumps(fields_raw, indent=2, ensure_ascii=False))
        except Exception:
            print(fields_raw)
        print("===============================")

    cleaned: Dict[str, str] = {}

    # Only whitelisted fields survive; each gets field-appropriate
    # normalisation (Gram-specific, P/N/V, or plain string).
    for field in ALL_FIELDS:
        if field not in fields_raw:
            continue
        raw_val = fields_raw[field]

        if field == "Gram Stain":
            cleaned[field] = _normalise_gram(raw_val)
        elif field in PNV_FIELDS:
            cleaned[field] = _normalise_pnv_value(raw_val)
        else:
            cleaned[field] = _norm_str(raw_val) or "Unknown"

    cleaned = _merge_ornithine_variants(cleaned)
    cleaned = _apply_global_sugar_logic(cleaned, original)

    return {
        "parsed_fields": cleaned,
        "source": "llm_parser",
        "raw": original,
    }
 
 
1
+ # engine/parser_fusion.py
2
  # ------------------------------------------------------------
3
+ # Tri-Parser Fusion Stage 12B (Weighted, SOTA-style)
 
4
  #
5
+ # This module combines:
6
+ #   - Rule parser (parser_rules.parse_text_rules)
7
+ #   - Extended parser (parser_ext.parse_text_extended)
8
+ #   - LLM parser (parser_llm.parse_llm)    [optional]
9
+ #
10
+ # using per-field reliability weights learned in Stage 12A
11
+ # and stored in:
12
+ #   data/field_weights.json
13
+ #
14
+ # Behaviour:
15
+ #   - For each field, gather predictions from available parsers.
16
+ #   - For that field, load weights:
17
+ #          field_weights[field]  (if present)
18
+ #          else global weights
19
+ #          else equal weights across available parsers
20
+ #   - Discard parsers that:
21
+ #          * did not predict the field
22
+ #          * or only predicted "Unknown"
23
+ #   - Group by predicted value and sum the weights of parsers
24
+ #     that voted for each value.
25
+ #   - Choose the value with highest total weight.
26
+ #     Tie-break: prefer rules > extended > llm if needed.
27
+ #
28
+ # Output format:
29
+ #   {
30
+ #     "fused_fields": { field: value, ... },   # used by DB identifier AND genus ML
31
+ #     "by_parser": {
32
+ #       "rules": { ... },
33
+ #       "extended": { ... },
34
+ #       "llm": { ... }   # may be empty
35
+ #     },
36
+ #     "votes": {
37
+ #       field_name: {
38
+ #         "per_parser": {
39
+ #           "rules": {"value": "Positive", "weight": 0.95},
40
+ #           "extended": {"value": "Unknown", "weight": 0.03},
41
+ #           ...
42
+ #         },
43
+ #         "summed": {
44
+ #           "Positive": 0.97,
45
+ #           "Negative": 0.02
46
+ #         },
47
+ #         "chosen": "Positive"
48
+ #       },
49
+ #       ...
50
+ #     },
51
+ #     "weights_meta": {
52
+ #       "has_weights_file": True/False,
53
+ #       "weights_path": "data/field_weights.json",
54
+ #       "meta": { ... }  # from file if present
55
+ #     }
56
+ #   }
57
  # ------------------------------------------------------------
58
 
59
  from __future__ import annotations
60
 
61
  import json
62
  import os
63
+ from typing import Any, Dict, Optional
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
+ from engine.parser_rules import parse_text_rules
66
+ from engine.parser_ext import parse_text_extended
67
 
68
+ # Optional LLM parser
69
+ try:
70
+     from engine.parser_llm import parse_llm as parse_text_llm  # type: ignore
71
+     HAS_LLM = True
72
+ except Exception:
73
+     parse_text_llm = None  # type: ignore
74
+     HAS_LLM = False
75
 
76
+ # Path to learned weights
77
+ FIELD_WEIGHTS_PATH = os.path.join("data", "field_weights.json")
78
 
79
+ UNKNOWN = "Unknown"
80
+ PARSER_ORDER = ["rules", "extended", "llm"]  # used for tie-breaking
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
  # ------------------------------------------------------------
84
+ # Weights loading and helpers
85
  # ------------------------------------------------------------
86
 
87
def _load_field_weights(path: str = FIELD_WEIGHTS_PATH) -> Dict[str, Any]:
    """
    Load the Stage 12A weights file.

    Expected structure:
      {
        "global": { "rules": 0.7, "extended": 0.2, "llm": 0.1 },
        "fields": { "DNase": {"rules": 0.95, "extended": 0.03, "llm": 0.02,
                              "support": 123}, ... },
        "meta": { ... }
      }

    A missing, unreadable, or non-dict file yields {} — downstream code
    then falls back to equal weights.
    """
    if not os.path.exists(path):
        return {}

    try:
        with open(path, "r", encoding="utf-8") as fh:
            payload = json.load(fh)
    except Exception:
        return {}

    return payload if isinstance(payload, dict) else {}
 
120
 
 
 
 
 
 
 
121
 
122
# Weights are loaded once at import time. HAS_WEIGHTS_FILE records whether a
# non-empty weights dict was found; an empty dict triggers the equal-weight
# fallback in _get_weights_for_field.
FIELD_WEIGHTS_RAW: Dict[str, Any] = _load_field_weights()
HAS_WEIGHTS_FILE: bool = bool(FIELD_WEIGHTS_RAW)
 
124
 
 
 
 
 
 
125
 
126
+ def _normalise_scores(scores: Dict[str, float]) -> Dict[str, float]:
127
      """
128
+     Normalise a dict of parser -> score into weights summing to 1.
129
+     If all scores are zero or dict is empty, return equal weights.
130
      """
131
+     cleaned = {k: max(0.0, float(v)) for k, v in scores.items()}
132
+     total = sum(cleaned.values())
 
 
 
 
 
 
 
 
133
 
134
+     if total <= 0:
135
+         n = len(cleaned) or 1
136
+         return {k: 1.0 / n for k in cleaned}
137
 
138
+     return {k: v / total for k, v in cleaned.items()}
 
139
 
 
 
 
 
140
 
141
+ def _get_base_weights_for_parsers(include_llm: bool) -> Dict[str, float]:
142
+     """
143
+     Get a naive equal-weight distribution across available parsers.
144
+     Used when no learned weights are available.
145
+     """
146
+     parsers = ["rules", "extended"]
147
+     if include_llm:
148
+         parsers.append("llm")
149
+     n = len(parsers) or 1
150
+     return {p: 1.0 / n for p in parsers}
 
 
 
151
 
 
 
152
 
153
def _get_weights_for_field(field_name: str, include_llm: bool) -> Dict[str, float]:
    """
    Resolve fusion weights for one field.

    Priority:
      1) per-field entry in FIELD_WEIGHTS_RAW["fields"][field_name],
      2) FIELD_WEIGHTS_RAW["global"],
      3) equal weights across available parsers.

    In all cases the 'llm' weight is dropped when include_llm is False and
    the result is normalised to sum to 1.

    Robustness fix: non-numeric weight values in a corrupt weights file are
    skipped instead of raising (the old code's bare float(v) could crash
    the whole fusion call).

    Parameters
    ----------
    field_name : str
        Canonical field name as produced by the parsers.
    include_llm : bool
        Whether the LLM parser participates in fusion for this call.

    Returns
    -------
    Dict[str, float]
        parser name -> normalised weight.
    """
    if not FIELD_WEIGHTS_RAW:
        return _normalise_scores(_get_base_weights_for_parsers(include_llm))

    fields_block = FIELD_WEIGHTS_RAW.get("fields", {}) or {}
    global_block = FIELD_WEIGHTS_RAW.get("global", {}) or {}

    def _extract(entry: Any) -> Dict[str, float]:
        # Pull only recognised parser keys; skip malformed values so a
        # corrupt weights file cannot crash fusion.
        out: Dict[str, float] = {}
        if isinstance(entry, dict):
            for key, val in entry.items():
                if key in ("rules", "extended", "llm"):
                    try:
                        out[key] = float(val)
                    except (TypeError, ValueError):
                        continue
        return out

    raw = _extract(fields_block.get(field_name))
    if not raw:
        raw = _extract(global_block)
    if not raw:
        raw = _get_base_weights_for_parsers(include_llm)

    if not include_llm:
        raw.pop("llm", None)

    # Dropping 'llm' may have emptied the dict (an entry that only scored
    # the LLM); fall back to equal weights over the remaining parsers.
    if not raw:
        raw = _get_base_weights_for_parsers(include_llm=False)

    return _normalise_scores(raw)
 
 
197
 
 
 
 
 
 
 
 
198
 
199
  # ------------------------------------------------------------
200
+ # Fusion logic
201
  # ------------------------------------------------------------
202
 
203
+ def _clean_pred_value(val: Optional[str]) -> Optional[str]:
 
 
204
      """
205
+     Treat None, "", or explicit "Unknown" as missing for fusion.
206
      """
207
+     if val is None:
208
+         return None
209
+     s = str(val).strip()
210
+     if not s:
211
+         return None
212
+     if s.lower() == UNKNOWN.lower():
213
+         return None
214
+     return s
215
 
216
+
217
def parse_text_fused(text: str, use_llm: Optional[bool] = None) -> Dict[str, Any]:
    """
    Main tri-fusion entrypoint.

    Runs the rule and extended parsers (and optionally the LLM parser),
    then fuses their per-field predictions by weighted voting using the
    Stage 12A weights, with "Unknown"/blank predictions excluded from the
    vote.

    Parameters
    ----------
    text : str
    use_llm : bool or None
        If True → include LLM.
        If False → skip LLM.
        If None → include if HAS_LLM.

    Returns:
      full fusion output including votes + per-parser summaries:
      {"fused_fields", "by_parser", "votes", "weights_meta"}.
    """
    original = text or ""
    include_llm = HAS_LLM if use_llm is None else bool(use_llm)

    rules_out = parse_text_rules(original) or {}
    ext_out = parse_text_extended(original) or {}

    rules_fields = dict(rules_out.get("parsed_fields", {}))
    ext_fields = dict(ext_out.get("parsed_fields", {}))

    llm_fields: Dict[str, Any] = {}
    if include_llm and parse_text_llm is not None:
        try:
            llm_out = parse_text_llm(original)
            if isinstance(llm_out, dict):
                # Accept either the wrapped {"parsed_fields": ...} shape or
                # a bare field->value dict.
                if "parsed_fields" in llm_out:
                    llm_fields = dict(llm_out.get("parsed_fields", {}))
                else:
                    llm_fields = {str(k): v for k, v in llm_out.items()}
        except Exception:
            # LLM failures degrade gracefully to two-parser fusion.
            llm_fields = {}
    else:
        include_llm = False

    by_parser: Dict[str, Dict[str, Any]] = {
        "rules": rules_fields,
        "extended": ext_fields,
        "llm": llm_fields if include_llm else {},
    }

    # Union of every field any parser predicted.
    candidate_fields = set(rules_fields.keys()) | set(ext_fields.keys()) | set(llm_fields.keys())

    fused_fields: Dict[str, Any] = {}
    votes_debug: Dict[str, Any] = {}

    # sorted() for deterministic iteration/output order.
    for field in sorted(candidate_fields):
        weights = _get_weights_for_field(field, include_llm=include_llm)

        # None means "did not predict" (missing, blank, or "Unknown").
        parser_preds: Dict[str, Optional[str]] = {
            "rules": _clean_pred_value(rules_fields.get(field)),
            "extended": _clean_pred_value(ext_fields.get(field)),
            "llm": _clean_pred_value(llm_fields.get(field)) if include_llm else None,
        }

        per_parser_info: Dict[str, Any] = {}
        value_scores: Dict[str, float] = {}

        for parser_name in PARSER_ORDER:
            if parser_name == "llm" and not include_llm:
                continue

            pred = parser_preds.get(parser_name)
            w = float(weights.get(parser_name, 0.0))

            # Debug record keeps every parser's view, even non-voters.
            per_parser_info[parser_name] = {
                "value": pred if pred is not None else UNKNOWN,
                "weight": w,
            }

            if pred is None:
                continue

            # Sum weights of parsers voting for the same value.
            value_scores[pred] = value_scores.get(pred, 0.0) + w

        if not value_scores:
            fused_value = UNKNOWN
        else:
            max_score = max(value_scores.values())
            best_values = [v for v, s in value_scores.items() if s == max_score]

            if len(best_values) == 1:
                fused_value = best_values[0]
            else:
                # Tie-break by parser priority: rules > extended > llm.
                fused_value = best_values[0]
                for parser_name in PARSER_ORDER:
                    if parser_name == "llm" and not include_llm:
                        continue
                    pred = parser_preds.get(parser_name)
                    if pred in best_values:
                        fused_value = pred
                        break

        fused_fields[field] = fused_value

        votes_debug[field] = {
            "per_parser": per_parser_info,
            "summed": value_scores,
            "chosen": fused_value,
        }

    weights_meta = {
        "has_weights_file": HAS_WEIGHTS_FILE,
        "weights_path": FIELD_WEIGHTS_PATH,
        "meta": FIELD_WEIGHTS_RAW.get("meta", {}) if HAS_WEIGHTS_FILE else {},
    }

    return {
        "fused_fields": fused_fields,
        "by_parser": by_parser,
        "votes": votes_debug,
        "weights_meta": weights_meta,
    }