"""Prompt text and response-normalization helpers for lab-result extraction."""

from __future__ import annotations

import json
import re
from dataclasses import dataclass, field
from typing import Any

from json_repair import loads as repair_json_loads

from src.local_env import load_local_env

load_local_env()

EXTRACTION_PROMPT = """
You are extracting laboratory test results from a medical document.

Return only valid JSON with this exact shape:
{
  "patient": {
    "age": "string or null",
    "age_years": 0.0,
    "sex": "male | female | unknown"
  },
  "tests": [
    {
      "marker": "string",
      "value": "string",
      "unit": "string or null",
      "reference_range": "string or null",
      "status": "low | normal | high | abnormal | unknown",
      "source_text": "short source snippet",
      "confidence": 0.0
    }
  ],
  "notes": ["string"]
}

Field guide (how each test row should look):
- marker: The test name exactly as printed on the report. Keep abbreviations in parentheses when shown (e.g. "Hemoglobin (Hb)", "Mean Cell Volume (MCV)"). One JSON object per result row — do not merge percent differentials with absolute counts.
- value: The measured result only, as a string. Use digits for numeric results (no unit). Remove thousands separators ("150,000" → "150000"). Qualitative results stay as printed ("Negative", "Positive", "<5").
- unit: The measurement unit as printed (e.g. "g/dL", "%", "K/mcL", "mg/dL", "U/L", "cumm"). null if absent.
- reference_range: The lab's printed normal interval or limit, copied verbatim when possible (e.g. "13.0 - 17.0", "70-99", "<200"). null if missing.
- status: Compare value to the printed reference range or report flag. Map H/L, LOW/HIGH, *, or out-of-range arrows to "low" or "high". Use "abnormal" for qualitative out-of-range results. Use "unknown" when the range or flag is missing.
- source_text: A short verbatim snippet from the document for that row (test name + value + unit/flag), under ~120 characters.
- confidence: 0.0–1.0 — how certain you are that marker, value, and unit are correct.

Rules:
- Extract pure lab values only.
- Do not diagnose, interpret, recommend food, supplements, or exercise.
- Extract patient age and sex only when visibly present in the document.
- Normalize sex to "male", "female", or "unknown"; do not infer sex from the patient's name.
- Use null for age and age_years when age is missing.
- Do not invent missing values.
- Preserve units and reference ranges exactly as shown when possible.
- If a marker is unreadable, omit it or add a short note in "notes".
- Use null for missing units or reference ranges.
- Confidence must be a number from 0 to 1.

Few-shot examples (format only — extract from the uploaded document, not these samples):

Example 1 — CBC with flags:
{
  "patient": {"age": "58 years", "age_years": 58.0, "sex": "female"},
  "tests": [
    {
      "marker": "Hemoglobin (Hb)",
      "value": "12.5",
      "unit": "g/dL",
      "reference_range": "13.0 - 17.0",
      "status": "low",
      "source_text": "Hemoglobin (Hb) 12.5 g/dL L",
      "confidence": 0.96
    },
    {
      "marker": "Packed Cell Volume (PCV)",
      "value": "57.5",
      "unit": "%",
      "reference_range": "40 - 50",
      "status": "high",
      "source_text": "Packed Cell Volume (PCV) 57.5 % H",
      "confidence": 0.94
    },
    {
      "marker": "Platelet Count",
      "value": "150000",
      "unit": "cumm",
      "reference_range": "150000 - 410000",
      "status": "normal",
      "source_text": "Platelet Count 150000 cumm",
      "confidence": 0.93
    }
  ],
  "notes": []
}

Example 2 — chemistry panel:
{
  "patient": {"age": null, "age_years": null, "sex": "unknown"},
  "tests": [
    {
      "marker": "Glucose",
      "value": "95",
      "unit": "mg/dL",
      "reference_range": "70-99",
      "status": "normal",
      "source_text": "Glucose 95 mg/dL",
      "confidence": 0.95
    },
    {
      "marker": "ALT",
      "value": "42",
      "unit": "U/L",
      "reference_range": "7-56",
      "status": "normal",
      "source_text": "ALT 42 U/L",
      "confidence": 0.92
    },
    {
      "marker": "Creatinine",
      "value": "1.8",
      "unit": "mg/dL",
      "reference_range": "0.6-1.2",
      "status": "high",
      "source_text": "Creatinine 1.8 mg/dL H",
      "confidence": 0.94
    }
  ],
  "notes": []
}

Example 3 — absolute counts (separate rows from % differentials):
{
  "patient": {"age": "34", "age_years": 34.0, "sex": "male"},
  "tests": [
    {
      "marker": "Neutrophils",
      "value": "60",
      "unit": "%",
      "reference_range": "50 - 62",
      "status": "normal",
      "source_text": "Neutrophils 60 %",
      "confidence": 0.91
    },
    {
      "marker": "Neutrophil, Absolute",
      "value": "3.5",
      "unit": "K/mcL",
      "reference_range": "1.8-7.8",
      "status": "normal",
      "source_text": "Neutrophil, Absolute 3.5 K/mcL",
      "confidence": 0.90
    },
    {
      "marker": "White Blood Cell (WBC)",
      "value": "6.9",
      "unit": "K/mcL",
      "reference_range": "4.8-10.8",
      "status": "normal",
      "source_text": "White Blood Cell (WBC) 6.9 K/mcL",
      "confidence": 0.93
    }
  ],
  "notes": []
}
""".strip()


@dataclass(frozen=True)
class ExtractionResult:
    """Immutable bundle of one extraction call's normalized output."""

    # Normalized test rows (see ``_normalize_tests`` for the key set).
    tests: list[dict[str, Any]]
    # Free-text notes returned alongside the tests.
    notes: list[str]
    # The model's reply text before any parsing.
    raw_response: str
    # Payload statistics describing the request that produced this result.
    request_summary: dict[str, Any]
    # Patient demographics; defaults to an empty dict when not supplied.
    patient: dict[str, Any] = field(default_factory=dict)


def summarize_document_parts(parts: list[dict[str, Any]]) -> dict[str, int]:
    """Lightweight payload stats for pipeline traces (no base64 blobs).

    Counts the parts whose ``type`` is ``"image_url"`` and sums the length of
    the ``text`` field of parts whose ``type`` is ``"text"``. Parts of any
    other type are ignored.
    """
    image_count = 0
    text_characters = 0
    for part in parts:
        part_type = part.get("type")
        if part_type == "image_url":
            image_count += 1
        elif part_type == "text":
            # A missing or None "text" contributes zero characters.
            text_characters += len(str(part.get("text") or ""))
    return {"image_count": image_count, "text_characters": text_characters}


def _parse_json_response(text: str) -> dict[str, Any]:
    """Parse a model reply into a dict, tolerating fences and think blocks.

    Raises:
        ValueError: If no JSON can be found, or the parsed JSON is neither an
            object nor an array.
    """
    # Strip the reasoning block first: when it precedes a fenced payload the
    # text does not start with ``` and the fence would otherwise be missed.
    cleaned = _strip_code_fence(_strip_think(text))
    parsed = _loads_model_json(cleaned)
    # Some models (e.g. MiniCPM-V in "thinking" mode) return a bare array of tests
    # instead of the {tests, notes} object. Wrap it so the rest of the app is unchanged.
    if isinstance(parsed, list):
        return {"tests": parsed, "notes": []}
    if not isinstance(parsed, dict):
        raise ValueError("Model response JSON must be an object or array.")
    return parsed


# Sentinel distinguishing "could not parse" from a legitimately parsed ``null``.
_UNPARSED: Any = object()

# Widest bracketed or braced span in a reply, used to dig JSON out of prose.
_JSON_SNIPPET_RE = re.compile(r"\[.*\]|\{.*\}", flags=re.DOTALL)


def _try_json_loads(text: str) -> Any:
    """Return parsed JSON using strict then lenient decoding, else ``_UNPARSED``."""
    for strict in (True, False):
        try:
            return json.loads(text, strict=strict)
        except json.JSONDecodeError:
            continue
    return _UNPARSED


def _loads_model_json(text: str) -> Any:
    """Decode JSON from model output, falling back to progressively looser parsing.

    Order of attempts: the whole text (strict, then ``strict=False``), then the
    first bracketed/braced snippet found in the text (strict, then
    ``strict=False``), and finally ``json_repair`` on that snippet.

    Raises:
        ValueError: If the text cannot be parsed directly and contains no
            ``[...]`` or ``{...}`` span.
    """
    parsed = _try_json_loads(text)
    if parsed is not _UNPARSED:
        return parsed

    match = _JSON_SNIPPET_RE.search(text)
    if not match:
        raise ValueError("Model response did not contain JSON.")

    snippet = match.group(0)
    parsed = _try_json_loads(snippet)
    if parsed is not _UNPARSED:
        return parsed
    return repair_json_loads(snippet)


def _strip_code_fence(text: str) -> str:
    """Remove a leading ``` / ```json fence and its trailing ``` if present."""
    stripped = text.strip()
    if stripped.startswith("```"):
        stripped = re.sub(r"^```(?:json)?\s*", "", stripped, flags=re.IGNORECASE)
        stripped = re.sub(r"\s*```$", "", stripped)
    return stripped.strip()


# The tags must be spelled out: a bare ``.*?`` only ever matches the empty
# string, which would make the substitution below a no-op.
_THINK_RE = re.compile(r"<think>.*?</think>", flags=re.DOTALL | re.IGNORECASE)


def _strip_think(text: str) -> str:
    """Drop <think>...</think> reasoning blocks some models emit before the JSON."""
    return _THINK_RE.sub("", text).strip()


def _value_text(value: Any) -> str:
    """Render a test value as stripped text, keeping numeric zero results."""
    # ``0`` / ``0.0`` are falsy, so ``value or ""`` would blank a genuine
    # zero result and the row would then be filtered out as empty.
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return str(value)
    return str(value or "").strip()


def _normalize_tests(value: Any) -> list[dict[str, Any]]:
    """Coerce the model's ``tests`` payload into uniformly shaped rows.

    Non-list input yields an empty list; non-dict items are skipped; rows left
    without a marker or a value after normalization are dropped.
    """
    if not isinstance(value, list):
        return []
    tests: list[dict[str, Any]] = []
    for item in value:
        if not isinstance(item, dict):
            continue
        row = {
            "marker": str(item.get("marker") or "").strip(),
            "value": _value_text(item.get("value")),
            "unit": _optional_string(item.get("unit")),
            "reference_range": _optional_string(item.get("reference_range")),
            "status": str(item.get("status") or "unknown").strip().lower(),
            "source_text": _optional_string(item.get("source_text")),
            "confidence": _confidence(item.get("confidence")),
        }
        if row["marker"] and row["value"]:
            tests.append(row)
    return tests


def _normalize_notes(value: Any) -> list[str]:
    """Return the non-blank notes as stripped strings; non-list input gives []."""
    if not isinstance(value, list):
        return []
    stripped_notes = (str(note).strip() for note in value)
    return [note for note in stripped_notes if note]


def _normalize_patient(value: Any) -> dict[str, Any]:
    """Normalize the patient object, accepting a few alternate key spellings.

    Non-dict input yields the all-unknown patient record.
    """
    if not isinstance(value, dict):
        return {"age": None, "age_years": None, "sex": "unknown"}
    age = _optional_string(
        value.get("age") or value.get("age_text") or value.get("patient_age")
    )
    age_years = _optional_float(value.get("age_years"))
    sex = _normalize_sex(
        value.get("sex") or value.get("patient_sex") or value.get("gender")
    )
    return {
        "age": age,
        "age_years": age_years,
        "sex": sex,
    }


def _optional_string(value: Any) -> str | None:
    """Return ``value`` as stripped text, or None when it is None or blank."""
    if value is None:
        return None
    text = str(value).strip()
    return text or None


def _optional_float(value: Any) -> float | None:
    """Return ``value`` as a float, or None when it is missing or unparseable."""
    if value is None or value == "":
        return None
    try:
        return float(value)
    except (TypeError, ValueError):
        return None


def _normalize_sex(value: Any) -> str:
    """Map m/male and f/female (any case) to canonical labels, else "unknown"."""
    text = str(value or "").strip().casefold()
    if text in {"m", "male"}:
        return "male"
    if text in {"f", "female"}:
        return "female"
    return "unknown"


def _confidence(value: Any) -> float:
    """Clamp a confidence score into [0.0, 1.0]; unusable input maps to 0.0."""
    try:
        score = float(value)
    except (TypeError, ValueError):
        return 0.0
    # NaN compares false against everything, so min/max would let it through
    # as 1.0; treat it as "no confidence" instead.
    if score != score:
        return 0.0
    return max(0.0, min(1.0, score))