""" MediGuard AI — Medical-Aware Text Chunker Section-aware chunking with biomarker / condition metadata extraction. """ from __future__ import annotations import re from dataclasses import dataclass, field # Biomarker names to detect in chunk text _BIOMARKER_NAMES: set[str] = { "Glucose", "Cholesterol", "Triglycerides", "HbA1c", "LDL", "HDL", "Insulin", "BMI", "Hemoglobin", "Platelets", "WBC", "RBC", "Hematocrit", "MCV", "MCH", "MCHC", "Heart Rate", "Systolic", "Diastolic", "Troponin", "CRP", "C-reactive Protein", "ALT", "AST", "Creatinine", "TSH", "T3", "T4", "Sodium", "Potassium", "Calcium", } _CONDITION_KEYWORDS: dict[str, str] = { "diabetes": "diabetes", "diabetic": "diabetes", "hyperglycemia": "diabetes", "insulin resistance": "diabetes", "anemia": "anemia", "anaemia": "anemia", "iron deficiency": "anemia", "thalassemia": "thalassemia", "thalassaemia": "thalassemia", "thrombocytopenia": "thrombocytopenia", "heart disease": "heart_disease", "cardiovascular": "heart_disease", "coronary": "heart_disease", "hypertension": "heart_disease", "atherosclerosis": "heart_disease", "hyperlipidemia": "heart_disease", } _SECTION_RE = re.compile( r"^(?:#+\s*)?(" r"abstract|introduction|background|methods?|methodology|materials?" r"|results?|findings|discussion|conclusion|summary" r"|guidelines?|recommendations?|references?|bibliography" r"|clinical\s*presentation|pathophysiology|diagnosis|treatment|prognosis" r")\b", re.IGNORECASE | re.MULTILINE, ) @dataclass class MedicalChunk: """A single chunk with medical metadata.""" text: str chunk_index: int document_id: str = "" title: str = "" source_file: str = "" page_number: int | None = None section_title: str = "" biomarkers_mentioned: list[str] = field(default_factory=list) condition_tags: list[str] = field(default_factory=list) word_count: int = 0 def to_dict(self) -> dict: return { "chunk_text": self.text, "chunk_index": self.chunk_index, "document_id": self.document_id, "title": self.title, "source_file": self.source_file, "page_number": self.page_number, "section_title": self.section_title, "biomarkers_mentioned": self.biomarkers_mentioned, "condition_tags": self.condition_tags, } class MedicalTextChunker: """Section-aware text chunker optimised for medical documents.""" def __init__( self, target_words: int = 600, overlap_words: int = 100, min_words: int = 50, ): self.target_words = target_words self.overlap_words = overlap_words self.min_words = min_words def chunk_text( self, text: str, *, document_id: str = "", title: str = "", source_file: str = "", ) -> list[MedicalChunk]: """Split text into enriched medical chunks.""" sections = self._split_sections(text) chunks: list[MedicalChunk] = [] idx = 0 for section_title, section_text in sections: words = section_text.split() if not words: continue start = 0 while start < len(words): end = min(start + self.target_words, len(words)) chunk_words = words[start:end] if len(chunk_words) < self.min_words and chunks: # merge tiny tail into previous chunk chunks[-1].text += " " + " ".join(chunk_words) chunks[-1].word_count = len(chunks[-1].text.split()) break chunk_text = " ".join(chunk_words) biomarkers = self._detect_biomarkers(chunk_text) conditions = self._detect_conditions(chunk_text) chunks.append( MedicalChunk( text=chunk_text, chunk_index=idx, document_id=document_id, title=title, source_file=source_file, section_title=section_title, biomarkers_mentioned=biomarkers, condition_tags=conditions, word_count=len(chunk_words), ) ) idx += 1 start = end - self.overlap_words if end < len(words) else len(words) return chunks # ── internal helpers ───────────────────────────────────────────────── @staticmethod def _split_sections(text: str) -> list[tuple[str, str]]: """Split text by detected section headers.""" matches = list(_SECTION_RE.finditer(text)) if not matches: return [("", text)] sections: list[tuple[str, str]] = [] # text before first section header if matches[0].start() > 0: preamble = text[: matches[0].start()].strip() if preamble: sections.append(("", preamble)) for i, match in enumerate(matches): header = match.group(1).strip().title() start = match.end() end = matches[i + 1].start() if i + 1 < len(matches) else len(text) body = text[start:end].strip() # Skip reference/bibliography sections if header.lower() in ("references", "bibliography"): continue if body: sections.append((header, body)) return sections or [("", text)] @staticmethod def _detect_biomarkers(text: str) -> list[str]: text_lower = text.lower() return sorted({name for name in _BIOMARKER_NAMES if name.lower() in text_lower}) @staticmethod def _detect_conditions(text: str) -> list[str]: text_lower = text.lower() return sorted({tag for kw, tag in _CONDITION_KEYWORDS.items() if kw in text_lower})