Spaces:
Running
Running
| """ | |
| MediGuard AI — Medical-Aware Text Chunker | |
| Section-aware chunking with biomarker / condition metadata extraction. | |
| """ | |
| from __future__ import annotations | |
| import re | |
| from dataclasses import dataclass, field | |
# Canonical biomarker names searched for in each chunk's text.
# The grouping below is purely for readability; the set itself is unordered.
_BIOMARKER_NAMES: set[str] = {
    # Metabolic / lipid markers
    "Glucose", "HbA1c", "Insulin", "BMI",
    "Cholesterol", "Triglycerides", "LDL", "HDL",
    # Blood count
    "Hemoglobin", "Hematocrit", "Platelets", "WBC", "RBC",
    "MCV", "MCH", "MCHC",
    # Vital signs
    "Heart Rate", "Systolic", "Diastolic",
    # Cardiac / inflammation
    "Troponin", "CRP", "C-reactive Protein",
    # Liver / kidney
    "ALT", "AST", "Creatinine",
    # Thyroid
    "TSH", "T3", "T4",
    # Electrolytes
    "Sodium", "Potassium", "Calcium",
}
# Maps a lower-case keyword (or phrase) to the canonical condition tag it
# implies. Built from tag -> keyword groups so spelling variants of the same
# condition sit next to each other; insertion order follows the listing below.
_CONDITION_KEYWORDS: dict[str, str] = {
    keyword: tag
    for tag, keywords in (
        ("diabetes", ("diabetes", "diabetic", "hyperglycemia", "insulin resistance")),
        ("anemia", ("anemia", "anaemia", "iron deficiency")),
        ("thalassemia", ("thalassemia", "thalassaemia")),
        ("thrombocytopenia", ("thrombocytopenia",)),
        (
            "heart_disease",
            (
                "heart disease",
                "cardiovascular",
                "coronary",
                "hypertension",
                "atherosclerosis",
                "hyperlipidemia",
            ),
        ),
    )
    for keyword in keywords
}
# Matches a recognised section heading at the start of a line, optionally
# preceded by Markdown '#' markers. Group 1 captures the heading keyword.
_SECTION_RE = re.compile(
    r"^(?:#+\s*)?("
    + "|".join(
        (
            r"abstract",
            r"introduction",
            r"background",
            r"methods?",
            r"methodology",
            r"materials?",
            r"results?",
            r"findings",
            r"discussion",
            r"conclusion",
            r"summary",
            r"guidelines?",
            r"recommendations?",
            r"references?",
            r"bibliography",
            r"clinical\s*presentation",
            r"pathophysiology",
            r"diagnosis",
            r"treatment",
            r"prognosis",
        )
    )
    + r")\b",
    flags=re.MULTILINE | re.IGNORECASE,
)
@dataclass
class MedicalChunk:
    """A single text chunk together with the medical metadata extracted from it.

    The class must be a dataclass: the fields below rely on the generated
    ``__init__`` (instances are built with keyword arguments) and on
    ``field(default_factory=list)`` to give every instance its own lists.
    """

    text: str  # chunk body
    chunk_index: int  # position of the chunk within its document
    document_id: str = ""
    title: str = ""
    source_file: str = ""
    page_number: int | None = None
    section_title: str = ""  # heading of the section the chunk came from, "" if none
    biomarkers_mentioned: list[str] = field(default_factory=list)
    condition_tags: list[str] = field(default_factory=list)
    word_count: int = 0

    def to_dict(self) -> dict:
        """Return the chunk as a plain dict.

        The text is exported under the key ``"chunk_text"``; ``word_count``
        is not part of the exported payload.
        """
        return {
            "chunk_text": self.text,
            "chunk_index": self.chunk_index,
            "document_id": self.document_id,
            "title": self.title,
            "source_file": self.source_file,
            "page_number": self.page_number,
            "section_title": self.section_title,
            "biomarkers_mentioned": self.biomarkers_mentioned,
            "condition_tags": self.condition_tags,
        }
class MedicalTextChunker:
    """Section-aware text chunker optimised for medical documents.

    Text is first split on recognised section headings, then each section is
    cut into overlapping windows of words. Every chunk is tagged with the
    biomarkers and conditions mentioned in its text.
    """

    def __init__(
        self,
        target_words: int = 600,
        overlap_words: int = 100,
        min_words: int = 50,
    ):
        """Configure the chunk window.

        Args:
            target_words: Maximum number of words per chunk.
            overlap_words: Number of words shared between consecutive chunks
                of the same section.
            min_words: A section tail shorter than this is merged into the
                previous chunk instead of becoming a chunk of its own.

        Raises:
            ValueError: If ``target_words`` is not positive, or if
                ``overlap_words`` is negative or not smaller than
                ``target_words`` (the window could then never advance).
        """
        if target_words < 1:
            raise ValueError("target_words must be a positive integer")
        if not 0 <= overlap_words < target_words:
            raise ValueError(
                "overlap_words must satisfy 0 <= overlap_words < target_words"
            )
        self.target_words = target_words
        self.overlap_words = overlap_words
        self.min_words = min_words

    def chunk_text(
        self,
        text: str,
        *,
        document_id: str = "",
        title: str = "",
        source_file: str = "",
    ) -> list[MedicalChunk]:
        """Split text into enriched medical chunks.

        Args:
            text: Full document text.
            document_id: Copied onto every chunk.
            title: Copied onto every chunk.
            source_file: Copied onto every chunk.

        Returns:
            Chunks in document order; ``chunk_index`` counts from 0 across
            all sections.
        """
        chunks: list[MedicalChunk] = []
        for section_title, section_text in self._split_sections(text):
            words = section_text.split()
            if not words:
                continue

            start = 0
            # words[:covered] of this section are already part of an emitted
            # chunk, so a merged tail must not append them a second time.
            covered = 0
            while start < len(words):
                end = min(start + self.target_words, len(words))

                # Only the final window of a section counts as a "tail"; the
                # end check keeps a min_words > target_words configuration
                # from discarding the remainder of the section.
                is_tiny_tail = end == len(words) and end - start < self.min_words
                if is_tiny_tail and chunks:
                    # Merge into the previous chunk (which may belong to the
                    # preceding section) and refresh its metadata so the tags
                    # reflect the merged text.
                    previous = chunks[-1]
                    previous.text += " " + " ".join(words[covered:end])
                    previous.word_count = len(previous.text.split())
                    previous.biomarkers_mentioned = self._detect_biomarkers(previous.text)
                    previous.condition_tags = self._detect_conditions(previous.text)
                    break

                chunk_body = " ".join(words[start:end])
                chunks.append(
                    MedicalChunk(
                        text=chunk_body,
                        chunk_index=len(chunks),
                        document_id=document_id,
                        title=title,
                        source_file=source_file,
                        section_title=section_title,
                        biomarkers_mentioned=self._detect_biomarkers(chunk_body),
                        condition_tags=self._detect_conditions(chunk_body),
                        word_count=end - start,
                    )
                )
                covered = end
                if end >= len(words):
                    break
                # Step back by the overlap, but always move forward by at
                # least one word even if the attributes were changed to an
                # invalid combination after construction.
                start = max(end - self.overlap_words, start + 1)
        return chunks

    # -- internal helpers ---------------------------------------------------

    @staticmethod
    def _split_sections(text: str) -> list[tuple[str, str]]:
        """Split text by detected section headers.

        Returns:
            ``(section_title, body)`` pairs in document order. Text before
            the first header gets an empty title. Reference/bibliography
            sections and empty bodies are dropped. If nothing remains, the
            whole text is returned as a single untitled section.
        """
        matches = list(_SECTION_RE.finditer(text))
        if not matches:
            return [("", text)]

        sections: list[tuple[str, str]] = []

        # Text before the first section header.
        preamble = text[: matches[0].start()].strip()
        if preamble:
            sections.append(("", preamble))

        for i, match in enumerate(matches):
            header = match.group(1).strip().title()
            # The header regex also accepts the singular "Reference", so it
            # has to be skipped alongside the plural form.
            if header.lower() in ("reference", "references", "bibliography"):
                continue
            body_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            body = text[match.end():body_end].strip()
            if body:
                sections.append((header, body))
        return sections or [("", text)]

    @staticmethod
    def _detect_biomarkers(text: str) -> list[str]:
        """Return the sorted biomarker names mentioned in *text*.

        Matching is case-insensitive and requires the name to stand alone
        (no letter, digit or underscore directly before or after it). A plain
        substring test would tag "health" as ALT and "fasting" as AST.
        """
        return sorted(
            name
            for name in _BIOMARKER_NAMES
            if re.search(rf"(?<!\w){re.escape(name)}(?!\w)", text, re.IGNORECASE)
        )

    @staticmethod
    def _detect_conditions(text: str) -> list[str]:
        """Return the sorted, de-duplicated condition tags whose keywords occur in *text*.

        Keywords are matched as case-insensitive substrings.
        """
        text_lower = text.lower()
        return sorted({tag for kw, tag in _CONDITION_KEYWORDS.items() if kw in text_lower})