# Spaces: T0X1N / Agentic-RagBot (Running). File size: 6,276 bytes.

"""
MediGuard AI — Medical-Aware Text Chunker

Section-aware chunking with biomarker / condition metadata extraction.
"""

from __future__ import annotations

import re
from dataclasses import dataclass, field

# Canonical biomarker labels that chunk text is scanned for.  The grouping
# below is purely for readability; the container is an unordered set.
_BIOMARKER_NAMES: set[str] = {
    # glucose metabolism
    "Glucose", "HbA1c", "Insulin",
    # lipids
    "Cholesterol", "Triglycerides", "LDL", "HDL",
    # body composition and vitals
    "BMI", "Heart Rate", "Systolic", "Diastolic",
    # blood count
    "Hemoglobin", "Hematocrit", "Platelets", "WBC", "RBC",
    "MCV", "MCH", "MCHC",
    # cardiac and inflammation markers
    "Troponin", "CRP", "C-reactive Protein",
    # liver and kidney markers
    "ALT", "AST", "Creatinine",
    # thyroid
    "TSH", "T3", "T4",
    # electrolytes and minerals
    "Sodium", "Potassium", "Calcium",
}

# Lower-case keyword -> canonical condition tag.  Alternative spellings and
# related terms are listed under the tag they collapse onto.
_CONDITION_KEYWORDS: dict[str, str] = {
    keyword: tag
    for tag, keywords in (
        (
            "diabetes",
            ("diabetes", "diabetic", "hyperglycemia", "insulin resistance"),
        ),
        ("anemia", ("anemia", "anaemia", "iron deficiency")),
        ("thalassemia", ("thalassemia", "thalassaemia")),
        ("thrombocytopenia", ("thrombocytopenia",)),
        (
            "heart_disease",
            (
                "heart disease",
                "cardiovascular",
                "coronary",
                "hypertension",
                "atherosclerosis",
                "hyperlipidemia",
            ),
        ),
    )
    for keyword in keywords
}

# Vocabulary of words that can name a document section (regex alternation).
_SECTION_KEYWORDS = (
    r"abstract|introduction|background|methods?|methodology|materials?"
    r"|results?|findings|discussion|conclusions?|summary"
    r"|guidelines?|recommendations?|references?|bibliography"
    r"|clinical\s*presentation|pathophysiology|diagnosis|treatment|prognosis"
)

# Matches a section header at the start of a line.  Group 1 is the header
# wording.  Accepted shapes:
#   "Introduction"            keyword alone on its line
#   "## Methods"              optional markdown hashes
#   "2. Results" / "3.1 ..."  optional outline number
#   "Materials and Methods"   two keywords joined by "and" / "&"
#   "Abstract: text ..."      keyword followed by a colon (colon is consumed)
# The header must either fill the rest of the line or be followed by a
# colon.  Without that requirement an ordinary sentence that happens to
# start a line with a keyword ("Results were mixed ...") would be taken for
# a header, splitting the paragraph and removing the keyword from the body.
_SECTION_RE = re.compile(
    r"^(?:#+\s*)?(?:\d+(?:\.\d+)*[.)]?[ \t]+)?"
    rf"((?:{_SECTION_KEYWORDS})(?:[ \t]+(?:and|&)[ \t]+(?:{_SECTION_KEYWORDS}))?)"
    r"(?:[ \t]*:|[ \t]*$)",
    re.IGNORECASE | re.MULTILINE,
)


@dataclass
class MedicalChunk:
    """One slice of a document plus the metadata attached to it."""

    # Chunk body and its running position among the produced chunks.
    text: str
    chunk_index: int
    # Provenance of the chunk; every field is optional.
    document_id: str = ""
    title: str = ""
    source_file: str = ""
    page_number: int | None = None
    section_title: str = ""
    # Metadata derived from the chunk body.
    biomarkers_mentioned: list[str] = field(default_factory=list)
    condition_tags: list[str] = field(default_factory=list)
    word_count: int = 0

    def to_dict(self) -> dict:
        """Export the chunk as a plain dictionary.

        The body is stored under the key ``chunk_text``; all other exported
        keys carry the name of the attribute they come from.  ``word_count``
        is not part of the export.
        """
        exported: dict = {"chunk_text": self.text}
        for attribute in (
            "chunk_index",
            "document_id",
            "title",
            "source_file",
            "page_number",
            "section_title",
            "biomarkers_mentioned",
            "condition_tags",
        ):
            exported[attribute] = getattr(self, attribute)
        return exported


class MedicalTextChunker:
    """Section-aware text chunker optimised for medical documents.

    Text is split into sections at the headers matched by ``_SECTION_RE``;
    each section is then cut into word windows of ``target_words`` words,
    consecutive windows sharing ``overlap_words`` words.  Every chunk is
    tagged with the biomarkers and conditions found in its text.
    """

    def __init__(
        self,
        target_words: int = 600,
        overlap_words: int = 100,
        min_words: int = 50,
    ) -> None:
        """Configure the chunker.

        Args:
            target_words: Maximum number of words per chunk window.
            overlap_words: Number of words a window shares with the one
                before it inside the same section.
            min_words: Windows shorter than this are appended to the
                previous chunk instead of becoming a chunk of their own.

        Raises:
            ValueError: If ``target_words`` is smaller than 1.
        """
        if target_words < 1:
            raise ValueError(f"target_words must be at least 1, got {target_words}")
        self.target_words = target_words
        self.overlap_words = overlap_words
        self.min_words = min_words

    def chunk_text(
        self,
        text: str,
        *,
        document_id: str = "",
        title: str = "",
        source_file: str = "",
    ) -> list[MedicalChunk]:
        """Split text into enriched medical chunks.

        Args:
            text: Full document text.
            document_id: Copied unchanged onto every chunk.
            title: Copied unchanged onto every chunk.
            source_file: Copied unchanged onto every chunk.

        Returns:
            The chunks in document order, with ``chunk_index`` counting
            from 0.  A window shorter than ``min_words`` is folded into the
            chunk produced just before it; this also applies to a short
            section, which then ends up inside the last chunk of the
            preceding section and under that chunk's ``section_title``.
        """
        chunks: list[MedicalChunk] = []
        idx = 0
        for section_title, section_text in self._split_sections(text):
            words = section_text.split()
            if not words:
                continue
            start = 0
            # words[:covered] of this section already sit in some chunk.
            covered = 0
            while start < len(words):
                end = min(start + self.target_words, len(words))
                window = words[start:end]
                if len(window) < self.min_words and chunks:
                    # Merge the tiny tail into the previous chunk.  Only the
                    # words that chunk does not hold yet are appended, so
                    # the overlap region is not duplicated.
                    previous = chunks[-1]
                    fresh = words[max(start, covered):end]
                    if fresh:
                        previous.text += " " + " ".join(fresh)
                    previous.word_count = len(previous.text.split())
                    # The text changed, so the derived metadata must follow.
                    previous.biomarkers_mentioned = self._detect_biomarkers(previous.text)
                    previous.condition_tags = self._detect_conditions(previous.text)
                    break

                chunk_text = " ".join(window)
                chunks.append(
                    MedicalChunk(
                        text=chunk_text,
                        chunk_index=idx,
                        document_id=document_id,
                        title=title,
                        source_file=source_file,
                        section_title=section_title,
                        biomarkers_mentioned=self._detect_biomarkers(chunk_text),
                        condition_tags=self._detect_conditions(chunk_text),
                        word_count=len(window),
                    )
                )
                idx += 1
                covered = end
                if end >= len(words):
                    break
                # Step back by the overlap, but always move forward by at
                # least one word: with overlap_words >= target_words the
                # plain ``end - overlap_words`` would never advance.
                start = max(end - self.overlap_words, start + 1)
        return chunks

    # ── internal helpers ─────────────────────────────────────────────────

    @staticmethod
    def _split_sections(text: str) -> list[tuple[str, str]]:
        """Split text by detected section headers.

        Returns:
            ``(section_title, body)`` pairs in document order.  Text in
            front of the first header gets an empty title.  Sections whose
            header is "References" or "Bibliography" and sections with an
            empty body are dropped.  When no header is found, or nothing is
            left after dropping, the whole text is returned as one untitled
            section.
        """
        matches = list(_SECTION_RE.finditer(text))
        if not matches:
            return [("", text)]
        sections: list[tuple[str, str]] = []
        # Text before the first section header.
        preamble = text[: matches[0].start()].strip()
        if preamble:
            sections.append(("", preamble))
        for i, match in enumerate(matches):
            header = match.group(1).strip().title()
            # A section body runs from the end of its header match to the
            # start of the next header (or to the end of the text).
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            body = text[match.end():end].strip()
            # Skip reference/bibliography sections.
            if header.lower() in ("references", "bibliography"):
                continue
            if body:
                sections.append((header, body))
        return sections or [("", text)]

    @staticmethod
    def _detect_biomarkers(text: str) -> list[str]:
        """Return the sorted names from ``_BIOMARKER_NAMES`` mentioned in text.

        Matching ignores case.  Names written entirely in upper case
        (acronyms such as "ALT" or "MCH") must appear as a whole token;
        otherwise "health" would count as ALT, "fasting" as AST and "MCHC"
        as MCH.  All other names match as substrings, so inflected or
        compound forms ("troponins", "hypercholesterolemia") still count.
        """
        text_lower = text.lower()
        found: set[str] = set()
        for name in _BIOMARKER_NAMES:
            needle = name.lower()
            if needle not in text_lower:
                continue
            if name.isupper() and not re.search(
                rf"(?<!\w){re.escape(needle)}(?!\w)", text_lower
            ):
                # Acronym only occurs embedded in a longer word.
                continue
            found.add(name)
        return sorted(found)

    @staticmethod
    def _detect_conditions(text: str) -> list[str]:
        """Return the sorted condition tags whose keywords occur in text.

        Keywords from ``_CONDITION_KEYWORDS`` are matched as
        case-insensitive substrings; each tag is reported once.
        """
        text_lower = text.lower()
        return sorted({tag for kw, tag in _CONDITION_KEYWORDS.items() if kw in text_lower})