Spaces:

T0X1N
/

Agentic-RagBot

Running

App Files Files Community

Agentic-RagBot / src /services /indexing /text_chunker.py

T0X1N

chore: codebase audit and fixes (ruff, mypy, pytest)

9659593 about 17 hours ago

raw

history blame contribute delete

6.28 kB

	"""
	MediGuard AI — Medical-Aware Text Chunker

	Section-aware chunking with biomarker / condition metadata extraction.
	"""

	from __future__ import annotations

	import re
	from dataclasses import dataclass, field

	# Biomarker names to detect in chunk text
	_BIOMARKER_NAMES: set[str] = {
	"Glucose",
	"Cholesterol",
	"Triglycerides",
	"HbA1c",
	"LDL",
	"HDL",
	"Insulin",
	"BMI",
	"Hemoglobin",
	"Platelets",
	"WBC",
	"RBC",
	"Hematocrit",
	"MCV",
	"MCH",
	"MCHC",
	"Heart Rate",
	"Systolic",
	"Diastolic",
	"Troponin",
	"CRP",
	"C-reactive Protein",
	"ALT",
	"AST",
	"Creatinine",
	"TSH",
	"T3",
	"T4",
	"Sodium",
	"Potassium",
	"Calcium",
	}

	_CONDITION_KEYWORDS: dict[str, str] = {
	"diabetes": "diabetes",
	"diabetic": "diabetes",
	"hyperglycemia": "diabetes",
	"insulin resistance": "diabetes",
	"anemia": "anemia",
	"anaemia": "anemia",
	"iron deficiency": "anemia",
	"thalassemia": "thalassemia",
	"thalassaemia": "thalassemia",
	"thrombocytopenia": "thrombocytopenia",
	"heart disease": "heart_disease",
	"cardiovascular": "heart_disease",
	"coronary": "heart_disease",
	"hypertension": "heart_disease",
	"atherosclerosis": "heart_disease",
	"hyperlipidemia": "heart_disease",
	}

	_SECTION_RE = re.compile(
	r"^(?:#+\s*)?("
	r"abstract\|introduction\|background\|methods?\|methodology\|materials?"
	r"\|results?\|findings\|discussion\|conclusion\|summary"
	r"\|guidelines?\|recommendations?\|references?\|bibliography"
	r"\|clinical\s*presentation\|pathophysiology\|diagnosis\|treatment\|prognosis"
	r")\b",
	re.IGNORECASE \| re.MULTILINE,
	)


	@dataclass
	class MedicalChunk:
	    """A single chunk of document text with medical metadata.

	    ``to_dict`` exports every field except ``word_count`` and renames
	    ``text`` to ``chunk_text``.
	    """

	    text: str  # chunk body
	    chunk_index: int  # index assigned by whoever creates the chunk
	    document_id: str = ""
	    title: str = ""
	    source_file: str = ""
	    page_number: int | None = None  # None when no page is known
	    section_title: str = ""  # "" when no section heading applies
	    # default_factory gives every instance its own list instead of a shared one
	    biomarkers_mentioned: list[str] = field(default_factory=list)
	    condition_tags: list[str] = field(default_factory=list)
	    word_count: int = 0  # kept on the object only; not part of to_dict()

	    def to_dict(self) -> dict:
	        """Return the chunk as a plain dict.

	        The list values are the instance's own list objects, not copies.
	        """
	        return {
	            "chunk_text": self.text,
	            "chunk_index": self.chunk_index,
	            "document_id": self.document_id,
	            "title": self.title,
	            "source_file": self.source_file,
	            "page_number": self.page_number,
	            "section_title": self.section_title,
	            "biomarkers_mentioned": self.biomarkers_mentioned,
	            "condition_tags": self.condition_tags,
	        }


	class MedicalTextChunker:
	    """Section-aware text chunker optimised for medical documents.

	    Text is first split on recognised section headings. Each section is then
	    cut into overlapping windows of whitespace-separated words, and every
	    chunk is tagged with the biomarkers and conditions found in its text.
	    """

	    def __init__(
	        self,
	        target_words: int = 600,
	        overlap_words: int = 100,
	        min_words: int = 50,
	    ):
	        """Configure the window sizes, all measured in words.

	        Args:
	            target_words: Number of words per window. A chunk can grow beyond
	                this when a short tail is merged into it.
	            overlap_words: Words shared between consecutive chunks of the
	                same section.
	            min_words: A final window shorter than this is merged into the
	                previous chunk instead of becoming a chunk of its own.

	        Raises:
	            ValueError: If ``target_words`` is below 1, or ``overlap_words``
	                is negative or not smaller than ``target_words``. Such
	                settings would stall the sliding window or skip words.
	        """
	        if target_words < 1:
	            raise ValueError("target_words must be at least 1")
	        if not 0 <= overlap_words < target_words:
	            raise ValueError("overlap_words must be >= 0 and smaller than target_words")
	        self.target_words = target_words
	        self.overlap_words = overlap_words
	        self.min_words = min_words

	    def chunk_text(
	        self,
	        text: str,
	        *,
	        document_id: str = "",
	        title: str = "",
	        source_file: str = "",
	    ) -> list[MedicalChunk]:
	        """Split text into enriched medical chunks.

	        Args:
	            text: Full document text.
	            document_id: Copied onto every chunk.
	            title: Copied onto every chunk.
	            source_file: Copied onto every chunk.

	        Returns:
	            Chunks in document order, with ``chunk_index`` counting from 0
	            across all sections. Empty or whitespace-only text yields ``[]``.
	        """
	        chunks: list[MedicalChunk] = []
	        for section_title, section_text in self._split_sections(text):
	            words = section_text.split()
	            total = len(words)
	            start = 0
	            covered = 0  # words[:covered] of this section are already in a chunk
	            while start < total:
	                end = min(start + self.target_words, total)
	                is_tail = end == total
	                # Only the last window of a section may be folded into the
	                # previous chunk; a short non-final window (min_words larger
	                # than target_words) must not end the section early.
	                if is_tail and end - start < self.min_words and chunks:
	                    # Append only the words the previous chunk does not hold
	                    # yet, so the overlap region is not duplicated.
	                    self._merge_tail(chunks[-1], words[covered:end])
	                    break

	                chunk_words = words[start:end]
	                body = " ".join(chunk_words)
	                chunks.append(
	                    MedicalChunk(
	                        text=body,
	                        chunk_index=len(chunks),
	                        document_id=document_id,
	                        title=title,
	                        source_file=source_file,
	                        section_title=section_title,
	                        biomarkers_mentioned=self._detect_biomarkers(body),
	                        condition_tags=self._detect_conditions(body),
	                        word_count=len(chunk_words),
	                    )
	                )
	                covered = end
	                if is_tail:
	                    break
	                # The attributes can be reassigned after construction, so
	                # guarantee the window always moves forward by at least a word.
	                start = max(end - self.overlap_words, start + 1)
	        return chunks

	    # ── internal helpers ─────────────────────────────────────────────────

	    @staticmethod
	    def _merge_tail(chunk: MedicalChunk, tail_words: list[str]) -> None:
	        """Append ``tail_words`` to ``chunk`` and refresh its derived fields."""
	        chunk.text += " " + " ".join(tail_words)
	        chunk.word_count = len(chunk.text.split())
	        # The appended words may mention biomarkers/conditions of their own.
	        chunk.biomarkers_mentioned = MedicalTextChunker._detect_biomarkers(chunk.text)
	        chunk.condition_tags = MedicalTextChunker._detect_conditions(chunk.text)

	    @staticmethod
	    def _split_sections(text: str) -> list[tuple[str, str]]:
	        """Split text by detected section headers.

	        Returns ``(header, body)`` pairs in document order. Text before the
	        first header is returned with an empty header. Sections titled
	        "References" or "Bibliography" are dropped, as are sections with an
	        empty body. The body starts right after the matched heading word, so
	        any remaining words on the heading line belong to the body. When no
	        section survives, the whole text is returned as one untitled section.
	        """
	        matches = list(_SECTION_RE.finditer(text))
	        if not matches:
	            return [("", text)]

	        sections: list[tuple[str, str]] = []
	        preamble = text[: matches[0].start()].strip()
	        if preamble:
	            sections.append(("", preamble))

	        # Each section runs up to the start of the next header (or end of text).
	        section_ends = [m.start() for m in matches[1:]] + [len(text)]
	        for match, section_end in zip(matches, section_ends):
	            header = match.group(1).strip().title()
	            if header.lower() in ("references", "bibliography"):
	                continue
	            body = text[match.end() : section_end].strip()
	            if body:
	                sections.append((header, body))
	        return sections or [("", text)]

	    @staticmethod
	    def _detect_biomarkers(text: str) -> list[str]:
	        """Return the sorted biomarker names that occur in ``text`` as whole words.

	        Matching is case-insensitive. Whole-word matching keeps short
	        abbreviations from firing inside ordinary words, e.g. "ALT" inside
	        "health" or "AST" inside "fasting".
	        """
	        return sorted(
	            name
	            for name in _BIOMARKER_NAMES
	            if re.search(rf"(?<!\w){re.escape(name)}(?!\w)", text, re.IGNORECASE)
	        )

	    @staticmethod
	    def _detect_conditions(text: str) -> list[str]:
	        """Return the sorted, de-duplicated condition tags whose keyword occurs in ``text``.

	        Keywords are matched as case-insensitive substrings, so inflected
	        forms such as "diabetics" still count.
	        """
	        lowered = text.lower()
	        return sorted({tag for keyword, tag in _CONDITION_KEYWORDS.items() if keyword in lowered})