# RAG_Book_QA_System / pipeline / chunking / semantic_chunker.py
# Uploaded by samithcs ("Pipeline added", commit 63105da, verified)
import re
from typing import List, Dict
from .splitter_base import SplitterBase
# Regex patterns (as strings, applied with re.match after strip) that mark
# a line as a section heading.
HEADING_PATTERNS = [
    r"^(CHAPTER|Chapter|Section)\s+\d+",  # e.g. "Chapter 3", "Section 12"
    r"^[A-Z][A-Z ]{5,}$",                 # ALL-CAPS title lines (6+ chars)
    r"^(\d+\.){1,3}\s+\w+",               # numbered headings: "1. Intro", "2.3.1 Foo"
]
# Matches an explicit "Page N" marker (capturing N) or a form-feed character.
PAGE_PATTERN = re.compile(r"\b[Pp]age\s+(\d+)\b|\f")
# Matches figure/table/image caption lines such as "Figure 3: ..." or "Table 2. ...".
FIGURE_PATTERN = re.compile(r"^(Figure|Table|Image)[ .:]+\d+[ .:]+", re.IGNORECASE)
def find_headings(lines):
    """Return (line_index, stripped_text) pairs for every heading line.

    A line counts as a heading when, after stripping surrounding
    whitespace, it matches any pattern in HEADING_PATTERNS.
    """
    found = []
    for idx, raw in enumerate(lines):
        text = raw.strip()
        if any(re.match(pat, text) for pat in HEADING_PATTERNS):
            found.append((idx, text))
    return found
def split_by_size(text, chunk_size, overlap):
    """Split *text* into overlapping fixed-size character windows.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per chunk.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of (start, end, chunk) tuples, where start/end are
        character offsets into *text*. Whitespace-only windows are
        skipped. The last chunk always ends at len(text).
    """
    # BUG FIX: with overlap >= chunk_size the naive step
    # (chunk_size - overlap) is <= 0 and the loop never advances,
    # hanging forever. Clamp the step to at least 1 character.
    step = max(1, chunk_size - overlap)
    subsections = []
    i = 0
    while i < len(text):
        end_i = min(i + chunk_size, len(text))
        window = text[i:end_i]
        if window.strip():
            subsections.append((i, end_i, window))
        if end_i == len(text):
            break
        i += step
    return subsections
class SemanticChunker(SplitterBase):
    """Structure-aware chunker.

    Splits a document along headings, figure/table captions, and page
    markers, then windows each paragraph with ``split_by_size``. Every
    emitted chunk carries the section title and page number in effect
    where it begins.
    """

    @staticmethod
    def _is_heading(line: str) -> bool:
        """True when the stripped line matches any HEADING_PATTERNS entry."""
        # Extracted helper: this predicate was previously duplicated in
        # two places inside chunk(); keep the logic in one spot.
        stripped = line.strip()
        return any(re.match(pat, stripped) for pat in HEADING_PATTERNS)

    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Chunk *text* into dicts with keys "text", "start", "end", "meta".

        "start"/"end" are line indices into ``text.splitlines()``.
        Figure/table caption lines become standalone chunks with
        ``meta["type"] == "figure"``; paragraph text is size-split and
        tagged ``meta["source"] == "semantic"``. Heading lines update
        the current section but are not emitted themselves.
        """
        lines = text.splitlines()

        # Pass 1: record the page number in effect at every line index.
        # An explicit "Page N" marker updates the counter from that line
        # onward; lines before any marker default to page 1.
        line_pages = {}
        cur_page = 1
        for idx, line in enumerate(lines):
            m = PAGE_PATTERN.search(line)
            if m and m.group(1):
                cur_page = int(m.group(1))
            line_pages[idx] = cur_page

        # Pass 2: walk the document, tracking the current section title.
        chunks = []
        cur_section = None
        n = len(lines)
        i = 0
        while i < n:
            line = lines[i]

            # Heading: update the section context, emit nothing.
            if self._is_heading(line):
                cur_section = line.strip()
                i += 1
                continue

            # Figure/table caption: emit as a standalone one-line chunk.
            if FIGURE_PATTERN.match(line):
                chunks.append({
                    "text": line.strip(),
                    "start": i,
                    "end": i + 1,
                    "meta": {
                        "section": cur_section or "NO_SECTION",
                        "page": line_pages.get(i, 1),
                        "type": "figure"
                    }
                })
                i += 1
                continue

            # Bare page-marker line: already consumed by pass 1, skip it.
            if PAGE_PATTERN.search(line):
                i += 1
                continue

            # Paragraph: consecutive non-blank lines up to the next
            # heading, caption, or page marker.
            para_start = i
            para_lines = []
            while (i < n and lines[i].strip() and
                   not self._is_heading(lines[i]) and
                   not FIGURE_PATTERN.match(lines[i]) and
                   not PAGE_PATTERN.search(lines[i])):
                para_lines.append(lines[i])
                i += 1

            para_text = "\n".join(para_lines).strip()
            if para_text:
                # Size-split the paragraph; all sub-chunks share the
                # paragraph's line span and starting page.
                for _sub_start, _sub_end, chunk_str in split_by_size(
                        para_text, chunk_size, overlap):
                    chunks.append({
                        "text": chunk_str,
                        "start": para_start,
                        "end": i,
                        "meta": {
                            "section": cur_section or "NO_SECTION",
                            "page": line_pages.get(para_start, 1),
                            "source": "semantic"
                        }
                    })

            # Skip blank separator lines so the outer loop always advances.
            while i < n and not lines[i].strip():
                i += 1

        return chunks