Spaces:
Sleeping
Sleeping
| import re | |
| from typing import List, Dict | |
| from .splitter_base import SplitterBase | |
# Regexes (applied with re.match, i.e. anchored at the start of the string)
# that mark a line as a section heading.
HEADING_PATTERNS = [
    # "CHAPTER 3", "Chapter 12", "Section 4" -- keyword, whitespace, digits.
    r"^(CHAPTER|Chapter|Section)\s+\d+",
    # A whole line of ASCII capitals and spaces, at least six characters long
    # and starting with a letter, e.g. "INTRODUCTION".
    r"^[A-Z][A-Z ]{5,}$",
    # One to three "<digits>." groups, whitespace, then a word character,
    # e.g. "1. Scope" or "2.3. Method". "2.3 Method" (no trailing dot) does
    # not match.
    r"^(\d+\.){1,3}\s+\w+",
]

# Matches "page N" / "Page N" as a whole word anywhere in a line (group 1
# captures N), or a form-feed character (group 1 is then None).
PAGE_PATTERN = re.compile(r"\b[Pp]age\s+(\d+)\b|\f")

# Matches, case-insensitively, a line starting with "Figure", "Table" or
# "Image", then one or more of space/'.'/':', a number, and again one or more
# of space/'.'/':' -- e.g. "Figure 3: ..." or "table 2. ...".
FIGURE_PATTERN = re.compile(r"^(Figure|Table|Image)[ .:]+\d+[ .:]+", re.IGNORECASE)
def find_headings(lines):
    """Return ``(index, stripped_line)`` for every line that looks like a heading.

    A line counts as a heading when, after stripping surrounding whitespace,
    it matches at least one regex in ``HEADING_PATTERNS``. Each line is
    reported at most once, in input order.
    """
    stripped_lines = (raw.strip() for raw in lines)
    return [
        (index, candidate)
        for index, candidate in enumerate(stripped_lines)
        if any(re.match(pattern, candidate) for pattern in HEADING_PATTERNS)
    ]
def split_by_size(text: str, chunk_size: int, overlap: int):
    """Cut *text* into windows of at most *chunk_size* characters.

    Consecutive windows start ``chunk_size - overlap`` characters apart, so
    each window repeats the last *overlap* characters of the previous one.
    Windows that contain only whitespace are omitted from the result.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per window; must be > 0.
        overlap: Number of characters shared by neighbouring windows; must be
            smaller than *chunk_size*.

    Returns:
        A list of ``(start, end, chunk)`` tuples where ``chunk == text[start:end]``.

    Raises:
        ValueError: If ``chunk_size <= 0`` or ``overlap >= chunk_size``. With
            such values the window would never move forward and the loop
            could not terminate.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if overlap >= chunk_size:
        raise ValueError(
            f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
        )

    step = chunk_size - overlap
    total = len(text)
    subsections = []
    for start in range(0, total, step):
        end = min(start + chunk_size, total)
        chunk = text[start:end]
        if chunk.strip():
            subsections.append((start, end, chunk))
        if end == total:
            # The window already reaches the end of the text; any further
            # window would only repeat part of this tail.
            break
    return subsections
class SemanticChunker(SplitterBase):
    """Splitter that cuts text into paragraph and figure chunks.

    Lines are classified with ``HEADING_PATTERNS``, ``FIGURE_PATTERN`` and
    ``PAGE_PATTERN``; every emitted chunk is tagged with the most recent
    heading and the most recent page number seen before it.
    """

    def chunk(self, text: str, chunk_size: int, overlap: int) -> List[Dict]:
        """Split *text* into chunk dictionaries.

        Processing per line (first matching rule wins):

        * heading   -> becomes the current section, emits nothing;
        * figure    -> emitted as its own chunk with ``meta["type"] == "figure"``;
        * page mark -> skipped (its number, if any, updates the current page);
        * blank     -> skipped, and ends the running paragraph;
        * otherwise -> part of a paragraph. Consecutive paragraph lines are
          joined with newlines and cut by ``split_by_size``; each piece is
          emitted with ``meta["source"] == "semantic"``.

        Args:
            text: Full document text.
            chunk_size: Maximum characters per paragraph piece; must be > 0.
            overlap: Characters shared by neighbouring pieces; must be smaller
                than *chunk_size*.

        Returns:
            A list of dicts with keys ``"text"``, ``"start"``, ``"end"`` and
            ``"meta"``. ``start``/``end`` are line indices into
            ``text.splitlines()`` (``end`` exclusive); all pieces of one
            paragraph share the paragraph's line range. ``meta`` holds
            ``"section"`` (``"NO_SECTION"`` before the first heading),
            ``"page"`` and either ``"type"`` or ``"source"``.

        Raises:
            ValueError: If ``chunk_size <= 0`` or ``overlap >= chunk_size``;
                splitting a paragraph with such values cannot make progress.
        """
        # Fail fast: with these values the size-based splitter would never
        # advance through a paragraph.
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if overlap >= chunk_size:
            raise ValueError(
                f"overlap ({overlap}) must be smaller than chunk_size ({chunk_size})"
            )

        # NOTE(review): str.splitlines() treats form feed as a line boundary,
        # so no line can contain "\f" and only the "page N" alternative of
        # PAGE_PATTERN can match here -- confirm whether form feeds were meant
        # to advance the page counter.
        lines = text.splitlines()

        # Pass 1: classify every line once and record the page it belongs to.
        kinds = []
        pages = []
        cur_page = 1
        for line in lines:
            stripped = line.strip()
            page_match = PAGE_PATTERN.search(line)
            if page_match and page_match.group(1):
                cur_page = int(page_match.group(1))
            pages.append(cur_page)

            if any(re.match(pat, stripped) for pat in HEADING_PATTERNS):
                kinds.append("heading")
            # NOTE(review): unlike headings, figures are matched against the
            # unstripped line, so an indented caption counts as paragraph
            # text -- confirm this is intended.
            elif FIGURE_PATTERN.match(line):
                kinds.append("figure")
            # NOTE(review): any line mentioning "page N" lands here and is
            # dropped from the output, including ordinary prose -- verify.
            elif page_match:
                kinds.append("page")
            elif not stripped:
                kinds.append("blank")
            else:
                kinds.append("text")

        # Pass 2: walk the classified lines and emit chunks.
        chunks = []
        cur_section = None
        total = len(lines)
        i = 0
        while i < total:
            kind = kinds[i]
            if kind == "heading":
                cur_section = lines[i].strip()
                i += 1
            elif kind == "figure":
                chunks.append({
                    "text": lines[i].strip(),
                    "start": i,
                    "end": i + 1,
                    "meta": {
                        "section": cur_section or "NO_SECTION",
                        "page": pages[i],
                        "type": "figure",
                    },
                })
                i += 1
            elif kind == "text":
                # Gather the maximal run of paragraph lines starting here.
                para_start = i
                while i < total and kinds[i] == "text":
                    i += 1
                para_text = "\n".join(lines[para_start:i]).strip()
                for _, _, chunk_str in split_by_size(para_text, chunk_size, overlap):
                    chunks.append({
                        "text": chunk_str,
                        "start": para_start,
                        "end": i,
                        "meta": {
                            "section": cur_section or "NO_SECTION",
                            "page": pages[para_start],
                            "source": "semantic",
                        },
                    })
            else:
                # Page markers and blank lines carry no content of their own.
                i += 1
        return chunks