Spaces:
Running
Running
Minor update for chunking improvement
Browse files- config.py +7 -2
- vectordb/document_processor.py +451 -150
config.py
CHANGED
|
@@ -38,8 +38,13 @@ WHISPER_MODEL = "tiny" # Options: tiny, base, small, medium, large (tiny=75MB f
|
|
| 38 |
# - large: ~3GB, best accuracy
|
| 39 |
|
| 40 |
# Chunking settings
|
| 41 |
-
CHUNK_SIZE
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
MAX_CHUNKS_PER_DOC = 1000
|
| 44 |
|
| 45 |
# Retrieval settings
|
|
|
|
| 38 |
# - large: ~3GB, best accuracy
|
| 39 |
|
| 40 |
# Chunking settings
|
| 41 |
+
# CHUNK_SIZE: target characters per chunk (~800 chars ≈ 2-4 paragraphs of lecture notes).
|
| 42 |
+
# Old value was 512 which was too small and split concepts mid-sentence.
|
| 43 |
+
CHUNK_SIZE = 800
|
| 44 |
+
# CHUNK_OVERLAP: characters of text from the previous chunk included at the start
|
| 45 |
+
# of the next one, so the embedding always sees a coherent context boundary.
|
| 46 |
+
# Old value was 50 (word count, not chars) — now consistently chars.
|
| 47 |
+
CHUNK_OVERLAP = 150
|
| 48 |
MAX_CHUNKS_PER_DOC = 1000
|
| 49 |
|
| 50 |
# Retrieval settings
|
vectordb/document_processor.py
CHANGED
|
@@ -1,172 +1,473 @@
|
|
| 1 |
"""
|
| 2 |
-
Document processing and chunking
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
"""
|
| 4 |
-
|
|
|
|
| 5 |
from pathlib import Path
|
| 6 |
-
from typing import List, Dict
|
| 7 |
-
|
| 8 |
-
import pdfplumber
|
| 9 |
-
from docx import Document
|
| 10 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
class DocumentChunk:
|
| 13 |
-
def __init__(
|
| 14 |
-
self,
|
| 15 |
-
text: str,
|
| 16 |
-
metadata: Dict,
|
| 17 |
-
chunk_id: int
|
| 18 |
-
):
|
| 19 |
self.text = text
|
| 20 |
self.metadata = metadata
|
| 21 |
self.chunk_id = chunk_id
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
for sentence in sentences:
|
| 102 |
-
sentence = sentence.strip()
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
else
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
current_chunk = sentence[chunk_size:]
|
| 116 |
else:
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
def process_document(
|
| 126 |
self,
|
| 127 |
file_path: str,
|
| 128 |
-
metadata: Dict = None
|
| 129 |
) -> List[DocumentChunk]:
|
| 130 |
"""
|
| 131 |
-
Process document into
|
| 132 |
-
|
| 133 |
-
Args:
|
| 134 |
-
file_path: Path to document
|
| 135 |
-
metadata: Additional metadata
|
| 136 |
-
|
| 137 |
-
Returns:
|
| 138 |
-
List of DocumentChunk objects
|
| 139 |
"""
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
}
|
| 150 |
-
|
| 151 |
if metadata:
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
#
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
#
|
|
|
|
|
|
|
|
|
|
| 158 |
doc_chunks = []
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
| 171 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 172 |
return doc_chunks
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
"""
|
| 2 |
+
Document processing and chunking — semantic, structure-aware pipeline.

Improvements over the old version
─────────────────────────────────
• PDF extracted page-by-page via PyMuPDF (fitz) → pdfplumber fallback
  – Tracks page numbers per chunk for precise citations
  – Auto-detects & removes repeated headers/footers (noise lines appearing
    on ≥40 % of pages)
  – Fixes hyphenated line-breaks (word-\nbreak → wordbreak)
• Three-level chunking hierarchy:
  1. Detect section headings → each section stays together where possible
  2. Split into paragraphs (double-newline / blank line)
  3. Split paragraphs into sentences (abbreviation-aware regex)
  Sentences are then accumulated into target-size chunks so a chunk never
  cuts in the middle of a sentence.
• Chunk overlap carried as actual character text (not word count) so the
  embedding always sees a coherent intro from the previous chunk.
• Minimum chunk size filter (100 chars) — avoids storing page numbers,
  lone headers, or empty fragments.
• Metadata per chunk now includes: page_start, page_end, section_title,
  char_count, chunk_index, total_chunks, source, file_type, institution_id,
  course_id (passed in by caller).
|
| 24 |
"""
|
| 25 |
+
|
| 26 |
+
import re
|
| 27 |
from pathlib import Path
|
| 28 |
+
from typing import List, Dict, Tuple, Optional
|
| 29 |
+
|
|
|
|
|
|
|
| 30 |
from config import CHUNK_SIZE, CHUNK_OVERLAP
|
| 31 |
|
| 32 |
+
# ──────────────────────────────────────────────────────────────────────────────
# Data class
# ──────────────────────────────────────────────────────────────────────────────
|
| 35 |
+
|
| 36 |
class DocumentChunk:
    """A single chunk of document text plus the metadata that describes it."""

    def __init__(self, text: str, metadata: Dict, chunk_id: int):
        # text:     the chunk's text content
        # metadata: caller-supplied dict stored by reference (not copied)
        # chunk_id: integer identifier assigned by the creator of the chunk
        self.text, self.metadata, self.chunk_id = text, metadata, chunk_id
|
| 41 |
|
| 42 |
+
|
| 43 |
+
# ──────────────────────────────────────────────────────────────────────────────
# Low-level text utilities
# ──────────────────────────────────────────────────────────────────────────────
|
| 46 |
+
|
| 47 |
+
# Common abbreviations that end with a period but are NOT sentence endings.
# Stored as a "|"-separated alternation of fixed-width regex fragments; no
# fragment may itself contain "|" because the string is split on it below.
_ABBREV_PAT = (
    r"Dr|Mr|Mrs|Ms|Prof|Sr|Jr|Rev|Gen|Sgt|Cpl|Pvt|Lt|Capt|Cmdr|Adm"
    r"|etc|Fig|fig|vs|i\.e|e\.g|Eq|eq|No|ref|approx|cf|et\sal|vol|ed"
    r"|pp|ch|sec|dept|univ|est|govt|corp|inc|ltd|co|eng|tech|lab|exp"
    r"|max|min|avg|std|def|Def|Prop|Thm|Cor|Lem|Ex|Eg|Jan|Feb|Mar|Apr"
    r"|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Mon|Tue|Wed|Thu|Fri|Sat|Sun"
)

# Sentence boundary: (. or ! or ?) followed by 1-3 whitespace characters and
# then an uppercase letter, digit or double quote, but NOT when the period
# belongs to a known abbreviation.
#
# Python's `re` only accepts fixed-width look-behinds, so a single
# `(?<!(?:Dr|Mrs|Prof...))` cannot compile. Instead one negative look-behind is
# emitted per abbreviation. Each one includes the trailing "\." (the look-behind
# is evaluated *after* the period) and a leading "\b" so that e.g. "arrived."
# is not mistaken for the abbreviation "ed.".
_SENT_BOUNDARY = re.compile(
    "".join(rf"(?<!\b{abbrev}\.)" for abbrev in _ABBREV_PAT.split("|"))
    + r"(?<=[.!?])\s{1,3}(?=[A-Z0-9\"])"
)
|
| 61 |
+
|
| 62 |
+
# Translation table (for str.translate) that expands the single-code-point
# ligatures U+FB00..U+FB06 into their plain-letter spellings.
_LIGATURES = str.maketrans(
    dict(
        zip(
            ("\uFB00", "\uFB01", "\uFB02", "\uFB03", "\uFB04", "\uFB05", "\uFB06"),
            ("ff", "fi", "fl", "ffi", "ffl", "st", "st"),
        )
    )
)
|
| 67 |
+
|
| 68 |
+
# Optional remainder of a heading line after its recognised prefix. It must not
# end in sentence punctuation, which keeps short numbered sentences such as
# "1. Install the package." from being classified as headings.
_HEADING_TAIL = r"(?:.*[^\s.,;])?"

# Heading detection: a (single) line is a heading if it matches any of these
# alternatives in full. Previously each alternative was followed directly by
# the end-of-line anchor, so "1. Introduction" or "Chapter 3: Vectors" could
# never match; the tail above allows the title text that follows the prefix.
_HEADING_RE = re.compile(
    r"^\s*("
    r"\d+(\.\d+)*\.?\s+[A-Z]" + _HEADING_TAIL +   # 1. Introduction / 1.2 Overview
    r"|[A-Z][A-Z\s]{4,}[A-Z]"                     # ALL CAPS (min 6 chars)
    r"|Chapter\s+\d+" + _HEADING_TAIL +           # Chapter N[: title]
    r"|Section\s+\d+" + _HEADING_TAIL +           # Section N[: title]
    r"|[IVXLCDM]+\.\s+[A-Z]" + _HEADING_TAIL +    # Roman numeral heading
    r")\s*$",
    re.MULTILINE,
)
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
def _fix_text(raw: str) -> str:
    """
    Lightly clean extracted text while preserving paragraph structure.

    Steps, in order: expand ligatures, normalise line endings, drop trailing
    blanks on each line, re-join words hyphenated across a line break, turn
    single newlines into spaces, collapse runs of spaces/tabs, and collapse
    3+ newlines to exactly two.

    Args:
        raw: Text as produced by an extractor or read from a file.

    Returns:
        The cleaned text with leading/trailing whitespace removed; paragraphs
        are separated by exactly one blank line.
    """
    text = raw.translate(_LIGATURES)
    # Normalise CRLF / CR so the newline-based rules below see every line break.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    # Drop trailing spaces/tabs. Without this a "blank" line that contains only
    # whitespace ("\n \n") is not recognised as a paragraph break and the two
    # paragraphs around it get merged into one.
    text = re.sub(r"[ \t]+\n", "\n", text)
    # Fix soft-hyphen (U+00AD) / hard-hyphen line-breaks: "some-\nword" -> "someword"
    text = re.sub(r"(\w)[-\u00AD]\n(\w)", r"\1\2", text)
    # Replace single lone newlines inside a paragraph with a space
    # but preserve real paragraph breaks (2+ newlines stay)
    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
    # Collapse runs of spaces (but not newlines)
    text = re.sub(r"[ \t]{2,}", " ", text)
    # Collapse 3+ newlines to exactly one blank line
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def _split_sentences(paragraph: str) -> List[str]:
    """
    Break a paragraph into sentences at every `_SENT_BOUNDARY` match.

    Each piece is stripped and empty pieces are discarded. If nothing is left,
    the stripped paragraph itself is returned as the only element.
    """
    stripped = paragraph.strip()
    pieces = (piece.strip() for piece in _SENT_BOUNDARY.split(stripped))
    non_empty = [piece for piece in pieces if piece]
    return non_empty or [stripped]
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
def _split_paragraphs(text: str) -> List[str]:
    """
    Split text into paragraphs at runs of two or more newlines.

    Every paragraph is stripped of surrounding whitespace; paragraphs that are
    empty after stripping are omitted.
    """
    stripped = (block_text.strip() for block_text in re.split(r"\n{2,}", text))
    return [block_text for block_text in stripped if block_text]
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
def _detect_heading(line: str) -> bool:
    """Tell whether the stripped line matches the `_HEADING_RE` heading pattern."""
    candidate = line.strip()
    return _HEADING_RE.match(candidate) is not None
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# ──────────────────────────────────────────────────────────────────────────────
# PDF extraction helpers
# ──────────────────────────────────────────────────────────────────────────────
|
| 127 |
+
|
| 128 |
+
def _extract_pdf_pages_fitz(file_path: str) -> List[Tuple[int, str]]:
    """
    Read a PDF with PyMuPDF (fitz) and return its text page by page.

    Returns a list of (1-based page number, text) pairs; pages whose text is
    empty or whitespace-only are left out.
    """
    import fitz  # PyMuPDF; imported lazily so the module loads without it

    with fitz.open(file_path) as doc:
        # "text" selects PyMuPDF's plain-text output mode.
        numbered = (
            (number, page.get_text("text"))
            for number, page in enumerate(doc, start=1)
        )
        # Materialise inside the `with` so pages are read while the file is open.
        return [(number, text) for number, text in numbered if text.strip()]
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def _extract_pdf_pages_pdfplumber(file_path: str) -> List[Tuple[int, str]]:
    """
    Fallback extractor: read a PDF with pdfplumber, page by page.

    Returns (1-based page number, text) pairs, skipping pages for which
    pdfplumber yields no text or only whitespace.
    """
    import pdfplumber  # imported lazily so the module loads without it

    result: List[Tuple[int, str]] = []
    with pdfplumber.open(file_path) as pdf:
        for number, page in enumerate(pdf.pages, start=1):
            text = page.extract_text()
            # A falsy result (None or "") means the page has no usable text.
            if text and text.strip():
                result.append((number, text))
    return result
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
def _extract_pdf_pages_pypdf2(file_path: str) -> List[Tuple[int, str]]:
    """
    Last-resort extractor: read a PDF with PyPDF2, page by page.

    Returns (1-based page number, text) pairs, skipping pages for which
    PyPDF2 yields no text or only whitespace.
    """
    import PyPDF2  # imported lazily so the module loads without it

    with open(file_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        extracted = (
            (number, page.extract_text() or "")
            for number, page in enumerate(reader.pages, start=1)
        )
        # Consume the generator while the file handle is still open.
        return [(number, text) for number, text in extracted if text.strip()]
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def _remove_headers_footers(
    pages: List[Tuple[int, str]],
    threshold: float = 0.40,
) -> List[Tuple[int, str]]:
    """
    Strip repeated page headers/footers and lone page-number lines.

    The first and last non-blank line of every page are normalised (each
    standalone number replaced by "N", so "Page 1" and "Page 2" compare equal)
    and counted. A normalised line that occurs as first line (resp. last line)
    on at least `threshold` of all pages is treated as a header (resp. footer)
    and removed from the pages where it occupies that position.

    Independently, any line consisting only of a 1-4 digit number, optionally
    surrounded by whitespace and dashes, is removed wherever it appears.

    Args:
        pages: (page_number, text) pairs.
        threshold: Minimum fraction of pages a line must appear on.

    Returns:
        New (page_number, text) pairs with the noise lines removed. When fewer
        than three pages are given the input list is returned unchanged,
        because repetition cannot be detected reliably.
    """
    if len(pages) < 3:
        return pages  # too few pages to detect reliably

    def normalise(line: str) -> str:
        # Replace standalone numbers so numbered headers/footers compare equal.
        return re.sub(r"\b\d+\b", "N", line.strip())

    # Count the first / last non-blank line of each page.
    first_lines: Dict[str, int] = {}
    last_lines: Dict[str, int] = {}
    for _, text in pages:
        content = [ln for ln in text.splitlines() if ln.strip()]
        if not content:
            continue
        first = normalise(content[0])
        last = normalise(content[-1])
        first_lines[first] = first_lines.get(first, 0) + 1
        last_lines[last] = last_lines.get(last, 0) + 1

    total = len(pages)
    noisy_first = {k for k, v in first_lines.items() if v / total >= threshold}
    noisy_last = {k for k, v in last_lines.items() if v / total >= threshold}

    # Lone page number, optionally decorated with hyphens / en dashes / em
    # dashes (e.g. "12", "- 12 -"). The dashes are written as escapes so the
    # pattern survives encoding round-trips.
    page_number_line = re.compile(r"[\s\-\u2013\u2014]*\d{1,4}[\s\-\u2013\u2014]*")

    cleaned = []
    for page_num, text in pages:
        lines = text.splitlines()
        # Locate the first/last NON-BLANK line so removal looks at the same
        # lines that were counted above; comparing against raw index 0 / -1
        # missed the header or footer whenever a page started or ended with a
        # blank line.
        content_idx = [i for i, ln in enumerate(lines) if ln.strip()]
        first_idx = content_idx[0] if content_idx else None
        last_idx = content_idx[-1] if content_idx else None

        filtered = []
        for idx, line in enumerate(lines):
            normalised = normalise(line)
            if idx == first_idx and normalised in noisy_first:
                continue
            if idx == last_idx and normalised in noisy_last:
                continue
            # Also skip lone page-number lines anywhere in the page
            if page_number_line.fullmatch(line):
                continue
            filtered.append(line)
        cleaned.append((page_num, "\n".join(filtered)))
    return cleaned
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
# ──────────────────────────────────────────────────────────────────────────────
# Core chunker
# ──────────────────────────────────────────────────────────────────────────────
|
| 218 |
+
|
| 219 |
+
def _build_chunks(
    passages: List[Tuple[str, int, Optional[str]]],  # (text, page_num, section_title)
    target_size: int = CHUNK_SIZE,
    overlap_chars: int = CHUNK_OVERLAP,
    min_chunk_size: int = 100,
) -> List[Dict]:
    """
    Accumulate sentence-split text into target-sized chunks with char overlap.

    Each passage is split into sentences. Sentences are appended to the open
    chunk until adding the next one would exceed `target_size`; the chunk is
    then flushed and a new one is started, seeded with the last
    `overlap_chars` characters of the flushed chunk. Sentences are never cut,
    so a chunk can exceed `target_size` when a single sentence (plus the
    overlap seed) is longer than the target.

    Args:
        passages: (text, page_num, section_title) tuples in reading order. A
            falsy section_title means "same section as before".
        target_size: Soft upper bound on chunk length, in characters.
        overlap_chars: Number of trailing characters of a flushed chunk that
            are repeated at the start of the next one. 0 or less disables
            overlap.
        min_chunk_size: Chunks shorter than this are not emitted. A pending
            chunk below this size is merged forward into the following
            sentences instead of being discarded; only a too-short chunk at
            the very end of the input is dropped.

    Returns:
        A list of dicts with keys: text, page_start, page_end, section_title.
        section_title is the section that was current when the chunk's first
        new sentence was added.
    """
    chunks: List[Dict] = []
    current_text = ""
    current_page_start: Optional[int] = None
    current_page_end: Optional[int] = None
    current_section: Optional[str] = None  # latest section seen in the passage stream
    chunk_section: Optional[str] = None    # section in force when the open chunk started
    overlap_seed = ""                      # tail of the last emitted chunk

    def flush() -> None:
        nonlocal current_text, current_page_start, current_page_end, overlap_seed
        text = current_text.strip()
        if len(text) >= min_chunk_size:
            chunks.append({
                "text": text,
                "page_start": current_page_start,
                "page_end": current_page_end,
                # Use the section captured at chunk start: `current_section`
                # may already point at the NEXT section by the time we flush.
                "section_title": chunk_section,
            })
            # Seed the next chunk with the tail of this one. Guard the zero
            # case explicitly: text[-0:] is the WHOLE string, not "".
            overlap_seed = text[-overlap_chars:] if overlap_chars > 0 else ""
        current_text = ""
        current_page_start = None
        current_page_end = None

    for passage_text, page_num, section_title in passages:
        # Update section tracking
        if section_title:
            current_section = section_title

        for sentence in _split_sentences(passage_text):
            sentence = sentence.strip()
            if not sentence:
                continue

            # Would adding this sentence overflow the target?
            projected = len(current_text) + (1 if current_text else 0) + len(sentence)
            # Only close the open chunk if it is big enough to be emitted;
            # otherwise flushing would silently throw its text away.
            pending_is_emittable = len(current_text.strip()) >= min_chunk_size

            if current_text and projected > target_size and pending_is_emittable:
                flush()

            if not current_text:
                # Fresh chunk: overlap seed (if any) first, then the sentence.
                current_text = f"{overlap_seed} {sentence}".strip() if overlap_seed else sentence
                current_page_start = page_num
                current_page_end = page_num
                chunk_section = current_section
            else:
                current_text += " " + sentence
                if current_page_end is None:
                    current_page_end = page_num
                else:
                    current_page_end = max(current_page_end, page_num)

    # Flush the last partial chunk
    if current_text.strip():
        flush()

    return chunks
|
| 297 |
+
|
| 298 |
+
|
| 299 |
+
# ──────────────────────────────────────────────────────────────────────────────
# Main processor class (public API unchanged)
# ──────────────────────────────────────────────────────────────────────────────
|
| 302 |
+
|
| 303 |
+
class DocumentProcessor:
    """Load .pdf / .txt / .docx files and split them into DocumentChunk objects."""

    def __init__(self):
        self.supported_formats = [".pdf", ".txt", ".docx"]

    # ── Public entry point ────────────────────────────────────────────────────

    def process_document(
        self,
        file_path: str,
        metadata: Optional[Dict] = None,
    ) -> List[DocumentChunk]:
        """
        Process a document file into semantically coherent chunks.

        Args:
            file_path: Path to a .pdf, .txt or .docx file.
            metadata: Optional extra metadata merged into every chunk's
                metadata; its keys override the built-in source / file_path /
                file_type entries on collision.

        Returns:
            A list of DocumentChunk objects whose chunk_id equals their index
            in the list.

        Raises:
            ValueError: If the file extension is not supported.
        """
        path = Path(file_path)
        ext = path.suffix.lower()

        # TXT and DOCX have no page concept here, so all their text is page 1.
        if ext == ".pdf":
            pages = self._load_pdf_pages(file_path)
        elif ext == ".txt":
            pages = [(1, self._load_txt(file_path))]
        elif ext == ".docx":
            pages = [(1, self._load_docx(file_path))]
        else:
            raise ValueError(f"Unsupported file format: {ext}")

        # Build base metadata
        file_meta = {
            "source": path.name,
            "file_path": str(file_path),
            "file_type": ext,
        }
        if metadata:
            file_meta.update(metadata)

        # Convert pages to passage list with section tracking
        passages = self._pages_to_passages(pages)

        # Build variable-length chunks
        raw_chunks = _build_chunks(passages, target_size=CHUNK_SIZE, overlap_chars=CHUNK_OVERLAP)

        # Wrap into DocumentChunk objects
        doc_chunks = []
        total = len(raw_chunks)
        for i, rc in enumerate(raw_chunks):
            chunk_meta = file_meta.copy()
            chunk_meta["chunk_index"] = i
            chunk_meta["total_chunks"] = total
            chunk_meta["char_count"] = len(rc["text"])
            chunk_meta["page_start"] = rc.get("page_start")
            chunk_meta["page_end"] = rc.get("page_end")
            # section_title is only set when a heading was detected.
            if rc.get("section_title"):
                chunk_meta["section_title"] = rc["section_title"]

            doc_chunks.append(DocumentChunk(
                text=rc["text"],
                metadata=chunk_meta,
                chunk_id=i,
            ))

        # max(total, 1) avoids a division by zero when no chunk was produced.
        print(f"\u2705 Chunked '{path.name}' \u2192 {total} chunks "
              f"(avg {sum(len(c.text) for c in doc_chunks)//max(total,1)} chars each)")
        return doc_chunks

    # ── Legacy interface (still works; used by some older code paths) ─────────

    def load_document(self, file_path: str) -> str:
        """
        Return the full cleaned text of a document as a single string.

        PDF pages are joined with a blank line between them.

        Raises:
            ValueError: If the file extension is not supported.
        """
        ext = Path(file_path).suffix.lower()
        if ext == ".pdf":
            pages = self._load_pdf_pages(file_path)
            return "\n\n".join(text for _, text in pages)
        elif ext == ".txt":
            return self._load_txt(file_path)
        elif ext == ".docx":
            return self._load_docx(file_path)
        raise ValueError(f"Unsupported format: {ext}")

    def chunk_text(self, text: str, chunk_size: int = CHUNK_SIZE,
                   overlap: int = CHUNK_OVERLAP) -> List[str]:
        """Legacy helper: return the chunk strings produced from a raw text blob."""
        # The whole text is treated as one passage on page 1 with no section.
        passages = [(text, 1, None)]
        raw_chunks = _build_chunks(passages, target_size=chunk_size, overlap_chars=overlap)
        return [rc["text"] for rc in raw_chunks]

    # ── PDF loading ───────────────────────────────────────────────────────────

    def _load_pdf_pages(self, file_path: str) -> List[Tuple[int, str]]:
        """
        Extract per-page text from a PDF with a fallback chain.

        Tries PyMuPDF, then pdfplumber, then PyPDF2; the next extractor is used
        when the previous one raised or returned no pages. Failures of the
        first two are printed and swallowed; a PyPDF2 failure propagates.

        Returns:
            (page_number, cleaned_text) pairs with repeated headers/footers
            removed; pages that are empty after cleaning are omitted.

        Raises:
            RuntimeError: If no extractor produced any text.
        """
        pages = None

        # 1. PyMuPDF (tried first)
        try:
            pages = _extract_pdf_pages_fitz(file_path)
        except Exception as e:
            print(f" fitz failed ({e}), trying pdfplumber\u2026")

        # 2. pdfplumber
        if not pages:
            try:
                pages = _extract_pdf_pages_pdfplumber(file_path)
            except Exception as e:
                print(f" pdfplumber failed ({e}), trying PyPDF2\u2026")

        # 3. PyPDF2 last resort
        if not pages:
            pages = _extract_pdf_pages_pypdf2(file_path)

        if not pages:
            raise RuntimeError(f"Could not extract any text from: {file_path}")

        # Remove noise headers/footers, then clean each page. Each page is
        # cleaned exactly once and kept only if something is left.
        cleaned_pages = []
        for page_num, text in _remove_headers_footers(pages):
            cleaned = _fix_text(text)
            if cleaned:
                cleaned_pages.append((page_num, cleaned))
        return cleaned_pages

    # ── Plain text / DOCX loading ─────────────────────────────────────────────

    def _load_txt(self, file_path: str) -> str:
        """Read a UTF-8 text file (undecodable bytes replaced) and clean it."""
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            return _fix_text(f.read())

    def _load_docx(self, file_path: str) -> str:
        """Return the cleaned text of a DOCX file, one paragraph per block."""
        from docx import Document as DocxDoc  # imported lazily

        doc = DocxDoc(file_path)
        stripped = (para.text.strip() for para in doc.paragraphs)
        # Blank-line separation makes each DOCX paragraph its own paragraph
        # for the downstream paragraph splitter.
        return _fix_text("\n\n".join(text for text in stripped if text))

    # ── Section/passage extraction ────────────────────────────────────────────

    def _pages_to_passages(
        self,
        pages: List[Tuple[int, str]],
    ) -> List[Tuple[str, int, Optional[str]]]:
        """
        Convert (page_num, text) pairs into a flat list of
        (passage_text, page_num, section_title) tuples.

        Each page is split into paragraphs. A paragraph shorter than 120
        characters whose first line looks like a heading becomes the current
        section title and is not emitted as a passage itself; every other
        paragraph is emitted tagged with the most recent section title (None
        until the first heading is seen). The section carries across pages.
        """
        passages: List[Tuple[str, int, Optional[str]]] = []
        current_section: Optional[str] = None

        for page_num, page_text in pages:
            for para in _split_paragraphs(page_text):
                stripped = para.strip()
                if not stripped:
                    continue

                # Is this paragraph a standalone heading?
                first_line = para.splitlines()[0].strip()
                if _detect_heading(first_line) and len(stripped) < 120:
                    current_section = stripped
                    # A bare heading produces no passage of its own; it is
                    # attached to the following passages as section_title.
                    continue

                passages.append((para, page_num, current_section))

        return passages
|
| 473 |
+
|