Jay-10020 committed on
Commit
d0d84d2
Β·
1 Parent(s): 4e4501d

Minor update for chunking improvement

Browse files
Files changed (2) hide show
  1. config.py +7 -2
  2. vectordb/document_processor.py +451 -150
config.py CHANGED
@@ -38,8 +38,13 @@ WHISPER_MODEL = "tiny" # Options: tiny, base, small, medium, large (tiny=75MB f
38
  # - large: ~3GB, best accuracy
39
 
40
  # Chunking settings
41
- CHUNK_SIZE = 512
42
- CHUNK_OVERLAP = 50
 
 
 
 
 
43
  MAX_CHUNKS_PER_DOC = 1000
44
 
45
  # Retrieval settings
 
38
  # - large: ~3GB, best accuracy
39
 
40
  # Chunking settings
41
+ # CHUNK_SIZE: target characters per chunk (~800 chars β‰ˆ 2-4 paragraphs of lecture notes).
42
+ # Old value was 512 which was too small and split concepts mid-sentence.
43
+ CHUNK_SIZE = 800
44
+ # CHUNK_OVERLAP: characters of text from the previous chunk included at the start
45
+ # of the next one, so the embedding always sees a coherent context boundary.
46
+ # Old value was 50 (word count, not chars) β€” now consistently chars.
47
+ CHUNK_OVERLAP = 150
48
  MAX_CHUNKS_PER_DOC = 1000
49
 
50
  # Retrieval settings
vectordb/document_processor.py CHANGED
@@ -1,172 +1,473 @@
1
  """
2
- Document processing and chunking
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  """
4
- import os
 
5
  from pathlib import Path
6
- from typing import List, Dict
7
- import PyPDF2
8
- import pdfplumber
9
- from docx import Document
10
  from config import CHUNK_SIZE, CHUNK_OVERLAP
11
 
 
 
 
 
12
  class DocumentChunk:
13
- def __init__(
14
- self,
15
- text: str,
16
- metadata: Dict,
17
- chunk_id: int
18
- ):
19
  self.text = text
20
  self.metadata = metadata
21
  self.chunk_id = chunk_id
22
 
23
- class DocumentProcessor:
24
- def __init__(self):
25
- self.supported_formats = ['.pdf', '.txt', '.docx']
26
-
27
- def load_document(self, file_path: str) -> str:
28
- """Load document content based on file type"""
29
- path = Path(file_path)
30
-
31
- if not path.exists():
32
- raise FileNotFoundError(f"File not found: {file_path}")
33
-
34
- ext = path.suffix.lower()
35
-
36
- if ext == '.pdf':
37
- return self._load_pdf(file_path)
38
- elif ext == '.txt':
39
- return self._load_txt(file_path)
40
- elif ext == '.docx':
41
- return self._load_docx(file_path)
42
- else:
43
- raise ValueError(f"Unsupported file format: {ext}")
44
-
45
- def _load_pdf(self, file_path: str) -> str:
46
- """Extract text from PDF"""
47
- text = ""
48
- try:
49
- # Try pdfplumber first (better for tables)
50
- with pdfplumber.open(file_path) as pdf:
51
- for page in pdf.pages:
52
- page_text = page.extract_text()
53
- if page_text:
54
- text += page_text + "\n"
55
- except:
56
- # Fallback to PyPDF2
57
- with open(file_path, 'rb') as file:
58
- pdf_reader = PyPDF2.PdfReader(file)
59
- for page in pdf_reader.pages:
60
- text += page.extract_text() + "\n"
61
-
62
- return text.strip()
63
-
64
- def _load_txt(self, file_path: str) -> str:
65
- """Load text file"""
66
- with open(file_path, 'r', encoding='utf-8') as file:
67
- return file.read()
68
-
69
- def _load_docx(self, file_path: str) -> str:
70
- """Extract text from DOCX"""
71
- doc = Document(file_path)
72
- text = "\n".join([para.text for para in doc.paragraphs])
73
- return text
74
-
75
- def chunk_text(
76
- self,
77
- text: str,
78
- chunk_size: int = CHUNK_SIZE,
79
- overlap: int = CHUNK_OVERLAP
80
- ) -> List[str]:
81
- """
82
- Split text into overlapping chunks
83
-
84
- Args:
85
- text: Input text
86
- chunk_size: Maximum chunk size in characters
87
- overlap: Overlap between chunks
88
-
89
- Returns:
90
- List of text chunks
91
- """
92
- if not text:
93
- return []
94
-
95
- # Split by sentences first (simple approach)
96
- sentences = text.replace('\n', ' ').split('. ')
97
-
98
- chunks = []
99
- current_chunk = ""
100
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
101
  for sentence in sentences:
102
- sentence = sentence.strip() + ". "
103
-
104
- # If adding this sentence exceeds chunk size
105
- if len(current_chunk) + len(sentence) > chunk_size:
106
- if current_chunk:
107
- chunks.append(current_chunk.strip())
108
- # Start new chunk with overlap
109
- words = current_chunk.split()
110
- overlap_words = words[-overlap:] if len(words) > overlap else words
111
- current_chunk = " ".join(overlap_words) + " " + sentence
112
- else:
113
- # Sentence itself is longer than chunk_size
114
- chunks.append(sentence[:chunk_size])
115
- current_chunk = sentence[chunk_size:]
116
  else:
117
- current_chunk += sentence
118
-
119
- # Add last chunk
120
- if current_chunk:
121
- chunks.append(current_chunk.strip())
122
-
123
- return chunks
124
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
  def process_document(
126
  self,
127
  file_path: str,
128
- metadata: Dict = None
129
  ) -> List[DocumentChunk]:
130
  """
131
- Process document into chunks with metadata
132
-
133
- Args:
134
- file_path: Path to document
135
- metadata: Additional metadata
136
-
137
- Returns:
138
- List of DocumentChunk objects
139
  """
140
- # Load document
141
- text = self.load_document(file_path)
142
-
143
- # Create metadata
144
- file_metadata = {
145
- 'source': str(Path(file_path).name),
146
- 'file_path': str(file_path),
147
- 'file_type': Path(file_path).suffix,
148
- 'total_chars': len(text)
 
 
 
 
 
 
 
 
 
 
149
  }
150
-
151
  if metadata:
152
- file_metadata.update(metadata)
153
-
154
- # Chunk text
155
- chunks = self.chunk_text(text)
156
-
157
- # Create DocumentChunk objects
 
 
 
158
  doc_chunks = []
159
- for i, chunk in enumerate(chunks):
160
- chunk_metadata = file_metadata.copy()
161
- chunk_metadata['chunk_index'] = i
162
- chunk_metadata['total_chunks'] = len(chunks)
163
-
164
- doc_chunks.append(
165
- DocumentChunk(
166
- text=chunk,
167
- metadata=chunk_metadata,
168
- chunk_id=i
169
- )
170
- )
171
-
 
 
 
 
 
 
172
  return doc_chunks
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  """
2
+ Document processing and chunking β€” semantic, structure-aware pipeline.
3
+
4
+ Improvements over the old version
5
+ ──────────────────────────────────
6
+ • PDF extracted page-by-page via PyMuPDF (fitz) → pdfplumber → PyPDF2 fallback chain
7
+ – Tracks page numbers per chunk for precise citations
8
+ – Auto-detects & removes repeated headers/footers (noise lines appearing
9
+ on β‰₯40 % of pages)
10
+ – Fixes hyphenated line-breaks (word-\nbreak β†’ wordbreak)
11
+ β€’ Three-level chunking hierarchy:
12
+ 1. Detect section headings β†’ each section stays together where possible
13
+ 2. Split into paragraphs (double-newline / blank line)
14
+ 3. Split paragraphs into sentences (abbreviation-aware regex)
15
+ Sentences are then accumulated into target-size chunks so a chunk never
16
+ cuts in the middle of a sentence.
17
+ β€’ Chunk overlap carried as actual character text (not word count) so the
18
+ embedding always sees a coherent intro from the previous chunk.
19
+ β€’ Minimum chunk size filter (100 chars) β€” avoids storing page numbers,
20
+ lone headers, or empty fragments.
21
+ β€’ Metadata per chunk now includes: page_start, page_end, section_title,
22
+ char_count, chunk_index, total_chunks, source, file_type, institution_id,
23
+ course_id (passed in by caller).
24
  """
25
+
26
+ import re
27
  from pathlib import Path
28
+ from typing import List, Dict, Tuple, Optional
29
+
 
 
30
  from config import CHUNK_SIZE, CHUNK_OVERLAP
31
 
32
+ # ──────────────────────────────────────────────────────────────────────────────
33
+ # Data class
34
+ # ──────────────────────────────────────────────────────────────────────────────
35
+
36
class DocumentChunk:
    """A single chunk of a processed document.

    Attributes:
        text: The chunk's text content.
        metadata: Chunk-level metadata (source, page range, section title, ...).
        chunk_id: 0-based index of this chunk within its document.
    """

    def __init__(self, text: str, metadata: Dict, chunk_id: int):
        self.text = text
        self.metadata = metadata
        self.chunk_id = chunk_id

    def __repr__(self) -> str:
        # Compact form for debugging chunking output; avoids dumping full text.
        return (f"{type(self).__name__}(chunk_id={self.chunk_id}, "
                f"chars={len(self.text)})")
41
 
42
+
43
+ # ──────────────────────────────────────────────────────────────���───────────────
44
+ # Low-level text utilities
45
+ # ──────────────────────────────────────────────────────────────────────────────
46
+
47
+ # Common abbreviations that end with a period but are NOT sentence endings.
48
+ _ABBREV_PAT = (
49
+ r"Dr|Mr|Mrs|Ms|Prof|Sr|Jr|Rev|Gen|Sgt|Cpl|Pvt|Lt|Capt|Cmdr|Adm"
50
+ r"|etc|Fig|fig|vs|i\.e|e\.g|Eq|eq|No|ref|approx|cf|et\sal|vol|ed"
51
+ r"|pp|ch|sec|dept|univ|est|govt|corp|inc|ltd|co|eng|tech|lab|exp"
52
+ r"|max|min|avg|std|def|Def|Prop|Thm|Cor|Lem|Ex|Eg|Jan|Feb|Mar|Apr"
53
+ r"|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Mon|Tue|Wed|Thu|Fri|Sat|Sun"
54
+ )
55
+
56
+ # Sentence boundary: (. or ! or ?) followed by whitespace + uppercase/digit,
57
+ # but NOT preceded by a known abbreviation.
58
+ _SENT_BOUNDARY = re.compile(
59
+ r"(?<!(?:" + _ABBREV_PAT + r"))(?<=[.!?])\s{1,3}(?=[A-Z0-9\"])"
60
+ )
61
+
62
+ # Unicode ligatures that PDFs sometimes embed
63
+ _LIGATURES = str.maketrans({
64
+ "\uFB00": "ff", "\uFB01": "fi", "\uFB02": "fl",
65
+ "\uFB03": "ffi", "\uFB04": "ffl", "\uFB05": "st", "\uFB06": "st",
66
+ })
67
+
68
+ # Heading detection: line is a heading if it matches any of these
69
+ _HEADING_RE = re.compile(
70
+ r"^\s*("
71
+ r"\d+(\.\d+)*\.?\s+[A-Z]" # 1. Introduction / 1.2 Overview
72
+ r"|[A-Z][A-Z\s]{4,}[A-Z]" # ALL CAPS (min 6 chars)
73
+ r"|Chapter\s+\d+" # Chapter N
74
+ r"|Section\s+\d+" # Section N
75
+ r"|[IVXLCDM]+\.\s+[A-Z]" # Roman numeral heading
76
+ r")\s*$",
77
+ re.MULTILINE,
78
+ )
79
+
80
+
81
def _fix_text(raw: str) -> str:
    """Normalise extracted text without destroying paragraph structure.

    Expands PDF ligatures, joins hyphenated line-breaks, folds single
    newlines inside a paragraph into spaces (blank-line paragraph breaks
    are preserved), collapses runs of spaces/tabs, and squeezes 3+ newlines
    down to a single blank line.
    """
    cleaned = raw.translate(_LIGATURES)
    substitutions = (
        (r"(\w)-\n(\w)", r"\1\2"),   # "hyphen-\nated" -> "hyphenated"
        (r"(?<!\n)\n(?!\n)", " "),   # lone newline inside a paragraph -> space
        (r"[ \t]{2,}", " "),         # runs of spaces/tabs -> one space
        (r"\n{3,}", "\n\n"),         # 3+ newlines -> one paragraph break
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()
94
+
95
+
96
def _split_sentences(paragraph: str) -> List[str]:
    """Break *paragraph* into sentences via the abbreviation-aware regex.

    Falls back to the whole (stripped) paragraph when the split yields no
    non-empty pieces.
    """
    stripped = paragraph.strip()
    pieces = [p.strip() for p in _SENT_BOUNDARY.split(stripped) if p.strip()]
    return pieces or [stripped]
105
+
106
+
107
+ def _split_paragraphs(text: str) -> List[str]:
108
+ """Split cleaned text into paragraphs (blank-line or indent separated)."""
109
+ # Split on double newlines (blank lines)
110
+ raw_paras = re.split(r"\n{2,}", text)
111
+ paras = []
112
+ for p in raw_paras:
113
+ p = p.strip()
114
+ if p:
115
+ paras.append(p)
116
+ return paras
117
+
118
+
119
def _detect_heading(line: str) -> bool:
    """True when *line* matches one of the known section-heading shapes."""
    return _HEADING_RE.match(line.strip()) is not None
122
+
123
+
124
+ # ──────────────────────────────────────────────────────────────────────────────
125
+ # PDF extraction helpers
126
+ # ──────────────────────────────────────────────────────────────────────────────
127
+
128
def _extract_pdf_pages_fitz(file_path: str) -> List[Tuple[int, str]]:
    """Extract per-page text with PyMuPDF (fitz).

    Returns [(1-based page number, text), ...]; pages whose extracted text
    is blank are skipped entirely.
    """
    import fitz  # PyMuPDF

    extracted: List[Tuple[int, str]] = []
    with fitz.open(file_path) as doc:
        for page_no, page in enumerate(doc, start=1):
            # "text" mode yields plain text in reading order.
            content = page.get_text("text")
            if content.strip():
                extracted.append((page_no, content))
    return extracted
141
+
142
+
143
def _extract_pdf_pages_pdfplumber(file_path: str) -> List[Tuple[int, str]]:
    """Fallback per-page extraction via pdfplumber (blank pages skipped)."""
    import pdfplumber

    extracted: List[Tuple[int, str]] = []
    with pdfplumber.open(file_path) as pdf:
        for page_no, page in enumerate(pdf.pages, start=1):
            content = page.extract_text() or ""
            if content.strip():
                extracted.append((page_no, content))
    return extracted
153
+
154
+
155
def _extract_pdf_pages_pypdf2(file_path: str) -> List[Tuple[int, str]]:
    """Last-resort per-page extraction via PyPDF2 (blank pages skipped)."""
    import PyPDF2

    extracted: List[Tuple[int, str]] = []
    with open(file_path, "rb") as handle:
        reader = PyPDF2.PdfReader(handle)
        for page_no, page in enumerate(reader.pages, start=1):
            content = page.extract_text() or ""
            if content.strip():
                extracted.append((page_no, content))
    return extracted
166
+
167
+
168
+ def _remove_headers_footers(
169
+ pages: List[Tuple[int, str]],
170
+ threshold: float = 0.40,
171
+ ) -> List[Tuple[int, str]]:
172
+ """
173
+ Remove lines that appear almost identically on β‰₯ threshold fraction of pages
174
+ β€” these are headers/footers (e.g. "Confidential", "Page N", course title).
175
+ """
176
+ if len(pages) < 3:
177
+ return pages # too few pages to detect reliably
178
+
179
+ # Collect first-line and last-line of each page (most common header/footer positions)
180
+ first_lines: Dict[str, int] = {}
181
+ last_lines: Dict[str, int] = {}
182
+
183
+ for _, text in pages:
184
+ lines = [l.strip() for l in text.splitlines() if l.strip()]
185
+ if not lines:
186
+ continue
187
+ # Normalise: strip numbers from the lines to catch "Page 1", "Page 2", etc.
188
+ first = re.sub(r"\b\d+\b", "N", lines[0])
189
+ last = re.sub(r"\b\d+\b", "N", lines[-1])
190
+ first_lines[first] = first_lines.get(first, 0) + 1
191
+ last_lines[last] = last_lines.get(last, 0) + 1
192
+
193
+ total = len(pages)
194
+ noisy_first = {k for k, v in first_lines.items() if v / total >= threshold}
195
+ noisy_last = {k for k, v in last_lines.items() if v / total >= threshold}
196
+
197
+ cleaned = []
198
+ for page_num, text in pages:
199
+ lines = text.splitlines()
200
+ filtered = []
201
+ for idx, line in enumerate(lines):
202
+ normalised = re.sub(r"\b\d+\b", "N", line.strip())
203
+ if idx == 0 and normalised in noisy_first:
204
+ continue
205
+ if idx == len(lines) - 1 and normalised in noisy_last:
206
+ continue
207
+ # Also skip lone page-number lines anywhere in the page
208
+ if re.fullmatch(r"[\s\-–—]*\d{1,4}[\s\-–—]*", line):
209
+ continue
210
+ filtered.append(line)
211
+ cleaned.append((page_num, "\n".join(filtered)))
212
+ return cleaned
213
+
214
+
215
+ # ──────────────────────────────────────────────────────────────────────────────
216
+ # Core chunker
217
+ # ──────────────────────────────────────────────────────────────────────────────
218
+
219
def _build_chunks(
    passages: List[Tuple[str, int, Optional[str]]],  # (text, page_num, section_title)
    target_size: int = CHUNK_SIZE,
    overlap_chars: int = CHUNK_OVERLAP,
    min_chunk_size: int = 100,
) -> List[Dict]:
    """
    Accumulate sentence-split text into target-sized chunks with char overlap.

    Each passage is split into sentences. Sentences are packed into the
    current chunk until `target_size` would be exceeded, then the chunk is
    flushed and a new one starts, seeded with the last `overlap_chars`
    characters of the previous chunk (so context bleeds across boundaries).

    A sentence longer than `target_size` is hard-sliced into `target_size`
    pieces first, so a single run-on sentence (or un-punctuated OCR output)
    can no longer produce an unbounded chunk — previously such a sentence was
    packed whole and the chunk grew without limit.

    Chunks shorter than `min_chunk_size` after stripping are discarded
    (lone headings, stray page fragments).

    Returns a list of dicts: {text, page_start, page_end, section_title}.
    """
    chunks: List[Dict] = []
    current_text = ""
    current_page_start: Optional[int] = None
    current_page_end: Optional[int] = None
    current_section: Optional[str] = None
    overlap_seed = ""  # tail of the last flushed chunk

    def flush():
        nonlocal current_text, current_page_start, current_page_end, overlap_seed
        text = current_text.strip()
        if len(text) >= min_chunk_size:
            chunks.append({
                "text": text,
                "page_start": current_page_start,
                "page_end": current_page_end,
                "section_title": current_section,
            })
            # Seed the next chunk with the last overlap_chars of this one.
            overlap_seed = text[-overlap_chars:] if len(text) > overlap_chars else text
        current_text = ""
        current_page_start = None
        current_page_end = None

    for passage_text, page_num, section_title in passages:
        # Track the most recent section heading for metadata.
        if section_title:
            current_section = section_title

        # Sentence-split, then hard-slice any sentence that alone exceeds the
        # target so chunk sizes stay bounded (BUGFIX — see docstring).
        sentences: List[str] = []
        for sentence in _split_sentences(passage_text):
            sentence = sentence.strip()
            if not sentence:
                continue
            if len(sentence) > target_size:
                sentences.extend(
                    sentence[i:i + target_size]
                    for i in range(0, len(sentence), target_size)
                )
            else:
                sentences.append(sentence)

        for sentence in sentences:
            # Would adding this sentence (plus a joining space) overflow?
            projected = len(current_text) + (1 if current_text else 0) + len(sentence)

            if projected > target_size and current_text:
                flush()
                # Start the new chunk from the overlap seed.
                current_text = overlap_seed + (" " if overlap_seed else "") + sentence
                current_page_start = page_num
                current_page_end = page_num
            else:
                if not current_text:
                    # Fresh chunk — include the overlap seed first.
                    current_text = (overlap_seed + " " + sentence).strip() if overlap_seed else sentence
                    current_page_start = page_num
                else:
                    current_text += " " + sentence
                current_page_end = (
                    page_num if current_page_end is None
                    else max(current_page_end, page_num)
                )

    # Flush the last partial chunk.
    if current_text.strip():
        flush()

    return chunks
297
+
298
+
299
+ # ──────────────────────────────────────────────────────────────────────────────
300
+ # Main processor class (public API unchanged)
301
+ # ──────────────────────────────────────────────────────────────────────────────
302
+
303
class DocumentProcessor:
    """Loads PDF/TXT/DOCX files and turns them into DocumentChunk lists.

    Public API (process_document / load_document / chunk_text) is unchanged
    from the previous pipeline.
    """

    def __init__(self):
        self.supported_formats = [".pdf", ".txt", ".docx"]

    # ── Public entry point ────────────────────────────────────────────────────

    def process_document(
        self,
        file_path: str,
        metadata: Dict = None,
    ) -> List[DocumentChunk]:
        """
        Process a document file into semantically coherent chunks.

        Args:
            file_path: Path to a .pdf, .txt or .docx file.
            metadata: Extra key/values merged into every chunk's metadata
                (e.g. institution_id, course_id from the caller).

        Returns:
            List of DocumentChunk objects; interface is unchanged.

        Raises:
            ValueError: If the file extension is not supported.
        """
        path = Path(file_path)
        ext = path.suffix.lower()

        if ext == ".pdf":
            pages = self._load_pdf_pages(file_path)
        elif ext == ".txt":
            pages = [(1, self._load_txt(file_path))]
        elif ext == ".docx":
            pages = [(1, self._load_docx(file_path))]
        else:
            raise ValueError(f"Unsupported file format: {ext}")

        # Base metadata shared by every chunk of this document.
        file_meta = {
            "source": path.name,
            "file_path": str(file_path),
            "file_type": ext,
            # Restored from the previous pipeline's metadata contract so
            # downstream consumers relying on it keep working.
            "total_chars": sum(len(text) for _, text in pages),
        }
        if metadata:
            file_meta.update(metadata)

        # Pages -> (passage, page, section) triples, then size-bounded chunks.
        passages = self._pages_to_passages(pages)
        raw_chunks = _build_chunks(passages, target_size=CHUNK_SIZE, overlap_chars=CHUNK_OVERLAP)

        # Wrap into DocumentChunk objects with per-chunk metadata.
        doc_chunks = []
        total = len(raw_chunks)
        for i, rc in enumerate(raw_chunks):
            chunk_meta = file_meta.copy()
            chunk_meta["chunk_index"] = i
            chunk_meta["total_chunks"] = total
            chunk_meta["char_count"] = len(rc["text"])
            chunk_meta["page_start"] = rc.get("page_start")
            chunk_meta["page_end"] = rc.get("page_end")
            if rc.get("section_title"):
                chunk_meta["section_title"] = rc["section_title"]

            doc_chunks.append(DocumentChunk(
                text=rc["text"],
                metadata=chunk_meta,
                chunk_id=i,
            ))

        print(f"✅ Chunked '{path.name}' → {total} chunks "
              f"(avg {sum(len(c.text) for c in doc_chunks)//max(total,1)} chars each)")
        return doc_chunks

    # ── Legacy interface (still works; used by some older code paths) ─────────

    def load_document(self, file_path: str) -> str:
        """Return the full cleaned text of a document as a single string."""
        ext = Path(file_path).suffix.lower()
        if ext == ".pdf":
            pages = self._load_pdf_pages(file_path)
            return "\n\n".join(text for _, text in pages)
        elif ext == ".txt":
            return self._load_txt(file_path)
        elif ext == ".docx":
            return self._load_docx(file_path)
        raise ValueError(f"Unsupported format: {ext}")

    def chunk_text(self, text: str, chunk_size: int = CHUNK_SIZE,
                   overlap: int = CHUNK_OVERLAP) -> List[str]:
        """Legacy helper — returns list of chunk strings from a raw text blob."""
        raw_chunks = _build_chunks([(text, 1, None)],
                                   target_size=chunk_size, overlap_chars=overlap)
        return [rc["text"] for rc in raw_chunks]

    # ── PDF loading ───────────────────────────────────────────────────────────

    def _load_pdf_pages(self, file_path: str) -> List[Tuple[int, str]]:
        """Extract per-page text from a PDF with a three-step fallback chain.

        Order: PyMuPDF (fitz) → pdfplumber → PyPDF2. A step that raises or
        returns no pages falls through to the next.

        Raises:
            RuntimeError: If no extractor produced any text.
        """
        pages = None

        # 1. PyMuPDF (best quality, respects reading order)
        try:
            pages = _extract_pdf_pages_fitz(file_path)
        except Exception as e:
            print(f"  fitz failed ({e}), trying pdfplumber…")

        # 2. pdfplumber
        if not pages:
            try:
                pages = _extract_pdf_pages_pdfplumber(file_path)
            except Exception as e:
                print(f"  pdfplumber failed ({e}), trying PyPDF2…")

        # 3. PyPDF2 last resort
        if not pages:
            pages = _extract_pdf_pages_pypdf2(file_path)

        if not pages:
            raise RuntimeError(f"Could not extract any text from: {file_path}")

        # Remove noise headers/footers, then clean each page.
        pages = _remove_headers_footers(pages)
        # BUGFIX: clean each page exactly once — the previous comprehension
        # called _fix_text(text) twice per page (once in the filter, once in
        # the tuple), doubling the regex work.
        cleaned_pages: List[Tuple[int, str]] = []
        for page_num, text in pages:
            cleaned = _fix_text(text)
            if cleaned:
                cleaned_pages.append((page_num, cleaned))
        return cleaned_pages

    # ── Plain text / DOCX loading ─────────────────────────────────────────────

    def _load_txt(self, file_path: str) -> str:
        """Read a UTF-8 text file (undecodable bytes replaced) and clean it."""
        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
            return _fix_text(f.read())

    def _load_docx(self, file_path: str) -> str:
        """Extract non-empty paragraphs from a DOCX and clean the joined text."""
        from docx import Document as DocxDoc
        doc = DocxDoc(file_path)
        paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
        return _fix_text("\n\n".join(paragraphs))

    # ── Section/passage extraction ────────────────────────────────────────────

    def _pages_to_passages(
        self,
        pages: List[Tuple[int, str]],
    ) -> List[Tuple[str, int, Optional[str]]]:
        """
        Convert (page_num, text) pairs into a flat list of
        (passage_text, page_num, section_title) tuples.

        Detects section headings and tags each passage with the most recent
        heading seen. Paragraphs within a page are exploded into separate
        passages so the chunker can work at fine granularity.
        """
        passages: List[Tuple[str, int, Optional[str]]] = []
        current_section: Optional[str] = None

        for page_num, page_text in pages:
            for para in _split_paragraphs(page_text):
                if not para.strip():
                    continue

                # A short paragraph whose first line looks like a heading is
                # recorded as the section title instead of becoming a passage
                # — it is absorbed as context for the passages that follow.
                first_line = para.splitlines()[0].strip()
                if _detect_heading(first_line) and len(para.strip()) < 120:
                    current_section = para.strip()
                    continue

                passages.append((para, page_num, current_section))

        return passages
473
+