Jay-10020 committed on
Commit
76320c7
·
1 Parent(s): e1d3f9d

RAG test3

Browse files
api/main.py CHANGED
@@ -264,7 +264,11 @@ async def upload_document(
264
  'institution_id': institution_id,
265
  'course_id': course_id
266
  }
267
-
 
 
 
 
268
  chunks = doc_processor.process_document(str(file_path), metadata)
269
 
270
  texts = [chunk.text for chunk in chunks]
 
264
  'institution_id': institution_id,
265
  'course_id': course_id
266
  }
267
+
268
+ # Remove any previously-stored chunks for this file so that
269
+ # re-uploads do not accumulate duplicate vectors.
270
+ vector_store.remove_document_chunks(file.filename)
271
+
272
  chunks = doc_processor.process_document(str(file_path), metadata)
273
 
274
  texts = [chunk.text for chunk in chunks]
vectordb/document_processor.py CHANGED
@@ -65,8 +65,22 @@ _ABBREVS = frozenset({
65
  _LIGATURES = str.maketrans({
66
  "\uFB00": "ff", "\uFB01": "fi", "\uFB02": "fl",
67
  "\uFB03": "ffi", "\uFB04": "ffl", "\uFB05": "st", "\uFB06": "st",
 
 
68
  })
69
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # Heading detection: line is a heading if it matches any of these
71
  _HEADING_RE = re.compile(
72
  r"^\s*("
@@ -79,61 +93,128 @@ _HEADING_RE = re.compile(
79
  re.MULTILINE,
80
  )
81
 
 
 
 
 
 
 
 
82
 
83
  def _fix_text(raw: str) -> str:
84
- """Light cleaning that preserves paragraph structure."""
 
 
 
 
 
 
 
 
 
 
 
85
  text = raw.translate(_LIGATURES)
86
- # Fix soft-hyphen / hard-hyphen line-breaks: "some-\nword" β†’ "someword"
87
- text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
88
- # Replace single lone newlines inside a paragraph with a space
89
- # but preserve real paragraph breaks (2+ newlines stay)
90
- text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
91
- # Collapse runs of spaces (but not newlines)
92
- text = re.sub(r"[ \t]{2,}", " ", text)
93
- # Collapse 3+ blank lines to 2
94
- text = re.sub(r"\n{3,}", "\n\n", text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  return text.strip()
96
 
97
 
98
  def _split_sentences(paragraph: str) -> List[str]:
99
- """Split a paragraph into sentences (abbreviation-aware, two-pass).
 
100
 
101
- First splits on any sentence-ending punctuation before an uppercase
102
- letter/digit, then rejoins splits where the preceding word is a known
103
- abbreviation. This avoids Python re's variable-width lookbehind
104
- restriction which would cause re.error at import time.
 
105
  """
106
- parts = _SENT_SPLIT_RE.split(paragraph.strip())
 
 
 
 
107
  if len(parts) <= 1:
108
- return [paragraph.strip()] if paragraph.strip() else []
109
 
110
- result: List[str] = []
 
111
  current = parts[0]
112
  for part in parts[1:]:
113
- # Check if current segment ends with a known abbreviation word.
114
  m = re.search(r'\b(\w+)\.\s*$', current)
115
  if m and m.group(1).lower() in _ABBREVS:
116
- # Abbreviation β€” rejoin with the next sentence fragment.
117
  current = current.rstrip() + ' ' + part
118
  else:
119
  stripped = current.strip()
120
  if stripped:
121
- result.append(stripped)
122
  current = part
123
  stripped = current.strip()
124
  if stripped:
125
- result.append(stripped)
126
- return result if result else [paragraph.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
 
129
  def _split_paragraphs(text: str) -> List[str]:
130
- """Split cleaned text into paragraphs (blank-line or indent separated)."""
131
- # Split on double newlines (blank lines)
132
- raw_paras = re.split(r"\n{2,}", text)
133
- paras = []
 
134
  for p in raw_paras:
135
  p = p.strip()
136
- if p:
 
 
 
 
 
 
 
 
 
137
  paras.append(p)
138
  return paras
139
 
@@ -150,15 +231,30 @@ def _detect_heading(line: str) -> bool:
150
  def _extract_pdf_pages_fitz(file_path: str) -> List[Tuple[int, str]]:
151
  """
152
  Extract text per page using PyMuPDF (fitz).
 
 
 
 
 
 
153
  Returns [(page_number_1based, text), ...].
154
  """
155
  import fitz # PyMuPDF
156
  pages = []
157
  with fitz.open(file_path) as doc:
158
- for i, page in enumerate(doc, start=1):
159
- text = page.get_text("text") # plain text, respects reading order
160
- if text.strip():
161
- pages.append((i, text))
 
 
 
 
 
 
 
 
 
162
  return pages
163
 
164
 
@@ -238,31 +334,52 @@ def _remove_headers_footers(
238
  # Core chunker
239
  # ──────────────────────────────────────────────────────────────────────────────
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  def _build_chunks(
242
  passages: List[Tuple[str, int, Optional[str]]], # (text, page_num, section_title)
243
  target_size: int = CHUNK_SIZE,
244
  overlap_chars: int = CHUNK_OVERLAP,
245
- min_chunk_size: int = 100,
246
  ) -> List[Dict]:
247
  """
248
- Accumulate sentence-split text into target-sized chunks with char overlap.
249
-
250
- Each passage is split into sentences. Sentences are packed into the current
251
- chunk until the target_size would be exceeded, then the chunk is flushed
252
- and a new one starts, seeded with the last `overlap_chars` characters of
253
- the previous chunk (so context bleeds across chunk boundaries).
254
-
255
- Returns a list of dicts: {text, page_start, page_end, section_title}.
 
 
 
 
 
256
  """
257
  chunks: List[Dict] = []
258
  current_text = ""
259
  current_page_start: Optional[int] = None
260
  current_page_end: Optional[int] = None
261
  current_section: Optional[str] = None
262
- overlap_seed = "" # tail of the last chunk
263
 
264
- def flush():
265
- nonlocal current_text, current_page_start, current_page_end, current_section, overlap_seed
266
  text = current_text.strip()
267
  if len(text) >= min_chunk_size:
268
  chunks.append({
@@ -271,47 +388,121 @@ def _build_chunks(
271
  "page_end": current_page_end,
272
  "section_title": current_section,
273
  })
274
- # Seed next chunk with the last overlap_chars of this chunk
275
- overlap_seed = text[-overlap_chars:] if len(text) > overlap_chars else text
276
  current_text = ""
277
  current_page_start = None
278
  current_page_end = None
279
 
280
- for passage_text, page_num, section_title in passages:
281
- # Update section tracking
282
- if section_title:
283
- current_section = section_title
284
-
285
- sentences = _split_sentences(passage_text)
 
 
 
 
286
 
287
- for sentence in sentences:
288
- sentence = sentence.strip()
289
- if not sentence:
 
 
 
 
 
 
290
  continue
291
 
292
- # Would adding this sentence overflow the target?
293
- projected = len(current_text) + (1 if current_text else 0) + len(sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
  if projected > target_size and current_text:
296
  flush()
297
- # Start new chunk from overlap seed
298
- current_text = overlap_seed + (" " if overlap_seed else "") + sentence
299
- current_page_start = page_num
300
- current_page_end = page_num
301
  else:
302
  if not current_text:
303
- # Fresh chunk β€” include overlap seed first
304
- current_text = (overlap_seed + " " + sentence).strip() if overlap_seed else sentence
305
- current_page_start = page_num
306
  else:
307
- current_text += " " + sentence
 
 
 
 
 
 
 
 
 
 
308
 
309
- if current_page_end is None:
310
- current_page_end = page_num
 
 
 
 
 
 
 
 
 
 
 
311
  else:
312
- current_page_end = max(current_page_end, page_num)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
314
- # Flush the last partial chunk
315
  if current_text.strip():
316
  flush()
317
 
 
65
# Single-pass translation table (used via str.translate) mapping common
# PDF/typographic Unicode characters to plain-ASCII equivalents.
_LIGATURES = str.maketrans({
    # Latin ligatures emitted by many PDF fonts
    "\uFB00": "ff", "\uFB01": "fi", "\uFB02": "fl",
    "\uFB03": "ffi", "\uFB04": "ffl", "\uFB05": "st", "\uFB06": "st",
    # Smart quotes -> straight quotes
    "\u2019": "'", "\u2018": "'", "\u201C": '"', "\u201D": '"',
    # En dash, em dash, bullet, non-breaking space
    "\u2013": "-", "\u2014": " - ", "\u2022": "*", "\u00A0": " ",
})
71
 
72
+ # Detect a question sentence (ends with ? or starts with question words)
73
+ _QUESTION_RE = re.compile(
74
+ r'\?$|^(what|which|who|whom|whose|when|where|why|how|is|are|was|were|'
75
+ r'do|does|did|can|could|will|would|shall|should|may|might|must|has|have|had)\b',
76
+ re.IGNORECASE,
77
+ )
78
+
79
+ # List item starters: bullet, dash, numbered, letter+period
80
+ _LIST_ITEM_RE = re.compile(
81
+ r'^(\s*[\*\-\β€’\–\β€”]\s+|\s*\d{1,3}[.)]\s+|\s*[a-zA-Z][.)]\s+)'
82
+ )
83
+
84
  # Heading detection: line is a heading if it matches any of these
85
  _HEADING_RE = re.compile(
86
  r"^\s*("
 
93
  re.MULTILINE,
94
  )
95
 
96
+ # Detect lines that are just page numbers / artifacts (no real content)
97
+ _NOISE_LINE_RE = re.compile(
98
+ r'^[\s\d\.\-\–\β€”\|]{0,6}$' # whitespace/digits/punctuation only
99
+ r'|^\s*(page|pg\.?)\s*\d+\s*$', # "Page 5" etc.
100
+ re.IGNORECASE,
101
+ )
102
+
103
 
104
def _fix_text(raw: str) -> str:
    """
    Clean raw PDF-extracted text while preserving paragraph structure.

    Steps, in order:
    - translate ligatures / smart quotes / dashes via the _LIGATURES table
    - strip C0 control characters (newline kept; tab converted to a space)
    - join hyphenated line-breaks ("some-\\nword" -> "someword")
    - splice isolated single-character lines back into the text
    - turn single newlines into spaces (2+ newlines = paragraph break, kept)
    - collapse runs of spaces and runs of 3+ newlines
    - drop lines classified as noise by _NOISE_LINE_RE
    """
    # One-pass character translation (ligatures, quotes, dashes, NBSP).
    text = raw.translate(_LIGATURES)

    # Remove ASCII control chars except \n (0x0A) and \t (0x09).
    # NOTE(review): this removes only C0 controls and DEL — it does NOT touch
    # zero-width Unicode chars such as U+200B, despite the old comment.
    text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)

    # Tabs -> spaces (merged with other space runs below).
    text = text.replace('\t', ' ')

    # Fix hard/soft hyphen line-breaks: "some-\nword" -> "someword".
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)

    # Splice lone single-character lines (column-merging artifact) inline.
    # NOTE(review): this matches ANY "\nX\n", not only chars surrounded by
    # blank lines as the old comment claimed; because the trailing \n is
    # consumed, consecutive one-char lines are only merged every other one.
    text = re.sub(r'\n([A-Za-z])\n', r' \1 ', text)

    # A single newline is a soft wrap -> join with a space.
    # Real paragraph breaks (2+ newlines) are preserved.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # Collapse runs of spaces (not newlines).
    text = re.sub(r'[ ]{2,}', ' ', text)

    # Collapse 3+ consecutive newlines down to exactly one blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Drop lines that _NOISE_LINE_RE classifies as noise
    # (page numbers, lone dashes, etc.).
    lines = text.splitlines()
    lines = [ln for ln in lines if not _NOISE_LINE_RE.fullmatch(ln)]
    text = '\n'.join(lines)

    return text.strip()
148
 
149
 
150
def _split_sentences(paragraph: str) -> List[str]:
    """
    Break a paragraph into clean, complete sentences.

    Three passes:
      1. coarse split with _SENT_SPLIT_RE (sentence punctuation followed by
         an uppercase letter / digit — fixed-width lookbehind friendly),
      2. rejoin splits whose preceding word is a known abbreviation,
      3. glue fragments shorter than 15 characters onto the previous
         sentence, unless the fragment looks like a question.
    """
    stripped_para = paragraph.strip()
    if not stripped_para:
        return []

    pieces = _SENT_SPLIT_RE.split(stripped_para)
    if len(pieces) <= 1:
        return [stripped_para]

    # Pass 2: undo splits caused by abbreviations ("Dr.", "e.g.", ...).
    sentences: List[str] = []
    buffer = pieces[0]
    for nxt in pieces[1:]:
        tail_word = re.search(r'\b(\w+)\.\s*$', buffer)
        if tail_word and tail_word.group(1).lower() in _ABBREVS:
            buffer = buffer.rstrip() + ' ' + nxt
        else:
            cleaned = buffer.strip()
            if cleaned:
                sentences.append(cleaned)
            buffer = nxt
    cleaned = buffer.strip()
    if cleaned:
        sentences.append(cleaned)

    if not sentences:
        return [stripped_para]

    # Pass 3: absorb tiny non-question fragments into their predecessor.
    final: List[str] = []
    for sentence in sentences:
        is_tiny = len(sentence) < 15
        looks_like_question = (bool(_QUESTION_RE.search(sentence))
                               or sentence.endswith('?'))
        if final and is_tiny and not looks_like_question:
            final[-1] = final[-1].rstrip() + ' ' + sentence
        else:
            final.append(sentence)

    return final
198
 
199
 
200
  def _split_paragraphs(text: str) -> List[str]:
201
+ """Split cleaned text into paragraphs (blank-line separated).
202
+ Also treats each list item as its own paragraph.
203
+ """
204
+ raw_paras = re.split(r'\n{2,}', text)
205
+ paras: List[str] = []
206
  for p in raw_paras:
207
  p = p.strip()
208
+ if not p:
209
+ continue
210
+ # If the paragraph contains multiple list items, split them individually
211
+ lines = p.splitlines()
212
+ if len(lines) > 1 and all(_LIST_ITEM_RE.match(ln) for ln in lines if ln.strip()):
213
+ for ln in lines:
214
+ ln = ln.strip()
215
+ if ln:
216
+ paras.append(ln)
217
+ else:
218
  paras.append(p)
219
  return paras
220
 
 
231
def _extract_pdf_pages_fitz(file_path: str) -> List[Tuple[int, str]]:
    """
    Extract per-page text using PyMuPDF (fitz) in "blocks" mode.

    Block extraction with sort=True yields layout blocks in reading order,
    so each block maps to a logical paragraph — far fewer mid-word /
    mid-sentence breaks than raw character-stream extraction.

    Returns:
        [(page_number_1based, text), ...] where a page's text is its
        non-empty text blocks joined by blank lines (recoverable later by
        _split_paragraphs). Pages without any text blocks are omitted.
    """
    import fitz  # PyMuPDF

    TEXT_BLOCK = 0  # per PyMuPDF: block_type 0 = text, 1 = image

    extracted: List[Tuple[int, str]] = []
    with fitz.open(file_path) as pdf:
        for number, page in enumerate(pdf, start=1):
            # Each block tuple: (x0, y0, x1, y1, "text", block_no, block_type)
            texts = [
                blk[4].strip()
                for blk in page.get_text("blocks", sort=True)
                if blk[6] == TEXT_BLOCK and blk[4].strip()
            ]
            if texts:
                extracted.append((number, "\n\n".join(texts)))
    return extracted
259
 
260
 
 
334
  # Core chunker
335
  # ──────────────────────────────────────────────────────────────────────────────
336
 
337
+ # Hard upper limit β€” a single chunk is never allowed to exceed this
338
+ _MAX_CHUNK_SIZE = 1400
339
+
340
+ def _overlap_seed(text: str, overlap_chars: int) -> str:
341
+ """
342
+ Return the last `overlap_chars` characters of `text`, but trim to the
343
+ start of the last complete word so we never cut mid-word.
344
+ """
345
+ if len(text) <= overlap_chars:
346
+ return text
347
+ tail = text[-overlap_chars:]
348
+ # Walk forward until we hit a word boundary (space)
349
+ idx = tail.find(' ')
350
+ return tail[idx + 1:] if idx != -1 else tail
351
+
352
+
353
def _build_chunks(
    passages: List[Tuple[str, int, Optional[str]]],  # (text, page_num, section_title)
    target_size: int = CHUNK_SIZE,
    overlap_chars: int = CHUNK_OVERLAP,
    min_chunk_size: int = 80,
) -> List[Dict]:
    """
    Paragraph-first chunking with sentence-level overflow handling.

    Strategy (in priority order):
      1. Keep paragraphs whole — paragraphs that fit are accumulated into
         the current chunk until the target size is reached.
      2. Question boundary preference — a substantial question starts (and
         may immediately close) its own chunk so questions are never split.
      3. Sentence-level split — a paragraph longer than target_size is split
         at sentence boundaries; a single sentence longer than
         _MAX_CHUNK_SIZE is cut at the last word boundary.
      4. Overlap — a word-aligned tail of the previous chunk (see
         _overlap_seed) is prepended to the next chunk for context.
      5. Min filter — chunks shorter than min_chunk_size are discarded
         (stray headings, lone numbers, etc.).

    Bug fixes vs. the previous revision:
      * add_sentence_chunks assigned current_text / current_page_start /
        current_page_end without declaring them nonlocal, so the very first
        sentence raised UnboundLocalError.
      * the question branch ended with `return`, silently dropping every
        remaining sentence of the passage; it now `continue`s.

    Returns:
        List of dicts: {text, page_start, page_end, section_title}.
    """
    chunks: List[Dict] = []
    current_text = ""
    current_page_start: Optional[int] = None
    current_page_end: Optional[int] = None
    current_section: Optional[str] = None
    seed = ""  # overlap carried into the next chunk

    def flush(force_seed: str = ""):
        """Emit the current chunk (if big enough) and reset the accumulator."""
        nonlocal current_text, current_page_start, current_page_end, seed
        text = current_text.strip()
        if len(text) >= min_chunk_size:
            chunks.append({
                "text": text,
                "page_start": current_page_start,
                "page_end": current_page_end,
                "section_title": current_section,
            })
            seed = force_seed if force_seed else _overlap_seed(text, overlap_chars)
        current_text = ""
        current_page_start = None
        current_page_end = None

    def _append_to_current(text_piece: str, page_num: int):
        """Append a piece to the accumulator and widen the page range."""
        nonlocal current_text, current_page_start, current_page_end
        sep = " " if current_text and not current_text.endswith('\n') else ""
        current_text += sep + text_piece
        if current_page_start is None:
            current_page_start = page_num
        if current_page_end is None:
            current_page_end = page_num
        else:
            current_page_end = max(current_page_end, page_num)

    def add_sentence_chunks(sentences: List[str], page_num: int):
        """
        Pack a list of sentences into chunks, respecting target/max sizes.
        Substantial questions get (and may immediately close) their own chunk.
        """
        # FIX: current_text / current_page_start / current_page_end were
        # missing from this declaration; the assignments below then created
        # function-locals and the first `if current_text` read raised
        # UnboundLocalError.
        nonlocal current_text, current_page_start, current_page_end, seed
        for sent in sentences:
            sent = sent.strip()
            if not sent:
                continue

            is_question = bool(_QUESTION_RE.search(sent)) or sent.endswith('?')

            # Oversized sentence: cut at the last word boundary before MAX.
            while len(sent) > _MAX_CHUNK_SIZE:
                cut = sent.rfind(' ', 0, _MAX_CHUNK_SIZE)
                if cut == -1:
                    cut = _MAX_CHUNK_SIZE
                piece = sent[:cut].strip()
                sent = sent[cut:].strip()
                if current_text:
                    flush()
                    current_text = (seed + " " + piece).strip() if seed else piece
                    current_page_start = current_page_end = page_num
                else:
                    current_text = (seed + " " + piece).strip() if seed else piece
                    current_page_start = current_page_end = page_num
                    flush()

            projected = len(current_text) + (1 if current_text else 0) + len(sent)

            # Substantial question -> fresh chunk, flushed at once if it is
            # already large enough to stand alone.
            if is_question and len(sent) >= 30 and current_text:
                flush()
                current_text = (seed + " " + sent).strip() if seed else sent
                current_page_start = current_page_end = page_num
                if len(sent) >= min_chunk_size:
                    flush()
                # FIX: was `return`, which dropped all remaining sentences.
                continue

            if projected > target_size and current_text:
                flush()
                current_text = (seed + " " + sent).strip() if seed else sent
                current_page_start = current_page_end = page_num
            elif not current_text:
                current_text = (seed + " " + sent).strip() if seed else sent
                current_page_start = current_page_end = page_num
            else:
                _append_to_current(sent, page_num)

    for passage_text, page_num, section_title in passages:
        # Section tracking
        if section_title:
            current_section = section_title

        para = passage_text.strip()
        if not para:
            continue

        if len(para) <= target_size:
            # Small paragraph: accumulate as-is (don't sentence-split yet).
            projected = len(current_text) + (2 if current_text else 0) + len(para)
            if projected > _MAX_CHUNK_SIZE and current_text:
                flush()
                current_text = (seed + " " + para).strip() if seed else para
                current_page_start = current_page_end = page_num
            elif not current_text:
                current_text = (seed + " " + para).strip() if seed else para
                current_page_start = current_page_end = page_num
            else:
                # Clear paragraph separator within the chunk.
                current_text += "\n\n" + para
                current_page_end = max(current_page_end or page_num, page_num)
        else:
            # Large paragraph: sentence-split and accumulate sentence by sentence.
            sentences = _split_sentences(para)
            if len(sentences) <= 1:
                # Can't split — force as its own chunk, truncated at MAX
                # (without cutting mid-word when avoidable).
                if current_text:
                    flush()
                truncated = para[:_MAX_CHUNK_SIZE]
                last_space = truncated.rfind(' ')
                if last_space > min_chunk_size:
                    truncated = truncated[:last_space]
                current_text = (seed + " " + truncated).strip() if seed else truncated
                current_page_start = current_page_end = page_num
                flush()
            else:
                add_sentence_chunks(sentences, page_num)

    # Flush the final partial chunk.
    if current_text.strip():
        flush()

    return chunks
508
 
vectordb/json_store.py CHANGED
@@ -171,6 +171,31 @@ class JSONStore:
171
  }
172
  self._save_data()
173
  print("βœ“ Deleted all documents")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  def get_stats(self) -> Dict:
176
  """Get store statistics"""
 
171
  }
172
  self._save_data()
173
  print("βœ“ Deleted all documents")
174
+
175
+ def remove_document_chunks(self, source_filename: str) -> int:
176
+ """
177
+ Remove all stored chunks that belong to the given file.
178
+
179
+ Matches on two criteria (either is sufficient):
180
+ 1. doc['id'].startswith(source_filename + '_') – ID convention used by /upload
181
+ 2. doc['metadata'].get('source') == source_filename
182
+
183
+ Returns the number of chunks removed.
184
+ """
185
+ prefix = source_filename + '_'
186
+ before = len(self.data['documents'])
187
+ self.data['documents'] = [
188
+ doc for doc in self.data['documents']
189
+ if not (
190
+ doc.get('id', '').startswith(prefix) or
191
+ doc.get('metadata', {}).get('source') == source_filename
192
+ )
193
+ ]
194
+ removed = before - len(self.data['documents'])
195
+ if removed:
196
+ self._save_data()
197
+ print(f"βœ“ Removed {removed} existing chunks for '{source_filename}'")
198
+ return removed
199
 
200
  def get_stats(self) -> Dict:
201
  """Get store statistics"""