Spaces:
Running
Running
RAG test3
Browse files- api/main.py +5 -1
- vectordb/document_processor.py +260 -69
- vectordb/json_store.py +25 -0
api/main.py
CHANGED
|
@@ -264,7 +264,11 @@ async def upload_document(
|
|
| 264 |
'institution_id': institution_id,
|
| 265 |
'course_id': course_id
|
| 266 |
}
|
| 267 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
chunks = doc_processor.process_document(str(file_path), metadata)
|
| 269 |
|
| 270 |
texts = [chunk.text for chunk in chunks]
|
|
|
|
| 264 |
'institution_id': institution_id,
|
| 265 |
'course_id': course_id
|
| 266 |
}
|
| 267 |
+
|
| 268 |
+
# Remove any previously-stored chunks for this file so that
|
| 269 |
+
# re-uploads do not accumulate duplicate vectors.
|
| 270 |
+
vector_store.remove_document_chunks(file.filename)
|
| 271 |
+
|
| 272 |
chunks = doc_processor.process_document(str(file_path), metadata)
|
| 273 |
|
| 274 |
texts = [chunk.text for chunk in chunks]
|
vectordb/document_processor.py
CHANGED
|
@@ -65,8 +65,22 @@ _ABBREVS = frozenset({
|
|
| 65 |
_LIGATURES = str.maketrans({
|
| 66 |
"\uFB00": "ff", "\uFB01": "fi", "\uFB02": "fl",
|
| 67 |
"\uFB03": "ffi", "\uFB04": "ffl", "\uFB05": "st", "\uFB06": "st",
|
|
|
|
|
|
|
| 68 |
})
|
| 69 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
# Heading detection: line is a heading if it matches any of these
|
| 71 |
_HEADING_RE = re.compile(
|
| 72 |
r"^\s*("
|
|
@@ -79,61 +93,128 @@ _HEADING_RE = re.compile(
|
|
| 79 |
re.MULTILINE,
|
| 80 |
)
|
| 81 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
def _fix_text(raw: str) -> str:
|
| 84 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
text = raw.translate(_LIGATURES)
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
#
|
| 94 |
-
text = re.sub(r
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
return text.strip()
|
| 96 |
|
| 97 |
|
| 98 |
def _split_sentences(paragraph: str) -> List[str]:
|
| 99 |
-
"""
|
|
|
|
| 100 |
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
| 105 |
"""
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
if len(parts) <= 1:
|
| 108 |
-
return [
|
| 109 |
|
| 110 |
-
|
|
|
|
| 111 |
current = parts[0]
|
| 112 |
for part in parts[1:]:
|
| 113 |
-
# Check if current segment ends with a known abbreviation word.
|
| 114 |
m = re.search(r'\b(\w+)\.\s*$', current)
|
| 115 |
if m and m.group(1).lower() in _ABBREVS:
|
| 116 |
-
# Abbreviation β rejoin with the next sentence fragment.
|
| 117 |
current = current.rstrip() + ' ' + part
|
| 118 |
else:
|
| 119 |
stripped = current.strip()
|
| 120 |
if stripped:
|
| 121 |
-
|
| 122 |
current = part
|
| 123 |
stripped = current.strip()
|
| 124 |
if stripped:
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
|
| 128 |
|
| 129 |
def _split_paragraphs(text: str) -> List[str]:
|
| 130 |
-
"""Split cleaned text into paragraphs (blank-line
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
|
|
|
| 134 |
for p in raw_paras:
|
| 135 |
p = p.strip()
|
| 136 |
-
if p:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
paras.append(p)
|
| 138 |
return paras
|
| 139 |
|
|
@@ -150,15 +231,30 @@ def _detect_heading(line: str) -> bool:
|
|
| 150 |
def _extract_pdf_pages_fitz(file_path: str) -> List[Tuple[int, str]]:
|
| 151 |
"""
|
| 152 |
Extract text per page using PyMuPDF (fitz).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 153 |
Returns [(page_number_1based, text), ...].
|
| 154 |
"""
|
| 155 |
import fitz # PyMuPDF
|
| 156 |
pages = []
|
| 157 |
with fitz.open(file_path) as doc:
|
| 158 |
-
for
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
return pages
|
| 163 |
|
| 164 |
|
|
@@ -238,31 +334,52 @@ def _remove_headers_footers(
|
|
| 238 |
# Core chunker
|
| 239 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 240 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
def _build_chunks(
|
| 242 |
passages: List[Tuple[str, int, Optional[str]]], # (text, page_num, section_title)
|
| 243 |
target_size: int = CHUNK_SIZE,
|
| 244 |
overlap_chars: int = CHUNK_OVERLAP,
|
| 245 |
-
min_chunk_size: int =
|
| 246 |
) -> List[Dict]:
|
| 247 |
"""
|
| 248 |
-
|
| 249 |
-
|
| 250 |
-
|
| 251 |
-
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
"""
|
| 257 |
chunks: List[Dict] = []
|
| 258 |
current_text = ""
|
| 259 |
current_page_start: Optional[int] = None
|
| 260 |
current_page_end: Optional[int] = None
|
| 261 |
current_section: Optional[str] = None
|
| 262 |
-
|
| 263 |
|
| 264 |
-
def flush():
|
| 265 |
-
nonlocal current_text, current_page_start, current_page_end,
|
| 266 |
text = current_text.strip()
|
| 267 |
if len(text) >= min_chunk_size:
|
| 268 |
chunks.append({
|
|
@@ -271,47 +388,121 @@ def _build_chunks(
|
|
| 271 |
"page_end": current_page_end,
|
| 272 |
"section_title": current_section,
|
| 273 |
})
|
| 274 |
-
|
| 275 |
-
overlap_seed = text[-overlap_chars:] if len(text) > overlap_chars else text
|
| 276 |
current_text = ""
|
| 277 |
current_page_start = None
|
| 278 |
current_page_end = None
|
| 279 |
|
| 280 |
-
|
| 281 |
-
|
| 282 |
-
if
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 290 |
continue
|
| 291 |
|
| 292 |
-
|
| 293 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
if projected > target_size and current_text:
|
| 296 |
flush()
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
current_page_start = page_num
|
| 300 |
-
current_page_end = page_num
|
| 301 |
else:
|
| 302 |
if not current_text:
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
current_page_start = page_num
|
| 306 |
else:
|
| 307 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
|
| 309 |
-
|
| 310 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 311 |
else:
|
| 312 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 313 |
|
| 314 |
-
# Flush the
|
| 315 |
if current_text.strip():
|
| 316 |
flush()
|
| 317 |
|
|
|
|
| 65 |
_LIGATURES = str.maketrans({
|
| 66 |
"\uFB00": "ff", "\uFB01": "fi", "\uFB02": "fl",
|
| 67 |
"\uFB03": "ffi", "\uFB04": "ffl", "\uFB05": "st", "\uFB06": "st",
|
| 68 |
+
"\u2019": "'", "\u2018": "'", "\u201C": '"', "\u201D": '"',
|
| 69 |
+
"\u2013": "-", "\u2014": " - ", "\u2022": "*", "\u00A0": " ",
|
| 70 |
})
|
| 71 |
|
| 72 |
+
# Detect a question sentence (ends with ? or starts with question words)
|
| 73 |
+
_QUESTION_RE = re.compile(
|
| 74 |
+
r'\?$|^(what|which|who|whom|whose|when|where|why|how|is|are|was|were|'
|
| 75 |
+
r'do|does|did|can|could|will|would|shall|should|may|might|must|has|have|had)\b',
|
| 76 |
+
re.IGNORECASE,
|
| 77 |
+
)
|
| 78 |
+
|
| 79 |
+
# List item starters: bullet, dash, numbered, letter+period
|
| 80 |
+
_LIST_ITEM_RE = re.compile(
|
| 81 |
+
r'^(\s*[\*\-\β’\β\β]\s+|\s*\d{1,3}[.)]\s+|\s*[a-zA-Z][.)]\s+)'
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
# Heading detection: line is a heading if it matches any of these
|
| 85 |
_HEADING_RE = re.compile(
|
| 86 |
r"^\s*("
|
|
|
|
| 93 |
re.MULTILINE,
|
| 94 |
)
|
| 95 |
|
| 96 |
+
# Detect lines that are just page numbers / artifacts (no real content)
|
| 97 |
+
_NOISE_LINE_RE = re.compile(
|
| 98 |
+
r'^[\s\d\.\-\β\β\|]{0,6}$' # whitespace/digits/punctuation only
|
| 99 |
+
r'|^\s*(page|pg\.?)\s*\d+\s*$', # "Page 5" etc.
|
| 100 |
+
re.IGNORECASE,
|
| 101 |
+
)
|
| 102 |
+
|
| 103 |
|
| 104 |
def _fix_text(raw: str) -> str:
    """
    Clean text extracted from a PDF while keeping paragraph breaks intact.

    Steps, in order:
      1. Translate ligatures and typographic characters via ``_LIGATURES``.
      2. Drop control characters (newline, tab and carriage return are kept)
         and zero-width characters.
      3. Turn tabs into spaces.
      4. Re-join words hyphenated (hard or soft hyphen) across a line break.
      5. Fold a line that holds a single letter into the surrounding text.
      6. Join single newlines into spaces; runs of 2+ newlines are kept as
         paragraph breaks.
      7. Collapse repeated spaces.
      8. Drop non-blank lines matched by ``_NOISE_LINE_RE``.
      9. Collapse runs of 3+ newlines to exactly one blank line.

    Args:
        raw: Text as extracted from the document.

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # Translate known ligatures and typographic chars
    text = raw.translate(_LIGATURES)

    # Remove control chars (except newline, tab, CR) and zero-width chars
    text = re.sub(
        r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\u200B-\u200D\u2060\uFEFF]', '', text
    )

    # Tabs -> spaces
    text = text.replace('\t', ' ')

    # Fix hard/soft hyphen line-breaks: "some-<newline>word" -> "someword"
    text = re.sub(r'(\w)[-\u00AD]\n(\w)', r'\1\2', text)

    # A line consisting of one letter (column-merging artifact) is folded
    # into the surrounding text instead of standing on its own line.
    text = re.sub(r'\n([A-Za-z])\n', r' \1 ', text)

    # A single newline that is NOT a paragraph break: join as a space
    # (paragraph breaks = 2+ newlines, keep those)
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # Collapse runs of spaces (not newlines)
    text = re.sub(r'[ ]{2,}', ' ', text)

    # Remove lines that are pure noise (page numbers, lone dashes, etc.).
    # Blank lines must survive this filter: the noise pattern also matches
    # the empty string, and dropping blank lines would erase every
    # paragraph break in the text.
    kept: List[str] = []
    for ln in text.splitlines():
        if not ln.strip():
            kept.append('')
        elif not _NOISE_LINE_RE.fullmatch(ln):
            kept.append(ln)
    text = '\n'.join(kept)

    # Collapse 3+ newlines to exactly 2 (done last so that gaps left by
    # removed noise lines are normalised too)
    text = re.sub(r'\n{3,}', '\n\n', text)

    return text.strip()
|
| 148 |
|
| 149 |
|
| 150 |
def _split_sentences(paragraph: str) -> List[str]:
    """
    Break a paragraph into sentences.

    Processing:
      1. Split the stripped paragraph with ``_SENT_SPLIT_RE``.
      2. Undo any split whose left-hand side ends in a word listed in
         ``_ABBREVS`` followed by a period.
      3. Append fragments shorter than 15 characters to the *previous*
         sentence, unless the fragment looks like a question
         (``_QUESTION_RE`` matches or it ends with '?').

    Returns an empty list for blank input, and ``[paragraph.strip()]`` when
    no split point is found.
    """
    body = paragraph.strip()
    if not body:
        return []

    pieces = _SENT_SPLIT_RE.split(body)
    if len(pieces) < 2:
        return [body]

    # Pass 1: glue pieces back together across abbreviation periods.
    sentences: List[str] = []
    pending = pieces[0]
    for nxt in pieces[1:]:
        tail_word = re.search(r'\b(\w+)\.\s*$', pending)
        if tail_word is not None and tail_word.group(1).lower() in _ABBREVS:
            pending = pending.rstrip() + ' ' + nxt
            continue
        cleaned = pending.strip()
        if cleaned:
            sentences.append(cleaned)
        pending = nxt

    cleaned = pending.strip()
    if cleaned:
        sentences.append(cleaned)

    if not sentences:
        return [body]

    # Pass 2: fold tiny non-question fragments into the sentence before them.
    out: List[str] = []
    for s in sentences:
        is_fragment = len(s) < 15 and not (_QUESTION_RE.search(s) or s.endswith('?'))
        if out and is_fragment:
            out[-1] = out[-1].rstrip() + ' ' + s
        else:
            out.append(s)

    return out
|
| 198 |
|
| 199 |
|
| 200 |
def _split_paragraphs(text: str) -> List[str]:
    """Return the paragraphs of ``text``, using blank lines as separators.

    A multi-line paragraph whose non-blank lines all match ``_LIST_ITEM_RE``
    is expanded so that each list item becomes a paragraph of its own.
    Empty paragraphs are dropped.
    """
    paragraphs: List[str] = []
    for block in re.split(r'\n{2,}', text):
        block = block.strip()
        if not block:
            continue

        rows = block.splitlines()
        non_blank = [row for row in rows if row.strip()]
        is_list = len(rows) > 1 and all(_LIST_ITEM_RE.match(row) for row in non_blank)

        if is_list:
            # One paragraph per list item
            paragraphs.extend(row.strip() for row in non_blank)
        else:
            paragraphs.append(block)
    return paragraphs
|
| 220 |
|
|
|
|
| 231 |
def _extract_pdf_pages_fitz(file_path: str) -> List[Tuple[int, str]]:
    """
    Read a PDF with PyMuPDF (fitz) and return its text page by page.

    Each page is read with ``get_text("blocks", sort=True)``. Entries whose
    block type (index 6) is not 0 are skipped, as are blocks whose text is
    empty after stripping. The remaining block texts are joined with a blank
    line between them so that each block can later be treated as a paragraph.

    Pages that yield no text are omitted from the result.

    Returns [(page_number_1based, text), ...].
    """
    import fitz  # PyMuPDF

    pages: List[Tuple[int, str]] = []
    with fitz.open(file_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            # Entry layout: (x0, y0, x1, y1, "text", block_no, block_type)
            blocks = page.get_text("blocks", sort=True)
            stripped_texts = (blk[4].strip() for blk in blocks if blk[6] == 0)
            paragraphs = [t for t in stripped_texts if t]
            if not paragraphs:
                continue
            # Blank-line separator keeps each block a separate paragraph
            pages.append((page_num, "\n\n".join(paragraphs)))
    return pages
|
| 259 |
|
| 260 |
|
|
|
|
| 334 |
# Core chunker
|
| 335 |
# ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 336 |
|
| 337 |
+
# Hard upper limit β a single chunk is never allowed to exceed this
|
| 338 |
+
_MAX_CHUNK_SIZE = 1400
|
| 339 |
+
|
| 340 |
+
def _overlap_seed(text: str, overlap_chars: int) -> str:
|
| 341 |
+
"""
|
| 342 |
+
Return the last `overlap_chars` characters of `text`, but trim to the
|
| 343 |
+
start of the last complete word so we never cut mid-word.
|
| 344 |
+
"""
|
| 345 |
+
if len(text) <= overlap_chars:
|
| 346 |
+
return text
|
| 347 |
+
tail = text[-overlap_chars:]
|
| 348 |
+
# Walk forward until we hit a word boundary (space)
|
| 349 |
+
idx = tail.find(' ')
|
| 350 |
+
return tail[idx + 1:] if idx != -1 else tail
|
| 351 |
+
|
| 352 |
+
|
| 353 |
def _build_chunks(
|
| 354 |
passages: List[Tuple[str, int, Optional[str]]], # (text, page_num, section_title)
|
| 355 |
target_size: int = CHUNK_SIZE,
|
| 356 |
overlap_chars: int = CHUNK_OVERLAP,
|
| 357 |
+
min_chunk_size: int = 80,
|
| 358 |
) -> List[Dict]:
|
| 359 |
"""
|
| 360 |
+
Paragraph-first chunking with sentence-level overflow handling.
|
| 361 |
+
|
| 362 |
+
Strategy (in priority order):
|
| 363 |
+
1. **Keep paragraphs whole** β if a paragraph fits in [min_chunk_size, MAX],
|
| 364 |
+
accumulate paragraphs into the current chunk until target is reached.
|
| 365 |
+
2. **Question boundary preference** β when flushing, prefer to end on a
|
| 366 |
+
sentence that ends with '?' so questions are never split.
|
| 367 |
+
3. **Sentence-level split** β if a single paragraph exceeds MAX_CHUNK_SIZE,
|
| 368 |
+
split it at sentence boundaries (never mid-word, never mid-sentence).
|
| 369 |
+
4. **Overlap** β the last 1β2 sentences of the previous chunk are prepended
|
| 370 |
+
to the next so the LLM has context across boundaries.
|
| 371 |
+
5. **Min filter** β discard chunks shorter than min_chunk_size (stray
|
| 372 |
+
headings, lone numbers, etc.).
|
| 373 |
"""
|
| 374 |
chunks: List[Dict] = []
|
| 375 |
current_text = ""
|
| 376 |
current_page_start: Optional[int] = None
|
| 377 |
current_page_end: Optional[int] = None
|
| 378 |
current_section: Optional[str] = None
|
| 379 |
+
seed = "" # overlap carried into the next chunk
|
| 380 |
|
| 381 |
+
def flush(force_seed: str = ""):
|
| 382 |
+
nonlocal current_text, current_page_start, current_page_end, seed
|
| 383 |
text = current_text.strip()
|
| 384 |
if len(text) >= min_chunk_size:
|
| 385 |
chunks.append({
|
|
|
|
| 388 |
"page_end": current_page_end,
|
| 389 |
"section_title": current_section,
|
| 390 |
})
|
| 391 |
+
seed = force_seed if force_seed else _overlap_seed(text, overlap_chars)
|
|
|
|
| 392 |
current_text = ""
|
| 393 |
current_page_start = None
|
| 394 |
current_page_end = None
|
| 395 |
|
| 396 |
+
def _append_to_current(text_piece: str, page_num: int):
|
| 397 |
+
nonlocal current_text, current_page_start, current_page_end
|
| 398 |
+
sep = " " if current_text and not current_text.endswith('\n') else ""
|
| 399 |
+
current_text += sep + text_piece
|
| 400 |
+
if current_page_start is None:
|
| 401 |
+
current_page_start = page_num
|
| 402 |
+
if current_page_end is None:
|
| 403 |
+
current_page_end = page_num
|
| 404 |
+
else:
|
| 405 |
+
current_page_end = max(current_page_end, page_num)
|
| 406 |
|
| 407 |
+
def add_sentence_chunks(sentences: List[str], page_num: int):
|
| 408 |
+
"""
|
| 409 |
+
Split a list of sentences into chunks, respecting target/max sizes.
|
| 410 |
+
Questions are always kept in their own chunk if long enough.
|
| 411 |
+
"""
|
| 412 |
+
nonlocal seed
|
| 413 |
+
for sent in sentences:
|
| 414 |
+
sent = sent.strip()
|
| 415 |
+
if not sent:
|
| 416 |
continue
|
| 417 |
|
| 418 |
+
is_question = bool(_QUESTION_RE.search(sent)) or sent.endswith('?')
|
| 419 |
+
|
| 420 |
+
# If the sentence itself exceeds MAX, split at the last word boundary
|
| 421 |
+
while len(sent) > _MAX_CHUNK_SIZE:
|
| 422 |
+
cut = sent.rfind(' ', 0, _MAX_CHUNK_SIZE)
|
| 423 |
+
if cut == -1:
|
| 424 |
+
cut = _MAX_CHUNK_SIZE
|
| 425 |
+
piece = sent[:cut].strip()
|
| 426 |
+
sent = sent[cut:].strip()
|
| 427 |
+
if current_text:
|
| 428 |
+
flush()
|
| 429 |
+
current_text = (seed + " " + piece).strip() if seed else piece
|
| 430 |
+
current_page_start = current_page_end = page_num
|
| 431 |
+
else:
|
| 432 |
+
current_text = (seed + " " + piece).strip() if seed else piece
|
| 433 |
+
current_page_start = current_page_end = page_num
|
| 434 |
+
flush()
|
| 435 |
+
|
| 436 |
+
projected = len(current_text) + (1 if current_text else 0) + len(sent)
|
| 437 |
+
|
| 438 |
+
# Questions start a fresh chunk when they're substantial enough
|
| 439 |
+
if is_question and len(sent) >= 30 and current_text:
|
| 440 |
+
flush()
|
| 441 |
+
current_text = (seed + " " + sent).strip() if seed else sent
|
| 442 |
+
current_page_start = current_page_end = page_num
|
| 443 |
+
# If this question fits alone as a good chunk, flush it immediately
|
| 444 |
+
if len(sent) >= min_chunk_size:
|
| 445 |
+
flush()
|
| 446 |
+
return
|
| 447 |
|
| 448 |
if projected > target_size and current_text:
|
| 449 |
flush()
|
| 450 |
+
current_text = (seed + " " + sent).strip() if seed else sent
|
| 451 |
+
current_page_start = current_page_end = page_num
|
|
|
|
|
|
|
| 452 |
else:
|
| 453 |
if not current_text:
|
| 454 |
+
current_text = (seed + " " + sent).strip() if seed else sent
|
| 455 |
+
current_page_start = current_page_end = page_num
|
|
|
|
| 456 |
else:
|
| 457 |
+
_append_to_current(sent, page_num)
|
| 458 |
+
|
| 459 |
+
for passage_text, page_num, section_title in passages:
|
| 460 |
+
# Section tracking
|
| 461 |
+
if section_title:
|
| 462 |
+
current_section = section_title
|
| 463 |
+
|
| 464 |
+
# ββ Paragraph-first ββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 465 |
+
para = passage_text.strip()
|
| 466 |
+
if not para:
|
| 467 |
+
continue
|
| 468 |
|
| 469 |
+
para_len = len(para)
|
| 470 |
+
|
| 471 |
+
# Small paragraph: accumulate as-is (don't sentence-split yet)
|
| 472 |
+
if para_len <= target_size:
|
| 473 |
+
projected = len(current_text) + (2 if current_text else 0) + para_len
|
| 474 |
+
if projected > _MAX_CHUNK_SIZE and current_text:
|
| 475 |
+
flush()
|
| 476 |
+
current_text = (seed + " " + para).strip() if seed else para
|
| 477 |
+
current_page_start = current_page_end = page_num
|
| 478 |
+
else:
|
| 479 |
+
if not current_text:
|
| 480 |
+
current_text = (seed + " " + para).strip() if seed else para
|
| 481 |
+
current_page_start = current_page_end = page_num
|
| 482 |
else:
|
| 483 |
+
# Add a clear paragraph separator within the chunk
|
| 484 |
+
current_text += "\n\n" + para
|
| 485 |
+
current_page_end = max(current_page_end or page_num, page_num)
|
| 486 |
+
|
| 487 |
+
# Large paragraph: sentence-split and accumulate sentence by sentence
|
| 488 |
+
else:
|
| 489 |
+
sentences = _split_sentences(para)
|
| 490 |
+
if len(sentences) <= 1:
|
| 491 |
+
# Can't split β force as its own chunk, truncating only at MAX
|
| 492 |
+
if current_text:
|
| 493 |
+
flush()
|
| 494 |
+
truncated = para[:_MAX_CHUNK_SIZE]
|
| 495 |
+
# Don't cut mid-word
|
| 496 |
+
last_space = truncated.rfind(' ')
|
| 497 |
+
if last_space > min_chunk_size:
|
| 498 |
+
truncated = truncated[:last_space]
|
| 499 |
+
current_text = (seed + " " + truncated).strip() if seed else truncated
|
| 500 |
+
current_page_start = current_page_end = page_num
|
| 501 |
+
flush()
|
| 502 |
+
else:
|
| 503 |
+
add_sentence_chunks(sentences, page_num)
|
| 504 |
|
| 505 |
+
# Flush the final partial chunk
|
| 506 |
if current_text.strip():
|
| 507 |
flush()
|
| 508 |
|
vectordb/json_store.py
CHANGED
|
@@ -171,6 +171,31 @@ class JSONStore:
|
|
| 171 |
}
|
| 172 |
self._save_data()
|
| 173 |
print("β Deleted all documents")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
|
| 175 |
def get_stats(self) -> Dict:
|
| 176 |
"""Get store statistics"""
|
|
|
|
| 171 |
}
|
| 172 |
self._save_data()
|
| 173 |
print("β Deleted all documents")
|
| 174 |
+
|
| 175 |
+
def remove_document_chunks(self, source_filename: str) -> int:
    """
    Remove all stored chunks that belong to the given file.

    A document is removed when either of these holds:
      1. its 'id' starts with ``source_filename + '_'``
      2. its metadata 'source' equals ``source_filename``

    Documents with a missing or ``None`` 'id' / 'metadata' are handled
    without raising. The store is saved only when something was removed.

    Args:
        source_filename: Name of the file whose chunks should be dropped.

    Returns:
        The number of chunks removed.
    """
    id_prefix = source_filename + '_'

    def belongs_to_file(doc) -> bool:
        # ``or`` fallbacks cover keys that are present but set to None,
        # which dict.get's default argument does not catch.
        doc_id = str(doc.get('id') or '')
        metadata = doc.get('metadata') or {}
        return (
            doc_id.startswith(id_prefix)
            or metadata.get('source') == source_filename
        )

    before = len(self.data['documents'])
    self.data['documents'] = [
        doc for doc in self.data['documents'] if not belongs_to_file(doc)
    ]
    removed = before - len(self.data['documents'])
    if removed:
        self._save_data()
        print(f"β Removed {removed} existing chunks for '{source_filename}'")
    return removed
|
| 199 |
|
| 200 |
def get_stats(self) -> Dict:
|
| 201 |
"""Get store statistics"""
|