Jay-10020 committed on
Commit
76320c7
·
1 Parent(s): e1d3f9d

RAG test3

Browse files
api/main.py CHANGED
@@ -264,7 +264,11 @@ async def upload_document(
264
  'institution_id': institution_id,
265
  'course_id': course_id
266
  }
267
-
 
 
 
 
268
  chunks = doc_processor.process_document(str(file_path), metadata)
269
 
270
  texts = [chunk.text for chunk in chunks]
 
264
  'institution_id': institution_id,
265
  'course_id': course_id
266
  }
267
+
268
+ # Remove any previously-stored chunks for this file so that
269
+ # re-uploads do not accumulate duplicate vectors.
270
+ vector_store.remove_document_chunks(file.filename)
271
+
272
  chunks = doc_processor.process_document(str(file_path), metadata)
273
 
274
  texts = [chunk.text for chunk in chunks]
vectordb/document_processor.py CHANGED
@@ -65,8 +65,22 @@ _ABBREVS = frozenset({
65
  _LIGATURES = str.maketrans({
66
  "\uFB00": "ff", "\uFB01": "fi", "\uFB02": "fl",
67
  "\uFB03": "ffi", "\uFB04": "ffl", "\uFB05": "st", "\uFB06": "st",
 
 
68
  })
69
 
 
 
 
 
 
 
 
 
 
 
 
 
70
  # Heading detection: line is a heading if it matches any of these
71
  _HEADING_RE = re.compile(
72
  r"^\s*("
@@ -79,61 +93,128 @@ _HEADING_RE = re.compile(
79
  re.MULTILINE,
80
  )
81
 
 
 
 
 
 
 
 
82
 
83
  def _fix_text(raw: str) -> str:
84
- """Light cleaning that preserves paragraph structure."""
 
 
 
 
 
 
 
 
 
 
 
85
  text = raw.translate(_LIGATURES)
86
- # Fix soft-hyphen / hard-hyphen line-breaks: "some-\nword" β†’ "someword"
87
- text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
88
- # Replace single lone newlines inside a paragraph with a space
89
- # but preserve real paragraph breaks (2+ newlines stay)
90
- text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
91
- # Collapse runs of spaces (but not newlines)
92
- text = re.sub(r"[ \t]{2,}", " ", text)
93
- # Collapse 3+ blank lines to 2
94
- text = re.sub(r"\n{3,}", "\n\n", text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
95
  return text.strip()
96
 
97
 
98
  def _split_sentences(paragraph: str) -> List[str]:
99
- """Split a paragraph into sentences (abbreviation-aware, two-pass).
 
100
 
101
- First splits on any sentence-ending punctuation before an uppercase
102
- letter/digit, then rejoins splits where the preceding word is a known
103
- abbreviation. This avoids Python re's variable-width lookbehind
104
- restriction which would cause re.error at import time.
 
105
  """
106
- parts = _SENT_SPLIT_RE.split(paragraph.strip())
 
 
 
 
107
  if len(parts) <= 1:
108
- return [paragraph.strip()] if paragraph.strip() else []
109
 
110
- result: List[str] = []
 
111
  current = parts[0]
112
  for part in parts[1:]:
113
- # Check if current segment ends with a known abbreviation word.
114
  m = re.search(r'\b(\w+)\.\s*$', current)
115
  if m and m.group(1).lower() in _ABBREVS:
116
- # Abbreviation β€” rejoin with the next sentence fragment.
117
  current = current.rstrip() + ' ' + part
118
  else:
119
  stripped = current.strip()
120
  if stripped:
121
- result.append(stripped)
122
  current = part
123
  stripped = current.strip()
124
  if stripped:
125
- result.append(stripped)
126
- return result if result else [paragraph.strip()]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
 
128
 
129
  def _split_paragraphs(text: str) -> List[str]:
130
- """Split cleaned text into paragraphs (blank-line or indent separated)."""
131
- # Split on double newlines (blank lines)
132
- raw_paras = re.split(r"\n{2,}", text)
133
- paras = []
 
134
  for p in raw_paras:
135
  p = p.strip()
136
- if p:
 
 
 
 
 
 
 
 
 
137
  paras.append(p)
138
  return paras
139
 
@@ -150,15 +231,30 @@ def _detect_heading(line: str) -> bool:
150
  def _extract_pdf_pages_fitz(file_path: str) -> List[Tuple[int, str]]:
151
  """
152
  Extract text per page using PyMuPDF (fitz).
 
 
 
 
 
 
153
  Returns [(page_number_1based, text), ...].
154
  """
155
  import fitz # PyMuPDF
156
  pages = []
157
  with fitz.open(file_path) as doc:
158
- for i, page in enumerate(doc, start=1):
159
- text = page.get_text("text") # plain text, respects reading order
160
- if text.strip():
161
- pages.append((i, text))
 
 
 
 
 
 
 
 
 
162
  return pages
163
 
164
 
@@ -238,31 +334,52 @@ def _remove_headers_footers(
238
  # Core chunker
239
  # ──────────────────────────────────────────────────────────────────────────────
240
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
241
  def _build_chunks(
242
  passages: List[Tuple[str, int, Optional[str]]], # (text, page_num, section_title)
243
  target_size: int = CHUNK_SIZE,
244
  overlap_chars: int = CHUNK_OVERLAP,
245
- min_chunk_size: int = 100,
246
  ) -> List[Dict]:
247
  """
248
- Accumulate sentence-split text into target-sized chunks with char overlap.
249
-
250
- Each passage is split into sentences. Sentences are packed into the current
251
- chunk until the target_size would be exceeded, then the chunk is flushed
252
- and a new one starts, seeded with the last `overlap_chars` characters of
253
- the previous chunk (so context bleeds across chunk boundaries).
254
-
255
- Returns a list of dicts: {text, page_start, page_end, section_title}.
 
 
 
 
 
256
  """
257
  chunks: List[Dict] = []
258
  current_text = ""
259
  current_page_start: Optional[int] = None
260
  current_page_end: Optional[int] = None
261
  current_section: Optional[str] = None
262
- overlap_seed = "" # tail of the last chunk
263
 
264
- def flush():
265
- nonlocal current_text, current_page_start, current_page_end, current_section, overlap_seed
266
  text = current_text.strip()
267
  if len(text) >= min_chunk_size:
268
  chunks.append({
@@ -271,47 +388,121 @@ def _build_chunks(
271
  "page_end": current_page_end,
272
  "section_title": current_section,
273
  })
274
- # Seed next chunk with the last overlap_chars of this chunk
275
- overlap_seed = text[-overlap_chars:] if len(text) > overlap_chars else text
276
  current_text = ""
277
  current_page_start = None
278
  current_page_end = None
279
 
280
- for passage_text, page_num, section_title in passages:
281
- # Update section tracking
282
- if section_title:
283
- current_section = section_title
284
-
285
- sentences = _split_sentences(passage_text)
 
 
 
 
286
 
287
- for sentence in sentences:
288
- sentence = sentence.strip()
289
- if not sentence:
 
 
 
 
 
 
290
  continue
291
 
292
- # Would adding this sentence overflow the target?
293
- projected = len(current_text) + (1 if current_text else 0) + len(sentence)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
294
 
295
  if projected > target_size and current_text:
296
  flush()
297
- # Start new chunk from overlap seed
298
- current_text = overlap_seed + (" " if overlap_seed else "") + sentence
299
- current_page_start = page_num
300
- current_page_end = page_num
301
  else:
302
  if not current_text:
303
- # Fresh chunk β€” include overlap seed first
304
- current_text = (overlap_seed + " " + sentence).strip() if overlap_seed else sentence
305
- current_page_start = page_num
306
  else:
307
- current_text += " " + sentence
 
 
 
 
 
 
 
 
 
 
308
 
309
- if current_page_end is None:
310
- current_page_end = page_num
 
 
 
 
 
 
 
 
 
 
 
311
  else:
312
- current_page_end = max(current_page_end, page_num)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
 
314
- # Flush the last partial chunk
315
  if current_text.strip():
316
  flush()
317
 
 
65
# Single-pass translation table (used via str.translate) mapping common
# PDF/typographic Unicode characters to plain-ASCII equivalents.
_LIGATURES = str.maketrans({
    # Latin ligatures emitted by many PDF fonts
    "\uFB00": "ff", "\uFB01": "fi", "\uFB02": "fl",
    "\uFB03": "ffi", "\uFB04": "ffl", "\uFB05": "st", "\uFB06": "st",
    # Smart quotes -> straight quotes
    "\u2019": "'", "\u2018": "'", "\u201C": '"', "\u201D": '"',
    # En dash, em dash, bullet, non-breaking space
    "\u2013": "-", "\u2014": " - ", "\u2022": "*", "\u00A0": " ",
})
71
 
72
+ # Detect a question sentence (ends with ? or starts with question words)
73
+ _QUESTION_RE = re.compile(
74
+ r'\?$|^(what|which|who|whom|whose|when|where|why|how|is|are|was|were|'
75
+ r'do|does|did|can|could|will|would|shall|should|may|might|must|has|have|had)\b',
76
+ re.IGNORECASE,
77
+ )
78
+
79
+ # List item starters: bullet, dash, numbered, letter+period
80
+ _LIST_ITEM_RE = re.compile(
81
+ r'^(\s*[\*\-\β€’\–\β€”]\s+|\s*\d{1,3}[.)]\s+|\s*[a-zA-Z][.)]\s+)'
82
+ )
83
+
84
  # Heading detection: line is a heading if it matches any of these
85
  _HEADING_RE = re.compile(
86
  r"^\s*("
 
93
  re.MULTILINE,
94
  )
95
 
96
+ # Detect lines that are just page numbers / artifacts (no real content)
97
+ _NOISE_LINE_RE = re.compile(
98
+ r'^[\s\d\.\-\–\β€”\|]{0,6}$' # whitespace/digits/punctuation only
99
+ r'|^\s*(page|pg\.?)\s*\d+\s*$', # "Page 5" etc.
100
+ re.IGNORECASE,
101
+ )
102
+
103
 
104
def _fix_text(raw: str) -> str:
    """
    Clean raw PDF-extracted text while preserving paragraph structure.

    Steps, in order:
    - translate ligatures / smart quotes / dashes via the _LIGATURES table
    - strip C0 control characters (newline kept; tab converted to a space)
    - join hyphenated line-breaks ("some-\\nword" -> "someword")
    - splice isolated single-character lines back into the text
    - turn single newlines into spaces (2+ newlines = paragraph break, kept)
    - collapse runs of spaces and runs of 3+ newlines
    - drop lines classified as noise by _NOISE_LINE_RE
    """
    # One-pass character translation (ligatures, quotes, dashes, NBSP).
    text = raw.translate(_LIGATURES)

    # Remove ASCII control chars except \n (0x0A) and \t (0x09).
    # NOTE(review): this removes only C0 controls and DEL — it does NOT touch
    # zero-width Unicode chars such as U+200B, despite the old comment.
    text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]', '', text)

    # Tabs -> spaces (merged with other space runs below).
    text = text.replace('\t', ' ')

    # Fix hard/soft hyphen line-breaks: "some-\nword" -> "someword".
    text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)

    # Splice lone single-character lines (column-merging artifact) inline.
    # NOTE(review): this matches ANY "\nX\n", not only chars surrounded by
    # blank lines as the old comment claimed; because the trailing \n is
    # consumed, consecutive one-char lines are only merged every other one.
    text = re.sub(r'\n([A-Za-z])\n', r' \1 ', text)

    # A single newline is a soft wrap -> join with a space.
    # Real paragraph breaks (2+ newlines) are preserved.
    text = re.sub(r'(?<!\n)\n(?!\n)', ' ', text)

    # Collapse runs of spaces (not newlines).
    text = re.sub(r'[ ]{2,}', ' ', text)

    # Collapse 3+ consecutive newlines down to exactly one blank line.
    text = re.sub(r'\n{3,}', '\n\n', text)

    # Drop lines that _NOISE_LINE_RE classifies as noise
    # (page numbers, lone dashes, etc.).
    lines = text.splitlines()
    lines = [ln for ln in lines if not _NOISE_LINE_RE.fullmatch(ln)]
    text = '\n'.join(lines)

    return text.strip()
148
 
149
 
150
def _split_sentences(paragraph: str) -> List[str]:
    """
    Break a paragraph into clean, complete sentences.

    Three passes:
      1. coarse split with _SENT_SPLIT_RE (sentence punctuation followed by
         an uppercase letter / digit — fixed-width lookbehind friendly),
      2. rejoin splits whose preceding word is a known abbreviation,
      3. glue fragments shorter than 15 characters onto the previous
         sentence, unless the fragment looks like a question.
    """
    stripped_para = paragraph.strip()
    if not stripped_para:
        return []

    pieces = _SENT_SPLIT_RE.split(stripped_para)
    if len(pieces) <= 1:
        return [stripped_para]

    # Pass 2: undo splits caused by abbreviations ("Dr.", "e.g.", ...).
    sentences: List[str] = []
    buffer = pieces[0]
    for nxt in pieces[1:]:
        tail_word = re.search(r'\b(\w+)\.\s*$', buffer)
        if tail_word and tail_word.group(1).lower() in _ABBREVS:
            buffer = buffer.rstrip() + ' ' + nxt
        else:
            cleaned = buffer.strip()
            if cleaned:
                sentences.append(cleaned)
            buffer = nxt
    cleaned = buffer.strip()
    if cleaned:
        sentences.append(cleaned)

    if not sentences:
        return [stripped_para]

    # Pass 3: absorb tiny non-question fragments into their predecessor.
    final: List[str] = []
    for sentence in sentences:
        is_tiny = len(sentence) < 15
        looks_like_question = (bool(_QUESTION_RE.search(sentence))
                               or sentence.endswith('?'))
        if final and is_tiny and not looks_like_question:
            final[-1] = final[-1].rstrip() + ' ' + sentence
        else:
            final.append(sentence)

    return final
198
 
199
 
200
  def _split_paragraphs(text: str) -> List[str]:
201
+ """Split cleaned text into paragraphs (blank-line separated).
202
+ Also treats each list item as its own paragraph.
203
+ """
204
+ raw_paras = re.split(r'\n{2,}', text)
205
+ paras: List[str] = []
206
  for p in raw_paras:
207
  p = p.strip()
208
+ if not p:
209
+ continue
210
+ # If the paragraph contains multiple list items, split them individually
211
+ lines = p.splitlines()
212
+ if len(lines) > 1 and all(_LIST_ITEM_RE.match(ln) for ln in lines if ln.strip()):
213
+ for ln in lines:
214
+ ln = ln.strip()
215
+ if ln:
216
+ paras.append(ln)
217
+ else:
218
  paras.append(p)
219
  return paras
220
 
 
231
def _extract_pdf_pages_fitz(file_path: str) -> List[Tuple[int, str]]:
    """
    Extract per-page text using PyMuPDF (fitz) in "blocks" mode.

    Block extraction with sort=True yields layout blocks in reading order,
    so each block maps to a logical paragraph — far fewer mid-word /
    mid-sentence breaks than raw character-stream extraction.

    Returns:
        [(page_number_1based, text), ...] where a page's text is its
        non-empty text blocks joined by blank lines (recoverable later by
        _split_paragraphs). Pages without any text blocks are omitted.
    """
    import fitz  # PyMuPDF

    TEXT_BLOCK = 0  # per PyMuPDF: block_type 0 = text, 1 = image

    extracted: List[Tuple[int, str]] = []
    with fitz.open(file_path) as pdf:
        for number, page in enumerate(pdf, start=1):
            # Each block tuple: (x0, y0, x1, y1, "text", block_no, block_type)
            texts = [
                blk[4].strip()
                for blk in page.get_text("blocks", sort=True)
                if blk[6] == TEXT_BLOCK and blk[4].strip()
            ]
            if texts:
                extracted.append((number, "\n\n".join(texts)))
    return extracted
259
 
260
 
 
334
  # Core chunker
335
  # ──────────────────────────────────────────────────────────────────────────────
336
 
337
+ # Hard upper limit β€” a single chunk is never allowed to exceed this
338
+ _MAX_CHUNK_SIZE = 1400
339
+
340
+ def _overlap_seed(text: str, overlap_chars: int) -> str:
341
+ """
342
+ Return the last `overlap_chars` characters of `text`, but trim to the
343
+ start of the last complete word so we never cut mid-word.
344
+ """
345
+ if len(text) <= overlap_chars:
346
+ return text
347
+ tail = text[-overlap_chars:]
348
+ # Walk forward until we hit a word boundary (space)
349
+ idx = tail.find(' ')
350
+ return tail[idx + 1:] if idx != -1 else tail
351
+
352
+
353
def _build_chunks(
    passages: List[Tuple[str, int, Optional[str]]],  # (text, page_num, section_title)
    target_size: int = CHUNK_SIZE,
    overlap_chars: int = CHUNK_OVERLAP,
    min_chunk_size: int = 80,
) -> List[Dict]:
    """
    Paragraph-first chunking with sentence-level overflow handling.

    Strategy (in priority order):
      1. Keep paragraphs whole — paragraphs that fit are accumulated into
         the current chunk until the target size is reached.
      2. Question boundary preference — a substantial question starts (and
         may immediately close) its own chunk so questions are never split.
      3. Sentence-level split — a paragraph longer than target_size is split
         at sentence boundaries; a single sentence longer than
         _MAX_CHUNK_SIZE is cut at the last word boundary.
      4. Overlap — a word-aligned tail of the previous chunk (see
         _overlap_seed) is prepended to the next chunk for context.
      5. Min filter — chunks shorter than min_chunk_size are discarded
         (stray headings, lone numbers, etc.).

    Bug fixes vs. the previous revision:
      * add_sentence_chunks assigned current_text / current_page_start /
        current_page_end without declaring them nonlocal, so the very first
        sentence raised UnboundLocalError.
      * the question branch ended with `return`, silently dropping every
        remaining sentence of the passage; it now `continue`s.

    Returns:
        List of dicts: {text, page_start, page_end, section_title}.
    """
    chunks: List[Dict] = []
    current_text = ""
    current_page_start: Optional[int] = None
    current_page_end: Optional[int] = None
    current_section: Optional[str] = None
    seed = ""  # overlap carried into the next chunk

    def flush(force_seed: str = ""):
        """Emit the current chunk (if big enough) and reset the accumulator."""
        nonlocal current_text, current_page_start, current_page_end, seed
        text = current_text.strip()
        if len(text) >= min_chunk_size:
            chunks.append({
                "text": text,
                "page_start": current_page_start,
                "page_end": current_page_end,
                "section_title": current_section,
            })
            seed = force_seed if force_seed else _overlap_seed(text, overlap_chars)
        current_text = ""
        current_page_start = None
        current_page_end = None

    def _append_to_current(text_piece: str, page_num: int):
        """Append a piece to the accumulator and widen the page range."""
        nonlocal current_text, current_page_start, current_page_end
        sep = " " if current_text and not current_text.endswith('\n') else ""
        current_text += sep + text_piece
        if current_page_start is None:
            current_page_start = page_num
        if current_page_end is None:
            current_page_end = page_num
        else:
            current_page_end = max(current_page_end, page_num)

    def add_sentence_chunks(sentences: List[str], page_num: int):
        """
        Pack a list of sentences into chunks, respecting target/max sizes.
        Substantial questions get (and may immediately close) their own chunk.
        """
        # FIX: current_text / current_page_start / current_page_end were
        # missing from this declaration; the assignments below then created
        # function-locals and the first `if current_text` read raised
        # UnboundLocalError.
        nonlocal current_text, current_page_start, current_page_end, seed
        for sent in sentences:
            sent = sent.strip()
            if not sent:
                continue

            is_question = bool(_QUESTION_RE.search(sent)) or sent.endswith('?')

            # Oversized sentence: cut at the last word boundary before MAX.
            while len(sent) > _MAX_CHUNK_SIZE:
                cut = sent.rfind(' ', 0, _MAX_CHUNK_SIZE)
                if cut == -1:
                    cut = _MAX_CHUNK_SIZE
                piece = sent[:cut].strip()
                sent = sent[cut:].strip()
                if current_text:
                    flush()
                    current_text = (seed + " " + piece).strip() if seed else piece
                    current_page_start = current_page_end = page_num
                else:
                    current_text = (seed + " " + piece).strip() if seed else piece
                    current_page_start = current_page_end = page_num
                    flush()

            projected = len(current_text) + (1 if current_text else 0) + len(sent)

            # Substantial question -> fresh chunk, flushed at once if it is
            # already large enough to stand alone.
            if is_question and len(sent) >= 30 and current_text:
                flush()
                current_text = (seed + " " + sent).strip() if seed else sent
                current_page_start = current_page_end = page_num
                if len(sent) >= min_chunk_size:
                    flush()
                # FIX: was `return`, which dropped all remaining sentences.
                continue

            if projected > target_size and current_text:
                flush()
                current_text = (seed + " " + sent).strip() if seed else sent
                current_page_start = current_page_end = page_num
            elif not current_text:
                current_text = (seed + " " + sent).strip() if seed else sent
                current_page_start = current_page_end = page_num
            else:
                _append_to_current(sent, page_num)

    for passage_text, page_num, section_title in passages:
        # Section tracking
        if section_title:
            current_section = section_title

        para = passage_text.strip()
        if not para:
            continue

        if len(para) <= target_size:
            # Small paragraph: accumulate as-is (don't sentence-split yet).
            projected = len(current_text) + (2 if current_text else 0) + len(para)
            if projected > _MAX_CHUNK_SIZE and current_text:
                flush()
                current_text = (seed + " " + para).strip() if seed else para
                current_page_start = current_page_end = page_num
            elif not current_text:
                current_text = (seed + " " + para).strip() if seed else para
                current_page_start = current_page_end = page_num
            else:
                # Clear paragraph separator within the chunk.
                current_text += "\n\n" + para
                current_page_end = max(current_page_end or page_num, page_num)
        else:
            # Large paragraph: sentence-split and accumulate sentence by sentence.
            sentences = _split_sentences(para)
            if len(sentences) <= 1:
                # Can't split — force as its own chunk, truncated at MAX
                # (without cutting mid-word when avoidable).
                if current_text:
                    flush()
                truncated = para[:_MAX_CHUNK_SIZE]
                last_space = truncated.rfind(' ')
                if last_space > min_chunk_size:
                    truncated = truncated[:last_space]
                current_text = (seed + " " + truncated).strip() if seed else truncated
                current_page_start = current_page_end = page_num
                flush()
            else:
                add_sentence_chunks(sentences, page_num)

    # Flush the final partial chunk.
    if current_text.strip():
        flush()

    return chunks
508
 
vectordb/json_store.py CHANGED
@@ -171,6 +171,31 @@ class JSONStore:
171
  }
172
  self._save_data()
173
  print("βœ“ Deleted all documents")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  def get_stats(self) -> Dict:
176
  """Get store statistics"""
 
171
  }
172
  self._save_data()
173
  print("βœ“ Deleted all documents")
174
+
175
+ def remove_document_chunks(self, source_filename: str) -> int:
176
+ """
177
+ Remove all stored chunks that belong to the given file.
178
+
179
+ Matches on two criteria (either is sufficient):
180
+ 1. doc['id'].startswith(source_filename + '_') – ID convention used by /upload
181
+ 2. doc['metadata'].get('source') == source_filename
182
+
183
+ Returns the number of chunks removed.
184
+ """
185
+ prefix = source_filename + '_'
186
+ before = len(self.data['documents'])
187
+ self.data['documents'] = [
188
+ doc for doc in self.data['documents']
189
+ if not (
190
+ doc.get('id', '').startswith(prefix) or
191
+ doc.get('metadata', {}).get('source') == source_filename
192
+ )
193
+ ]
194
+ removed = before - len(self.data['documents'])
195
+ if removed:
196
+ self._save_data()
197
+ print(f"βœ“ Removed {removed} existing chunks for '{source_filename}'")
198
+ return removed
199
 
200
  def get_stats(self) -> Dict:
201
  """Get store statistics"""