Spaces:

Jay-10020
/

cortexa-ai

Running

App Files Files Community

Jay-10020 committed about 4 hours ago

Commit

d0d84d2

1 Parent(s): 4e4501d

Minor update for chunking improvement

Browse files

Files changed (2) hide show

config.py +7 -2
vectordb/document_processor.py +451 -150

config.py CHANGED Viewed

@@ -38,8 +38,13 @@ WHISPER_MODEL = "tiny"  # Options: tiny, base, small, medium, large (tiny=75MB f
 # - large: ~3GB, best accuracy
 # Chunking settings
-CHUNK_SIZE = 512
-CHUNK_OVERLAP = 50
 MAX_CHUNKS_PER_DOC = 1000
 # Retrieval settings

 # - large: ~3GB, best accuracy
 # Chunking settings
+# CHUNK_SIZE: target characters per chunk (~800 chars ≈ 2-4 paragraphs of lecture notes).
+#   Old value was 512 which was too small and split concepts mid-sentence.
+CHUNK_SIZE = 800
+# CHUNK_OVERLAP: characters of text from the previous chunk included at the start
+#   of the next one, so the embedding always sees a coherent context boundary.
+#   Old value was 50 (word count, not chars) — now consistently chars.
+CHUNK_OVERLAP = 150
 MAX_CHUNKS_PER_DOC = 1000
 # Retrieval settings

vectordb/document_processor.py CHANGED Viewed

@@ -1,172 +1,473 @@
 """
-Document processing and chunking
 """
-import os
 from pathlib import Path
-from typing import List, Dict
-import PyPDF2
-import pdfplumber
-from docx import Document
 from config import CHUNK_SIZE, CHUNK_OVERLAP
 class DocumentChunk:
-    def __init__(
-        self,
-        text: str,
-        metadata: Dict,
-        chunk_id: int
-    ):
         self.text = text
         self.metadata = metadata
         self.chunk_id = chunk_id
-class DocumentProcessor:
-    def __init__(self):
-        self.supported_formats = ['.pdf', '.txt', '.docx']
-    def load_document(self, file_path: str) -> str:
-        """Load document content based on file type"""
-        path = Path(file_path)
-        if not path.exists():
-            raise FileNotFoundError(f"File not found: {file_path}")
-        ext = path.suffix.lower()
-        if ext == '.pdf':
-            return self._load_pdf(file_path)
-        elif ext == '.txt':
-            return self._load_txt(file_path)
-        elif ext == '.docx':
-            return self._load_docx(file_path)
-        else:
-            raise ValueError(f"Unsupported file format: {ext}")
-    def _load_pdf(self, file_path: str) -> str:
-        """Extract text from PDF"""
-        text = ""
-        try:
-            # Try pdfplumber first (better for tables)
-            with pdfplumber.open(file_path) as pdf:
-                for page in pdf.pages:
-                    page_text = page.extract_text()
-                    if page_text:
-                        text += page_text + "\n"
-        except:
-            # Fallback to PyPDF2
-            with open(file_path, 'rb') as file:
-                pdf_reader = PyPDF2.PdfReader(file)
-                for page in pdf_reader.pages:
-                    text += page.extract_text() + "\n"
-        return text.strip()
-    def _load_txt(self, file_path: str) -> str:
-        """Load text file"""
-        with open(file_path, 'r', encoding='utf-8') as file:
-            return file.read()
-    def _load_docx(self, file_path: str) -> str:
-        """Extract text from DOCX"""
-        doc = Document(file_path)
-        text = "\n".join([para.text for para in doc.paragraphs])
-        return text
-    def chunk_text(
-        self,
-        text: str,
-        chunk_size: int = CHUNK_SIZE,
-        overlap: int = CHUNK_OVERLAP
-    ) -> List[str]:
-        """
-        Split text into overlapping chunks
-        Args:
-            text: Input text
-            chunk_size: Maximum chunk size in characters
-            overlap: Overlap between chunks
-        Returns:
-            List of text chunks
-        """
-        if not text:
-            return []
-        # Split by sentences first (simple approach)
-        sentences = text.replace('\n', ' ').split('. ')
-        chunks = []
-        current_chunk = ""
         for sentence in sentences:
-            sentence = sentence.strip() + ". "
-            # If adding this sentence exceeds chunk size
-            if len(current_chunk) + len(sentence) > chunk_size:
-                if current_chunk:
-                    chunks.append(current_chunk.strip())
-                    # Start new chunk with overlap
-                    words = current_chunk.split()
-                    overlap_words = words[-overlap:] if len(words) > overlap else words
-                    current_chunk = " ".join(overlap_words) + " " + sentence
-                else:
-                    # Sentence itself is longer than chunk_size
-                    chunks.append(sentence[:chunk_size])
-                    current_chunk = sentence[chunk_size:]
             else:
-                current_chunk += sentence
-        # Add last chunk
-        if current_chunk:
-            chunks.append(current_chunk.strip())
-        return chunks
     def process_document(
         self,
         file_path: str,
-        metadata: Dict = None
     ) -> List[DocumentChunk]:
         """
-        Process document into chunks with metadata
-        Args:
-            file_path: Path to document
-            metadata: Additional metadata
-        Returns:
-            List of DocumentChunk objects
         """
-        # Load document
-        text = self.load_document(file_path)
-        # Create metadata
-        file_metadata = {
-            'source': str(Path(file_path).name),
-            'file_path': str(file_path),
-            'file_type': Path(file_path).suffix,
-            'total_chars': len(text)
         }
         if metadata:
-            file_metadata.update(metadata)
-        # Chunk text
-        chunks = self.chunk_text(text)
-        # Create DocumentChunk objects
         doc_chunks = []
-        for i, chunk in enumerate(chunks):
-            chunk_metadata = file_metadata.copy()
-            chunk_metadata['chunk_index'] = i
-            chunk_metadata['total_chunks'] = len(chunks)
-            doc_chunks.append(
-                DocumentChunk(
-                    text=chunk,
-                    metadata=chunk_metadata,
-                    chunk_id=i
-                )
-            )
         return doc_chunks

 """
+Document processing and chunking — semantic, structure-aware pipeline.
+Improvements over the old version
+──────────────────────────────────
+• PDF extracted page-by-page via PyMuPDF (fitz) → pdfplumber fallback
+  – Tracks page numbers per chunk for precise citations
+  – Auto-detects & removes repeated headers/footers (noise lines appearing
+    on ≥40 % of pages)
+  – Fixes hyphenated line-breaks (word-\nbreak → wordbreak)
+• Three-level chunking hierarchy:
+    1. Detect section headings → each section stays together where possible
+    2. Split into paragraphs (double-newline / blank line)
+    3. Split paragraphs into sentences (abbreviation-aware regex)
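+  Sentences are then accumulated into target-size chunks so a chunk never
+  ends in the middle of a sentence.
+• Chunk overlap is carried as actual character text (not a word count): each
+  new chunk starts with the trailing CHUNK_OVERLAP characters of the previous
+  chunk, so the embedding sees context from the previous chunk (the overlap
+  prefix itself may begin mid-word or mid-sentence).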
+• Minimum chunk size filter (100 chars) — avoids storing page numbers,
+  lone headers, or empty fragments.
+• Metadata per chunk now includes:  page_start, page_end, section_title,
+  char_count, chunk_index, total_chunks, source, file_type, institution_id,
+  course_id (passed in by caller).
 """
+import re
 from pathlib import Path
+from typing import List, Dict, Tuple, Optional
 from config import CHUNK_SIZE, CHUNK_OVERLAP
+# ──────────────────────────────────────────────────────────────────────────────
+#  Data class
+# ──────────────────────────────────────────────────────────────────────────────
 class DocumentChunk:
+    def __init__(self, text: str, metadata: Dict, chunk_id: int):
         self.text = text
         self.metadata = metadata
         self.chunk_id = chunk_id
+# ──────────────────────────────────────────────────────────────────────────────
+#  Low-level text utilities
+# ──────────────────────────────────────────────────────────────────────────────
+# Common abbreviations that end with a period but are NOT sentence endings.
+_ABBREV_PAT = (
+    r"Dr|Mr|Mrs|Ms|Prof|Sr|Jr|Rev|Gen|Sgt|Cpl|Pvt|Lt|Capt|Cmdr|Adm"
+    r"|etc|Fig|fig|vs|i\.e|e\.g|Eq|eq|No|ref|approx|cf|et\sal|vol|ed"
+    r"|pp|ch|sec|dept|univ|est|govt|corp|inc|ltd|co|eng|tech|lab|exp"
+    r"|max|min|avg|std|def|Def|Prop|Thm|Cor|Lem|Ex|Eg|Jan|Feb|Mar|Apr"
+    r"|Jun|Jul|Aug|Sep|Oct|Nov|Dec|Mon|Tue|Wed|Thu|Fri|Sat|Sun"
+)
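+# Sentence boundary: (. or ! or ?) followed by 1-3 whitespace characters and then
+# an uppercase letter, digit, or double quote, but NOT preceded by a known
+# abbreviation.
+# NOTE(review): the abbreviation alternatives differ in length, and Python's `re`
+# only accepts fixed-width look-behind, so confirm this pattern compiles. The
+# look-behind is also evaluated after the punctuation mark, so it would see "Dr."
+# rather than "Dr" — verify that abbreviations are actually excluded.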
+_SENT_BOUNDARY = re.compile(
+    r"(?<!(?:" + _ABBREV_PAT + r"))(?<=[.!?])\s{1,3}(?=[A-Z0-9\"])"
+)
+# Unicode ligatures that PDFs sometimes embed
+_LIGATURES = str.maketrans({
+    "\uFB00": "ff", "\uFB01": "fi", "\uFB02": "fl",
+    "\uFB03": "ffi", "\uFB04": "ffl", "\uFB05": "st", "\uFB06": "st",
+})
+# Heading detection: line is a heading if it matches any of these
+_HEADING_RE = re.compile(
+    r"^\s*("
+    r"\d+(\.\d+)*\.?\s+[A-Z]"          # 1. Introduction / 1.2 Overview
+    r"|[A-Z][A-Z\s]{4,}[A-Z]"          # ALL CAPS (min 6 chars)
+    r"|Chapter\s+\d+"                   # Chapter N
+    r"|Section\s+\d+"                   # Section N
+    r"|[IVXLCDM]+\.\s+[A-Z]"           # Roman numeral heading
+    r")\s*$",
+    re.MULTILINE,
+)
+def _fix_text(raw: str) -> str:
+    """Light cleaning that preserves paragraph structure."""
+    text = raw.translate(_LIGATURES)
+    # Fix soft-hyphen / hard-hyphen line-breaks: "some-\nword" → "someword"
+    text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
+    # Replace single lone newlines inside a paragraph with a space
+    # but preserve real paragraph breaks (2+ newlines stay)
+    text = re.sub(r"(?<!\n)\n(?!\n)", " ", text)
+    # Collapse runs of spaces (but not newlines)
+    text = re.sub(r"[ \t]{2,}", " ", text)
+    # Collapse 3+ blank lines to 2
+    text = re.sub(r"\n{3,}", "\n\n", text)
+    return text.strip()
+def _split_sentences(paragraph: str) -> List[str]:
+    """Split a paragraph into sentences using abbreviation-aware regex."""
+    parts = _SENT_BOUNDARY.split(paragraph.strip())
+    sentences = []
+    for part in parts:
+        part = part.strip()
+        if part:
+            sentences.append(part)
+    return sentences if sentences else [paragraph.strip()]
+def _split_paragraphs(text: str) -> List[str]:
+    """Split cleaned text into paragraphs (blank-line or indent separated)."""
+    # Split on double newlines (blank lines)
+    raw_paras = re.split(r"\n{2,}", text)
+    paras = []
+    for p in raw_paras:
+        p = p.strip()
+        if p:
+            paras.append(p)
+    return paras
+def _detect_heading(line: str) -> bool:
+    """Return True if the line looks like a section heading."""
+    return bool(_HEADING_RE.match(line.strip()))
+# ──────────────────────────────────────────────────────────────────────────────
+#  PDF extraction helpers
+# ──────────────────────────────────────────────────────────────────────────────
+def _extract_pdf_pages_fitz(file_path: str) -> List[Tuple[int, str]]:
+    """
+    Extract text per page using PyMuPDF (fitz).
+    Returns [(page_number_1based, text), ...].
+    """
+    import fitz  # PyMuPDF
+    pages = []
+    with fitz.open(file_path) as doc:
+        for i, page in enumerate(doc, start=1):
+            text = page.get_text("text")  # plain text, respects reading order
+            if text.strip():
+                pages.append((i, text))
+    return pages
+def _extract_pdf_pages_pdfplumber(file_path: str) -> List[Tuple[int, str]]:
+    """Fallback: extract per-page text via pdfplumber."""
+    import pdfplumber
+    pages = []
+    with pdfplumber.open(file_path) as pdf:
+        for i, page in enumerate(pdf.pages, start=1):
+            text = page.extract_text() or ""
+            if text.strip():
+                pages.append((i, text))
+    return pages
+def _extract_pdf_pages_pypdf2(file_path: str) -> List[Tuple[int, str]]:
+    """Last resort: PyPDF2 per page."""
+    import PyPDF2
+    pages = []
+    with open(file_path, "rb") as f:
+        reader = PyPDF2.PdfReader(f)
+        for i, page in enumerate(reader.pages, start=1):
+            text = page.extract_text() or ""
+            if text.strip():
+                pages.append((i, text))
+    return pages
+def _remove_headers_footers(
+    pages: List[Tuple[int, str]],
+    threshold: float = 0.40,
+) -> List[Tuple[int, str]]:
+    """
+    Remove lines that appear almost identically on ≥ threshold fraction of pages
+    — these are headers/footers (e.g. "Confidential", "Page N", course title).
+    """
+    if len(pages) < 3:
+        return pages   # too few pages to detect reliably
+    # Collect first-line and last-line of each page (most common header/footer positions)
+    first_lines: Dict[str, int] = {}
+    last_lines: Dict[str, int] = {}
+    for _, text in pages:
+        lines = [l.strip() for l in text.splitlines() if l.strip()]
+        if not lines:
+            continue
+        # Normalise: strip numbers from the lines to catch "Page 1", "Page 2", etc.
+        first = re.sub(r"\b\d+\b", "N", lines[0])
+        last  = re.sub(r"\b\d+\b", "N", lines[-1])
+        first_lines[first] = first_lines.get(first, 0) + 1
+        last_lines[last]   = last_lines.get(last, 0) + 1
+    total = len(pages)
+    noisy_first = {k for k, v in first_lines.items() if v / total >= threshold}
+    noisy_last  = {k for k, v in last_lines.items()  if v / total >= threshold}
+    cleaned = []
+    for page_num, text in pages:
+        lines = text.splitlines()
+        filtered = []
+        for idx, line in enumerate(lines):
+            normalised = re.sub(r"\b\d+\b", "N", line.strip())
+            if idx == 0 and normalised in noisy_first:
+                continue
+            if idx == len(lines) - 1 and normalised in noisy_last:
+                continue
+            # Also skip lone page-number lines anywhere in the page
+            if re.fullmatch(r"[\s\-–—]*\d{1,4}[\s\-–—]*", line):
+                continue
+            filtered.append(line)
+        cleaned.append((page_num, "\n".join(filtered)))
+    return cleaned
+# ──────────────────────────────────────────────────────────────────────────────
+#  Core chunker
+# ──────────────────────────────────────────────────────────────────────────────
+def _build_chunks(
+    passages: List[Tuple[str, int, Optional[str]]],  # (text, page_num, section_title)
+    target_size: int = CHUNK_SIZE,
+    overlap_chars: int = CHUNK_OVERLAP,
+    min_chunk_size: int = 100,
+) -> List[Dict]:
+    """
+    Accumulate sentence-split text into target-sized chunks with char overlap.
+    Each passage is split into sentences. Sentences are packed into the current
+    chunk until the target_size would be exceeded, then the chunk is flushed
+    and a new one starts, seeded with the last `overlap_chars` characters of
+    the previous chunk (so context bleeds across chunk boundaries).
+    Returns a list of dicts: {text, page_start, page_end, section_title}.
+    """
+    chunks: List[Dict] = []
+    current_text = ""
+    current_page_start: Optional[int] = None
+    current_page_end: Optional[int] = None
+    current_section: Optional[str] = None
+    overlap_seed = ""   # tail of the last chunk
+    def flush():
+        nonlocal current_text, current_page_start, current_page_end, current_section, overlap_seed
+        text = current_text.strip()
+        if len(text) >= min_chunk_size:
+            chunks.append({
+                "text": text,
+                "page_start": current_page_start,
+                "page_end": current_page_end,
+                "section_title": current_section,
+            })
+            # Seed next chunk with the last overlap_chars of this chunk
+            overlap_seed = text[-overlap_chars:] if len(text) > overlap_chars else text
+        current_text = ""
+        current_page_start = None
+        current_page_end = None
+    for passage_text, page_num, section_title in passages:
+        # Update section tracking
+        if section_title:
+            current_section = section_title
+        sentences = _split_sentences(passage_text)
         for sentence in sentences:
+            sentence = sentence.strip()
+            if not sentence:
+                continue
+            # Would adding this sentence overflow the target?
+            projected = len(current_text) + (1 if current_text else 0) + len(sentence)
+            if projected > target_size and current_text:
+                flush()
+                # Start new chunk from overlap seed
+                current_text = overlap_seed + (" " if overlap_seed else "") + sentence
+                current_page_start = page_num
+                current_page_end = page_num
             else:
+                if not current_text:
+                    # Fresh chunk — include overlap seed first
+                    current_text = (overlap_seed + " " + sentence).strip() if overlap_seed else sentence
+                    current_page_start = page_num
+                else:
+                    current_text += " " + sentence
+                if current_page_end is None:
+                    current_page_end = page_num
+                else:
+                    current_page_end = max(current_page_end, page_num)
+    # Flush the last partial chunk
+    if current_text.strip():
+        flush()
+    return chunks
+# ──────────────────────────────────────────────────────────────────────────────
+#  Main processor class (public API unchanged)
+# ──────────────────────────────────────────────────────────────────────────────
+class DocumentProcessor:
+    def __init__(self):
+        self.supported_formats = [".pdf", ".txt", ".docx"]
+    # ── Public entry point ────────────────────────────────────────────────────
     def process_document(
         self,
         file_path: str,
+        metadata: Dict = None,
     ) -> List[DocumentChunk]:
         """
+        Process a document file into semantically coherent chunks.
+        Returns a list of DocumentChunk objects; interface is unchanged.
         """
+        path = Path(file_path)
+        ext = path.suffix.lower()
+        if ext == ".pdf":
+            pages = self._load_pdf_pages(file_path)
+        elif ext == ".txt":
+            raw = self._load_txt(file_path)
+            pages = [(1, raw)]
+        elif ext == ".docx":
+            raw = self._load_docx(file_path)
+            pages = [(1, raw)]
+        else:
+            raise ValueError(f"Unsupported file format: {ext}")
+        # Build base metadata
+        file_meta = {
+            "source": path.name,
+            "file_path": str(file_path),
+            "file_type": ext,
         }
         if metadata:
+            file_meta.update(metadata)
+        # Convert pages to passage list with section tracking
+        passages = self._pages_to_passages(pages)
+        # Build variable-length chunks
+        raw_chunks = _build_chunks(passages, target_size=CHUNK_SIZE, overlap_chars=CHUNK_OVERLAP)
+        # Wrap into DocumentChunk objects
         doc_chunks = []
+        total = len(raw_chunks)
+        for i, rc in enumerate(raw_chunks):
+            chunk_meta = file_meta.copy()
+            chunk_meta["chunk_index"] = i
+            chunk_meta["total_chunks"] = total
+            chunk_meta["char_count"] = len(rc["text"])
+            chunk_meta["page_start"] = rc.get("page_start")
+            chunk_meta["page_end"] = rc.get("page_end")
+            if rc.get("section_title"):
+                chunk_meta["section_title"] = rc["section_title"]
+            doc_chunks.append(DocumentChunk(
+                text=rc["text"],
+                metadata=chunk_meta,
+                chunk_id=i,
+            ))
+        print(f"✅ Chunked '{path.name}' → {total} chunks "
+              f"(avg {sum(len(c.text) for c in doc_chunks)//max(total,1)} chars each)")
         return doc_chunks
+    # ── Legacy interface (still works; used by some older code paths) ─────────
+    def load_document(self, file_path: str) -> str:
+        """Return the full cleaned text of a document as a single string."""
+        ext = Path(file_path).suffix.lower()
+        if ext == ".pdf":
+            pages = self._load_pdf_pages(file_path)
+            return "\n\n".join(text for _, text in pages)
+        elif ext == ".txt":
+            return self._load_txt(file_path)
+        elif ext == ".docx":
+            return self._load_docx(file_path)
+        raise ValueError(f"Unsupported format: {ext}")
+    def chunk_text(self, text: str, chunk_size: int = CHUNK_SIZE,
+                   overlap: int = CHUNK_OVERLAP) -> List[str]:
+        """Legacy helper — returns list of chunk strings from a raw text blob."""
+        passages = [(text, 1, None)]
+        raw_chunks = _build_chunks(passages, target_size=chunk_size, overlap_chars=overlap)
+        return [rc["text"] for rc in raw_chunks]
+    # ── PDF loading ───────────────────────────────────────────────────────────
+    def _load_pdf_pages(self, file_path: str) -> List[Tuple[int, str]]:
+        """Extract per-page text from a PDF with fallback chain."""
+        pages = None
+        # 1. PyMuPDF (best quality, respects reading order)
+        try:
+            pages = _extract_pdf_pages_fitz(file_path)
+        except Exception as e:
+            print(f"  fitz failed ({e}), trying pdfplumber…")
+        # 2. pdfplumber
+        if not pages:
+            try:
+                pages = _extract_pdf_pages_pdfplumber(file_path)
+            except Exception as e:
+                print(f"  pdfplumber failed ({e}), trying PyPDF2…")
+        # 3. PyPDF2 last resort
+        if not pages:
+            pages = _extract_pdf_pages_pypdf2(file_path)
+        if not pages:
+            raise RuntimeError(f"Could not extract any text from: {file_path}")
+        # Remove noise headers/footers, then clean each page
+        pages = _remove_headers_footers(pages)
+        return [(pn, _fix_text(text)) for pn, text in pages if _fix_text(text)]
+    # ── Plain text / DOCX loading ─────────────────────────────────────────────
+    def _load_txt(self, file_path: str) -> str:
+        with open(file_path, "r", encoding="utf-8", errors="replace") as f:
+            return _fix_text(f.read())
+    def _load_docx(self, file_path: str) -> str:
+        from docx import Document as DocxDoc
+        doc = DocxDoc(file_path)
+        paragraphs = []
+        for para in doc.paragraphs:
+            text = para.text.strip()
+            if text:
+                paragraphs.append(text)
+        return _fix_text("\n\n".join(paragraphs))
+    # ── Section/passage extraction ────────────────────────────────────────────
+    def _pages_to_passages(
+        self,
+        pages: List[Tuple[int, str]],
+    ) -> List[Tuple[str, int, Optional[str]]]:
+        """
+        Convert (page_num, text) pairs into a flat list of
+        (passage_text, page_num, section_title) tuples.
+        Detects section headings and tags each passage with the most recent
+        heading seen.  Paragraphs within a page are exploded into separate
+        passages so that the chunker can work at fine granularity.
+        """
+        passages: List[Tuple[str, int, Optional[str]]] = []
+        current_section: Optional[str] = None
+        for page_num, page_text in pages:
+            # Split the page into paragraphs
+            paragraphs = _split_paragraphs(page_text)
+            for para in paragraphs:
+                if not para.strip():
+                    continue
+                # Is this paragraph a standalone heading?
+                first_line = para.splitlines()[0].strip()
+                if _detect_heading(first_line) and len(para.strip()) < 120:
+                    current_section = para.strip()
+                    # Don't create a chunk for a bare heading — it'll be absorbed
+                    # into the next passage as its section_title context
+                    continue
+                passages.append((para, page_num, current_section))
+        return passages