Spaces:

Param20h
/

PDF-Assit_RAG

Running

App Files Files Community

Srushti-Kamble committed 5 days ago

Commit

f72065c

1 Parent(s): a4fe8f2

feat: add table-aware PDF chunking with pdfplumber

Browse files

Files changed (5) hide show

backend/app/rag/chunker.py +120 -1
backend/app/rag/vectorstore.py +5 -0
backend/requirements.txt +1 -0
backend/tests/test_chunker.py +49 -0
requirements.txt +2 -1

backend/app/rag/chunker.py CHANGED Viewed

@@ -2,6 +2,7 @@
 Smart document chunking using LangChain's RecursiveCharacterTextSplitter.
 Supports PDF, DOCX, TXT, and Markdown files with page-level metadata.
 """
 import fitz  # PyMuPDF
 import docx
 from typing import List, Dict, Any
@@ -11,8 +12,72 @@ from app.config import get_settings
 settings = get_settings()
 def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
-    """Extract text from PDF with page numbers."""
     doc = fitz.open(filepath)
     pages = []
@@ -22,12 +87,52 @@ def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
             pages.append({
                 "text": text,
                 "page": page_num + 1,
             })
     doc.close()
     return pages
 def extract_pdf_images(filepath: str) -> List[Dict[str, Any]]:
     """Extract images from a PDF and return list of dicts with image bytes and page number.
@@ -109,6 +214,19 @@ def chunk_document(filepath: str) -> List[Dict[str, Any]]:
     for page_data in pages:
         text = page_data["text"]
         page_num = page_data["page"]
         # Split this page's text
         splits = splitter.split_text(text)
@@ -119,6 +237,7 @@ def chunk_document(filepath: str) -> List[Dict[str, Any]]:
                     "text": split_text.strip(),
                     "page": page_num,
                     "chunk_index": chunk_index,
                 })
                 chunk_index += 1

 Smart document chunking using LangChain's RecursiveCharacterTextSplitter.
 Supports PDF, DOCX, TXT, and Markdown files with page-level metadata.
 """
+import json
 import fitz  # PyMuPDF
 import docx
 from typing import List, Dict, Any
 settings = get_settings()
+def _is_word_inside_bbox(word: Dict[str, Any], bbox: tuple) -> bool:
+    """Return True when the word center falls inside a pdfplumber bbox."""
+    x0, top, x1, bottom = bbox
+    word_x = (float(word["x0"]) + float(word["x1"])) / 2
+    word_y = (float(word["top"]) + float(word["bottom"])) / 2
+    return x0 <= word_x <= x1 and top <= word_y <= bottom
+def _words_to_text(words: List[Dict[str, Any]], line_tolerance: float = 3.0) -> str:
+    """Rebuild readable text from positioned pdfplumber words."""
+    if not words:
+        return ""
+    sorted_words = sorted(words, key=lambda item: (round(float(item["top"]) / line_tolerance), item["x0"]))
+    lines: List[List[Dict[str, Any]]] = []
+    for word in sorted_words:
+        if not lines:
+            lines.append([word])
+            continue
+        current_top = sum(float(item["top"]) for item in lines[-1]) / len(lines[-1])
+        if abs(float(word["top"]) - current_top) <= line_tolerance:
+            lines[-1].append(word)
+        else:
+            lines.append([word])
+    text_lines = [
+        " ".join(item["text"] for item in sorted(line, key=lambda item: item["x0"]))
+        for line in lines
+    ]
+    return "\n".join(line for line in text_lines if line.strip())
+def _table_to_markdown(rows: List[List[Any]]) -> str:
+    """Serialize extracted table rows into Markdown for retrieval."""
+    cleaned_rows = [
+        ["" if cell is None else str(cell).replace("\n", " ").strip() for cell in row]
+        for row in rows
+        if row and any(cell is not None and str(cell).strip() for cell in row)
+    ]
+    if not cleaned_rows:
+        return ""
+    width = max(len(row) for row in cleaned_rows)
+    normalized = [row + [""] * (width - len(row)) for row in cleaned_rows]
+    def fmt(row: List[str]) -> str:
+        return "| " + " | ".join(cell.replace("|", "\\|") for cell in row) + " |"
+    header = normalized[0]
+    separator = ["---"] * width
+    body = normalized[1:]
+    return "\n".join([fmt(header), fmt(separator), *[fmt(row) for row in body]])
 def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
+    """Extract PDF text while preserving tables as separate bbox-aware chunks."""
+    try:
+        return extract_pdf_with_tables(filepath)
+    except ImportError:
+        return extract_pdf_with_pymupdf(filepath)
+def extract_pdf_with_pymupdf(filepath: str) -> List[Dict[str, Any]]:
+    """Fallback PDF extraction with page numbers using PyMuPDF."""
     doc = fitz.open(filepath)
     pages = []
             pages.append({
                 "text": text,
                 "page": page_num + 1,
+                "chunk_type": "text",
             })
     doc.close()
     return pages
def extract_pdf_with_tables(filepath: str) -> List[Dict[str, Any]]:
    """Extract paragraph text and tables from a PDF as separate entries.

    For every page, words whose center falls inside a detected table bbox
    are excluded from the paragraph text. The remaining words are rebuilt
    into one ``"text"`` entry (skipped when blank). Each detected table is
    then emitted as a ``"table"`` entry holding its Markdown rendering, its
    bbox as a JSON string (values rounded to two decimals) and its index on
    the page. Page numbers start at 1.

    Raises:
        ImportError: when ``pdfplumber`` cannot be imported.
    """
    import pdfplumber

    entries: List[Dict[str, Any]] = []
    with pdfplumber.open(filepath) as pdf:
        page_num = 0
        for page in pdf.pages:
            page_num += 1
            found_tables = page.find_tables()
            bboxes = [found.bbox for found in found_tables]

            # Keep only the words that sit outside every table region.
            body_words = []
            for word in page.extract_words() or []:
                for bbox in bboxes:
                    if _is_word_inside_bbox(word, bbox):
                        break
                else:
                    body_words.append(word)

            body_text = _words_to_text(body_words)
            if body_text.strip():
                entries.append({
                    "text": body_text,
                    "page": page_num,
                    "chunk_type": "text",
                })

            for table_index, found in enumerate(found_tables):
                markdown = _table_to_markdown(found.extract() or [])
                if not markdown.strip():
                    continue
                rounded_bbox = [round(float(value), 2) for value in found.bbox]
                entries.append({
                    "text": markdown,
                    "page": page_num,
                    "chunk_type": "table",
                    "bbox": json.dumps(rounded_bbox),
                    "table_index": table_index,
                })
    return entries
 def extract_pdf_images(filepath: str) -> List[Dict[str, Any]]:
     """Extract images from a PDF and return list of dicts with image bytes and page number.
     for page_data in pages:
         text = page_data["text"]
         page_num = page_data["page"]
+        chunk_type = page_data.get("chunk_type", "text")
+        if chunk_type == "table":
+            all_chunks.append({
+                "text": text.strip(),
+                "page": page_num,
+                "chunk_index": chunk_index,
+                "chunk_type": "table",
+                "bbox": page_data.get("bbox", ""),
+                "table_index": page_data.get("table_index", 0),
+            })
+            chunk_index += 1
+            continue
         # Split this page's text
         splits = splitter.split_text(text)
                     "text": split_text.strip(),
                     "page": page_num,
                     "chunk_index": chunk_index,
+                    "chunk_type": chunk_type,
                 })
                 chunk_index += 1

backend/app/rag/vectorstore.py CHANGED Viewed

@@ -84,6 +84,9 @@ def store_chunks(
             "document_id": document_id,
             "page": chunk["page"],
             "chunk_index": chunk["chunk_index"],
             # Indicate whether this chunk was originally an image and include a short caption
             **({"is_image": True, "image_caption": chunk.get("image_caption", "")}
                if chunk.get("is_image") else {}),
@@ -162,6 +165,8 @@ def query_chunks(
                 "filename": metadata.get("filename", ""),
                 "document_id": metadata.get("document_id", ""),
                 "page": metadata.get("page", 1),
                 "score": round(similarity, 4),
             })

             "document_id": document_id,
             "page": chunk["page"],
             "chunk_index": chunk["chunk_index"],
+            "chunk_type": chunk.get("chunk_type", "text"),
+            **({"bbox": chunk.get("bbox", "")} if chunk.get("bbox") else {}),
+            **({"table_index": chunk.get("table_index", 0)} if chunk.get("chunk_type") == "table" else {}),
             # Indicate whether this chunk was originally an image and include a short caption
             **({"is_image": True, "image_caption": chunk.get("image_caption", "")}
                if chunk.get("is_image") else {}),
                 "filename": metadata.get("filename", ""),
                 "document_id": metadata.get("document_id", ""),
                 "page": metadata.get("page", 1),
+                "chunk_type": metadata.get("chunk_type", "text"),
+                "bbox": metadata.get("bbox", ""),
                 "score": round(similarity, 4),
             })

backend/requirements.txt CHANGED Viewed

@@ -25,6 +25,7 @@ httpx
 # Document Processing
 PyMuPDF
 python-docx
 # LangChain & RAG

 # Document Processing
 PyMuPDF
+pdfplumber
 python-docx
 # LangChain & RAG

backend/tests/test_chunker.py CHANGED Viewed

@@ -1,7 +1,10 @@
 from pathlib import Path
 import pytest
 from app.rag.chunker import chunk_document, get_page_count
@@ -36,3 +39,49 @@ def test_get_page_count_for_txt_returns_one(tmp_path):
     file_path.write_text("hello", encoding="utf-8")
     assert get_page_count(str(file_path)) == 1

 from pathlib import Path
+import sys
+import types
 import pytest
+from app.rag import chunker
 from app.rag.chunker import chunk_document, get_page_count
     file_path.write_text("hello", encoding="utf-8")
     assert get_page_count(str(file_path)) == 1
def test_pdf_table_detection_separates_table_from_paragraph(monkeypatch):
    """Table words leave the paragraph chunk and the table becomes its own chunk."""

    def make_word(text, x0, x1, top, bottom):
        return {"text": text, "x0": x0, "x1": x1, "top": top, "bottom": bottom}

    class StubTable:
        bbox = (40, 90, 300, 160)

        def extract(self):
            return [["Name", "Amount"], ["Alpha", "$10"]]

    class StubPage:
        def find_tables(self):
            return [StubTable()]

        def extract_words(self):
            # Two words above the table, four words inside its bbox.
            return [
                make_word("Intro", 40, 70, 20, 30),
                make_word("paragraph", 75, 140, 20, 30),
                make_word("Name", 45, 80, 100, 110),
                make_word("Amount", 160, 220, 100, 110),
                make_word("Alpha", 45, 85, 125, 135),
                make_word("$10", 160, 185, 125, 135),
            ]

    class StubPdf:
        pages = [StubPage()]

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, traceback):
            return False

    def open_stub(_filepath):
        return StubPdf()

    # Swap in a stand-in pdfplumber module and disable image extraction.
    monkeypatch.setitem(sys.modules, "pdfplumber", types.SimpleNamespace(open=open_stub))
    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _filepath: [])

    chunks = chunk_document("report.pdf")

    assert len(chunks) == 2
    text_chunk, table_chunk = chunks

    assert text_chunk["chunk_type"] == "text"
    assert text_chunk["text"] == "Intro paragraph"
    assert "Name" not in text_chunk["text"]

    assert table_chunk["chunk_type"] == "table"
    assert table_chunk["bbox"] == "[40.0, 90.0, 300.0, 160.0]"
    assert "| Name | Amount |" in table_chunk["text"]
    assert "| Alpha | $10 |" in table_chunk["text"]

requirements.txt CHANGED Viewed

@@ -1,6 +1,7 @@
 flask
 python-dotenv
 pymupdf
 flask-login
 pymongo
 werkzeug
@@ -12,4 +13,4 @@ requests-oauthlib
 google-genai
 cryptography
 gunicorn
-pinecone

 flask
 python-dotenv
 pymupdf
+pdfplumber
 flask-login
 pymongo
 werkzeug
 google-genai
 cryptography
 gunicorn
+pinecone