Spaces:
Running
Running
Srushti-Kamble committed on
Commit ·
f72065c
1
Parent(s): a4fe8f2
feat: add table-aware PDF chunking with pdfplumber
Browse files- backend/app/rag/chunker.py +120 -1
- backend/app/rag/vectorstore.py +5 -0
- backend/requirements.txt +1 -0
- backend/tests/test_chunker.py +49 -0
- requirements.txt +2 -1
backend/app/rag/chunker.py
CHANGED
|
@@ -2,6 +2,7 @@
|
|
| 2 |
Smart document chunking using LangChain's RecursiveCharacterTextSplitter.
|
| 3 |
Supports PDF, DOCX, TXT, and Markdown files with page-level metadata.
|
| 4 |
"""
|
|
|
|
| 5 |
import fitz # PyMuPDF
|
| 6 |
import docx
|
| 7 |
from typing import List, Dict, Any
|
|
@@ -11,8 +12,72 @@ from app.config import get_settings
|
|
| 11 |
settings = get_settings()
|
| 12 |
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
|
| 15 |
-
"""Extract text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
doc = fitz.open(filepath)
|
| 17 |
pages = []
|
| 18 |
|
|
@@ -22,12 +87,52 @@ def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
|
|
| 22 |
pages.append({
|
| 23 |
"text": text,
|
| 24 |
"page": page_num + 1,
|
|
|
|
| 25 |
})
|
| 26 |
|
| 27 |
doc.close()
|
| 28 |
return pages
|
| 29 |
|
| 30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
def extract_pdf_images(filepath: str) -> List[Dict[str, Any]]:
|
| 32 |
"""Extract images from a PDF and return list of dicts with image bytes and page number.
|
| 33 |
|
|
@@ -109,6 +214,19 @@ def chunk_document(filepath: str) -> List[Dict[str, Any]]:
|
|
| 109 |
for page_data in pages:
|
| 110 |
text = page_data["text"]
|
| 111 |
page_num = page_data["page"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
# Split this page's text
|
| 114 |
splits = splitter.split_text(text)
|
|
@@ -119,6 +237,7 @@ def chunk_document(filepath: str) -> List[Dict[str, Any]]:
|
|
| 119 |
"text": split_text.strip(),
|
| 120 |
"page": page_num,
|
| 121 |
"chunk_index": chunk_index,
|
|
|
|
| 122 |
})
|
| 123 |
chunk_index += 1
|
| 124 |
|
|
|
|
| 2 |
Smart document chunking using LangChain's RecursiveCharacterTextSplitter.
|
| 3 |
Supports PDF, DOCX, TXT, and Markdown files with page-level metadata.
|
| 4 |
"""
|
| 5 |
+
import json
|
| 6 |
import fitz # PyMuPDF
|
| 7 |
import docx
|
| 8 |
from typing import List, Dict, Any
|
|
|
|
| 12 |
settings = get_settings()
|
| 13 |
|
| 14 |
|
| 15 |
+
def _is_word_inside_bbox(word: Dict[str, Any], bbox: tuple) -> bool:
    """Check whether a word's midpoint lies within a pdfplumber bounding box.

    ``bbox`` is unpacked as ``(x0, top, x1, bottom)``. A midpoint that sits
    exactly on an edge counts as inside.
    """
    left, upper, right, lower = bbox
    center_x = (float(word["x0"]) + float(word["x1"])) / 2
    center_y = (float(word["top"]) + float(word["bottom"])) / 2
    # Reject on the horizontal axis first, then decide on the vertical one.
    if not (left <= center_x <= right):
        return False
    return upper <= center_y <= lower
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _words_to_text(words: List[Dict[str, Any]], line_tolerance: float = 3.0) -> str:
    """Rebuild readable text from positioned pdfplumber words.

    Words are ordered top-to-bottom and merged into the line currently being
    built while their ``top`` stays within ``line_tolerance`` of that line's
    mean ``top``. Each finished line is then ordered left-to-right by ``x0``.

    Args:
        words: Word dicts carrying at least ``text``, ``x0`` and ``top``.
        line_tolerance: Largest vertical distance between a word and the
            current line for the word to join it. ``0`` merges only words
            whose ``top`` equals the line's mean ``top``.

    Returns:
        The rebuilt lines joined with ``"\\n"``, with blank lines dropped.
        An empty string when ``words`` is empty.
    """
    if not words:
        return ""

    # Sort on the real vertical position. Sorting on a rounded
    # ``top / line_tolerance`` bucket can place a word that sits just across
    # a bucket boundary after words of the following line, which then makes
    # the grouping below attach it to the wrong line (and divides by zero
    # when ``line_tolerance`` is 0).
    ordered = sorted(
        words,
        key=lambda item: (float(item["top"]), float(item["x0"])),
    )

    lines: List[List[Dict[str, Any]]] = []
    for word in ordered:
        if lines:
            current = lines[-1]
            current_top = sum(float(item["top"]) for item in current) / len(current)
            if abs(float(word["top"]) - current_top) <= line_tolerance:
                current.append(word)
                continue
        lines.append([word])

    text_lines = [
        " ".join(
            item["text"]
            for item in sorted(line, key=lambda item: float(item["x0"]))
        )
        for line in lines
    ]
    return "\n".join(line for line in text_lines if line.strip())
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def _table_to_markdown(rows: List[List[Any]]) -> str:
    """Render extracted table rows as a Markdown table string.

    Rows that are empty or contain only ``None``/blank cells are dropped.
    Remaining cells are stringified, have newlines replaced by spaces and are
    stripped; ``None`` becomes an empty cell. Short rows are right-padded with
    empty cells, the first kept row is used as the header, and literal ``|``
    characters inside cells are backslash-escaped.

    Returns an empty string when no row survives the filtering.
    """
    kept: List[List[str]] = []
    for raw_row in rows:
        if not raw_row:
            continue
        if not any(cell is not None and str(cell).strip() for cell in raw_row):
            continue
        kept.append(
            ["" if cell is None else str(cell).replace("\n", " ").strip() for cell in raw_row]
        )

    if not kept:
        return ""

    column_count = max(map(len, kept))

    def render(cells: List[str]) -> str:
        escaped = [cell.replace("|", "\\|") for cell in cells]
        return "| " + " | ".join(escaped) + " |"

    output: List[str] = []
    for position, row in enumerate(kept):
        output.append(render(row + [""] * (column_count - len(row))))
        if position == 0:
            # The separator row goes directly under the header row.
            output.append(render(["---"] * column_count))
    return "\n".join(output)
|
| 69 |
+
|
| 70 |
+
|
| 71 |
def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
    """Extract PDF content, preferring the table-aware extraction path.

    Delegates to ``extract_pdf_with_tables``. Only when that call raises
    ``ImportError`` does it fall back to ``extract_pdf_with_pymupdf``; any
    other exception propagates to the caller.
    """
    try:
        extracted = extract_pdf_with_tables(filepath)
    except ImportError:
        extracted = extract_pdf_with_pymupdf(filepath)
    return extracted
|
| 77 |
+
|
| 78 |
+
|
| 79 |
+
def extract_pdf_with_pymupdf(filepath: str) -> List[Dict[str, Any]]:
|
| 80 |
+
"""Fallback PDF extraction with page numbers using PyMuPDF."""
|
| 81 |
doc = fitz.open(filepath)
|
| 82 |
pages = []
|
| 83 |
|
|
|
|
| 87 |
pages.append({
|
| 88 |
"text": text,
|
| 89 |
"page": page_num + 1,
|
| 90 |
+
"chunk_type": "text",
|
| 91 |
})
|
| 92 |
|
| 93 |
doc.close()
|
| 94 |
return pages
|
| 95 |
|
| 96 |
|
| 97 |
+
def extract_pdf_with_tables(filepath: str) -> List[Dict[str, Any]]:
    """Extract per-page paragraph text and tables from a PDF via pdfplumber.

    For every page, words whose midpoint falls inside a detected table's bbox
    are excluded from the paragraph text. The page's paragraph entry (if any
    text remains) is appended first, followed by one entry per non-empty
    table, serialized as Markdown.

    Returns:
        A list of dicts. Text entries carry ``text``, ``page`` (1-based) and
        ``chunk_type == "text"``. Table entries carry ``text``, ``page``,
        ``chunk_type == "table"``, ``bbox`` (a JSON string of the table bbox
        rounded to two decimals) and ``table_index`` (position among the
        page's detected tables).

    Raises:
        ImportError: if pdfplumber cannot be imported.
    """
    # Imported lazily so a missing pdfplumber surfaces as ImportError here.
    import pdfplumber

    results: List[Dict[str, Any]] = []

    with pdfplumber.open(filepath) as pdf:
        for page_number, page in enumerate(pdf.pages, start=1):
            found_tables = page.find_tables()
            regions = [found.bbox for found in found_tables]

            body_words = []
            for word in page.extract_words() or []:
                in_table = False
                for region in regions:
                    if _is_word_inside_bbox(word, region):
                        in_table = True
                        break
                if not in_table:
                    body_words.append(word)

            body_text = _words_to_text(body_words)
            if body_text.strip():
                results.append({
                    "text": body_text,
                    "page": page_number,
                    "chunk_type": "text",
                })

            for position, found in enumerate(found_tables):
                markdown = _table_to_markdown(found.extract() or [])
                if not markdown.strip():
                    continue
                rounded_bbox = [round(float(value), 2) for value in found.bbox]
                results.append({
                    "text": markdown,
                    "page": page_number,
                    "chunk_type": "table",
                    "bbox": json.dumps(rounded_bbox),
                    "table_index": position,
                })

    return results
|
| 134 |
+
|
| 135 |
+
|
| 136 |
def extract_pdf_images(filepath: str) -> List[Dict[str, Any]]:
|
| 137 |
"""Extract images from a PDF and return list of dicts with image bytes and page number.
|
| 138 |
|
|
|
|
| 214 |
for page_data in pages:
|
| 215 |
text = page_data["text"]
|
| 216 |
page_num = page_data["page"]
|
| 217 |
+
chunk_type = page_data.get("chunk_type", "text")
|
| 218 |
+
|
| 219 |
+
if chunk_type == "table":
|
| 220 |
+
all_chunks.append({
|
| 221 |
+
"text": text.strip(),
|
| 222 |
+
"page": page_num,
|
| 223 |
+
"chunk_index": chunk_index,
|
| 224 |
+
"chunk_type": "table",
|
| 225 |
+
"bbox": page_data.get("bbox", ""),
|
| 226 |
+
"table_index": page_data.get("table_index", 0),
|
| 227 |
+
})
|
| 228 |
+
chunk_index += 1
|
| 229 |
+
continue
|
| 230 |
|
| 231 |
# Split this page's text
|
| 232 |
splits = splitter.split_text(text)
|
|
|
|
| 237 |
"text": split_text.strip(),
|
| 238 |
"page": page_num,
|
| 239 |
"chunk_index": chunk_index,
|
| 240 |
+
"chunk_type": chunk_type,
|
| 241 |
})
|
| 242 |
chunk_index += 1
|
| 243 |
|
backend/app/rag/vectorstore.py
CHANGED
|
@@ -84,6 +84,9 @@ def store_chunks(
|
|
| 84 |
"document_id": document_id,
|
| 85 |
"page": chunk["page"],
|
| 86 |
"chunk_index": chunk["chunk_index"],
|
|
|
|
|
|
|
|
|
|
| 87 |
# Indicate whether this chunk was originally an image and include a short caption
|
| 88 |
**({"is_image": True, "image_caption": chunk.get("image_caption", "")}
|
| 89 |
if chunk.get("is_image") else {}),
|
|
@@ -162,6 +165,8 @@ def query_chunks(
|
|
| 162 |
"filename": metadata.get("filename", ""),
|
| 163 |
"document_id": metadata.get("document_id", ""),
|
| 164 |
"page": metadata.get("page", 1),
|
|
|
|
|
|
|
| 165 |
"score": round(similarity, 4),
|
| 166 |
})
|
| 167 |
|
|
|
|
| 84 |
"document_id": document_id,
|
| 85 |
"page": chunk["page"],
|
| 86 |
"chunk_index": chunk["chunk_index"],
|
| 87 |
+
"chunk_type": chunk.get("chunk_type", "text"),
|
| 88 |
+
**({"bbox": chunk.get("bbox", "")} if chunk.get("bbox") else {}),
|
| 89 |
+
**({"table_index": chunk.get("table_index", 0)} if chunk.get("chunk_type") == "table" else {}),
|
| 90 |
# Indicate whether this chunk was originally an image and include a short caption
|
| 91 |
**({"is_image": True, "image_caption": chunk.get("image_caption", "")}
|
| 92 |
if chunk.get("is_image") else {}),
|
|
|
|
| 165 |
"filename": metadata.get("filename", ""),
|
| 166 |
"document_id": metadata.get("document_id", ""),
|
| 167 |
"page": metadata.get("page", 1),
|
| 168 |
+
"chunk_type": metadata.get("chunk_type", "text"),
|
| 169 |
+
"bbox": metadata.get("bbox", ""),
|
| 170 |
"score": round(similarity, 4),
|
| 171 |
})
|
| 172 |
|
backend/requirements.txt
CHANGED
|
@@ -25,6 +25,7 @@ httpx
|
|
| 25 |
|
| 26 |
# Document Processing
|
| 27 |
PyMuPDF
|
|
|
|
| 28 |
python-docx
|
| 29 |
|
| 30 |
# LangChain & RAG
|
|
|
|
| 25 |
|
| 26 |
# Document Processing
|
| 27 |
PyMuPDF
|
| 28 |
+
pdfplumber
|
| 29 |
python-docx
|
| 30 |
|
| 31 |
# LangChain & RAG
|
backend/tests/test_chunker.py
CHANGED
|
@@ -1,7 +1,10 @@
|
|
| 1 |
from pathlib import Path
|
|
|
|
|
|
|
| 2 |
|
| 3 |
import pytest
|
| 4 |
|
|
|
|
| 5 |
from app.rag.chunker import chunk_document, get_page_count
|
| 6 |
|
| 7 |
|
|
@@ -36,3 +39,49 @@ def test_get_page_count_for_txt_returns_one(tmp_path):
|
|
| 36 |
file_path.write_text("hello", encoding="utf-8")
|
| 37 |
|
| 38 |
assert get_page_count(str(file_path)) == 1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
+
import sys
|
| 3 |
+
import types
|
| 4 |
|
| 5 |
import pytest
|
| 6 |
|
| 7 |
+
from app.rag import chunker
|
| 8 |
from app.rag.chunker import chunk_document, get_page_count
|
| 9 |
|
| 10 |
|
|
|
|
| 39 |
file_path.write_text("hello", encoding="utf-8")
|
| 40 |
|
| 41 |
assert get_page_count(str(file_path)) == 1
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def test_pdf_table_detection_separates_table_from_paragraph(monkeypatch):
    """A page with prose above a table yields one text chunk and one table chunk."""
    # (text, x0, x1, top, bottom): one prose line, then two table rows.
    positioned = [
        ("Intro", 40, 70, 20, 30),
        ("paragraph", 75, 140, 20, 30),
        ("Name", 45, 80, 100, 110),
        ("Amount", 160, 220, 100, 110),
        ("Alpha", 45, 85, 125, 135),
        ("$10", 160, 185, 125, 135),
    ]

    class StubTable:
        bbox = (40, 90, 300, 160)

        def extract(self):
            return [["Name", "Amount"], ["Alpha", "$10"]]

    class StubPage:
        def find_tables(self):
            return [StubTable()]

        def extract_words(self):
            return [
                {"text": text, "x0": x0, "x1": x1, "top": top, "bottom": bottom}
                for text, x0, x1, top, bottom in positioned
            ]

    class StubPdf:
        pages = [StubPage()]

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, traceback):
            return False

    def open_stub(_filepath):
        return StubPdf()

    # Swap in a stand-in pdfplumber module and disable image extraction.
    monkeypatch.setitem(sys.modules, "pdfplumber", types.SimpleNamespace(open=open_stub))
    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _filepath: [])

    chunks = chunk_document("report.pdf")

    assert len(chunks) == 2

    text_chunk, table_chunk = chunks
    assert text_chunk["chunk_type"] == "text"
    assert text_chunk["text"] == "Intro paragraph"
    assert "Name" not in text_chunk["text"]

    assert table_chunk["chunk_type"] == "table"
    assert table_chunk["bbox"] == "[40.0, 90.0, 300.0, 160.0]"
    assert "| Name | Amount |" in table_chunk["text"]
    assert "| Alpha | $10 |" in table_chunk["text"]
|
requirements.txt
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
flask
|
| 2 |
python-dotenv
|
| 3 |
pymupdf
|
|
|
|
| 4 |
flask-login
|
| 5 |
pymongo
|
| 6 |
werkzeug
|
|
@@ -12,4 +13,4 @@ requests-oauthlib
|
|
| 12 |
google-genai
|
| 13 |
cryptography
|
| 14 |
gunicorn
|
| 15 |
-
pinecone
|
|
|
|
| 1 |
flask
|
| 2 |
python-dotenv
|
| 3 |
pymupdf
|
| 4 |
+
pdfplumber
|
| 5 |
flask-login
|
| 6 |
pymongo
|
| 7 |
werkzeug
|
|
|
|
| 13 |
google-genai
|
| 14 |
cryptography
|
| 15 |
gunicorn
|
| 16 |
+
pinecone
|