Srushti-Kamble committed on
Commit
f72065c
·
1 Parent(s): a4fe8f2

feat: add table-aware PDF chunking with pdfplumber

Browse files
backend/app/rag/chunker.py CHANGED
@@ -2,6 +2,7 @@
2
  Smart document chunking using LangChain's RecursiveCharacterTextSplitter.
3
  Supports PDF, DOCX, TXT, and Markdown files with page-level metadata.
4
  """
 
5
  import fitz # PyMuPDF
6
  import docx
7
  from typing import List, Dict, Any
@@ -11,8 +12,72 @@ from app.config import get_settings
11
  settings = get_settings()
12
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
15
- """Extract text from PDF with page numbers."""
 
 
 
 
 
 
 
 
16
  doc = fitz.open(filepath)
17
  pages = []
18
 
@@ -22,12 +87,52 @@ def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
22
  pages.append({
23
  "text": text,
24
  "page": page_num + 1,
 
25
  })
26
 
27
  doc.close()
28
  return pages
29
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  def extract_pdf_images(filepath: str) -> List[Dict[str, Any]]:
32
  """Extract images from a PDF and return list of dicts with image bytes and page number.
33
 
@@ -109,6 +214,19 @@ def chunk_document(filepath: str) -> List[Dict[str, Any]]:
109
  for page_data in pages:
110
  text = page_data["text"]
111
  page_num = page_data["page"]
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
  # Split this page's text
114
  splits = splitter.split_text(text)
@@ -119,6 +237,7 @@ def chunk_document(filepath: str) -> List[Dict[str, Any]]:
119
  "text": split_text.strip(),
120
  "page": page_num,
121
  "chunk_index": chunk_index,
 
122
  })
123
  chunk_index += 1
124
 
 
2
  Smart document chunking using LangChain's RecursiveCharacterTextSplitter.
3
  Supports PDF, DOCX, TXT, and Markdown files with page-level metadata.
4
  """
5
+ import json
6
  import fitz # PyMuPDF
7
  import docx
8
  from typing import List, Dict, Any
 
12
  settings = get_settings()
13
 
14
 
15
+ def _is_word_inside_bbox(word: Dict[str, Any], bbox: tuple) -> bool:
16
+ """Return True when the word center falls inside a pdfplumber bbox."""
17
+ x0, top, x1, bottom = bbox
18
+ word_x = (float(word["x0"]) + float(word["x1"])) / 2
19
+ word_y = (float(word["top"]) + float(word["bottom"])) / 2
20
+ return x0 <= word_x <= x1 and top <= word_y <= bottom
21
+
22
+
23
+ def _words_to_text(words: List[Dict[str, Any]], line_tolerance: float = 3.0) -> str:
24
+ """Rebuild readable text from positioned pdfplumber words."""
25
+ if not words:
26
+ return ""
27
+
28
+ sorted_words = sorted(words, key=lambda item: (round(float(item["top"]) / line_tolerance), item["x0"]))
29
+ lines: List[List[Dict[str, Any]]] = []
30
+
31
+ for word in sorted_words:
32
+ if not lines:
33
+ lines.append([word])
34
+ continue
35
+
36
+ current_top = sum(float(item["top"]) for item in lines[-1]) / len(lines[-1])
37
+ if abs(float(word["top"]) - current_top) <= line_tolerance:
38
+ lines[-1].append(word)
39
+ else:
40
+ lines.append([word])
41
+
42
+ text_lines = [
43
+ " ".join(item["text"] for item in sorted(line, key=lambda item: item["x0"]))
44
+ for line in lines
45
+ ]
46
+ return "\n".join(line for line in text_lines if line.strip())
47
+
48
+
49
+ def _table_to_markdown(rows: List[List[Any]]) -> str:
50
+ """Serialize extracted table rows into Markdown for retrieval."""
51
+ cleaned_rows = [
52
+ ["" if cell is None else str(cell).replace("\n", " ").strip() for cell in row]
53
+ for row in rows
54
+ if row and any(cell is not None and str(cell).strip() for cell in row)
55
+ ]
56
+ if not cleaned_rows:
57
+ return ""
58
+
59
+ width = max(len(row) for row in cleaned_rows)
60
+ normalized = [row + [""] * (width - len(row)) for row in cleaned_rows]
61
+
62
+ def fmt(row: List[str]) -> str:
63
+ return "| " + " | ".join(cell.replace("|", "\\|") for cell in row) + " |"
64
+
65
+ header = normalized[0]
66
+ separator = ["---"] * width
67
+ body = normalized[1:]
68
+ return "\n".join([fmt(header), fmt(separator), *[fmt(row) for row in body]])
69
+
70
+
71
def extract_pdf(filepath: str) -> List[Dict[str, Any]]:
    """Extract PDF content, preferring the table-aware extractor.

    Delegates to ``extract_pdf_with_tables`` and, if that raises
    ``ImportError``, falls back to ``extract_pdf_with_pymupdf``. Any other
    exception propagates to the caller.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        The list of extracted entries produced by whichever extractor ran.
    """
    try:
        pages = extract_pdf_with_tables(filepath)
    except ImportError:
        # Table-aware path unavailable (e.g. pdfplumber is not installed).
        pages = extract_pdf_with_pymupdf(filepath)
    return pages
77
+
78
+
79
+ def extract_pdf_with_pymupdf(filepath: str) -> List[Dict[str, Any]]:
80
+ """Fallback PDF extraction with page numbers using PyMuPDF."""
81
  doc = fitz.open(filepath)
82
  pages = []
83
 
 
87
  pages.append({
88
  "text": text,
89
  "page": page_num + 1,
90
+ "chunk_type": "text",
91
  })
92
 
93
  doc.close()
94
  return pages
95
 
96
 
97
def extract_pdf_with_tables(filepath: str) -> List[Dict[str, Any]]:
    """Extract per-page paragraph text and tables from a PDF via pdfplumber.

    For every page, tables are located with ``page.find_tables()``. Words
    whose centre falls inside any table bbox are excluded from the paragraph
    text, and each table is emitted as its own Markdown entry carrying its
    bbox (JSON-encoded, rounded to 2 decimals) and its index on the page.

    Args:
        filepath: Path of the PDF file to read.

    Returns:
        A list of entries. Text entries have ``text``, ``page`` and
        ``chunk_type == "text"``; table entries additionally have ``bbox``
        and ``table_index`` with ``chunk_type == "table"``. Pages are
        numbered from 1.

    Raises:
        ImportError: If pdfplumber cannot be imported.
    """
    import pdfplumber

    entries: List[Dict[str, Any]] = []

    with pdfplumber.open(filepath) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            tables = page.find_tables()
            table_bboxes = [table.bbox for table in tables]

            # Keep only the words that sit outside every detected table.
            outside_words = []
            for word in page.extract_words() or []:
                for bbox in table_bboxes:
                    if _is_word_inside_bbox(word, bbox):
                        break
                else:
                    outside_words.append(word)

            paragraph_text = _words_to_text(outside_words)
            if paragraph_text.strip():
                entries.append({
                    "text": paragraph_text,
                    "page": page_num,
                    "chunk_type": "text",
                })

            for table_index, table in enumerate(tables):
                markdown = _table_to_markdown(table.extract() or [])
                if not markdown.strip():
                    # Nothing usable was extracted for this table.
                    continue
                entries.append({
                    "text": markdown,
                    "page": page_num,
                    "chunk_type": "table",
                    "bbox": json.dumps([round(float(value), 2) for value in table.bbox]),
                    "table_index": table_index,
                })

    return entries
134
+
135
+
136
  def extract_pdf_images(filepath: str) -> List[Dict[str, Any]]:
137
  """Extract images from a PDF and return list of dicts with image bytes and page number.
138
 
 
214
  for page_data in pages:
215
  text = page_data["text"]
216
  page_num = page_data["page"]
217
+ chunk_type = page_data.get("chunk_type", "text")
218
+
219
+ if chunk_type == "table":
220
+ all_chunks.append({
221
+ "text": text.strip(),
222
+ "page": page_num,
223
+ "chunk_index": chunk_index,
224
+ "chunk_type": "table",
225
+ "bbox": page_data.get("bbox", ""),
226
+ "table_index": page_data.get("table_index", 0),
227
+ })
228
+ chunk_index += 1
229
+ continue
230
 
231
  # Split this page's text
232
  splits = splitter.split_text(text)
 
237
  "text": split_text.strip(),
238
  "page": page_num,
239
  "chunk_index": chunk_index,
240
+ "chunk_type": chunk_type,
241
  })
242
  chunk_index += 1
243
 
backend/app/rag/vectorstore.py CHANGED
@@ -84,6 +84,9 @@ def store_chunks(
84
  "document_id": document_id,
85
  "page": chunk["page"],
86
  "chunk_index": chunk["chunk_index"],
 
 
 
87
  # Indicate whether this chunk was originally an image and include a short caption
88
  **({"is_image": True, "image_caption": chunk.get("image_caption", "")}
89
  if chunk.get("is_image") else {}),
@@ -162,6 +165,8 @@ def query_chunks(
162
  "filename": metadata.get("filename", ""),
163
  "document_id": metadata.get("document_id", ""),
164
  "page": metadata.get("page", 1),
 
 
165
  "score": round(similarity, 4),
166
  })
167
 
 
84
  "document_id": document_id,
85
  "page": chunk["page"],
86
  "chunk_index": chunk["chunk_index"],
87
+ "chunk_type": chunk.get("chunk_type", "text"),
88
+ **({"bbox": chunk.get("bbox", "")} if chunk.get("bbox") else {}),
89
+ **({"table_index": chunk.get("table_index", 0)} if chunk.get("chunk_type") == "table" else {}),
90
  # Indicate whether this chunk was originally an image and include a short caption
91
  **({"is_image": True, "image_caption": chunk.get("image_caption", "")}
92
  if chunk.get("is_image") else {}),
 
165
  "filename": metadata.get("filename", ""),
166
  "document_id": metadata.get("document_id", ""),
167
  "page": metadata.get("page", 1),
168
+ "chunk_type": metadata.get("chunk_type", "text"),
169
+ "bbox": metadata.get("bbox", ""),
170
  "score": round(similarity, 4),
171
  })
172
 
backend/requirements.txt CHANGED
@@ -25,6 +25,7 @@ httpx
25
 
26
  # Document Processing
27
  PyMuPDF
 
28
  python-docx
29
 
30
  # LangChain & RAG
 
25
 
26
  # Document Processing
27
  PyMuPDF
28
+ pdfplumber
29
  python-docx
30
 
31
  # LangChain & RAG
backend/tests/test_chunker.py CHANGED
@@ -1,7 +1,10 @@
1
  from pathlib import Path
 
 
2
 
3
  import pytest
4
 
 
5
  from app.rag.chunker import chunk_document, get_page_count
6
 
7
 
@@ -36,3 +39,49 @@ def test_get_page_count_for_txt_returns_one(tmp_path):
36
  file_path.write_text("hello", encoding="utf-8")
37
 
38
  assert get_page_count(str(file_path)) == 1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from pathlib import Path
2
+ import sys
3
+ import types
4
 
5
  import pytest
6
 
7
+ from app.rag import chunker
8
  from app.rag.chunker import chunk_document, get_page_count
9
 
10
 
 
39
  file_path.write_text("hello", encoding="utf-8")
40
 
41
  assert get_page_count(str(file_path)) == 1
42
+
43
+
44
def test_pdf_table_detection_separates_table_from_paragraph(monkeypatch):
    """Table words leave the paragraph chunk and come back as a Markdown table chunk."""

    def make_word(text, x0, x1, top, bottom):
        return {"text": text, "x0": x0, "x1": x1, "top": top, "bottom": bottom}

    class StubTable:
        bbox = (40, 90, 300, 160)

        def extract(self):
            return [["Name", "Amount"], ["Alpha", "$10"]]

    class StubPage:
        def find_tables(self):
            return [StubTable()]

        def extract_words(self):
            return [
                # Paragraph line above the table bbox.
                make_word("Intro", 40, 70, 20, 30),
                make_word("paragraph", 75, 140, 20, 30),
                # Words located inside the table bbox.
                make_word("Name", 45, 80, 100, 110),
                make_word("Amount", 160, 220, 100, 110),
                make_word("Alpha", 45, 85, 125, 135),
                make_word("$10", 160, 185, 125, 135),
            ]

    class StubPdf:
        pages = [StubPage()]

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, traceback):
            return False

    def open_stub(_filepath):
        return StubPdf()

    # Swap in a stand-in pdfplumber module so no real PDF is needed.
    monkeypatch.setitem(sys.modules, "pdfplumber", types.SimpleNamespace(open=open_stub))
    monkeypatch.setattr(chunker, "extract_pdf_images", lambda _filepath: [])

    chunks = chunk_document("report.pdf")

    assert len(chunks) == 2
    text_chunk, table_chunk = chunks
    assert text_chunk["chunk_type"] == "text"
    assert text_chunk["text"] == "Intro paragraph"
    assert "Name" not in text_chunk["text"]
    assert table_chunk["chunk_type"] == "table"
    assert table_chunk["bbox"] == "[40.0, 90.0, 300.0, 160.0]"
    assert "| Name | Amount |" in table_chunk["text"]
    assert "| Alpha | $10 |" in table_chunk["text"]
requirements.txt CHANGED
@@ -1,6 +1,7 @@
1
  flask
2
  python-dotenv
3
  pymupdf
 
4
  flask-login
5
  pymongo
6
  werkzeug
@@ -12,4 +13,4 @@ requests-oauthlib
12
  google-genai
13
  cryptography
14
  gunicorn
15
- pinecone
 
1
  flask
2
  python-dotenv
3
  pymupdf
4
+ pdfplumber
5
  flask-login
6
  pymongo
7
  werkzeug
 
13
  google-genai
14
  cryptography
15
  gunicorn
16
+ pinecone