# LunarTech / src / pdf_processor.py
# (Hugging Face upload metadata: vishalkatheriya — "Upload 14 files", commit 24773d4 verified)
"""PDF text extraction and chunking for RAG."""
import re
from pathlib import Path
from typing import List
import pdfplumber
from pypdf import PdfReader
from config import CHUNK_OVERLAP, CHUNK_SIZE
def extract_text_from_pdf(pdf_path: str | Path) -> str:
    """Extract text from a PDF file and return it as one whitespace-normalized string.

    Tries pdfplumber first (better at tables/layout); on any pdfplumber failure,
    falls back to pypdf for the whole document.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The extracted text with all runs of whitespace collapsed to single
        spaces (note: this removes newlines as well).

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    path = Path(pdf_path)
    if not path.exists():
        raise FileNotFoundError(f"PDF not found: {path}")

    text_parts: List[str] = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                t = page.extract_text()
                if t:
                    text_parts.append(t)
    except Exception:
        # Fallback to pypdf. Discard any pages pdfplumber managed to extract
        # before failing — otherwise those pages would appear twice.
        text_parts = []
        reader = PdfReader(path)
        for page in reader.pages:
            t = page.extract_text()
            if t:
                text_parts.append(t)

    raw = "\n\n".join(text_parts)
    # Normalize whitespace (collapses newlines too).
    return re.sub(r"\s+", " ", raw).strip()
def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = CHUNK_OVERLAP,
) -> List[dict]:
    """
    Split text into overlapping chunks for embedding.

    Chunks are at most ``chunk_size`` characters; where possible each chunk is
    trimmed back to the last sentence (". ") or newline boundary, provided
    that boundary falls past the halfway point of the chunk. Consecutive
    chunks overlap by roughly ``overlap`` characters.

    Args:
        text: Input text (leading/trailing whitespace is stripped).
        chunk_size: Maximum characters per chunk.
        overlap: Target character overlap between consecutive chunks.

    Returns:
        List of dicts with 'text' and 'metadata' (chunk_index). Empty list
        for empty/whitespace-only input.
    """
    if not text or not text.strip():
        return []

    chunks: List[dict] = []
    start = 0
    index = 0
    text = text.strip()

    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]

        # Try to break at a sentence or newline boundary, but only if that
        # boundary is past the midpoint (avoids degenerate tiny chunks).
        if end < len(text):
            last_period = chunk.rfind(". ")
            last_newline = chunk.rfind("\n")
            break_at = max(last_period, last_newline)
            if break_at > chunk_size // 2:
                chunk = chunk[: break_at + 1]
                end = start + break_at + 1

        chunk = chunk.strip()
        if chunk:
            chunks.append({
                "text": chunk,
                "metadata": {"chunk_index": index},
            })
            index += 1

        if end < len(text):
            # Guarantee forward progress: if overlap >= the effective chunk
            # length (e.g. overlap >= chunk_size, or boundary-snapping shrank
            # `end`), `end - overlap` would not advance and the loop would
            # spin forever. Clamp to at least start + 1.
            start = max(end - overlap, start + 1)
        else:
            start = len(text)

    return chunks
def process_pdf(pdf_path: str | Path, source_name: str | None = None) -> List[dict]:
    """
    Extract text from a PDF and return embedding-ready chunks tagged with
    source metadata.

    Args:
        pdf_path: Path to the PDF file.
        source_name: Optional label (e.g. filename) stored in each chunk's
            metadata; defaults to the file's basename.

    Returns:
        List of chunk dicts, each with a 'metadata.source' entry added.
    """
    pdf_file = Path(pdf_path)
    # `or` (not an explicit None check) so an empty-string label also falls
    # back to the filename, matching the original behavior.
    label = source_name or pdf_file.name

    extracted = extract_text_from_pdf(pdf_file)
    pieces = chunk_text(extracted)

    for piece in pieces:
        piece["metadata"]["source"] = label
    return pieces