"""PDF text extraction and chunking for RAG.""" import re from pathlib import Path from typing import List import pdfplumber from pypdf import PdfReader from config import CHUNK_OVERLAP, CHUNK_SIZE def extract_text_from_pdf(pdf_path: str | Path) -> str: """Extract text from a PDF using pdfplumber (better for tables) with pypdf fallback.""" path = Path(pdf_path) if not path.exists(): raise FileNotFoundError(f"PDF not found: {path}") text_parts: List[str] = [] try: with pdfplumber.open(path) as pdf: for page in pdf.pages: t = page.extract_text() if t: text_parts.append(t) except Exception: # Fallback to pypdf reader = PdfReader(path) for page in reader.pages: t = page.extract_text() if t: text_parts.append(t) raw = "\n\n".join(text_parts) # Normalize whitespace return re.sub(r"\s+", " ", raw).strip() def chunk_text( text: str, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP, ) -> List[dict]: """ Split text into overlapping chunks for embedding. Returns list of dicts with 'text' and 'metadata' (source, chunk_index). """ if not text or not text.strip(): return [] chunks: List[dict] = [] start = 0 index = 0 text = text.strip() while start < len(text): end = start + chunk_size chunk = text[start:end] # Try to break at sentence or word boundary if end < len(text): last_period = chunk.rfind(". ") last_newline = chunk.rfind("\n") break_at = max(last_period, last_newline) if break_at > chunk_size // 2: chunk = chunk[: break_at + 1] end = start + break_at + 1 chunk = chunk.strip() if chunk: chunks.append({ "text": chunk, "metadata": {"chunk_index": index}, }) index += 1 start = end - overlap if end < len(text) else len(text) return chunks def process_pdf(pdf_path: str | Path, source_name: str | None = None) -> List[dict]: """ Extract text from PDF and return chunks with source metadata. source_name: optional label (e.g. filename) for metadata. """ path = Path(pdf_path) source_name = source_name or path.name text = extract_text_from_pdf(path) chunks = chunk_text(text) for c in chunks: c["metadata"]["source"] = source_name return chunks