Spaces:
Runtime error
Runtime error
File size: 2,583 Bytes
"""PDF text extraction and chunking for RAG."""
import re
from pathlib import Path
from typing import List
import pdfplumber
from pypdf import PdfReader
from config import CHUNK_OVERLAP, CHUNK_SIZE
def extract_text_from_pdf(pdf_path: str | Path) -> str:
"""Extract text from a PDF using pdfplumber (better for tables) with pypdf fallback."""
path = Path(pdf_path)
if not path.exists():
raise FileNotFoundError(f"PDF not found: {path}")
text_parts: List[str] = []
try:
with pdfplumber.open(path) as pdf:
for page in pdf.pages:
t = page.extract_text()
if t:
text_parts.append(t)
except Exception:
# Fallback to pypdf
reader = PdfReader(path)
for page in reader.pages:
t = page.extract_text()
if t:
text_parts.append(t)
raw = "\n\n".join(text_parts)
# Normalize whitespace
return re.sub(r"\s+", " ", raw).strip()
def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = CHUNK_OVERLAP,
) -> List[dict]:
    """Split text into overlapping chunks for embedding.

    Each chunk is at most ``chunk_size`` characters. When a chunk is not the
    last one, it is cut just after the last ``". "`` or newline found in it,
    provided that boundary lies past the midpoint of the window; otherwise
    the full window is used. The next chunk starts ``overlap`` characters
    before the end of the previous one.

    Args:
        text: The text to split. Leading/trailing whitespace is ignored.
        chunk_size: Maximum number of characters per chunk; must be positive.
        overlap: Number of characters shared between consecutive chunks.

    Returns:
        A list of dicts, each with ``"text"`` (the stripped chunk) and
        ``"metadata"`` (a dict holding ``"chunk_index"``, numbered from 0
        over the non-empty chunks). Empty list for empty or blank input.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        # A non-positive window can never advance through the text.
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not text or not text.strip():
        return []

    text = text.strip()
    total = len(text)
    chunks: List[dict] = []
    start = 0
    while start < total:
        end = start + chunk_size
        chunk = text[start:end]
        # Try to break at sentence or line boundary (never for the last chunk).
        if end < total:
            break_at = max(chunk.rfind(". "), chunk.rfind("\n"))
            if break_at > chunk_size // 2:
                chunk = chunk[: break_at + 1]
                end = start + break_at + 1
        chunk = chunk.strip()
        if chunk:
            chunks.append({
                "text": chunk,
                # Index counts only the chunks actually emitted.
                "metadata": {"chunk_index": len(chunks)},
            })
        if end >= total:
            break
        # Always move forward by at least one character: without this guard an
        # overlap that is >= the (possibly shortened) chunk length would stall
        # the loop or push ``start`` backwards/negative.
        start = max(end - overlap, start + 1)
    return chunks
def process_pdf(pdf_path: str | Path, source_name: str | None = None) -> List[dict]:
    """Turn a PDF into a list of text chunks tagged with their source.

    The PDF's text is extracted and chunked with the default chunking
    settings, then every chunk's metadata gets a ``"source"`` entry.

    Args:
        pdf_path: Location of the PDF file.
        source_name: Optional label (e.g. filename) stored under
            ``metadata["source"]``. When omitted or empty, the file name of
            ``pdf_path`` is used.

    Returns:
        The chunk dicts produced by ``chunk_text``, each with ``"source"``
        added to its metadata.
    """
    pdf_file = Path(pdf_path)
    label = source_name if source_name else pdf_file.name
    pieces = chunk_text(extract_text_from_pdf(pdf_file))
    for piece in pieces:
        piece["metadata"]["source"] = label
    return pieces
|