# LunarTech / src / pdf_processor.py
# (Hugging Face upload metadata: vishalkatheriya — "Upload 14 files", commit 24773d4 verified)
"""PDF text extraction and chunking for RAG."""
import re
from pathlib import Path
from typing import List
import pdfplumber
from pypdf import PdfReader
from config import CHUNK_OVERLAP, CHUNK_SIZE
def extract_text_from_pdf(pdf_path: str | Path) -> str:
    """Extract text from a PDF file and return it as one whitespace-normalized string.

    Tries pdfplumber first (better at tables/layout); on any pdfplumber failure,
    falls back to pypdf for the whole document.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The extracted text with all runs of whitespace collapsed to single
        spaces (note: this removes newlines as well).

    Raises:
        FileNotFoundError: If the file does not exist.
    """
    path = Path(pdf_path)
    if not path.exists():
        raise FileNotFoundError(f"PDF not found: {path}")

    text_parts: List[str] = []
    try:
        with pdfplumber.open(path) as pdf:
            for page in pdf.pages:
                t = page.extract_text()
                if t:
                    text_parts.append(t)
    except Exception:
        # Fallback to pypdf. Discard any pages pdfplumber managed to extract
        # before failing — otherwise those pages would appear twice.
        text_parts = []
        reader = PdfReader(path)
        for page in reader.pages:
            t = page.extract_text()
            if t:
                text_parts.append(t)

    raw = "\n\n".join(text_parts)
    # Normalize whitespace (collapses newlines too).
    return re.sub(r"\s+", " ", raw).strip()
def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = CHUNK_OVERLAP,
) -> List[dict]:
    """
    Split text into overlapping chunks for embedding.

    Chunks are at most ``chunk_size`` characters; where possible each chunk is
    trimmed back to the last sentence (". ") or newline boundary, provided
    that boundary falls past the halfway point of the chunk. Consecutive
    chunks overlap by roughly ``overlap`` characters.

    Args:
        text: Input text (leading/trailing whitespace is stripped).
        chunk_size: Maximum characters per chunk.
        overlap: Target character overlap between consecutive chunks.

    Returns:
        List of dicts with 'text' and 'metadata' (chunk_index). Empty list
        for empty/whitespace-only input.
    """
    if not text or not text.strip():
        return []

    chunks: List[dict] = []
    start = 0
    index = 0
    text = text.strip()

    while start < len(text):
        end = start + chunk_size
        chunk = text[start:end]

        # Try to break at a sentence or newline boundary, but only if that
        # boundary is past the midpoint (avoids degenerate tiny chunks).
        if end < len(text):
            last_period = chunk.rfind(". ")
            last_newline = chunk.rfind("\n")
            break_at = max(last_period, last_newline)
            if break_at > chunk_size // 2:
                chunk = chunk[: break_at + 1]
                end = start + break_at + 1

        chunk = chunk.strip()
        if chunk:
            chunks.append({
                "text": chunk,
                "metadata": {"chunk_index": index},
            })
            index += 1

        if end < len(text):
            # Guarantee forward progress: if overlap >= the effective chunk
            # length (e.g. overlap >= chunk_size, or boundary-snapping shrank
            # `end`), `end - overlap` would not advance and the loop would
            # spin forever. Clamp to at least start + 1.
            start = max(end - overlap, start + 1)
        else:
            start = len(text)

    return chunks
def process_pdf(pdf_path: str | Path, source_name: str | None = None) -> List[dict]:
    """
    Extract text from a PDF and return embedding-ready chunks tagged with
    source metadata.

    Args:
        pdf_path: Path to the PDF file.
        source_name: Optional label (e.g. filename) stored in each chunk's
            metadata; defaults to the file's basename.

    Returns:
        List of chunk dicts, each with a 'metadata.source' entry added.
    """
    pdf_file = Path(pdf_path)
    # `or` (not an explicit None check) so an empty-string label also falls
    # back to the filename, matching the original behavior.
    label = source_name or pdf_file.name

    extracted = extract_text_from_pdf(pdf_file)
    pieces = chunk_text(extracted)

    for piece in pieces:
        piece["metadata"]["source"] = label
    return pieces