Spaces:

Seyfelislem
/

pdf_chat_assistant

Sleeping

pdf_chat_assistant / src /services /pdf_processor.py

Seif-aber

implemented pdf chat assistant with gemini and RAG

edac567 4 months ago

2.87 kB

	import sys
	import os
	from typing import List, Dict, Optional

	sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../')))

	try:
	from PyPDF2 import PdfReader
	except ImportError:
	try:
	from pypdf import PdfReader
	except ImportError:
	print("Error: PDF reading library not found. Please install PyPDF2 or pypdf.")
	PdfReader = None

	from src.utils.chunking import chunk_pdf_text, clean_text
	from config.settings import Config

	class PDFProcessor:
	    """Process PDFs into cleaned text chunks.

	    Extraction is best-effort: read failures are printed and surfaced to the
	    caller as empty results ("" / [] / {}) instead of being raised.
	    """

	    # Chunks whose stripped length is not greater than this are dropped
	    # by process_pdf.
	    MIN_CHUNK_CHARS = 50

	    def __init__(self, chunk_size: Optional[int] = None, overlap: Optional[int] = None) -> None:
	        """
	        Initialize processor with chunk parameters.

	        Args:
	            chunk_size: Characters per chunk (defaults to config).
	            overlap: Overlap between chunks (defaults to config). An
	                explicit 0 is honored and is not replaced by the default.
	        """
	        # `or` keeps the original fallback: None and 0 both select the
	        # configured chunk size.
	        self.chunk_size = chunk_size or Config.CHUNK_SIZE
	        # 0 is a legitimate overlap value, so only None selects the default.
	        self.overlap = Config.CHUNK_OVERLAP if overlap is None else overlap

	    def process_pdf(self, file_path: str) -> List[str]:
	        """
	        Read PDF, extract text, clean, and chunk.

	        Args:
	            file_path: Path to PDF.

	        Returns:
	            List of chunk strings; empty when no text could be extracted.
	        """
	        raw = self._extract_text(file_path)
	        if not raw.strip():
	            # Nothing extractable (unreadable file or no text on any page).
	            return []
	        cleaned = clean_text(raw)
	        chunks = chunk_pdf_text(cleaned, self.chunk_size, self.overlap)
	        return [c for c in chunks if len(c.strip()) > self.MIN_CHUNK_CHARS]

	    def get_pdf_info(self, file_path: str) -> Dict:
	        """
	        Retrieve simple info (pages, metadata, encryption).

	        Args:
	            file_path: Path to PDF.

	        Returns:
	            Dict with "num_pages", "metadata" and "encrypted" keys, or an
	            empty dict when the file could not be read.
	        """
	        try:
	            reader = self._open_reader(file_path)
	            return {
	                "num_pages": len(reader.pages),
	                "metadata": reader.metadata,
	                "encrypted": reader.is_encrypted,
	            }
	        except Exception as e:
	            # Best-effort: report the problem and hand back an empty result.
	            print(f"[PDFProcessor] Info error: {e}")
	            return {}

	    @staticmethod
	    def _open_reader(file_path: str):
	        """
	        Open a PDF reader for the given path.

	        Args:
	            file_path: Path to PDF.

	        Returns:
	            A PdfReader instance for the file.

	        Raises:
	            RuntimeError: If neither PyPDF2 nor pypdf could be imported
	                (the module-level fallback leaves PdfReader as None).
	        """
	        if PdfReader is None:
	            # Avoid the opaque "'NoneType' object is not callable" error.
	            raise RuntimeError("PDF reading library not found. Please install PyPDF2 or pypdf.")
	        return PdfReader(file_path)

	    def _extract_text(self, file_path: str) -> str:
	        """
	        Extract text from all pages.

	        Args:
	            file_path: Path to PDF.

	        Returns:
	            Concatenated text with page separators; "" if the file could not
	            be read. Pages that fail or yield only whitespace are skipped.
	        """
	        try:
	            reader = self._open_reader(file_path)
	            out: List[str] = []
	            for page_no, page in enumerate(reader.pages, start=1):
	                # A single bad page must not discard the rest of the document.
	                try:
	                    text = page.extract_text() or ""
	                    if text.strip():
	                        out.append(f"\n--- Page {page_no} ---\n{text}")
	                except Exception as pe:
	                    print(f"[PDFProcessor] Page {page_no} extraction failed: {pe}")
	            return "".join(out)
	        except Exception as e:
	            print(f"[PDFProcessor] Read error: {e}")
	            return ""