Spaces:

murkasad
/

DeepLearningRAGchatbot

Sleeping

Upload 10 files

22cff0b verified about 1 month ago

920 Bytes

	# Extract the text content of the PDF book.
	import pdfplumber
	import logging

	# Module-level logger, named after this module via __name__.
	logger = logging.getLogger(__name__)

	def load_pdf_text(pdf_path: str) -> str:
	    """Extract the text of every page of a PDF into one string.

	    Each page that yields text contributes that text followed by a single
	    space. Pages that yield no text are skipped and a warning is logged.

	    Args:
	        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

	    Returns:
	        The page texts concatenated in page order, each followed by a space.

	    Raises:
	        FileNotFoundError: Logged and re-raised if raised while loading.
	        ValueError: If no page produced any non-whitespace text.
	        Exception: Any other error is logged with its traceback and
	            re-raised unchanged.
	    """
	    try:
	        page_texts = []

	        with pdfplumber.open(pdf_path) as pdf:
	            # start=1 only makes the page numbers in the log messages
	            # 1-based; it does not skip any page (title page included).
	            for page_num, page in enumerate(pdf.pages, start=1):
	                text = page.extract_text()

	                if text:
	                    page_texts.append(text)
	                else:
	                    logger.warning("No text found on page %d", page_num)

	        # Build the corpus once instead of growing a string with += per
	        # page; every page text keeps its trailing space separator.
	        corpus = "".join(f"{text} " for text in page_texts)

	        if not corpus.strip():
	            raise ValueError("Empty PDF content")

	        logger.info("PDF loaded successfully")
	        return corpus

	    except FileNotFoundError:
	        # Include the path so the log says which file was missing.
	        logger.error("PDF file not found: %s", pdf_path)
	        raise

	    except Exception as e:
	        # logger.exception records the traceback; the caller still gets
	        # the original exception.
	        logger.exception("Error loading PDF: %s", e)
	        raise