Spaces:
Sleeping
Sleeping
| # Extract the text content of a PDF book. | |
| import pdfplumber | |
| import logging | |
| logger = logging.getLogger(__name__) | |
def load_pdf_text(pdf_path: str) -> str:
    """Extract the text of every page in a PDF and return it as one string.

    Pages are read in order with ``page.extract_text()``. Each page that
    yields text contributes that text followed by a single space; pages
    that yield nothing are skipped and reported with a warning.

    Args:
        pdf_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        The concatenated page texts, each followed by one space.

    Raises:
        FileNotFoundError: Re-raised after an error is logged.
        ValueError: If the collected text is empty or whitespace-only.
        Exception: Any other failure is logged with its traceback and
            re-raised unchanged (this includes the ValueError above).
    """
    try:
        page_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            # start=1 only makes the page numbers in the log 1-based;
            # every page is processed, none are skipped.
            for page_num, page in enumerate(pdf.pages, start=1):
                text = page.extract_text()
                if text:
                    page_texts.append(text)
                else:
                    logger.warning("No text found on page %s", page_num)

        # Join once instead of growing a string inside the loop; the space
        # after every page (including the last) is kept so the returned
        # value is unchanged for existing callers.
        corpus = "".join(f"{text} " for text in page_texts)

        if not corpus.strip():
            raise ValueError("Empty PDF content")

        logger.info("PDF loaded successfully")
        return corpus
    except FileNotFoundError:
        logger.error("PDF file not found")
        raise
    except Exception as e:
        logger.exception("Error loading PDF: %s", e)
        raise