""" PDF Processing utilities for extracting and chunking text from PDF files """ import os from typing import List, Dict import PyPDF2 import pdfplumber try: from langchain_text_splitters import RecursiveCharacterTextSplitter except ImportError: from langchain.text_splitter import RecursiveCharacterTextSplitter from config.model_config import config class PDFProcessor: """Handle PDF text extraction and processing""" def __init__(self): self.text_splitter = RecursiveCharacterTextSplitter( chunk_size=config.CHUNK_SIZE, chunk_overlap=config.CHUNK_OVERLAP, length_function=len, separators=["\n\n", "\n", " ", ""] ) def extract_text_from_pdf(self, pdf_path: str, method: str = "pdfplumber") -> str: """ Extract text from PDF file Args: pdf_path: Path to PDF file method: Extraction method ('pypdf2' or 'pdfplumber') Returns: Extracted text as string """ text = "" try: if method == "pdfplumber": text = self._extract_with_pdfplumber(pdf_path) else: text = self._extract_with_pypdf2(pdf_path) except Exception as e: print(f"Error extracting text from {pdf_path}: {e}") # Fallback to alternative method if method == "pdfplumber": text = self._extract_with_pypdf2(pdf_path) else: text = self._extract_with_pdfplumber(pdf_path) return text def _extract_with_pypdf2(self, pdf_path: str) -> str: """Extract text using PyPDF2""" text = "" with open(pdf_path, 'rb') as file: pdf_reader = PyPDF2.PdfReader(file) for page in pdf_reader.pages: text += page.extract_text() + "\n" return text def _extract_with_pdfplumber(self, pdf_path: str) -> str: """Extract text using pdfplumber (better for complex PDFs)""" text = "" with pdfplumber.open(pdf_path) as pdf: for page in pdf.pages: page_text = page.extract_text() if page_text: text += page_text + "\n" return text def chunk_text(self, text: str) -> List[str]: """ Split text into chunks Args: text: Input text to chunk Returns: List of text chunks """ chunks = self.text_splitter.split_text(text) return chunks def extract_with_structure(self, pdf_path: str) -> Dict: """ Extract text with page and paragraph structure Args: pdf_path: Path to PDF file Returns: Dictionary with structured content including pages and paragraphs """ structured_content = { "pages": [], "paragraphs": [], "full_text": "" } try: with pdfplumber.open(pdf_path) as pdf: paragraph_id = 0 for page_num, page in enumerate(pdf.pages, start=1): page_text = page.extract_text() if not page_text: continue # Split into paragraphs (double newline or significant whitespace) raw_paragraphs = page_text.split('\n\n') page_paragraphs = [] for para_text in raw_paragraphs: para_text = para_text.strip() if len(para_text) > 20: # Ignore very short fragments paragraph_id += 1 paragraph_data = { "id": f"para_{paragraph_id}", "page": page_num, "text": para_text, "char_start": len(structured_content["full_text"]), "char_end": len(structured_content["full_text"]) + len(para_text) } page_paragraphs.append(paragraph_data) structured_content["paragraphs"].append(paragraph_data) structured_content["full_text"] += para_text + "\n\n" structured_content["pages"].append({ "page_num": page_num, "text": page_text, "paragraphs": page_paragraphs }) except Exception as e: print(f"Error extracting structured content: {e}") # Fallback to simple extraction text = self.extract_text_from_pdf(pdf_path) structured_content["full_text"] = text structured_content["paragraphs"] = [{ "id": "para_1", "page": 1, "text": text, "char_start": 0, "char_end": len(text) }] return structured_content def generate_html_preview(self, structured_content: Dict, filename: str) -> str: """ Generate HTML representation of PDF for viewer Args: structured_content: Structured content from extract_with_structure filename: Name of the PDF file Returns: HTML string """ html = f"""
{para['text']}
""" html += "