""" PDF Processing Module Handles PDF file upload, text extraction, and preprocessing. """ import PyPDF2 import io import re from typing import Optional, List import streamlit as st class PDFProcessor: """Class to handle PDF file processing and text extraction""" def __init__(self): self.max_file_size = 10 * 1024 * 1024 # 10MB limit def validate_pdf(self, uploaded_file) -> bool: """ Validate uploaded PDF file Args: uploaded_file: Streamlit uploaded file object Returns: bool: True if valid, False otherwise """ # Check file size if uploaded_file.size > self.max_file_size: st.error(f"File size ({uploaded_file.size / 1024 / 1024:.1f}MB) exceeds limit (10MB)") return False # Check file type if uploaded_file.type != "application/pdf": st.error("Please upload a valid PDF file") return False return True def extract_text_from_pdf(self, uploaded_file) -> Optional[str]: """ Extract text content from uploaded PDF file Args: uploaded_file: Streamlit uploaded file object Returns: str: Extracted text content or None if extraction fails """ try: # Reset file pointer uploaded_file.seek(0) # Create a PDF reader object pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read())) # Check if PDF is encrypted if pdf_reader.is_encrypted: st.error("❌ Cannot process encrypted PDF files. Please upload an unencrypted PDF.") return None # Check number of pages num_pages = len(pdf_reader.pages) if num_pages == 0: st.error("❌ PDF file appears to be empty or corrupted.") return None if num_pages > 100: st.warning(f"⚠️ Large PDF detected ({num_pages} pages). Processing may take longer.") # Extract text from all pages text_content = "" failed_pages = [] for page_num, page in enumerate(pdf_reader.pages): try: page_text = page.extract_text() if page_text.strip(): # Only add non-empty pages text_content += page_text + "\n" except Exception as e: failed_pages.append(page_num + 1) continue # Report failed pages if failed_pages: if len(failed_pages) < 5: st.warning(f"⚠️ Could not extract text from pages: {', '.join(map(str, failed_pages))}") else: st.warning(f"⚠️ Could not extract text from {len(failed_pages)} pages") if not text_content.strip(): st.error("❌ No readable text content found in the PDF file. The PDF might contain only images or scanned content.") return None # Check if extracted text is too short if len(text_content.strip()) < 100: st.warning("⚠️ Very little text was extracted. The PDF might contain mostly images or have formatting issues.") return text_content except PyPDF2.errors.PdfReadError as e: st.error(f"❌ Invalid or corrupted PDF file: {str(e)}") return None except MemoryError: st.error("❌ PDF file is too large to process. Please try a smaller file.") return None except Exception as e: st.error(f"❌ Unexpected error processing PDF file: {str(e)}") return None def preprocess_text(self, text: str) -> str: """ Clean and preprocess extracted text Args: text: Raw extracted text Returns: str: Cleaned and preprocessed text """ if not text: return "" # Remove excessive whitespace and newlines text = re.sub(r'\n+', '\n', text) text = re.sub(r'\s+', ' ', text) # Remove special characters that might interfere with processing text = re.sub(r'[^\w\s\.\,\!\?\;\:\-\(\)]', ' ', text) # Remove extra spaces text = ' '.join(text.split()) return text.strip() def process_pdf(self, uploaded_file) -> Optional[str]: """ Complete PDF processing pipeline Args: uploaded_file: Streamlit uploaded file object Returns: str: Processed text content or None if processing fails """ if not self.validate_pdf(uploaded_file): return None # Extract text raw_text = self.extract_text_from_pdf(uploaded_file) if raw_text is None: return None # Preprocess text processed_text = self.preprocess_text(raw_text) if len(processed_text) < 50: st.warning("The extracted text is very short. Please check if the PDF contains readable text.") return processed_text