import pdfplumber
import pypdf

from src.preprocess.cleaner import postprocess_extracted_text


def _join_page_texts(pages) -> str:
    """Concatenate the text of every page that yields any, one newline per page.

    ``pages`` is any iterable of objects exposing ``extract_text()`` (both
    pdfplumber and pypdf pages do). Pages whose ``extract_text()`` returns a
    falsy value (``None`` or ``""``) are skipped.
    """
    page_texts = (page.extract_text() for page in pages)
    return "".join(f"{page_text}\n" for page_text in page_texts if page_text)


def parse_pdf(path: str) -> str:
    """
    Extract text from a PDF file.

    Tries pdfplumber first and falls back to pypdf when pdfplumber raises or
    yields only whitespace. Each backend starts from an empty buffer, so text
    partially collected by a failed pdfplumber attempt is never mixed into
    (and duplicated by) the pypdf result.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The extracted text after ``postprocess_extracted_text`` has been
        applied to it.

    Raises:
        ValueError: If neither backend produced any non-whitespace text,
            whether because both raised or because the PDF has no text layer.
    """
    # --- pdfplumber extraction ---
    # The broad ``except`` is deliberate: any failure in this backend should
    # only trigger the fallback, never abort the whole parse.
    try:
        with pdfplumber.open(path) as pdf:
            text = _join_page_texts(pdf.pages)
        if text.strip():
            return postprocess_extracted_text(text)
    except Exception as e:
        print(f"⚠️ pdfplumber failed for {path}: {e}")

    # --- fallback to pypdf ---
    # ``text`` is rebuilt from scratch here rather than appended to, so
    # nothing from the pdfplumber attempt leaks into the fallback output.
    try:
        with open(path, "rb") as f:
            reader = pypdf.PdfReader(f)
            # Extract while the file handle is still open.
            text = _join_page_texts(reader.pages)
        if text.strip():
            return postprocess_extracted_text(text)
    except Exception as e:
        print(f"❌ pypdf also failed for {path}: {e}")

    raise ValueError("We couldn't extract any readable text from this PDF. If you generated this using 'Microsoft Print to PDF', please try using Chrome/Edge's native 'Save as PDF' option instead, as Microsoft Print to PDF often converts text into unreadable images.")