# iris_backend/backend/src/ingestion/pdf_reader.py
# sameer2026's picture
# fix: ats resume builder UI and backend extraction handling
# 9d6cc86
import pypdf
import pdfplumber
from src.preprocess.cleaner import postprocess_extracted_text
def parse_pdf(path: str) -> str:
    """
    Extract text from a PDF file.

    Tries pdfplumber first and falls back to pypdf when pdfplumber raises or
    produces only whitespace. Each backend collects its pages into its own
    buffer, so partial pdfplumber output never leaks into the pypdf result.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The extracted text, passed through ``postprocess_extracted_text``.

    Raises:
        ValueError: If neither backend yields any non-whitespace text
            (including when both backends fail with an exception).
    """
    # --- pdfplumber extraction ---
    try:
        with pdfplumber.open(path) as pdf:
            # Pages must be read while the document is still open.
            plumber_pages = [page.extract_text() for page in pdf.pages]
        # Skip pages with no text; terminate every kept page with a newline.
        plumber_text = "".join(f"{chunk}\n" for chunk in plumber_pages if chunk)
        if plumber_text.strip():
            return postprocess_extracted_text(plumber_text)
    except Exception as e:
        # Best-effort: report the failure and let the fallback try.
        print(f"⚠️ pdfplumber failed for {path}: {e}")

    # --- fallback to pypdf ---
    # Starts from a fresh buffer: anything pdfplumber managed to read before
    # failing (or whitespace-only output) must not be prepended here.
    try:
        with open(path, "rb") as f:
            reader = pypdf.PdfReader(f)
            pypdf_pages = [page.extract_text() for page in reader.pages]
        pypdf_text = "".join(f"{chunk}\n" for chunk in pypdf_pages if chunk)
        if pypdf_text.strip():
            return postprocess_extracted_text(pypdf_text)
    except Exception as e:
        print(f"❌ pypdf also failed for {path}: {e}")

    # Neither backend produced usable text (typically an image-only PDF).
    raise ValueError(
        "We couldn't extract any readable text from this PDF. "
        "If you generated this using 'Microsoft Print to PDF', "
        "please try using Chrome/Edge's native 'Save as PDF' option instead, "
        "as Microsoft Print to PDF often converts text into unreadable images."
    )