# resume-optimizer-api/app/utils/pdf_parser.py
# Commit 90e6570 (JermaineAI): Update backend logic
import io
import logging

import fitz  # PyMuPDF
import pdfplumber

from app.utils.common import clean_text
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract text from an in-memory PDF using two extractors in turn.

    pdfplumber is tried first with ``layout=True``. Its result is used only
    when it contains more than 50 non-whitespace characters. If it yields
    less than that, or raises, PyMuPDF (``fitz``) is used instead.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The extracted text passed through ``clean_text``, or ``""`` when
        the PyMuPDF fallback also fails. Failures are logged, never raised.
    """
    logger = logging.getLogger(__name__)
    min_meaningful_chars = 50

    # METHOD A: pdfplumber (preferred for column / LaTeX-style layouts).
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            # layout=True asks pdfplumber to mimic the page's visual layout.
            # Pages must be read while the PDF is still open.
            page_texts = [
                page.extract_text(layout=True) for page in pdf.pages
            ]
        full_text = "\n".join(text for text in page_texts if text)
        # Layout mode pads its output with whitespace, so only
        # non-whitespace characters count toward "meaningful" text.
        if len("".join(full_text.split())) > min_meaningful_chars:
            return clean_text(full_text)
    except Exception:
        # Best-effort boundary: any pdfplumber failure triggers the fallback.
        logger.warning(
            "pdfplumber failed, falling back to fitz", exc_info=True
        )

    # METHOD B: PyMuPDF fallback.
    try:
        # The context manager closes the document even if a page fails.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            full_text = "\n".join(
                page.get_text("text", sort=True) for page in doc
            )
        return clean_text(full_text)
    except Exception:
        # Keep the "empty string on failure" contract, but leave a trace
        # instead of swallowing the error silently.
        logger.exception("fitz extraction failed, returning empty text")
        return ""