import io
import logging

import fitz  # PyMuPDF
import pdfplumber

from app.utils.common import clean_text

logger = logging.getLogger(__name__)

# pdfplumber output must contain more than this many non-whitespace
# characters to be accepted; otherwise the PyMuPDF fallback is tried.
_MIN_MEANINGFUL_CHARS = 50


def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract text from a PDF using pdfplumber, falling back to PyMuPDF.

    pdfplumber is tried first with ``layout=True``. If it raises, or yields
    too little non-whitespace text, the PDF is re-read with PyMuPDF. The
    resulting text is passed through ``clean_text`` before being returned.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The cleaned extracted text, or ``""`` if the PyMuPDF fallback also
        fails. This function is best-effort and does not raise on
        extraction errors; failures are logged instead.
    """
    # METHOD A: pdfplumber (preferred for multi-column / LaTeX-style layouts).
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            # Pages must be read while the document is still open.
            page_texts = [page.extract_text(layout=True) for page in pdf.pages]
        full_text = "\n".join(text for text in page_texts if text)

        # layout=True pads the output with whitespace to mimic the visual
        # layout, so count only non-whitespace characters when deciding
        # whether the extraction produced meaningful text.
        if len("".join(full_text.split())) > _MIN_MEANINGFUL_CHARS:
            return clean_text(full_text)
    except Exception as exc:  # best-effort: any failure triggers the fallback
        logger.warning("pdfplumber failed: %s, falling back to fitz", exc)

    # METHOD B: PyMuPDF (fallback).
    try:
        # The context manager closes the document even if extraction raises.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            full_text = "\n".join(
                page.get_text("text", sort=True) for page in doc
            )
        return clean_text(full_text)
    except Exception:
        # Keep the "return empty string on failure" contract, but leave a
        # trace so failed extractions are diagnosable.
        logger.exception("PyMuPDF extraction failed; returning empty text")
        return ""