Spaces:
Running
Running
File size: 1,403 Bytes
0c6fb97 90e6570 0c6fb97 90e6570 0c6fb97 90e6570 0c6fb97 90e6570 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 | import fitz # PyMuPDF
import pdfplumber
import io
from app.utils.common import clean_text
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of a PDF, trying pdfplumber first and PyMuPDF second.

    pdfplumber is tried first with ``layout=True`` because it tends to keep
    columns and visual layout intact. If it raises, or finds too little
    visible text, PyMuPDF (``fitz``) is used instead. If PyMuPDF also yields
    nothing, whatever short text pdfplumber did find is returned rather than
    being thrown away.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The extracted text passed through ``clean_text``, or ``""`` when
        neither backend could extract anything.
    """
    # pdfplumber output must contain more than this many non-whitespace
    # characters to be accepted without consulting the fallback.
    min_visible_chars = 50

    # METHOD A: pdfplumber (Best for LaTeX / Columns)
    plumber_text = ""
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            # layout=True tries to mimic the visual layout physically
            page_texts = [page.extract_text(layout=True) for page in pdf.pages]
        plumber_text = "\n".join(text for text in page_texts if text)

        # layout=True pads its output with spaces and blank lines, so the
        # raw length overstates how much real text was found; count only
        # visible characters.
        visible_chars = sum(1 for ch in plumber_text if not ch.isspace())
        if visible_chars > min_visible_chars:
            return clean_text(plumber_text)
    except Exception as e:
        print(f"pdfplumber failed: {e}, falling back to fitz")

    # METHOD B: PyMuPDF (Fallback - Faster, robust against corrupted files)
    fitz_text = None  # None means PyMuPDF itself failed
    try:
        # The context manager closes the document even if a page raises.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            fitz_text = "\n".join(
                page.get_text("text", sort=True) for page in doc
            )
    except Exception as e:
        print(f"fitz failed: {e}")

    try:
        # Prefer the PyMuPDF result unless it is blank while pdfplumber
        # still holds some (short) real text.
        if fitz_text is not None and (
            fitz_text.strip() or not plumber_text.strip()
        ):
            return clean_text(fitz_text)
        if plumber_text.strip():
            return clean_text(plumber_text)
    except Exception as e:
        print(f"text cleanup failed: {e}")
    return ""