Spaces:
Running
Running
| import fitz # PyMuPDF | |
| import pdfplumber | |
| import io | |
| from app.utils.common import clean_text | |
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract the text of a PDF supplied as raw bytes.

    Hybrid extraction: pdfplumber is tried first (better for
    layout/columns); if it raises or does not yield enough text, the
    function falls back to PyMuPDF (``fitz``).

    Args:
        file_bytes: The complete contents of the PDF file.

    Returns:
        The extracted text after being passed through ``clean_text``, or an
        empty string if the PyMuPDF fallback also fails.
    """
    # pdfplumber output is only accepted when it holds more than this many
    # non-whitespace characters; otherwise the PyMuPDF fallback is used.
    min_meaningful_chars = 50

    # METHOD A: pdfplumber (best for LaTeX / columns)
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            # layout=True tries to mimic the visual layout of the page.
            page_texts = [page.extract_text(layout=True) for page in pdf.pages]
        full_text = "\n".join(text for text in page_texts if text)
        # Count non-whitespace characters only: layout mode can pad the
        # output with spaces/newlines, which would otherwise let a nearly
        # empty page pass the "meaningful text" threshold.
        if len("".join(full_text.split())) > min_meaningful_chars:
            return clean_text(full_text)
    except Exception as e:
        # Deliberate best-effort: any pdfplumber failure triggers the fallback.
        print(f"pdfplumber failed: {e}, falling back to fitz")

    # METHOD B: PyMuPDF (fallback)
    try:
        # The context manager closes the document even if extraction raises.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            full_text = "\n".join(
                page.get_text("text", sort=True) for page in doc
            )
        return clean_text(full_text)
    except Exception as e:
        # Both extractors failed: report it instead of swallowing silently,
        # but keep the original contract of returning an empty string.
        print(f"fitz failed: {e}, returning empty text")
        return ""