File size: 1,403 Bytes
0c6fb97
90e6570
 
0c6fb97
 
 
 
90e6570
 
0c6fb97
90e6570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0c6fb97
90e6570
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import fitz  # PyMuPDF
import pdfplumber
import io
from app.utils.common import clean_text

def extract_text_from_pdf(file_bytes: bytes) -> str:
    """
    Extract cleaned text from an in-memory PDF using two backends.

    Hybrid extraction: tries pdfplumber first (layout-aware, better for
    columns) and falls back to PyMuPDF when pdfplumber raises or yields
    too little text to be considered meaningful.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The extracted text passed through ``clean_text``, or an empty
        string when both backends fail.
    """
    # METHOD A: pdfplumber (Best for LaTeX / Columns)
    try:
        with pdfplumber.open(io.BytesIO(file_bytes)) as pdf:
            # layout=True tries to mimic the visual layout physically
            page_texts = [page.extract_text(layout=True) for page in pdf.pages]

        # Pages without extractable text yield a falsy value; skip them.
        full_text = "\n".join(text for text in page_texts if text)

        # Only trust pdfplumber when it found a meaningful amount of text
        # (more than 50 characters); otherwise fall through to PyMuPDF.
        if len(full_text) > 50:
            return clean_text(full_text)
    except Exception as e:
        # Best-effort: any failure here simply triggers the fallback.
        print(f"pdfplumber failed: {e}, falling back to fitz")

    # METHOD B: PyMuPDF (fallback)
    try:
        # The context manager closes the document even if a page raises,
        # so the underlying handle is not leaked.
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            page_texts = [page.get_text("text", sort=True) for page in doc]

        return clean_text("\n".join(page_texts))
    except Exception as e:
        # Report the failure instead of swallowing it silently; callers
        # still receive an empty string, as before.
        print(f"fitz failed: {e}, returning empty text")
        return ""