File size: 1,391 Bytes
ea9ca44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d6cc86
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import pypdf
import pdfplumber
from src.preprocess.cleaner import postprocess_extracted_text

def _collect_page_text(pages) -> str:
    """
    Concatenate the extracted text of every page in ``pages``.

    Each page is asked for its text via ``extract_text()``; pages that
    yield nothing (``None`` or an empty string) are skipped, and every
    kept page is followed by a single newline.
    """
    page_texts = (page.extract_text() for page in pages)
    return "".join(page_text + "\n" for page_text in page_texts if page_text)


def parse_pdf(path: str) -> str:
    """
    Extract text from a PDF file.

    Tries pdfplumber first and falls back to pypdf when pdfplumber raises
    or produces only whitespace. Each extractor starts from an empty
    buffer, so a partial pdfplumber result is never mixed into the pypdf
    output.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The extracted text after ``postprocess_extracted_text`` has been
        applied to it.

    Raises:
        ValueError: If neither extractor yields any non-whitespace text.
            When an extractor raised, the most recent such exception is
            attached as ``__cause__``.
    """
    # Remember the most recent extractor failure so the final error can
    # point at the real cause (e.g. a missing or corrupt file).
    last_error = None

    # --- pdfplumber extraction ---
    try:
        with pdfplumber.open(path) as pdf:
            text = _collect_page_text(pdf.pages)
        if text.strip():
            return postprocess_extracted_text(text)
    except Exception as e:
        # Best effort: any failure in this attempt (including
        # postprocessing) is reported and the fallback is tried instead.
        last_error = e
        print(f"⚠️ pdfplumber failed for {path}: {e}")

    # --- fallback to pypdf ---
    try:
        with open(path, "rb") as f:
            reader = pypdf.PdfReader(f)
            # Fresh buffer: do not append to whatever pdfplumber managed
            # to extract before it failed.
            text = _collect_page_text(reader.pages)
        if text.strip():
            return postprocess_extracted_text(text)
    except Exception as e:
        last_error = e
        print(f"❌ pypdf also failed for {path}: {e}")

    raise ValueError("We couldn't extract any readable text from this PDF. If you generated this using 'Microsoft Print to PDF', please try using Chrome/Edge's native 'Save as PDF' option instead, as Microsoft Print to PDF often converts text into unreadable images.") from last_error