Spaces:
Sleeping
Sleeping
File size: 1,391 Bytes
ea9ca44 9d6cc86 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | import pypdf
import pdfplumber
from src.preprocess.cleaner import postprocess_extracted_text
def _collect_page_text(pages) -> str:
    """Concatenate the text of every page that yields any, newline-terminated."""
    chunks = []
    for page in pages:
        page_text = page.extract_text()
        # extract_text() may give back None or "" for a page; skip those.
        if page_text:
            chunks.append(page_text + "\n")
    return "".join(chunks)


def parse_pdf(path: str) -> str:
    """
    Extract text from a PDF file.

    Tries pdfplumber first and falls back to pypdf when pdfplumber raises
    or produces only whitespace. Each backend builds its text from scratch,
    so pages partially extracted by a failed pdfplumber run are never mixed
    into (and duplicated in) the pypdf result.

    Args:
        path: Filesystem path of the PDF to read.

    Returns:
        The extracted text after ``postprocess_extracted_text`` was applied.

    Raises:
        ValueError: If neither backend produced any non-whitespace text.
            Errors raised inside a backend attempt are printed, not
            propagated.
    """
    # --- pdfplumber extraction ---
    try:
        with pdfplumber.open(path) as pdf:
            text = _collect_page_text(pdf.pages)
        if text.strip():
            return postprocess_extracted_text(text)
    except Exception as e:
        # Best-effort: any failure here just means "try the next backend".
        print(f"⚠️ pdfplumber failed for {path}: {e}")

    # --- fallback to pypdf ---
    try:
        with open(path, "rb") as f:
            # Start from a fresh string; do not reuse pdfplumber's output.
            text = _collect_page_text(pypdf.PdfReader(f).pages)
        if text.strip():
            return postprocess_extracted_text(text)
    except Exception as e:
        print(f"❌ pypdf also failed for {path}: {e}")

    raise ValueError(
        "We couldn't extract any readable text from this PDF. "
        "If you generated this using 'Microsoft Print to PDF', "
        "please try using Chrome/Edge's native 'Save as PDF' option instead, "
        "as Microsoft Print to PDF often converts text into unreadable images."
    )
|