File size: 700 Bytes
d2b7a80 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 | import fitz # PyMuPDF
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract plain text from a PDF given its raw bytes.

    Every page is read in PyMuPDF's plain-text mode, the per-page strings
    are joined with a newline, and leading/trailing whitespace is stripped
    from the combined result.

    Args:
        file_bytes: The raw contents of the PDF file.

    Returns:
        The text of all pages as a single string.

    Raises:
        ValueError: If the PDF yields no text (e.g. a scanned,
            image-only PDF).
    """
    # Open via a context manager so the document is closed on every exit
    # path, including when a page fails to parse.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        # "text" selects plain-text extraction mode.
        full_text = "\n".join(page.get_text("text") for page in doc).strip()

    if not full_text:
        raise ValueError(
            "No text could be extracted from the PDF. "
            "It may be a scanned image. Please upload a text-based PDF."
        )
    return full_text