import fitz  # PyMuPDF


def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract plain text from a PDF given its raw bytes.

    The text of every page is extracted in plain-text mode, the pages are
    joined with newlines, and leading/trailing whitespace is stripped from
    the combined result.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The text of all pages as a single string.

    Raises:
        ValueError: If the PDF yields no text (e.g. a scanned, image-only
            PDF).
    """
    doc = fitz.open(stream=file_bytes, filetype="pdf")
    try:
        # "text" selects plain-text output for each page.
        pages_text = [page.get_text("text") for page in doc]
    finally:
        # Release the document explicitly instead of waiting for garbage
        # collection, including when extraction raises part-way through.
        doc.close()

    full_text = "\n".join(pages_text).strip()
    if not full_text:
        raise ValueError(
            "No text could be extracted from the PDF. "
            "It may be a scanned image. Please upload a text-based PDF."
        )
    return full_text