Siggmoid's picture
Deploy ATS Intelligence Engine to Hugging Face Space
d2b7a80
raw
history blame contribute delete
700 Bytes
import fitz # PyMuPDF
def extract_text_from_pdf(file_bytes: bytes) -> str:
    """Extract plain text from a PDF given its raw bytes.

    The text of every page is extracted in PyMuPDF's plain "text" mode,
    the pages are joined with newlines, and leading/trailing whitespace
    is stripped from the combined result.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The concatenated text of all pages as a single string.

    Raises:
        ValueError: If the PDF yields no text (e.g. a scanned,
            image-only PDF).
    """
    # Open via the context manager so the underlying document is closed
    # on every exit path; the original never released the handle.
    with fitz.open(stream=file_bytes, filetype="pdf") as doc:
        # "text" = plain text mode
        pages_text = [page.get_text("text") for page in doc]

    full_text = "\n".join(pages_text).strip()
    if not full_text:
        raise ValueError(
            "No text could be extracted from the PDF. "
            "It may be a scanned image. Please upload a text-based PDF."
        )
    return full_text