ishaq101's picture
clean init
478dec6
raw
history blame contribute delete
933 Bytes
# import fitz # PyMuPDF
# from io import BytesIO
# async def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
# """
# Extract text from PDF file in bytes
# Return plain text
# """
# text_chunks = []
# with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
# for page in doc:
# page_text = page.get_text()
# if page_text:
# text_chunks.append(page_text)
# return "\n".join(text_chunks).strip()
from io import BytesIO
from pypdf import PdfReader
async def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Return the plain text of a PDF supplied as in-memory bytes.

    The data is wrapped in a ``BytesIO`` and parsed with pypdf's
    ``PdfReader``. Every page is passed through ``extract_text()``; pages
    that produce an empty (falsy) result are dropped, and the remaining
    page texts are joined in page order with a single newline.

    Note: the function is declared ``async`` but contains no ``await``;
    parsing runs synchronously inside the coroutine.

    Args:
        pdf_bytes: Raw content of the PDF, as ``bytes`` or ``bytearray``.

    Returns:
        The newline-joined text of all pages that yielded text, or an
        empty string when no page yielded any.

    Raises:
        TypeError: If ``pdf_bytes`` is neither ``bytes`` nor ``bytearray``.
    """
    # Reject anything that is not a byte buffer before handing it to pypdf.
    if not isinstance(pdf_bytes, (bytes, bytearray)):
        raise TypeError("pdf_bytes must be bytes")

    document = PdfReader(BytesIO(pdf_bytes))
    per_page = (page.extract_text() for page in document.pages)
    # Skip pages with no extractable text so the output has no blank entries.
    return "\n".join(chunk for chunk in per_page if chunk)