# PDF text extraction helpers built on pypdf.
# NOTE: an earlier PyMuPDF (``fitz``) based implementation used to be kept
# here as commented-out code; it has been removed in favour of the pypdf
# version below.
import asyncio
from io import BytesIO

from pypdf import PdfReader


def _extract_text_sync(stream: BytesIO) -> str:
    """Read a PDF from *stream* and return the text of its pages.

    Each page's text is obtained with ``page.extract_text()``; pages that
    yield a falsy result (empty string or ``None``) are skipped, and the
    remaining page texts are joined with a single newline.
    """
    reader = PdfReader(stream)
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(text for text in page_texts if text)


async def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Extract plain text from a PDF document held in memory.

    The parsing itself is synchronous, so it is run in the event loop's
    default executor to avoid blocking other coroutines while a document
    is being processed.

    Args:
        pdf_bytes: Raw content of the PDF file (``bytes`` or ``bytearray``).

    Returns:
        The text of all pages that produced any text, joined with ``"\n"``.
        An empty string is returned when no page produced text.

    Raises:
        TypeError: If *pdf_bytes* is neither ``bytes`` nor ``bytearray``.

    Any exception raised by pypdf while parsing or extracting is propagated
    to the caller unchanged.
    """
    if not isinstance(pdf_bytes, (bytes, bytearray)):
        raise TypeError("pdf_bytes must be bytes")

    # Build the stream here, before handing off to the worker thread, so a
    # caller mutating a bytearray afterwards cannot affect the parse.
    stream = BytesIO(pdf_bytes)

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _extract_text_sync, stream)