Spaces:
Sleeping
Sleeping
| # import fitz # PyMuPDF | |
| # from io import BytesIO | |
| # async def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str: | |
| # """ | |
| # Extract text from PDF file in bytes | |
| # Return plain text | |
| # """ | |
| # text_chunks = [] | |
| # with fitz.open(stream=pdf_bytes, filetype="pdf") as doc: | |
| # for page in doc: | |
| # page_text = page.get_text() | |
| # if page_text: | |
| # text_chunks.append(page_text) | |
| # return "\n".join(text_chunks).strip() | |
import asyncio
from io import BytesIO

from pypdf import PdfReader
def _read_pdf_text(data: bytes) -> str:
    """Parse *data* as a PDF and return its page texts joined by newlines."""
    reader = PdfReader(BytesIO(data))
    # Pages whose extraction yields a falsy result (None or "") are skipped
    # so they do not contribute blank lines to the output.
    page_texts = (page.extract_text() for page in reader.pages)
    return "\n".join(text for text in page_texts if text)


async def extract_text_from_pdf_bytes(pdf_bytes: bytes) -> str:
    """Extract the plain text of a PDF supplied as in-memory bytes.

    The parsing itself is synchronous, so it is run in the event loop's
    default executor; awaiting this coroutine therefore does not stall
    other tasks on the loop while the document is processed.

    Args:
        pdf_bytes: Raw content of the PDF file (``bytes`` or ``bytearray``).

    Returns:
        The text of every page that produced any text, joined with
        newlines. An empty string if no page produced text.

    Raises:
        TypeError: If ``pdf_bytes`` is neither ``bytes`` nor ``bytearray``.
            Errors raised by the PDF parser propagate unchanged.
    """
    if not isinstance(pdf_bytes, (bytes, bytearray)):
        raise TypeError("pdf_bytes must be bytes")

    # Take an immutable snapshot before handing off to a worker thread so a
    # caller mutating a bytearray while we await cannot affect parsing.
    data = bytes(pdf_bytes)

    loop = asyncio.get_running_loop()
    return await loop.run_in_executor(None, _read_pdf_text, data)