Spaces:
Sleeping
Sleeping
| """PDF text extraction utilities.""" | |
| import io | |
| from typing import Optional | |
| from PyPDF2 import PdfReader | |
def extract_text_from_pdf(pdf_bytes: bytes) -> Optional[str]:
    """
    Pull the textual content out of an in-memory PDF document.

    The bytes are wrapped in a file-like buffer and handed to ``PdfReader``.
    Each page's ``extract_text()`` result is collected, pages that yield
    nothing are skipped, and the rest are joined with a blank line between
    consecutive pages.

    Args:
        pdf_bytes: Raw content of the PDF file.

    Returns:
        The joined page text, or None when the result contains no
        non-whitespace characters or when any exception is raised while
        reading the document (the error is printed, not re-raised).
    """
    try:
        document = PdfReader(io.BytesIO(pdf_bytes))
        # Falsy page results (None / empty string) are dropped so they do not
        # contribute empty segments to the joined output.
        page_texts = (page.extract_text() for page in document.pages)
        combined = "\n\n".join(chunk for chunk in page_texts if chunk)
    except Exception as exc:
        # Best-effort contract: report the problem and signal failure with
        # None instead of propagating the exception to the caller.
        print(f"Error extracting text from PDF: {exc}")
        return None

    # Whitespace-only output is treated the same as "nothing extracted".
    if not combined.strip():
        return None
    return combined