DocClassify / backend /app /pdf_processor.py
Seth
Update
f6e574f
raw
history blame contribute delete
834 Bytes
"""PDF text extraction utilities."""
import io
import logging
from typing import Optional

from PyPDF2 import PdfReader
def extract_text_from_pdf(pdf_bytes: bytes) -> Optional[str]:
    """Extract the text content of a PDF supplied as raw bytes.

    Every page is read with PyPDF2's ``page.extract_text()``; pages that
    yield no text are skipped and the remaining page texts are joined with
    a blank line between them.

    Args:
        pdf_bytes: PDF file content as bytes.

    Returns:
        The extracted text, or None when the document contains no
        non-whitespace text or when reading/parsing raises an exception.
    """
    try:
        reader = PdfReader(io.BytesIO(pdf_bytes))
        page_texts = (page.extract_text() for page in reader.pages)
        # Skip pages whose extraction result is empty or None.
        full_text = "\n\n".join(text for text in page_texts if text)
    except Exception as exc:
        # Best-effort contract: any failure maps to None. Report through the
        # logging system (with traceback) rather than printing to stdout, so
        # the host application controls where the error ends up.
        logging.getLogger(__name__).exception(
            "Error extracting text from PDF: %s", exc
        )
        return None

    # A document made only of whitespace counts as "no text".
    return full_text if full_text.strip() else None