Spaces:

Seth0330
/

DocClassify

Sleeping

DocClassify / backend / app / pdf_processor.py

Seth

Update

f6e574f 26 days ago

834 Bytes

	"""PDF text extraction utilities."""
	import io
	import logging
	from typing import Optional

	from PyPDF2 import PdfReader


	def extract_text_from_pdf(pdf_bytes: bytes) -> Optional[str]:
	    """Extract the text content of a PDF supplied as raw bytes.

	    The document is read with PyPDF2's ``PdfReader`` and text is pulled
	    page by page. Pages that yield no text are skipped; the remaining
	    page texts are joined with a blank line between them.

	    Args:
	        pdf_bytes: PDF file content as bytes.

	    Returns:
	        The extracted text, or None when the input is empty, when the
	        document yields no non-whitespace text, or when extraction fails.
	    """
	    # Nothing to parse: return early so empty input is not reported as a
	    # parser error.
	    if not pdf_bytes:
	        return None

	    try:
	        reader = PdfReader(io.BytesIO(pdf_bytes))
	        page_texts = [page.extract_text() for page in reader.pages]
	    except Exception:
	        # Deliberate best-effort boundary: callers rely on None rather than
	        # an exception, so any parser failure is logged (with traceback)
	        # and swallowed here.
	        logging.getLogger(__name__).exception("Error extracting text from PDF")
	        return None

	    full_text = "\n\n".join(text for text in page_texts if text)
	    # Whitespace-only output counts as "no text".
	    return full_text if full_text.strip() else None