Solomon7890's picture
Upload folder using huggingface_hub
d28e16a verified
"""
PDF Processing Utilities
"""
import io
class PDFProcessor:
    """Extract plain text from PDF documents using PyPDF2.

    PyPDF2 is imported when the processor is constructed. If the import
    fails, the instance is still created, but every extraction method
    returns an explanatory message instead of text.

    Extraction methods never raise for extraction problems: failures are
    reported as a string beginning with ``"Error extracting PDF text: "``.
    """

    # Returned by the extraction methods when PyPDF2 could not be imported.
    _UNAVAILABLE_MESSAGE = "PDF processing not available. Please install PyPDF2."

    def __init__(self):
        """Import PyPDF2 and keep the module (or ``None``) on ``self.PyPDF2``."""
        try:
            import PyPDF2
        except ImportError:
            PyPDF2 = None
            print("Warning: PyPDF2 not available")
        self.PyPDF2 = PyPDF2

    def extract_text(self, pdf_file):
        """Extract text from a PDF file object.

        Args:
            pdf_file: Whatever ``PyPDF2.PdfReader`` accepts as its source
                (passed through unchanged).

        Returns:
            The text of all pages joined with newlines and stripped of
            leading/trailing whitespace; or the "not available" message if
            PyPDF2 is missing; or an "Error extracting PDF text: ..."
            message if reading or extraction raised.
        """
        if not self.PyPDF2:
            return self._UNAVAILABLE_MESSAGE
        try:
            reader = self.PyPDF2.PdfReader(pdf_file)
            # `or ""` guards against a page yielding None (no extractable
            # text); without it a single such page would abort the whole
            # document with a TypeError.
            page_texts = [page.extract_text() or "" for page in reader.pages]
        except Exception as e:
            # Deliberate catch-all: callers receive a message string rather
            # than an exception, whatever the parser raised.
            return f"Error extracting PDF text: {e}"
        return "\n".join(page_texts).strip()

    def extract_text_from_bytes(self, pdf_bytes):
        """Extract text from PDF content held in memory.

        Args:
            pdf_bytes: Bytes-like object containing the PDF data.

        Returns:
            Same contract as :meth:`extract_text`.
        """
        if not self.PyPDF2:
            return self._UNAVAILABLE_MESSAGE
        try:
            pdf_file = io.BytesIO(pdf_bytes)
        except Exception as e:
            # e.g. TypeError when `pdf_bytes` is not bytes-like; reported in
            # the same string form as any other extraction failure.
            return f"Error extracting PDF text: {e}"
        return self.extract_text(pdf_file)