Spaces:
Runtime error
Runtime error
File size: 6,896 Bytes
1d95600 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 |
import PyPDF2
import pdfplumber
from typing import Dict, List, Optional, Union, Any
import re
import logging
import io
# Module-level logger named after this module; PDFProcessor uses it to report
# extraction failures.
logger = logging.getLogger(__name__)
class PDFProcessor:
    """Extract text and metadata from PDF files and validate uploads.

    Every public method accepts the PDF as raw ``bytes`` (the form Gradio
    hands over), as a seekable binary file object, or as a filesystem path.
    Text extraction tries pdfplumber first and falls back to PyPDF2 when
    pdfplumber raises.
    """

    # Upload size ceiling enforced by ``validate_pdf``, in megabytes.
    MAX_FILE_SIZE_MB = 50

    def __init__(self):
        self.supported_formats = ['.pdf']

    def extract_text_from_pdf(self, pdf_file) -> Dict[str, Any]:
        """Extract text content from a PDF.

        Args:
            pdf_file: PDF content as bytes, a seekable binary stream, or a path.

        Returns:
            A dict with keys ``success``, ``text`` (all pages joined by a blank
            line), ``pages`` (list of ``{'page_number', 'text'}`` for pages
            that yielded text), ``metadata``, ``word_count`` and
            ``character_count``. If both extractors fail, ``success`` is False
            and an ``error`` message is included.
        """
        source = self._as_source(pdf_file)
        try:
            # Try pdfplumber first (better for complex layouts).
            with pdfplumber.open(source) as pdf:
                metadata = self._build_metadata(len(pdf.pages), pdf.metadata)
                # Pages must be read while the document is still open.
                page_texts = [page.extract_text() for page in pdf.pages]
                return self._build_result(page_texts, metadata)
        except Exception as e:
            logger.error("pdfplumber extraction failed: %s", e)
            # Fallback to PyPDF2, reusing the same (already wrapped) source.
            return self._extract_with_pypdf2(source)

    def _extract_with_pypdf2(self, pdf_file) -> Dict[str, Any]:
        """Fallback extractor using PyPDF2.

        Returns the same structure as ``extract_text_from_pdf``. Never raises:
        any failure is logged and reported as ``success: False``.
        """
        try:
            source = self._as_source(pdf_file)
            # A previous reader may have consumed the stream; paths are
            # left alone because they have no read position to reset.
            self._rewind(source)
            reader = PyPDF2.PdfReader(source)
            metadata = self._build_metadata(
                len(reader.pages), reader.metadata, key_prefix='/'
            )
            page_texts = [page.extract_text() for page in reader.pages]
            return self._build_result(page_texts, metadata)
        except Exception as e:
            logger.error("PyPDF2 extraction failed: %s", e)
            return {
                'success': False,
                'error': f"Failed to extract text from PDF: {e}",
                'text': '',
                'pages': [],
                'metadata': {},
                'word_count': 0,
                'character_count': 0,
            }

    def _clean_text(self, text: str) -> str:
        """Clean and normalize text extracted from one PDF page.

        Line breaks are preserved (at most one blank line in a row); only
        horizontal whitespace is collapsed. The newline-based repairs run
        after that collapse so that they can actually see the newlines.
        """
        # Collapse runs of spaces/tabs/etc., but keep newlines intact.
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Drop the single space that may now sit on either side of a newline.
        text = re.sub(r' ?\n ?', '\n', text)
        # Re-join words hyphenated across a line break.
        text = re.sub(r'(\w)-\n(\w)', r'\1\2', text)
        # Remove lines consisting only of digits (typical page numbers).
        # The lookahead leaves the closing newline for the next match, so
        # consecutive number-only lines are all removed.
        text = re.sub(r'\n\d+(?=\n)', '', text)
        # Split words that were concatenated during extraction (aB -> a B).
        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
        # Limit consecutive line breaks to one blank line.
        text = re.sub(r'\n{3,}', '\n\n', text)
        return text.strip()

    def validate_pdf(self, pdf_file) -> Dict[str, Any]:
        """Validate a PDF before processing.

        Checks the size against ``MAX_FILE_SIZE_MB`` and that PyPDF2 can open
        the document and finds at least one page. A stream input is left
        positioned at its start.

        Returns:
            ``{'valid': True, 'pages': int, 'size_mb': float}`` on success,
            otherwise ``{'valid': False, 'error': str}``. Never raises.
        """
        try:
            file_size = self._measure_size(pdf_file)
            limit_mb = self.MAX_FILE_SIZE_MB
            if file_size > limit_mb * 1024 * 1024:
                return {
                    'valid': False,
                    'error': f'File size exceeds {limit_mb}MB limit',
                }

            source = self._as_source(pdf_file)
            # Opening the document is the format check itself.
            try:
                reader = PyPDF2.PdfReader(source)
                page_count = len(reader.pages)
            except Exception as e:
                return {
                    'valid': False,
                    'error': f'Invalid PDF format: {e}',
                }

            if page_count == 0:
                return {
                    'valid': False,
                    'error': 'PDF contains no pages',
                }

            # Leave a caller-supplied stream ready for the extraction step.
            self._rewind(source)
            return {
                'valid': True,
                'pages': page_count,
                'size_mb': round(file_size / (1024 * 1024), 2),
            }
        except Exception as e:
            return {
                'valid': False,
                'error': f'Error validating PDF: {e}',
            }

    @staticmethod
    def _as_source(pdf_file):
        """Wrap raw bytes in a BytesIO; return streams and paths unchanged."""
        if isinstance(pdf_file, (bytes, bytearray)):
            return io.BytesIO(pdf_file)
        return pdf_file

    @staticmethod
    def _rewind(pdf_file) -> None:
        """Seek a stream back to its start; do nothing for non-seekable inputs."""
        if hasattr(pdf_file, 'seek'):
            pdf_file.seek(0)

    @staticmethod
    def _measure_size(pdf_file) -> int:
        """Return the PDF size in bytes for bytes, stream, or path input."""
        if isinstance(pdf_file, (bytes, bytearray)):
            return len(pdf_file)
        if hasattr(pdf_file, 'seek'):
            pdf_file.seek(0, io.SEEK_END)
            size = pdf_file.tell()
            pdf_file.seek(0)  # Reset to beginning
            return size
        # Anything else is treated as a filesystem path.
        with open(pdf_file, 'rb') as handle:
            handle.seek(0, io.SEEK_END)
            return handle.tell()

    @staticmethod
    def _build_metadata(total_pages: int, raw_metadata,
                        key_prefix: str = '') -> Dict[str, Any]:
        """Build the metadata dict from a library's document-info mapping.

        ``key_prefix`` is prepended to the ``Title``/``Author``/``Subject``
        lookup keys (the PyPDF2 path passes ``'/'``). Missing or empty values
        become ``''``.
        """
        metadata = {
            'total_pages': total_pages,
            'title': '',
            'author': '',
            'subject': '',
        }
        if raw_metadata:
            for field in ('title', 'author', 'subject'):
                key = f'{key_prefix}{field.capitalize()}'
                metadata[field] = raw_metadata.get(key) or ''
        return metadata

    def _build_result(self, page_texts: List[Optional[str]],
                      metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Assemble the success payload from per-page raw text.

        Pages whose raw text is empty or ``None`` are skipped, but page
        numbers still reflect the position in the document (1-based).
        """
        pages = [
            {'page_number': page_number, 'text': self._clean_text(raw_text)}
            for page_number, raw_text in enumerate(page_texts, 1)
            if raw_text
        ]
        combined_text = '\n\n'.join(page['text'] for page in pages)
        return {
            'success': True,
            'text': combined_text,
            'pages': pages,
            'metadata': metadata,
            'word_count': len(combined_text.split()),
            'character_count': len(combined_text),
        }
|