# AI_tutor/app/pdf_processor.py
# Uploaded by vishalshelke ("Upload 15 files", commit 1d95600, verified)
import io
import logging
import os
import re
from typing import Any, Dict, List, Optional, Union

import pdfplumber
import PyPDF2
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
class PDFProcessor:
    """Handles PDF file processing and text extraction.

    Every public method accepts the PDF as raw ``bytes``/``bytearray`` (the
    original code notes this is what Gradio hands over), as a seekable binary
    file-like object, or as a filesystem path.  Path values are passed
    through unchanged to pdfplumber / PyPDF2.

    Outcomes are reported through the returned dictionary (``success`` or
    ``valid`` plus an ``error`` message) instead of being raised.
    """

    # Upper bound accepted by ``validate_pdf``, in mebibytes.
    MAX_FILE_SIZE_MB = 50

    # A line holding nothing but digits is treated as a page number.
    _PAGE_NUMBER_LINE_RE = re.compile(r'^[ \t]*\d+[ \t\r]*$', re.MULTILINE)
    # A word split by a hyphen at the end of a line ("docu-\nment").
    _HYPHEN_BREAK_RE = re.compile(r'(\w)-[ \t]*\r?\n\s*(\w)')
    # Any run of whitespace, newlines included.
    _WHITESPACE_RE = re.compile(r'\s+')
    # A lowercase letter directly followed by an uppercase one ("helloWorld").
    _CAMEL_JOIN_RE = re.compile(r'([a-z])([A-Z])')

    def __init__(self):
        self.supported_formats = ['.pdf']

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def extract_text_from_pdf(self, pdf_file) -> Dict[str, Any]:
        """Extract text content from a PDF.

        pdfplumber is tried first (the original author prefers it for
        complex layouts); if it raises for any reason the error is logged
        and PyPDF2 is used instead.

        Args:
            pdf_file: PDF as bytes, a seekable binary stream, or a path.

        Returns:
            A dictionary with the keys ``success``, ``text`` (all pages
            joined by a blank line), ``pages`` (list of
            ``{'page_number', 'text'}``), ``metadata`` (``total_pages``,
            ``title``, ``author``, ``subject``), ``word_count`` and
            ``character_count``.  When both extractors fail, ``success`` is
            ``False`` and an ``error`` message is included.
        """
        source = self._as_source(pdf_file)
        try:
            with pdfplumber.open(source) as pdf:
                metadata = self._build_metadata(len(pdf.pages), pdf.metadata)
                pages = self._collect_pages(pdf.pages)
            return self._build_result(pages, metadata)
        except Exception as e:
            logger.error("pdfplumber extraction failed: %s", e)
            # Fallback to PyPDF2
            return self._extract_with_pypdf2(source)

    def validate_pdf(self, pdf_file) -> Dict[str, Any]:
        """Validate a PDF before processing.

        Checks the size against ``MAX_FILE_SIZE_MB`` and then asks PyPDF2 to
        parse the document and count its pages.  A stream argument is
        rewound to the start afterwards so it can be reused.

        Args:
            pdf_file: PDF as bytes, a seekable binary stream, or a path.

        Returns:
            ``{'valid': True, 'pages': int, 'size_mb': float}`` on success,
            otherwise ``{'valid': False, 'error': str}``.
        """
        try:
            if isinstance(pdf_file, (bytes, bytearray)):
                file_size = len(pdf_file)
                pdf_file = io.BytesIO(pdf_file)
            elif hasattr(pdf_file, 'seek'):
                pdf_file.seek(0, 2)  # Seek to end to measure the stream
                file_size = pdf_file.tell()
                pdf_file.seek(0)  # Reset to beginning
            else:
                # Not bytes and not a stream: treat it as a filesystem path.
                file_size = os.path.getsize(pdf_file)

            if file_size > self.MAX_FILE_SIZE_MB * 1024 * 1024:
                return {
                    'valid': False,
                    'error': f'File size exceeds {self.MAX_FILE_SIZE_MB}MB limit'
                }

            # Try to open the PDF to validate format
            try:
                page_count = len(PyPDF2.PdfReader(pdf_file).pages)
            except Exception as e:
                return {
                    'valid': False,
                    'error': f'Invalid PDF format: {str(e)}'
                }
            finally:
                # Leave the stream at the start on every exit path.
                self._rewind(pdf_file)

            if page_count == 0:
                return {
                    'valid': False,
                    'error': 'PDF contains no pages'
                }
            return {
                'valid': True,
                'pages': page_count,
                'size_mb': round(file_size / (1024 * 1024), 2)
            }
        except Exception as e:
            return {
                'valid': False,
                'error': f'Error validating PDF: {str(e)}'
            }

    # ------------------------------------------------------------------
    # Extraction helpers
    # ------------------------------------------------------------------
    def _extract_with_pypdf2(self, pdf_file) -> Dict[str, Any]:
        """Fallback extraction using PyPDF2.

        Returns the same dictionary shape as ``extract_text_from_pdf``.  Any
        exception is logged and converted into a ``success: False`` result
        with empty text, pages and metadata.
        """
        try:
            source = self._as_source(pdf_file)
            # A previous reader may have consumed the stream; paths have no
            # position to reset, so only seek when the object supports it.
            self._rewind(source)
            reader = PyPDF2.PdfReader(source)
            metadata = self._build_metadata(
                len(reader.pages), reader.metadata, key_prefix='/'
            )
            pages = self._collect_pages(reader.pages)
            return self._build_result(pages, metadata)
        except Exception as e:
            logger.error("PyPDF2 extraction failed: %s", e)
            return {
                'success': False,
                'error': f"Failed to extract text from PDF: {str(e)}",
                'text': '',
                'pages': [],
                'metadata': {},
                'word_count': 0,
                'character_count': 0
            }

    @staticmethod
    def _as_source(pdf_file):
        """Wrap raw bytes in a ``BytesIO``; return anything else unchanged."""
        if isinstance(pdf_file, (bytes, bytearray)):
            return io.BytesIO(pdf_file)
        return pdf_file

    @staticmethod
    def _rewind(source) -> None:
        """Move a stream back to its start; do nothing for non-streams."""
        if hasattr(source, 'seek'):
            source.seek(0)

    @staticmethod
    def _build_metadata(total_pages: int, info, key_prefix: str = '') -> Dict[str, Any]:
        """Build the metadata dictionary from a library's document info.

        ``key_prefix`` accounts for the differing key spelling: this module
        reads ``Title`` from pdfplumber and ``/Title`` from PyPDF2.  Missing
        or ``None`` values become ``''``.
        """
        info = info or {}
        return {
            'total_pages': total_pages,
            'title': info.get(f'{key_prefix}Title') or '',
            'author': info.get(f'{key_prefix}Author') or '',
            'subject': info.get(f'{key_prefix}Subject') or ''
        }

    def _collect_pages(self, pages) -> List[Dict[str, Any]]:
        """Extract and clean the text of each page (numbered from 1).

        Pages that yield no text, or whose text is empty after cleaning,
        are skipped.
        """
        collected = []
        for page_num, page in enumerate(pages, 1):
            raw_text = page.extract_text()
            if not raw_text:
                continue
            cleaned = self._clean_text(raw_text)
            if cleaned:
                collected.append({'page_number': page_num, 'text': cleaned})
        return collected

    @staticmethod
    def _build_result(pages: List[Dict[str, Any]], metadata: Dict[str, Any]) -> Dict[str, Any]:
        """Assemble the success dictionary shared by both extractors."""
        combined_text = '\n\n'.join(page['text'] for page in pages)
        return {
            'success': True,
            'text': combined_text,
            'pages': pages,
            'metadata': metadata,
            'word_count': len(combined_text.split()),
            'character_count': len(combined_text)
        }

    def _clean_text(self, text: str) -> str:
        """Clean and normalize extracted text.

        The result is a single line: every whitespace run (including line
        breaks) becomes one space.  Line-based repairs must therefore run
        *before* that collapse, otherwise there are no newlines left for
        them to match.

        Steps, in order:
            1. Drop lines that contain only digits (treated as page numbers).
            2. Re-join words hyphenated across a line break; the hyphen is
               removed, so a genuine compound split at the hyphen is also
               joined.
            3. Collapse whitespace runs to a single space.
            4. Insert a space between a lowercase letter and a directly
               following uppercase letter (split concatenated words).
        """
        # Page numbers first, so "foo-\n12\nbar" is not glued into "foo12".
        text = self._PAGE_NUMBER_LINE_RE.sub('', text)
        text = self._HYPHEN_BREAK_RE.sub(r'\1\2', text)
        text = self._WHITESPACE_RE.sub(' ', text)
        text = self._CAMEL_JOIN_RE.sub(r'\1 \2', text)
        return text.strip()