"""
PDF Processing Module

Handles PDF file upload, text extraction, and preprocessing.
"""
import PyPDF2
import io
import re
from typing import Optional, List
import streamlit as st
class PDFProcessor:
    """Validate, extract, and clean text from PDFs uploaded via Streamlit.

    Problems are reported to the user through ``st.error`` / ``st.warning``.
    Methods signal failure by returning ``False`` or ``None``; the error
    handlers in ``extract_text_from_pdf`` turn parsing exceptions into a
    ``None`` result.
    """

    # Every character that is not a word character, whitespace, or one of
    # . , ! ? ; : - ( ) is replaced by a space during preprocessing.
    # Compiled once here instead of on every preprocess_text() call.
    _DISALLOWED_CHARS = re.compile(r'[^\w\s\.\,\!\?\;\:\-\(\)]')

    def __init__(self, max_file_size: int = 10 * 1024 * 1024):
        """
        Create a processor.

        Args:
            max_file_size: Largest accepted upload in bytes (default 10MB).
        """
        self.max_file_size = max_file_size

    def validate_pdf(self, uploaded_file) -> bool:
        """
        Validate uploaded PDF file

        Checks the file's ``size`` against ``self.max_file_size`` and its
        ``type`` against ``"application/pdf"``. The first failing check is
        reported with ``st.error``.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            bool: True if valid, False otherwise
        """
        # Check file size
        if uploaded_file.size > self.max_file_size:
            size_mb = uploaded_file.size / 1024 / 1024
            # Derive the displayed limit from the configured value so the
            # message stays correct when max_file_size is not the default.
            limit_mb = self.max_file_size / 1024 / 1024
            st.error(f"File size ({size_mb:.1f}MB) exceeds limit ({limit_mb:g}MB)")
            return False

        # Check file type
        if uploaded_file.type != "application/pdf":
            st.error("Please upload a valid PDF file")
            return False

        return True

    @staticmethod
    def _warn_failed_pages(failed_pages: List[int]) -> None:
        """Warn about pages whose text extraction raised; no-op if none did."""
        if not failed_pages:
            return
        if len(failed_pages) < 5:
            # Few failures: list the (1-based) page numbers explicitly.
            st.warning(f"⚠️ Could not extract text from pages: {', '.join(map(str, failed_pages))}")
        else:
            st.warning(f"⚠️ Could not extract text from {len(failed_pages)} pages")

    def extract_text_from_pdf(self, uploaded_file) -> Optional[str]:
        """
        Extract text content from uploaded PDF file

        Encrypted PDFs, PDFs with zero pages, and PDFs with no non-blank
        page text are rejected with ``st.error``. Pages whose extraction
        raises are skipped and reported with ``st.warning``.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Text of every non-blank page, each followed by a newline,
            or None if extraction fails
        """
        try:
            # Reset file pointer so the whole file is read from the start
            uploaded_file.seek(0)

            # Create a PDF reader object over an in-memory copy of the bytes
            pdf_reader = PyPDF2.PdfReader(io.BytesIO(uploaded_file.read()))

            # Check if PDF is encrypted
            if pdf_reader.is_encrypted:
                st.error("❌ Cannot process encrypted PDF files. Please upload an unencrypted PDF.")
                return None

            # Check number of pages
            num_pages = len(pdf_reader.pages)
            if num_pages == 0:
                st.error("❌ PDF file appears to be empty or corrupted.")
                return None
            if num_pages > 100:
                st.warning(f"⚠️ Large PDF detected ({num_pages} pages). Processing may take longer.")

            # Extract text from all pages; collect the pieces and join once
            # rather than growing a string with += inside the loop.
            page_chunks = []
            failed_pages = []
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                    if page_text.strip():  # Only add non-empty pages
                        page_chunks.append(page_text + "\n")
                except Exception:
                    # Deliberately broad: one unreadable page must not abort
                    # the whole document. Record its 1-based number instead.
                    failed_pages.append(page_num + 1)
            text_content = "".join(page_chunks)

            # Report failed pages
            self._warn_failed_pages(failed_pages)

            if not text_content.strip():
                st.error("❌ No readable text content found in the PDF file. The PDF might contain only images or scanned content.")
                return None

            # Check if extracted text is too short
            if len(text_content.strip()) < 100:
                st.warning("⚠️ Very little text was extracted. The PDF might contain mostly images or have formatting issues.")

            return text_content

        except PyPDF2.errors.PdfReadError as e:
            st.error(f"❌ Invalid or corrupted PDF file: {str(e)}")
            return None
        except MemoryError:
            st.error("❌ PDF file is too large to process. Please try a smaller file.")
            return None
        except Exception as e:
            st.error(f"❌ Unexpected error processing PDF file: {str(e)}")
            return None

    def preprocess_text(self, text: str) -> str:
        """
        Clean and preprocess extracted text

        Replaces every character other than word characters, whitespace and
        ``. , ! ? ; : - ( )`` with a space, then collapses every run of
        whitespace (including newlines) into a single space and trims the
        ends.

        Args:
            text: Raw extracted text

        Returns:
            str: Cleaned and preprocessed text ("" for empty/None input)
        """
        if not text:
            return ""

        # Remove special characters that might interfere with processing
        text = self._DISALLOWED_CHARS.sub(' ', text)

        # split() with no arguments splits on any whitespace run and drops
        # leading/trailing whitespace, so this one step normalizes newlines,
        # tabs and repeated spaces and leaves nothing to strip.
        return ' '.join(text.split())

    def process_pdf(self, uploaded_file) -> Optional[str]:
        """
        Complete PDF processing pipeline

        Runs validate_pdf, extract_text_from_pdf and preprocess_text in
        order, stopping at the first step that fails.

        Args:
            uploaded_file: Streamlit uploaded file object

        Returns:
            str: Processed text content or None if processing fails
        """
        if not self.validate_pdf(uploaded_file):
            return None

        # Extract text
        raw_text = self.extract_text_from_pdf(uploaded_file)
        if raw_text is None:
            return None

        # Preprocess text
        processed_text = self.preprocess_text(raw_text)
        if len(processed_text) < 50:
            # Still returned: a short result is a warning, not a failure.
            st.warning("The extracted text is very short. Please check if the PDF contains readable text.")

        return processed_text
|