import PyPDF2
import io
import re
from typing import Any, Dict, List
class DocumentProcessor:
    """
    Handles PDF document processing and intelligent text chunking.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        # chunk_size: target maximum number of characters per chunk
        # chunk_overlap: when > 0, trailing sentences of a finished chunk are
        # carried into the next chunk to preserve context across boundaries
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_file) -> Dict[str, Any]:
        """
        Extract text from PDF file and preserve metadata.

        Args:
            pdf_file: Uploaded PDF file object (file-like; a ``name``
                attribute is used for the filename when present)

        Returns:
            Dict containing 'full_text', 'pages' (per-page dicts with
            'page_number' and 'text'), and 'metadata'

        Raises:
            Exception: if the PDF cannot be opened/read at all (single
                unreadable pages are skipped, not fatal)
        """
        try:
            # Read PDF using PyPDF2
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            # Extract document-level metadata (may be None for some PDFs)
            doc_info = pdf_reader.metadata
            metadata = {
                # getattr: not every file-like object exposes .name
                'filename': getattr(pdf_file, 'name', ''),
                'num_pages': len(pdf_reader.pages),
                'title': doc_info.get('/Title', '') if doc_info else '',
                'author': doc_info.get('/Author', '') if doc_info else '',
                'subject': doc_info.get('/Subject', '') if doc_info else ''
            }

            # Extract text from each page; collect parts and join once
            # (repeated += on a string is quadratic in the worst case)
            pages_text = []
            full_text_parts = []
            for page_num, page in enumerate(pdf_reader.pages):
                try:
                    page_text = page.extract_text()
                except Exception as e:
                    # Best effort: one bad page should not abort the document
                    print(f"Error extracting text from page {page_num + 1}: {str(e)}")
                    continue
                if page_text.strip():  # Only add non-empty pages
                    stripped = page_text.strip()
                    pages_text.append({
                        'page_number': page_num + 1,
                        'text': stripped
                    })
                    full_text_parts.append(f"\n\n[Page {page_num + 1}]\n{stripped}")

            return {
                'full_text': ''.join(full_text_parts).strip(),
                'pages': pages_text,
                'metadata': metadata
            }
        except Exception as e:
            # Keep Exception type for existing callers; chain the cause
            # so the original traceback is preserved for debugging
            raise Exception(f"Error processing PDF: {str(e)}") from e

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text.

        Args:
            text: Raw extracted text

        Returns:
            Cleaned text
        """
        # Remove excessive whitespace (collapses newlines/tabs to spaces)
        text = re.sub(r'\s+', ' ', text)
        # Remove special characters but keep punctuation
        text = re.sub(r'[^\w\s\.\,\;\:\!\?\-\(\)\[\]\"\'\/]', '', text)
        # Fix common PDF extraction issues
        text = text.replace('�', '')  # Remove replacement characters
        # Re-insert a space where PDF extraction fused two words ("fooBar")
        text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
        return text.strip()

    def chunk_text(self, text: str, metadata: Dict) -> List[Dict]:
        """
        Split text into overlapping chunks for better retrieval.

        Args:
            text: Full document text
            metadata: Document metadata (copied into every chunk)

        Returns:
            List of text chunks with metadata
        """
        # Clean the text first
        cleaned_text = self.clean_text(text)
        # Split into sentences for better chunking (keeps the punctuation)
        sentences = re.split(r'(?<=[.!?])\s+', cleaned_text)

        chunks: List[Dict] = []
        current_chunk = ""
        chunk_id = 0

        for sentence in sentences:
            # Projected length of the chunk if we append this sentence.
            # The +1 accounts for the joining space — the original code
            # omitted it, so the tracked length drifted from reality.
            joiner = 1 if current_chunk else 0
            projected = len(current_chunk) + joiner + len(sentence)

            if projected > self.chunk_size and current_chunk:
                # Current chunk is full: emit it and start a new one
                chunks.append(self._make_chunk(chunk_id, current_chunk, metadata))
                chunk_id += 1
                if self.chunk_overlap > 0:
                    # Carry the last couple of sentences forward for context
                    overlap_sentences = current_chunk.split('. ')[-2:]
                    current_chunk = '. '.join(overlap_sentences) + '. ' + sentence
                else:
                    current_chunk = sentence
            else:
                # Add sentence to current chunk
                current_chunk = f"{current_chunk} {sentence}" if current_chunk else sentence

        # Add the last chunk if it exists
        if current_chunk.strip():
            chunks.append(self._make_chunk(chunk_id, current_chunk, metadata))

        return chunks

    def _make_chunk(self, chunk_id: int, text: str, metadata: Dict) -> Dict:
        """Build one chunk record carrying the document metadata."""
        return {
            'chunk_id': chunk_id,
            'text': text.strip(),
            'metadata': {
                **metadata,
                'chunk_size': len(text),
                'chunk_index': chunk_id
            }
        }

    def process_document(self, pdf_file) -> List[Dict]:
        """
        Complete document processing pipeline.

        Args:
            pdf_file: Uploaded PDF file

        Returns:
            List of processed text chunks with metadata
        """
        # Extract text and metadata
        doc_data = self.extract_text_from_pdf(pdf_file)
        # Create chunks
        chunks = self.chunk_text(doc_data['full_text'], doc_data['metadata'])
        return chunks