import io
import re
from typing import Any, Dict, List

import PyPDF2


class DocumentProcessor:
    """
    Handles PDF document processing and intelligent text chunking.

    The pipeline is: extract per-page text from a PDF, normalize it, then
    split it into sentence-aligned chunks that share some trailing context
    with the previous chunk.

    Attributes:
        chunk_size: Target maximum number of characters per chunk.
        chunk_overlap: Maximum number of characters of trailing context
            carried from one chunk into the next (0 disables overlap).
    """

    # Everything except word characters, whitespace and basic punctuation.
    _DISALLOWED_CHARS = re.compile(r'[^\w\s\.\,\;\:\!\?\-\(\)\[\]\"\'\/]')
    _WHITESPACE_RUN = re.compile(r'\s+')
    # A lowercase letter directly followed by an uppercase one, e.g. "fooBar".
    _CAMEL_CASE_JOIN = re.compile(r'([a-z])([A-Z])')
    # Whitespace that follows sentence-ending punctuation.
    _SENTENCE_BOUNDARY = re.compile(r'(?<=[.!?])\s+')

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_file) -> Dict[str, Any]:
        """
        Extract text from PDF file and preserve metadata

        Args:
            pdf_file: Uploaded PDF file object (anything PyPDF2.PdfReader
                accepts). Its ``name`` attribute, when present, is recorded
                as the filename.

        Returns:
            Dict with three keys:
                'full_text': text of all non-empty pages, each preceded by a
                    "[Page N]" marker and separated by blank lines
                'pages': list of {'page_number', 'text'} dicts for the
                    non-empty pages (1-based page numbers)
                'metadata': filename, num_pages, title, author, subject

        Raises:
            Exception: If the PDF cannot be opened or read; the original
                error is attached as the cause.
        """
        try:
            pdf_reader = PyPDF2.PdfReader(pdf_file)

            # The document info may be missing entirely; treat that as "no fields".
            info = pdf_reader.metadata or {}

            # Not every input carries a .name (e.g. in-memory buffers, plain paths).
            filename = getattr(pdf_file, 'name', None)
            if filename is None:
                filename = pdf_file if isinstance(pdf_file, str) else ''

            metadata = {
                'filename': filename,
                'num_pages': len(pdf_reader.pages),
                'title': info.get('/Title', ''),
                'author': info.get('/Author', ''),
                'subject': info.get('/Subject', ''),
            }

            pages_text = []
            for page_num, page in enumerate(pdf_reader.pages, start=1):
                try:
                    # Guard against extract_text() yielding None for a page.
                    page_text = (page.extract_text() or '').strip()
                except Exception as e:
                    # Best effort: one unreadable page must not abort the document.
                    print(f"Error extracting text from page {page_num}: {str(e)}")
                    continue

                if page_text:
                    pages_text.append({
                        'page_number': page_num,
                        'text': page_text,
                    })

            full_text = "\n\n".join(
                f"[Page {entry['page_number']}]\n{entry['text']}"
                for entry in pages_text
            )

            return {
                'full_text': full_text,
                'pages': pages_text,
                'metadata': metadata,
            }

        except Exception as e:
            raise Exception(f"Error processing PDF: {str(e)}") from e

    def clean_text(self, text: str) -> str:
        """
        Clean and normalize extracted text

        Steps, in order: drop every character that is not a word character,
        whitespace or basic punctuation (this also removes replacement
        characters left by bad decoding); collapse whitespace runs to a
        single space; insert a space where a lowercase letter is directly
        followed by an uppercase one.

        Args:
            text: Raw extracted text

        Returns:
            Cleaned text
        """
        text = self._DISALLOWED_CHARS.sub('', text)

        # Collapse whitespace *after* filtering so that a removed character
        # sitting between two spaces does not leave a double space behind.
        text = self._WHITESPACE_RUN.sub(' ', text)

        # Heuristic for words glued together by PDF extraction ("fooBar").
        text = self._CAMEL_CASE_JOIN.sub(r'\1 \2', text)

        return text.strip()

    def _build_chunk(self, chunk_id: int, sentences: List[str], metadata: Dict) -> Dict:
        """Assemble the chunk record for the given sentences."""
        chunk_body = " ".join(sentences).strip()
        return {
            'chunk_id': chunk_id,
            'text': chunk_body,
            'metadata': {
                **metadata,
                'chunk_size': len(chunk_body),
                'chunk_index': chunk_id,
            },
        }

    def _overlap_tail(self, sentences: List[str], next_length: int) -> List[str]:
        """Pick the trailing sentences of a finished chunk to repeat in the next one."""
        if self.chunk_overlap <= 0:
            return []

        tail = []
        tail_length = 0
        for sentence in reversed(sentences):
            candidate = tail_length + len(sentence) + (1 if tail else 0)
            # Stop when the carry-over would exceed the overlap budget, or
            # would leave no room for the upcoming sentence in the new chunk.
            if (candidate > self.chunk_overlap
                    or candidate + 1 + next_length > self.chunk_size):
                break
            tail.append(sentence)
            tail_length = candidate

        tail.reverse()
        return tail

    def chunk_text(self, text: str, metadata: Dict) -> List[Dict]:
        """
        Split text into overlapping chunks for better retrieval

        The text is cleaned, split into sentences at '.', '!' or '?', and
        sentences are packed into chunks of at most ``chunk_size``
        characters. Each new chunk starts with whole trailing sentences of
        the previous chunk, up to ``chunk_overlap`` characters. A single
        sentence longer than ``chunk_size`` is kept intact and therefore
        yields an oversized chunk.

        Args:
            text: Full document text
            metadata: Document metadata, copied into every chunk

        Returns:
            List of text chunks with metadata; each item has 'chunk_id',
            'text' and 'metadata' (the document metadata plus 'chunk_size'
            and 'chunk_index'). Empty input yields an empty list.
        """
        cleaned_text = self.clean_text(text)
        sentences = [s for s in self._SENTENCE_BOUNDARY.split(cleaned_text) if s]

        chunks = []
        current = []          # sentences of the chunk being built
        current_length = 0    # len(" ".join(current))

        for sentence in sentences:
            separator = 1 if current else 0
            if current and current_length + separator + len(sentence) > self.chunk_size:
                chunks.append(self._build_chunk(len(chunks), current, metadata))

                # Seed the next chunk with trailing context from this one.
                current = self._overlap_tail(current, len(sentence))
                current_length = len(" ".join(current))
                separator = 1 if current else 0

            current.append(sentence)
            current_length += separator + len(sentence)

        if current:
            chunks.append(self._build_chunk(len(chunks), current, metadata))

        return chunks

    def process_document(self, pdf_file) -> List[Dict]:
        """
        Complete document processing pipeline

        Extracts the text of the PDF and splits it into chunks.

        Args:
            pdf_file: Uploaded PDF file

        Returns:
            List of processed text chunks with metadata

        Raises:
            Exception: If the PDF cannot be processed.
        """
        doc_data = self.extract_text_from_pdf(pdf_file)
        return self.chunk_text(doc_data['full_text'], doc_data['metadata'])