| | """ |
| | Enhanced Multimodal PDF Parser for PDFs with Text + Image URLs |
| | Extracts text, detects image URLs, and links them together |
| | """ |
| |
|
| | import pypdfium2 as pdfium |
| | from typing import List, Dict, Optional, Tuple |
| | import re |
| | from dataclasses import dataclass, field |
| |
|
| |
|


@dataclass
class MultimodalChunk:
    """A chunk of text plus the image URLs associated with it."""
    text: str
    page_number: int
    chunk_index: int
    image_urls: List[str] = field(default_factory=list)
    metadata: Dict = field(default_factory=dict)


class MultimodalPDFParser:
    """
    Enhanced PDF parser that extracts text and image URLs.
    Perfect for user guides with screenshots and visual instructions.
    """

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        min_chunk_size: int = 50,
        extract_images: bool = True,
    ):
        # The overlap must stay below the chunk size so the sliding
        # window in chunk_text_with_images always advances
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.extract_images = extract_images

        # Patterns with a capture group yield just the URL from findall;
        # the others match the full URL directly
        self.url_patterns = [
            # Bare http(s) URLs
            r'https?://[^\s<>"{}|\\^`\[\]]+',
            # Markdown image syntax: ![alt](url)
            r'!\[.*?\]\((https?://[^\s)]+)\)',
            # HTML <img src="..."> tags
            r'<img[^>]+src=["\']([^"\']+)["\']',
            # URLs ending in a common image file extension
            r'https?://[^\s<>"{}|\\^`\[\]]+\.(?:jpg|jpeg|png|gif|bmp|svg|webp)',
        ]

    def extract_image_urls(self, text: str) -> List[str]:
        """
        Extract all image URLs from text.

        Args:
            text: Text content

        Returns:
            List of unique image URLs, in order of first appearance
        """
        urls = []

        for pattern in self.url_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            urls.extend(matches)

        # Deduplicate while preserving order of first appearance
        seen = set()
        unique_urls = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                unique_urls.append(url)

        return unique_urls
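
    # Illustrative example (hypothetical URL): for the text
    #   "See the screenshot at https://example.com/img/step1.png for step 1"
    # this returns ["https://example.com/img/step1.png"]; the bare-URL and
    # image-extension patterns both match it, and deduplication keeps one copy.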

    def extract_text_from_pdf(self, pdf_path: str) -> Dict[int, Tuple[str, List[str]]]:
        """
        Extract text and image URLs from a PDF.

        Args:
            pdf_path: Path to the PDF file

        Returns:
            Dictionary mapping 1-based page numbers to (text, image_urls) tuples
        """
        pdf_pages = {}

        try:
            pdf = pdfium.PdfDocument(pdf_path)

            for page_num in range(len(pdf)):
                page = pdf[page_num]
                textpage = page.get_textpage()
                text = textpage.get_text_range()

                # Normalize whitespace and strip null bytes
                text = self._clean_text(text)

                # Find image URLs embedded in the page text
                image_urls = []
                if self.extract_images:
                    image_urls = self.extract_image_urls(text)

                pdf_pages[page_num + 1] = (text, image_urls)

            return pdf_pages

        except Exception as e:
            # Re-raise with context, preserving the original traceback
            raise RuntimeError(f"Error reading PDF: {e}") from e

    def _clean_text(self, text: str) -> str:
        """Clean extracted text."""
        # Collapse runs of whitespace into single spaces
        text = re.sub(r'\s+', ' ', text)
        # Strip null bytes sometimes left behind by PDF text extraction
        text = text.replace('\x00', '')
        return text.strip()
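
    # For instance, _clean_text("Step 1:\n\n  Click Save\x00") -> "Step 1: Click Save"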

    def chunk_text_with_images(
        self,
        text: str,
        image_urls: List[str],
        page_number: int,
    ) -> List[MultimodalChunk]:
        """
        Split text into chunks and associate images with the relevant chunks.

        Args:
            text: Text to chunk
            image_urls: Image URLs from the page
            page_number: Page number

        Returns:
            List of MultimodalChunk objects
        """
        words = text.split()

        # Very short pages become a single chunk (or none when empty)
        if len(words) < self.min_chunk_size:
            if len(words) > 0:
                return [MultimodalChunk(
                    text=text,
                    page_number=page_number,
                    chunk_index=0,
                    image_urls=image_urls,
                    metadata={'page': page_number, 'chunk': 0},
                )]
            return []

        chunks = []
        chunk_index = 0
        start = 0

        # Rough per-chunk image quota, used when a URL cannot be matched
        # to any specific chunk's text
        if image_urls:
            images_per_chunk = len(image_urls) // max(1, len(words) // self.chunk_size)
        else:
            images_per_chunk = 0
        image_index = 0

        while start < len(words):
            end = min(start + self.chunk_size, len(words))
            chunk_words = words[start:end]
            chunk_text = ' '.join(chunk_words)

            # Prefer URLs that literally appear in this chunk's text
            chunk_images = []
            if image_urls:
                for url in image_urls:
                    if url in chunk_text:
                        chunk_images.append(url)

                # Otherwise hand out the page's remaining images in order
                if not chunk_images and image_index < len(image_urls):
                    num_imgs = min(images_per_chunk + 1, len(image_urls) - image_index)
                    chunk_images = image_urls[image_index:image_index + num_imgs]
                    image_index += num_imgs

            chunks.append(MultimodalChunk(
                text=chunk_text,
                page_number=page_number,
                chunk_index=chunk_index,
                image_urls=chunk_images,
                metadata={
                    'page': page_number,
                    'chunk': chunk_index,
                    'start_word': start,
                    'end_word': end,
                    'has_images': len(chunk_images) > 0,
                    'num_images': len(chunk_images),
                },
            ))

            chunk_index += 1
            # Slide the window back by the overlap
            start = end - self.chunk_overlap

            # Stop once the remaining tail is smaller than a minimum chunk
            if start >= len(words) - self.min_chunk_size:
                break

        return chunks
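
    # Worked example of the window arithmetic: with the defaults
    # (chunk_size=500, chunk_overlap=50, min_chunk_size=50), a 1000-word page
    # yields three chunks covering words [0, 500), [450, 950), and [900, 1000);
    # the loop then stops because start (950) has reached
    # len(words) - min_chunk_size.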

    def parse_pdf(
        self,
        pdf_path: str,
        document_metadata: Optional[Dict] = None,
    ) -> List[MultimodalChunk]:
        """
        Parse a PDF into multimodal chunks.

        Args:
            pdf_path: Path to the PDF file
            document_metadata: Additional metadata merged into every chunk

        Returns:
            List of MultimodalChunk objects
        """
        pages_data = self.extract_text_from_pdf(pdf_path)

        all_chunks = []
        for page_num, (text, image_urls) in pages_data.items():
            chunks = self.chunk_text_with_images(text, image_urls, page_num)

            # Merge document-level metadata into each chunk
            if document_metadata:
                for chunk in chunks:
                    chunk.metadata.update(document_metadata)

            all_chunks.extend(chunks)

        return all_chunks

    def parse_pdf_bytes(
        self,
        pdf_bytes: bytes,
        document_metadata: Optional[Dict] = None,
    ) -> List[MultimodalChunk]:
        """Parse a PDF supplied as raw bytes, via a temporary file."""
        import os
        import tempfile

        # Write the bytes to a temporary file and parse that
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            tmp.write(pdf_bytes)
            tmp_path = tmp.name

        try:
            return self.parse_pdf(tmp_path, document_metadata)
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
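
# Usage sketch for the parser (illustrative; "guide.pdf" and the metadata
# values are hypothetical):
#
#     parser = MultimodalPDFParser(chunk_size=500, chunk_overlap=50)
#     chunks = parser.parse_pdf("guide.pdf", {"title": "User Guide"})
#     for chunk in chunks:
#         print(chunk.page_number, chunk.chunk_index, chunk.image_urls)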


class MultimodalPDFIndexer:
    """Index multimodal PDF chunks into a RAG system."""

    def __init__(self, embedding_service, qdrant_service, documents_collection):
        self.embedding_service = embedding_service
        self.qdrant_service = qdrant_service
        self.documents_collection = documents_collection
        self.parser = MultimodalPDFParser()

    def index_pdf(
        self,
        pdf_path: str,
        document_id: str,
        document_metadata: Optional[Dict] = None,
    ) -> Dict:
        """Index a PDF, keeping any image URLs found alongside each chunk."""
        chunks = self.parser.parse_pdf(pdf_path, document_metadata)

        indexed_count = 0
        chunk_ids = []
        total_images = 0

        for chunk in chunks:
            chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}"

            # Embed the chunk text
            embedding = self.embedding_service.encode_text(chunk.text)

            # Build the payload stored alongside the vector
            metadata = {
                'text': chunk.text,
                'document_id': document_id,
                'page': chunk.page_number,
                'chunk_index': chunk.chunk_index,
                'source': 'pdf',
                'has_images': len(chunk.image_urls) > 0,
                'image_urls': chunk.image_urls,
                'num_images': len(chunk.image_urls),
                **chunk.metadata,
            }

            # Index into the vector store
            self.qdrant_service.index_data(
                doc_id=chunk_id,
                embedding=embedding,
                metadata=metadata,
            )

            chunk_ids.append(chunk_id)
            indexed_count += 1
            total_images += len(chunk.image_urls)

        # Record the document in the documents collection
        doc_info = {
            'document_id': document_id,
            'type': 'multimodal_pdf',
            'file_path': pdf_path,
            'num_chunks': indexed_count,
            'total_images': total_images,
            'chunk_ids': chunk_ids,
            'metadata': document_metadata or {},
        }
        self.documents_collection.insert_one(doc_info)

        return {
            'success': True,
            'document_id': document_id,
            'chunks_indexed': indexed_count,
            'images_found': total_images,
            'chunk_ids': chunk_ids[:5],  # sample only, to keep the response small
        }

    def index_pdf_bytes(
        self,
        pdf_bytes: bytes,
        document_id: str,
        filename: str,
        document_metadata: Optional[Dict] = None,
    ) -> Dict:
        """Index a PDF supplied as raw bytes."""
        doc_metadata = document_metadata or {}
        doc_metadata['filename'] = filename

        chunks = self.parser.parse_pdf_bytes(pdf_bytes, doc_metadata)

        indexed_count = 0
        chunk_ids = []
        total_images = 0

        for chunk in chunks:
            chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}"

            embedding = self.embedding_service.encode_text(chunk.text)

            # Use a separate name for the per-chunk payload so it does not
            # shadow the document-level metadata stored in doc_info below
            payload = {
                'text': chunk.text,
                'document_id': document_id,
                'page': chunk.page_number,
                'chunk_index': chunk.chunk_index,
                'source': 'multimodal_pdf',
                'filename': filename,
                'has_images': len(chunk.image_urls) > 0,
                'image_urls': chunk.image_urls,
                'num_images': len(chunk.image_urls),
                **chunk.metadata,
            }

            self.qdrant_service.index_data(
                doc_id=chunk_id,
                embedding=embedding,
                metadata=payload,
            )

            chunk_ids.append(chunk_id)
            indexed_count += 1
            total_images += len(chunk.image_urls)

        doc_info = {
            'document_id': document_id,
            'type': 'multimodal_pdf',
            'filename': filename,
            'num_chunks': indexed_count,
            'total_images': total_images,
            'chunk_ids': chunk_ids,
            'metadata': doc_metadata,
        }
        self.documents_collection.insert_one(doc_info)

        return {
            'success': True,
            'document_id': document_id,
            'filename': filename,
            'chunks_indexed': indexed_count,
            'images_found': total_images,
            'chunk_ids': chunk_ids[:5],
        }
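

# Minimal end-to-end sketch. The embedding, vector-store, and document-store
# services are application-specific; the stubs below are illustrative
# stand-ins, not the real interfaces, and "guide.pdf" is a hypothetical file.
if __name__ == "__main__":
    class _StubEmbedder:
        def encode_text(self, text):
            return [0.0] * 8  # placeholder vector

    class _StubVectorStore:
        def index_data(self, doc_id, embedding, metadata):
            print(f"indexed {doc_id} ({metadata['num_images']} images)")

    class _StubDocStore:
        def insert_one(self, doc):
            print(f"recorded document {doc['document_id']}")

    indexer = MultimodalPDFIndexer(_StubEmbedder(), _StubVectorStore(), _StubDocStore())
    result = indexer.index_pdf("guide.pdf", document_id="guide-001")
    print(result["chunks_indexed"], "chunks,", result["images_found"], "images")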