Spaces:
Sleeping
Sleeping
| from docling.document_converter import DocumentConverter, PdfFormatOption | |
| from docling.datamodel.base_models import InputFormat | |
| from docling.datamodel.pipeline_options import PdfPipelineOptions | |
| from docling_core.transforms.chunker.hybrid_chunker import HybridChunker | |
| from docling_core.types.doc.document import TableItem | |
| from docling_core.types.doc.labels import DocItemLabel | |
| from langchain_core.documents import Document | |
| from PIL import Image | |
| import base64 | |
| import io | |
| import itertools | |
| import os | |
def process_pdf(file_path, embeddings_tokenizer, vision_model):
    """
    Process a PDF file and extract text, tables, and images with descriptions.

    Args:
        file_path (str): Path to the PDF file (docling also accepts URLs).
        embeddings_tokenizer: Tokenizer used by HybridChunker to size text
            chunks for embedding.
        vision_model: Multimodal model exposing ``generate_content`` (e.g. a
            Gemini model) used to describe extracted images.

    Returns:
        tuple: (text_chunks, table_chunks, image_descriptions) — three lists
        of ``langchain_core.documents.Document``, each carrying ``doc_id``,
        ``source`` and ``ref`` metadata. ``doc_id`` values are unique across
        all three lists (texts first, then tables, then images).
    """
    # Step 1: configure docling to OCR scanned pages and keep rendered
    # picture images so the vision model can describe them later.
    pdf_pipeline_options = PdfPipelineOptions(
        do_ocr=True,
        generate_picture_images=True,
    )
    # Step 2: link the PDF input format to those pipeline options.
    format_options = {
        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
    }
    # Step 3: build the converter.
    converter = DocumentConverter(format_options=format_options)

    # Steps 4-5: convert each source (file path or URL) to a structured
    # docling document, keyed by its source string.
    sources = [file_path]
    conversions = {
        source: converter.convert(source=source).document for source in sources
    }

    # Assign doc_ids in three consecutive, non-overlapping ranges.
    # NOTE: the previous implementation reused the last table's doc_id for
    # the first picture (post-increment after seeding with the table count);
    # starting pictures at len(texts) + len(tables) + 1 fixes that collision.
    texts = _chunk_texts(conversions, embeddings_tokenizer, next_doc_id=1)
    tables = _extract_tables(conversions, next_doc_id=len(texts) + 1)
    pictures = _describe_pictures(
        conversions,
        vision_model,
        next_doc_id=len(texts) + len(tables) + 1,
    )
    return texts, tables, pictures


def _chunk_texts(conversions, embeddings_tokenizer, next_doc_id):
    """Chunk body text of each converted document into LangChain Documents."""
    texts = []
    for source, docling_document in conversions.items():
        chunker = HybridChunker(tokenizer=embeddings_tokenizer)
        for chunk in chunker.chunk(docling_document):
            items = chunk.meta.doc_items
            # Skip chunks that are just a table: tables are exported
            # separately (with markdown structure) by _extract_tables.
            if len(items) == 1 and isinstance(items[0], TableItem):
                continue
            # Concatenate the internal cross-references of all items
            # contributing to this chunk.
            refs = "".join(item.get_ref().cref for item in items)
            texts.append(
                Document(
                    page_content=chunk.text,
                    metadata={
                        "doc_id": next_doc_id,
                        "source": source,
                        "ref": refs,
                    },
                )
            )
            next_doc_id += 1
    return texts


def _extract_tables(conversions, next_doc_id):
    """Export each table as a markdown LangChain Document."""
    tables = []
    for source, docling_document in conversions.items():
        for table in docling_document.tables:
            if table.label != DocItemLabel.TABLE:
                continue
            tables.append(
                Document(
                    page_content=table.export_to_markdown(),
                    metadata={
                        "doc_id": next_doc_id,
                        "source": source,
                        "ref": table.get_ref().cref,
                    },
                )
            )
            next_doc_id += 1
    return tables


def _describe_pictures(conversions, vision_model, next_doc_id):
    """Describe each extracted picture with the vision model (best effort)."""
    pictures = []
    for source, docling_document in conversions.items():
        for picture in docling_document.pictures:
            ref = picture.get_ref().cref
            image = picture.get_image(docling_document)
            if not image:
                continue
            try:
                # Ask the vision model for OCR-style text plus a description.
                response = vision_model.generate_content([
                    "Extract all text and describe key visual elements in this image. "
                    "Include any numbers, labels, or important details.",
                    image,
                ])
                pictures.append(
                    Document(
                        page_content=response.text,
                        metadata={
                            "doc_id": next_doc_id,
                            "source": source,
                            "ref": ref,
                        },
                    )
                )
                next_doc_id += 1
            except Exception as e:
                # Best-effort: a single failing image should not abort the
                # whole PDF; report and continue with the next picture.
                print(f"Error processing image {ref}: {str(e)}")
    return pictures