maintenance_chatbot

Sleeping

App Files Files Community

Prathamesh1420 committed on May 10

Commit

d237c98

verified ·

1 Parent(s): 61e6b08

Create utils/document_processing.py

Browse files

Files changed (1) hide show

utils/document_processing.py +129 -0

utils/document_processing.py ADDED Viewed

	@@ -0,0 +1,129 @@

+from docling.document_converter import DocumentConverter, PdfFormatOption
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import PdfPipelineOptions
+from docling_core.transforms.chunker.hybrid_chunker import HybridChunker
+from docling_core.types.doc.document import TableItem
+from docling_core.types.doc.labels import DocItemLabel
+from langchain_core.documents import Document
+from PIL import Image
+import base64
+import io
+import itertools
+import os
+def process_pdf(file_path, embeddings_tokenizer, vision_model):
+    """
+    Process a PDF file and extract text, tables, and images with descriptions.
+    Args:
+        file_path (str): Path to the PDF file
+        embeddings_tokenizer: Tokenizer for chunking text
+        vision_model: Model for processing images
+    Returns:
+        tuple: (text_chunks, table_chunks, image_descriptions)
+    """
+    # Step 1: Define PDF processing options
+    pdf_pipeline_options = PdfPipelineOptions(
+        do_ocr=True,
+        generate_picture_images=True
+    )
+    # Step 2: Link input format to pipeline options
+    format_options = {
+        InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options),
+    }
+    # Step 3: Initialize the converter with format options
+    converter = DocumentConverter(format_options=format_options)
+    # Step 4: List of sources (can be file paths or URLs)
+    sources = [file_path]
+    # Step 5: Convert PDFs to structured documents
+    conversions = {
+        source: converter.convert(source=source).document for source in sources
+    }
+    # Process text chunks
+    doc_id = 0
+    texts = []
+    for source, docling_document in conversions.items():
+        chunker = HybridChunker(tokenizer=embeddings_tokenizer)
+        for chunk in chunker.chunk(docling_document):
+            items = chunk.meta.doc_items
+            # Skip if chunk is just a table
+            if len(items) == 1 and isinstance(items[0], TableItem):
+                continue
+            # Collect references from items
+            refs = "".join(item.get_ref().cref for item in items)
+            text = chunk.text
+            # Store as LangChain document
+            document = Document(
+                page_content=text,
+                metadata={
+                    "doc_id": (doc_id := doc_id + 1),
+                    "source": source,
+                    "ref": refs,
+                }
+            )
+            texts.append(document)
+    # Process tables
+    doc_id = len(texts)
+    tables = []
+    for source, docling_document in conversions.items():
+        for table in docling_document.tables:
+            if table.label == DocItemLabel.TABLE:
+                ref = table.get_ref().cref
+                text = table.export_to_markdown()
+                document = Document(
+                    page_content=text,
+                    metadata={
+                        "doc_id": (doc_id := doc_id + 1),
+                        "source": source,
+                        "ref": ref,
+                    }
+                )
+                tables.append(document)
+    # Process images
+    doc_id = len(texts) + len(tables)
+    pictures = []
+    for source, docling_document in conversions.items():
+        for picture in docling_document.pictures:
+            ref = picture.get_ref().cref
+            image = picture.get_image(docling_document)
+            if image:
+                try:
+                    # Process with Gemini
+                    response = vision_model.generate_content([
+                        "Extract all text and describe key visual elements in this image. "
+                        "Include any numbers, labels, or important details.",
+                        image
+                    ])
+                    # Create a document with the vision model's description
+                    document = Document(
+                        page_content=response.text,
+                        metadata={
+                            "doc_id": doc_id,
+                            "source": source,
+                            "ref": ref,
+                        }
+                    )
+                    pictures.append(document)
+                    doc_id += 1
+                except Exception as e:
+                    print(f"Error processing image {ref}: {str(e)}")
+    return texts, tables, pictures