my-smoldocling-demo

Sleeping

App Files Community

bharatcoder committed on Oct 25

Commit

1f42ce9

verified ·

1 Parent(s): 2c602f3

Update app.py

Browse files

Files changed (1) hide show

app.py +57 -22

app.py CHANGED Viewed

@@ -1,27 +1,60 @@
 import gradio as gr
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from PIL import Image
-import PIL.Image
 # Load model & processor once at startup
 processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
 model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
-def smoldocling_readimage(image: PIL.Image.Image, prompt_text: str) -> str:
     """
     Extract text and structured content from document images using SmolDocling model.
-    This function processes document images (PDFs, scanned documents, screenshots, etc.)
-    and converts them to structured text format based on the provided prompt. It uses
-    the SmolDocling-256M-preview model for image-to-text conversion with chat-based
-    prompting.
     Args:
-        image (PIL.Image.Image): The input document image to process. Should be a PIL
-            Image object containing a document, text, or any visual content that needs
-            to be converted to text.
-        prompt_text (str): The instruction or prompt text that guides the model's
-            output format. Supported prompts include:
             Content Conversion:
             - "Convert this page to docling." - Full conversion to DocTags representation
@@ -31,28 +64,30 @@ def smoldocling_readimage(image: PIL.Image.Image, prompt_text: str) -> str:
             - "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)
             OCR and Location-based Actions:
-            - "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>" - Extract text from specific coordinates
-            - "Identify element at: <loc_247><loc_482><loc_252><loc_486>" - Identify element type at coordinates
-            - "Find all 'text' elements on the page, retrieve all section headers." - Extract section headers
             - "Detect footer elements on the page." - Identify footer content
     Returns:
-        str: The extracted and formatted text content from the image, cleaned of
-            special tokens and whitespace. The format depends on the prompt_text
-            provided.
     Example:
-        >>> from PIL import Image
-        >>> img = Image.open("document.pdf")
-        >>> result = smoldocling_readimage(img, "Convert to docling")
         >>> print(result)  # Returns structured document content
     Note:
-        - The function is optimized for document images but can handle any image
-          containing text
         - Processing time depends on image size and complexity
         - Maximum output length is limited to 1024 new tokens
     """
     messages = [
         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
     ]

 import gradio as gr
 from transformers import AutoProcessor, AutoModelForImageTextToText
 from PIL import Image
+import base64
+from io import BytesIO
+import os
 # Load model & processor once at startup
 processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
 model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
+def convert_to_pil(image_input: str) -> Image.Image:
+    """
+    Convert a data URI, a raw base64 string, or a file path to PIL.Image.
+    The input is resolved in this order: a ``data:image`` URI, then an
+    existing file path, then raw base64 without a prefix.
+    Args:
+        image_input: Data URI, base64 encoded string, or file path
+    Returns:
+        PIL.Image.Image object
+    Raises:
+        ValueError: If a data URI has no ',' before its payload, or if the
+            input is neither an existing path nor base64 that decodes to
+            an image.
+    """
+    # Data URI, e.g. "data:image/jpeg;base64,<payload>"
+    if image_input.startswith('data:image'):
+        _header, separator, base64_str = image_input.partition(',')
+        if not separator:
+            # Without the comma there is no payload to decode; report it
+            # as a ValueError instead of leaking an IndexError.
+            raise ValueError(
+                "Could not convert image input to PIL.Image: "
+                "data URI has no ',' before the payload"
+            )
+        return Image.open(BytesIO(base64.b64decode(base64_str)))
+    # NOTE(review): any existing path is opened as-is; confirm that callers
+    # supplying this string are trusted to read local files.
+    if os.path.exists(image_input):
+        return Image.open(image_input)
+    # Raw base64 without a prefix. Base64 never contains ',', so instead of
+    # looking for a comma the payload is validated by decoding it strictly.
+    # Whitespace is dropped first so line-wrapped base64 is still accepted.
+    try:
+        image_data = base64.b64decode("".join(image_input.split()), validate=True)
+        return Image.open(BytesIO(image_data))
+    except (ValueError, OSError) as err:
+        # ValueError covers malformed base64 (binascii.Error subclasses it);
+        # OSError covers bytes Pillow cannot identify as an image
+        # (UnidentifiedImageError subclasses OSError in current Pillow).
+        raise ValueError(
+            "Could not convert image input to PIL.Image: expected a data URI, "
+            "a base64 string, or an existing file path "
+            f"(got {len(image_input)} characters)"
+        ) from err
+def smoldocling_readimage(image: str, prompt_text: str) -> str:
     """
     Extract text and structured content from document images using SmolDocling model.
+    This function processes document images (PDFs, scanned documents, screenshots, etc.)
+    and converts them to structured text format based on the provided prompt. It uses
+    the SmolDocling-256M-preview model for image-to-text conversion with chat-based prompting.
     Args:
+        image (str): The input document image as base64 encoded string or file path.
+            MCP clients will send this as base64.
+        prompt_text (str): The instruction or prompt text that guides the model's output format.
+            Supported prompts include:
             Content Conversion:
             - "Convert this page to docling." - Full conversion to DocTags representation
             - "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)
             OCR and Location-based Actions:
+            - "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>"
+              - Extract text from specific coordinates
+            - "Identify element at: <loc_247><loc_482><loc_252><loc_486>"
+              - Identify element type at coordinates
+            - "Find all 'text' elements on the page, retrieve all section headers."
+              - Extract section headers
             - "Detect footer elements on the page." - Identify footer content
     Returns:
+        str: The extracted and formatted text content from the image, cleaned of special
+            tokens and whitespace. The format depends on the prompt_text provided.
     Example:
+        >>> result = smoldocling_readimage("data:image/jpeg;base64,/9j/4AAQ...", "Convert to docling")
         >>> print(result)  # Returns structured document content
     Note:
+        - The function is optimized for document images but can handle any image containing text
         - Processing time depends on image size and complexity
         - Maximum output length is limited to 1024 new tokens
     """
+    # Convert string input (base64 or path) to PIL.Image
+    pil_image = convert_to_pil(image)
     messages = [
         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
     ]