my-smoldocling-demo

Sleeping

App Files Files Community

bharatcoder committed on Oct 25

Commit

0a09255

verified ·

1 Parent(s): 178bba5

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -72

app.py CHANGED Viewed

@@ -5,109 +5,99 @@ import base64
 from io import BytesIO
 import os
-# Load model & processor once at startup
 processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
 model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
-def convert_to_pil(image_input: str) -> Image.Image:
     """
-    Convert base64 or file path string to PIL.Image.
-    Args:
-        image_input: Base64 encoded string or file path
-    Returns:
-        PIL.Image.Image object
     """
-    # Check if it's a base64 string
-    if image_input.startswith('data:image'):
-        # Remove data:image/jpeg;base64, prefix
-        base64_str = image_input.split(',', 1)[1]
         image_data = base64.b64decode(base64_str)
         return Image.open(BytesIO(image_data))
-    elif ',' in image_input and len(image_input) > 100:
-        # Might be base64 without prefix
         try:
             image_data = base64.b64decode(image_input)
             return Image.open(BytesIO(image_data))
-        except:
             pass
-    # Assume it's a file path
-    if os.path.exists(image_input):
         return Image.open(image_input)
-    raise ValueError(f"Could not convert image input to PIL.Image: {type(image_input)}")
 def smoldocling_readimage(image: Image.Image, prompt_text: str) -> str:
     """
-    Extract text and structured content from document images using SmolDocling model.
-    This function processes document images (PDFs, scanned documents, screenshots, etc.)
-    and converts them to structured text format based on the provided prompt. It uses
-    the SmolDocling-256M-preview model for image-to-text conversion with chat-based prompting.
-    Args:
-        image (Image.Image): The input document image
-        prompt_text (str): The instruction or prompt text that guides the model's output format.
-            Supported prompts include:
-            Content Conversion:
-            - "Convert this page to docling." - Full conversion to DocTags representation
-            - "Convert chart to table." - Convert charts to table format
-            - "Convert formula to LaTeX." - Convert mathematical formulas to LaTeX
-            - "Convert code to text." - Convert code blocks to readable text
-            - "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)
-            OCR and Location-based Actions:
-            - "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>"
-              - Extract text from specific coordinates
-            - "Identify element at: <loc_247><loc_482><loc_252><loc_486>"
-              - Identify element type at coordinates
-            - "Find all 'text' elements on the page, retrieve all section headers."
-              - Extract section headers
-            - "Detect footer elements on the page." - Identify footer content
-    Returns:
-        str: The extracted and formatted text content from the image, cleaned of special
-            tokens and whitespace. The format depends on the prompt_text provided.
-    Example:
-        >>> result = smoldocling_readimage("data:image/jpeg;base64,/9j/4AAQ...", "Convert to docling")
-        >>> print(result)  # Returns structured document content
-    Note:
-        - The function is optimized for document images but can handle any image containing text
-        - Processing time depends on image size and complexity
-        - Maximum output length is limited to 1024 new tokens
     """
-    # Convert string input (base64 or path) to PIL.Image
-    # pil_image = convert_to_pil(image)
     messages = [
         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
     ]
     prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
     inputs = processor(text=prompt, images=[image], return_tensors="pt")
     outputs = model.generate(**inputs, max_new_tokens=1024)
     prompt_length = inputs.input_ids.shape[1]
     generated = outputs[:, prompt_length:]
     result = processor.batch_decode(generated, skip_special_tokens=False)[0]
     return result.replace("<end_of_utterance>", "").strip()
-# Gradio UI
 with gr.Blocks() as demo:
     gr.Markdown(
         """
-        This is a MCP only tool for conversion using smoldocling
-        This tool is MCP-only, so it does not have a UI.
         """
     )
-    gr.api(
-        smoldocling_readimage
-    )
-_, url, _ = demo.launch(mcp_server=True)

 from io import BytesIO
 import os
# -----------------------------
#  Load model and processor once
# -----------------------------
# Both objects are built at module import from the same checkpoint id and are
# kept as module-level globals, so smoldocling_readimage reuses them on every
# call instead of reloading the weights.
processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
# -----------------------------
#  Image conversion helper
# -----------------------------
def convert_to_pil(image_input):
    """
    Convert a base64 string, image dict, or file path to a PIL.Image.

    Handles:
      - "data:image/png;base64,...."      (data URI)
      - plain base64                      (no prefix; embedded whitespace such
                                           as line wrapping is ignored)
      - {"type": "image", "data": "..."}  (the "data" value is unwrapped and
                                           then handled as one of the string
                                           forms)
      - file path                         (must exist on the local filesystem)

    Args:
        image_input: The image in one of the forms listed above.

    Returns:
        The object returned by ``Image.open`` for the decoded bytes or the file.

    Raises:
        ValueError: If the input is not a string after unwrapping a dict, is a
            data URI with no "," before its payload, or matches none of the
            supported forms. A data URI whose payload cannot be base64-decoded
            also surfaces as ValueError (``binascii.Error`` subclasses it).
    """
    failure = "Could not convert image input to PIL.Image"

    # Case 1: dict input -- unwrap the string stored under "data".
    if isinstance(image_input, dict) and "data" in image_input:
        image_input = image_input["data"]

    # Every remaining form is a string; reject anything else once, up front,
    # instead of repeating the isinstance check in each branch.
    if not isinstance(image_input, str):
        raise ValueError(failure)

    # Case 2: base64 string with a data-URI prefix. partition() never raises,
    # so a URI that lacks the "," separator is reported as ValueError rather
    # than leaking an IndexError from split(...)[1].
    if image_input.startswith("data:image"):
        _, separator, payload = image_input.partition(",")
        if not separator:
            raise ValueError(failure)
        return Image.open(BytesIO(base64.b64decode(payload)))

    # Case 3: plain base64 string (no prefix). The base64 alphabet has no ",",
    # so the input must not be required to contain one. Whitespace is removed
    # first so wrapped payloads decode; validate=True then rejects anything
    # outside the alphabet, which keeps most ordinary paths out of this branch.
    compact = "".join(image_input.split())
    if compact:
        try:
            image_data = base64.b64decode(compact, validate=True)
            return Image.open(BytesIO(image_data))
        except (ValueError, OSError):
            # Not base64 (binascii.Error is a ValueError) or not image bytes
            # (PIL's UnidentifiedImageError is an OSError): try it as a path.
            pass

    # Case 4: local file path.
    # NOTE(review): if this function is reachable by remote callers, this
    # branch lets them open any image file readable by the process -- confirm
    # that is intended outside of local testing.
    if os.path.exists(image_input):
        return Image.open(image_input)

    raise ValueError(failure)
# -----------------------------
#  Core function
# -----------------------------
def smoldocling_readimage(image: Image.Image, prompt_text: str) -> str:
    """
    Generate SmolDocling's text output for one image and one instruction.

    Args:
        image: Image passed to the processor together with the prompt.
        prompt_text: Instruction placed in the user turn of the chat template.

    Returns:
        The decoded continuation with "<end_of_utterance>" removed and the
        surrounding whitespace stripped. Decoding keeps the other special
        tokens (skip_special_tokens=False).
    """
    # One user turn: an image placeholder followed by the instruction text.
    user_turn = {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
    }
    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)
    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")

    # Generation is capped at 1024 new tokens.
    sequences = model.generate(**model_inputs, max_new_tokens=1024)

    # Slice off the first `n_prompt_tokens` columns so only what follows the
    # prompt is decoded.
    n_prompt_tokens = model_inputs.input_ids.shape[1]
    continuation = sequences[:, n_prompt_tokens:]

    decoded = processor.batch_decode(continuation, skip_special_tokens=False)
    return decoded[0].replace("<end_of_utterance>", "").strip()
# -----------------------------
#  Wrapper for MCP schema compatibility
# -----------------------------
def smoldocling_entry(image: str | dict, prompt_text: str) -> str:
    """
    Entry point for the MCP tool: decode the image, then run SmolDocling on it.

    Args:
        image: The document image, given as any of:
            - base64 string (with or without a "data:image/...;base64," prefix)
            - dict {"type": "image", "data": "data:image/png;base64,..."}
            - file path
        prompt_text: Instruction for the model, for example
            "Convert this page to docling.".

    Returns:
        The text returned by smoldocling_readimage for the decoded image.

    Raises:
        ValueError: Propagated from convert_to_pil when the image input cannot
            be converted.
    """
    # `image` carries an annotation because gr.api derives the endpoint schema
    # from the function's type hints rather than from UI components.
    pil_image = convert_to_pil(image)
    return smoldocling_readimage(pil_image, prompt_text)
# -----------------------------
#  Gradio MCP App (Headless)
# -----------------------------
# The Blocks context renders only a short description; the tool itself is
# registered through gr.api instead of through UI components.
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ### 📄 SmolDocling MCP Tool
        This is a **headless MCP tool** for document image conversion.
        It supports input as:
        - Base64-encoded images
        - Perplexity/Claude `{"type": "image", "data": "..."}` objects
        - Local file paths (for testing)
        """
    )
    # Expose MCP tool
    gr.api(smoldocling_entry)
# Launch MCP server mode
# launch() returns a 3-tuple; only the middle element is kept, as `url`.
# NOTE(review): when run as a plain script, launch() appears to block until the
# server stops, so the print below may only run at shutdown -- confirm, and
# pass prevent_thread_lock=True if the message is wanted at startup.
_, url, _ = demo.launch(mcp_server=True)
print(f"✅ MCP Server running at: {url}")