Spaces:
Sleeping
Sleeping
File size: 4,314 Bytes
9bcdecb b6538da 1f42ce9 0a09255 9bcdecb b6538da 0a09255 1f42ce9 0a09255 1f42ce9 0a09255 1f42ce9 0a09255 1f42ce9 0a09255 1f42ce9 0a09255 1f42ce9 0a09255 1f42ce9 0a09255 178bba5 fcf0972 0a09255 fcf0972 b6538da 9bcdecb b6538da 9bcdecb 0a09255 b6538da 9bcdecb 0a09255 989a4f7 0a09255 989a4f7 0a09255 989a4f7 05291b5 989a4f7 0a09255 a3da674 0a09255 a3da674 9bcdecb 0a09255 4d88a04 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 |
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import base64
from io import BytesIO
import os
# -----------------------------
# Load model and processor once
# -----------------------------
# Single checkpoint id shared by the processor and the model so the two
# cannot drift apart if the checkpoint is ever changed.
_MODEL_ID = "ds4sd/SmolDocling-256M-preview"

# Both objects are created once at import time and reused by every request.
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(_MODEL_ID)
# -----------------------------
# Image conversion helper
# -----------------------------
def convert_to_pil(image_input):
"""
Convert base64, dict, or file path to PIL.Image.
Handles:
- "data:image/png;base64,...."
- plain base64
- {"type": "image", "data": "..."}
- file path
"""
# Case 1: dict input (Perplexity/Claude format)
if isinstance(image_input, dict) and "data" in image_input:
image_input = image_input["data"]
# Case 2: base64 string with prefix
if isinstance(image_input, str) and image_input.startswith("data:image"):
base64_str = image_input.split(",", 1)[1]
image_data = base64.b64decode(base64_str)
return Image.open(BytesIO(image_data))
# Case 3: plain base64 string (no prefix)
if isinstance(image_input, str) and "," in image_input and len(image_input) > 100:
try:
image_data = base64.b64decode(image_input)
return Image.open(BytesIO(image_data))
except Exception:
pass
# Case 4: local file path
if isinstance(image_input, str) and os.path.exists(image_input):
return Image.open(image_input)
raise ValueError("Could not convert image input to PIL.Image")
# -----------------------------
# Core function
# -----------------------------
def smoldocling_readimage(image: Image.Image, prompt_text: str) -> str:
    """
    Generate text for one image with the module-level SmolDocling model.

    A single user turn (image placeholder + ``prompt_text``) is rendered
    through the processor's chat template, the model generates up to 1024 new
    tokens, and the decoded continuation is returned with the
    "<end_of_utterance>" marker removed and surrounding whitespace stripped.
    """
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt_text},
        ],
    }
    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)
    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")

    token_ids = model.generate(**model_inputs, max_new_tokens=1024)

    # Keep only the positions after the prompt; the leading positions are
    # presumably the echoed input tokens.
    n_prompt_tokens = model_inputs.input_ids.shape[1]
    continuation = token_ids[:, n_prompt_tokens:]

    # Special tokens are kept in the decoded text; only the end-of-utterance
    # marker is removed explicitly.
    decoded = processor.batch_decode(continuation, skip_special_tokens=False)
    return decoded[0].replace("<end_of_utterance>", "").strip()
# -----------------------------
# Wrapper for MCP schema compatibility
# -----------------------------
def smoldocling_entry(image: str, prompt_text: str) -> str:
    """
    Entry point for the SmolDocling MCP tool.

    Expected input formats:
    - **Base64 string**: "data:image/png;base64,...."
    - **Object** (Perplexity/Claude style): {"type": "image", "data": "data:image/png;base64,..."}
    - **Local file path** (for internal testing)

    Parameters
    ----------
    image : str
        A base64-encoded image string (with or without data: prefix) OR
        a JSON-encoded object containing image data.
    prompt_text : str
        Instruction text for how to process the document (e.g., "Convert this page to docling.")

    Returns
    -------
    str
        Structured or textual content extracted from the image.

    Raises
    ------
    ValueError
        Propagated from ``convert_to_pil`` when the input cannot be turned
        into an image.
    """
    import json

    # Log only a bounded preview: the payload can be megabytes of base64,
    # which would otherwise flood the logs on every call.
    max_preview = 80
    shown = image if isinstance(image, str) else repr(image)
    if len(shown) > max_preview:
        shown = f"{shown[:max_preview]}... ({len(shown)} chars)"
    print(f"Received entry: {shown} prompt: {prompt_text}")

    # Handle Perplexity-style dicts encoded as JSON strings. Anything that is
    # not valid JSON is simply passed through unchanged.
    if isinstance(image, str):
        try:
            maybe_json = json.loads(image)
        except (ValueError, RecursionError):
            # JSONDecodeError is a ValueError; RecursionError guards against
            # pathologically nested input.
            maybe_json = None
        if isinstance(maybe_json, dict) and "data" in maybe_json:
            image = maybe_json

    pil_image = convert_to_pil(image)
    return smoldocling_readimage(pil_image, prompt_text)
# -----------------------------
# Gradio MCP App (Headless)
# -----------------------------
# Text shown on the (otherwise headless) landing page.
_TOOL_DESCRIPTION = """
### 📄 SmolDocling MCP Tool
This is a **headless MCP tool** for document image conversion.
It supports input as:
- Base64-encoded images
- Perplexity/Claude `{"type": "image", "data": "..."}` objects
- Local file paths (for testing)
"""

with gr.Blocks() as demo:
    gr.Markdown(_TOOL_DESCRIPTION)
    # Register the entry point as an API endpoint so it is exposed as a tool.
    gr.api(smoldocling_entry)

# Start the app with the MCP server enabled; only the second element of the
# returned triple (the URL) is used.
# NOTE(review): launch() may block until the server stops when run as a
# script, in which case the message below is only printed at shutdown --
# confirm against the Gradio documentation.
_app, url, _share_url = demo.launch(mcp_server=True)
print(f"MCP Server running at: {url}")