File size: 4,314 Bytes
9bcdecb
b6538da
 
1f42ce9
 
 
 
0a09255
 
 
9bcdecb
 
b6538da
0a09255
 
 
 
1f42ce9
0a09255
 
 
 
 
 
1f42ce9
0a09255
 
 
 
 
 
 
1f42ce9
 
0a09255
 
 
1f42ce9
 
 
0a09255
1f42ce9
0a09255
 
 
1f42ce9
 
0a09255
1f42ce9
0a09255
 
 
178bba5
fcf0972
0a09255
fcf0972
b6538da
9bcdecb
b6538da
 
 
9bcdecb
0a09255
b6538da
9bcdecb
 
 
 
0a09255
 
 
989a4f7
0a09255
989a4f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0a09255
989a4f7
05291b5
989a4f7
 
 
 
 
 
 
 
0a09255
 
 
 
 
 
a3da674
 
 
0a09255
 
 
 
 
 
a3da674
 
9bcdecb
0a09255
 
 
 
 
4d88a04
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import base64
from io import BytesIO
import os

# -----------------------------
#  Load model and processor once
# -----------------------------
# The processor and the model are both loaded from the same checkpoint id,
# so the id is spelled once and shared.
_CHECKPOINT = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(_CHECKPOINT)
model = AutoModelForImageTextToText.from_pretrained(_CHECKPOINT)

# -----------------------------
#  Image conversion helper
# -----------------------------
def convert_to_pil(image_input):
    """
    Convert a base64 payload, dict wrapper, or file path to a PIL.Image.

    Handles:
      - "data:image/png;base64,...." (data URI)
      - plain base64 (no prefix)
      - {"type": "image", "data": "..."} (the "data" value is unwrapped first)
      - file path

    Parameters
    ----------
    image_input : str or dict
        The image in one of the formats listed above.

    Returns
    -------
    PIL.Image.Image
        The image returned by ``Image.open``.

    Raises
    ------
    ValueError
        If the input is not a string (after unwrapping a dict), if a data URI
        has no "," separating the header from its payload, if the data URI
        payload is not valid base64, or if the input matches none of the
        supported formats.
    """
    # Case 1: dict input (Perplexity/Claude format) -> keep only the payload
    if isinstance(image_input, dict) and "data" in image_input:
        image_input = image_input["data"]

    # Every remaining case needs a string; reject anything else up front.
    if not isinstance(image_input, str):
        raise ValueError("Could not convert image input to PIL.Image")

    # Case 2: base64 string with a data-URI prefix
    if image_input.startswith("data:image"):
        _header, separator, payload = image_input.partition(",")
        if not separator:
            # Without the comma there is no payload to decode; report it as a
            # ValueError instead of letting an IndexError escape.
            raise ValueError("Malformed data URI: missing ',' before the base64 payload")
        return Image.open(BytesIO(base64.b64decode(payload)))

    # Case 3: plain base64 string (no prefix). Base64 text never contains a
    # comma, so only the length heuristic is used to tell it from a short path.
    if len(image_input) > 100:
        try:
            # validate=True rejects characters outside the base64 alphabet
            # (e.g. the "." in a long file path) instead of discarding them.
            image_data = base64.b64decode(image_input, validate=True)
            return Image.open(BytesIO(image_data))
        except (ValueError, OSError):
            # Not base64 (binascii.Error is a ValueError) or not an image
            # (Pillow's UnidentifiedImageError is an OSError): fall through
            # and try the string as a file path.
            pass

    # Case 4: local file path
    if os.path.exists(image_input):
        return Image.open(image_input)

    raise ValueError("Could not convert image input to PIL.Image")

# -----------------------------
#  Core function
# -----------------------------
def smoldocling_readimage(image: Image.Image, prompt_text: str) -> str:
    """
    Produce SmolDocling's text output for one image and one instruction.

    Builds a single user turn holding an image placeholder plus `prompt_text`,
    renders it with the processor's chat template, generates up to 1024 new
    tokens with the module-level `model`, and returns the decoded text with
    the "<end_of_utterance>" marker removed and surrounding whitespace
    stripped.
    """
    user_turn = {
        "role": "user",
        "content": [{"type": "image"}, {"type": "text", "text": prompt_text}],
    }
    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)
    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")
    token_ids = model.generate(**model_inputs, max_new_tokens=1024)

    # Drop the first `n_prompt_tokens` columns so only the tokens after the
    # prompt are decoded (assumes generate() echoes the prompt ids first --
    # confirm against the transformers docs).
    n_prompt_tokens = model_inputs.input_ids.shape[1]
    continuation = token_ids[:, n_prompt_tokens:]

    # Special tokens are kept during decoding; only the end-of-utterance
    # marker is removed from the final text.
    decoded = processor.batch_decode(continuation, skip_special_tokens=False)
    return decoded[0].replace("<end_of_utterance>", "").strip()

# -----------------------------
#  Wrapper for MCP schema compatibility
# -----------------------------
def smoldocling_entry(image: str, prompt_text: str) -> str:
    """
    Entry point for the SmolDocling MCP tool.

    Expected input formats:
    - **Base64 string**: "data:image/png;base64,...."
    - **Object** (Perplexity/Claude style): {"type": "image", "data": "data:image/png;base64,..."}
    - **Local file path** (for internal testing)

    Parameters
    ----------
    image : str
        A base64-encoded image string (with or without data: prefix) OR
        a JSON-encoded object containing image data.
    prompt_text : str
        Instruction text for how to process the document (e.g., "Convert this page to docling.")

    Returns
    -------
    str
        Structured or textual content extracted from the image.
    """
    import json

    # Log only a short preview: the payload is typically a large base64 blob,
    # and printing it in full on every call floods the server log.
    shown = str(image)
    if len(shown) > 80:
        shown = f"{shown[:80]}... ({len(shown)} chars)"
    print(f"Received entry: {shown} prompt: {prompt_text}")

    # Handle Perplexity-style dicts encoded as JSON strings. Parsing is
    # best-effort: anything that is not JSON is passed through unchanged.
    try:
        maybe_json = json.loads(image)
    except (TypeError, ValueError, RecursionError):
        # TypeError: not str/bytes; ValueError: not JSON (JSONDecodeError is
        # a ValueError); RecursionError: pathologically nested input.
        maybe_json = None
    if isinstance(maybe_json, dict) and "data" in maybe_json:
        image = maybe_json

    pil_image = convert_to_pil(image)
    return smoldocling_readimage(pil_image, prompt_text)

# -----------------------------
#  Gradio MCP App (Headless)
# -----------------------------
with gr.Blocks() as demo:
    # Landing text rendered on the app page; it describes the accepted inputs.
    landing_text = """
        ### 📄 SmolDocling MCP Tool
        This is a **headless MCP tool** for document image conversion.
        It supports input as:
        - Base64-encoded images
        - Perplexity/Claude `{"type": "image", "data": "..."}` objects
        - Local file paths (for testing)
        """
    gr.Markdown(landing_text)

    # Register the entry point as an API endpoint (no UI components attached)
    gr.api(smoldocling_entry)

# Launch with MCP server mode enabled; the second item of the returned
# triple is the URL reported below.
launch_info = demo.launch(mcp_server=True)
_, url, _ = launch_info
print(f"MCP Server running at: {url}")