File size: 4,554 Bytes
9bcdecb
b6538da
 
1f42ce9
 
 
 
b6538da
9bcdecb
 
 
b6538da
1f42ce9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178bba5
fcf0972
 
 
1f42ce9
 
 
fcf0972
 
178bba5
1f42ce9
 
fcf0972
 
 
 
 
 
 
 
 
1f42ce9
 
 
 
 
 
fcf0972
 
 
1f42ce9
 
fcf0972
 
1f42ce9
fcf0972
 
 
1f42ce9
fcf0972
 
 
1f42ce9
178bba5
1f42ce9
b6538da
9bcdecb
b6538da
 
 
9bcdecb
b6538da
9bcdecb
 
 
 
 
a3da674
 
 
e2db174
a3da674
 
 
 
 
 
9bcdecb
a3da674
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import gradio as gr
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import base64
from io import BytesIO
import os


# Load model & processor once at startup (module import time) so every
# request reuses the same instances. Both must come from the same checkpoint,
# so the id is spelled exactly once.
MODEL_ID = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(MODEL_ID)


def convert_to_pil(image_input: str) -> Image.Image:
    """
    Convert a data URI, a file path, or a raw base64 string to a PIL image.

    The input is interpreted in this order:

    1. A ``data:image...`` URI: everything after the first comma is
       base64-decoded and opened as an image.
    2. A path that exists on disk: opened directly.
    3. Anything else: treated as raw base64 (embedded whitespace such as
       line wraps is ignored) and opened as an image.

    An object that is already a ``PIL.Image.Image`` is returned unchanged.

    Args:
        image_input: Data URI, file path, or base64-encoded image bytes.

    Returns:
        The opened ``PIL.Image.Image``.

    Raises:
        ValueError: If the input is neither a string nor an image, if a data
            URI has no comma-separated payload, or if the string is not an
            existing path and cannot be decoded into an image.
    """
    if not isinstance(image_input, str):
        # Already-decoded images pass straight through; anything else is
        # unsupported.
        if isinstance(image_input, Image.Image):
            return image_input
        raise ValueError(
            f"Could not convert image input to PIL.Image: {type(image_input)}"
        )

    if image_input.startswith('data:image'):
        # Drop the "data:image/...;base64," header and keep only the payload.
        _, separator, payload = image_input.partition(',')
        if not separator:
            raise ValueError(
                "Could not convert image input to PIL.Image: "
                "data URI has no ',' separating the header from the payload"
            )
        return Image.open(BytesIO(base64.b64decode(payload)))

    # Check for a real file before guessing at base64, so path strings are
    # never fed to the decoder.
    if os.path.exists(image_input):
        return Image.open(image_input)

    shown = image_input if len(image_input) <= 40 else image_input[:40] + "..."
    try:
        # Base64 is often line-wrapped; remove whitespace, then decode
        # strictly so arbitrary text is rejected instead of half-decoded.
        raw_bytes = base64.b64decode("".join(image_input.split()), validate=True)
        return Image.open(BytesIO(raw_bytes))
    except (ValueError, OSError) as err:
        # binascii.Error is a ValueError; an OSError here means the decoded
        # bytes could not be opened as an image.
        raise ValueError(
            f"Could not convert image input to PIL.Image: {shown!r} is not a "
            "data URI, an existing file path, or base64 image data"
        ) from err


def smoldocling_readimage(image: Image.Image, prompt_text: str) -> str:
    """
    Extract text and structured content from document images using SmolDocling model.

    The image and the prompt are wrapped in a single user chat turn, passed
    through the module-level SmolDocling-256M-preview processor and model, and
    the newly generated tokens are decoded into text whose format is driven by
    the prompt.

    Args:
        image (Image.Image): The input document image
        prompt_text (str): The instruction or prompt text that guides the model's output format.
            Supported prompts include:

            Content Conversion:
            - "Convert this page to docling." - Full conversion to DocTags representation
            - "Convert chart to table." - Convert charts to table format
            - "Convert formula to LaTeX." - Convert mathematical formulas to LaTeX
            - "Convert code to text." - Convert code blocks to readable text
            - "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)

            OCR and Location-based Actions:
            - "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>"
              - Extract text from specific coordinates
            - "Identify element at: <loc_247><loc_482><loc_252><loc_486>"
              - Identify element type at coordinates
            - "Find all 'text' elements on the page, retrieve all section headers."
              - Extract section headers
            - "Detect footer elements on the page." - Identify footer content

    Returns:
        str: The generated text with the "<end_of_utterance>" marker removed and
            surrounding whitespace stripped. Decoding keeps all other special
            tokens. The format depends on the prompt_text provided.

    Example:
        >>> page = Image.open("page.png")
        >>> result = smoldocling_readimage(page, "Convert this page to docling.")
        >>> print(result)  # Returns structured document content

    Note:
        - Maximum output length is limited to 1024 new tokens
    """
    # Single-turn conversation: an image placeholder followed by the instruction.
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": prompt_text},
        ],
    }
    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=True)
    model_inputs = processor(text=chat_prompt, images=[image], return_tensors="pt")

    # generate() returns prompt + completion; remember where the prompt ends
    # so only the newly produced tokens are decoded.
    prompt_token_count = model_inputs.input_ids.shape[1]
    all_token_ids = model.generate(**model_inputs, max_new_tokens=1024)
    completion_ids = all_token_ids[:, prompt_token_count:]

    decoded = processor.batch_decode(completion_ids, skip_special_tokens=False)
    return decoded[0].replace("<end_of_utterance>", "").strip()

# Gradio app: the page only shows a notice; the actual functionality is the
# smoldocling_readimage endpoint registered through gr.api.
with gr.Blocks() as demo:
    gr.Markdown(
        "\n"
        "        This is a MCP only tool for conversion using smoldocling \n"
        "        This tool is MCP-only, so it does not have a UI.\n"
        "        "
    )
    gr.api(smoldocling_readimage)

# launch() yields three values; only the middle one is kept, bound to `url`.
_, url, _ = demo.launch(mcp_server=True)