bharatcoder committed on
Commit
0a09255
·
verified ·
1 Parent(s): 178bba5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -72
app.py CHANGED
@@ -5,109 +5,99 @@ import base64
5
  from io import BytesIO
6
  import os
7
 
8
-
9
- # Load model & processor once at startup
 
10
  processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
11
  model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
12
 
13
-
14
- def convert_to_pil(image_input: str) -> Image.Image:
 
 
15
  """
16
- Convert base64 or file path string to PIL.Image.
17
-
18
- Args:
19
- image_input: Base64 encoded string or file path
20
-
21
- Returns:
22
- PIL.Image.Image object
23
  """
24
- # Check if it's a base64 string
25
- if image_input.startswith('data:image'):
26
- # Remove data:image/jpeg;base64, prefix
27
- base64_str = image_input.split(',', 1)[1]
 
 
 
28
  image_data = base64.b64decode(base64_str)
29
  return Image.open(BytesIO(image_data))
30
- elif ',' in image_input and len(image_input) > 100:
31
- # Might be base64 without prefix
 
32
  try:
33
  image_data = base64.b64decode(image_input)
34
  return Image.open(BytesIO(image_data))
35
- except:
36
  pass
37
-
38
- # Assume it's a file path
39
- if os.path.exists(image_input):
40
  return Image.open(image_input)
41
-
42
- raise ValueError(f"Could not convert image input to PIL.Image: {type(image_input)}")
43
 
 
44
 
 
 
 
45
  def smoldocling_readimage(image: Image.Image, prompt_text: str) -> str:
46
  """
47
- Extract text and structured content from document images using SmolDocling model.
48
-
49
- This function processes document images (PDFs, scanned documents, screenshots, etc.)
50
- and converts them to structured text format based on the provided prompt. It uses
51
- the SmolDocling-256M-preview model for image-to-text conversion with chat-based prompting.
52
-
53
- Args:
54
- image (Image.Image): The input document image
55
- prompt_text (str): The instruction or prompt text that guides the model's output format.
56
- Supported prompts include:
57
-
58
- Content Conversion:
59
- - "Convert this page to docling." - Full conversion to DocTags representation
60
- - "Convert chart to table." - Convert charts to table format
61
- - "Convert formula to LaTeX." - Convert mathematical formulas to LaTeX
62
- - "Convert code to text." - Convert code blocks to readable text
63
- - "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)
64
-
65
- OCR and Location-based Actions:
66
- - "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>"
67
- - Extract text from specific coordinates
68
- - "Identify element at: <loc_247><loc_482><loc_252><loc_486>"
69
- - Identify element type at coordinates
70
- - "Find all 'text' elements on the page, retrieve all section headers."
71
- - Extract section headers
72
- - "Detect footer elements on the page." - Identify footer content
73
-
74
- Returns:
75
- str: The extracted and formatted text content from the image, cleaned of special
76
- tokens and whitespace. The format depends on the prompt_text provided.
77
-
78
- Example:
79
- >>> result = smoldocling_readimage("...", "Convert to docling")
80
- >>> print(result) # Returns structured document content
81
-
82
- Note:
83
- - The function is optimized for document images but can handle any image containing text
84
- - Processing time depends on image size and complexity
85
- - Maximum output length is limited to 1024 new tokens
86
  """
87
- # Convert string input (base64 or path) to PIL.Image
88
- # pil_image = convert_to_pil(image)
89
-
90
  messages = [
91
  {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
92
  ]
93
  prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
94
  inputs = processor(text=prompt, images=[image], return_tensors="pt")
95
  outputs = model.generate(**inputs, max_new_tokens=1024)
 
96
  prompt_length = inputs.input_ids.shape[1]
97
  generated = outputs[:, prompt_length:]
98
  result = processor.batch_decode(generated, skip_special_tokens=False)[0]
99
  return result.replace("<end_of_utterance>", "").strip()
100
 
101
- # Gradio UI
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
  with gr.Blocks() as demo:
103
  gr.Markdown(
104
  """
105
- This is a MCP only tool for conversion using smoldocling
106
- This tool is MCP-only, so it does not have a UI.
 
 
 
 
107
  """
108
  )
109
- gr.api(
110
- smoldocling_readimage
111
- )
112
 
113
- _, url, _ = demo.launch(mcp_server=True)
 
 
 
 
 
 
5
  from io import BytesIO
6
  import os
7
 
8
# -----------------------------
# Load model and processor once
# -----------------------------
# Both objects are created from the same checkpoint id at import time, so the
# id is spelled once and every request reuses the already-loaded instances.
_MODEL_ID = "ds4sd/SmolDocling-256M-preview"
processor = AutoProcessor.from_pretrained(_MODEL_ID)
model = AutoModelForImageTextToText.from_pretrained(_MODEL_ID)
13
 
14
+ # -----------------------------
15
+ # Image conversion helper
16
+ # -----------------------------
17
def convert_to_pil(image_input) -> "Image.Image":
    """
    Convert a base64 payload, image dict, or file path to a PIL.Image.

    Handles:
      - "data:image/png;base64,...."  (data URI)
      - plain base64 text without a prefix (whitespace/newlines are ignored)
      - {"type": "image", "data": "..."}  (the "data" value is unwrapped and
        then handled like a string input)
      - path of an existing local file

    Args:
        image_input: One of the representations listed above.

    Returns:
        The image opened with ``Image.open``.

    Raises:
        ValueError: If the input is not a string (after unwrapping a dict),
            is empty, is not valid base64, or the decoded bytes cannot be
            opened as an image.
    """
    error_message = "Could not convert image input to PIL.Image"

    # Case 1: dict input -- the actual image string lives under "data".
    if isinstance(image_input, dict) and "data" in image_input:
        image_input = image_input["data"]

    if not isinstance(image_input, str):
        raise ValueError(error_message)

    # Case 2: data URI -- everything after the first comma is the payload.
    if image_input.startswith("data:image"):
        _header, separator, payload = image_input.partition(",")
        if not separator:
            # A "data:image..." string without a comma has no payload at all.
            raise ValueError(error_message)
        return Image.open(BytesIO(base64.b64decode(payload)))

    # Case 3: local file path. Checked before the base64 attempt because an
    # existing path is unambiguous, whereas short path-like strings can happen
    # to be syntactically valid base64.
    if os.path.exists(image_input):
        return Image.open(image_input)

    # Case 4: plain base64 string (no prefix). Base64 text never contains a
    # comma, so no comma/length heuristic is applied; strict validation decides.
    compact = "".join(image_input.split())  # tolerate wrapped/multi-line base64
    if not compact:
        raise ValueError(error_message)
    try:
        image_bytes = base64.b64decode(compact, validate=True)
    except ValueError:  # binascii.Error is a ValueError subclass
        raise ValueError(error_message) from None
    try:
        return Image.open(BytesIO(image_bytes))
    except OSError as err:  # includes PIL's "cannot identify image file"
        raise ValueError(error_message) from err
49
 
50
+ # -----------------------------
51
+ # Core function
52
+ # -----------------------------
53
def smoldocling_readimage(image: Image.Image, prompt_text: str, max_new_tokens: int = 1024) -> str:
    """
    Run SmolDocling image-to-text conversion.

    Builds a single-turn chat prompt containing one image slot plus
    ``prompt_text``, generates a continuation with the module-level ``model``,
    and returns only the newly generated text.

    Args:
        image: The input image, passed to the processor as the only image.
        prompt_text: The instruction placed in the user turn, e.g.
            "Convert this page to docling."
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 1024; raise it if long pages come back truncated.

    Returns:
        The decoded generation with the "<end_of_utterance>" marker removed
        and surrounding whitespace stripped. Other special tokens are kept
        because decoding runs with ``skip_special_tokens=False``.
    """
    messages = [
        {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
    ]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The output sequence starts with the prompt tokens; drop them so only the
    # model's continuation is decoded.
    prompt_length = inputs.input_ids.shape[1]
    generated = outputs[:, prompt_length:]
    result = processor.batch_decode(generated, skip_special_tokens=False)[0]
    return result.replace("<end_of_utterance>", "").strip()
68
 
69
+ # -----------------------------
70
+ # Wrapper for MCP schema compatibility
71
+ # -----------------------------
72
# NOTE(review): `image` carries no type hint; confirm that gr.api accepts
# un-annotated parameters when it builds the MCP tool schema.
def smoldocling_entry(image, prompt_text: str) -> str:
    """
    Entry point for the MCP tool: normalize the image, then run SmolDocling.

    Args:
        image: The document image, given as any of:
            - a base64 string
            - a dict such as {"type": "image", "data": "data:image/png;base64,..."}
            - a file path
        prompt_text: The instruction for the model, e.g.
            "Convert this page to docling."

    Returns:
        The text produced by the model for the given image and prompt.
    """
    return smoldocling_readimage(convert_to_pil(image), prompt_text)
82
+
83
+ # -----------------------------
84
+ # Gradio MCP App (Headless)
85
+ # -----------------------------
86
with gr.Blocks() as demo:
    gr.Markdown(
        """
        ### 📄 SmolDocling MCP Tool
        This is a **headless MCP tool** for document image conversion.
        It supports input as:
        - Base64-encoded images
        - Perplexity/Claude `{"type": "image", "data": "..."}` objects
        - Local file paths (for testing)
        """
    )

    # Expose MCP tool
    gr.api(smoldocling_entry)

# Launch MCP server mode.
# launch() normally blocks the main thread when run as a script, which would
# delay the message below until the server shuts down. Start the server
# without the thread lock, report the URL, then block explicitly so the
# process stays alive.
_, url, _ = demo.launch(mcp_server=True, prevent_thread_lock=True)
print(f"✅ MCP Server running at: {url}")
demo.block_thread()