Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,51 @@ processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
|
|
| 7 |
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
|
| 8 |
|
| 9 |
def smoldocling_readimage(image, prompt_text):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
messages = [
|
| 11 |
{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
|
| 12 |
]
|
|
@@ -30,4 +75,4 @@ demo = gr.Interface(
|
|
| 30 |
description="Upload a document image and convert it to structured docling format."
|
| 31 |
)
|
| 32 |
|
| 33 |
-
demo.launch(mcp_server=True)
|
|
|
|
| 7 |
model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
|
| 8 |
|
| 9 |
def smoldocling_readimage(image, prompt_text):
|
| 10 |
+
"""
|
| 11 |
+
Extract text and structured content from document images using SmolDocling model.
|
| 12 |
+
|
| 13 |
+
This function processes document images (rendered PDF pages, scanned documents, screenshots, etc.)
|
| 14 |
+
and converts them to structured text format based on the provided prompt. It uses
|
| 15 |
+
the SmolDocling-256M-preview model for image-to-text conversion with chat-based
|
| 16 |
+
prompting.
|
| 17 |
+
|
| 18 |
+
Args:
|
| 19 |
+
image (PIL.Image.Image): The input document image to process. Should be a PIL
|
| 20 |
+
Image object containing a document, text, or any visual content that needs
|
| 21 |
+
to be converted to text.
|
| 22 |
+
prompt_text (str): The instruction or prompt text that guides the model's
|
| 23 |
+
output format. Supported prompts include:
|
| 24 |
+
|
| 25 |
+
Content Conversion:
|
| 26 |
+
- "Convert this page to docling." - Full conversion to DocTags representation
|
| 27 |
+
- "Convert chart to table." - Convert charts to table format
|
| 28 |
+
- "Convert formula to LaTeX." - Convert mathematical formulas to LaTeX
|
| 29 |
+
- "Convert code to text." - Convert code blocks to readable text
|
| 30 |
+
- "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)
|
| 31 |
+
|
| 32 |
+
OCR and Location-based Actions:
|
| 33 |
+
- "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>" - Extract text from specific coordinates
|
| 34 |
+
- "Identify element at: <loc_247><loc_482><loc_252><loc_486>" - Identify element type at coordinates
|
| 35 |
+
- "Find all 'text' elements on the page, retrieve all section headers." - Extract section headers
|
| 36 |
+
- "Detect footer elements on the page." - Identify footer content
|
| 37 |
+
|
| 38 |
+
Returns:
|
| 39 |
+
str: The extracted and formatted text content from the image, cleaned of
|
| 40 |
+
special tokens and whitespace. The format depends on the prompt_text
|
| 41 |
+
provided.
|
| 42 |
+
|
| 43 |
+
Example:
|
| 44 |
+
>>> from PIL import Image
|
| 45 |
+
>>> img = Image.open("document.png")
|
| 46 |
+
>>> result = smoldocling_readimage(img, "Convert this page to docling.")
|
| 47 |
+
>>> print(result) # Returns structured document content
|
| 48 |
+
|
| 49 |
+
Note:
|
| 50 |
+
- The function is optimized for document images but can handle any image
|
| 51 |
+
containing text
|
| 52 |
+
- Processing time depends on image size and complexity
|
| 53 |
+
- Maximum output length is limited to 1024 new tokens
|
| 54 |
+
"""
|
| 55 |
messages = [
|
| 56 |
{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
|
| 57 |
]
|
|
|
|
| 75 |
description="Upload a document image and convert it to structured docling format."
|
| 76 |
)
|
| 77 |
|
| 78 |
+
demo.launch(mcp_server=True)
|