my-smoldocling-demo

Sleeping

App Files Files Community

bharatcoder committed on Aug 24

Commit

fcf0972

verified ·

1 Parent(s): 5e6cb9d

Update app.py

Browse files

Files changed (1) hide show

app.py +46 -1

app.py CHANGED Viewed

@@ -7,6 +7,51 @@ processor = AutoProcessor.from_pretrained("ds4sd/SmolDocling-256M-preview")
 model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
 def smoldocling_readimage(image, prompt_text):
     messages = [
         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
     ]
@@ -30,4 +75,4 @@ demo = gr.Interface(
     description="Upload a document image and convert it to structured docling format."
 )
-demo.launch(mcp_server=True)

 model = AutoModelForImageTextToText.from_pretrained("ds4sd/SmolDocling-256M-preview")
 def smoldocling_readimage(image, prompt_text):
+    """
+    Extract text and structured content from document images using SmolDocling model.
+    This function processes document images (rendered PDF pages, scanned documents, screenshots, etc.)
+    and converts them to structured text format based on the provided prompt. It uses
+    the SmolDocling-256M-preview model for image-to-text conversion with chat-based
+    prompting.
+    Args:
+        image (PIL.Image.Image): The input document image to process. Should be a PIL
+            Image object containing a document, text, or any visual content that needs
+            to be converted to text.
+        prompt_text (str): The instruction or prompt text that guides the model's
+            output format. Supported prompts include:
+            Content Conversion:
+            - "Convert this page to docling." - Full conversion to DocTags representation
+            - "Convert chart to table." - Convert charts to table format
+            - "Convert formula to LaTeX." - Convert mathematical formulas to LaTeX
+            - "Convert code to text." - Convert code blocks to readable text
+            - "Convert table to OTSL." - Convert tables to OTSL format (Lysak et al., 2023)
+            OCR and Location-based Actions:
+            - "OCR the text in a specific location: <loc_155><loc_233><loc_206><loc_237>" - Extract text from specific coordinates
+            - "Identify element at: <loc_247><loc_482><loc_252><loc_486>" - Identify element type at coordinates
+            - "Find all 'text' elements on the page, retrieve all section headers." - Extract section headers
+            - "Detect footer elements on the page." - Identify footer content
+    Returns:
+        str: The extracted and formatted text content from the image, cleaned of
+            special tokens and whitespace. The format depends on the prompt_text
+            provided.
+    Example:
+        >>> from PIL import Image
+        >>> img = Image.open("document.png")
+        >>> result = smoldocling_readimage(img, "Convert this page to docling.")
+        >>> print(result)  # Returns structured document content
+    Note:
+        - The function is optimized for document images but can handle any image
+          containing text
+        - Processing time depends on image size and complexity
+        - Maximum output length is limited to 1024 new tokens
+    """
     messages = [
         {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt_text}]}
     ]
     description="Upload a document image and convert it to structured docling format."
 )
+demo.launch(mcp_server=True)