File size: 1,283 Bytes
b6538da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import functools

from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch



# Hugging Face checkpoint used for both the processor and the model.
_SMOLDOCLING_MODEL_ID = "ds4sd/SmolDocling-256M-preview"


@functools.lru_cache(maxsize=1)
def _load_smoldocling():
    """Return the ``(processor, model)`` pair, loading it only on first use.

    ``from_pretrained`` is expensive, so the result is cached for the
    lifetime of the process instead of being rebuilt for every image.
    """
    processor = AutoProcessor.from_pretrained(_SMOLDOCLING_MODEL_ID)
    model = AutoModelForImageTextToText.from_pretrained(_SMOLDOCLING_MODEL_ID)
    return processor, model


def smoldocling_readimage(
    image,
    prompt_text: str = "Convert this page to docling.",
    max_new_tokens: int = 1024,
) -> str:
    """Run SmolDocling on one image and return the decoded model output.

    Args:
        image: The page image handed to the processor as ``images=[image]``
            (presumably a ``PIL.Image.Image`` -- confirm against callers).
        prompt_text: Instruction placed in the user message next to the image.
        max_new_tokens: Upper bound on generated tokens, forwarded to
            ``model.generate``. Output longer than this is cut off, so raise
            it for dense pages.

    Returns:
        The generated text with the prompt tokens removed, every
        ``<end_of_utterance>`` marker deleted, and surrounding whitespace
        stripped. Other special tokens are kept.
    """
    processor, model = _load_smoldocling()

    # Single-turn chat: one user message holding an image slot and the prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]

    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation; keep only the continuation.
    prompt_length = inputs.input_ids.shape[1]
    new_token_ids = generated_ids[:, prompt_length:]

    # Special tokens are kept on purpose (skip_special_tokens=False); only the
    # end-of-utterance marker is stripped below.
    decoded = processor.batch_decode(new_token_ids, skip_special_tokens=False)[0]

    return decoded.replace("<end_of_utterance>", "").strip()