# Scraped page header: file size 1,283 bytes, revision b6538da.
from functools import lru_cache

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForImageTextToText
_SMOLDOCLING_MODEL_ID = "ds4sd/SmolDocling-256M-preview"


@lru_cache(maxsize=1)
def _load_smoldocling():
    """Load the SmolDocling processor and model once and reuse them.

    Loading from the hub is by far the slowest step, so the pair is cached
    for the lifetime of the process instead of being rebuilt on every call.
    """
    processor = AutoProcessor.from_pretrained(_SMOLDOCLING_MODEL_ID)
    model = AutoModelForImageTextToText.from_pretrained(_SMOLDOCLING_MODEL_ID)
    return processor, model


def smoldocling_readimage(
    image,
    prompt_text="Convert this page to docling.",
    *,
    max_new_tokens=1024,
):
    """Run SmolDocling on a single image and return the generated text.

    Args:
        image: The page image handed to the processor as ``images=[image]``.
            Presumably a ``PIL.Image.Image`` -- TODO confirm against callers.
        prompt_text: Instruction placed next to the image in the user turn.
        max_new_tokens: Upper bound on the number of tokens generated.

    Returns:
        The decoded generation with the prompt tokens removed, every
        ``<end_of_utterance>`` marker deleted and surrounding whitespace
        stripped. Other special tokens are kept in the output.
    """
    processor, model = _load_smoldocling()

    # Single user turn: one image placeholder followed by the instruction.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]

    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt tokens; slice them off so
    # only the newly generated tokens are decoded.
    prompt_length = inputs.input_ids.shape[1]
    new_token_ids = generated_ids[:, prompt_length:]

    # Special tokens are kept on purpose: only the end-of-utterance marker
    # is removed from the decoded text below.
    doctags = processor.batch_decode(
        new_token_ids,
        skip_special_tokens=False,
    )[0]

    return doctags.replace("<end_of_utterance>", "").strip()