my-smoldocling-demo / simple_test.py
YNS-Elaine's picture
Create simple_test.py
b6538da verified
raw
history blame
1.28 kB
from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch
_SMOLDOCLING_MODEL_ID = "ds4sd/SmolDocling-256M-preview"

# Maps model id -> (processor, model) so the weights are instantiated only
# once per process instead of on every call.
_SMOLDOCLING_CACHE = {}


def _load_smoldocling(model_id=_SMOLDOCLING_MODEL_ID):
    """Return the ``(processor, model)`` pair for *model_id*, loading it at most once."""
    if model_id not in _SMOLDOCLING_CACHE:
        processor = AutoProcessor.from_pretrained(model_id)
        model = AutoModelForImageTextToText.from_pretrained(model_id)
        _SMOLDOCLING_CACHE[model_id] = (processor, model)
    return _SMOLDOCLING_CACHE[model_id]


def smoldocling_readimage(
    image,
    prompt_text="Convert this page to docling.",
    *,
    max_new_tokens=1024,
):
    """Run SmolDocling on one image and return the generated text.

    Args:
        image: The page image handed to the processor as ``images=[image]``
            (presumably a ``PIL.Image.Image`` -- confirm against callers).
        prompt_text: Instruction placed after the image in the user message.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded continuation (prompt tokens removed) with every
        ``<end_of_utterance>`` marker deleted and surrounding whitespace
        stripped. Other special tokens are kept, because decoding runs with
        ``skip_special_tokens=False``.
    """
    processor, model = _load_smoldocling()

    # Single-turn chat: one image placeholder followed by the instruction.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]

    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")

    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # The generated sequence starts with the prompt tokens; slice them off so
    # only the newly generated tokens are decoded.
    prompt_length = inputs.input_ids.shape[1]
    new_token_ids = generated_ids[:, prompt_length:]

    decoded = processor.batch_decode(new_token_ids, skip_special_tokens=False)[0]

    # Drop the end-of-turn marker; the final strip() also removes any leading
    # whitespace, so no separate lstrip() is needed.
    return decoded.replace("<end_of_utterance>", "").strip()