from transformers import AutoProcessor, AutoModelForImageTextToText
from PIL import Image
import torch
from functools import lru_cache

# Hugging Face checkpoint for the SmolDocling document-conversion model.
_SMOLDOCLING_CHECKPOINT = "ds4sd/SmolDocling-256M-preview"


@lru_cache(maxsize=1)
def _load_smoldocling():
    """Load and cache the SmolDocling processor and model.

    Loading from the hub/disk is expensive; caching makes repeated calls to
    ``smoldocling_readimage`` pay the cost only once per process.

    Returns:
        tuple: ``(processor, model)`` for the SmolDocling checkpoint.
    """
    processor = AutoProcessor.from_pretrained(_SMOLDOCLING_CHECKPOINT)
    model = AutoModelForImageTextToText.from_pretrained(_SMOLDOCLING_CHECKPOINT)
    return processor, model


def smoldocling_readimage(image, prompt_text="Convert this page to docling."):
    """Run SmolDocling OCR/layout conversion on a single page image.

    Args:
        image: A PIL image (or anything the processor accepts as an image)
            of the document page to convert.
        prompt_text: Instruction text sent alongside the image. Defaults to
            the standard "Convert this page to docling." prompt.

    Returns:
        str: The generated DocTags markup for the page, with the trailing
        ``<end_of_utterance>`` special token removed.
    """
    processor, model = _load_smoldocling()

    # Build the chat-style input: one image placeholder plus the instruction.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image"},
                {"type": "text", "text": prompt_text},
            ],
        },
    ]

    # Prepare inputs for generation.
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(text=prompt, images=[image], return_tensors="pt")
    # NOTE(review): inputs stay on CPU here; move to a device if one is set up.
    # inputs = inputs.to(device)

    # Generate outputs (cap kept at 1024 new tokens, as in the original).
    generated_ids = model.generate(**inputs, max_new_tokens=1024)

    # Drop the prompt tokens so only the newly generated text is decoded.
    prompt_length = inputs.input_ids.shape[1]
    trimmed_generated_ids = generated_ids[:, prompt_length:]

    # skip_special_tokens=False keeps DocTags structural tokens intact, so
    # the end-of-utterance marker must be stripped manually below.
    doctags = processor.batch_decode(
        trimmed_generated_ids,
        skip_special_tokens=False,
    )[0].lstrip()

    # Bug fix: the original called doctags.replace("", "") — a no-op, since
    # replacing an empty string removes nothing. The intended cleanup (per
    # the SmolDocling model card) is removing the end-of-utterance token.
    doctags = doctags.replace("<end_of_utterance>", "").strip()

    return doctags