from vlm_model import VLMConfig, VLM
from transformers import AutoProcessor, AutoTokenizer
from PIL import Image
import torch

# Load the model, tokenizer, and image processor
config = VLMConfig.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
model = VLM.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
model.eval()  # disable dropout etc. for inference
tokenizer = AutoTokenizer.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

# Load and preprocess the image
image = Image.open("your_image.jpg").convert("RGB")
processor_output = processor(images=image, return_tensors="pt")
pixel_values = processor_output["pixel_values"]

# Build the chat prompt, reserving one <|image_pad|> placeholder per image token
chat = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": f"What's in this image?{'<|image_pad|>' * config.image_pad_num}"},
]
input_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

# Generate a response
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=input_ids,
        pixel_values=pixel_values,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )

# Decode only the newly generated tokens (everything after the prompt)
response = tokenizer.decode(generated_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
print(response)