from vlm_model import VLMConfig, VLM
from transformers import AutoProcessor, AutoTokenizer
from PIL import Image
import torch
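
# Note: VLM and VLMConfig are this project's custom classes from the
# vlm_model module; make sure it is importable (e.g. run the script from
# the directory that contains vlm_model, or install it as a package).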

# Load the trained VLM together with its config, tokenizer, and the SigLIP
# image processor it was trained with.
config = VLMConfig.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
model = VLM.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
tokenizer = AutoTokenizer.from_pretrained("YOUR_USERNAME/vlm-qwen-siglip")
processor = AutoProcessor.from_pretrained("google/siglip-base-patch16-224")

# Load and preprocess the input image.
image = Image.open("your_image.jpg").convert("RGB")
processor_output = processor(text=None, images=image, return_tensors="pt")
pixel_values = processor_output["pixel_values"]
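
# Sanity check: siglip-base-patch16-224 resizes inputs to 224x224, so for a
# single image pixel_values should be a (1, 3, 224, 224) float tensor.
print(pixel_values.shape)  # expected: torch.Size([1, 3, 224, 224])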

# Build a chat prompt; the user turn ends with config.image_pad_num
# placeholder tokens, which is where the model splices in the image features.
chat = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's in this image?" + "<|image_pad|>" * config.image_pad_num},
]
input_text = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
input_ids = tokenizer(input_text, return_tensors="pt").input_ids
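
# Optional check (assumes '<|image_pad|>' was added to the tokenizer as a
# single special token): the prompt should contain exactly
# config.image_pad_num placeholder positions.
image_pad_id = tokenizer.convert_tokens_to_ids("<|image_pad|>")
assert (input_ids == image_pad_id).sum().item() == config.image_pad_num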

# Generate a response. The sampling settings below are a reasonable starting
# point; tune temperature/top_p to taste.
with torch.no_grad():
    generated_ids = model.generate(
        input_ids=input_ids,
        pixel_values=pixel_values,
        max_new_tokens=200,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
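
# For deterministic, reproducible output, a greedy variant (assuming
# VLM.generate forwards decoding kwargs like the standard HF generate):
#   generated_ids = model.generate(input_ids=input_ids,
#                                  pixel_values=pixel_values,
#                                  max_new_tokens=200, do_sample=False)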

# Decode only the newly generated tokens, i.e. everything after the prompt.
response = tokenizer.decode(generated_ids[0][input_ids.shape[1]:], skip_special_tokens=True)
print(response)
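
# Hypothetical GPU variant (assumes the checkpoint fits on one device):
# after loading, keep the model and both input tensors on the same device:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = model.to(device)
#   input_ids = input_ids.to(device)
#   pixel_values = pixel_values.to(device)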