| # Iris |
|
|
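| A custom multimodal (image + text) Iris checkpoint exported in Hugging Face format. The example below loads it with `trust_remote_code=True`, which allows `transformers` to run the custom model code shipped with the checkpoint. |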
|
|
| ## Load |
|
|
| ```python |
| import torch |
| from PIL import Image |
| from transformers import AutoImageProcessor, AutoModelForCausalLM, AutoTokenizer |
| |
| model_id = "iris-hf" |
| model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True, torch_dtype=torch.float16) |
| tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True) |
| image_processor = AutoImageProcessor.from_pretrained(model_id) |
| |
| image = Image.open("example.jpg").convert("RGB") |
| pixel_values = image_processor(images=[image], return_tensors="pt")["pixel_values"].to(model.device) |
| prompt = "Describe this image: " |
| input_ids = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)["input_ids"].to(model.device) |
| attention_mask = torch.ones_like(input_ids) |
| |
| output_ids = model.generate( |
| input_ids=input_ids, |
| attention_mask=attention_mask, |
| pixel_values=pixel_values, |
| prompt_len=input_ids.size(1), |
| max_new_tokens=64, |
| ) |
| print(tokenizer.decode(output_ids[0, input_ids.size(1):], skip_special_tokens=True)) |
| ``` |
|
|