Instructions to use llm-jp/Jagle-VL-2.2B-FineVision with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use llm-jp/Jagle-VL-2.2B-FineVision with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("feature-extraction", model="llm-jp/Jagle-VL-2.2B-FineVision", trust_remote_code=True)
```
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("llm-jp/Jagle-VL-2.2B-FineVision", trust_remote_code=True, dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| """LLM-jp-VL Processor — combines SigLIP image processing + dynamic patching + tokenization.""" | |
| from typing import List, Optional, Union | |
| import torch | |
| from PIL import Image | |
| from transformers import BatchFeature, ProcessorMixin | |
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Select the grid from ``target_ratios`` that best matches ``aspect_ratio``.

    Args:
        aspect_ratio: Width divided by height of the source image.
        target_ratios: Iterable of two-element grids; element 0 divided by
            element 1 is the grid's aspect ratio.
        width: Source image width, used only for the tie-break.
        height: Source image height, used only for the tie-break.
        image_size: Edge length of one square tile.

    Returns:
        The candidate whose aspect ratio is nearest to ``aspect_ratio``, or
        ``(1, 1)`` when no candidate beats an infinite distance (for example
        when ``target_ratios`` is empty).
    """
    image_area = width * height
    chosen = (1, 1)
    smallest_gap = float("inf")

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])

        if gap < smallest_gap:
            # Strictly closer: take it unconditionally.
            smallest_gap, chosen = gap, candidate
            continue

        # Exact tie: switch to this candidate only when the source image
        # covers more than half of the candidate grid's pixel area.
        if gap == smallest_gap and (
            image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]
        ):
            chosen = candidate

    return chosen
def dynamic_preprocess(
    image, min_num=1, max_num=12, image_size=512, use_thumbnail=False
):
    """Resize an image onto a tile grid and cut it into square tiles.

    Args:
        image: Object exposing ``size`` as ``(width, height)`` plus ``resize``
            and ``crop`` methods (a PIL image in this file's usage).
        min_num: Smallest allowed number of tiles in the grid.
        max_num: Largest allowed number of tiles in the grid.
        image_size: Edge length of each square tile.
        use_thumbnail: When true and the grid has more than one tile, append
            the whole image resized to a single tile.

    Returns:
        List of tiles in row-major order, optionally followed by the thumbnail.
    """
    src_w, src_h = image.size
    src_ratio = src_w / src_h

    # Every (columns, rows) grid whose tile count lies in [min_num, max_num].
    grids = set()
    for n in range(min_num, max_num + 1):
        for cols in range(1, n + 1):
            for rows in range(1, n + 1):
                if min_num <= cols * rows <= max_num:
                    grids.add((cols, rows))
    # Smallest grids first, so find_closest_aspect_ratio sees them in area order.
    grids = sorted(grids, key=lambda grid: grid[0] * grid[1])

    best_grid = find_closest_aspect_ratio(src_ratio, grids, src_w, src_h, image_size)
    canvas_w = image_size * best_grid[0]
    canvas_h = image_size * best_grid[1]
    tile_count = best_grid[0] * best_grid[1]

    canvas = image.resize((canvas_w, canvas_h))

    tiles = []
    for index in range(tile_count):
        # Row-major walk over the grid: divmod yields (row, column).
        row, col = divmod(index, canvas_w // image_size)
        tiles.append(
            canvas.crop(
                (
                    col * image_size,
                    row * image_size,
                    (col + 1) * image_size,
                    (row + 1) * image_size,
                )
            )
        )

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles
class LLMjpVLProcessor(ProcessorMixin):
    """Processor that combines an image processor with a tokenizer.

    Images are split into square tiles with ``dynamic_preprocess`` and each
    ``<image>`` placeholder in the text is replaced by
    ``<|image_start|>`` + ``<|image_pad|>`` repeated
    ``image_seq_length * num_tiles`` times + ``<|image_end|>``.
    """

    attributes = ["image_processor", "tokenizer"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor,
        tokenizer,
        image_seq_length=256,
        max_dynamic_patch=12,
        min_dynamic_patch=1,
        use_thumbnail=True,
        chat_template=None,
        **kwargs,
    ):
        """Store tiling settings and register the two sub-processors.

        Args:
            image_processor: Image processor applied to every tile.
            tokenizer: Tokenizer used for the expanded text.
            image_seq_length: Number of ``<|image_pad|>`` tokens per tile.
            max_dynamic_patch: Upper bound on tiles per image.
            min_dynamic_patch: Lower bound on tiles per image.
            use_thumbnail: Whether a whole-image thumbnail tile is appended
                when an image is split into more than one tile.
            chat_template: If given, installed on ``tokenizer``.
            **kwargs: Forwarded to ``ProcessorMixin.__init__``.
        """
        self.image_seq_length = image_seq_length
        self.max_dynamic_patch = max_dynamic_patch
        self.min_dynamic_patch = min_dynamic_patch
        self.use_thumbnail = use_thumbnail
        if chat_template is not None:
            tokenizer.chat_template = chat_template
        super().__init__(image_processor, tokenizer, **kwargs)

    def __call__(
        self,
        images: Optional[Union[Image.Image, List[Image.Image]]] = None,
        text: Optional[Union[str, List[str]]] = None,
        return_tensors: Optional[str] = None,
        **kwargs,
    ) -> BatchFeature:
        """Tile images, expand ``<image>`` placeholders and tokenize.

        Args:
            images: One image or a flat list of images. For batched text the
                images are assigned to ``<image>`` placeholders in order,
                continuing from one text to the next.
            text: One prompt or a list of prompts.
            return_tensors: Passed to the tokenizer and to ``BatchFeature``.
            **kwargs: Forwarded to the tokenizer call.

        Returns:
            A ``BatchFeature`` holding the tokenizer outputs, ``pixel_values``
            (all tiles concatenated along dim 0) when images were given, and
            ``num_patches_list`` (tiles per image) when images were given.

        Raises:
            ValueError: If neither text nor any image is provided.
        """
        # Normalize images to a list; an empty list means "no images" so the
        # per-image budget division below can never divide by zero.
        if isinstance(images, Image.Image):
            images = [images]
        elif images is not None:
            images = list(images)
        if not images:
            images = None

        if text is None and images is None:
            raise ValueError("You must provide at least one of `text` or `images`.")

        texts = None
        if text is not None:
            texts = [text] if isinstance(text, str) else list(text)

        data = {}
        num_patches_list = []

        if images is not None:
            image_size = self.image_processor.size.get(
                "height", self.image_processor.size.get("shortest_edge", 512)
            )

            # Compute max patches per image from the actual text token count.
            # Each image uses (max_num + 1) * image_seq_length + 2 tokens
            # (thumbnail added when max_num > 1). The longest prompt in the
            # batch is used so that no prompt exceeds the budget.
            text_tokens = max(
                (
                    len(
                        self.tokenizer.encode(
                            t.replace("<image>", ""), add_special_tokens=False
                        )
                    )
                    for t in texts or ()
                ),
                default=0,
            )
            image_budget = self.tokenizer.model_max_length - text_tokens
            max_num = (image_budget // len(images) - 2) // self.image_seq_length - 1
            max_num = max(1, min(self.max_dynamic_patch, max_num))

            all_pixel_values = []
            for image in images:
                patches = dynamic_preprocess(
                    image.convert("RGB"),
                    min_num=self.min_dynamic_patch,
                    max_num=max_num,
                    image_size=image_size,
                    use_thumbnail=self.use_thumbnail,
                )
                num_patches_list.append(len(patches))
                pixel_values = self.image_processor(
                    images=patches, return_tensors="pt"
                ).pixel_values
                all_pixel_values.append(pixel_values)
            data["pixel_values"] = torch.cat(all_pixel_values, dim=0)

        if texts is not None:
            # One shared iterator: each placeholder consumes the next image's
            # tile count, so a later prompt never reuses an earlier prompt's images.
            remaining_counts = iter(num_patches_list)
            expanded_texts = [
                self._expand_image_placeholders(t, remaining_counts) for t in texts
            ]
            tokenized = self.tokenizer(
                expanded_texts[0] if len(expanded_texts) == 1 else expanded_texts,
                return_tensors=return_tensors,
                add_special_tokens=False,
                **kwargs,
            )
            data.update(tokenized)

        if num_patches_list:
            data["num_patches_list"] = num_patches_list

        return BatchFeature(data=data, tensor_type=return_tensors)

    def _expand_image_placeholders(self, text, patch_counts):
        """Replace each ``<image>`` in ``text`` using the next count from ``patch_counts``.

        ``patch_counts`` is an iterator of tiles-per-image. Placeholders left
        over once the iterator is exhausted are kept as the literal ``<image>``.
        """
        pieces = text.split("<image>")
        rebuilt = [pieces[0]]
        for piece in pieces[1:]:
            count = next(patch_counts, None)
            if count is None:
                rebuilt.append("<image>")
            else:
                rebuilt.append(
                    "<|image_start|>"
                    + "<|image_pad|>" * (self.image_seq_length * count)
                    + "<|image_end|>"
                )
            rebuilt.append(piece)
        return "".join(rebuilt)

    def apply_chat_template(
        self,
        messages,
        tokenize=False,
        add_generation_prompt=False,
        return_dict=False,
        return_tensors=None,
        **kwargs,
    ):
        """Format messages and optionally process images + tokenize in one call.

        Supports structured content messages (Qwen3-VL style)::

            messages = [{"role": "user", "content": [
                {"type": "image", "image": "path/to/img.png"},
                {"type": "text", "text": "Describe this image."},
            ]}]

        Plain string content is also supported::

            messages = [{"role": "user", "content": "Hello"}]

        When ``tokenize=True`` and ``return_dict=True``, returns a
        :class:`~transformers.BatchFeature` with ``pixel_values``,
        ``input_ids``, and ``attention_mask`` that can be unpacked directly
        into ``model.generate(**inputs)``.

        Returns:
            The formatted string when ``tokenize`` is false; otherwise the
            ``BatchFeature`` (``return_dict=True``) or its ``input_ids``.

        Raises:
            TypeError: If an image entry is neither a path string nor a PIL image.
        """
        # Extract images and flatten structured content to plain text messages.
        images = []
        flat_messages = []
        for msg in messages:
            role = msg["role"]
            content = msg["content"]
            if isinstance(content, str):
                flat_messages.append({"role": role, "content": content})
            elif isinstance(content, list):
                text_parts = []
                for item in content:
                    if item["type"] == "image":
                        img = item["image"]
                        if isinstance(img, str):
                            # convert() returns a fully loaded copy, so the
                            # file handle can be closed right away.
                            with Image.open(img) as opened:
                                images.append(opened.convert("RGB"))
                        elif isinstance(img, Image.Image):
                            images.append(img.convert("RGB"))
                        else:
                            # A placeholder without a matching image would
                            # leave a literal "<image>" in the prompt.
                            raise TypeError(
                                "Image content must be a file path or a PIL image, "
                                f"got {type(img).__name__}."
                            )
                        text_parts.append("<image>")
                    elif item["type"] == "text":
                        text_parts.append(item["text"])
                    # Items of any other type are ignored.
                flat_messages.append({"role": role, "content": "".join(text_parts)})
            # Messages whose content is neither str nor list are dropped.

        text = self.tokenizer.apply_chat_template(
            flat_messages,
            tokenize=False,
            add_special_tokens=False,
            add_generation_prompt=add_generation_prompt,
        )
        # NOTE(review): this suffix is appended even when
        # add_generation_prompt is False — confirm that is intended.
        text += "<|channel|>final<|message|>"

        if not tokenize:
            return text

        result = self(
            images=images if images else None,
            text=text,
            return_tensors=return_tensors,
            **kwargs,
        )
        # Remove non-tensor metadata so **result works with model.generate()
        result.pop("num_patches_list", None)

        if return_dict:
            return result
        return result["input_ids"]

    def decode(self, token_ids, **kwargs):
        """Delegate to ``tokenizer.decode``."""
        return self.tokenizer.decode(token_ids, **kwargs)

    def batch_decode(self, token_ids, **kwargs):
        """Delegate to ``tokenizer.batch_decode``."""
        return self.tokenizer.batch_decode(token_ids, **kwargs)

    @property
    def model_input_names(self):
        """Tokenizer input names followed by image-processor input names, de-duplicated in order."""
        tokenizer_names = self.tokenizer.model_input_names
        image_processor_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_names + image_processor_names))