Create custom processor for easier inference

#12
config.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
- "_name_or_path": "./llava-v1.5-13b",
3
  "architectures": [
4
  "LlavaQwen2ForCausalLM"
5
  ],
6
  "auto_map": {
7
  "AutoConfig": "llava_qwen.LlavaConfig",
8
- "AutoModelForCausalLM": "llava_qwen.LlavaQwen2ForCausalLM"
9
- },
 
10
  "attention_dropout": 0.0,
11
  "bos_token_id": 151643,
12
  "eos_token_id": 151645,
@@ -45,5 +45,24 @@
45
  "use_cache": true,
46
  "use_mm_proj": true,
47
  "use_sliding_window": false,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  "vocab_size": 152064
49
  }
 
1
  {
 
2
  "architectures": [
3
  "LlavaQwen2ForCausalLM"
4
  ],
5
  "auto_map": {
6
  "AutoConfig": "llava_qwen.LlavaConfig",
7
+ "AutoModelForCausalLM": "llava_qwen.LlavaQwen2ForCausalLM",
8
+ "AutoProcessor": "processing_fastvlm.FastVLMProcessor"
9
+ },
10
  "attention_dropout": 0.0,
11
  "bos_token_id": 151643,
12
  "eos_token_id": 151645,
 
45
  "use_cache": true,
46
  "use_mm_proj": true,
47
  "use_sliding_window": false,
48
+ "vision_config": {
49
+ "cls_ratio": 2.0,
50
+ "down_patch_size": 7,
51
+ "down_stride": 2,
52
+ "downsamples": [true, true, true, true, true],
53
+ "embed_dims": [96, 192, 384, 768, 1536],
54
+ "hidden_size": 1024,
55
+ "image_size": 1024,
56
+ "intermediate_size": 3072,
57
+ "layer_scale_init_value": 1e-5,
58
+ "layers": [2, 12, 24, 4, 2],
59
+ "mlp_ratios": [4, 4, 4, 4, 4],
60
+ "num_classes": 1000,
61
+ "patch_size": 64,
62
+ "pos_embs_shapes": [null, null, null, [7, 7], [7, 7]],
63
+ "projection_dim": 768,
64
+ "repmixer_kernel_size": 3,
65
+ "token_mixers": ["repmixer", "repmixer", "repmixer", "attention", "attention"]
66
+ },
67
  "vocab_size": 152064
68
  }
preprocessor_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoImageProcessor": "processing_fastvlm.FastVLMImageProcessor"
4
+ },
5
+ "image_processor_type": "FastVLMImageProcessor",
6
+ "crop_size": {
7
+ "height": 1024,
8
+ "width": 1024
9
+ },
10
+ "do_center_crop": true,
11
+ "do_convert_rgb": true,
12
+ "do_normalize": true,
13
+ "do_rescale": true,
14
+ "do_resize": true,
15
+ "image_mean": [
16
+ 0.0,
17
+ 0.0,
18
+ 0.0
19
+ ],
20
+ "image_std": [
21
+ 1.0,
22
+ 1.0,
23
+ 1.0
24
+ ],
25
+ "resample": 3,
26
+ "rescale_factor": 0.00392156862745098,
27
+ "size": {
28
+ "shortest_edge": 1024
29
+ }
30
+ }
processing_fastvlm.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import torch
3
+ from transformers import ProcessorMixin, BatchFeature, CLIPImageProcessorFast
4
+ from transformers.image_processing_utils import BaseImageProcessor
5
+ from transformers.image_utils import ImageInput
6
+ from typing import Any, Dict, List, Optional, Union
7
+ from PIL import Image
8
+
9
+ from .llava_qwen import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
10
+
11
# Adapted from transformers.models.llava_next.image_processing_llava_next.expand_to_square
def expand_to_square(image: torch.Tensor, background_color=0) -> torch.Tensor:
    """Pad a CHW image tensor to a square filled with ``background_color``.

    The image is centered along the padded dimension. If the image is
    already square it is returned unchanged (same object, no copy).

    Args:
        image: tensor of shape ``(channels, height, width)``.
        background_color: fill value for the padded region.

    Returns:
        A ``(channels, side, side)`` tensor with ``side = max(height, width)``.
    """
    c, height, width = image.shape
    if width == height:
        return image
    side = max(height, width)
    # torch.full preserves both dtype AND device of the input; the previous
    # `torch.ones(..., dtype=image.dtype) * background_color` always
    # allocated on the CPU, which broke CUDA inputs.
    result = torch.full((c, side, side), background_color, dtype=image.dtype, device=image.device)
    if width > height:
        top = (width - height) // 2
        result[:, top : top + height, :] = image
    else:
        left = (height - width) // 2
        result[:, :, left : left + width] = image
    return result
27
+
28
+
29
class FastVLMImageProcessor(CLIPImageProcessorFast):
    """CLIP-style fast image processor that first pads each image to a square.

    Also records every image's original (width, height) so the model can
    recover the pre-padding geometry from ``image_sizes``.
    """

    def _preprocess(self, images, **kwargs):
        # Capture original sizes as (width, height) before any padding.
        original_sizes = []
        squared = []
        for img in images:
            original_sizes.append(img.shape[-2:][::-1])
            squared.append(expand_to_square(img))
        # Delegate the standard resize/rescale/normalize pipeline to the base
        # class, then collapse its per-image outputs into one batch tensor.
        processed = super()._preprocess(squared, **kwargs)
        batched = torch.stack(processed.pixel_values, dim=0)
        return BatchFeature(data={"pixel_values": batched, "image_sizes": original_sizes})
36
+
37
class FastVLMProcessor(ProcessorMixin):
    """Joint tokenizer + image-processor wrapper for FastVLM inference.

    Tokenizes prompts that may contain at most one ``DEFAULT_IMAGE_TOKEN``
    placeholder, splicing the out-of-vocabulary ``IMAGE_TOKEN_INDEX`` id at
    the placeholder position, and runs the image processor on the images.
    """

    attributes = ["tokenizer", "image_processor"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        tokenizer,
        image_processor,
        chat_template=None,
        **kwargs
    ):
        super().__init__(tokenizer, image_processor, chat_template=chat_template, **kwargs)

    def _encode_prompt(self, prompt: str) -> torch.Tensor:
        """Tokenize one prompt, splicing the image token id at the placeholder.

        Raises:
            ValueError: if the prompt contains more than one image placeholder.
        """
        num_placeholders = prompt.count(DEFAULT_IMAGE_TOKEN)
        if num_placeholders > 1:
            raise ValueError(
                f"Expected up to 1 image tokens per prompt, got {num_placeholders} instead."
            )

        # IMAGE_TOKEN_INDEX is not in the vocab, so the full string cannot be
        # tokenized in one pass; tokenize around the placeholder instead.
        pre, sep, post = prompt.partition(DEFAULT_IMAGE_TOKEN)
        pieces = [self.tokenizer(pre, return_tensors="pt", add_special_tokens=False).input_ids]
        if sep:
            # Only splice the image token when the placeholder actually
            # occurs (previously it was inserted unconditionally, corrupting
            # text-only prompts).
            pieces.append(torch.tensor([[IMAGE_TOKEN_INDEX]], dtype=torch.int64))
            pieces.append(
                self.tokenizer(post, return_tensors="pt", add_special_tokens=False).input_ids
            )
        return torch.cat(pieces, dim=1).to(dtype=torch.int64)

    def __call__(
        self,
        images: ImageInput = None,
        text: Optional[Union[str, List[str]]] = None,
        return_tensors: Optional[str] = "pt",
        **kwargs,
    ) -> BatchFeature:
        """Prepare model inputs from text prompt(s) and optional image(s).

        Args:
            images: image input forwarded to the image processor, or None.
            text: a prompt string or list of prompt strings; each may contain
                at most one ``DEFAULT_IMAGE_TOKEN`` placeholder.
            return_tensors: tensor framework for the returned BatchFeature.

        Returns:
            BatchFeature with ``input_ids``, ``attention_mask`` and the image
            processor's outputs.

        Raises:
            TypeError: if ``text`` is not a string or list of strings.
            ValueError: if a prompt contains more than one image placeholder.
        """
        if isinstance(text, str):
            text = [text]
        elif not (isinstance(text, list) and all(isinstance(t, str) for t in text)):
            # The previous check used `and`, which let lists of non-strings
            # through and crashed on non-subscriptable inputs.
            raise TypeError("Invalid input text. Please provide a string, or a list of strings")

        image_inputs = {}
        if images is not None:
            image_inputs = self.image_processor(images=images)

        encoded = [self._encode_prompt(prompt) for prompt in text]

        # Left-pad to a common length so mixed-length batches work
        # (`torch.cat` on unequal-width rows used to fail); left padding keeps
        # each prompt adjacent to its generated continuation. Output is
        # identical to the old behavior for equal-length batches.
        max_len = max((ids.shape[1] for ids in encoded), default=0)
        pad_id = self.tokenizer.pad_token_id
        if pad_id is None:
            pad_id = 0  # masked out by attention_mask, value is irrelevant
        id_rows, mask_rows = [], []
        for ids in encoded:
            pad_len = max_len - ids.shape[1]
            padding = torch.full((1, pad_len), pad_id, dtype=torch.int64)
            id_rows.append(torch.cat([padding, ids], dim=1))
            mask_rows.append(
                torch.cat(
                    [torch.zeros((1, pad_len), dtype=torch.int64), torch.ones_like(ids)],
                    dim=1,
                )
            )
        if id_rows:
            input_ids = torch.cat(id_rows, dim=0)
            attention_mask = torch.cat(mask_rows, dim=0)
        else:
            input_ids = torch.zeros((0, 0), dtype=torch.int64)
            attention_mask = torch.zeros((0, 0), dtype=torch.int64)

        return BatchFeature(
            data={"input_ids": input_ids, "attention_mask": attention_mask, **image_inputs},
            tensor_type=return_tensors,
        )
processor_config.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "chat_template": "{%- if messages is string -%}\n {{- messages -}}\n{%- else -%}\n {%- for message in messages -%}\n {%- if loop.first and messages[0]['role'] != 'system' -%}\n {{- '<|im_start|>system\\nYou are a helpful assistant.<|im_end|>\\n' -}}\n {%- endif -%}\n {{- '<|im_start|>' + message['role'] + '\\n' -}}\n {%- if message['content'] is string -%}\n {{- message['content'] -}}\n {%- elif message['content'] is iterable -%}\n {%- for item in message['content'] -%}\n {%- if item['type'] == 'image' -%}\n {{- '<image>\\n' -}}\n {%- elif item['type'] == 'text' -%}\n {{- item['text'] -}}\n {%- endif -%}\n {%- endfor -%}\n {%- else -%}\n {{- raise_exception(\"Invalid content type\") -}}\n {%- endif -%}\n {{- '<|im_end|>' + '\\n' -}}\n {%- endfor -%}\n {%- if add_generation_prompt -%}\n {{- '<|im_start|>assistant\\n' -}}\n {%- endif -%}\n{%- endif -%}\n"
3
+ }