Sibgat-Ul commited on
Commit
32d2edb
·
verified ·
1 Parent(s): f262baf

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. config.json +5 -0
  2. data.py +758 -0
  3. encoder.py +1052 -0
  4. model.py +950 -0
  5. model.safetensors.index.json +827 -0
config.json CHANGED
@@ -2,6 +2,11 @@
2
  "architectures": [
3
  "DeepQwenVLForCausalLM"
4
  ],
 
 
 
 
 
5
  "attention_dropout": 0.0,
6
  "bos_token_id": 151643,
7
  "deepseek_vision_hidden_size": 2048,
 
2
  "architectures": [
3
  "DeepQwenVLForCausalLM"
4
  ],
5
+ "auto_map": {
6
+ "AutoConfig": "model.DeepQwenVLConfig",
7
+ "AutoModel": "model.DeepQwenVLForCausalLM",
8
+ "AutoModelForCausalLM": "model.DeepQwenVLForCausalLM"
9
+ },
10
  "attention_dropout": 0.0,
11
  "bos_token_id": 151643,
12
  "deepseek_vision_hidden_size": 2048,
data.py ADDED
@@ -0,0 +1,758 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import List, Dict, Optional, Tuple
2
+ from PIL import Image, ImageOps, ImageDraw, ImageFont
3
+ import torch
4
+ import torch.nn as nn
5
+ from torchvision import transforms
6
+ from transformers import TextStreamer
7
+ from transformers.tokenization_utils import PreTrainedTokenizer as T
8
+ from abc import ABC
9
+ import re
10
+ import numpy as np
11
+
12
+
13
def load_image(image_path):
    """Open the image at *image_path* and apply its EXIF orientation tag.

    Returns:
        The orientation-corrected PIL image, or ``None`` when opening or
        transposing fails (the error is printed, not raised).
    """
    try:
        return ImageOps.exif_transpose(Image.open(image_path))
    except Exception as e:
        print(f"error: {e}")
        return None
24
+
25
+
26
def re_match(text):
    """Locate all ``<|ref|>...<|/ref|><|det|>...<|/det|>`` grounding spans.

    Returns:
        A 3-tuple ``(matches, mathes_image, mathes_other)``: the raw regex
        tuples ``(full_span, ref_text, det_text)``, plus the full-span strings
        partitioned by whether the ref label is ``image``.
    """
    pattern = r'(<\|ref\|>(.*?)<\|/ref\|><\|det\|>(.*?)<\|/det\|>)'
    matches = re.findall(pattern, text, re.DOTALL)

    # Partition full spans on the "image" label marker.
    mathes_image = [m[0] for m in matches if '<|ref|>image<|/ref|>' in m[0]]
    mathes_other = [m[0] for m in matches if '<|ref|>image<|/ref|>' not in m[0]]
    return matches, mathes_image, mathes_other
41
+
42
+
43
def extract_coordinates_and_label(ref_text, image_width, image_height):
    """Parse one ``re_match`` tuple into ``(label, coordinate_list)``.

    Args:
        ref_text: Regex tuple ``(full_span, label_text, det_text)``; the det
            text is expected to be a Python literal list of boxes.
        image_width: Unused; kept for interface compatibility with callers.
        image_height: Unused; kept for interface compatibility with callers.

    Returns:
        ``(label, coordinates)`` on success, or ``None`` when the det text is
        not a valid literal (the error is printed, matching the original
        best-effort behavior).
    """
    import ast  # local import: leaves the module's top-of-file imports untouched

    try:
        label_type = ref_text[1]
        # Security fix: the det text comes from model output (untrusted), so
        # use ast.literal_eval instead of eval to prevent code execution.
        cor_list = ast.literal_eval(ref_text[2])
    except Exception as e:
        print(e)
        return None

    return (label_type, cor_list)
53
+
54
+
55
def draw_bounding_boxes(image, refs, ouput_path):
    """Draw labeled boxes from grounding refs onto a copy of *image*.

    Args:
        image: Source PIL image; left unmodified (drawing happens on a copy).
        refs: Iterable of regex tuples as produced by ``re_match`` /
            consumed by ``extract_coordinates_and_label``.
        ouput_path: Directory root; crops of "image"-labeled boxes are saved
            to ``{ouput_path}/images/{idx}.jpg``. (Parameter name keeps the
            original spelling for keyword-call compatibility.)

    Returns:
        The annotated copy of *image* with a translucent overlay pasted on.

    Notes:
        - Coordinates are interpreted on a 0-999 normalized grid (divided by
          999 and scaled to the pixel size).
        - All per-ref failures are swallowed so one bad ref cannot abort the
          whole rendering pass.
    """

    image_width, image_height = image.size

    # Draw outlines/text on an RGB copy; translucent fills go on a separate
    # RGBA overlay that is composited at the end.
    img_draw = image.copy()
    draw = ImageDraw.Draw(img_draw)

    overlay = Image.new('RGBA', img_draw.size, (0, 0, 0, 0))
    draw2 = ImageDraw.Draw(overlay)

    font = ImageFont.load_default()

    img_idx = 0  # running index for saved "image" crops

    for i, ref in enumerate(refs):
        try:
            result = extract_coordinates_and_label(ref, image_width, image_height)
            if result:
                label_type, points_list = result

                # Random color per ref; alpha-20 variant used for the overlay fill.
                color = (np.random.randint(0, 200), np.random.randint(0, 200), np.random.randint(0, 255))

                color_a = color + (20, )
                for points in points_list:
                    x1, y1, x2, y2 = points

                    # Scale 0-999 normalized coordinates to pixel space.
                    x1 = int(x1 / 999 * image_width)
                    y1 = int(y1 / 999 * image_height)

                    x2 = int(x2 / 999 * image_width)
                    y2 = int(y2 / 999 * image_height)

                    if label_type == 'image':
                        # Save the crop; img_idx advances even when saving fails,
                        # keeping indices aligned with the ref order.
                        try:
                            cropped = image.crop((x1, y1, x2, y2))
                            cropped.save(f"{ouput_path}/images/{img_idx}.jpg")
                        except Exception as e:
                            print(e)
                            pass
                        img_idx += 1

                    try:
                        # Titles get a thicker outline than other labels.
                        if label_type == 'title':
                            draw.rectangle([x1, y1, x2, y2], outline=color, width=4)
                            draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
                        else:
                            draw.rectangle([x1, y1, x2, y2], outline=color, width=2)
                            draw2.rectangle([x1, y1, x2, y2], fill=color_a, outline=(0, 0, 0, 0), width=1)
                        text_x = x1
                        text_y = max(0, y1 - 15)  # label sits just above the box

                        # Size the label background to the rendered text.
                        text_bbox = draw.textbbox((0, 0), label_type, font=font)
                        text_width = text_bbox[2] - text_bbox[0]
                        text_height = text_bbox[3] - text_bbox[1]
                        # NOTE(review): this fill has an alpha component but is drawn
                        # on the RGB copy, so the alpha is likely ignored — confirm
                        # whether it was meant for the RGBA overlay (draw2).
                        draw.rectangle([text_x, text_y, text_x + text_width, text_y + text_height],
                                       fill=(255, 255, 255, 30))

                        draw.text((text_x, text_y), label_type, font=font, fill=color)
                    except:
                        pass
        except:
            continue
    img_draw.paste(overlay, (0, 0), overlay)
    return img_draw
120
+
121
+
122
def process_image_with_refs(image, ref_texts, output_path):
    """Convenience wrapper: render *ref_texts* boxes onto a copy of *image*."""
    return draw_bounding_boxes(image, ref_texts, output_path)
127
+
128
+
129
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Choose the (w, h) tiling ratio whose aspect best matches *aspect_ratio*.

    On an exact tie in aspect difference, the later candidate wins only when
    the source area (width * height) exceeds half of the tiled area that
    candidate would produce — preferring more tiles for large images.
    """
    best = (1, 1)
    best_diff = float('inf')
    area = width * height

    for candidate in target_ratios:
        diff = abs(aspect_ratio - candidate[0] / candidate[1])
        if diff < best_diff:
            best_diff, best = diff, candidate
        elif diff == best_diff and area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            best = candidate

    return best
145
+
146
+
147
def dynamic_preprocess(image, min_num=2, max_num=9, image_size=640, use_thumbnail=False):
    """Resize *image* to a best-fit tile grid and split it into square tiles.

    The grid shape (cols, rows) is chosen from all pairs whose product lies in
    [min_num, max_num], picking the one closest to the input aspect ratio.

    Returns:
        (tiles, (cols, rows)) — a list of image_size x image_size PIL tiles
        (plus an optional square thumbnail of the whole image) and the chosen
        grid shape.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Candidate grid shapes with a tile count inside [min_num, max_num],
    # ordered by total tile count.
    target_ratios = sorted(
        {(i, j)
         for n in range(min_num, max_num + 1)
         for i in range(1, n + 1)
         for j in range(1, n + 1)
         if min_num <= i * j <= max_num},
        key=lambda r: r[0] * r[1],
    )

    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size
    )

    # Resize so the grid divides the image exactly into square tiles.
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    resized_img = image.resize((target_width, target_height))
    cols = target_width // image_size

    processed_images = []
    for idx in range(blocks):
        left = (idx % cols) * image_size
        top = (idx // cols) * image_size
        processed_images.append(
            resized_img.crop((left, top, left + image_size, top + image_size))
        )

    assert len(processed_images) == blocks

    # Optionally append a square thumbnail of the whole image (skipped for a
    # single-tile grid, where the tile already is the whole image).
    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images, target_aspect_ratio
192
+
193
+
194
def normalize_transform(mean, std):
    """Build a torchvision ``Normalize`` from possibly-partial statistics.

    Returns ``None`` when both *mean* and *std* are missing. A missing mean
    defaults to zeros and a missing std to ones, sized to match the provided
    parameter.
    """
    if mean is None and std is None:
        return None
    if mean is None:
        mean = [0.] * len(std)
    elif std is None:
        std = [1.] * len(mean)
    return transforms.Normalize(mean=mean, std=std)
207
+
208
def format_messages(
    tokenizer: T,
    conversations: List[Dict[str, str]],
    system_prompt: str = "",
):
    """Render *conversations* through the tokenizer's chat template.

    A non-empty *system_prompt* is prepended as a system turn; the caller's
    list object is never mutated (a new list is built instead).
    """
    if system_prompt:  # equivalent to: not None and not empty
        conversations = [{"role": "system", "content": system_prompt}] + conversations

    return tokenizer.apply_chat_template(
        conversations,
    )
225
+
226
+
227
def text_encode(tokenizer, text: str, bos: bool = True, eos: bool = False):
    """
    Encode *text*, optionally framing it with BOS/EOS token ids.

    Note: Qwen2VL tokenizer has bos_token_id=None, so BOS is skipped for Qwen.
    The chat template handles special tokens automatically.
    """
    ids = tokenizer.encode(text, add_special_tokens=False)

    # BOS/EOS are attached only when requested AND the tokenizer defines them.
    if bos and tokenizer.bos_token_id is not None:
        ids = [tokenizer.bos_token_id] + ids
    if eos and tokenizer.eos_token_id is not None:
        ids = ids + [tokenizer.eos_token_id]

    return ids
247
+
248
def load_pil_images(conversations: List[Dict[str, str]]) -> List[Image.Image]:
    """Collect the PIL images referenced by the user turns of *conversations*.

    Supports both the Qwen-style ``"image"`` key and the legacy ``"data"`` key.
    Only user messages are scanned; at most one image per message is kept (the
    last one when a content list holds several), and failed loads are skipped.
    """
    pil_images = []

    for message in conversations:
        if message["role"].lower() != "user":
            continue

        content = message["content"]
        pil_image = None

        if isinstance(content, List):
            for part in content:
                if part.get("type", "") == "image":
                    # Accept "image" (Qwen format) with "data" as fallback.
                    pil_image = load_image(part.get("image") or part.get("data", ""))
        elif isinstance(content, Dict):
            if content.get("type", "") == "image":
                # Accept "image" (Qwen format) with "data" as fallback.
                pil_image = load_image(content.get("image") or content.get("data", ""))

        if pil_image is not None:
            pil_images.append(pil_image)

    return pil_images
272
+
273
+
274
class BaseTransform(ABC):
    """Minimal interface for image transforms used by the data pipeline."""

    def set_rng(self, *args, **kwargs):
        """Hook for seeding randomized transforms; no-op in the base class."""
        pass

    def __call__(self, *args, **kwargs) -> torch.Tensor:
        """Apply the transform; the base implementation does nothing."""
        pass

    @property
    def default_shape(self):
        """Output shape hint; concrete subclasses must provide it."""
        raise NotImplementedError
285
+
286
+
287
class BasicImageTransform(BaseTransform):
    """``ToTensor`` followed by an optional per-channel normalization."""

    def __init__(
        self,
        mean: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
        std: Optional[Tuple[float, float, float]] = (0.5, 0.5, 0.5),
        normalize: bool = True
    ):
        # Stats are kept around so callers (e.g. padding-color computation)
        # can read them back.
        self.mean = mean
        self.std = std

        steps = [transforms.ToTensor()]

        # When normalization is disabled an Identity stands in; when both
        # stats are None, normalize_transform returns None and no step is added.
        norm_step = normalize_transform(mean, std) if normalize else nn.Identity()
        if norm_step is not None:
            steps.append(norm_step)

        self.transform = transforms.Compose(steps)

    def __call__(self, x):
        return self.transform(x)
310
+
311
class NoEOSTextStreamer(TextStreamer):
    """TextStreamer that prints generated text with the EOS token shown as a newline."""

    def on_finalized_text(self, text: str, stream_end: bool = False):
        # Decode the EOS token once so it can be stripped from the stream.
        eos_text = self.tokenizer.decode([self.tokenizer.eos_token_id], skip_special_tokens=False)
        print(text.replace(eos_text, "\n"), flush=True, end="")
317
+
318
+
319
+
320
+
321
+ # @title Create datacollator
322
+
323
+ import torch
324
+ import math
325
+ from dataclasses import dataclass
326
+ from typing import Dict, List, Any, Tuple
327
+ from PIL import Image, ImageOps
328
+ from torch.nn.utils.rnn import pad_sequence
329
+ import io
330
+
331
+ # Use local functions (Qwen-compatible) instead of DeepSeek's versions
332
+ # from deepseek_ocr.modeling_deepseekocr import (
333
+ # format_messages,
334
+ # text_encode,
335
+ # BasicImageTransform,
336
+ # dynamic_preprocess,
337
+ # )
338
+
339
+
340
@dataclass
class DeepQwenDataCollator:
    """
    Data collator for DeepQwen model using Qwen2VL tokenizer.

    This collator processes images using DeepSeek OCR's dynamic cropping algorithm
    while maintaining compatibility with Qwen2VL's tokenization format.

    Key token mappings (Qwen2VL):
        - image_token: <|image_pad|> (id=151655)
        - vision_start: <|vision_start|> (id=151652)
        - vision_end: <|vision_end|> (id=151653)
        - eos_token: <|im_end|> (id=151645)
        - NO bos_token (bos_token_id is None)

    NOTE(review): the class is decorated with @dataclass but also defines its
    own __init__, so the dataclass-generated __init__ is never created; the
    field declarations below serve only as documentation/annotations.

    Args:
        tokenizer: Qwen2VL Tokenizer
        model: Model
        image_size: Size for image patches (default: 640)
        base_size: Size for global view (default: 1024)
        crop_mode: Whether to use dynamic cropping for large images
        train_on_responses_only: If True, only train on assistant responses (mask user prompts)
    """
    tokenizer: T
    model: Any
    image_size: int = 640
    base_size: int = 1024
    crop_mode: bool = True
    train_on_responses_only: bool = True

    def __init__(
        self,
        tokenizer,
        model,
        image_size: int = 640,
        base_size: int = 1024,
        crop_mode: bool = True,
        train_on_responses_only: bool = True,
        max_length: int = None,
    ):
        self.tokenizer = tokenizer
        self.model = model
        self.image_size = image_size
        self.base_size = base_size
        self.crop_mode = crop_mode
        self.dtype = model.dtype  # Get dtype from model
        self.train_on_responses_only = train_on_responses_only
        self.max_length = max_length  # None means no truncation

        # Qwen2VL specific token IDs
        # <|image_pad|> = 151655
        self.image_token_id = getattr(tokenizer, 'image_token_id', None)
        if self.image_token_id is None:
            # Fallback: try to get from added_tokens or use default Qwen2VL value
            self.image_token_id = 151655  # Qwen2VL's <|image_pad|>

        self.image_token = tokenizer.decode([self.image_token_id], skip_special_tokens=False)

        # Vision wrapper tokens for Qwen2VL format
        self.vision_start_token_id = getattr(tokenizer, 'vision_start_token_id', 151652)
        self.vision_end_token_id = getattr(tokenizer, 'vision_end_token_id', 151653)

        # Normalization stats (0.5, 0.5, 0.5) also determine the gray padding
        # color used for the global view in process_image.
        self.image_transform = BasicImageTransform(
            mean=(0.5, 0.5, 0.5),
            std=(0.5, 0.5, 0.5),
            normalize=True
        )
        self.patch_size = 16
        self.downsample_ratio = 4

        # Qwen2VL has NO bos_token (bos_token_id is None)
        # The chat template handles conversation formatting
        self.bos_id = tokenizer.bos_token_id  # Will be None for Qwen2VL
        self.eos_id = tokenizer.eos_token_id  # 151645 for Qwen2VL
        self.pad_token_id = tokenizer.pad_token_id  # 151643 for Qwen2VL

    def deserialize_image(self, image_data) -> Image.Image:
        """Convert image data (bytes dict, PIL Image, or file path) to PIL Image in RGB mode"""
        if isinstance(image_data, Image.Image):
            return image_data.convert("RGB")
        elif isinstance(image_data, str):
            # File path - load lazily
            image = load_image(image_data)
            if image is None:
                raise ValueError(f"Failed to load image from path: {image_data}")
            return image.convert("RGB")
        elif isinstance(image_data, dict) and 'bytes' in image_data:
            image_bytes = image_data['bytes']
            image = Image.open(io.BytesIO(image_bytes))
            return image.convert("RGB")
        else:
            raise ValueError(f"Unsupported image format: {type(image_data)}")

    def calculate_image_token_count(self, image: Image.Image, crop_ratio: Tuple[int, int]) -> int:
        """Calculate the number of tokens this image will generate.

        Mirrors the token layout built in ``process_image``: a base-view grid
        plus one extra token, and (in crop mode with multiple tiles) an
        additional per-tile grid with one extra token per row.
        """
        num_queries = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
        num_queries_base = math.ceil((self.base_size // self.patch_size) / self.downsample_ratio)

        width_crop_num, height_crop_num = crop_ratio

        if self.crop_mode:
            img_tokens = num_queries_base * num_queries_base + 1
            if width_crop_num > 1 or height_crop_num > 1:
                img_tokens += (num_queries * width_crop_num + 1) * (num_queries * height_crop_num)
        else:
            img_tokens = num_queries * num_queries + 1

        return img_tokens

    def process_image(self, image: Image.Image) -> Tuple[List, List, List, List, Tuple[int, int]]:
        """
        Process a single image based on crop_mode and size thresholds

        Returns:
            Tuple of (images_list, images_crop_list, images_spatial_crop, tokenized_image, crop_ratio)
        """
        images_list = []
        images_crop_list = []
        images_spatial_crop = []

        if self.crop_mode:
            # Determine crop ratio based on image size
            if image.size[0] <= 640 and image.size[1] <= 640:
                crop_ratio = (1, 1)
                images_crop_raw = []
            else:
                images_crop_raw, crop_ratio = dynamic_preprocess(
                    image, min_num=2, max_num=9,
                    image_size=self.image_size, use_thumbnail=False
                )

            # Process global view with padding
            # Padding color matches the normalization mean (mid-gray), so
            # padded regions normalize to ~0.
            global_view = ImageOps.pad(
                image, (self.base_size, self.base_size),
                color=tuple(int(x * 255) for x in self.image_transform.mean)
            )
            images_list.append(self.image_transform(global_view).to(self.dtype))

            width_crop_num, height_crop_num = crop_ratio
            images_spatial_crop.append([width_crop_num, height_crop_num])

            # Process local views (crops) if applicable
            if width_crop_num > 1 or height_crop_num > 1:
                for crop_img in images_crop_raw:
                    images_crop_list.append(
                        self.image_transform(crop_img).to(self.dtype)
                    )

            # Calculate image tokens
            num_queries = math.ceil((self.image_size // self.patch_size) / self.downsample_ratio)
            num_queries_base = math.ceil((self.base_size // self.patch_size) / self.downsample_ratio)

            # Layout: each row of base queries is followed by one extra image
            # token, plus a single trailing token for the whole base view.
            tokenized_image = ([self.image_token_id] * num_queries_base + [self.image_token_id]) * num_queries_base
            tokenized_image += [self.image_token_id]

            if width_crop_num > 1 or height_crop_num > 1:
                # Same row+separator layout for the tiled local views.
                tokenized_image += ([self.image_token_id] * (num_queries * width_crop_num) + [self.image_token_id]) * (
                    num_queries * height_crop_num)

        else:  # crop_mode = False
            crop_ratio = (1, 1)
            images_spatial_crop.append([1, 1])

            # For smaller base sizes, resize; for larger, pad
            if self.base_size <= 640:
                resized_image = image.resize((self.base_size, self.base_size), Image.LANCZOS)
                images_list.append(self.image_transform(resized_image).to(self.dtype))
            else:
                global_view = ImageOps.pad(
                    image, (self.base_size, self.base_size),
                    color=tuple(int(x * 255) for x in self.image_transform.mean)
                )
                images_list.append(self.image_transform(global_view).to(self.dtype))

            num_queries = math.ceil((self.base_size // self.patch_size) / self.downsample_ratio)
            tokenized_image = ([self.image_token_id] * num_queries + [self.image_token_id]) * num_queries
            tokenized_image += [self.image_token_id]

        return images_list, images_crop_list, images_spatial_crop, tokenized_image, crop_ratio

    def process_single_sample(self, messages: List[Dict]) -> Dict[str, Any]:
        """
        Process a single conversation into model inputs.

        Expected message format (Qwen2.5-VL native style):
        [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": <PIL.Image or path or bytes>},
                    {"type": "text", "text": "Describe this image."}
                ]
            },
            {
                "role": "assistant",
                "content": [{"type": "text", "text": "This is a description..."}]
            }
        ]

        Also supports string content for backward compatibility.

        Returns a dict with input_ids, images_seq_mask, image tensors, and
        prompt_token_count (the index where the first assistant turn begins,
        used later for response-only loss masking).
        """

        # --- 1. Setup ---
        tokenized_str = []
        images_seq_mask = []
        images_list, images_crop_list, images_spatial_crop = [], [], []

        prompt_token_count = -1  # Index to start training
        assistant_started = False

        # Qwen2VL has NO bos_token, so we don't add one

        for message in messages:
            role = message["role"].lower()  # Normalize role to lowercase
            content = message["content"]

            # Check if this is the assistant's turn
            if role == "assistant":
                if not assistant_started:
                    # This is the split point. All tokens added *so far*
                    # are part of the prompt.
                    prompt_token_count = len(tokenized_str)
                    assistant_started = True

            # Process content based on format
            if isinstance(content, list):
                # Qwen2.5-VL native format: content is a list of typed items
                content_parts = []  # NOTE(review): unused; kept as-is

                for item in content:
                    item_type = item.get("type", "")

                    if item_type == "image":
                        # Get image data from various possible keys
                        image_data = item.get("image") or item.get("data")
                        if image_data is not None:
                            pil_image = self.deserialize_image(image_data)

                            # Process the image through DeepSeek's encoder
                            img_list, crop_list, spatial_crop, tok_img, _ = self.process_image(pil_image)

                            images_list.extend(img_list)
                            images_crop_list.extend(crop_list)
                            images_spatial_crop.extend(spatial_crop)

                            # Add image placeholder tokens
                            tokenized_str.extend(tok_img)
                            images_seq_mask.extend([True] * len(tok_img))

                    elif item_type == "text":
                        text = item.get("text", "")

                        # For assistant, append EOS at the end of all text
                        if role == "assistant" and item == content[-1]:
                            if self.tokenizer.eos_token:
                                text = f"{text.strip()}{self.tokenizer.eos_token}"

                        # Tokenize the text
                        tokenized_text = text_encode(self.tokenizer, text, bos=False, eos=False)
                        tokenized_str.extend(tokenized_text)
                        images_seq_mask.extend([False] * len(tokenized_text))

            else:
                # Legacy format: content is a string (backward compatibility)
                text_content = content

                # For assistant, append EOS token
                if role == "assistant" and self.tokenizer.eos_token:
                    text_content = f"{text_content.strip()}{self.tokenizer.eos_token}"

                # Tokenize the text
                tokenized_text = text_encode(self.tokenizer, text_content, bos=False, eos=False)
                tokenized_str.extend(tokenized_text)
                images_seq_mask.extend([False] * len(tokenized_text))

        # --- 2. Validation and Final Prep ---
        # If we never found an assistant message, we're in a weird state
        # (e.g., user-only prompt). We mask everything.
        if not assistant_started:
            print("Warning: No assistant message found in sample. Masking all tokens.")
            prompt_token_count = len(tokenized_str)

        # # DEBUG: Print after processing
        # print(f"[DEBUG] tokenized_str length: {len(tokenized_str)}")
        # print(f"[DEBUG] images_seq_mask length: {len(images_seq_mask)}, True count: {sum(images_seq_mask)}")
        # print(f"[DEBUG] images_list length: {len(images_list)}")
        # print(f"[DEBUG] images_crop_list length: {len(images_crop_list)}")
        # print(f"[DEBUG] prompt_token_count: {prompt_token_count}")

        # Prepare image tensors
        # NOTE(review): torch.stack raises on an empty images_list, so a
        # text-only sample would fail here — confirm samples always carry an image.
        images_ori = torch.stack(images_list, dim=0)
        images_spatial_crop_tensor = torch.tensor(images_spatial_crop, dtype=torch.long)

        if images_crop_list:
            images_crop = torch.stack(images_crop_list, dim=0)
        else:
            images_crop = torch.zeros((1, 3, self.base_size, self.base_size), dtype=self.dtype)

        return {
            "input_ids": torch.tensor(tokenized_str, dtype=torch.long),
            "images_seq_mask": torch.tensor(images_seq_mask, dtype=torch.bool),
            "images_ori": images_ori,
            "images_crop": images_crop,
            "images_spatial_crop": images_spatial_crop_tensor,
            "prompt_token_count": prompt_token_count,  # This is now accurate
        }

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        """
        Collate batch of samples.

        Expected feature format:
            {
                "prompt": str,            # The user's question/instruction
                "response": str,          # The assistant's response
                "image": PIL.Image or bytes dict or path  # The image
            }

        Each feature is converted to the Qwen2.5-VL native conversation format
        (one user turn holding the image + prompt, one assistant turn holding
        the response) and run through process_single_sample; the per-sample
        outputs are then padded, truncated, and label-masked into a batch.
        """
        batch_data = []

        # Process each sample
        for feature in features:
            try:
                # Get image from either 'image' or 'image_path' key (lazy loading support)
                image_data = feature.get('image') or feature.get('image_path')
                if image_data is None:
                    raise ValueError("Sample missing both 'image' and 'image_path' keys")

                # Use Qwen2.5-VL native message format
                # content is a list of typed items: {"type": "image", ...} or {"type": "text", ...}
                messages = [
                    {
                        "role": "user",
                        "content": [
                            {"type": "image", "image": image_data},
                            {"type": "text", "text": feature['prompt']}
                        ]
                    },
                    {
                        "role": "assistant",
                        "content": [
                            {"type": "text", "text": feature["response"]}
                        ]
                    }
                ]

                processed = self.process_single_sample(messages)
                batch_data.append(processed)
            except Exception as e:
                # Best-effort: a bad sample is dropped, not fatal for the batch.
                print(f"Error processing sample: {e}")
                continue

        if not batch_data:
            raise ValueError("No valid samples in batch")

        # Extract lists
        input_ids_list = [item['input_ids'] for item in batch_data]
        images_seq_mask_list = [item['images_seq_mask'] for item in batch_data]
        prompt_token_counts = [item['prompt_token_count'] for item in batch_data]

        # Pad sequences using Qwen2VL's pad_token_id (151643 = <|endoftext|>)
        input_ids = pad_sequence(input_ids_list, batch_first=True, padding_value=self.pad_token_id)
        images_seq_mask = pad_sequence(images_seq_mask_list, batch_first=True, padding_value=False)

        # Truncate to max_length if specified (prevents OOM on long sequences)
        # NOTE(review): truncation can cut image-placeholder tokens mid-image;
        # confirm the model tolerates a truncated image-token run.
        if self.max_length is not None and input_ids.shape[1] > self.max_length:
            input_ids = input_ids[:, :self.max_length]
            images_seq_mask = images_seq_mask[:, :self.max_length]
            # Adjust prompt_token_counts if they exceed max_length
            prompt_token_counts = [min(p, self.max_length) for p in prompt_token_counts]

        # Create labels
        labels = input_ids.clone()

        # Mask padding tokens
        labels[labels == self.pad_token_id] = -100

        # Mask image tokens (model shouldn't predict these)
        labels[images_seq_mask] = -100

        # Mask user prompt tokens when train_on_responses_only=True (only train on assistant responses)
        if self.train_on_responses_only:
            for idx, prompt_count in enumerate(prompt_token_counts):
                if prompt_count > 0:
                    labels[idx, :prompt_count] = -100

        # Create attention mask
        # NOTE(review): a real pad id appearing inside the content would be
        # masked out here too, since padding is detected by token value.
        attention_mask = (input_ids != self.pad_token_id).long()

        # Per-sample (crop tensor, global-view tensor) pairs, kept as a list
        # because crop counts differ across samples.
        images_batch = []
        for item in batch_data:
            images_batch.append((item['images_crop'], item['images_ori']))

        images_spatial_crop = torch.cat([item['images_spatial_crop'] for item in batch_data], dim=0)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
            "images": images_batch,
            "images_seq_mask": images_seq_mask,
            "images_spatial_crop": images_spatial_crop,
        }
758
+
encoder.py ADDED
@@ -0,0 +1,1052 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import copy
5
+
6
+ from contextlib import nullcontext
7
+ import math
8
+ from typing import Optional, Tuple
9
+ # from megatron.model import LayerNorm
10
+
11
+ from einops import rearrange
12
+ from easydict import EasyDict as adict
13
+
14
+ from typing import Optional, Tuple, Type
15
+ from functools import partial
16
+
17
class MlpProjector(nn.Module):
    """Vision-to-language projector: maps vision-encoder features to the LLM
    embedding width ``cfg.n_embed``.

    The architecture is selected by ``cfg.projector_type``:
      - "identity": pass-through.
      - "linear": single Linear(input_dim -> n_embed).
      - "mlp_gelu": Linear followed by ``depth - 1`` GELU+Linear pairs.
      - "downsample_mlp_gelu" / "normlayer_downsample_mlp_gelu": spatially
        downsample tokens by ``downsample_ratio`` (each ratio x ratio patch
        group is concatenated channel-wise via F.unfold in forward()) before
        an MLP; the "normlayer" variant prepends a LayerNorm.
      - "low_high_hybrid_split_mlp_gelu": two input streams, each projected to
        n_embed // 2 and concatenated on the channel axis.
      - "hybrid_split_feature_mlp_gelu": one input whose channels are split at
        ``cfg.input_dim[0]`` into two streams projected separately.
      - "low_high_split_mlp_gelu": two streams with independent MLP towers.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg

        if cfg.projector_type == "identity":
            modules = nn.Identity()

        elif cfg.projector_type == "linear":
            modules = nn.Linear(cfg.input_dim, cfg.n_embed)

        elif cfg.projector_type == "mlp_gelu":
            mlp_depth = cfg.get("depth", 1)
            modules = [nn.Linear(cfg.input_dim, cfg.n_embed)]
            for _ in range(1, mlp_depth):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))
            modules = nn.Sequential(*modules)

        elif cfg.projector_type == "normlayer_downsample_mlp_gelu":
            mlp_depth = cfg.get("depth", 1)
            mlp_ratio = cfg.get("mlp_ratio", 1)
            # Input width grows by downsample_ratio**2 because that many tokens
            # are concatenated channel-wise in forward().
            modules = [
                nn.LayerNorm(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio),
                nn.Linear(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio, cfg.n_embed * mlp_ratio)
            ]
            for _ in range(1, mlp_depth - 1):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed * mlp_ratio))
            modules.append(nn.GELU())
            modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))
            modules = nn.Sequential(*modules)

        elif cfg.projector_type == "downsample_mlp_gelu":
            mlp_depth = cfg.get("depth", 1)
            mlp_ratio = cfg.get("mlp_ratio", 1)
            modules = [nn.Linear(cfg.input_dim * cfg.downsample_ratio * cfg.downsample_ratio, cfg.n_embed * mlp_ratio)]
            for _ in range(1, mlp_depth - 1):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed * mlp_ratio))
            modules.append(nn.GELU())
            modules.append(nn.Linear(cfg.n_embed * mlp_ratio, cfg.n_embed))
            modules = nn.Sequential(*modules)

        elif cfg.projector_type == "low_high_hybrid_split_mlp_gelu":
            mlp_depth = cfg.get("depth", 1)
            # Each stream gets half of the LLM embedding width.
            self.high_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2)
            self.low_up_proj = nn.Linear(cfg.input_dim, cfg.n_embed // 2)

            modules = []
            for _ in range(1, mlp_depth):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))
            modules = nn.Sequential(*modules)

        elif cfg.projector_type == "hybrid_split_feature_mlp_gelu":
            mlp_depth = cfg.get("depth", 1)
            # channel_div controls how much of n_embed goes to the "high" stream.
            channel_div = cfg.get("channel_div", 0.5)
            self.high_up_proj = nn.Linear(cfg.input_dim[0], int(cfg.n_embed * channel_div))
            self.low_up_proj = nn.Linear(cfg.input_dim[1], cfg.n_embed - int(cfg.n_embed * channel_div))

            modules = []
            for _ in range(1, mlp_depth):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed, cfg.n_embed))
            modules = nn.Sequential(*modules)

        elif cfg.projector_type == "low_high_split_mlp_gelu":
            mlp_depth = cfg.get("depth", 1)
            modules = []
            for _ in range(1, mlp_depth):
                modules.append(nn.GELU())
                modules.append(nn.Linear(cfg.n_embed // 2, cfg.n_embed // 2))
            modules = nn.Sequential(*modules)
            # Independent towers for the two streams; low tower is a deep copy
            # so parameters are not shared.
            self.high_layers = nn.Sequential(*modules)
            self.low_layers = copy.deepcopy(modules)

        else:
            raise ValueError(f"Unknown projector type: {cfg.projector_type}")

        if cfg.get("token_pooling", False):
            # Fuses each 2x2 token group (4x channels) back to input_dim.
            self.token_pooling_layer = nn.Linear(cfg.input_dim * 4, cfg.input_dim)

        if cfg.get("conv_fusion_high_low_features", False):
            self.fusion_layer = nn.Linear(cfg.input_dim, cfg.input_dim)
        self.layers = modules

    def forward(self, x):
        # Optional 2x2 token pooling: assumes x is (B, H*W, C) with a square
        # token grid — TODO confirm for non-square inputs.
        if self.cfg.get("token_pooling", False):
            batch_size, wxh, channels = x.shape
            w = h = int(wxh**0.5)
            x = x.view(batch_size, w, h, channels)
            x = x.permute(0, 3, 1, 2)
            patches = x.unfold(2, 2, 2).unfold(3, 2, 2)
            batch_size, channels, h_patches, w_patches, _, _ = patches.size()
            # Concatenate along the channel dimension.
            patches = patches.contiguous().view(batch_size, channels, h_patches * w_patches, -1)

            # Pass through the linear pooling layer.
            patches = patches.permute(0, 2, 1, 3).contiguous()
            patches = patches.view(batch_size, h_patches * w_patches, channels * 4)

            x = self.token_pooling_layer(patches)

        if self.cfg.get("conv_fusion_high_low_features", False):
            # x[:, 0] / x[:, 1]: two stacked feature streams fused additively.
            x = self.fusion_layer(x[:, 0]) + x[:, 1]

        if self.cfg.projector_type == 'low_high_hybrid_split_mlp_gelu':
            # x is a pair (high_features, low_features).
            high_x, low_x = x[0], x[1]
            high_x = self.high_up_proj(high_x)
            low_x = self.low_up_proj(low_x)
            x = torch.concat([high_x, low_x], dim=-1)

        if self.cfg.projector_type == 'hybrid_split_feature_mlp_gelu':
            # Split a single tensor's channels into the two streams.
            high_x = x[...,:self.cfg.input_dim[0]]
            low_x = x[...,self.cfg.input_dim[0]:]
            high_x = self.high_up_proj(high_x)
            low_x = self.low_up_proj(low_x)
            x = torch.concat([high_x, low_x], dim=-1)

        if self.cfg.projector_type == 'low_high_split_mlp_gelu':
            # Early return: each stream already went through its own tower.
            high_x, low_x = x[0], x[1]
            high_x = self.high_layers(high_x)
            low_x = self.low_layers(low_x)
            x = torch.concat([high_x, low_x], dim=-1)
            return x

        if self.cfg.projector_type == 'downsample_mlp_gelu' or self.cfg.projector_type == 'normlayer_downsample_mlp_gelu':
            bs, hw, input_dim = x.shape
            h = w = int((hw) ** 0.5)

            """compute padding"""
            if h % self.cfg.downsample_ratio:
                pad = self.cfg.downsample_ratio - h % self.cfg.downsample_ratio
            else:
                pad = 0
            x = x.reshape(bs, h, w, input_dim)
            if pad > 0:
                x = F.pad(x, (0, 0, 0, pad, 0, pad), "constant", 0)

            """4 to 1 concat"""
            x = x.permute(0, 3, 1, 2)  # B, C, H, W
            x = F.unfold(x, kernel_size=self.cfg.downsample_ratio, stride=self.cfg.downsample_ratio, padding=0)  # B, C*r*r, HW // (r*r)
            x = x.permute(0, 2, 1)

        return self.layers(x)

    @staticmethod
    def get_flops_per_sample(cfg):
        """Approximate training FLOPs per token for the projector (fwd * 3
        accounts for forward + backward)."""
        if cfg.projector_type == "linear":
            fwd = 2 * cfg.input_dim * cfg.n_embed

        elif "mlp_gelu" in cfg.projector_type :
            mlp_depth = cfg.get("depth", 1)
            downsample_ratio = cfg.get("downsample_ratio", 1)
            input_dim = sum(cfg.input_dim) if isinstance(cfg.input_dim, list) else cfg.input_dim
            input_dim = input_dim * downsample_ratio * downsample_ratio
            fwd = 2 * input_dim * cfg.n_embed + (mlp_depth - 1) * 2 * cfg.n_embed * cfg.n_embed
        else:
            fwd = 0

        return fwd * 3
180
+
181
+
182
+ #===================clip============================================================
183
+
184
class LayerNormfp32(torch.nn.LayerNorm):
    """LayerNorm that always computes its statistics in float32.

    Half-precision inputs are upcast before normalization and the result is
    cast back to the input dtype, avoiding fp16 precision issues.
    """

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        input_dtype = x.dtype
        normalized = super().forward(x.to(torch.float32))
        return normalized.to(input_dtype)
191
+
192
+
193
def get_abs_pos(abs_pos, tgt_size):
    """Resize a CLIP-style absolute position table to a new token count.

    Args:
        abs_pos: position embeddings of shape (1, 1 + src_side**2, C) — one
            class token followed by a square grid of patch positions.
        tgt_size: target sequence length (class token included); the patch
            grid side is taken as int(sqrt(tgt_size)).

    Returns:
        Tensor of shape (1, tgt_side**2 + 1, C); the input unchanged when the
        grid size already matches. The grid is resized with bicubic
        interpolation (antialiased) in float32, then cast back.
    """
    channels = abs_pos.size(-1)
    table = abs_pos.squeeze(0)
    cls_embed, grid_embed = table[:1], table[1:]

    src_side = int(math.sqrt(table.shape[0] - 1))
    tgt_side = int(math.sqrt(tgt_size))
    orig_dtype = abs_pos.dtype

    if src_side == tgt_side:
        return abs_pos

    # (L, C) -> (1, C, S, S) for 2-D interpolation.
    grid = grid_embed.view(1, src_side, src_side, channels).permute(0, 3, 1, 2).contiguous()
    resized = F.interpolate(
        grid.to(torch.float32),
        size=(tgt_side, tgt_side),
        mode='bicubic',
        antialias=True,
        align_corners=False,
    ).to(orig_dtype)
    resized = resized.permute(0, 2, 3, 1).view(tgt_side * tgt_side, channels)
    # Re-attach the class-token embedding, which is never interpolated.
    combined = torch.cat([cls_embed, resized], dim=0)
    return combined.view(1, tgt_side * tgt_side + 1, channels)
230
+
231
@torch.jit.script
def quick_gelu(x):
    """QuickGELU: the sigmoid-based GELU approximation x * sigmoid(1.702 * x)."""
    return torch.sigmoid(x * 1.702) * x
234
+
235
+
236
+
237
class CLIPVisionEmbeddings(nn.Module):
    """CLIP-style vision embedding: conv patchify + class token + (resizable)
    absolute position embeddings.

    ``forward`` optionally accepts precomputed ``patch_embeds``; when given,
    the internal conv is skipped and the provided tensor is used instead.
    """

    def __init__(self, hidden_size=1024, image_size=224, patch_size=14, num_channels=3):
        super().__init__()
        self.embed_dim = hidden_size
        self.image_size = image_size
        self.patch_size = patch_size

        self.class_embedding = torch.nn.Parameter(torch.randn(self.embed_dim))

        self.patch_embedding = torch.nn.Conv2d(
            in_channels=num_channels,
            out_channels=self.embed_dim,
            kernel_size=self.patch_size,
            stride=self.patch_size,
            bias=False,
        )

        self.num_patches = (self.image_size // self.patch_size) ** 2
        self.num_positions = self.num_patches + 1
        self.position_embedding = torch.nn.Embedding(self.num_positions, self.embed_dim)
        self.register_buffer(
            "position_ids", torch.arange(self.num_positions).expand((1, -1))
        )

    def forward(self, pixel_values, patch_embeds):
        batch_size = pixel_values.shape[0]

        # Use caller-supplied patch features when available; otherwise patchify.
        if patch_embeds is None:
            patch_embeds = self.patch_embedding(pixel_values)
        # (B, C, H, W) -> (B, H*W, C). NOTE(review): this flatten/transpose is
        # applied to external patch_embeds too — they are assumed to be in
        # (B, C, grid, grid) layout; confirm against the caller.
        patch_tokens = patch_embeds.flatten(2).transpose(1, 2)

        cls_tokens = self.class_embedding.expand(batch_size, 1, -1)
        embeddings = torch.cat([cls_tokens, patch_tokens], dim=1)

        # Position table is interpolated to the actual token count.
        pos = get_abs_pos(self.position_embedding(self.position_ids), embeddings.size(1))
        return embeddings + pos
287
+
288
+
289
class NoTPFeedForward(nn.Module):
    """Two-layer feed-forward network with QuickGELU (no tensor parallelism)."""

    def __init__(
        self,
        cfg,
        dim: int,
        hidden_dim: int,
    ):
        super().__init__()
        self.fc1 = torch.nn.Linear(dim, hidden_dim, bias=True)
        self.fc2 = torch.nn.Linear(hidden_dim, dim, bias=True)

    def forward(self, x):
        hidden = quick_gelu(self.fc1(x))
        return self.fc2(hidden)
304
+
305
+
306
+
307
+
308
class NoTPAttention(torch.nn.Module):
    """Multi-head self-attention without tensor parallelism.

    Fix: the original forward() had two byte-identical branches for
    ``use_flash_attention`` True/False — both called PyTorch's
    ``scaled_dot_product_attention`` — so the duplicated code is unified into
    one path. Behavior is unchanged (SDPA already dispatches to a flash
    kernel when available).
    """

    def __init__(self, cfg):
        super().__init__()
        self.num_heads = cfg.num_attention_heads
        self.n_local_heads = cfg.num_attention_heads
        self.head_dim = cfg.hidden_size // cfg.num_attention_heads
        self.max_seq_len = cfg.seq_length
        # Kept for interface compatibility; forward() no longer branches on it.
        self.use_flash_attention = cfg.use_flash_attn

        # Fused projection producing q, k and v in one matmul.
        self.qkv_proj = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size * 3, bias=True)
        self.out_proj = torch.nn.Linear(cfg.hidden_size, cfg.hidden_size, bias=True)

        self.attn_drop = cfg.attention_dropout

    def forward(
        self,
        x: torch.Tensor,
    ):
        """Full (non-causal) self-attention over ``x`` of shape (B, S, H)."""
        bsz, seqlen, _ = x.shape
        xqkv = self.qkv_proj(x).view(bsz, seqlen, 3, self.num_heads, self.head_dim)

        xq, xk, xv = (t.squeeze(2) for t in torch.split(xqkv, 1, dim=2))
        # (B, S, num_heads, head_dim) -> (B, num_heads, S, head_dim)
        xq = xq.permute(0, 2, 1, 3)
        xk = xk.permute(0, 2, 1, 3)
        xv = xv.permute(0, 2, 1, 3)

        output = torch.nn.functional.scaled_dot_product_attention(xq, xk, xv, attn_mask=None)
        output = output.permute(0, 2, 1, 3).reshape(bsz, seqlen, -1)
        return self.out_proj(output)
366
+
367
class NoTPTransformerBlock(nn.Module):
    """Pre-norm transformer block: LN -> self-attention -> LN -> MLP, each
    wrapped in a residual connection."""

    def __init__(self, cfg, layer_id: int, multiple_of=256):
        super().__init__()

        self.n_heads = cfg.num_attention_heads
        self.dim = cfg.hidden_size
        self.head_dim = cfg.hidden_size // cfg.num_attention_heads
        self.self_attn = NoTPAttention(cfg)
        self.mlp = NoTPFeedForward(
            cfg, dim=cfg.hidden_size, hidden_dim=cfg.ffn_hidden_size
        )
        self.layer_id = layer_id
        self.layer_norm1 = torch.nn.LayerNorm(
            cfg.hidden_size, eps=cfg.layernorm_epsilon
        )
        self.layer_norm2 = torch.nn.LayerNorm(
            cfg.hidden_size, eps=cfg.layernorm_epsilon
        )

    def forward(self, x: torch.Tensor):
        attn_out = self.self_attn.forward(self.layer_norm1(x))
        hidden = x + attn_out
        return hidden + self.mlp.forward(self.layer_norm2(hidden))
391
+
392
+
393
class NoTPTransformer(nn.Module):
    """A plain sequential stack of NoTPTransformerBlock layers."""

    def __init__(self, cfg):
        super().__init__()

        self.cfg = cfg
        self.num_layers = cfg.num_layers

        # Layer ids are 1-based, matching the original construction.
        self.layers = torch.nn.ModuleList(
            NoTPTransformerBlock(cfg, layer_id)
            for layer_id in range(1, self.num_layers + 1)
        )

    def forward(
        self,
        hidden_states,
    ):
        for layer in self.layers:
            hidden_states = layer(hidden_states)
        return hidden_states
436
+
437
+
438
+ # from megatron.core.tensor_parallel.layers import non_tensor_paralleled, local_dp_reduce, local_dp_scatter
439
+
440
class VitModel(nn.Module):
    """CLIP-style vision tower: patch/cls embeddings -> pre-LayerNorm ->
    transformer stack.

    Args:
        cfg: model hyper-parameters (hidden_size, image_size, patch_size,
            num_layers, ...; see ``vit_model_cfg``).
        freeze_embed: freeze the embedding parameters.
        freeze_pre_norm: freeze the pre-transformer LayerNorm parameters.

    Fix: the original called ``logger.info`` in the ``fp32norm`` branch, but
    no ``logger`` is defined or imported anywhere in this module, so setting
    ``cfg.fp32norm`` raised NameError. The call is removed; the branch
    behavior is otherwise identical.
    """

    def __init__(
        self,
        cfg,
        freeze_embed=False,
        freeze_pre_norm=False
    ) -> None:
        super().__init__()

        self.embeddings = CLIPVisionEmbeddings(
            hidden_size=cfg.hidden_size, image_size=cfg.image_size, patch_size=cfg.patch_size
        )

        if freeze_embed:
            for param in self.embeddings.parameters():
                param.requires_grad = False

        self.transformer = NoTPTransformer(cfg=cfg)

        # Optionally run the pre-norm in fp32 for numerical stability.
        # NOTE: attribute name 'pre_layrnorm' (sic) is kept — checkpoint keys
        # depend on it.
        norm_cls = LayerNormfp32 if cfg.get("fp32norm", False) else torch.nn.LayerNorm
        self.pre_layrnorm = norm_cls(
            cfg.hidden_size,
            eps=cfg.get("pre_layernorm_epsilon", 1e-5),
        )

        if freeze_pre_norm:
            for param in self.pre_layrnorm.parameters():
                param.requires_grad = False

        # Tag every parameter for the micro data-parallel scheme.
        for p in self.parameters():
            p.micro_dp = True

    def set_input_tensor(self, input_tensor):
        # NOTE(review): NoTPTransformer does not define set_input_tensor, so
        # this would raise AttributeError if invoked. Kept for pipeline-API
        # compatibility — confirm whether any caller still uses it.
        if not isinstance(input_tensor, list):
            input_tensor = [input_tensor]
        self.transformer.set_input_tensor(input_tensor[0])

    def __str__(self) -> str:
        return "open_clip"

    def forward(
        self,
        x,
        patch_embeds
    ):
        """Embed pixels (or precomputed patches), normalize, run transformer."""
        embedded = self.embeddings(x, patch_embeds)
        hidden_states = self.pre_layrnorm(embedded)
        return self.transformer(hidden_states)
506
+
507
+
508
# Hyper-parameters for the CLIP-L/14 vision tower (224 px, 24 layers, d=1024).
vit_model_cfg = adict(
    num_layers=24,                  # transformer depth
    hidden_size=1024,
    num_heads = 16,
    num_attention_heads=16,
    ffn_hidden_size=4096,
    seq_length=256,                 # (224 / 14) ** 2 patch tokens
    max_position_embeddings=256,
    use_flash_attn=False,
    understand_projector_stride=2,
    hidden_dropout = 0.0,
    attention_dropout = 0.0,
    no_persist_layer_norm = False,
    layernorm_epsilon = 1e-5,
    pre_layernorm_epsilon = 1e-5,
    image_size = 224,
    patch_size = 14,
    recompute_list = []             # layers to activation-checkpoint (unused here)
)
527
+
528
def build_clip_l():
    """Construct the CLIP-L/14 vision tower with nothing frozen."""
    return VitModel(
        cfg=vit_model_cfg,
        freeze_embed=False,
        freeze_pre_norm=False,
    )
534
+
535
+
536
+
537
+
538
+
539
+ #=========================Sam-Vary=================================
540
+
541
+
542
def get_abs_pos_sam(abs_pos, tgt_size):
    """Resize a SAM grid position embedding to a new spatial side length.

    Args:
        abs_pos: position embeddings of shape (1, S, S, C).
        tgt_size: target side length.

    Returns:
        Tensor of shape (1, tgt_size, tgt_size, C); the input unchanged when
        the size already matches. Interpolation is bicubic + antialiased,
        computed in float32 and cast back to the input dtype.
    """
    src_size = abs_pos.size(1)
    if src_size == tgt_size:
        return abs_pos

    orig_dtype = abs_pos.dtype
    # (1, S, S, C) -> (1, C, S, S) for 2-D interpolation.
    grid = abs_pos.permute(0, 3, 1, 2).to(torch.float32)
    resized = F.interpolate(
        grid,
        size=(tgt_size, tgt_size),
        mode='bicubic',
        antialias=True,
        align_corners=False,
    ).to(orig_dtype)
    return resized.permute(0, 2, 3, 1)
562
+
563
+
564
+
565
+
566
class MLPBlock(nn.Module):
    """Simple two-layer MLP used inside SAM transformer blocks."""

    def __init__(
        self,
        embedding_dim: int,
        mlp_dim: int,
        act: Type[nn.Module] = nn.GELU,
    ) -> None:
        super().__init__()
        self.lin1 = nn.Linear(embedding_dim, mlp_dim)
        self.lin2 = nn.Linear(mlp_dim, embedding_dim)
        self.act = act()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = self.act(self.lin1(x))
        return self.lin2(hidden)
580
+
581
+
582
+ # From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa
583
+ # Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa
584
class LayerNorm2d(nn.Module):
    """LayerNorm over the channel dimension of NCHW tensors.

    Normalizes each spatial position across channels (dim 1), with learnable
    per-channel scale and shift.
    """

    def __init__(self, num_channels: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(num_channels))
        self.bias = nn.Parameter(torch.zeros(num_channels))
        self.eps = eps

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        mean = x.mean(1, keepdim=True)
        var = (x - mean).pow(2).mean(1, keepdim=True)
        normalized = (x - mean) / torch.sqrt(var + self.eps)
        return self.weight[:, None, None] * normalized + self.bias[:, None, None]
597
+
598
+
599
+ # This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
600
# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa
class ImageEncoderViT(nn.Module):
    def __init__(
        self,
        img_size: int = 1024,
        patch_size: int = 16,
        in_chans: int = 3,
        embed_dim: int = 768,
        depth: int = 12,
        num_heads: int = 12,
        mlp_ratio: float = 4.0,
        out_chans: int = 256,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_abs_pos: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        global_attn_indexes: Tuple[int, ...] = (),
    ) -> None:
        """
        SAM image encoder (ViTDet-style) with an extra downsampling tail.

        Args:
            img_size (int): Input image size.
            patch_size (int): Patch size.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
            depth (int): Depth of ViT.
            num_heads (int): Number of attention heads in each ViT block.
            mlp_ratio (float): Ratio of mlp hidden dim to embedding dim.
            out_chans (int): Channel count produced by the neck.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            norm_layer (nn.Module): Normalization layer.
            act_layer (nn.Module): Activation layer.
            use_abs_pos (bool): If True, use absolute positional embeddings.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            window_size (int): Window size for window attention blocks.
            global_attn_indexes (list): Indexes for blocks using global attention.
        """
        super().__init__()
        self.img_size = img_size

        self.patch_embed = PatchEmbed(
            kernel_size=(patch_size, patch_size),
            stride=(patch_size, patch_size),
            in_chans=in_chans,
            embed_dim=embed_dim,
        )

        self.pos_embed: Optional[nn.Parameter] = None
        if use_abs_pos:
            # Initialize absolute positional embedding with pretrain image size.
            self.pos_embed = nn.Parameter(
                torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim)
            )

        self.blocks = nn.ModuleList()
        for i in range(depth):
            # Blocks listed in global_attn_indexes attend globally
            # (window_size=0); all others use windowed attention.
            block = Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                norm_layer=norm_layer,
                act_layer=act_layer,
                use_rel_pos=use_rel_pos,
                rel_pos_zero_init=rel_pos_zero_init,
                window_size=window_size if i not in global_attn_indexes else 0,
                input_size=(img_size // patch_size, img_size // patch_size),
            )
            self.blocks.append(block)

        # 1x1 conv to out_chans, then a 3x3 conv, each followed by a
        # channel-wise LayerNorm (standard SAM neck).
        self.neck = nn.Sequential(
            nn.Conv2d(
                embed_dim,
                out_chans,
                kernel_size=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
            nn.Conv2d(
                out_chans,
                out_chans,
                kernel_size=3,
                padding=1,
                bias=False,
            ),
            LayerNorm2d(out_chans),
        )

        # Extra tail (not in vanilla SAM): two stride-2 convs taking the neck
        # output 256 -> 512 -> 1024 channels, i.e. a further 4x spatial
        # downsample producing 1024-channel features.
        self.net_2 = nn.Conv2d(256, 512, kernel_size=3, stride=2, padding=1, bias=False)
        self.net_3 = nn.Conv2d(512, 1024, kernel_size=3, stride=2, padding=1, bias=False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, C, H, W) pixels -> (B, H/ps, W/ps, embed_dim) patch tokens.
        x = self.patch_embed(x)
        if self.pos_embed is not None:
            # Position table is resized to the actual patch-grid side length.
            x = x + get_abs_pos_sam(self.pos_embed, x.size(1))

        for blk in self.blocks:
            x = blk(x)

        # Back to NCHW for the convolutional neck and downsampling tail.
        x = self.neck(x.permute(0, 3, 1, 2))
        x2 = self.net_2(x)
        x3 = self.net_3(x2.clone())

        return x3
706
+
707
+
708
class Block(nn.Module):
    """SAM/ViTDet transformer block: pre-norm attention (optionally windowed)
    and an MLP, each with a residual connection."""

    def __init__(
        self,
        dim: int,
        num_heads: int,
        mlp_ratio: float = 4.0,
        qkv_bias: bool = True,
        norm_layer: Type[nn.Module] = nn.LayerNorm,
        act_layer: Type[nn.Module] = nn.GELU,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        window_size: int = 0,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim: number of input channels.
            num_heads: attention heads in this block.
            mlp_ratio: ratio of MLP hidden dim to embedding dim.
            qkv_bias: add a learnable bias to query, key, value.
            norm_layer: normalization layer class.
            act_layer: activation layer class for the MLP.
            use_rel_pos: add relative positional embeddings to attention.
            rel_pos_zero_init: zero-initialize relative positional parameters.
            window_size: window side for windowed attention; 0 means global.
            input_size: input resolution used to size relative positions.
        """
        super().__init__()
        self.norm1 = norm_layer(dim)
        # Windowed blocks size their relative positions by the window, global
        # blocks by the full input grid.
        self.attn = Attention(
            dim,
            num_heads=num_heads,
            qkv_bias=qkv_bias,
            use_rel_pos=use_rel_pos,
            rel_pos_zero_init=rel_pos_zero_init,
            input_size=input_size if window_size == 0 else (window_size, window_size),
        )

        self.norm2 = norm_layer(dim)
        self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer)

        self.window_size = window_size

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        shortcut = x
        x = self.norm1(x)

        use_windows = self.window_size > 0
        if use_windows:
            # Remember the un-padded size so we can crop back afterwards.
            H, W = x.shape[1], x.shape[2]
            x, pad_hw = window_partition(x, self.window_size)

        x = self.attn(x)

        if use_windows:
            x = window_unpartition(x, self.window_size, pad_hw, (H, W))

        x = shortcut + x
        return x + self.mlp(self.norm2(x))
772
+
773
+
774
class Attention(nn.Module):
    """Multi-head Attention block with relative position embeddings."""

    def __init__(
        self,
        dim: int,
        num_heads: int = 8,
        qkv_bias: bool = True,
        use_rel_pos: bool = False,
        rel_pos_zero_init: bool = True,
        input_size: Optional[Tuple[int, int]] = None,
    ) -> None:
        """
        Args:
            dim (int): Number of input channels.
            num_heads (int): Number of attention heads.
            qkv_bias (bool): If True, add a learnable bias to query, key, value.
            use_rel_pos (bool): If True, add relative positional embeddings to the attention map.
            rel_pos_zero_init (bool): If True, zero initialize relative positional parameters.
            input_size (tuple(int, int) or None): Input resolution for calculating the relative
                positional parameter size.
        """
        super().__init__()
        self.num_heads = num_heads
        head_dim = dim // num_heads
        # Kept for reference; SDPA applies its own 1/sqrt(d) scaling internally.
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.proj = nn.Linear(dim, dim)

        self.use_rel_pos = use_rel_pos
        if self.use_rel_pos:
            assert (
                input_size is not None
            ), "Input size must be provided if using relative positional encoding."
            # initialize relative positional embeddings (one row per possible
            # relative offset along each axis: 2*size - 1 entries)
            self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim))
            self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (B, H, W, C) spatial tokens.
        B, H, W, _ = x.shape
        # qkv with shape (3, B, nHead, H * W, C)
        qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
        # q, k, v with shape (B * nHead, H * W, C)
        q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0)

        rel_h, rel_w = None, None
        if self.use_rel_pos:
            # Decomposed relative position terms along H and W axes.
            rel_h, rel_w = add_decomposed_rel_pos(q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W))

        # Reshape to (B, nHead, H*W, head_dim) for SDPA.
        q = q.view(B, self.num_heads, H * W, -1)
        k = k.view(B, self.num_heads, H * W, -1)
        v = v.view(B, self.num_heads, H * W, -1)

        if self.use_rel_pos:
            # rel_h: (B, nHead, H*W, k_h, 1), rel_w: (B, nHead, H*W, 1, k_w);
            # broadcasting their sum and flattening the last two dims yields a
            # full (H*W, k_h*k_w) additive attention bias per head.
            rel_h = rel_h.view(B, self.num_heads, rel_h.size(1), rel_h.size(2), rel_h.size(3))
            rel_w = rel_w.view(B, self.num_heads, rel_w.size(1), rel_w.size(2), rel_w.size(3))
            attn_bias = (rel_h + rel_w).view(B, self.num_heads, rel_h.size(2), rel_h.size(3) * rel_w.size(4))
            x = torch.nn.functional.scaled_dot_product_attention(q, k, v, attn_mask=attn_bias)
        else:
            x = torch.nn.functional.scaled_dot_product_attention(q, k, v)

        # (B, nHead, H*W, head_dim) -> (B, H, W, C)
        x = x.view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1)

        x = self.proj(x)

        return x
842
+
843
+
844
def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]:
    """
    Partition into non-overlapping windows with padding if needed.
    Args:
        x (tensor): input tokens with [B, H, W, C].
        window_size (int): window size.

    Returns:
        windows: windows after partition with [B * num_windows, window_size, window_size, C].
        (Hp, Wp): padded height and width before partition
    """
    B, H, W, C = x.shape

    # Pad bottom/right so both spatial dims divide evenly into windows.
    pad_h = (window_size - H % window_size) % window_size
    pad_w = (window_size - W % window_size) % window_size
    if pad_h > 0 or pad_w > 0:
        x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h))
    Hp, Wp = H + pad_h, W + pad_w

    grid = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C)
    windows = (
        grid.permute(0, 1, 3, 2, 4, 5)
        .contiguous()
        .view(-1, window_size, window_size, C)
    )
    return windows, (Hp, Wp)
866
+
867
+
868
def window_unpartition(
    windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int]
) -> torch.Tensor:
    """
    Window unpartition into original sequences and removing padding.
    Args:
        windows (tensor): input tokens with [B * num_windows, window_size, window_size, C].
        window_size (int): window size.
        pad_hw (Tuple): padded height and width (Hp, Wp).
        hw (Tuple): original height and width (H, W) before padding.

    Returns:
        x: unpartitioned sequences with [B, H, W, C].
    """
    padded_h, padded_w = pad_hw
    out_h, out_w = hw
    windows_per_image = padded_h * padded_w // window_size // window_size
    batch = windows.shape[0] // windows_per_image

    x = windows.view(
        batch, padded_h // window_size, padded_w // window_size, window_size, window_size, -1
    )
    x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(batch, padded_h, padded_w, -1)

    # Crop away any padding added by window_partition.
    if padded_h > out_h or padded_w > out_w:
        x = x[:, :out_h, :out_w, :].contiguous()
    return x
891
+
892
+
893
def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor:
    """
    Get relative positional embeddings according to the relative positions of
    query and key sizes.
    Args:
        q_size (int): size of query q.
        k_size (int): size of key k.
        rel_pos (Tensor): relative position embeddings (L, C).

    Returns:
        Tensor of shape (q_size, k_size, C): the embedding for each (query,
        key) relative offset.
    """
    max_rel_dist = int(2 * max(q_size, k_size) - 1)

    if rel_pos.shape[0] == max_rel_dist:
        resized = rel_pos
    else:
        # Linearly interpolate the table to the required number of offsets.
        orig_dtype = rel_pos.dtype
        table = rel_pos.to(torch.float32)
        resized = F.interpolate(
            table.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1),
            size=max_rel_dist,
            mode="linear",
        ).to(orig_dtype)
        resized = resized.reshape(-1, max_rel_dist).permute(1, 0)

    # Scale the coords with short length if shapes for q and k are different.
    q_coords = torch.arange(q_size, device=rel_pos.device)[:, None] * max(k_size / q_size, 1.0)
    k_coords = torch.arange(k_size, device=rel_pos.device)[None, :] * max(q_size / k_size, 1.0)
    relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0)

    return resized[relative_coords.long()]
926
+
927
+
928
def add_decomposed_rel_pos(
    q: torch.Tensor,
    rel_pos_h: torch.Tensor,
    rel_pos_w: torch.Tensor,
    q_size: Tuple[int, int],
    k_size: Tuple[int, int],
) -> torch.Tensor:
    """
    Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`.
    https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950
    Args:
        q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C).
        rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis.
        rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis.
        q_size (Tuple): spatial sequence size of query q with (q_h, q_w).
        k_size (Tuple): spatial sequence size of key k with (k_h, k_w).

    Returns:
        (rel_h, rel_w): per-axis attention-bias terms of shapes
        (B, q_h*q_w, k_h, 1) and (B, q_h*q_w, 1, k_w); their broadcast sum is
        the full relative-position bias.
    """
    q_h, q_w = q_size
    k_h, k_w = k_size
    # Per-offset embeddings for each axis.
    rel_embed_h = get_rel_pos(q_h, k_h, rel_pos_h)
    rel_embed_w = get_rel_pos(q_w, k_w, rel_pos_w)

    B, _, dim = q.shape
    q_grid = q.reshape(B, q_h, q_w, dim)

    # Project queries onto the axis embeddings.
    rel_h = torch.einsum("bhwc,hkc->bhwk", q_grid, rel_embed_h)
    rel_w = torch.einsum("bhwc,wkc->bhwk", q_grid, rel_embed_w)

    rel_h = rel_h.unsqueeze(-1).reshape(B, q_h * q_w, k_h, 1)
    rel_w = rel_w.unsqueeze(-2).reshape(B, q_h * q_w, 1, k_w)

    return rel_h, rel_w
963
+
964
+
965
class PatchEmbed(nn.Module):
    """
    Image to Patch Embedding via a strided convolution, producing BHWC output.
    """

    def __init__(
        self,
        kernel_size: Tuple[int, int] = (16, 16),
        stride: Tuple[int, int] = (16, 16),
        padding: Tuple[int, int] = (0, 0),
        in_chans: int = 3,
        embed_dim: int = 768,
    ) -> None:
        """
        Args:
            kernel_size (Tuple): kernel size of the projection layer.
            stride (Tuple): stride of the projection layer.
            padding (Tuple): padding size of the projection layer.
            in_chans (int): Number of input image channels.
            embed_dim (int): Patch embedding dimension.
        """
        super().__init__()
        self.proj = nn.Conv2d(
            in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Project, then B C H W -> B H W C.
        return self.proj(x).permute(0, 2, 3, 1)
997
+
998
+
999
def build_sam_vit_b(checkpoint=None):
    """Build the SAM ViT-B image encoder, optionally loading ``checkpoint``."""
    vit_b_spec = dict(
        encoder_embed_dim=768,
        encoder_depth=12,
        encoder_num_heads=12,
        encoder_global_attn_indexes=[2, 5, 8, 11],
    )
    return _build_sam(checkpoint=checkpoint, **vit_b_spec)
1007
+
1008
def build_sam_fast_vit_b(checkpoint=None, compile_mode='max-autotune', dtype=torch.bfloat16):
    """Build the SAM ViT-B encoder in eval mode, cast to ``dtype`` and
    compiled with ``torch.compile`` for faster inference."""
    encoder = build_sam_vit_b(checkpoint).eval().to(dtype)
    return torch.compile(encoder, mode=compile_mode)
1013
+
1014
+
1015
def _build_sam(
    encoder_embed_dim,
    encoder_depth,
    encoder_num_heads,
    encoder_global_attn_indexes,
    checkpoint=None,
):
    """Assemble a SAM image encoder and optionally load pretrained weights.

    Args:
        encoder_embed_dim: ViT embedding dimension.
        encoder_depth: number of transformer blocks.
        encoder_num_heads: attention heads per block.
        encoder_global_attn_indexes: block indexes that use global (non-windowed)
            attention.
        checkpoint: optional path to a torch checkpoint; only keys containing
            'vision_tower_high' are loaded (with their prefix stripped).

    Returns:
        An ``ImageEncoderViT`` in eval mode.
    """
    prompt_embed_dim = 256
    image_size = 1024
    vit_patch_size = 16
    image_embedding_size = image_size // vit_patch_size  # 64; kept for reference
    image_encoder = ImageEncoderViT(
        depth=encoder_depth,
        embed_dim=encoder_embed_dim,
        img_size=image_size,
        mlp_ratio=4,
        norm_layer=partial(torch.nn.LayerNorm, eps=1e-6),
        num_heads=encoder_num_heads,
        patch_size=vit_patch_size,
        qkv_bias=True,
        use_rel_pos=True,
        global_attn_indexes=encoder_global_attn_indexes,
        window_size=14,
        out_chans=prompt_embed_dim,
    )
    image_encoder.eval()
    if checkpoint is not None:
        # map_location="cpu" so loading works on CUDA-less hosts even when the
        # checkpoint was saved from GPU tensors (the original torch.load call
        # had no map_location and would fail in that case).
        state_dict = torch.load(checkpoint, map_location="cpu")
        # Keep only the high-resolution vision tower weights and strip the
        # 30-character key prefix (presumably the "...vision_tower_high."
        # namespace — confirm against the expected checkpoint format).
        image_encoder.load_state_dict(
            {k[30:]: v for k, v in state_dict.items() if 'vision_tower_high' in k},
            strict=True,
        )
        print(checkpoint)
    return image_encoder
model.py ADDED
@@ -0,0 +1,950 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ from typing import Optional, List, Union, Tuple
5
+ from transformers import Qwen2VLTextModel, Qwen2VLTextConfig, Qwen2VLPreTrainedModel, PretrainedConfig
6
+ from transformers.models.llama.modeling_llama import LlamaAttention, LlamaRotaryEmbedding
7
+ from transformers.generation.utils import GenerationMixin
8
+ from transformers.modeling_utils import PreTrainedModel
9
+ from transformers.modeling_outputs import ModelOutput
10
+ from PIL import Image, ImageOps
11
+ from encoder import build_sam_vit_b, build_clip_l, MlpProjector
12
+ from addict import Dict as ADict
13
+ import os
14
+ import math
15
+ from data import (
16
+ format_messages,
17
+ load_pil_images,
18
+ text_encode,
19
+ BasicImageTransform,
20
+ dynamic_preprocess,
21
+ re_match,
22
+ process_image_with_refs,
23
+ NoEOSTextStreamer,
24
+ )
25
+ from tqdm import tqdm
26
+ from dataclasses import dataclass
27
+
28
+
29
class DeepQwenVLConfig(PretrainedConfig):
    """
    Configuration class for DeepQwenVL model.

    This config wraps both the Qwen2VL text config and DeepSeek vision config.
    When loading from a Qwen2-VL checkpoint, it will use the checkpoint's config
    directly for the text model.
    """
    model_type = "deepqwen_vl"

    def __init__(
        self,
        # Width of the concatenated DeepSeek vision features fed to the projector.
        deepseek_vision_hidden_size: int = 2048,

        # Projector settings
        projector_type: str = "mlp",  # "vision_projector" or "mlp"
        projector_input_dim: int = 2048,
        projector_output_dim: Optional[int] = None,  # If None, uses hidden_size
        projector_hidden_dim: Optional[int] = None,  # If None, uses projector_output_dim

        # Learnable vision tokens
        image_newline_dim: Optional[int] = None,  # If None, uses hidden_size
        view_separator_dim: Optional[int] = None,  # If None, uses hidden_size

        # Text-model hyperparameters (defaults match a Qwen2-1.5B-sized model).
        hidden_size: int = 1536,
        intermediate_size: int = 8960,
        num_hidden_layers: int = 28,
        num_attention_heads: int = 12,
        num_key_value_heads: int = 2,
        hidden_act: str = "silu",
        max_position_embeddings: int = 32768,
        initializer_range: float = 0.02,
        rms_norm_eps: float = 1e-6,
        use_cache: bool = True,
        tie_word_embeddings: bool = True,
        rope_theta: float = 1000000.0,
        attention_dropout: float = 0.0,
        vocab_size: int = 151936,

        # Qwen2-VL special-token ids.
        bos_token_id: int = 151643,
        eos_token_id: int = 151645,
        pad_token_id: int = 151643,
        image_token_id: int = 151655,
        video_token_id: int = 151656,
        vision_start_token_id: int = 151652,
        vision_end_token_id: int = 151653,
        vision_token_id: int = 151654,

        rope_scaling: Optional[dict] = None,  # defaults to mrope below

        **kwargs
    ):
        super().__init__(
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            pad_token_id=pad_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs
        )

        self.deepseek_vision_hidden_size = deepseek_vision_hidden_size

        # Projector settings
        self.projector_type = projector_type
        self.projector_input_dim = projector_input_dim
        self.projector_output_dim = projector_output_dim if projector_output_dim else hidden_size
        self.projector_hidden_dim = projector_hidden_dim if projector_hidden_dim else self.projector_output_dim

        # Learnable vision tokens
        self.image_newline_dim = image_newline_dim if image_newline_dim else hidden_size
        self.view_separator_dim = view_separator_dim if view_separator_dim else hidden_size

        # Text model settings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.max_position_embeddings = max_position_embeddings
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.attention_dropout = attention_dropout
        self.vocab_size = vocab_size

        # Special tokens
        self.image_token_id = image_token_id
        self.video_token_id = video_token_id
        self.vision_start_token_id = vision_start_token_id
        self.vision_end_token_id = vision_end_token_id
        self.vision_token_id = vision_token_id

        # Rope scaling: default to Qwen2-VL multimodal RoPE sections.
        if rope_scaling is None:
            rope_scaling = {"type": "mrope", "mrope_section": [16, 24, 24]}
        self.rope_scaling = rope_scaling

    def to_text_config(self) -> Qwen2VLTextConfig:
        """Convert to Qwen2VLTextConfig for the text model."""
        return Qwen2VLTextConfig(
            hidden_size=self.hidden_size,
            intermediate_size=self.intermediate_size,
            num_hidden_layers=self.num_hidden_layers,
            num_attention_heads=self.num_attention_heads,
            num_key_value_heads=self.num_key_value_heads,
            hidden_act=self.hidden_act,
            max_position_embeddings=self.max_position_embeddings,
            initializer_range=self.initializer_range,
            rms_norm_eps=self.rms_norm_eps,
            use_cache=self.use_cache,
            tie_word_embeddings=self.tie_word_embeddings,
            rope_theta=self.rope_theta,
            attention_dropout=self.attention_dropout,
            vocab_size=self.vocab_size,
            bos_token_id=self.bos_token_id,
            eos_token_id=self.eos_token_id,
            pad_token_id=self.pad_token_id,
            rope_scaling=self.rope_scaling,
        )
150
+
151
+
152
@dataclass
class DeepQwenOutputWithPast(ModelOutput):
    """Base-model output: final hidden states plus optional cache/intermediates."""
    last_hidden_state: Optional[torch.FloatTensor] = None
    past_key_values: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
158
+
159
@dataclass
class DeepQwenCausalLMOutputWithPast(ModelOutput):
    """Causal-LM output: optional loss, logits, and optional cache/intermediates."""
    loss: Optional[torch.FloatTensor] = None
    logits: Optional[torch.FloatTensor] = None
    past_key_values: Optional[list[torch.FloatTensor]] = None
    hidden_states: Optional[tuple[torch.FloatTensor]] = None
    attentions: Optional[tuple[torch.FloatTensor]] = None
166
+
167
+
168
class VisionProjector(nn.Module):
    """
    Vision projector with DeepSeek's pretrained layer + trainable adapter.

    Architecture:
        deepseek_proj: Linear(2048→1280) [FROZEN - loaded from DeepSeek checkpoint]
        SiLU activation
        norm: LayerNorm(1280)            [TRAINABLE]
        adapter: Linear(1280→1536)       [TRAINABLE]

    Keeps DeepSeek's learned vision-text alignment intact while adapting the
    features to Qwen's embedding space — two layers, like LLaVA's MLP projector.
    """

    def __init__(self, input_dim: int = 2048, hidden_dim: int = 1280, output_dim: int = 1536):
        super().__init__()
        # Pretrained DeepSeek projection; its weights are expected to be
        # replaced from a checkpoint (and frozen, per the class docstring).
        self.deepseek_proj = nn.Linear(input_dim, hidden_dim)
        # Trainable adapter into the language model's embedding space.
        self.norm = nn.LayerNorm(hidden_dim)
        self.adapter = nn.Linear(hidden_dim, output_dim)
        self._init_adapter_weights()

    def _init_adapter_weights(self):
        """Initialize only the adapter; deepseek_proj is loaded from a checkpoint."""
        nn.init.ones_(self.norm.weight)
        nn.init.zeros_(self.norm.bias)
        nn.init.normal_(self.adapter.weight, mean=0.0, std=0.01)
        nn.init.zeros_(self.adapter.bias)

    def forward(self, x):
        """Project DeepSeek vision features into the text embedding space."""
        hidden = F.silu(self.deepseek_proj(x))
        return self.adapter(self.norm(hidden))
204
+
205
class DeepQwenVLPreTrainedModel(PreTrainedModel):
    """Hooks DeepQwenVL into Hugging Face's PreTrainedModel machinery."""
    config_class = DeepQwenVLConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn = True
    _supports_sdpa = True
    _supports_static_cache = True
    _supports_attention_backend = True

    # Vision components come from separate checkpoints, so their absence from
    # a text-only checkpoint should not be reported as missing keys.
    _keys_to_ignore_on_load_missing = [
        "sam_model",
        "vision_model",
        "projector",
        "image_newline",
        "view_separator",
    ]

    def _init_weights(self, module):
        """Initialize Linear/Embedding weights from N(0, std); zero Linear biases."""
        std = getattr(self.config, 'initializer_range', 0.02)
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
232
+
233
+
234
class DeepQwenVLModel(Qwen2VLTextModel):
    """
    DeepQwenVL Model that combines DeepSeek's vision encoders with Qwen2VL's text model.

    Accepts either:
    - A DeepQwenVLConfig
    - A Qwen2VLTextConfig (for compatibility with from_pretrained from Qwen checkpoints)
    - A generic PretrainedConfig (will extract necessary fields)
    """
    config_class = DeepQwenVLConfig

    def __init__(self, config):
        # Derive the text config and dimensions from whichever config type we got.
        if isinstance(config, DeepQwenVLConfig):
            text_config = config.to_text_config()
            output_hidden_size = config.projector_output_dim
            vision_dim = config.deepseek_vision_hidden_size
        elif isinstance(config, Qwen2VLTextConfig):
            text_config = config
            output_hidden_size = config.hidden_size
            vision_dim = 2048
        else:
            text_config = config
            output_hidden_size = getattr(config, 'hidden_size', 1536)
            vision_dim = getattr(config, 'deepseek_vision_hidden_size', 2048)

        super(DeepQwenVLModel, self).__init__(text_config)

        self.config = config
        self.output_hidden_size = output_hidden_size

        # DeepSeek's two vision towers: SAM ViT-B (high-res) and CLIP-L.
        self.sam_model = build_sam_vit_b()
        self.vision_model = build_clip_l()

        self.deepseek_vision_dim = vision_dim
        self.deepseek_hidden_dim = 1280  # DeepSeek's projector output dimension
        # New projector: DeepSeek layer (frozen) + adapter (trainable)
        self.projector = VisionProjector(
            input_dim=self.deepseek_vision_dim,   # 2048
            hidden_dim=self.deepseek_hidden_dim,  # 1280 (DeepSeek's output)
            output_dim=output_hidden_size         # 1536 (Qwen's hidden size)
        )

        # Learnable separator embeddings, scaled by 1/sqrt(dim) like the
        # surrounding token embeddings.
        embed_std = 1 / torch.sqrt(torch.tensor(output_hidden_size, dtype=torch.float32))
        self.image_newline = nn.Parameter(torch.randn(output_hidden_size) * embed_std)
        self.view_separator = nn.Parameter(torch.randn(output_hidden_size) * embed_std)

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        images_seq_mask: Optional[torch.FloatTensor] = None,
        images_spatial_crop: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[torch.Tensor, List[torch.Tensor]]:
        """
        Encode images with SAM+CLIP, project them, splice the resulting vision
        embeddings into the token embeddings at the positions marked by
        ``images_seq_mask``, then run the Qwen2VL text model.

        ``images`` is expected to be a sequence of (patches, global_image)
        pairs, one per batch element, aligned with ``images_spatial_crop``
        (width/height crop counts) — see how ``infer`` builds these inputs.
        """
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        sam_model = getattr(self, 'sam_model', None)
        vision_model = getattr(self, 'vision_model', None)

        # Only run the vision path on the prefill step (seq len > 1) or in
        # training, and only when the global image tensor is non-zero
        # (all-zeros is the "no image" placeholder).
        should_process_images = (
            sam_model is not None
            and images is not None
            and images_seq_mask is not None
            and (input_ids.shape[1] != 1 or self.training)
            and torch.sum(images[0][1]).item() != 0
        )

        if should_process_images:
            idx = 0
            for image, crop_shape in zip(images, images_spatial_crop):
                images_in_this_batch = []
                patches = image[0]     # local crop tiles
                image_ori = image[1]   # padded global view

                if torch.sum(patches).item() != 0:
                    # Process local patches
                    # Encoders run without grad; only the projector is trained.
                    with torch.no_grad():
                        local_features_1 = sam_model(patches)
                        local_features_2 = vision_model(patches, local_features_1)
                        # Concatenate CLIP tokens (minus CLS) with flattened SAM features.
                        local_features = torch.cat((local_features_2[:, 1:], local_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
                    local_features = local_features.detach()
                    local_features = self.projector(local_features)

                    # Process global image
                    with torch.no_grad():
                        global_features_1 = sam_model(image_ori)
                        global_features_2 = vision_model(image_ori, global_features_1)
                        global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
                    global_features = global_features.detach()
                    global_features = self.projector(global_features)

                    # Reshape to 2-D token grids and append a learned newline
                    # token at the end of each row (assumes a square grid).
                    _, hw, n_dim = global_features.shape
                    h = w = int(hw ** 0.5)
                    _2, hw2, n_dim2 = local_features.shape
                    h2 = w2 = int(hw2 ** 0.5)
                    width_crop_num, height_crop_num = crop_shape[0], crop_shape[1]

                    global_features = global_features.view(h, w, n_dim)
                    global_features = torch.cat(
                        [global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
                    )
                    global_features = global_features.view(-1, n_dim)

                    # Stitch the per-crop grids into one large grid before
                    # adding row newlines.
                    local_features = local_features.view(
                        height_crop_num, width_crop_num, h2, w2, n_dim2
                    ).permute(0, 2, 1, 3, 4).reshape(height_crop_num*h2, width_crop_num*w2, n_dim2)
                    local_features = torch.cat(
                        [local_features, self.image_newline[None, None, :].expand(height_crop_num * h2, 1, n_dim2)], dim=1
                    )
                    local_features = local_features.view(-1, n_dim2)

                    # Order: local view, global view, then a view separator token.
                    global_local_features = torch.cat([local_features, global_features, self.view_separator[None, :]], dim=0)
                    images_in_this_batch.append(global_local_features)
                else:
                    # Global-only branch (small images)
                    with torch.no_grad():
                        global_features_1 = sam_model(image_ori)
                        global_features_2 = vision_model(image_ori, global_features_1)
                        global_features = torch.cat((global_features_2[:, 1:], global_features_1.flatten(2).permute(0, 2, 1)), dim=-1)
                    global_features = global_features.detach()
                    global_features = self.projector(global_features)

                    _, hw, n_dim = global_features.shape
                    h = w = int(hw ** 0.5)
                    global_features = global_features.view(h, w, n_dim)
                    global_features = torch.cat(
                        [global_features, self.image_newline[None, None, :].expand(h, 1, n_dim)], dim=1
                    )
                    global_features = global_features.view(-1, n_dim)
                    global_local_features = torch.cat([global_features, self.view_separator[None, :]], dim=0)
                    images_in_this_batch.append(global_local_features)

                if images_in_this_batch:
                    images_in_this_batch = torch.cat(images_in_this_batch, dim=0)
                    # In-place scatter of vision embeddings into the masked
                    # token positions.
                    # NOTE(review): the mask is moved with a hard-coded
                    # .cuda() — this breaks CPU / non-default-GPU execution;
                    # consider inputs_embeds.device instead.
                    inputs_embeds[idx].masked_scatter_(images_seq_mask[idx].unsqueeze(-1).cuda(), images_in_this_batch)
                idx += 1

        # Run the text model on the (possibly image-augmented) embeddings.
        outputs = super().forward(
            input_ids=None, attention_mask=attention_mask, past_key_values=past_key_values,
            inputs_embeds=inputs_embeds, use_cache=use_cache, position_ids=position_ids,
            output_attentions=output_attentions, output_hidden_states=output_hidden_states,
            return_dict=return_dict, cache_position=cache_position
        )

        return DeepQwenOutputWithPast(
            last_hidden_state=outputs.last_hidden_state,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        ) if return_dict else outputs.to_tuple()
395
+
396
+
397
class DeepQwenVLForCausalLM(DeepQwenVLModel, GenerationMixin):
    """
    DeepQwenVL Model for causal language modeling with vision capabilities.

    Combines DeepSeek's vision encoders (SAM + CLIP) with Qwen2VL's text model.
    """
    config_class = DeepQwenVLConfig
    # lm_head may be tied to the input embeddings by the HF tying machinery.
    _tied_weights_keys = ["lm_head.weight"]

    # Intentionally empty here (unlike the base class): missing vision keys
    # ARE reported when loading this head. Entries kept for reference.
    _keys_to_ignore_on_load_missing = [
        # "sam_model",
        # "vision_model",
        # "projector",
        # "image_newline",
        # "view_separator",
    ]
413
+
414
    def __init__(self, config):
        """
        Initialize the model.

        Args:
            config: Can be DeepQwenVLConfig, Qwen2VLTextConfig, or a generic config
                from a Qwen2-VL checkpoint.
        """
        super().__init__(config)

        # Fall back to Qwen2-1.5B-style defaults when the config lacks fields.
        hidden_size = getattr(config, 'hidden_size', 1536)
        vocab_size = getattr(config, 'vocab_size', 151936)

        # LM head; listed in _tied_weights_keys, so post_init() may tie its
        # weight to the input embeddings depending on tie_word_embeddings.
        self.lm_head = nn.Linear(hidden_size, vocab_size, bias=False)

        self.post_init()
430
+
431
+ def get_output_embeddings(self):
432
+ return getattr(self, 'lm_head', None)
433
+
434
    def set_output_embeddings(self, new_embeddings):
        # Replace the LM head (used by HF resize/tie-embedding utilities).
        self.lm_head = new_embeddings
436
+
437
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        labels: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        images_seq_mask: Optional[torch.FloatTensor] = None,
        images_spatial_crop: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
    ) -> Union[torch.Tensor, List[torch.Tensor]]:
        """
        Run the vision+text base model, apply the LM head, and optionally
        compute the causal-LM loss.

        Returns a DeepQwenCausalLMOutputWithPast regardless of ``return_dict``
        (the base model is always called with return_dict=True).
        """
        outputs = super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            position_ids = position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            images=images,
            images_seq_mask=images_seq_mask,
            images_spatial_crop=images_spatial_crop,
            return_dict=True,
            cache_position=cache_position,
        )

        hidden_states = outputs[0]
        logits = self.lm_head(hidden_states)
        # Upcast logits to fp32 for numerically stable loss/sampling.
        logits = logits.float()

        loss = None
        if labels is not None:
            # HF-provided causal-LM loss (handles the shift internally).
            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)

        return DeepQwenCausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
486
+
487
    def prepare_inputs_for_generation(
        self,
        input_ids,
        past_key_values=None,
        attention_mask=None,
        inputs_embeds=None,
        cache_position=None,
        position_ids=None,
        images=None,
        images_seq_mask=None,
        images_spatial_crop=None,
        **kwargs,
    ):
        """
        Extend the HF generation-input hook with the image tensors, and feed
        them only on the prefill step.
        """
        model_inputs = super().prepare_inputs_for_generation(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            cache_position=cache_position,
            position_ids=position_ids,
            **kwargs,
        )

        model_inputs["images"] = images
        model_inputs["images_seq_mask"] = images_seq_mask
        model_inputs["images_spatial_crop"] = images_spatial_crop
        # Force position_ids to None — presumably so the base model rebuilds
        # them itself each step; TODO confirm this is intended for mrope.
        model_inputs["position_ids"] = None

        # Clear images after first forward pass (cache_position[0] != 0 means subsequent tokens)
        if cache_position is not None and cache_position[0] != 0:
            model_inputs["images"] = None
            model_inputs["images_seq_mask"] = None
            model_inputs["images_spatial_crop"] = None

        return model_inputs
522
+
523
    def reinitialize_projector(self, vis_mlp=None, device=None, dtype=None):
        """
        Reinitialize the projector, image_newline, and view_separator.
        Call this after from_pretrained when loading from a Qwen checkpoint.

        Args:
            vis_mlp: if truthy, install a fresh VisionProjector; otherwise a
                single nn.Linear(input_dim -> output_dim) is used instead.
            device: target device; defaults to the first non-meta parameter's
                device, falling back to CPU.
            dtype: target dtype; defaults to bfloat16.
        """
        if device is None:
            # Pick the device of the first materialized (non-meta) parameter.
            for param in self.parameters():
                if param.device.type != 'meta':
                    device = param.device
                    break
            if device is None:
                device = 'cpu'
        if dtype is None:
            dtype = torch.bfloat16

        input_dim = self.deepseek_vision_dim
        output_dim = self.output_hidden_size

        if vis_mlp is not None:
            self.projector = VisionProjector(input_dim=input_dim, output_dim=output_dim).to(device=device, dtype=dtype)

        else:
            # Plain linear projector, initialized small.
            self.projector = nn.Linear(in_features=input_dim, out_features=output_dim).to(device=device, dtype=dtype)
            nn.init.normal_(self.projector.weight, mean=0.0, std=0.01)
            if self.projector.bias is not None:
                nn.init.zeros_(self.projector.bias)

        # Re-draw the learned separator tokens at embedding scale (1/sqrt(dim)).
        embed_std = 1 / torch.sqrt(torch.tensor(output_dim, dtype=torch.float32))
        self.image_newline = nn.Parameter(
            torch.randn(output_dim, device=device, dtype=dtype) * embed_std.item()
        )
        self.view_separator = nn.Parameter(
            torch.randn(output_dim, device=device, dtype=dtype) * embed_std.item()
        )

        print(f"Projector reinitialized on {device} with dtype {dtype}")
559
+
560
+ def load_pretrained_vision(self, pretrained_path: str):
561
+ try:
562
+ from safetensors import safe_open
563
+ except ImportError:
564
+ raise ImportError("Please install safetensors to load the pretrained vision model.")
565
+
566
+ assert os.path.exists(pretrained_path), f"Pretrained path {pretrained_path} does not exist."
567
+
568
+ vision_weights = {}
569
+ with safe_open(f"{pretrained_path}/model-00001-of-000001.safetensors", framework="pt", device="cpu") as f:
570
+ for k in f.keys():
571
+ vision_weights[k] = f.get_tensor(k)
572
+
573
+ prefixes = {
574
+ "sam_model": "model.sam_model.",
575
+ "vision_model": "model.vision_model.",
576
+ }
577
+
578
+ try:
579
+ for p in prefixes.keys():
580
+ state_dict = {}
581
+
582
+ for k, v in vision_weights.items():
583
+ if k.startswith(prefixes[p]):
584
+ new_key = k[len(prefixes[p]):]
585
+ state_dict[new_key] = v
586
+
587
+ getattr(self, p).load_state_dict(state_dict, strict=False)
588
+
589
+ print("Pretrained vision model loaded successfully.")
590
+ except Exception as e:
591
+ print("Error loading pretrained vision model:", e)
592
+ raise e
593
+
594
    def load_deepseek_projector(self, pretrained_path: str):
        """
        Load DeepSeek's projector weights into the deepseek_proj layer.

        DeepSeek checkpoint has:
        - projector.weight: shape (1280, 2048)
        - projector.bias: shape (1280,)

        These get loaded into self.projector.deepseek_proj. Keys may appear
        either as ``projector.*`` or ``model.projector.*``; if neither is
        found, a warning is printed and nothing is loaded.
        """
        try:
            from safetensors import safe_open
        except ImportError:
            raise ImportError("Please install safetensors to load DeepSeek projector.")

        assert os.path.exists(pretrained_path), f"Pretrained path {pretrained_path} does not exist."

        # Find safetensors file (only the first shard found is inspected).
        safetensor_files = [f for f in os.listdir(pretrained_path) if f.endswith('.safetensors')]
        if not safetensor_files:
            raise FileNotFoundError(f"No safetensors files found in {pretrained_path}")

        safetensor_path = os.path.join(pretrained_path, safetensor_files[0])

        projector_weights = {}
        with safe_open(safetensor_path, framework="pt", device="cpu") as f:
            for k in f.keys():
                if 'projector' in k:
                    projector_weights[k] = f.get_tensor(k)

        # Load into deepseek_proj
        if 'projector.weight' in projector_weights:
            self.projector.deepseek_proj.weight.data = projector_weights['projector.weight']
            self.projector.deepseek_proj.bias.data = projector_weights['projector.bias']
            print(f"Loaded DeepSeek projector weights: {self.projector.deepseek_proj.weight.shape}")
            print(f"  Weight mean: {self.projector.deepseek_proj.weight.mean().item():.6f}")
            print(f"  Weight std: {self.projector.deepseek_proj.weight.std().item():.6f}")
        elif 'model.projector.weight' in projector_weights:
            self.projector.deepseek_proj.weight.data = projector_weights['model.projector.weight']
            self.projector.deepseek_proj.bias.data = projector_weights['model.projector.bias']
            print(f"Loaded DeepSeek projector weights (model. prefix)")
        else:
            print(f"Warning: Could not find projector weights. Available keys: {list(projector_weights.keys())}")
637
+
638
+ def disable_torch_init(self):
639
+ """
640
+ Disable the redundant torch default initialization to accelerate model creation.
641
+ """
642
+ import torch
643
+ setattr(torch.nn.Linear, "reset_parameters", lambda self: None)
644
+ setattr(torch.nn.LayerNorm, "reset_parameters", lambda self: None)
645
+
646
+ def infer(
647
+ self,
648
+ tokenizer,
649
+ prompt='',
650
+ image_file='',
651
+ output_path = '',
652
+ base_size=1024,
653
+ image_size=640,
654
+ crop_mode=True,
655
+ test_compress=False,
656
+ save_results=False,
657
+ eval_mode=False
658
+ ):
659
+ self.disable_torch_init()
660
+
661
+ os.makedirs(output_path, exist_ok=True)
662
+ os.makedirs(f'{output_path}/images', exist_ok=True)
663
+ conversation = [
664
+ {
665
+ "role": "user",
666
+ "content": [
667
+ {
668
+ "type": "image",
669
+ "image": f"{image_file}",
670
+ },
671
+ {"type": "text", "text": f"{prompt}"},
672
+ ],
673
+ }
674
+ ]
675
+
676
+ formatted_prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
677
+
678
+ patch_size = 16
679
+ downsample_ratio = 4
680
+ images = load_pil_images(conversation)
681
+
682
+ valid_img_tokens = 0
683
+ ratio = 1
684
+
685
+ image_draw = images[0].copy()
686
+
687
+ w,h = image_draw.size
688
+ ratio = 1 - ((max(w, h) - min(w, h)) / (max(w, h)))
689
+
690
+
691
+ image_transform=BasicImageTransform(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), normalize=True)
692
+ images_seq_mask = []
693
+
694
+ image_token = '<|image_pad|>'
695
+ image_token_id = 151655
696
+ text_splits = formatted_prompt.split(image_token)
697
+
698
+ images_list, images_crop_list, images_seq_mask = [], [], []
699
+ tokenized_str = []
700
+ images_spatial_crop = []
701
+ for text_sep, image in zip(text_splits, images):
702
+
703
+ tokenized_sep = text_encode(tokenizer, text_sep, bos=False, eos=False)
704
+ tokenized_str += tokenized_sep
705
+ images_seq_mask += [False] * len(tokenized_sep)
706
+
707
+ if crop_mode:
708
+
709
+ if image.size[0] <= 640 and image.size[1] <= 640:
710
+ crop_ratio = [1, 1]
711
+
712
+ else:
713
+ if crop_mode:
714
+ images_crop_raw, crop_ratio = dynamic_preprocess(image)
715
+ else:
716
+ crop_ratio = [1, 1]
717
+
718
+ global_view = ImageOps.pad(image, (base_size, base_size),
719
+ color=tuple(int(x * 255) for x in image_transform.mean))
720
+
721
+ if base_size == 1024:
722
+ valid_img_tokens += int(256 * ratio)
723
+ elif base_size == 1280:
724
+ valid_img_tokens += int(400 * ratio)
725
+ # elif base_size == 640:
726
+ # valid_img_tokens += int(100 * ratio)
727
+
728
+
729
+ images_list.append(image_transform(global_view).to(torch.bfloat16))
730
+
731
+ # global_view_tensor = image_transform(global_view).to(torch.bfloat16)
732
+
733
+ width_crop_num, height_crop_num = crop_ratio
734
+
735
+ images_spatial_crop.append([width_crop_num, height_crop_num])
736
+
737
+
738
+ if width_crop_num > 1 or height_crop_num > 1:
739
+ """process the local views"""
740
+
741
+ for i in range(len(images_crop_raw)):
742
+ images_crop_list.append(image_transform(images_crop_raw[i]).to(torch.bfloat16))
743
+
744
+ if image_size == 640:
745
+ valid_img_tokens += len(images_crop_list) * 100
746
+
747
+ num_queries = math.ceil((image_size // patch_size) / downsample_ratio)
748
+ num_queries_base = math.ceil((base_size // patch_size) / downsample_ratio)
749
+
750
+ """add image tokens"""
751
+
752
+ tokenized_image = ([image_token_id] * num_queries_base + [image_token_id]) * num_queries_base
753
+ tokenized_image += [image_token_id]
754
+ if width_crop_num > 1 or height_crop_num > 1:
755
+ tokenized_image += ([image_token_id] * (num_queries * width_crop_num) + [image_token_id]) * (
756
+ num_queries * height_crop_num)
757
+ tokenized_str += tokenized_image
758
+ images_seq_mask += [True] * len(tokenized_image)
759
+ # num_image_tokens.append(len(tokenized_image))
760
+
761
+ else:
762
+ """process the global view"""
763
+ if image_size <= 640:
764
+ image = image.resize((image_size, image_size))
765
+ global_view = ImageOps.pad(image, (image_size, image_size),
766
+ color=tuple(int(x * 255) for x in image_transform.mean))
767
+ images_list.append(image_transform(global_view).to(torch.bfloat16))
768
+
769
+ if base_size == 1024:
770
+ valid_img_tokens += int(256 * ratio)
771
+ elif base_size == 1280:
772
+ valid_img_tokens += int(400 * ratio)
773
+ elif base_size == 640:
774
+ valid_img_tokens += int(100 * 1)
775
+ elif base_size == 512:
776
+ valid_img_tokens += int(64 * 1)
777
+
778
+ width_crop_num, height_crop_num = 1, 1
779
+
780
+ images_spatial_crop.append([width_crop_num, height_crop_num])
781
+
782
+
783
+ """add image tokens"""
784
+ num_queries = math.ceil((image_size // patch_size) / downsample_ratio)
785
+
786
+ tokenized_image = ([image_token_id] * num_queries + [image_token_id]) * num_queries
787
+ tokenized_image += [image_token_id]
788
+ # tokenized_image += ([self.image_token_id] * (num_queries * width_crop_num) + [self.image_token_id]) * (
789
+ # num_queries * height_crop_num)
790
+ tokenized_str += tokenized_image
791
+ images_seq_mask += [True] * len(tokenized_image)
792
+ # num_image_tokens.append(len(tokenized_image))
793
+
794
+ """process the last text split"""
795
+ tokenized_sep = text_encode(tokenizer, text_splits[-1], bos=False, eos=False)
796
+ tokenized_str += tokenized_sep
797
+ images_seq_mask += [False] * len(tokenized_sep)
798
+
799
+ # Qwen2VL has NO bos_token (bos_token_id is None)
800
+ # The chat template already handles proper formatting
801
+
802
+ input_ids = torch.LongTensor(tokenized_str)
803
+
804
+ images_seq_mask = torch.tensor(images_seq_mask, dtype=torch.bool)
805
+
806
+ if len(images_list) == 0:
807
+ images_ori = torch.zeros((1, 3, image_size, image_size))
808
+ images_spatial_crop = torch.zeros((1, 2), dtype=torch.long)
809
+ images_crop = torch.zeros((1, 3, base_size, base_size))
810
+
811
+ else:
812
+ images_ori = torch.stack(images_list, dim=0)
813
+ images_spatial_crop = torch.tensor(images_spatial_crop, dtype=torch.long)
814
+ if images_crop_list:
815
+ images_crop = torch.stack(images_crop_list, dim=0)
816
+ else:
817
+ images_crop = torch.zeros((1, 3, base_size, base_size))
818
+
819
+
820
+
821
+ if not eval_mode:
822
+ streamer = NoEOSTextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=False)
823
+ with torch.autocast("cuda", dtype=torch.bfloat16):
824
+ with torch.no_grad():
825
+ output_ids = self.generate(
826
+ input_ids.unsqueeze(0).cuda(),
827
+ images=[(images_crop.cuda(), images_ori.cuda())],
828
+ images_seq_mask=images_seq_mask.unsqueeze(0).cuda(),
829
+ images_spatial_crop=images_spatial_crop,
830
+ temperature=0.5,
831
+ eos_token_id=tokenizer.eos_token_id,
832
+ streamer=streamer,
833
+ max_new_tokens=8192,
834
+ no_repeat_ngram_size=20,
835
+ use_cache=True
836
+ )
837
+ else:
838
+ with torch.autocast("cuda", dtype=torch.bfloat16):
839
+ with torch.no_grad():
840
+ output_ids = self.generate(
841
+ input_ids.unsqueeze(0).cuda(),
842
+ images=[(images_crop.cuda(), images_ori.cuda())],
843
+ images_seq_mask=images_seq_mask.unsqueeze(0).cuda(),
844
+ images_spatial_crop=images_spatial_crop,
845
+ temperature=0.5,
846
+ eos_token_id=tokenizer.eos_token_id,
847
+ max_new_tokens=8192,
848
+ no_repeat_ngram_size=35,
849
+ use_cache=True
850
+ )
851
+
852
+ # Check if conversation has image
853
+ has_image = any(
854
+ (isinstance(item, dict) and item.get('type') == 'image')
855
+ for msg in conversation
856
+ for item in (msg.get('content', []) if isinstance(msg.get('content'), list) else [])
857
+ )
858
+
859
+ if has_image and eval_mode:
860
+ outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:], skip_special_tokens=False)
861
+ # Qwen2VL's EOS token is <|im_end|>
862
+ stop_str = tokenizer.eos_token or '<|im_end|>'
863
+ if outputs.endswith(stop_str):
864
+ outputs = outputs[:-len(stop_str)]
865
+ outputs = outputs.strip()
866
+
867
+ return outputs
868
+
869
+ if has_image and test_compress:
870
+ outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:], skip_special_tokens=False)
871
+ pure_texts_outputs_token_length = len(text_encode(tokenizer, outputs, bos=False, eos=False))
872
+ print('='*50)
873
+ print('image size: ', (w, h))
874
+ print('valid image tokens: ', int(valid_img_tokens))
875
+ print('output texts tokens (valid): ', pure_texts_outputs_token_length)
876
+ print('compression ratio: ', round(pure_texts_outputs_token_length/valid_img_tokens, 2))
877
+ print('='*50)
878
+
879
+
880
+ if has_image and save_results:
881
+ outputs = tokenizer.decode(output_ids[0, input_ids.unsqueeze(0).cuda().shape[1]:], skip_special_tokens=False)
882
+ # Qwen2VL's EOS token
883
+ stop_str = tokenizer.eos_token or '<|im_end|>'
884
+
885
+ print('='*15 + 'save results:' + '='*15)
886
+
887
+ if outputs.endswith(stop_str):
888
+ outputs = outputs[:-len(stop_str)]
889
+ outputs = outputs.strip()
890
+
891
+ matches_ref, matches_images, mathes_other = re_match(outputs)
892
+ result = process_image_with_refs(image_draw, matches_ref, output_path)
893
+
894
+ for idx, a_match_image in enumerate(tqdm(matches_images, desc="image")):
895
+ outputs = outputs.replace(a_match_image, '![](images/' + str(idx) + '.jpg)\n')
896
+
897
+ for idx, a_match_other in enumerate(tqdm(mathes_other, desc="other")):
898
+ outputs = outputs.replace(a_match_other, '').replace('\\coloneqq', ':=').replace('\\eqqcolon', '=:')
899
+
900
+ with open(f'{output_path}/result.mmd', 'w', encoding = 'utf-8') as afile:
901
+ afile.write(outputs)
902
+
903
+ if 'line_type' in outputs:
904
+ import matplotlib.pyplot as plt
905
+ lines = eval(outputs)['Line']['line']
906
+
907
+ line_type = eval(outputs)['Line']['line_type']
908
+ endpoints = eval(outputs)['Line']['line_endpoint']
909
+
910
+ fig, ax = plt.subplots(figsize=(3,3), dpi=200)
911
+ ax.set_xlim(-15, 15)
912
+ ax.set_ylim(-15, 15)
913
+
914
+ for idx, line in enumerate(lines):
915
+ try:
916
+ p0 = eval(line.split(' -- ')[0])
917
+ p1 = eval(line.split(' -- ')[-1])
918
+
919
+ if line_type[idx] == '--':
920
+ ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth=0.8, color='k')
921
+ else:
922
+ ax.plot([p0[0], p1[0]], [p0[1], p1[1]], linewidth = 0.8, color = 'k')
923
+
924
+ ax.scatter(p0[0], p0[1], s=5, color = 'k')
925
+ ax.scatter(p1[0], p1[1], s=5, color = 'k')
926
+ except:
927
+ pass
928
+
929
+ for endpoint in endpoints:
930
+
931
+ label = endpoint.split(': ')[0]
932
+ (x, y) = eval(endpoint.split(': ')[1])
933
+ ax.annotate(label, (x, y), xytext=(1, 1), textcoords='offset points',
934
+ fontsize=5, fontweight='light')
935
+
936
+
937
+ plt.savefig(f'{output_path}/geo.jpg')
938
+ plt.close()
939
+
940
+ result.save(f"{output_path}/result_with_boxes.jpg")
941
+
942
+
943
+ ## TODO
944
+
945
+ # new training loop:
946
+ ## image -> vision encoder -> projection ->! txt_decoder -> embedding -> pool
947
+ # => alignment(text_pooling, image_pooling)
948
+ ## text -> text encoder -> projection -> embedding -> pool
949
+
950
+ ## can't feed the projection layer's output directly into the text decoder
model.safetensors.index.json ADDED
@@ -0,0 +1,827 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 1947057152,
4
+ "total_size": 6985491464
5
+ },
6
+ "weight_map": {
7
+ "embed_tokens.weight": "model-00001-of-00002.safetensors",
8
+ "image_newline": "model-00001-of-00002.safetensors",
9
+ "layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors",
10
+ "layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
11
+ "layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
12
+ "layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
13
+ "layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
14
+ "layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
15
+ "layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
16
+ "layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
17
+ "layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
18
+ "layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
19
+ "layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
20
+ "layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
21
+ "layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors",
22
+ "layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
23
+ "layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
24
+ "layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
25
+ "layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
26
+ "layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
27
+ "layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
28
+ "layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
29
+ "layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
30
+ "layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
31
+ "layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
32
+ "layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
33
+ "layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors",
34
+ "layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
35
+ "layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
36
+ "layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
37
+ "layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
38
+ "layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
39
+ "layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
40
+ "layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
41
+ "layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
42
+ "layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
43
+ "layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
44
+ "layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
45
+ "layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors",
46
+ "layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
47
+ "layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
48
+ "layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
49
+ "layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
50
+ "layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
51
+ "layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
52
+ "layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
53
+ "layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
54
+ "layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
55
+ "layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
56
+ "layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
57
+ "layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors",
58
+ "layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
59
+ "layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
60
+ "layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
61
+ "layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
62
+ "layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
63
+ "layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
64
+ "layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
65
+ "layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
66
+ "layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
67
+ "layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
68
+ "layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
69
+ "layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors",
70
+ "layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
71
+ "layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
72
+ "layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
73
+ "layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
74
+ "layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
75
+ "layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
76
+ "layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
77
+ "layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
78
+ "layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
79
+ "layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
80
+ "layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
81
+ "layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors",
82
+ "layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
83
+ "layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
84
+ "layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
85
+ "layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
86
+ "layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
87
+ "layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
88
+ "layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
89
+ "layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
90
+ "layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
91
+ "layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
92
+ "layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
93
+ "layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors",
94
+ "layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
95
+ "layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
96
+ "layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
97
+ "layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
98
+ "layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
99
+ "layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
100
+ "layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
101
+ "layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
102
+ "layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
103
+ "layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
104
+ "layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
105
+ "layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors",
106
+ "layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
107
+ "layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
108
+ "layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
109
+ "layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
110
+ "layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
111
+ "layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
112
+ "layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
113
+ "layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
114
+ "layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
115
+ "layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
116
+ "layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
117
+ "layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors",
118
+ "layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
119
+ "layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
120
+ "layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
121
+ "layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
122
+ "layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
123
+ "layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
124
+ "layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
125
+ "layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
126
+ "layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
127
+ "layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
128
+ "layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
129
+ "layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors",
130
+ "layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
131
+ "layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
132
+ "layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
133
+ "layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
134
+ "layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
135
+ "layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
136
+ "layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
137
+ "layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
138
+ "layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
139
+ "layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
140
+ "layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
141
+ "layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors",
142
+ "layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
143
+ "layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
144
+ "layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
145
+ "layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
146
+ "layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
147
+ "layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
148
+ "layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
149
+ "layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
150
+ "layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
151
+ "layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
152
+ "layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
153
+ "layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors",
154
+ "layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
155
+ "layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
156
+ "layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
157
+ "layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
158
+ "layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
159
+ "layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
160
+ "layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
161
+ "layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
162
+ "layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
163
+ "layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
164
+ "layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
165
+ "layers.20.input_layernorm.weight": "model-00001-of-00002.safetensors",
166
+ "layers.20.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
167
+ "layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
168
+ "layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
169
+ "layers.20.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
170
+ "layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
171
+ "layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
172
+ "layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
173
+ "layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
174
+ "layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
175
+ "layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
176
+ "layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
177
+ "layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors",
178
+ "layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
179
+ "layers.21.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
180
+ "layers.21.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
181
+ "layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
182
+ "layers.21.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
183
+ "layers.21.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
184
+ "layers.21.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
185
+ "layers.21.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
186
+ "layers.21.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
187
+ "layers.21.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
188
+ "layers.21.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
189
+ "layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors",
190
+ "layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
191
+ "layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
192
+ "layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
193
+ "layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
194
+ "layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
195
+ "layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
196
+ "layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
197
+ "layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
198
+ "layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
199
+ "layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
200
+ "layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
201
+ "layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors",
202
+ "layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
203
+ "layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
204
+ "layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
205
+ "layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
206
+ "layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
207
+ "layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
208
+ "layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
209
+ "layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
210
+ "layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
211
+ "layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
212
+ "layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
213
+ "layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors",
214
+ "layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
215
+ "layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
216
+ "layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
217
+ "layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
218
+ "layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
219
+ "layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
220
+ "layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
221
+ "layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
222
+ "layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
223
+ "layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
224
+ "layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
225
+ "layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors",
226
+ "layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
227
+ "layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
228
+ "layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
229
+ "layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
230
+ "layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
231
+ "layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
232
+ "layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
233
+ "layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
234
+ "layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
235
+ "layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
236
+ "layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
237
+ "layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors",
238
+ "layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
239
+ "layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
240
+ "layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
241
+ "layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
242
+ "layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
243
+ "layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
244
+ "layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
245
+ "layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
246
+ "layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
247
+ "layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
248
+ "layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
249
+ "layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors",
250
+ "layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors",
251
+ "layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors",
252
+ "layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors",
253
+ "layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors",
254
+ "layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors",
255
+ "layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors",
256
+ "layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors",
257
+ "layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors",
258
+ "layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors",
259
+ "layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors",
260
+ "layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors",
261
+ "layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors",
262
+ "layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
263
+ "layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
264
+ "layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
265
+ "layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
266
+ "layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
267
+ "layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
268
+ "layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
269
+ "layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
270
+ "layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
271
+ "layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
272
+ "layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
273
+ "layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors",
274
+ "layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
275
+ "layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
276
+ "layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
277
+ "layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
278
+ "layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
279
+ "layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
280
+ "layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
281
+ "layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
282
+ "layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
283
+ "layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
284
+ "layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
285
+ "layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors",
286
+ "layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
287
+ "layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
288
+ "layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
289
+ "layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
290
+ "layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
291
+ "layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
292
+ "layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
293
+ "layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
294
+ "layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
295
+ "layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
296
+ "layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
297
+ "layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors",
298
+ "layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
299
+ "layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
300
+ "layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
301
+ "layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
302
+ "layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
303
+ "layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
304
+ "layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
305
+ "layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
306
+ "layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
307
+ "layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
308
+ "layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
309
+ "layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors",
310
+ "layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
311
+ "layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
312
+ "layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
313
+ "layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
314
+ "layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
315
+ "layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
316
+ "layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
317
+ "layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
318
+ "layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
319
+ "layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
320
+ "layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
321
+ "layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors",
322
+ "layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
323
+ "layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
324
+ "layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
325
+ "layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
326
+ "layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
327
+ "layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
328
+ "layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
329
+ "layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
330
+ "layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
331
+ "layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
332
+ "layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
333
+ "layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors",
334
+ "layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors",
335
+ "layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors",
336
+ "layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors",
337
+ "layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors",
338
+ "layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors",
339
+ "layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors",
340
+ "layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors",
341
+ "layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors",
342
+ "layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors",
343
+ "layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors",
344
+ "layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors",
345
+ "norm.weight": "model-00002-of-00002.safetensors",
346
+ "projector.adapter.bias": "model-00002-of-00002.safetensors",
347
+ "projector.adapter.weight": "model-00002-of-00002.safetensors",
348
+ "projector.deepseek_proj.bias": "model-00002-of-00002.safetensors",
349
+ "projector.deepseek_proj.weight": "model-00002-of-00002.safetensors",
350
+ "projector.norm.bias": "model-00002-of-00002.safetensors",
351
+ "projector.norm.weight": "model-00002-of-00002.safetensors",
352
+ "sam_model.blocks.0.attn.proj.bias": "model-00002-of-00002.safetensors",
353
+ "sam_model.blocks.0.attn.proj.weight": "model-00002-of-00002.safetensors",
354
+ "sam_model.blocks.0.attn.qkv.bias": "model-00002-of-00002.safetensors",
355
+ "sam_model.blocks.0.attn.qkv.weight": "model-00002-of-00002.safetensors",
356
+ "sam_model.blocks.0.attn.rel_pos_h": "model-00002-of-00002.safetensors",
357
+ "sam_model.blocks.0.attn.rel_pos_w": "model-00002-of-00002.safetensors",
358
+ "sam_model.blocks.0.mlp.lin1.bias": "model-00002-of-00002.safetensors",
359
+ "sam_model.blocks.0.mlp.lin1.weight": "model-00002-of-00002.safetensors",
360
+ "sam_model.blocks.0.mlp.lin2.bias": "model-00002-of-00002.safetensors",
361
+ "sam_model.blocks.0.mlp.lin2.weight": "model-00002-of-00002.safetensors",
362
+ "sam_model.blocks.0.norm1.bias": "model-00002-of-00002.safetensors",
363
+ "sam_model.blocks.0.norm1.weight": "model-00002-of-00002.safetensors",
364
+ "sam_model.blocks.0.norm2.bias": "model-00002-of-00002.safetensors",
365
+ "sam_model.blocks.0.norm2.weight": "model-00002-of-00002.safetensors",
366
+ "sam_model.blocks.1.attn.proj.bias": "model-00002-of-00002.safetensors",
367
+ "sam_model.blocks.1.attn.proj.weight": "model-00002-of-00002.safetensors",
368
+ "sam_model.blocks.1.attn.qkv.bias": "model-00002-of-00002.safetensors",
369
+ "sam_model.blocks.1.attn.qkv.weight": "model-00002-of-00002.safetensors",
370
+ "sam_model.blocks.1.attn.rel_pos_h": "model-00002-of-00002.safetensors",
371
+ "sam_model.blocks.1.attn.rel_pos_w": "model-00002-of-00002.safetensors",
372
+ "sam_model.blocks.1.mlp.lin1.bias": "model-00002-of-00002.safetensors",
373
+ "sam_model.blocks.1.mlp.lin1.weight": "model-00002-of-00002.safetensors",
374
+ "sam_model.blocks.1.mlp.lin2.bias": "model-00002-of-00002.safetensors",
375
+ "sam_model.blocks.1.mlp.lin2.weight": "model-00002-of-00002.safetensors",
376
+ "sam_model.blocks.1.norm1.bias": "model-00002-of-00002.safetensors",
377
+ "sam_model.blocks.1.norm1.weight": "model-00002-of-00002.safetensors",
378
+ "sam_model.blocks.1.norm2.bias": "model-00002-of-00002.safetensors",
379
+ "sam_model.blocks.1.norm2.weight": "model-00002-of-00002.safetensors",
380
+ "sam_model.blocks.10.attn.proj.bias": "model-00002-of-00002.safetensors",
381
+ "sam_model.blocks.10.attn.proj.weight": "model-00002-of-00002.safetensors",
382
+ "sam_model.blocks.10.attn.qkv.bias": "model-00002-of-00002.safetensors",
383
+ "sam_model.blocks.10.attn.qkv.weight": "model-00002-of-00002.safetensors",
384
+ "sam_model.blocks.10.attn.rel_pos_h": "model-00002-of-00002.safetensors",
385
+ "sam_model.blocks.10.attn.rel_pos_w": "model-00002-of-00002.safetensors",
386
+ "sam_model.blocks.10.mlp.lin1.bias": "model-00002-of-00002.safetensors",
387
+ "sam_model.blocks.10.mlp.lin1.weight": "model-00002-of-00002.safetensors",
388
+ "sam_model.blocks.10.mlp.lin2.bias": "model-00002-of-00002.safetensors",
389
+ "sam_model.blocks.10.mlp.lin2.weight": "model-00002-of-00002.safetensors",
390
+ "sam_model.blocks.10.norm1.bias": "model-00002-of-00002.safetensors",
391
+ "sam_model.blocks.10.norm1.weight": "model-00002-of-00002.safetensors",
392
+ "sam_model.blocks.10.norm2.bias": "model-00002-of-00002.safetensors",
393
+ "sam_model.blocks.10.norm2.weight": "model-00002-of-00002.safetensors",
394
+ "sam_model.blocks.11.attn.proj.bias": "model-00002-of-00002.safetensors",
395
+ "sam_model.blocks.11.attn.proj.weight": "model-00002-of-00002.safetensors",
396
+ "sam_model.blocks.11.attn.qkv.bias": "model-00002-of-00002.safetensors",
397
+ "sam_model.blocks.11.attn.qkv.weight": "model-00002-of-00002.safetensors",
398
+ "sam_model.blocks.11.attn.rel_pos_h": "model-00002-of-00002.safetensors",
399
+ "sam_model.blocks.11.attn.rel_pos_w": "model-00002-of-00002.safetensors",
400
+ "sam_model.blocks.11.mlp.lin1.bias": "model-00002-of-00002.safetensors",
401
+ "sam_model.blocks.11.mlp.lin1.weight": "model-00002-of-00002.safetensors",
402
+ "sam_model.blocks.11.mlp.lin2.bias": "model-00002-of-00002.safetensors",
403
+ "sam_model.blocks.11.mlp.lin2.weight": "model-00002-of-00002.safetensors",
404
+ "sam_model.blocks.11.norm1.bias": "model-00002-of-00002.safetensors",
405
+ "sam_model.blocks.11.norm1.weight": "model-00002-of-00002.safetensors",
406
+ "sam_model.blocks.11.norm2.bias": "model-00002-of-00002.safetensors",
407
+ "sam_model.blocks.11.norm2.weight": "model-00002-of-00002.safetensors",
408
+ "sam_model.blocks.2.attn.proj.bias": "model-00002-of-00002.safetensors",
409
+ "sam_model.blocks.2.attn.proj.weight": "model-00002-of-00002.safetensors",
410
+ "sam_model.blocks.2.attn.qkv.bias": "model-00002-of-00002.safetensors",
411
+ "sam_model.blocks.2.attn.qkv.weight": "model-00002-of-00002.safetensors",
412
+ "sam_model.blocks.2.attn.rel_pos_h": "model-00002-of-00002.safetensors",
413
+ "sam_model.blocks.2.attn.rel_pos_w": "model-00002-of-00002.safetensors",
414
+ "sam_model.blocks.2.mlp.lin1.bias": "model-00002-of-00002.safetensors",
415
+ "sam_model.blocks.2.mlp.lin1.weight": "model-00002-of-00002.safetensors",
416
+ "sam_model.blocks.2.mlp.lin2.bias": "model-00002-of-00002.safetensors",
417
+ "sam_model.blocks.2.mlp.lin2.weight": "model-00002-of-00002.safetensors",
418
+ "sam_model.blocks.2.norm1.bias": "model-00002-of-00002.safetensors",
419
+ "sam_model.blocks.2.norm1.weight": "model-00002-of-00002.safetensors",
420
+ "sam_model.blocks.2.norm2.bias": "model-00002-of-00002.safetensors",
421
+ "sam_model.blocks.2.norm2.weight": "model-00002-of-00002.safetensors",
422
+ "sam_model.blocks.3.attn.proj.bias": "model-00002-of-00002.safetensors",
423
+ "sam_model.blocks.3.attn.proj.weight": "model-00002-of-00002.safetensors",
424
+ "sam_model.blocks.3.attn.qkv.bias": "model-00002-of-00002.safetensors",
425
+ "sam_model.blocks.3.attn.qkv.weight": "model-00002-of-00002.safetensors",
426
+ "sam_model.blocks.3.attn.rel_pos_h": "model-00002-of-00002.safetensors",
427
+ "sam_model.blocks.3.attn.rel_pos_w": "model-00002-of-00002.safetensors",
428
+ "sam_model.blocks.3.mlp.lin1.bias": "model-00002-of-00002.safetensors",
429
+ "sam_model.blocks.3.mlp.lin1.weight": "model-00002-of-00002.safetensors",
430
+ "sam_model.blocks.3.mlp.lin2.bias": "model-00002-of-00002.safetensors",
431
+ "sam_model.blocks.3.mlp.lin2.weight": "model-00002-of-00002.safetensors",
432
+ "sam_model.blocks.3.norm1.bias": "model-00002-of-00002.safetensors",
433
+ "sam_model.blocks.3.norm1.weight": "model-00002-of-00002.safetensors",
434
+ "sam_model.blocks.3.norm2.bias": "model-00002-of-00002.safetensors",
435
+ "sam_model.blocks.3.norm2.weight": "model-00002-of-00002.safetensors",
436
+ "sam_model.blocks.4.attn.proj.bias": "model-00002-of-00002.safetensors",
437
+ "sam_model.blocks.4.attn.proj.weight": "model-00002-of-00002.safetensors",
438
+ "sam_model.blocks.4.attn.qkv.bias": "model-00002-of-00002.safetensors",
439
+ "sam_model.blocks.4.attn.qkv.weight": "model-00002-of-00002.safetensors",
440
+ "sam_model.blocks.4.attn.rel_pos_h": "model-00002-of-00002.safetensors",
441
+ "sam_model.blocks.4.attn.rel_pos_w": "model-00002-of-00002.safetensors",
442
+ "sam_model.blocks.4.mlp.lin1.bias": "model-00002-of-00002.safetensors",
443
+ "sam_model.blocks.4.mlp.lin1.weight": "model-00002-of-00002.safetensors",
444
+ "sam_model.blocks.4.mlp.lin2.bias": "model-00002-of-00002.safetensors",
445
+ "sam_model.blocks.4.mlp.lin2.weight": "model-00002-of-00002.safetensors",
446
+ "sam_model.blocks.4.norm1.bias": "model-00002-of-00002.safetensors",
447
+ "sam_model.blocks.4.norm1.weight": "model-00002-of-00002.safetensors",
448
+ "sam_model.blocks.4.norm2.bias": "model-00002-of-00002.safetensors",
449
+ "sam_model.blocks.4.norm2.weight": "model-00002-of-00002.safetensors",
450
+ "sam_model.blocks.5.attn.proj.bias": "model-00002-of-00002.safetensors",
451
+ "sam_model.blocks.5.attn.proj.weight": "model-00002-of-00002.safetensors",
452
+ "sam_model.blocks.5.attn.qkv.bias": "model-00002-of-00002.safetensors",
453
+ "sam_model.blocks.5.attn.qkv.weight": "model-00002-of-00002.safetensors",
454
+ "sam_model.blocks.5.attn.rel_pos_h": "model-00002-of-00002.safetensors",
455
+ "sam_model.blocks.5.attn.rel_pos_w": "model-00002-of-00002.safetensors",
456
+ "sam_model.blocks.5.mlp.lin1.bias": "model-00002-of-00002.safetensors",
457
+ "sam_model.blocks.5.mlp.lin1.weight": "model-00002-of-00002.safetensors",
458
+ "sam_model.blocks.5.mlp.lin2.bias": "model-00002-of-00002.safetensors",
459
+ "sam_model.blocks.5.mlp.lin2.weight": "model-00002-of-00002.safetensors",
460
+ "sam_model.blocks.5.norm1.bias": "model-00002-of-00002.safetensors",
461
+ "sam_model.blocks.5.norm1.weight": "model-00002-of-00002.safetensors",
462
+ "sam_model.blocks.5.norm2.bias": "model-00002-of-00002.safetensors",
463
+ "sam_model.blocks.5.norm2.weight": "model-00002-of-00002.safetensors",
464
+ "sam_model.blocks.6.attn.proj.bias": "model-00002-of-00002.safetensors",
465
+ "sam_model.blocks.6.attn.proj.weight": "model-00002-of-00002.safetensors",
466
+ "sam_model.blocks.6.attn.qkv.bias": "model-00002-of-00002.safetensors",
467
+ "sam_model.blocks.6.attn.qkv.weight": "model-00002-of-00002.safetensors",
468
+ "sam_model.blocks.6.attn.rel_pos_h": "model-00002-of-00002.safetensors",
469
+ "sam_model.blocks.6.attn.rel_pos_w": "model-00002-of-00002.safetensors",
470
+ "sam_model.blocks.6.mlp.lin1.bias": "model-00002-of-00002.safetensors",
471
+ "sam_model.blocks.6.mlp.lin1.weight": "model-00002-of-00002.safetensors",
472
+ "sam_model.blocks.6.mlp.lin2.bias": "model-00002-of-00002.safetensors",
473
+ "sam_model.blocks.6.mlp.lin2.weight": "model-00002-of-00002.safetensors",
474
+ "sam_model.blocks.6.norm1.bias": "model-00002-of-00002.safetensors",
475
+ "sam_model.blocks.6.norm1.weight": "model-00002-of-00002.safetensors",
476
+ "sam_model.blocks.6.norm2.bias": "model-00002-of-00002.safetensors",
477
+ "sam_model.blocks.6.norm2.weight": "model-00002-of-00002.safetensors",
478
+ "sam_model.blocks.7.attn.proj.bias": "model-00002-of-00002.safetensors",
479
+ "sam_model.blocks.7.attn.proj.weight": "model-00002-of-00002.safetensors",
480
+ "sam_model.blocks.7.attn.qkv.bias": "model-00002-of-00002.safetensors",
481
+ "sam_model.blocks.7.attn.qkv.weight": "model-00002-of-00002.safetensors",
482
+ "sam_model.blocks.7.attn.rel_pos_h": "model-00002-of-00002.safetensors",
483
+ "sam_model.blocks.7.attn.rel_pos_w": "model-00002-of-00002.safetensors",
484
+ "sam_model.blocks.7.mlp.lin1.bias": "model-00002-of-00002.safetensors",
485
+ "sam_model.blocks.7.mlp.lin1.weight": "model-00002-of-00002.safetensors",
486
+ "sam_model.blocks.7.mlp.lin2.bias": "model-00002-of-00002.safetensors",
487
+ "sam_model.blocks.7.mlp.lin2.weight": "model-00002-of-00002.safetensors",
488
+ "sam_model.blocks.7.norm1.bias": "model-00002-of-00002.safetensors",
489
+ "sam_model.blocks.7.norm1.weight": "model-00002-of-00002.safetensors",
490
+ "sam_model.blocks.7.norm2.bias": "model-00002-of-00002.safetensors",
491
+ "sam_model.blocks.7.norm2.weight": "model-00002-of-00002.safetensors",
492
+ "sam_model.blocks.8.attn.proj.bias": "model-00002-of-00002.safetensors",
493
+ "sam_model.blocks.8.attn.proj.weight": "model-00002-of-00002.safetensors",
494
+ "sam_model.blocks.8.attn.qkv.bias": "model-00002-of-00002.safetensors",
495
+ "sam_model.blocks.8.attn.qkv.weight": "model-00002-of-00002.safetensors",
496
+ "sam_model.blocks.8.attn.rel_pos_h": "model-00002-of-00002.safetensors",
497
+ "sam_model.blocks.8.attn.rel_pos_w": "model-00002-of-00002.safetensors",
498
+ "sam_model.blocks.8.mlp.lin1.bias": "model-00002-of-00002.safetensors",
499
+ "sam_model.blocks.8.mlp.lin1.weight": "model-00002-of-00002.safetensors",
500
+ "sam_model.blocks.8.mlp.lin2.bias": "model-00002-of-00002.safetensors",
501
+ "sam_model.blocks.8.mlp.lin2.weight": "model-00002-of-00002.safetensors",
502
+ "sam_model.blocks.8.norm1.bias": "model-00002-of-00002.safetensors",
503
+ "sam_model.blocks.8.norm1.weight": "model-00002-of-00002.safetensors",
504
+ "sam_model.blocks.8.norm2.bias": "model-00002-of-00002.safetensors",
505
+ "sam_model.blocks.8.norm2.weight": "model-00002-of-00002.safetensors",
506
+ "sam_model.blocks.9.attn.proj.bias": "model-00002-of-00002.safetensors",
507
+ "sam_model.blocks.9.attn.proj.weight": "model-00002-of-00002.safetensors",
508
+ "sam_model.blocks.9.attn.qkv.bias": "model-00002-of-00002.safetensors",
509
+ "sam_model.blocks.9.attn.qkv.weight": "model-00002-of-00002.safetensors",
510
+ "sam_model.blocks.9.attn.rel_pos_h": "model-00002-of-00002.safetensors",
511
+ "sam_model.blocks.9.attn.rel_pos_w": "model-00002-of-00002.safetensors",
512
+ "sam_model.blocks.9.mlp.lin1.bias": "model-00002-of-00002.safetensors",
513
+ "sam_model.blocks.9.mlp.lin1.weight": "model-00002-of-00002.safetensors",
514
+ "sam_model.blocks.9.mlp.lin2.bias": "model-00002-of-00002.safetensors",
515
+ "sam_model.blocks.9.mlp.lin2.weight": "model-00002-of-00002.safetensors",
516
+ "sam_model.blocks.9.norm1.bias": "model-00002-of-00002.safetensors",
517
+ "sam_model.blocks.9.norm1.weight": "model-00002-of-00002.safetensors",
518
+ "sam_model.blocks.9.norm2.bias": "model-00002-of-00002.safetensors",
519
+ "sam_model.blocks.9.norm2.weight": "model-00002-of-00002.safetensors",
520
+ "sam_model.neck.0.weight": "model-00002-of-00002.safetensors",
521
+ "sam_model.neck.1.bias": "model-00002-of-00002.safetensors",
522
+ "sam_model.neck.1.weight": "model-00002-of-00002.safetensors",
523
+ "sam_model.neck.2.weight": "model-00002-of-00002.safetensors",
524
+ "sam_model.neck.3.bias": "model-00002-of-00002.safetensors",
525
+ "sam_model.neck.3.weight": "model-00002-of-00002.safetensors",
526
+ "sam_model.net_2.weight": "model-00002-of-00002.safetensors",
527
+ "sam_model.net_3.weight": "model-00002-of-00002.safetensors",
528
+ "sam_model.patch_embed.proj.bias": "model-00002-of-00002.safetensors",
529
+ "sam_model.patch_embed.proj.weight": "model-00002-of-00002.safetensors",
530
+ "sam_model.pos_embed": "model-00002-of-00002.safetensors",
531
+ "view_separator": "model-00001-of-00002.safetensors",
532
+ "vision_model.embeddings.class_embedding": "model-00002-of-00002.safetensors",
533
+ "vision_model.embeddings.patch_embedding.weight": "model-00002-of-00002.safetensors",
534
+ "vision_model.embeddings.position_embedding.weight": "model-00002-of-00002.safetensors",
535
+ "vision_model.embeddings.position_ids": "model-00002-of-00002.safetensors",
536
+ "vision_model.pre_layrnorm.bias": "model-00002-of-00002.safetensors",
537
+ "vision_model.pre_layrnorm.weight": "model-00002-of-00002.safetensors",
538
+ "vision_model.transformer.layers.0.layer_norm1.bias": "model-00002-of-00002.safetensors",
539
+ "vision_model.transformer.layers.0.layer_norm1.weight": "model-00002-of-00002.safetensors",
540
+ "vision_model.transformer.layers.0.layer_norm2.bias": "model-00002-of-00002.safetensors",
541
+ "vision_model.transformer.layers.0.layer_norm2.weight": "model-00002-of-00002.safetensors",
542
+ "vision_model.transformer.layers.0.mlp.fc1.bias": "model-00002-of-00002.safetensors",
543
+ "vision_model.transformer.layers.0.mlp.fc1.weight": "model-00002-of-00002.safetensors",
544
+ "vision_model.transformer.layers.0.mlp.fc2.bias": "model-00002-of-00002.safetensors",
545
+ "vision_model.transformer.layers.0.mlp.fc2.weight": "model-00002-of-00002.safetensors",
546
+ "vision_model.transformer.layers.0.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
547
+ "vision_model.transformer.layers.0.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
548
+ "vision_model.transformer.layers.0.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
549
+ "vision_model.transformer.layers.0.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
550
+ "vision_model.transformer.layers.1.layer_norm1.bias": "model-00002-of-00002.safetensors",
551
+ "vision_model.transformer.layers.1.layer_norm1.weight": "model-00002-of-00002.safetensors",
552
+ "vision_model.transformer.layers.1.layer_norm2.bias": "model-00002-of-00002.safetensors",
553
+ "vision_model.transformer.layers.1.layer_norm2.weight": "model-00002-of-00002.safetensors",
554
+ "vision_model.transformer.layers.1.mlp.fc1.bias": "model-00002-of-00002.safetensors",
555
+ "vision_model.transformer.layers.1.mlp.fc1.weight": "model-00002-of-00002.safetensors",
556
+ "vision_model.transformer.layers.1.mlp.fc2.bias": "model-00002-of-00002.safetensors",
557
+ "vision_model.transformer.layers.1.mlp.fc2.weight": "model-00002-of-00002.safetensors",
558
+ "vision_model.transformer.layers.1.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
559
+ "vision_model.transformer.layers.1.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
560
+ "vision_model.transformer.layers.1.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
561
+ "vision_model.transformer.layers.1.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
562
+ "vision_model.transformer.layers.10.layer_norm1.bias": "model-00002-of-00002.safetensors",
563
+ "vision_model.transformer.layers.10.layer_norm1.weight": "model-00002-of-00002.safetensors",
564
+ "vision_model.transformer.layers.10.layer_norm2.bias": "model-00002-of-00002.safetensors",
565
+ "vision_model.transformer.layers.10.layer_norm2.weight": "model-00002-of-00002.safetensors",
566
+ "vision_model.transformer.layers.10.mlp.fc1.bias": "model-00002-of-00002.safetensors",
567
+ "vision_model.transformer.layers.10.mlp.fc1.weight": "model-00002-of-00002.safetensors",
568
+ "vision_model.transformer.layers.10.mlp.fc2.bias": "model-00002-of-00002.safetensors",
569
+ "vision_model.transformer.layers.10.mlp.fc2.weight": "model-00002-of-00002.safetensors",
570
+ "vision_model.transformer.layers.10.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
571
+ "vision_model.transformer.layers.10.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
572
+ "vision_model.transformer.layers.10.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
573
+ "vision_model.transformer.layers.10.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
574
+ "vision_model.transformer.layers.11.layer_norm1.bias": "model-00002-of-00002.safetensors",
575
+ "vision_model.transformer.layers.11.layer_norm1.weight": "model-00002-of-00002.safetensors",
576
+ "vision_model.transformer.layers.11.layer_norm2.bias": "model-00002-of-00002.safetensors",
577
+ "vision_model.transformer.layers.11.layer_norm2.weight": "model-00002-of-00002.safetensors",
578
+ "vision_model.transformer.layers.11.mlp.fc1.bias": "model-00002-of-00002.safetensors",
579
+ "vision_model.transformer.layers.11.mlp.fc1.weight": "model-00002-of-00002.safetensors",
580
+ "vision_model.transformer.layers.11.mlp.fc2.bias": "model-00002-of-00002.safetensors",
581
+ "vision_model.transformer.layers.11.mlp.fc2.weight": "model-00002-of-00002.safetensors",
582
+ "vision_model.transformer.layers.11.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
583
+ "vision_model.transformer.layers.11.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
584
+ "vision_model.transformer.layers.11.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
585
+ "vision_model.transformer.layers.11.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
586
+ "vision_model.transformer.layers.12.layer_norm1.bias": "model-00002-of-00002.safetensors",
587
+ "vision_model.transformer.layers.12.layer_norm1.weight": "model-00002-of-00002.safetensors",
588
+ "vision_model.transformer.layers.12.layer_norm2.bias": "model-00002-of-00002.safetensors",
589
+ "vision_model.transformer.layers.12.layer_norm2.weight": "model-00002-of-00002.safetensors",
590
+ "vision_model.transformer.layers.12.mlp.fc1.bias": "model-00002-of-00002.safetensors",
591
+ "vision_model.transformer.layers.12.mlp.fc1.weight": "model-00002-of-00002.safetensors",
592
+ "vision_model.transformer.layers.12.mlp.fc2.bias": "model-00002-of-00002.safetensors",
593
+ "vision_model.transformer.layers.12.mlp.fc2.weight": "model-00002-of-00002.safetensors",
594
+ "vision_model.transformer.layers.12.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
595
+ "vision_model.transformer.layers.12.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
596
+ "vision_model.transformer.layers.12.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
597
+ "vision_model.transformer.layers.12.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
598
+ "vision_model.transformer.layers.13.layer_norm1.bias": "model-00002-of-00002.safetensors",
599
+ "vision_model.transformer.layers.13.layer_norm1.weight": "model-00002-of-00002.safetensors",
600
+ "vision_model.transformer.layers.13.layer_norm2.bias": "model-00002-of-00002.safetensors",
601
+ "vision_model.transformer.layers.13.layer_norm2.weight": "model-00002-of-00002.safetensors",
602
+ "vision_model.transformer.layers.13.mlp.fc1.bias": "model-00002-of-00002.safetensors",
603
+ "vision_model.transformer.layers.13.mlp.fc1.weight": "model-00002-of-00002.safetensors",
604
+ "vision_model.transformer.layers.13.mlp.fc2.bias": "model-00002-of-00002.safetensors",
605
+ "vision_model.transformer.layers.13.mlp.fc2.weight": "model-00002-of-00002.safetensors",
606
+ "vision_model.transformer.layers.13.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
607
+ "vision_model.transformer.layers.13.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
608
+ "vision_model.transformer.layers.13.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
609
+ "vision_model.transformer.layers.13.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
610
+ "vision_model.transformer.layers.14.layer_norm1.bias": "model-00002-of-00002.safetensors",
611
+ "vision_model.transformer.layers.14.layer_norm1.weight": "model-00002-of-00002.safetensors",
612
+ "vision_model.transformer.layers.14.layer_norm2.bias": "model-00002-of-00002.safetensors",
613
+ "vision_model.transformer.layers.14.layer_norm2.weight": "model-00002-of-00002.safetensors",
614
+ "vision_model.transformer.layers.14.mlp.fc1.bias": "model-00002-of-00002.safetensors",
615
+ "vision_model.transformer.layers.14.mlp.fc1.weight": "model-00002-of-00002.safetensors",
616
+ "vision_model.transformer.layers.14.mlp.fc2.bias": "model-00002-of-00002.safetensors",
617
+ "vision_model.transformer.layers.14.mlp.fc2.weight": "model-00002-of-00002.safetensors",
618
+ "vision_model.transformer.layers.14.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
619
+ "vision_model.transformer.layers.14.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
620
+ "vision_model.transformer.layers.14.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
621
+ "vision_model.transformer.layers.14.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
622
+ "vision_model.transformer.layers.15.layer_norm1.bias": "model-00002-of-00002.safetensors",
623
+ "vision_model.transformer.layers.15.layer_norm1.weight": "model-00002-of-00002.safetensors",
624
+ "vision_model.transformer.layers.15.layer_norm2.bias": "model-00002-of-00002.safetensors",
625
+ "vision_model.transformer.layers.15.layer_norm2.weight": "model-00002-of-00002.safetensors",
626
+ "vision_model.transformer.layers.15.mlp.fc1.bias": "model-00002-of-00002.safetensors",
627
+ "vision_model.transformer.layers.15.mlp.fc1.weight": "model-00002-of-00002.safetensors",
628
+ "vision_model.transformer.layers.15.mlp.fc2.bias": "model-00002-of-00002.safetensors",
629
+ "vision_model.transformer.layers.15.mlp.fc2.weight": "model-00002-of-00002.safetensors",
630
+ "vision_model.transformer.layers.15.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
631
+ "vision_model.transformer.layers.15.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
632
+ "vision_model.transformer.layers.15.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
633
+ "vision_model.transformer.layers.15.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
634
+ "vision_model.transformer.layers.16.layer_norm1.bias": "model-00002-of-00002.safetensors",
635
+ "vision_model.transformer.layers.16.layer_norm1.weight": "model-00002-of-00002.safetensors",
636
+ "vision_model.transformer.layers.16.layer_norm2.bias": "model-00002-of-00002.safetensors",
637
+ "vision_model.transformer.layers.16.layer_norm2.weight": "model-00002-of-00002.safetensors",
638
+ "vision_model.transformer.layers.16.mlp.fc1.bias": "model-00002-of-00002.safetensors",
639
+ "vision_model.transformer.layers.16.mlp.fc1.weight": "model-00002-of-00002.safetensors",
640
+ "vision_model.transformer.layers.16.mlp.fc2.bias": "model-00002-of-00002.safetensors",
641
+ "vision_model.transformer.layers.16.mlp.fc2.weight": "model-00002-of-00002.safetensors",
642
+ "vision_model.transformer.layers.16.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
643
+ "vision_model.transformer.layers.16.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
644
+ "vision_model.transformer.layers.16.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
645
+ "vision_model.transformer.layers.16.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
646
+ "vision_model.transformer.layers.17.layer_norm1.bias": "model-00002-of-00002.safetensors",
647
+ "vision_model.transformer.layers.17.layer_norm1.weight": "model-00002-of-00002.safetensors",
648
+ "vision_model.transformer.layers.17.layer_norm2.bias": "model-00002-of-00002.safetensors",
649
+ "vision_model.transformer.layers.17.layer_norm2.weight": "model-00002-of-00002.safetensors",
650
+ "vision_model.transformer.layers.17.mlp.fc1.bias": "model-00002-of-00002.safetensors",
651
+ "vision_model.transformer.layers.17.mlp.fc1.weight": "model-00002-of-00002.safetensors",
652
+ "vision_model.transformer.layers.17.mlp.fc2.bias": "model-00002-of-00002.safetensors",
653
+ "vision_model.transformer.layers.17.mlp.fc2.weight": "model-00002-of-00002.safetensors",
654
+ "vision_model.transformer.layers.17.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
655
+ "vision_model.transformer.layers.17.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
656
+ "vision_model.transformer.layers.17.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
657
+ "vision_model.transformer.layers.17.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
658
+ "vision_model.transformer.layers.18.layer_norm1.bias": "model-00002-of-00002.safetensors",
659
+ "vision_model.transformer.layers.18.layer_norm1.weight": "model-00002-of-00002.safetensors",
660
+ "vision_model.transformer.layers.18.layer_norm2.bias": "model-00002-of-00002.safetensors",
661
+ "vision_model.transformer.layers.18.layer_norm2.weight": "model-00002-of-00002.safetensors",
662
+ "vision_model.transformer.layers.18.mlp.fc1.bias": "model-00002-of-00002.safetensors",
663
+ "vision_model.transformer.layers.18.mlp.fc1.weight": "model-00002-of-00002.safetensors",
664
+ "vision_model.transformer.layers.18.mlp.fc2.bias": "model-00002-of-00002.safetensors",
665
+ "vision_model.transformer.layers.18.mlp.fc2.weight": "model-00002-of-00002.safetensors",
666
+ "vision_model.transformer.layers.18.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
667
+ "vision_model.transformer.layers.18.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
668
+ "vision_model.transformer.layers.18.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
669
+ "vision_model.transformer.layers.18.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
670
+ "vision_model.transformer.layers.19.layer_norm1.bias": "model-00002-of-00002.safetensors",
671
+ "vision_model.transformer.layers.19.layer_norm1.weight": "model-00002-of-00002.safetensors",
672
+ "vision_model.transformer.layers.19.layer_norm2.bias": "model-00002-of-00002.safetensors",
673
+ "vision_model.transformer.layers.19.layer_norm2.weight": "model-00002-of-00002.safetensors",
674
+ "vision_model.transformer.layers.19.mlp.fc1.bias": "model-00002-of-00002.safetensors",
675
+ "vision_model.transformer.layers.19.mlp.fc1.weight": "model-00002-of-00002.safetensors",
676
+ "vision_model.transformer.layers.19.mlp.fc2.bias": "model-00002-of-00002.safetensors",
677
+ "vision_model.transformer.layers.19.mlp.fc2.weight": "model-00002-of-00002.safetensors",
678
+ "vision_model.transformer.layers.19.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
679
+ "vision_model.transformer.layers.19.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
680
+ "vision_model.transformer.layers.19.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
681
+ "vision_model.transformer.layers.19.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
682
+ "vision_model.transformer.layers.2.layer_norm1.bias": "model-00002-of-00002.safetensors",
683
+ "vision_model.transformer.layers.2.layer_norm1.weight": "model-00002-of-00002.safetensors",
684
+ "vision_model.transformer.layers.2.layer_norm2.bias": "model-00002-of-00002.safetensors",
685
+ "vision_model.transformer.layers.2.layer_norm2.weight": "model-00002-of-00002.safetensors",
686
+ "vision_model.transformer.layers.2.mlp.fc1.bias": "model-00002-of-00002.safetensors",
687
+ "vision_model.transformer.layers.2.mlp.fc1.weight": "model-00002-of-00002.safetensors",
688
+ "vision_model.transformer.layers.2.mlp.fc2.bias": "model-00002-of-00002.safetensors",
689
+ "vision_model.transformer.layers.2.mlp.fc2.weight": "model-00002-of-00002.safetensors",
690
+ "vision_model.transformer.layers.2.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
691
+ "vision_model.transformer.layers.2.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
692
+ "vision_model.transformer.layers.2.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
693
+ "vision_model.transformer.layers.2.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
694
+ "vision_model.transformer.layers.20.layer_norm1.bias": "model-00002-of-00002.safetensors",
695
+ "vision_model.transformer.layers.20.layer_norm1.weight": "model-00002-of-00002.safetensors",
696
+ "vision_model.transformer.layers.20.layer_norm2.bias": "model-00002-of-00002.safetensors",
697
+ "vision_model.transformer.layers.20.layer_norm2.weight": "model-00002-of-00002.safetensors",
698
+ "vision_model.transformer.layers.20.mlp.fc1.bias": "model-00002-of-00002.safetensors",
699
+ "vision_model.transformer.layers.20.mlp.fc1.weight": "model-00002-of-00002.safetensors",
700
+ "vision_model.transformer.layers.20.mlp.fc2.bias": "model-00002-of-00002.safetensors",
701
+ "vision_model.transformer.layers.20.mlp.fc2.weight": "model-00002-of-00002.safetensors",
702
+ "vision_model.transformer.layers.20.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
703
+ "vision_model.transformer.layers.20.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
704
+ "vision_model.transformer.layers.20.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
705
+ "vision_model.transformer.layers.20.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
706
+ "vision_model.transformer.layers.21.layer_norm1.bias": "model-00002-of-00002.safetensors",
707
+ "vision_model.transformer.layers.21.layer_norm1.weight": "model-00002-of-00002.safetensors",
708
+ "vision_model.transformer.layers.21.layer_norm2.bias": "model-00002-of-00002.safetensors",
709
+ "vision_model.transformer.layers.21.layer_norm2.weight": "model-00002-of-00002.safetensors",
710
+ "vision_model.transformer.layers.21.mlp.fc1.bias": "model-00002-of-00002.safetensors",
711
+ "vision_model.transformer.layers.21.mlp.fc1.weight": "model-00002-of-00002.safetensors",
712
+ "vision_model.transformer.layers.21.mlp.fc2.bias": "model-00002-of-00002.safetensors",
713
+ "vision_model.transformer.layers.21.mlp.fc2.weight": "model-00002-of-00002.safetensors",
714
+ "vision_model.transformer.layers.21.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
715
+ "vision_model.transformer.layers.21.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
716
+ "vision_model.transformer.layers.21.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
717
+ "vision_model.transformer.layers.21.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
718
+ "vision_model.transformer.layers.22.layer_norm1.bias": "model-00002-of-00002.safetensors",
719
+ "vision_model.transformer.layers.22.layer_norm1.weight": "model-00002-of-00002.safetensors",
720
+ "vision_model.transformer.layers.22.layer_norm2.bias": "model-00002-of-00002.safetensors",
721
+ "vision_model.transformer.layers.22.layer_norm2.weight": "model-00002-of-00002.safetensors",
722
+ "vision_model.transformer.layers.22.mlp.fc1.bias": "model-00002-of-00002.safetensors",
723
+ "vision_model.transformer.layers.22.mlp.fc1.weight": "model-00002-of-00002.safetensors",
724
+ "vision_model.transformer.layers.22.mlp.fc2.bias": "model-00002-of-00002.safetensors",
725
+ "vision_model.transformer.layers.22.mlp.fc2.weight": "model-00002-of-00002.safetensors",
726
+ "vision_model.transformer.layers.22.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
727
+ "vision_model.transformer.layers.22.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
728
+ "vision_model.transformer.layers.22.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
729
+ "vision_model.transformer.layers.22.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
730
+ "vision_model.transformer.layers.23.layer_norm1.bias": "model-00002-of-00002.safetensors",
731
+ "vision_model.transformer.layers.23.layer_norm1.weight": "model-00002-of-00002.safetensors",
732
+ "vision_model.transformer.layers.23.layer_norm2.bias": "model-00002-of-00002.safetensors",
733
+ "vision_model.transformer.layers.23.layer_norm2.weight": "model-00002-of-00002.safetensors",
734
+ "vision_model.transformer.layers.23.mlp.fc1.bias": "model-00002-of-00002.safetensors",
735
+ "vision_model.transformer.layers.23.mlp.fc1.weight": "model-00002-of-00002.safetensors",
736
+ "vision_model.transformer.layers.23.mlp.fc2.bias": "model-00002-of-00002.safetensors",
737
+ "vision_model.transformer.layers.23.mlp.fc2.weight": "model-00002-of-00002.safetensors",
738
+ "vision_model.transformer.layers.23.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
739
+ "vision_model.transformer.layers.23.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
740
+ "vision_model.transformer.layers.23.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
741
+ "vision_model.transformer.layers.23.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
742
+ "vision_model.transformer.layers.3.layer_norm1.bias": "model-00002-of-00002.safetensors",
743
+ "vision_model.transformer.layers.3.layer_norm1.weight": "model-00002-of-00002.safetensors",
744
+ "vision_model.transformer.layers.3.layer_norm2.bias": "model-00002-of-00002.safetensors",
745
+ "vision_model.transformer.layers.3.layer_norm2.weight": "model-00002-of-00002.safetensors",
746
+ "vision_model.transformer.layers.3.mlp.fc1.bias": "model-00002-of-00002.safetensors",
747
+ "vision_model.transformer.layers.3.mlp.fc1.weight": "model-00002-of-00002.safetensors",
748
+ "vision_model.transformer.layers.3.mlp.fc2.bias": "model-00002-of-00002.safetensors",
749
+ "vision_model.transformer.layers.3.mlp.fc2.weight": "model-00002-of-00002.safetensors",
750
+ "vision_model.transformer.layers.3.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
751
+ "vision_model.transformer.layers.3.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
752
+ "vision_model.transformer.layers.3.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
753
+ "vision_model.transformer.layers.3.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
754
+ "vision_model.transformer.layers.4.layer_norm1.bias": "model-00002-of-00002.safetensors",
755
+ "vision_model.transformer.layers.4.layer_norm1.weight": "model-00002-of-00002.safetensors",
756
+ "vision_model.transformer.layers.4.layer_norm2.bias": "model-00002-of-00002.safetensors",
757
+ "vision_model.transformer.layers.4.layer_norm2.weight": "model-00002-of-00002.safetensors",
758
+ "vision_model.transformer.layers.4.mlp.fc1.bias": "model-00002-of-00002.safetensors",
759
+ "vision_model.transformer.layers.4.mlp.fc1.weight": "model-00002-of-00002.safetensors",
760
+ "vision_model.transformer.layers.4.mlp.fc2.bias": "model-00002-of-00002.safetensors",
761
+ "vision_model.transformer.layers.4.mlp.fc2.weight": "model-00002-of-00002.safetensors",
762
+ "vision_model.transformer.layers.4.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
763
+ "vision_model.transformer.layers.4.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
764
+ "vision_model.transformer.layers.4.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
765
+ "vision_model.transformer.layers.4.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
766
+ "vision_model.transformer.layers.5.layer_norm1.bias": "model-00002-of-00002.safetensors",
767
+ "vision_model.transformer.layers.5.layer_norm1.weight": "model-00002-of-00002.safetensors",
768
+ "vision_model.transformer.layers.5.layer_norm2.bias": "model-00002-of-00002.safetensors",
769
+ "vision_model.transformer.layers.5.layer_norm2.weight": "model-00002-of-00002.safetensors",
770
+ "vision_model.transformer.layers.5.mlp.fc1.bias": "model-00002-of-00002.safetensors",
771
+ "vision_model.transformer.layers.5.mlp.fc1.weight": "model-00002-of-00002.safetensors",
772
+ "vision_model.transformer.layers.5.mlp.fc2.bias": "model-00002-of-00002.safetensors",
773
+ "vision_model.transformer.layers.5.mlp.fc2.weight": "model-00002-of-00002.safetensors",
774
+ "vision_model.transformer.layers.5.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
775
+ "vision_model.transformer.layers.5.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
776
+ "vision_model.transformer.layers.5.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
777
+ "vision_model.transformer.layers.5.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
778
+ "vision_model.transformer.layers.6.layer_norm1.bias": "model-00002-of-00002.safetensors",
779
+ "vision_model.transformer.layers.6.layer_norm1.weight": "model-00002-of-00002.safetensors",
780
+ "vision_model.transformer.layers.6.layer_norm2.bias": "model-00002-of-00002.safetensors",
781
+ "vision_model.transformer.layers.6.layer_norm2.weight": "model-00002-of-00002.safetensors",
782
+ "vision_model.transformer.layers.6.mlp.fc1.bias": "model-00002-of-00002.safetensors",
783
+ "vision_model.transformer.layers.6.mlp.fc1.weight": "model-00002-of-00002.safetensors",
784
+ "vision_model.transformer.layers.6.mlp.fc2.bias": "model-00002-of-00002.safetensors",
785
+ "vision_model.transformer.layers.6.mlp.fc2.weight": "model-00002-of-00002.safetensors",
786
+ "vision_model.transformer.layers.6.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
787
+ "vision_model.transformer.layers.6.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
788
+ "vision_model.transformer.layers.6.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
789
+ "vision_model.transformer.layers.6.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
790
+ "vision_model.transformer.layers.7.layer_norm1.bias": "model-00002-of-00002.safetensors",
791
+ "vision_model.transformer.layers.7.layer_norm1.weight": "model-00002-of-00002.safetensors",
792
+ "vision_model.transformer.layers.7.layer_norm2.bias": "model-00002-of-00002.safetensors",
793
+ "vision_model.transformer.layers.7.layer_norm2.weight": "model-00002-of-00002.safetensors",
794
+ "vision_model.transformer.layers.7.mlp.fc1.bias": "model-00002-of-00002.safetensors",
795
+ "vision_model.transformer.layers.7.mlp.fc1.weight": "model-00002-of-00002.safetensors",
796
+ "vision_model.transformer.layers.7.mlp.fc2.bias": "model-00002-of-00002.safetensors",
797
+ "vision_model.transformer.layers.7.mlp.fc2.weight": "model-00002-of-00002.safetensors",
798
+ "vision_model.transformer.layers.7.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
799
+ "vision_model.transformer.layers.7.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
800
+ "vision_model.transformer.layers.7.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
801
+ "vision_model.transformer.layers.7.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
802
+ "vision_model.transformer.layers.8.layer_norm1.bias": "model-00002-of-00002.safetensors",
803
+ "vision_model.transformer.layers.8.layer_norm1.weight": "model-00002-of-00002.safetensors",
804
+ "vision_model.transformer.layers.8.layer_norm2.bias": "model-00002-of-00002.safetensors",
805
+ "vision_model.transformer.layers.8.layer_norm2.weight": "model-00002-of-00002.safetensors",
806
+ "vision_model.transformer.layers.8.mlp.fc1.bias": "model-00002-of-00002.safetensors",
807
+ "vision_model.transformer.layers.8.mlp.fc1.weight": "model-00002-of-00002.safetensors",
808
+ "vision_model.transformer.layers.8.mlp.fc2.bias": "model-00002-of-00002.safetensors",
809
+ "vision_model.transformer.layers.8.mlp.fc2.weight": "model-00002-of-00002.safetensors",
810
+ "vision_model.transformer.layers.8.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
811
+ "vision_model.transformer.layers.8.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
812
+ "vision_model.transformer.layers.8.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
813
+ "vision_model.transformer.layers.8.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors",
814
+ "vision_model.transformer.layers.9.layer_norm1.bias": "model-00002-of-00002.safetensors",
815
+ "vision_model.transformer.layers.9.layer_norm1.weight": "model-00002-of-00002.safetensors",
816
+ "vision_model.transformer.layers.9.layer_norm2.bias": "model-00002-of-00002.safetensors",
817
+ "vision_model.transformer.layers.9.layer_norm2.weight": "model-00002-of-00002.safetensors",
818
+ "vision_model.transformer.layers.9.mlp.fc1.bias": "model-00002-of-00002.safetensors",
819
+ "vision_model.transformer.layers.9.mlp.fc1.weight": "model-00002-of-00002.safetensors",
820
+ "vision_model.transformer.layers.9.mlp.fc2.bias": "model-00002-of-00002.safetensors",
821
+ "vision_model.transformer.layers.9.mlp.fc2.weight": "model-00002-of-00002.safetensors",
822
+ "vision_model.transformer.layers.9.self_attn.out_proj.bias": "model-00002-of-00002.safetensors",
823
+ "vision_model.transformer.layers.9.self_attn.out_proj.weight": "model-00002-of-00002.safetensors",
824
+ "vision_model.transformer.layers.9.self_attn.qkv_proj.bias": "model-00002-of-00002.safetensors",
825
+ "vision_model.transformer.layers.9.self_attn.qkv_proj.weight": "model-00002-of-00002.safetensors"
826
+ }
827
+ }