wwydmanski committed on
Commit
03d6533
·
verified ·
1 Parent(s): 48606e1

Upload folder using huggingface_hub

.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
TabularModel.py ADDED
@@ -0,0 +1,701 @@
1
+ from transformers import (
2
+ AutoConfig,
3
+ AutoProcessor,
4
+ ProcessorMixin,
5
+ Qwen2TokenizerFast,
6
+ BaseImageProcessor,
7
+ Qwen2_5_VLForConditionalGeneration,
8
+ )
9
+ from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import (
10
+ Qwen2_5_VLCausalLMOutputWithPast,
11
+ Qwen2RMSNorm,
12
+ )
13
+ from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
14
+ from transformers.processing_utils import Unpack
15
+ from transformers.feature_extraction_sequence_utils import BatchFeature
16
+
17
+ from typing import List, Optional, TypedDict
18
+
19
+ # from tabpfn_extensions import TabPFNRegressor
20
+ # from tabpfn_extensions.embedding import TabPFNEmbedding
21
+ import numpy as np
22
+
23
+ import torch
24
+ from torch import nn
25
+ from torch.nn import CrossEntropyLoss
26
+
27
+ from pprint import pprint
28
+
29
+
30
+ class TabularProcessorKwargs(TypedDict):
31
+ """
32
+ Keyword arguments for tabular processing.
33
+ """
34
+
35
+ pass
36
+
37
+
38
+ class TabularPreprocessor(BaseImageProcessor):
39
+ def __call__(self, X: list | np.ndarray | torch.Tensor) -> torch.Tensor:
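+ # Stack one or more (rows, cols) tables into a single float32 batch and expose it as the "tabular_values" feature.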
40
+ if not isinstance(X, list):
41
+ X = [X]
42
+
43
+ res = []
44
+ for X_sample in X:
45
+ if isinstance(X_sample, torch.Tensor):
46
+ X_sample = X_sample.cpu().numpy()
47
+
48
+ res.append(X_sample)
49
+ res = np.array(res)
50
+ return BatchFeature(data={"tabular_values": torch.from_numpy(res).to(torch.float32)})
51
+
52
+ AutoProcessor.register("TabularPreprocessor", TabularPreprocessor)
53
+
54
+ class TabularProcessor(nn.Module):
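+ # NOTE: this TabPFN-based processor relies on the commented-out tabpfn_extensions imports above; Qwen2_5_TabularModel below uses TabularLearnableProcessor instead.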
55
+ def __init__(self, **kwargs: Unpack[TabularProcessorKwargs]):
56
+ super().__init__(**kwargs)
57
+ self.tabpfn = TabPFNRegressor(
58
+ n_estimators=1,
59
+ model_path="./tabpfn-v2-regressor.ckpt", device="cuda:1"
60
+ )
61
+
62
+ def __call__(self, X: np.ndarray | torch.Tensor) -> torch.Tensor:
63
+ # Will convert specified categorical indices to category dtype, as well
64
+ # as handle `np.object` arrays or otherwise `object` dtype pandas columns.
65
+ if len(X.shape) == 2:
66
+ X = [X]
67
+ res = []
68
+ for X_sample in X:
69
+ if isinstance(X_sample, torch.Tensor):
70
+ X_sample = X_sample.cpu().numpy()
71
+
72
+ X_sample = X_sample[0]
73
+ self.tabpfn.fit(X_sample, np.random.random(X_sample.shape[0]))
74
+
75
+ embs = self.tabpfn.get_embeddings(X_sample)
76
+ embs_t = torch.from_numpy(embs).to(self.tabpfn.device)
77
+ embs_t = embs_t.mean(dim=0)
78
+ res.append(embs_t)
79
+
80
+ res = torch.stack(res)
81
+ res = res.view(-1, 192)
82
+ return res
83
+
84
+ class TabularBlock(nn.Module):
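+ # Residual MLP block: Linear(input_dim -> hidden_dim) -> GELU -> Linear(hidden_dim -> input_dim), with a skip connection.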
85
+ def __init__(self, input_dim: int, hidden_dim: int = 192):
86
+ super().__init__()
87
+ self.linear1 = nn.Linear(input_dim, hidden_dim)
88
+ self.activation = nn.GELU()
89
+ self.linear2 = nn.Linear(hidden_dim, input_dim)
90
+
91
+ def forward(self, x: torch.Tensor) -> torch.Tensor:
92
+ residual = x
93
+ x = self.linear1(x)
94
+ x = self.activation(x)
95
+ x = self.linear2(x)
96
+ return x + residual
97
+
98
+ class TabularLearnableProcessor(nn.Module):
99
+ def __init__(self, num_features: int = 1):
100
+ super().__init__()
101
+ # Each cell is processed individually as a scalar
102
+ self.input_proj = nn.Linear(num_features, 192)
103
+ self.nodes = nn.Sequential(
104
+ nn.GELU(),
105
+ TabularBlock(192, 64),
106
+ nn.GELU(),
107
+ TabularBlock(192, 64),
108
+ nn.GELU(),
109
+ TabularBlock(192, 64),
110
+ nn.GELU(),
111
+ TabularBlock(192, 64),
112
+ nn.GELU(),
113
+ TabularBlock(192, 64),
114
+ nn.GELU(),
115
+ TabularBlock(192, 64),
116
+ nn.GELU(),
117
+ TabularBlock(192, 64),
118
+ )
119
+
120
+ def forward(self, X: np.ndarray | torch.Tensor) -> torch.Tensor:
121
+ if isinstance(X, np.ndarray):
122
+ X = torch.from_numpy(X)
123
+
124
+ param_dtype = self.input_proj.weight.dtype
125
+ X = X.to(param_dtype)
126
+
127
+ # Flatten the table - each cell becomes a separate token
128
+ # X shape: (batch_size, rows, cols) -> (batch_size * rows * cols, 1)
129
+ batch_size = X.shape[0]
130
+ X_flat = X.reshape(-1, 1) # Flatten to individual cells
131
+
132
+ # RMS normalization per cell for stability
133
+ # X_normalized = X_flat * torch.rsqrt(X_flat.pow(2) + 1e-5)
134
+
135
+ projected = self.input_proj(X_flat)
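+ # projected: (batch_size * rows * cols, 192) -- one learned embedding per table cell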
136
+ # res = self.nodes(projected)
137
+ return projected
138
+
139
+ class Qwen_2_5_TabularProcessor(ProcessorMixin):
140
+ r"""
141
+ Constructs a Qwen2.5 tabular processor which wraps a tabular preprocessor and a Qwen2 tokenizer into a single processor.
142
+ [`Qwen_2_5_TabularProcessor`] offers the functionalities of [`TabularPreprocessor`] and [`Qwen2TokenizerFast`]. See the
143
+ [`~Qwen_2_5_TabularProcessor.__call__`] and [`~Qwen_2_5_TabularProcessor.decode`] for more information.
144
+ Args:
145
+ tabular_processor ([`TabularPreprocessor`], *optional*):
146
+ The tabular preprocessor is a required input.
147
+ tokenizer ([`Qwen2TokenizerFast`], *optional*):
148
+ The tokenizer is a required input.
149
+ chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
150
+ in a chat into a tokenizable string.
151
+ """
152
+
153
+ attributes = ["tokenizer"]
154
+ valid_kwargs = ["chat_template"]
155
+
156
+ tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
157
+
158
+ def __init__(
159
+ self,
160
+ tabular_processor: TabularPreprocessor | None = None,
161
+ tokenizer=None,
162
+ chat_template=None,
163
+ **kwargs,
164
+ ):
165
+ self.tabular_token = (
166
+ "<|tabular_pad|>"
167
+ if not hasattr(tokenizer, "tabular_token")
168
+ else tokenizer.tabular_token
169
+ )
170
+ self.tabular_processor = tabular_processor
171
+ super().__init__(tokenizer, chat_template=chat_template)
172
+
173
+ def __call__(
174
+ self,
175
+ tabular_values: np.ndarray | torch.Tensor | None = None,
176
+ text: TextInput | PreTokenizedInput | list[TextInput] | list[PreTokenizedInput] | None = None,
177
+ **kwargs: Unpack[TabularProcessorKwargs],
178
+ ) -> BatchFeature:
179
+ """
180
+ Main method to prepare one or several sequence(s) and table(s) for the model. This method forwards the `text`
181
+ and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode
182
+ the text. To prepare the tabular inputs, this method forwards the `tabular_values` argument to the tabular
183
+ preprocessor's `__call__` if `tabular_values` is not `None`.
184
+
185
+ Args:
186
+ tabular_values (`np.ndarray`, `torch.Tensor`, `List[np.ndarray]`, `List[torch.Tensor]`):
187
+ The table or batch of tables to be prepared. Each table can be a 2D NumPy array or PyTorch
188
+ tensor of shape `(rows, cols)`.
189
+ text (`str`, `List[str]`, `List[List[str]]`):
190
+ The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
191
+ (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
192
+ `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
196
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
197
+ If set, will return tensors of a particular framework. Acceptable values are:
198
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
199
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
200
+ - `'np'`: Return NumPy `np.ndarray` objects.
201
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
202
+
203
+ Returns:
204
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
205
+
206
+ - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
207
+ - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
208
+ `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
209
+ `None`).
210
+ - **tabular_values** -- Tabular values to be fed to a model. Returned when `tabular_values` is not `None`.
215
+ """
216
+ # print("Tabular values: ", tabular_values)
217
+ if tabular_values is not None:
218
+ tabular_inputs = self.tabular_processor(tabular_values)
219
+ else:
220
+ print("Warning! No tabular values provided!")
221
+ tabular_inputs = {}
222
+
223
+ if not isinstance(text, list):
224
+ text = [text]
225
+
226
+ if tabular_values is not None:
227
+ index = 0
228
+ for i in range(len(text)):
229
+ while self.tabular_token in text[i]:
230
+ # Each cell becomes a token: num_tokens = rows * cols
231
+ table_shape = tabular_inputs["tabular_values"][index].shape
232
+ rows, cols = table_shape[0], table_shape[1]
233
+ # Build pattern: for each row, add col tokens + row separator
234
+ row_pattern = "<|placeholder|>" * cols + "<|tabular_row|>"
235
+ replacement = row_pattern * rows
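+ # e.g. a 4x6 table expands to ("<|placeholder|>" * 6 + "<|tabular_row|>") * 4; the placeholders are swapped back to the tabular token below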
236
+ text[i] = text[i].replace(
237
+ self.tabular_token,
238
+ replacement,
239
+ 1,
240
+ )
241
+ index += 1
242
+ text[i] = text[i].replace("<|placeholder|>", self.tabular_token)
243
+
244
+ text_inputs = self.tokenizer(text, **kwargs)
245
+ return BatchFeature(data={**text_inputs, **tabular_inputs})
246
+
247
+ def batch_decode(self, *args, **kwargs):
248
+ """
249
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
250
+ refer to the docstring of this method for more information.
251
+ """
252
+ return self.tokenizer.batch_decode(*args, **kwargs)
253
+
254
+ def decode(self, *args, **kwargs):
255
+ """
256
+ This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
257
+ the docstring of this method for more information.
258
+ """
259
+ return self.tokenizer.decode(*args, **kwargs)
260
+
261
+ def post_process_image_text_to_text(
262
+ self,
263
+ generated_outputs,
264
+ skip_special_tokens=True,
265
+ clean_up_tokenization_spaces=False,
266
+ **kwargs,
267
+ ):
268
+ """
269
+ Post-process the output of the model to decode the text.
270
+
271
+ Args:
272
+ generated_outputs (`torch.Tensor` or `np.ndarray`):
273
+ The output of the model `generate` function. The output is expected to be a tensor of shape `(batch_size, sequence_length)`
274
+ or `(sequence_length,)`.
275
+ skip_special_tokens (`bool`, *optional*, defaults to `True`):
276
+ Whether or not to remove special tokens in the output. Argument passed to the tokenizer's `batch_decode` method.
277
+ clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
278
+ Whether or not to clean up the tokenization spaces. Argument passed to the tokenizer's `batch_decode` method.
279
+ **kwargs:
280
+ Additional arguments to be passed to the tokenizer's `batch_decode method`.
281
+
282
+ Returns:
283
+ `List[str]`: The decoded text.
284
+ """
285
+ return self.tokenizer.batch_decode(
286
+ generated_outputs,
287
+ skip_special_tokens=skip_special_tokens,
288
+ clean_up_tokenization_spaces=clean_up_tokenization_spaces,
289
+ **kwargs,
290
+ )
291
+
292
+ @property
293
+ def model_input_names(self):
294
+ tokenizer_input_names = self.tokenizer.model_input_names
295
+ tabular_processor_input_names = self.tabular_processor.model_input_names if hasattr(self.tabular_processor, 'model_input_names') else []
296
+ names_from_processor = list(
297
+ dict.fromkeys(tokenizer_input_names + tabular_processor_input_names)
298
+ )
299
+ return names_from_processor + ["tabular_values"]
300
+
301
+
302
+ class Qwen2_5_TabularModel(Qwen2_5_VLForConditionalGeneration):
303
+ def __init__(self, *args, **kwargs):
304
+ super().__init__(*args, **kwargs)
305
+ self.tabular_processor = TabularLearnableProcessor(num_features=1)
306
+
307
+ self.tabular_projection = nn.Sequential(
308
+ nn.Linear(192, self.config.hidden_size),
309
+ nn.ReLU(),
310
+ TabularBlock(self.config.hidden_size, self.config.hidden_size),
311
+ nn.ReLU(),
312
+ TabularBlock(self.config.hidden_size, self.config.hidden_size),
313
+ nn.ReLU(),
314
+ TabularBlock(self.config.hidden_size, self.config.hidden_size),
315
+ )
316
+
317
+ def forward(
318
+ self,
319
+ input_ids: Optional[torch.LongTensor] = None,
320
+ attention_mask: Optional[torch.Tensor] = None,
321
+ position_ids: Optional[torch.LongTensor] = None,
322
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
323
+ inputs_embeds: Optional[torch.FloatTensor] = None,
324
+ labels: Optional[torch.LongTensor] = None,
325
+ use_cache: Optional[bool] = None,
326
+ output_attentions: Optional[bool] = None,
327
+ output_hidden_states: Optional[bool] = None,
328
+ return_dict: Optional[bool] = None,
329
+ pixel_values: Optional[torch.Tensor] = None,
330
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
331
+ tabular_values: Optional[torch.Tensor] = None,
332
+ image_grid_thw: Optional[torch.LongTensor] = None,
333
+ video_grid_thw: Optional[torch.LongTensor] = None,
334
+ rope_deltas: Optional[torch.LongTensor] = None,
335
+ cache_position: Optional[torch.LongTensor] = None,
336
+ second_per_grid_ts: Optional[torch.Tensor] = None,
337
+ ):
338
+ r"""
339
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
340
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
341
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
342
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
343
+
344
+ Returns:
345
+
346
+ Example:
347
+
348
+ ```python
349
+ >>> from PIL import Image
350
+ >>> import requests
351
+ >>> from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
352
+
353
+ >>> model = Qwen2_5_VLForConditionalGeneration.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
354
+ >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
355
+
356
+ >>> messages = [
357
+ {
358
+ "role": "user",
359
+ "content": [
360
+ {"type": "image"},
361
+ {"type": "text", "text": "What is shown in this image?"},
362
+ ],
363
+ },
364
+ ]
365
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
366
+ >>> image = Image.open(requests.get(url, stream=True).raw)
367
+
368
+ >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
369
+ >>> inputs = processor(text=[text], images=[image])
370
+
371
+ >>> # Generate
372
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
373
+ >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
374
+ "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
375
+ ```"""
376
+
377
+ output_attentions = (
378
+ output_attentions
379
+ if output_attentions is not None
380
+ else self.config.output_attentions
381
+ )
382
+ output_hidden_states = (
383
+ output_hidden_states
384
+ if output_hidden_states is not None
385
+ else self.config.output_hidden_states
386
+ )
387
+ return_dict = (
388
+ return_dict if return_dict is not None else self.config.use_return_dict
389
+ )
390
+
391
+ if inputs_embeds is None:
392
+ inputs_embeds = self.language_model.embed_tokens(input_ids)
393
+ if pixel_values is not None:
394
+ pixel_values = pixel_values.type(self.visual.dtype)
395
+ image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
396
+ n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
397
+ n_image_features = image_embeds.shape[0]
398
+ if n_image_tokens != n_image_features:
399
+ raise ValueError(
400
+ f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
401
+ )
402
+
403
+ mask = input_ids == self.config.image_token_id
404
+ mask_unsqueezed = mask.unsqueeze(-1)
405
+ mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
406
+ image_mask = mask_expanded.to(inputs_embeds.device)
407
+
408
+ image_embeds = image_embeds.to(
409
+ inputs_embeds.device, inputs_embeds.dtype
410
+ )
411
+ inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
412
+
413
+ if pixel_values_videos is not None:
414
+ pixel_values_videos = pixel_values_videos.type(self.visual.dtype)
415
+ video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
416
+ n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
417
+ n_video_features = video_embeds.shape[0]
418
+ if n_video_tokens != n_video_features:
419
+ raise ValueError(
420
+ f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
421
+ )
422
+
423
+ mask = input_ids == self.config.video_token_id
424
+ mask_unsqueezed = mask.unsqueeze(-1)
425
+ mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
426
+ video_mask = mask_expanded.to(inputs_embeds.device)
427
+
428
+ video_embeds = video_embeds.to(
429
+ inputs_embeds.device, inputs_embeds.dtype
430
+ )
431
+ inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
432
+
433
+ if tabular_values is not None:
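+ # Mirror the image/video paths above: embed each table cell, project it to the LM hidden size, and scatter the embeddings into the tabular token positions.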
434
+ proc_feats = self.tabular_processor(tabular_values.to(self.device, torch.float32))
435
+ proc_feats = proc_feats.to(inputs_embeds.dtype).to(self.device)
436
+ tabular_embeds = self.tabular_projection(proc_feats)
437
+
438
+ tabular_token_id = getattr(self.config, "tabular_token_id", None)
439
+ if tabular_token_id is None:
440
+ raise ValueError("Tabular token id (config.tabular_token_id) is not set.")
441
+ mask = (input_ids == int(tabular_token_id))
442
+
443
+ tabular_no_mask = mask.sum().item()
444
+ if tabular_no_mask != tabular_embeds.shape[0]:
445
+ raise ValueError(
446
+ f"Tabular features and tabular tokens do not match: tokens: {tabular_no_mask}, features {tabular_embeds.shape[0]}"
447
+ )
448
+
449
+ mask_unsqueezed = mask.unsqueeze(-1)
450
+ mask_expanded = mask_unsqueezed.expand_as(inputs_embeds)
451
+ tabular_mask = mask_expanded.to(inputs_embeds.device)
452
+ tabular_embeds = tabular_embeds.to(
453
+ inputs_embeds.device, inputs_embeds.dtype
454
+ )
455
+ inputs_embeds = inputs_embeds.masked_scatter(
456
+ tabular_mask, tabular_embeds
457
+ )
458
+
459
+ if attention_mask is not None:
460
+ attention_mask = attention_mask.to(inputs_embeds.device)
461
+
462
+ # if we get 4D attention mask we cannot calculate rope deltas anymore. TODO @raushan fixme
463
+ if position_ids is None and (
464
+ attention_mask is None or attention_mask.ndim == 2
465
+ ):
466
+ # calculate RoPE index once per generation in the pre-fill stage only
467
+ if (
468
+ (cache_position is not None and cache_position[0] == 0)
469
+ or self.rope_deltas is None
470
+ or (past_key_values is None or past_key_values.get_seq_length() == 0)
471
+ ):
472
+ position_ids, rope_deltas = self.model.get_rope_index(
473
+ input_ids,
474
+ image_grid_thw,
475
+ video_grid_thw,
476
+ second_per_grid_ts,
477
+ attention_mask,
478
+ )
479
+ self.rope_deltas = rope_deltas
480
+ # then use the prev pre-calculated rope-deltas to get the correct position ids
481
+ else:
482
+ batch_size, seq_length, _ = inputs_embeds.shape
483
+ delta = (
484
+ (cache_position[0] + self.rope_deltas).to(inputs_embeds.device)
485
+ if cache_position is not None
486
+ else 0
487
+ )
488
+ position_ids = torch.arange(seq_length, device=inputs_embeds.device)
489
+ position_ids = position_ids.view(1, -1).expand(batch_size, -1)
490
+ if cache_position is not None: # otherwise `deltas` is an int `0`
491
+ delta = delta.repeat_interleave(batch_size // delta.shape[0], dim=0)
492
+ position_ids = position_ids.add(delta)
493
+ position_ids = position_ids.unsqueeze(0).expand(3, -1, -1)
494
+
495
+ outputs = self.model(
496
+ input_ids=None,
497
+ position_ids=position_ids,
498
+ attention_mask=attention_mask,
499
+ past_key_values=past_key_values,
500
+ inputs_embeds=inputs_embeds,
501
+ use_cache=use_cache,
502
+ output_attentions=output_attentions,
503
+ output_hidden_states=output_hidden_states,
504
+ return_dict=return_dict,
505
+ cache_position=cache_position,
506
+ )
507
+
508
+ hidden_states = outputs[0]
509
+ logits = self.lm_head(hidden_states)
510
+
511
+ loss = None
512
+ if labels is not None:
513
+ # Upcast to float if we need to compute the loss to avoid potential precision issues
514
+ logits = logits.float()
515
+ # Shift so that tokens < n predict n
516
+ shift_logits = logits[..., :-1, :].contiguous()
517
+ shift_labels = labels[..., 1:].contiguous()
518
+ # Flatten the tokens
519
+ loss_fct = CrossEntropyLoss()
520
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
521
+ shift_labels = shift_labels.view(-1)
522
+ # Enable model parallelism
523
+ shift_labels = shift_labels.to(shift_logits.device)
524
+ loss = loss_fct(shift_logits, shift_labels)
525
+
526
+ if not return_dict:
527
+ output = (logits,) + outputs[1:]
528
+ return (loss,) + output if loss is not None else output
529
+
530
+ return Qwen2_5_VLCausalLMOutputWithPast(
531
+ loss=loss,
532
+ logits=logits,
533
+ past_key_values=outputs.past_key_values,
534
+ hidden_states=outputs.hidden_states,
535
+ attentions=outputs.attentions,
536
+ rope_deltas=self.rope_deltas,
537
+ )
538
+
539
+ def prepare_inputs_for_generation(
540
+ self,
541
+ input_ids,
542
+ past_key_values=None,
543
+ attention_mask=None,
544
+ inputs_embeds=None,
545
+ cache_position=None,
546
+ position_ids=None,
547
+ use_cache=True,
548
+ pixel_values=None,
549
+ pixel_values_videos=None,
550
+ image_grid_thw=None,
551
+ video_grid_thw=None,
552
+ second_per_grid_ts=None,
553
+ **kwargs,
554
+ ):
555
+ # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
556
+
557
+ model_inputs = super().prepare_inputs_for_generation(
558
+ input_ids,
559
+ past_key_values=past_key_values,
560
+ attention_mask=attention_mask,
561
+ inputs_embeds=inputs_embeds,
562
+ cache_position=cache_position,
563
+ position_ids=position_ids,
564
+ pixel_values=pixel_values,
565
+ pixel_values_videos=pixel_values_videos,
566
+ image_grid_thw=image_grid_thw,
567
+ video_grid_thw=video_grid_thw,
568
+ second_per_grid_ts=second_per_grid_ts,
569
+ use_cache=use_cache,
570
+ **kwargs,
571
+ )
572
+
573
+ # Qwen2-5-VL position_ids are prepared with rope_deltas in forward
574
+ model_inputs["position_ids"] = None
575
+
576
+ if cache_position[0] != 0:
577
+ model_inputs["pixel_values"] = None
578
+ model_inputs["pixel_values_videos"] = None
579
+ model_inputs["tabular_values"] = None
580
+
581
+ return model_inputs
582
+
583
+ if __name__ == "__main__":
584
+ template = """{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% set tabular_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif content['type'] == 'tabular' or 'tabular' in content %}{% set tabular_count.value = tabular_count.value + 1 %}{% if add_vision_id %}Table {{ tabular_count.value }}: {% endif %}<|vision_start|><|tabular_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"""
585
+
586
+ MODE = "reconstruction_variable"
587
+
588
+ model_name_trained = f"./models/Tabular-LM-v0.1-{MODE}"
589
+ # model_name_trained = "Qwen/Qwen2.5-VL-3B-Instruct"
590
+ # model_name_trained = "./models/checkpoints/checkpoint-1000"
591
+
592
+ tabular_processor = TabularPreprocessor()
593
+ qwen_tabular_processor = Qwen_2_5_TabularProcessor(
594
+ tabular_processor=tabular_processor,
595
+ tokenizer=Qwen2TokenizerFast.from_pretrained(model_name_trained),
596
+ )
597
+
598
+ qwen_tabular_processor.tabular_token = "<|tabular_pad|>"
599
+ qwen_tabular_processor.tokenizer.add_tokens([qwen_tabular_processor.tabular_token, "<|tabular_row|>"])
600
+ qwen_tabular_processor.tokenizer.chat_template = template
601
+
602
+ tabular_data = np.random.randn(4,6).round(2)
603
+
604
+ messages = [
605
+ {
606
+ "role": "user",
607
+ "content": [
608
+ {"type": "text", "text": "This is a table."},
609
+ {"index": 0, "type": "tabular"},
610
+ {"type": "text", "text": "Give me its content in csv format."},
611
+ # {"type": "text", "text": "Give me a statistical summary."},
612
+ # {"type": "text", "text": "Give me the correlation matrix in csv format"},
613
+ # {"type": "text", "text": "Give me the content of the table"},
614
+ ],
615
+ }
616
+ ]
617
+
618
+ preprocessed = qwen_tabular_processor.tokenizer.apply_chat_template(
619
+ messages, tokenize=False
620
+ )
621
+
622
+ processed = qwen_tabular_processor(
623
+ [tabular_data], text=preprocessed, return_tensors="pt"
624
+ )
625
+
626
+ model = Qwen2_5_TabularModel.from_pretrained(model_name_trained).to("cuda:1")
627
+ model.config.tabular_token_id = (
628
+ qwen_tabular_processor.tokenizer.convert_tokens_to_ids("<|tabular_pad|>")
629
+ )
630
+ model.config.tabular_row_token_id = (
631
+ qwen_tabular_processor.tokenizer.convert_tokens_to_ids("<|tabular_row|>")
632
+ )
633
+
634
+ processed = {key: value.to("cuda:1") for key, value in processed.items()}
635
+
636
+ res = model.generate(**processed, max_new_tokens=512, do_sample=False)
637
+ generated_ids = [output_ids[len(input_ids):] for input_ids, output_ids in zip(processed["input_ids"], res, strict=True)]
638
+ output_text = qwen_tabular_processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
639
+
640
+ print("="*80)
641
+ print("Original table:")
642
+ print(tabular_data)
643
+ print("\nModel output:")
644
+ print(output_text[0])
645
+ print("="*80)
646
+
647
+ if MODE in ["reconstruction", "reconstruction_variable"]:
648
+ # Try to evaluate reconstruction quality
649
+ from utils import text_to_array
650
+ generated_array = text_to_array(output_text[0])
651
+
652
+ # Round original to match expected precision
653
+ tabular_data_rounded = tabular_data.round(1)
654
+
655
+ print("\nReconstruction evaluation:")
656
+ print(f"Original shape: {tabular_data_rounded.shape}")
657
+ print(f"Generated shape: {generated_array.shape}")
658
+
659
+ if generated_array.shape == tabular_data_rounded.shape:
660
+ mse = np.mean((generated_array - tabular_data_rounded) ** 2)
661
+ mae = np.mean(np.abs(generated_array - tabular_data_rounded))
662
+ print(f"MSE: {mse:.4f}")
663
+ print(f"MAE: {mae:.4f}")
664
+ else:
665
+ print("Shape mismatch - cannot compute metrics")
666
+
667
+ if MODE == "summary":
668
+ summary_parts = []
669
+
670
+ # Basic statistics
671
+ summary_parts.append(f"Mean: {tabular_data.mean():.2f}")
672
+ summary_parts.append(f"Median: {np.median(tabular_data):.2f}")
673
+ summary_parts.append(f"Std: {tabular_data.std():.2f}")
674
+ summary_parts.append(f"Min: {tabular_data.min():.2f}")
675
+ summary_parts.append(f"Max: {tabular_data.max():.2f}")
676
+
677
+ # Means per row
678
+ row_means = tabular_data.mean(axis=1)
679
+ row_means_str = ", ".join([f"{m:.2f}" for m in row_means])
680
+ summary_parts.append(f"Row means: [{row_means_str}]")
681
+
682
+ # Means per column
683
+ col_means = tabular_data.mean(axis=0)
684
+ col_means_str = ", ".join([f"{m:.2f}" for m in col_means])
685
+ summary_parts.append(f"Column means: [{col_means_str}]")
686
+
687
+ # Correlation matrix (if there is more than one column)
688
+ if tabular_data.shape[1] > 1:
689
+ try:
690
+ corrcoef = np.corrcoef(tabular_data.T)
691
+ corr_str = "Correlation matrix:\n"
692
+ for i in range(corrcoef.shape[0]):
693
+ corr_row = ", ".join([f"{corrcoef[i, j]:.2f}" for j in range(corrcoef.shape[1])])
694
+ corr_str += f" [{corr_row}]\n"
695
+ summary_parts.append(corr_str.strip())
696
+ except Exception:
697
+ pass
698
+
699
+ summary_text = "\n".join(summary_parts)
700
+ print("True summary:")
701
+ print(summary_text)
added_tokens.json ADDED
@@ -0,0 +1,24 @@
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
config.json ADDED
@@ -0,0 +1,134 @@
1
+ {
2
+ "architectures": [
3
+ "Qwen2_5_VLForConditionalGeneration"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "dtype": "float32",
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 3584,
11
+ "image_token_id": 151655,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 18944,
14
+ "max_position_embeddings": 128000,
15
+ "max_window_layers": 28,
16
+ "model_type": "qwen2_5_vl",
17
+ "num_attention_heads": 28,
18
+ "num_hidden_layers": 28,
19
+ "num_key_value_heads": 4,
20
+ "rms_norm_eps": 1e-06,
21
+ "rope_scaling": {
22
+ "mrope_section": [
23
+ 16,
24
+ 24,
25
+ 24
26
+ ],
27
+ "rope_type": "default",
28
+ "type": "default"
29
+ },
30
+ "rope_theta": 1000000.0,
31
+ "sliding_window": 32768,
32
+ "tabular_row_token_id": 151666,
33
+ "tabular_token_id": 151665,
34
+ "text_config": {
35
+ "_name_or_path": "Qwen/Qwen2.5-VL-7B-Instruct",
36
+ "architectures": [
37
+ "Qwen2_5_TabularModel"
38
+ ],
39
+ "attention_dropout": 0.0,
40
+ "dtype": "float32",
41
+ "eos_token_id": 151645,
42
+ "hidden_act": "silu",
43
+ "hidden_size": 3584,
44
+ "initializer_range": 0.02,
45
+ "intermediate_size": 18944,
46
+ "layer_types": [
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention",
68
+ "full_attention",
69
+ "full_attention",
70
+ "full_attention",
71
+ "full_attention",
72
+ "full_attention",
73
+ "full_attention",
74
+ "full_attention"
75
+ ],
76
+ "max_position_embeddings": 128000,
77
+ "max_window_layers": 28,
78
+ "model_type": "qwen2_5_vl_text",
79
+ "num_attention_heads": 28,
80
+ "num_hidden_layers": 28,
81
+ "num_key_value_heads": 4,
82
+ "pad_token_id": 151643,
83
+ "rms_norm_eps": 1e-06,
84
+ "rope_scaling": {
85
+ "mrope_section": [
86
+ 16,
87
+ 24,
88
+ 24
89
+ ],
90
+ "rope_type": "default",
91
+ "type": "default"
92
+ },
93
+ "rope_theta": 1000000.0,
94
+ "sliding_window": null,
95
+ "use_cache": false,
96
+ "use_sliding_window": false,
97
+ "vision_token_id": 151654,
98
+ "vocab_size": 152064
99
+ },
100
+ "tie_word_embeddings": false,
101
+ "transformers_version": "4.57.1",
102
+ "use_cache": true,
103
+ "use_sliding_window": false,
104
+ "video_token_id": 151656,
105
+ "vision_config": {
106
+ "depth": 32,
107
+ "dtype": "float32",
108
+ "fullatt_block_indexes": [
109
+ 7,
110
+ 15,
111
+ 23,
112
+ 31
113
+ ],
114
+ "hidden_act": "silu",
115
+ "hidden_size": 1280,
116
+ "in_channels": 3,
117
+ "in_chans": 3,
118
+ "initializer_range": 0.02,
119
+ "intermediate_size": 3420,
120
+ "model_type": "qwen2_5_vl",
121
+ "num_heads": 16,
122
+ "out_hidden_size": 3584,
123
+ "patch_size": 14,
124
+ "spatial_merge_size": 2,
125
+ "spatial_patch_size": 14,
126
+ "temporal_patch_size": 2,
127
+ "tokens_per_second": 2,
128
+ "window_size": 112
129
+ },
130
+ "vision_end_token_id": 151653,
131
+ "vision_start_token_id": 151652,
132
+ "vision_token_id": 151654,
133
+ "vocab_size": 152064
134
+ }
generation_config.json ADDED
@@ -0,0 +1,11 @@
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151643
6
+ ],
7
+ "pad_token_id": 151643,
8
+ "repetition_penalty": 1.05,
9
+ "temperature": 1e-06,
10
+ "transformers_version": "4.57.1"
11
+ }
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:765c46bcc1cefe87737ef64b0ba4516f5d4edff19feda16ff05cdcf99f1da101
3
+ size 4952311608
model-00002-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f93970df45d64d405983e24e8d8b8a32d968b07cb5ee343f15bed20334179b4
3
+ size 4984124272
model-00003-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3cc9536831500b8cbc21fbcf45d3fa8a53af99eb4c8c8031fa6efc908803084
3
+ size 4932743936
model-00004-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53d4d96e7d03aa6616f5d5e6f5cca339733f3b8aa3c33362d5a992b42d0bbd74
3
+ size 4998852296
model-00005-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f77afdeb3d99e17978c49c2eaf75a53a61fe4f926660eb54fb79676db9499c4d
3
+ size 4984124336
model-00006-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75b0a8e9dba356744aa0df4504fb99ca1a2221373aed91ed727511ea8e4a4e16
3
+ size 4932743992
model-00007-of-00007.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f682716b0a35062921038bf22590e08ac02559742f3dc21e1f55b89f893c2f5
3
+ size 3695682720
model.safetensors.index.json ADDED
@@ -0,0 +1,781 @@
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 8370124416,
4
+ "total_size": 33480497664
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00007-of-00007.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00007.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00002-of-00007.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00007.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00007.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00007.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00007.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00007.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00007.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00002-of-00007.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
23
+ "model.layers.1.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
24
+ "model.layers.1.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
25
+ "model.layers.1.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
26
+ "model.layers.1.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
27
+ "model.layers.1.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
28
+ "model.layers.1.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
29
+ "model.layers.1.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
30
+ "model.layers.1.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
31
+ "model.layers.1.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
32
+ "model.layers.1.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
33
+ "model.layers.10.input_layernorm.weight": "model-00004-of-00007.safetensors",
34
+ "model.layers.10.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
35
+ "model.layers.10.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
36
+ "model.layers.10.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
38
+ "model.layers.10.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
39
+ "model.layers.10.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
40
+ "model.layers.10.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
41
+ "model.layers.10.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
42
+ "model.layers.10.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
43
+ "model.layers.10.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00004-of-00007.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
50
+ "model.layers.11.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
51
+ "model.layers.11.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
52
+ "model.layers.11.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
53
+ "model.layers.11.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
55
+ "model.layers.11.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
56
+ "model.layers.11.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
57
+ "model.layers.12.input_layernorm.weight": "model-00004-of-00007.safetensors",
58
+ "model.layers.12.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
59
+ "model.layers.12.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
60
+ "model.layers.12.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
61
+ "model.layers.12.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
62
+ "model.layers.12.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
65
+ "model.layers.12.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
66
+ "model.layers.12.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
67
+ "model.layers.12.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00004-of-00007.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
74
+ "model.layers.13.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
77
+ "model.layers.13.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
79
+ "model.layers.13.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
80
+ "model.layers.13.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
81
+ "model.layers.14.input_layernorm.weight": "model-00004-of-00007.safetensors",
82
+ "model.layers.14.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
83
+ "model.layers.14.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
84
+ "model.layers.14.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
85
+ "model.layers.14.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
86
+ "model.layers.14.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
87
+ "model.layers.14.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
88
+ "model.layers.14.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
89
+ "model.layers.14.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
90
+ "model.layers.14.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
91
+ "model.layers.14.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
92
+ "model.layers.14.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
93
+ "model.layers.15.input_layernorm.weight": "model-00004-of-00007.safetensors",
94
+ "model.layers.15.mlp.down_proj.weight": "model-00004-of-00007.safetensors",
95
+ "model.layers.15.mlp.gate_proj.weight": "model-00004-of-00007.safetensors",
96
+ "model.layers.15.mlp.up_proj.weight": "model-00004-of-00007.safetensors",
97
+ "model.layers.15.post_attention_layernorm.weight": "model-00004-of-00007.safetensors",
98
+ "model.layers.15.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
99
+ "model.layers.15.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
100
+ "model.layers.15.self_attn.o_proj.weight": "model-00004-of-00007.safetensors",
101
+ "model.layers.15.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
102
+ "model.layers.15.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
103
+ "model.layers.15.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
104
+ "model.layers.15.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
105
+ "model.layers.16.input_layernorm.weight": "model-00005-of-00007.safetensors",
106
+ "model.layers.16.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
107
+ "model.layers.16.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
108
+ "model.layers.16.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
109
+ "model.layers.16.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
110
+ "model.layers.16.self_attn.k_proj.bias": "model-00004-of-00007.safetensors",
111
+ "model.layers.16.self_attn.k_proj.weight": "model-00004-of-00007.safetensors",
112
+ "model.layers.16.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
113
+ "model.layers.16.self_attn.q_proj.bias": "model-00004-of-00007.safetensors",
114
+ "model.layers.16.self_attn.q_proj.weight": "model-00004-of-00007.safetensors",
115
+ "model.layers.16.self_attn.v_proj.bias": "model-00004-of-00007.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00004-of-00007.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00005-of-00007.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
122
+ "model.layers.17.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
123
+ "model.layers.17.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
124
+ "model.layers.17.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
125
+ "model.layers.17.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
126
+ "model.layers.17.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
127
+ "model.layers.17.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
128
+ "model.layers.17.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
129
+ "model.layers.18.input_layernorm.weight": "model-00005-of-00007.safetensors",
130
+ "model.layers.18.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
131
+ "model.layers.18.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
132
+ "model.layers.18.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
133
+ "model.layers.18.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
134
+ "model.layers.18.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
135
+ "model.layers.18.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
136
+ "model.layers.18.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
137
+ "model.layers.18.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
138
+ "model.layers.18.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
139
+ "model.layers.18.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
140
+ "model.layers.18.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
141
+ "model.layers.19.input_layernorm.weight": "model-00005-of-00007.safetensors",
142
+ "model.layers.19.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
143
+ "model.layers.19.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
144
+ "model.layers.19.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
145
+ "model.layers.19.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
146
+ "model.layers.19.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
147
+ "model.layers.19.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
148
+ "model.layers.19.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
149
+ "model.layers.19.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
150
+ "model.layers.19.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
151
+ "model.layers.19.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
152
+ "model.layers.19.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
153
+ "model.layers.2.input_layernorm.weight": "model-00002-of-00007.safetensors",
154
+ "model.layers.2.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
155
+ "model.layers.2.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
156
+ "model.layers.2.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
157
+ "model.layers.2.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
158
+ "model.layers.2.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
159
+ "model.layers.2.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
160
+ "model.layers.2.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
161
+ "model.layers.2.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
162
+ "model.layers.2.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
163
+ "model.layers.2.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
164
+ "model.layers.2.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
165
+ "model.layers.20.input_layernorm.weight": "model-00005-of-00007.safetensors",
166
+ "model.layers.20.mlp.down_proj.weight": "model-00005-of-00007.safetensors",
167
+ "model.layers.20.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
168
+ "model.layers.20.mlp.up_proj.weight": "model-00005-of-00007.safetensors",
169
+ "model.layers.20.post_attention_layernorm.weight": "model-00005-of-00007.safetensors",
170
+ "model.layers.20.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
171
+ "model.layers.20.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
172
+ "model.layers.20.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
173
+ "model.layers.20.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
174
+ "model.layers.20.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
175
+ "model.layers.20.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
176
+ "model.layers.20.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
177
+ "model.layers.21.input_layernorm.weight": "model-00006-of-00007.safetensors",
178
+ "model.layers.21.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
179
+ "model.layers.21.mlp.gate_proj.weight": "model-00005-of-00007.safetensors",
180
+ "model.layers.21.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
181
+ "model.layers.21.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
182
+ "model.layers.21.self_attn.k_proj.bias": "model-00005-of-00007.safetensors",
183
+ "model.layers.21.self_attn.k_proj.weight": "model-00005-of-00007.safetensors",
184
+ "model.layers.21.self_attn.o_proj.weight": "model-00005-of-00007.safetensors",
185
+ "model.layers.21.self_attn.q_proj.bias": "model-00005-of-00007.safetensors",
186
+ "model.layers.21.self_attn.q_proj.weight": "model-00005-of-00007.safetensors",
187
+ "model.layers.21.self_attn.v_proj.bias": "model-00005-of-00007.safetensors",
188
+ "model.layers.21.self_attn.v_proj.weight": "model-00005-of-00007.safetensors",
189
+ "model.layers.22.input_layernorm.weight": "model-00006-of-00007.safetensors",
190
+ "model.layers.22.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
191
+ "model.layers.22.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
192
+ "model.layers.22.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
193
+ "model.layers.22.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
194
+ "model.layers.22.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
195
+ "model.layers.22.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
196
+ "model.layers.22.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
197
+ "model.layers.22.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
198
+ "model.layers.22.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
199
+ "model.layers.22.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
200
+ "model.layers.22.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
201
+ "model.layers.23.input_layernorm.weight": "model-00006-of-00007.safetensors",
202
+ "model.layers.23.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
203
+ "model.layers.23.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
204
+ "model.layers.23.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
205
+ "model.layers.23.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
206
+ "model.layers.23.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
207
+ "model.layers.23.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
208
+ "model.layers.23.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
209
+ "model.layers.23.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
210
+ "model.layers.23.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
211
+ "model.layers.23.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
212
+ "model.layers.23.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
213
+ "model.layers.24.input_layernorm.weight": "model-00006-of-00007.safetensors",
214
+ "model.layers.24.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
215
+ "model.layers.24.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
216
+ "model.layers.24.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
217
+ "model.layers.24.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
218
+ "model.layers.24.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
219
+ "model.layers.24.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
220
+ "model.layers.24.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
221
+ "model.layers.24.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
222
+ "model.layers.24.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
223
+ "model.layers.24.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
224
+ "model.layers.24.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
225
+ "model.layers.25.input_layernorm.weight": "model-00006-of-00007.safetensors",
226
+ "model.layers.25.mlp.down_proj.weight": "model-00006-of-00007.safetensors",
227
+ "model.layers.25.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
228
+ "model.layers.25.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
229
+ "model.layers.25.post_attention_layernorm.weight": "model-00006-of-00007.safetensors",
230
+ "model.layers.25.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
231
+ "model.layers.25.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
232
+ "model.layers.25.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
233
+ "model.layers.25.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
234
+ "model.layers.25.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
235
+ "model.layers.25.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
236
+ "model.layers.25.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
237
+ "model.layers.26.input_layernorm.weight": "model-00007-of-00007.safetensors",
238
+ "model.layers.26.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
239
+ "model.layers.26.mlp.gate_proj.weight": "model-00006-of-00007.safetensors",
240
+ "model.layers.26.mlp.up_proj.weight": "model-00006-of-00007.safetensors",
241
+ "model.layers.26.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
242
+ "model.layers.26.self_attn.k_proj.bias": "model-00006-of-00007.safetensors",
243
+ "model.layers.26.self_attn.k_proj.weight": "model-00006-of-00007.safetensors",
244
+ "model.layers.26.self_attn.o_proj.weight": "model-00006-of-00007.safetensors",
245
+ "model.layers.26.self_attn.q_proj.bias": "model-00006-of-00007.safetensors",
246
+ "model.layers.26.self_attn.q_proj.weight": "model-00006-of-00007.safetensors",
247
+ "model.layers.26.self_attn.v_proj.bias": "model-00006-of-00007.safetensors",
248
+ "model.layers.26.self_attn.v_proj.weight": "model-00006-of-00007.safetensors",
249
+ "model.layers.27.input_layernorm.weight": "model-00007-of-00007.safetensors",
250
+ "model.layers.27.mlp.down_proj.weight": "model-00007-of-00007.safetensors",
251
+ "model.layers.27.mlp.gate_proj.weight": "model-00007-of-00007.safetensors",
252
+ "model.layers.27.mlp.up_proj.weight": "model-00007-of-00007.safetensors",
253
+ "model.layers.27.post_attention_layernorm.weight": "model-00007-of-00007.safetensors",
254
+ "model.layers.27.self_attn.k_proj.bias": "model-00007-of-00007.safetensors",
255
+ "model.layers.27.self_attn.k_proj.weight": "model-00007-of-00007.safetensors",
256
+ "model.layers.27.self_attn.o_proj.weight": "model-00007-of-00007.safetensors",
257
+ "model.layers.27.self_attn.q_proj.bias": "model-00007-of-00007.safetensors",
258
+ "model.layers.27.self_attn.q_proj.weight": "model-00007-of-00007.safetensors",
259
+ "model.layers.27.self_attn.v_proj.bias": "model-00007-of-00007.safetensors",
260
+ "model.layers.27.self_attn.v_proj.weight": "model-00007-of-00007.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00002-of-00007.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
266
+ "model.layers.3.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
269
+ "model.layers.3.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
271
+ "model.layers.3.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
273
+ "model.layers.4.input_layernorm.weight": "model-00002-of-00007.safetensors",
274
+ "model.layers.4.mlp.down_proj.weight": "model-00002-of-00007.safetensors",
275
+ "model.layers.4.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
276
+ "model.layers.4.mlp.up_proj.weight": "model-00002-of-00007.safetensors",
277
+ "model.layers.4.post_attention_layernorm.weight": "model-00002-of-00007.safetensors",
278
+ "model.layers.4.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
279
+ "model.layers.4.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
280
+ "model.layers.4.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
281
+ "model.layers.4.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
282
+ "model.layers.4.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
283
+ "model.layers.4.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
285
+ "model.layers.5.input_layernorm.weight": "model-00003-of-00007.safetensors",
286
+ "model.layers.5.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
287
+ "model.layers.5.mlp.gate_proj.weight": "model-00002-of-00007.safetensors",
288
+ "model.layers.5.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
289
+ "model.layers.5.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
290
+ "model.layers.5.self_attn.k_proj.bias": "model-00002-of-00007.safetensors",
291
+ "model.layers.5.self_attn.k_proj.weight": "model-00002-of-00007.safetensors",
292
+ "model.layers.5.self_attn.o_proj.weight": "model-00002-of-00007.safetensors",
293
+ "model.layers.5.self_attn.q_proj.bias": "model-00002-of-00007.safetensors",
294
+ "model.layers.5.self_attn.q_proj.weight": "model-00002-of-00007.safetensors",
295
+ "model.layers.5.self_attn.v_proj.bias": "model-00002-of-00007.safetensors",
296
+ "model.layers.5.self_attn.v_proj.weight": "model-00002-of-00007.safetensors",
297
+ "model.layers.6.input_layernorm.weight": "model-00003-of-00007.safetensors",
298
+ "model.layers.6.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
299
+ "model.layers.6.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
300
+ "model.layers.6.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
301
+ "model.layers.6.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
302
+ "model.layers.6.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
303
+ "model.layers.6.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
304
+ "model.layers.6.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
305
+ "model.layers.6.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
306
+ "model.layers.6.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
307
+ "model.layers.6.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
308
+ "model.layers.6.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
309
+ "model.layers.7.input_layernorm.weight": "model-00003-of-00007.safetensors",
310
+ "model.layers.7.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
311
+ "model.layers.7.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
312
+ "model.layers.7.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
313
+ "model.layers.7.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
314
+ "model.layers.7.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
315
+ "model.layers.7.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
316
+ "model.layers.7.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
317
+ "model.layers.7.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
318
+ "model.layers.7.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
319
+ "model.layers.7.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
320
+ "model.layers.7.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
321
+ "model.layers.8.input_layernorm.weight": "model-00003-of-00007.safetensors",
322
+ "model.layers.8.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
323
+ "model.layers.8.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
324
+ "model.layers.8.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
325
+ "model.layers.8.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
326
+ "model.layers.8.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
327
+ "model.layers.8.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
328
+ "model.layers.8.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
329
+ "model.layers.8.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
330
+ "model.layers.8.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
331
+ "model.layers.8.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
332
+ "model.layers.8.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
333
+ "model.layers.9.input_layernorm.weight": "model-00003-of-00007.safetensors",
334
+ "model.layers.9.mlp.down_proj.weight": "model-00003-of-00007.safetensors",
335
+ "model.layers.9.mlp.gate_proj.weight": "model-00003-of-00007.safetensors",
336
+ "model.layers.9.mlp.up_proj.weight": "model-00003-of-00007.safetensors",
337
+ "model.layers.9.post_attention_layernorm.weight": "model-00003-of-00007.safetensors",
338
+ "model.layers.9.self_attn.k_proj.bias": "model-00003-of-00007.safetensors",
339
+ "model.layers.9.self_attn.k_proj.weight": "model-00003-of-00007.safetensors",
340
+ "model.layers.9.self_attn.o_proj.weight": "model-00003-of-00007.safetensors",
341
+ "model.layers.9.self_attn.q_proj.bias": "model-00003-of-00007.safetensors",
342
+ "model.layers.9.self_attn.q_proj.weight": "model-00003-of-00007.safetensors",
343
+ "model.layers.9.self_attn.v_proj.bias": "model-00003-of-00007.safetensors",
344
+ "model.layers.9.self_attn.v_proj.weight": "model-00003-of-00007.safetensors",
345
+ "model.norm.weight": "model-00007-of-00007.safetensors",
346
+ "tabular_processor.input_proj.bias": "model-00007-of-00007.safetensors",
347
+ "tabular_processor.input_proj.weight": "model-00007-of-00007.safetensors",
348
+ "tabular_processor.nodes.1.linear1.bias": "model-00007-of-00007.safetensors",
349
+ "tabular_processor.nodes.1.linear1.weight": "model-00007-of-00007.safetensors",
350
+ "tabular_processor.nodes.1.linear2.bias": "model-00007-of-00007.safetensors",
351
+ "tabular_processor.nodes.1.linear2.weight": "model-00007-of-00007.safetensors",
352
+ "tabular_processor.nodes.11.linear1.bias": "model-00007-of-00007.safetensors",
353
+ "tabular_processor.nodes.11.linear1.weight": "model-00007-of-00007.safetensors",
354
+ "tabular_processor.nodes.11.linear2.bias": "model-00007-of-00007.safetensors",
355
+ "tabular_processor.nodes.11.linear2.weight": "model-00007-of-00007.safetensors",
356
+ "tabular_processor.nodes.13.linear1.bias": "model-00007-of-00007.safetensors",
357
+ "tabular_processor.nodes.13.linear1.weight": "model-00007-of-00007.safetensors",
358
+ "tabular_processor.nodes.13.linear2.bias": "model-00007-of-00007.safetensors",
359
+ "tabular_processor.nodes.13.linear2.weight": "model-00007-of-00007.safetensors",
360
+ "tabular_processor.nodes.3.linear1.bias": "model-00007-of-00007.safetensors",
361
+ "tabular_processor.nodes.3.linear1.weight": "model-00007-of-00007.safetensors",
362
+ "tabular_processor.nodes.3.linear2.bias": "model-00007-of-00007.safetensors",
363
+ "tabular_processor.nodes.3.linear2.weight": "model-00007-of-00007.safetensors",
364
+ "tabular_processor.nodes.5.linear1.bias": "model-00007-of-00007.safetensors",
365
+ "tabular_processor.nodes.5.linear1.weight": "model-00007-of-00007.safetensors",
366
+ "tabular_processor.nodes.5.linear2.bias": "model-00007-of-00007.safetensors",
367
+ "tabular_processor.nodes.5.linear2.weight": "model-00007-of-00007.safetensors",
368
+ "tabular_processor.nodes.7.linear1.bias": "model-00007-of-00007.safetensors",
369
+ "tabular_processor.nodes.7.linear1.weight": "model-00007-of-00007.safetensors",
370
+ "tabular_processor.nodes.7.linear2.bias": "model-00007-of-00007.safetensors",
371
+ "tabular_processor.nodes.7.linear2.weight": "model-00007-of-00007.safetensors",
372
+ "tabular_processor.nodes.9.linear1.bias": "model-00007-of-00007.safetensors",
373
+ "tabular_processor.nodes.9.linear1.weight": "model-00007-of-00007.safetensors",
374
+ "tabular_processor.nodes.9.linear2.bias": "model-00007-of-00007.safetensors",
375
+ "tabular_processor.nodes.9.linear2.weight": "model-00007-of-00007.safetensors",
376
+ "tabular_projection.0.bias": "model-00007-of-00007.safetensors",
377
+ "tabular_projection.0.weight": "model-00007-of-00007.safetensors",
378
+ "tabular_projection.2.linear1.bias": "model-00007-of-00007.safetensors",
379
+ "tabular_projection.2.linear1.weight": "model-00007-of-00007.safetensors",
380
+ "tabular_projection.2.linear2.bias": "model-00007-of-00007.safetensors",
381
+ "tabular_projection.2.linear2.weight": "model-00007-of-00007.safetensors",
382
+ "tabular_projection.4.linear1.bias": "model-00007-of-00007.safetensors",
383
+ "tabular_projection.4.linear1.weight": "model-00007-of-00007.safetensors",
384
+ "tabular_projection.4.linear2.bias": "model-00007-of-00007.safetensors",
385
+ "tabular_projection.4.linear2.weight": "model-00007-of-00007.safetensors",
386
+ "tabular_projection.6.linear1.bias": "model-00007-of-00007.safetensors",
387
+ "tabular_projection.6.linear1.weight": "model-00007-of-00007.safetensors",
388
+ "tabular_projection.6.linear2.bias": "model-00007-of-00007.safetensors",
389
+ "tabular_projection.6.linear2.weight": "model-00007-of-00007.safetensors",
390
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00007.safetensors",
391
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00007.safetensors",
392
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00007.safetensors",
393
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00007.safetensors",
394
+ "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
395
+ "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
396
+ "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
397
+ "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
398
+ "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
399
+ "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
400
+ "visual.blocks.0.norm1.weight": "model-00001-of-00007.safetensors",
401
+ "visual.blocks.0.norm2.weight": "model-00001-of-00007.safetensors",
402
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00007.safetensors",
403
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00007.safetensors",
404
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00007.safetensors",
405
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00007.safetensors",
406
+ "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
407
+ "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
408
+ "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
409
+ "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
410
+ "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
411
+ "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
412
+ "visual.blocks.1.norm1.weight": "model-00001-of-00007.safetensors",
413
+ "visual.blocks.1.norm2.weight": "model-00001-of-00007.safetensors",
414
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00007.safetensors",
415
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00007.safetensors",
416
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00007.safetensors",
417
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00007.safetensors",
418
+ "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
419
+ "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
420
+ "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
421
+ "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
422
+ "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
423
+ "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
424
+ "visual.blocks.10.norm1.weight": "model-00001-of-00007.safetensors",
425
+ "visual.blocks.10.norm2.weight": "model-00001-of-00007.safetensors",
426
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00007.safetensors",
427
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00007.safetensors",
428
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00007.safetensors",
429
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00007.safetensors",
430
+ "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
431
+ "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
432
+ "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
433
+ "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
434
+ "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
435
+ "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
436
+ "visual.blocks.11.norm1.weight": "model-00001-of-00007.safetensors",
437
+ "visual.blocks.11.norm2.weight": "model-00001-of-00007.safetensors",
438
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00007.safetensors",
439
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00007.safetensors",
440
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00007.safetensors",
441
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00007.safetensors",
442
+ "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
443
+ "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
444
+ "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
445
+ "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
446
+ "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
447
+ "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
448
+ "visual.blocks.12.norm1.weight": "model-00001-of-00007.safetensors",
449
+ "visual.blocks.12.norm2.weight": "model-00001-of-00007.safetensors",
450
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00007.safetensors",
451
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00007.safetensors",
452
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00007.safetensors",
453
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00007.safetensors",
454
+ "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
455
+ "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
456
+ "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
457
+ "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
458
+ "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
459
+ "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
460
+ "visual.blocks.13.norm1.weight": "model-00001-of-00007.safetensors",
461
+ "visual.blocks.13.norm2.weight": "model-00001-of-00007.safetensors",
462
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00007.safetensors",
463
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00007.safetensors",
464
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00007.safetensors",
465
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00007.safetensors",
466
+ "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
467
+ "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
468
+ "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
469
+ "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
470
+ "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
471
+ "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
472
+ "visual.blocks.14.norm1.weight": "model-00001-of-00007.safetensors",
473
+ "visual.blocks.14.norm2.weight": "model-00001-of-00007.safetensors",
474
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00007.safetensors",
475
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00007.safetensors",
476
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00007.safetensors",
477
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00007.safetensors",
478
+ "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
479
+ "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
480
+ "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
481
+ "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
482
+ "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
483
+ "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
484
+ "visual.blocks.15.norm1.weight": "model-00001-of-00007.safetensors",
485
+ "visual.blocks.15.norm2.weight": "model-00001-of-00007.safetensors",
486
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00007.safetensors",
487
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00007.safetensors",
488
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00007.safetensors",
489
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00007.safetensors",
490
+ "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
491
+ "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
492
+ "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
493
+ "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
494
+ "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
495
+ "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
496
+ "visual.blocks.16.norm1.weight": "model-00001-of-00007.safetensors",
497
+ "visual.blocks.16.norm2.weight": "model-00001-of-00007.safetensors",
498
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00007.safetensors",
499
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00007.safetensors",
500
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00007.safetensors",
501
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00007.safetensors",
502
+ "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
503
+ "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
504
+ "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
505
+ "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
506
+ "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
507
+ "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
508
+ "visual.blocks.17.norm1.weight": "model-00001-of-00007.safetensors",
509
+ "visual.blocks.17.norm2.weight": "model-00001-of-00007.safetensors",
510
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00007.safetensors",
511
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00007.safetensors",
512
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00007.safetensors",
513
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00007.safetensors",
514
+ "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
515
+ "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
516
+ "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
517
+ "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
518
+ "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
519
+ "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
520
+ "visual.blocks.18.norm1.weight": "model-00001-of-00007.safetensors",
521
+ "visual.blocks.18.norm2.weight": "model-00001-of-00007.safetensors",
522
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00007.safetensors",
523
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00007.safetensors",
524
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00007.safetensors",
525
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00007.safetensors",
526
+ "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
527
+ "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
528
+ "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
529
+ "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
530
+ "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
531
+ "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
532
+ "visual.blocks.19.norm1.weight": "model-00001-of-00007.safetensors",
533
+ "visual.blocks.19.norm2.weight": "model-00001-of-00007.safetensors",
534
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00007.safetensors",
535
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00007.safetensors",
536
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00007.safetensors",
537
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00007.safetensors",
538
+ "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
539
+ "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
540
+ "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
541
+ "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
542
+ "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
543
+ "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
544
+ "visual.blocks.2.norm1.weight": "model-00001-of-00007.safetensors",
545
+ "visual.blocks.2.norm2.weight": "model-00001-of-00007.safetensors",
546
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00007.safetensors",
547
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00007.safetensors",
548
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00007.safetensors",
549
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00007.safetensors",
550
+ "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
551
+ "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
552
+ "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
553
+ "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
554
+ "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
555
+ "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
556
+ "visual.blocks.20.norm1.weight": "model-00001-of-00007.safetensors",
557
+ "visual.blocks.20.norm2.weight": "model-00001-of-00007.safetensors",
558
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00007.safetensors",
559
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00007.safetensors",
560
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00007.safetensors",
561
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00007.safetensors",
562
+ "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
563
+ "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
564
+ "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
565
+ "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
566
+ "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
567
+ "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
568
+ "visual.blocks.21.norm1.weight": "model-00001-of-00007.safetensors",
569
+ "visual.blocks.21.norm2.weight": "model-00001-of-00007.safetensors",
570
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00007.safetensors",
571
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00007.safetensors",
572
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00007.safetensors",
573
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00007.safetensors",
574
+ "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
575
+ "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
576
+ "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
577
+ "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
578
+ "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
579
+ "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
580
+ "visual.blocks.22.norm1.weight": "model-00001-of-00007.safetensors",
581
+ "visual.blocks.22.norm2.weight": "model-00001-of-00007.safetensors",
582
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00007.safetensors",
583
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00007.safetensors",
584
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00007.safetensors",
585
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00007.safetensors",
586
+ "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
587
+ "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
588
+ "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
589
+ "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
590
+ "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
591
+ "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
592
+ "visual.blocks.23.norm1.weight": "model-00001-of-00007.safetensors",
593
+ "visual.blocks.23.norm2.weight": "model-00001-of-00007.safetensors",
594
+ "visual.blocks.24.attn.proj.bias": "model-00001-of-00007.safetensors",
595
+ "visual.blocks.24.attn.proj.weight": "model-00001-of-00007.safetensors",
596
+ "visual.blocks.24.attn.qkv.bias": "model-00001-of-00007.safetensors",
597
+ "visual.blocks.24.attn.qkv.weight": "model-00001-of-00007.safetensors",
598
+ "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
599
+ "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
600
+ "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
601
+ "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
602
+ "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
603
+ "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
604
+ "visual.blocks.24.norm1.weight": "model-00001-of-00007.safetensors",
605
+ "visual.blocks.24.norm2.weight": "model-00001-of-00007.safetensors",
606
+ "visual.blocks.25.attn.proj.bias": "model-00001-of-00007.safetensors",
607
+ "visual.blocks.25.attn.proj.weight": "model-00001-of-00007.safetensors",
608
+ "visual.blocks.25.attn.qkv.bias": "model-00001-of-00007.safetensors",
609
+ "visual.blocks.25.attn.qkv.weight": "model-00001-of-00007.safetensors",
610
+ "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
611
+ "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
612
+ "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
613
+ "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
614
+ "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
615
+ "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
616
+ "visual.blocks.25.norm1.weight": "model-00001-of-00007.safetensors",
617
+ "visual.blocks.25.norm2.weight": "model-00001-of-00007.safetensors",
618
+ "visual.blocks.26.attn.proj.bias": "model-00001-of-00007.safetensors",
619
+ "visual.blocks.26.attn.proj.weight": "model-00001-of-00007.safetensors",
620
+ "visual.blocks.26.attn.qkv.bias": "model-00001-of-00007.safetensors",
621
+ "visual.blocks.26.attn.qkv.weight": "model-00001-of-00007.safetensors",
622
+ "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
623
+ "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
624
+ "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
625
+ "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
626
+ "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
627
+ "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
628
+ "visual.blocks.26.norm1.weight": "model-00001-of-00007.safetensors",
629
+ "visual.blocks.26.norm2.weight": "model-00001-of-00007.safetensors",
630
+ "visual.blocks.27.attn.proj.bias": "model-00001-of-00007.safetensors",
631
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00007.safetensors",
632
+ "visual.blocks.27.attn.qkv.bias": "model-00001-of-00007.safetensors",
633
+ "visual.blocks.27.attn.qkv.weight": "model-00001-of-00007.safetensors",
634
+ "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
635
+ "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
636
+ "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
637
+ "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
638
+ "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
639
+ "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
640
+ "visual.blocks.27.norm1.weight": "model-00001-of-00007.safetensors",
641
+ "visual.blocks.27.norm2.weight": "model-00001-of-00007.safetensors",
642
+ "visual.blocks.28.attn.proj.bias": "model-00001-of-00007.safetensors",
643
+ "visual.blocks.28.attn.proj.weight": "model-00001-of-00007.safetensors",
644
+ "visual.blocks.28.attn.qkv.bias": "model-00001-of-00007.safetensors",
645
+ "visual.blocks.28.attn.qkv.weight": "model-00001-of-00007.safetensors",
646
+ "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
647
+ "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
648
+ "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
649
+ "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
650
+ "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
651
+ "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
652
+ "visual.blocks.28.norm1.weight": "model-00001-of-00007.safetensors",
653
+ "visual.blocks.28.norm2.weight": "model-00001-of-00007.safetensors",
654
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00007.safetensors",
655
+ "visual.blocks.29.attn.proj.weight": "model-00001-of-00007.safetensors",
656
+ "visual.blocks.29.attn.qkv.bias": "model-00001-of-00007.safetensors",
657
+ "visual.blocks.29.attn.qkv.weight": "model-00001-of-00007.safetensors",
658
+ "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
659
+ "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
660
+ "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
661
+ "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
662
+ "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
663
+ "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
664
+ "visual.blocks.29.norm1.weight": "model-00001-of-00007.safetensors",
665
+ "visual.blocks.29.norm2.weight": "model-00001-of-00007.safetensors",
666
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00007.safetensors",
667
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00007.safetensors",
668
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00007.safetensors",
669
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00007.safetensors",
670
+ "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
671
+ "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
672
+ "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
673
+ "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
674
+ "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
675
+ "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
676
+ "visual.blocks.3.norm1.weight": "model-00001-of-00007.safetensors",
677
+ "visual.blocks.3.norm2.weight": "model-00001-of-00007.safetensors",
678
+ "visual.blocks.30.attn.proj.bias": "model-00001-of-00007.safetensors",
679
+ "visual.blocks.30.attn.proj.weight": "model-00001-of-00007.safetensors",
680
+ "visual.blocks.30.attn.qkv.bias": "model-00001-of-00007.safetensors",
681
+ "visual.blocks.30.attn.qkv.weight": "model-00001-of-00007.safetensors",
682
+ "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
683
+ "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
684
+ "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
685
+ "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
686
+ "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
687
+ "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
688
+ "visual.blocks.30.norm1.weight": "model-00001-of-00007.safetensors",
689
+ "visual.blocks.30.norm2.weight": "model-00001-of-00007.safetensors",
690
+ "visual.blocks.31.attn.proj.bias": "model-00001-of-00007.safetensors",
691
+ "visual.blocks.31.attn.proj.weight": "model-00001-of-00007.safetensors",
692
+ "visual.blocks.31.attn.qkv.bias": "model-00001-of-00007.safetensors",
693
+ "visual.blocks.31.attn.qkv.weight": "model-00001-of-00007.safetensors",
694
+ "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
695
+ "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
696
+ "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
697
+ "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
698
+ "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
699
+ "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
700
+ "visual.blocks.31.norm1.weight": "model-00001-of-00007.safetensors",
701
+ "visual.blocks.31.norm2.weight": "model-00001-of-00007.safetensors",
702
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00007.safetensors",
703
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00007.safetensors",
704
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00007.safetensors",
705
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00007.safetensors",
706
+ "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
707
+ "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
708
+ "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
709
+ "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
710
+ "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
711
+ "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
712
+ "visual.blocks.4.norm1.weight": "model-00001-of-00007.safetensors",
713
+ "visual.blocks.4.norm2.weight": "model-00001-of-00007.safetensors",
714
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00007.safetensors",
715
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00007.safetensors",
716
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00007.safetensors",
717
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00007.safetensors",
718
+ "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
719
+ "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
720
+ "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
721
+ "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
722
+ "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
723
+ "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
724
+ "visual.blocks.5.norm1.weight": "model-00001-of-00007.safetensors",
725
+ "visual.blocks.5.norm2.weight": "model-00001-of-00007.safetensors",
726
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00007.safetensors",
727
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00007.safetensors",
728
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00007.safetensors",
729
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00007.safetensors",
730
+ "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
731
+ "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
732
+ "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
733
+ "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
734
+ "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
735
+ "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
736
+ "visual.blocks.6.norm1.weight": "model-00001-of-00007.safetensors",
737
+ "visual.blocks.6.norm2.weight": "model-00001-of-00007.safetensors",
738
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00007.safetensors",
739
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00007.safetensors",
740
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00007.safetensors",
741
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00007.safetensors",
742
+ "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
743
+ "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
744
+ "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
745
+ "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
746
+ "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
747
+ "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
748
+ "visual.blocks.7.norm1.weight": "model-00001-of-00007.safetensors",
749
+ "visual.blocks.7.norm2.weight": "model-00001-of-00007.safetensors",
750
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00007.safetensors",
751
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00007.safetensors",
752
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00007.safetensors",
753
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00007.safetensors",
754
+ "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
755
+ "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
756
+ "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
757
+ "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
758
+ "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
759
+ "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
760
+ "visual.blocks.8.norm1.weight": "model-00001-of-00007.safetensors",
761
+ "visual.blocks.8.norm2.weight": "model-00001-of-00007.safetensors",
762
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00007.safetensors",
763
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00007.safetensors",
764
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00007.safetensors",
765
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00007.safetensors",
766
+ "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00007.safetensors",
767
+ "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00007.safetensors",
768
+ "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00007.safetensors",
769
+ "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00007.safetensors",
770
+ "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00007.safetensors",
771
+ "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00007.safetensors",
772
+ "visual.blocks.9.norm1.weight": "model-00001-of-00007.safetensors",
773
+ "visual.blocks.9.norm2.weight": "model-00001-of-00007.safetensors",
774
+ "visual.merger.ln_q.weight": "model-00001-of-00007.safetensors",
775
+ "visual.merger.mlp.0.bias": "model-00001-of-00007.safetensors",
776
+ "visual.merger.mlp.0.weight": "model-00001-of-00007.safetensors",
777
+ "visual.merger.mlp.2.bias": "model-00001-of-00007.safetensors",
778
+ "visual.merger.mlp.2.weight": "model-00001-of-00007.safetensors",
779
+ "visual.patch_embed.proj.weight": "model-00001-of-00007.safetensors"
780
+ }
781
+ }
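The closing braces above end the weight_map of model.safetensors.index.json: every parameter name, including the new tabular_processor and tabular_projection modules, points at one of the seven safetensors shards. As a minimal sketch (the checkpoint directory is a placeholder; safetensors is already a dependency of transformers), a single tensor can be resolved through that index like this:

import json
from safetensors import safe_open

CHECKPOINT_DIR = "path/to/local/checkout"  # hypothetical local path of this repository

with open(f"{CHECKPOINT_DIR}/model.safetensors.index.json") as f:
    index = json.load(f)

name = "tabular_projection.0.weight"            # any key listed in weight_map above
shard = index["weight_map"][name]               # e.g. "model-00007-of-00007.safetensors"

with safe_open(f"{CHECKPOINT_DIR}/{shard}", framework="pt") as shard_file:
    tensor = shard_file.get_tensor(name)        # loads only this tensor, not the whole shard

print(name, tuple(tensor.shape))

from_pretrained performs the same shard resolution automatically; the manual route is mainly useful for spot-checking that the tabular weights landed in the final shard.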
preprocessor_config.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "disable_grouping": null,
7
+ "do_center_crop": null,
8
+ "do_convert_rgb": true,
9
+ "do_normalize": true,
10
+ "do_pad": null,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "image_mean": [
14
+ 0.48145466,
15
+ 0.4578275,
16
+ 0.40821073
17
+ ],
18
+ "image_processor_type": "Qwen2VLImageProcessorFast",
19
+ "image_std": [
20
+ 0.26862954,
21
+ 0.26130258,
22
+ 0.27577711
23
+ ],
24
+ "input_data_format": null,
25
+ "max_pixels": 12845056,
26
+ "merge_size": 2,
27
+ "min_pixels": 3136,
28
+ "pad_size": null,
29
+ "patch_size": 14,
30
+ "processor_class": "Qwen2_5_VLProcessor",
31
+ "resample": 3,
32
+ "rescale_factor": 0.00392156862745098,
33
+ "return_tensors": null,
34
+ "size": {
35
+ "longest_edge": 12845056,
36
+ "shortest_edge": 3136
37
+ },
38
+ "temporal_patch_size": 2
39
+ }
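This preprocessor_config.json is the stock Qwen2.5-VL fast image-processor configuration: 14-pixel patches, 2×2 patch merging, and a per-image pixel budget between min_pixels (3136) and max_pixels (12845056). A short sketch of exercising it in isolation, assuming a local checkout of this repository:

import numpy as np
from PIL import Image
from transformers import AutoImageProcessor

CHECKPOINT_DIR = "path/to/local/checkout"  # hypothetical local path

processor = AutoImageProcessor.from_pretrained(CHECKPOINT_DIR)  # resolves to Qwen2VLImageProcessorFast
image = Image.fromarray(np.random.randint(0, 255, (336, 336, 3), dtype=np.uint8))

out = processor(images=image, return_tensors="pt")
# pixel_values holds the flattened image patches; image_grid_thw gives the (t, h, w) patch grid
print(out["pixel_values"].shape, out["image_grid_thw"])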
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
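tokenizer.json is committed as a Git LFS pointer (only the object hash and size live in git history, as declared in .gitattributes), so a clone without LFS support ends up with this three-line stub instead of the ~11 MB tokenizer file. One way to obtain the resolved file, sketched with a placeholder repository id:

from huggingface_hub import hf_hub_download

# repo_id is hypothetical; substitute the Hub repository this folder was uploaded to
path = hf_hub_download(repo_id="user/this-repo", filename="tokenizer.json")
print(path)  # local cache path of the real tokenizer.json, not the LFS pointer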
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "processor_class": "Qwen2_5_VLProcessor",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
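tokenizer_config.json keeps the standard Qwen2.5-VL token inventory: <|im_end|> as the EOS token, <|endoftext|> as padding, and the vision and tool tokens registered in added_tokens_decoder. Note that no chat_template is stored here; utils.py below installs its own tabular-aware template (TABULAR_CHAT_TEMPLATE) when the processor is built. A quick sanity check, assuming a local checkout:

from transformers import AutoTokenizer

CHECKPOINT_DIR = "path/to/local/checkout"  # hypothetical local path

tok = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)
print(tok.eos_token, tok.pad_token)                   # <|im_end|> <|endoftext|>
print(tok.convert_tokens_to_ids("<|vision_start|>"))  # 151652, per added_tokens_decoder above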
utils.py ADDED
@@ -0,0 +1,385 @@
1
+
2
+ from trl.models.utils import unwrap_model_for_generation
3
+ # %%
4
+ import re
5
+
6
+ import openai
7
+ import torch
8
+ from transformers import (
9
+ GenerationConfig,
10
+ TrainerCallback,
11
+ Qwen2TokenizerFast,
12
+ )
13
+
14
+ import wandb
15
+
16
+ import tqdm
17
+ from accelerate.utils import gather_object
18
+ import pandas as pd
19
+ import io
20
+ import numpy as np
21
+
22
+ # Chat template for tabular models
23
+ TABULAR_CHAT_TEMPLATE = """{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% set tabular_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}<|im_end|>\n{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif content['type'] == 'tabular' or 'tabular' in content %}{% set tabular_count.value = tabular_count.value + 1 %}{% if add_vision_id %}Table {{ tabular_count.value }}: {% endif %}<|vision_start|><|tabular_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>\n{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant\n{% endif %}"""
24
+
def load_model_and_processor(
    model_path: str,
    device: str = "cuda:0",
    torch_dtype=torch.bfloat16,
) -> tuple:
    """
    Load a Qwen2_5_TabularModel and its processor.

    Args:
        model_path: Path to the model checkpoint or HuggingFace model name
        device: Device to load the model on (e.g., "cuda:0", "cuda:1", "cpu")
        torch_dtype: Torch dtype for the model (default: torch.bfloat16)

    Returns:
        tuple: (model, processor) ready to use
    """
    from TabularModel import (
        TabularPreprocessor,
        Qwen_2_5_TabularProcessor,
        Qwen2_5_TabularModel,
    )

    # Create tabular preprocessor
    tabular_processor = TabularPreprocessor()

    # Create Qwen tabular processor
    qwen_tabular_processor = Qwen_2_5_TabularProcessor(
        tabular_processor=tabular_processor,
        tokenizer=Qwen2TokenizerFast.from_pretrained(model_path),
    )

    # Add special tokens
    qwen_tabular_processor.tabular_token = "<|tabular_pad|>"
    qwen_tabular_processor.tokenizer.add_tokens([
        qwen_tabular_processor.tabular_token,
        "<|tabular_row|>",
        "<|tabular_cell|>",
    ])
    qwen_tabular_processor.tokenizer.chat_template = TABULAR_CHAT_TEMPLATE

    # Load model
    model = Qwen2_5_TabularModel.from_pretrained(
        model_path,
        torch_dtype=torch_dtype,
    ).to(device)

    # Set token IDs in config
    model.config.tabular_token_id = (
        qwen_tabular_processor.tokenizer.convert_tokens_to_ids("<|tabular_pad|>")
    )
    model.config.tabular_row_token_id = (
        qwen_tabular_processor.tokenizer.convert_tokens_to_ids("<|tabular_row|>")
    )
    model.config.tabular_cell_token_id = (
        qwen_tabular_processor.tokenizer.convert_tokens_to_ids("<|tabular_cell|>")
    )

    return model, qwen_tabular_processor

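# Usage sketch (the checkpoint path is a placeholder, not a real artifact):
#
#     model, processor = load_model_and_processor("path/to/checkpoint", device="cuda:0")
#
# The returned processor already carries the tabular special tokens and chat template,
# and model.config holds the matching token ids, so the pair can be passed directly to
# generate_answer defined further below.
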
def get_role_by_idx(convo: list[dict[str, str]], role: str, idx: int) -> str:
    found = 0
    for message in convo:
        if message["role"] == role:
            if found == idx:
                return message["content"]
            found += 1
    raise ValueError(f"Role {role} not found {idx} times")

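# Example: get_role_by_idx(messages, "assistant", 0) returns the first assistant turn,
# which samples_filtering_table below uses as the reference answer.
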
class LLMSampleCB(TrainerCallback):
    def __init__(
        self,
        trainer,
        test_dataset,
        num_samples=10,
        max_new_tokens=256,
        log_model="checkpoint",
    ):
        "A callback that logs sample generations to a wandb.Table during training"
        super().__init__()
        self._log_model = log_model
        self.trainer = trainer

        # Get unique tasks from the dataset
        tasks = set([i["task"] for i in test_dataset])

        # Get num_samples examples from each task
        task_samples = []
        for task in tasks:
            task_dataset = [i for i in test_dataset if i["task"] == task][:num_samples]
            task_samples.extend(task_dataset)

        # Combine samples from all tasks
        self.sample_dataset = task_samples

        self.model, self.tokenizer = trainer.model_wrapped, trainer.tokenizer

        self.tokenizer.padding_side = "left"

        self.gen_config = GenerationConfig.from_pretrained(
            trainer.model.name_or_path, temperature=0.001, max_new_tokens=max_new_tokens
        )
        self.idx = 0

    def generate(self, conversations: list[list[dict[str, str]]]) -> list[str]:
        accelerator = self.trainer.accelerator

        # Create original prompts before distribution to use as keys
        original_prompts = self.tokenizer.apply_chat_template(conversations, tokenize=False)
        original_prompt_to_idx = {self._normalize_string(prompt): idx for idx, prompt in enumerate(original_prompts)}

        completions = [None] * len(conversations)  # Pre-allocate result array

        with accelerator.split_between_processes(conversations) as conversation_subset:
            model = self.trainer.model_wrapped
            with unwrap_model_for_generation(model, accelerator) as unwrapped_model:
                prompts = self.tokenizer.apply_chat_template(conversation_subset, tokenize=False)

                tokenized_prompts = self.tokenizer(prompts, return_tensors="pt", padding=True).to(model.device)
                with torch.inference_mode():
                    print("Generating...")
                    generations = unwrapped_model.generate(**tokenized_prompts, generation_config=self.gen_config).cpu()
                    print("Generated!")

                results = []
                for prompt_str, prompt_tokens, generation in zip(prompts, tokenized_prompts.input_ids, generations):
                    # Remove prompt from generation
                    generation = generation[len(prompt_tokens):]
                    completion = self.tokenizer.decode(generation, skip_special_tokens=True)
                    results.append((prompt_str, completion))

        # Gather results from all processes
        all_results = gather_object(results)

        # Place completions in their original positions
        for prompt_str, completion in all_results:
            norm_prompt = self._normalize_string(prompt_str)
            if norm_prompt in original_prompt_to_idx:
                idx = original_prompt_to_idx[norm_prompt]
                completions[idx] = completion

        return completions

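    # Note on generate above: prompts are sharded across processes with
    # split_between_processes, generations are collected with gather_object, and each
    # completion is matched back to its original slot via the whitespace-normalized
    # prompt string, so the returned list preserves the input conversation order.
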
    def samples_filtering_table(self, examples):
        "Create a wandb.Table to store the generations"
        records_table = wandb.Table(columns=["full_prompt", "question", "generation", "real_answer", "points"])
        max_num = [0]
        summary = [0]

        batch_size = 32
        all_data = []

        for i in tqdm.trange(0, len(examples), batch_size):
            batch = examples[i : i + batch_size]
            batch_data = []

            # Prepare batch inputs
            batch_inputs = []
            for row in batch:
                row = row["messages"]
                user = get_role_by_idx(row, "user", 0)
                real_answer = get_role_by_idx(row, "assistant", 0)

                # Extract the question from the user prompt
                # ("Zapytanie brzmi:" is Polish for "The query is:")
                question = user.split("Zapytanie brzmi:")[1].strip() if "Zapytanie brzmi:" in user else user
                prompt = user

                batch_inputs.append(row[:-1])
                batch_data.append((prompt, question, real_answer))

            # Generate all responses in a single pass
            generations = self.generate(batch_inputs)

            # Process results
            if self.trainer.accelerator.is_main_process:
                for idx, (prompt, question, real_answer) in enumerate(batch_data):
                    generation = generations[idx]

                    # Get points for this example
                    try:
                        _, points = self.compare_filtering_answer(question, generation, real_answer)
                        max_num[0] += 1
                        summary[0] += points
                    except Exception:
                        points = 0

                    records_table.add_data(prompt, question, generation, real_answer, points)
                    batch_data[idx] = (prompt, question, generation, real_answer)

            all_data.extend(batch_data)

        return records_table, (summary[0] / max_num[0] if max_num[0] > 0 else 0)

    def compare_filtering_answer(self, question, answer, expected):
        client = openai.Client()
        # Polish judge prompt: "You are an AI for grading answers to legal-document filtering tasks."
        system = "Jesteś sztuczną inteligencją do oceniania odpowiedzi na zadania filtrowania dokumentów prawniczych."
        # Polish: query / correct answer / model answer, then: "Judge whether the model answer
        # correctly identifies the relation and gives adequate reasoning, as in the correct answer.
        # Reply in the format 'Argumentacja: (...)\nOcena: 0 lub 1', where 0 is incorrect and 1 is correct."
        user = f"Zapytanie: '{question}'.\nPoprawna odpowiedź: '{expected}'\nOdpowiedź modelu: '{answer}'."
        user += "\nOceń, czy odpowiedź modelu poprawnie identyfikuje powiązanie i zawiera odpowiednią argumentację, podobnie jak w poprawnej odpowiedzi."  # noqa: E501
        user += "\nOdpowiedz w formacie 'Argumentacja: (...)\nOcena: 0 lub 1', gdzie 0 to niepoprawna odpowiedź, a 1 to poprawna odpowiedź."  # noqa: E501

        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            temperature=0.0,
            max_tokens=512,
        )
        resp = response.choices[0].message.content.rstrip(".").strip()
        print(resp)
        try:
            return resp, int(resp.split(":")[-1].split()[0].strip())
        except Exception:
            print("Error: ", resp)
            # Look for either 0 or 1 in the response
            score = 1 if "ocena: 1" in resp.lower() else 0
            return resp, score

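    # The judge is expected to reply in the form (illustrative example, not a real API response):
    #
    #     Argumentacja: (...)
    #     Ocena: 1
    #
    # so the parser above casts the token after the last ':' to int and falls back to
    # searching for "ocena: 1" when that cast fails.
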
    def on_evaluate(self, *args, **kwargs):
        "Log the wandb.Table after calling trainer.evaluate"
        filtering_dataset = [i for i in self.sample_dataset if i["task"] == "filtering"]
        records_table, recall = self.samples_filtering_table(filtering_dataset)

        if self.trainer.accelerator.is_main_process:
            try:
                wandb.log({"filtering_predictions_" + str(self.idx): records_table})
                wandb.log({"filtering_recall": recall})
            except Exception:
                pass

        self.idx += 1

    def compare_answer(self, question, answer, expected):
        client = openai.Client()
        # Polish judge prompt: "You are an AI for grading exam answers. You grade answers as
        # correct (1 point) or incorrect (0 points)."
        system = "Jesteś sztuczną inteligencją do oceniania odpowiedzi na egzaminie. Oceniasz odpowiedzi jako poprawne (1 punkt) lub niepoprawne (0 punktów)."  # noqa: E501
        # Polish: question / correct answer / user answer, then: "Is the user's answer correct?
        # Award 1 point for a correct answer, 0 for an incorrect one. If the reference answer says the
        # question cannot be answered, the user's answer must say the same. Do not award points for
        # effort; judge correctness only. Reply in the format 'Argumentacja: (...)\nOcena: 0 lub 1'."
        user = f"Pytanie: '{question}'.\n Poprawna odpowiedź: '{expected}'\n Odpowiedź użytkownika: '{answer}'."
        user += "\nCzy odpowiedź użytkownika jest poprawna? Przyznaj 1 punkt za poprawną odpowiedź lub 0 punktów za niepoprawną. Jeżeli poprawna odpowiedź sugeruje że nie da się odpowiedzieć na pytanie, to odpowiedź użytkownika powinna być taka sama. Nie dawaj punktów za chęci. Oceniaj odpowiedź tylko pod kątem poprawności."  # noqa: E501
        user += "\nPodkreślam: jeżeli poprawna odpowiedź sugeruje że nie da się udzielić odpowiedzi na podstawie źródeł, to odpowiedź użytkownika powinna być taka sama."  # noqa: E501
        user += (
            "Odpowiedz w formacie 'Argumentacja: (...)\nOcena: 0 lub 1', gdzie 0 to brak punktów, a 1 to pełna ocena."
        )
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            temperature=0.0,
            max_tokens=512,
        )
        resp = response.choices[0].message.content.rstrip(".").strip()
        try:
            return resp, int(resp.split(":")[-1].split()[0].strip())
        except Exception:
            print("Error: ", resp)
            # Look for either 0 or 1 in the response
            score = 1 if "1" in re.findall(r"\d+", resp) else 0
            return resp, score

    def _normalize_string(self, s):
        """Normalize string to avoid whitespace/newline comparison issues"""
        if s is None:
            return ""
        # Remove all whitespace and convert to lowercase for more robust matching
        return re.sub(r'\s+', '', s).lower()

def text_to_array(text):
    if '```' not in text:
        csv_text = text.strip()
    elif '```csv' not in text:
        csv_text = text.strip().split("```")[1].strip()
    else:
        csv_text = text.strip().split("```csv")[1].split("```")[0]
    # Parse CSV into a DataFrame
    df = pd.read_csv(io.StringIO(csv_text), header=None)

    # Convert DataFrame to numpy array for comparison
    generated_corr_matrix = df.values
    return generated_corr_matrix

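# Example: for a completion containing a fenced block such as
#
#     ```csv
#     1.0,0.5
#     0.5,1.0
#     ```
#
# text_to_array returns np.array([[1.0, 0.5], [0.5, 1.0]]); plain CSV without code
# fences is parsed as-is.
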
def generate_answer(
    model,
    processor,
    table: np.ndarray | torch.Tensor | list,
    question: str,
    max_new_tokens: int = 512,
    do_sample: bool = False,
    temperature: float | None = None,
) -> str:
    """
    Generate an answer based on a table and a question.

    Args:
        model: The Qwen2_5_TabularModel instance
        processor: The Qwen_2_5_TabularProcessor instance
        table: The input table as numpy array (including dtype=object for mixed types),
            torch tensor, or list of lists
        question: The question to answer about the table
        max_new_tokens: Maximum number of tokens to generate
        do_sample: Whether to use sampling
        temperature: Sampling temperature (if do_sample=True)

    Returns:
        Generated answer as a string
    """
    # Prepare messages in the expected format
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Consider this table:"},
                {"index": 0, "type": "tabular"},
                {"type": "text", "text": question},
            ],
        }
    ]

    # Apply chat template
    preprocessed = processor.tokenizer.apply_chat_template(
        messages, tokenize=False
    )

    # Process inputs
    processed = processor(
        [table], text=preprocessed, return_tensors="pt"
    )

    # Move to model device
    device = next(model.parameters()).device
    processed = {
        key: value.to(device) if isinstance(value, torch.Tensor) else value
        for key, value in processed.items()
    }

    # Remove tabular_metadata as it's not a model parameter
    processed.pop('tabular_metadata', None)

    # Generate
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
    }
    if temperature is not None:
        gen_kwargs["temperature"] = temperature

    with torch.inference_mode():
        res = model.generate(**processed, **gen_kwargs)

    # Decode only the generated part (remove input)
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(processed["input_ids"], res, strict=True)
    ]
    output_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True
    )

    return output_text[0]
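

# Minimal end-to-end sketch. The checkpoint path is a placeholder and the table is
# synthetic; everything else uses only the helpers defined in this file.
if __name__ == "__main__":
    model, processor = load_model_and_processor("path/to/checkpoint", device="cuda:0")
    table = np.array([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    answer = generate_answer(
        model,
        processor,
        table,
        question="How many rows does this table have?",
        max_new_tokens=64,
    )
    print(answer)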
video_preprocessor_config.json ADDED
@@ -0,0 +1,43 @@
{
  "crop_size": null,
  "data_format": "channels_first",
  "default_to_square": true,
  "device": null,
  "do_center_crop": null,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "do_sample_frames": false,
  "fps": null,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "input_data_format": null,
  "max_frames": 768,
  "max_pixels": 12845056,
  "merge_size": 2,
  "min_frames": 4,
  "min_pixels": 3136,
  "num_frames": null,
  "pad_size": null,
  "patch_size": 14,
  "processor_class": "Qwen2_5_VLProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "return_metadata": false,
  "size": {
    "longest_edge": 12845056,
    "shortest_edge": 3136
  },
  "temporal_patch_size": 2,
  "video_metadata": null,
  "video_processor_type": "Qwen2VLVideoProcessor"
}
vocab.json ADDED
The diff for this file is too large to render. See raw diff