Jinxing1 committed on
Commit
3826697
·
verified ·
1 Parent(s): 76d8894

Upload MQ-Auditor HyperLoRA weights

Browse files
README.md ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: meta-llama/Llama-2-7b-chat-hf
3
+ library_name: peft
4
+ license: cc-by-nc-sa-4.0
5
+ tags:
6
+ - audio
7
+ - video
8
+ - segmentation
9
+ - mask-quality-assessment
10
+ - audio-visual-segmentation
11
+ - lora
12
+ ---
13
+
14
+ # MQ-Auditor HyperLoRA Weights
15
+
16
+ This repository contains the released MQ-Auditor pretrained weights for reference-free mask quality assessment in language-referred audio-visual segmentation.
17
+
18
+ The checkpoint corresponds to:
19
+
20
+ ```text
21
+ epochs96_lr1e-4_bs4_gradacc8_lora_r32alpha64_pos0.5_ioulosswei0
22
+ ```
23
+
24
+ ## Model
25
+
26
+ MQ-Auditor takes a video clip, audio, a referring expression, a frame, and a candidate segmentation mask, then predicts mask quality attributes such as mask type, IoU, and recommended action.
27
+
28
+ The released weights are intended to be used with the MQ-Auditor codebase and MQ-RAVSBench dataset. The base LLM checkpoint and external encoders are not included in this package.
29
+
30
+ ## Release Contents
31
+
32
+ The public weight package should include:
33
+
34
+ ```text
35
+ adapter_config.json
36
+ adapter_model.safetensors
37
+ config.json
38
+ model.txt
39
+ model_trainable_params.txt
40
+ non_lora_trainables.bin
41
+ saved_config.json
42
+ trainer_state.json
43
+ checkpoint-960/
44
+   config.json
45
+   finetune_weights.bin
46
+ test_*/*.json
47
+ test_*/*.jsonl
48
+ ```
49
+
50
+ Intermediate epoch checkpoints and TensorBoard logs are not part of the release package.
51
+
52
+ ## Training Data
53
+
54
+ The model was trained on MQ-RAVSBench with:
55
+
56
+ ```text
57
+ train_test_meta_files/metadata.csv
58
+ train_test_meta_files/train_audit_only_filtered.json
59
+ ```
60
+
61
+ `null` masks are used during training as empty-mask examples. They are not part of the default/reported test-time evaluation protocol.
62
+
63
+ ## Evaluation
64
+
65
+ Evaluation is reported on the seen and unseen MQ-RAVSBench test splits:
66
+
67
+ ```text
68
+ test_s_image_filtered.json
69
+ test_u_image_filtered.json
70
+ test_s_video_filtered.json
71
+ test_u_video_filtered.json
72
+ ```
73
+
74
+ Reported mask types focus on non-empty candidate masks: `perfect`, `cutout`, `erode`, `dilate`, `merge`, and `full_neg`.
75
+
76
+ ## License
77
+
78
+ The released MQ-Auditor weights are provided for non-commercial research purposes only, under the CC BY-NC-SA 4.0 license. The weights depend on the Llama-2 base model and other pretrained encoders, so users must also comply with the applicable upstream model licenses and access terms.
79
+
80
+ ## Citation
81
+
82
+ ```bibtex
83
+ @article{zhou2026audit,
84
+ title={Audit After Segmentation: Reference-Free Mask Quality Assessment for Language-Referred Audio-Visual Segmentation},
85
+ author={Zhou, Jinxing and Zhou, Yanghao and Wang, Yaoting and Han, Zongyan and Ma, Jiaqi and Ding, Henghui and Anwer, Rao Muhammad and Cholakkal, Hisham},
86
+ journal={arXiv preprint arXiv:2602.03892},
87
+ year={2026}
88
+ }
89
+ ```
90
+
91
+ Paper: https://arxiv.org/pdf/2602.03892
92
+
93
+ ## Framework Versions
94
+
95
+ - PEFT 0.12.0
adapter_config.json ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "/home/panwen.hu/workspace1/jinxing.zhou/mllm/Crab/pretrained_weights/Llama-2-7b-chat-hf",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 64,
14
+ "lora_dropout": 0.05,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": null,
18
+ "peft_type": "LORA",
19
+ "r": 32,
20
+ "rank_pattern": {},
21
+ "revision": null,
22
+ "target_modules": [
23
+ "k_proj",
24
+ "up_proj",
25
+ "o_proj",
26
+ "v_proj",
27
+ "gate_proj",
28
+ "q_proj",
29
+ "down_proj"
30
+ ],
31
+ "task_type": "CAUSAL_LM",
32
+ "use_dora": false,
33
+ "use_rslora": false
34
+ }
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f754e5602c5284c124158d1e65cc17df71bc5fb8f9bc69517af2ee49031d3298
3
+ size 159968328
checkpoint-960/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/panwen.hu/workspace1/jinxing.zhou/mllm/Crab/pretrained_weights/Llama-2-7b-chat-hf",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 4096,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 11008,
14
+ "iou_loss_weight": 0.0,
15
+ "max_position_embeddings": 4096,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 32,
20
+ "pretraining_tp": 1,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "float16",
26
+ "transformers_version": "4.37.2",
27
+ "use_cache": false,
28
+ "vocab_size": 32028
29
+ }
checkpoint-960/finetune_weights.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb8ea89cdc150b55e5734e1a5c607891c2ad8af9eb8750d92a7b4691c7d6f3ca
3
+ size 452770739
config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/home/panwen.hu/workspace1/jinxing.zhou/mllm/Crab/pretrained_weights/Llama-2-7b-chat-hf",
3
+ "architectures": [
4
+ "LlamaForCausalLM"
5
+ ],
6
+ "attention_bias": false,
7
+ "attention_dropout": 0.0,
8
+ "bos_token_id": 1,
9
+ "eos_token_id": 2,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 4096,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 11008,
14
+ "iou_loss_weight": 0.0,
15
+ "max_position_embeddings": 4096,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 32,
18
+ "num_hidden_layers": 32,
19
+ "num_key_value_heads": 32,
20
+ "pretraining_tp": 1,
21
+ "rms_norm_eps": 1e-05,
22
+ "rope_scaling": null,
23
+ "rope_theta": 10000.0,
24
+ "tie_word_embeddings": false,
25
+ "torch_dtype": "float16",
26
+ "transformers_version": "4.37.2",
27
+ "use_cache": true,
28
+ "vocab_size": 32028
29
+ }
model.txt ADDED
@@ -0,0 +1,371 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ PeftModelForCausalLM(
2
+ (base_model): LoraModel(
3
+ (model): UnifiedForCausalLM(
4
+ (model): UnifiedModel(
5
+ (embed_tokens): Embedding(32028, 4096)
6
+ (layers): ModuleList(
7
+ (0-31): 32 x LlamaDecoderLayer(
8
+ (self_attn): LlamaAttention(
9
+ (q_proj): lora.Linear(
10
+ (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
11
+ (lora_dropout): ModuleDict(
12
+ (default): Dropout(p=0.05, inplace=False)
13
+ )
14
+ (lora_A): ModuleDict(
15
+ (default): Linear(in_features=4096, out_features=32, bias=False)
16
+ )
17
+ (lora_B): ModuleDict(
18
+ (default): Linear(in_features=32, out_features=4096, bias=False)
19
+ )
20
+ (lora_embedding_A): ParameterDict()
21
+ (lora_embedding_B): ParameterDict()
22
+ (lora_magnitude_vector): ModuleDict()
23
+ )
24
+ (k_proj): lora.Linear(
25
+ (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
26
+ (lora_dropout): ModuleDict(
27
+ (default): Dropout(p=0.05, inplace=False)
28
+ )
29
+ (lora_A): ModuleDict(
30
+ (default): Linear(in_features=4096, out_features=32, bias=False)
31
+ )
32
+ (lora_B): ModuleDict(
33
+ (default): Linear(in_features=32, out_features=4096, bias=False)
34
+ )
35
+ (lora_embedding_A): ParameterDict()
36
+ (lora_embedding_B): ParameterDict()
37
+ (lora_magnitude_vector): ModuleDict()
38
+ )
39
+ (v_proj): lora.Linear(
40
+ (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
41
+ (lora_dropout): ModuleDict(
42
+ (default): Dropout(p=0.05, inplace=False)
43
+ )
44
+ (lora_A): ModuleDict(
45
+ (default): Linear(in_features=4096, out_features=32, bias=False)
46
+ )
47
+ (lora_B): ModuleDict(
48
+ (default): Linear(in_features=32, out_features=4096, bias=False)
49
+ )
50
+ (lora_embedding_A): ParameterDict()
51
+ (lora_embedding_B): ParameterDict()
52
+ (lora_magnitude_vector): ModuleDict()
53
+ )
54
+ (o_proj): lora.Linear(
55
+ (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
56
+ (lora_dropout): ModuleDict(
57
+ (default): Dropout(p=0.05, inplace=False)
58
+ )
59
+ (lora_A): ModuleDict(
60
+ (default): Linear(in_features=4096, out_features=32, bias=False)
61
+ )
62
+ (lora_B): ModuleDict(
63
+ (default): Linear(in_features=32, out_features=4096, bias=False)
64
+ )
65
+ (lora_embedding_A): ParameterDict()
66
+ (lora_embedding_B): ParameterDict()
67
+ (lora_magnitude_vector): ModuleDict()
68
+ )
69
+ (rotary_emb): LlamaRotaryEmbedding()
70
+ )
71
+ (mlp): LlamaMLP(
72
+ (gate_proj): lora.Linear(
73
+ (base_layer): Linear(in_features=4096, out_features=11008, bias=False)
74
+ (lora_dropout): ModuleDict(
75
+ (default): Dropout(p=0.05, inplace=False)
76
+ )
77
+ (lora_A): ModuleDict(
78
+ (default): Linear(in_features=4096, out_features=32, bias=False)
79
+ )
80
+ (lora_B): ModuleDict(
81
+ (default): Linear(in_features=32, out_features=11008, bias=False)
82
+ )
83
+ (lora_embedding_A): ParameterDict()
84
+ (lora_embedding_B): ParameterDict()
85
+ (lora_magnitude_vector): ModuleDict()
86
+ )
87
+ (up_proj): lora.Linear(
88
+ (base_layer): Linear(in_features=4096, out_features=11008, bias=False)
89
+ (lora_dropout): ModuleDict(
90
+ (default): Dropout(p=0.05, inplace=False)
91
+ )
92
+ (lora_A): ModuleDict(
93
+ (default): Linear(in_features=4096, out_features=32, bias=False)
94
+ )
95
+ (lora_B): ModuleDict(
96
+ (default): Linear(in_features=32, out_features=11008, bias=False)
97
+ )
98
+ (lora_embedding_A): ParameterDict()
99
+ (lora_embedding_B): ParameterDict()
100
+ (lora_magnitude_vector): ModuleDict()
101
+ )
102
+ (down_proj): lora.Linear(
103
+ (base_layer): Linear(in_features=11008, out_features=4096, bias=False)
104
+ (lora_dropout): ModuleDict(
105
+ (default): Dropout(p=0.05, inplace=False)
106
+ )
107
+ (lora_A): ModuleDict(
108
+ (default): Linear(in_features=11008, out_features=32, bias=False)
109
+ )
110
+ (lora_B): ModuleDict(
111
+ (default): Linear(in_features=32, out_features=4096, bias=False)
112
+ )
113
+ (lora_embedding_A): ParameterDict()
114
+ (lora_embedding_B): ParameterDict()
115
+ (lora_magnitude_vector): ModuleDict()
116
+ )
117
+ (act_fn): SiLU()
118
+ )
119
+ (input_layernorm): LlamaRMSNorm()
120
+ (post_attention_layernorm): LlamaRMSNorm()
121
+ )
122
+ )
123
+ (norm): LlamaRMSNorm()
124
+ (visual_encoder): VisualEncoder(
125
+ (vision_tower): CLIPVisionModel(
126
+ (vision_model): CLIPVisionTransformer(
127
+ (embeddings): CLIPVisionEmbeddings(
128
+ (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
129
+ (position_embedding): Embedding(257, 1024)
130
+ )
131
+ (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
132
+ (encoder): CLIPEncoder(
133
+ (layers): ModuleList(
134
+ (0-23): 24 x CLIPEncoderLayer(
135
+ (self_attn): CLIPAttention(
136
+ (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
137
+ (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
138
+ (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
139
+ (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
140
+ )
141
+ (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
142
+ (mlp): CLIPMLP(
143
+ (activation_fn): QuickGELUActivation()
144
+ (fc1): Linear(in_features=1024, out_features=4096, bias=True)
145
+ (fc2): Linear(in_features=4096, out_features=1024, bias=True)
146
+ )
147
+ (layer_norm2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
148
+ )
149
+ )
150
+ )
151
+ (post_layernorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
152
+ )
153
+ )
154
+ )
155
+ (vl_projector): VLProjector(
156
+ (visual_ln): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
157
+ (visual_Qformer): BertLMHeadModel(
158
+ (bert): BertModel(
159
+ (embeddings): BertEmbeddings(
160
+ (word_embeddings): Embedding(30522, 768, padding_idx=0)
161
+ (position_embeddings): Embedding(512, 768)
162
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
163
+ (dropout): Dropout(p=0.1, inplace=False)
164
+ )
165
+ (encoder): BertEncoder(
166
+ (layer): ModuleList(
167
+ (0-1): 2 x BertLayer(
168
+ (attention): BertAttention(
169
+ (self): BertSelfAttention(
170
+ (query): Linear(in_features=768, out_features=768, bias=True)
171
+ (key): Linear(in_features=768, out_features=768, bias=True)
172
+ (value): Linear(in_features=768, out_features=768, bias=True)
173
+ (dropout): Dropout(p=0.1, inplace=False)
174
+ )
175
+ (output): BertSelfOutput(
176
+ (dense): Linear(in_features=768, out_features=768, bias=True)
177
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
178
+ (dropout): Dropout(p=0.1, inplace=False)
179
+ )
180
+ )
181
+ (crossattention): BertAttention(
182
+ (self): BertSelfAttention(
183
+ (query): Linear(in_features=768, out_features=768, bias=True)
184
+ (key): Linear(in_features=1024, out_features=768, bias=True)
185
+ (value): Linear(in_features=1024, out_features=768, bias=True)
186
+ (dropout): Dropout(p=0.1, inplace=False)
187
+ )
188
+ (output): BertSelfOutput(
189
+ (dense): Linear(in_features=768, out_features=768, bias=True)
190
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
191
+ (dropout): Dropout(p=0.1, inplace=False)
192
+ )
193
+ )
194
+ (intermediate): BertIntermediate(
195
+ (dense): Linear(in_features=768, out_features=3072, bias=True)
196
+ (intermediate_act_fn): GELUActivation()
197
+ )
198
+ (output): BertOutput(
199
+ (dense): Linear(in_features=3072, out_features=768, bias=True)
200
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
201
+ (dropout): Dropout(p=0.1, inplace=False)
202
+ )
203
+ (intermediate_query): BertIntermediate(
204
+ (dense): Linear(in_features=768, out_features=3072, bias=True)
205
+ (intermediate_act_fn): GELUActivation()
206
+ )
207
+ (output_query): BertOutput(
208
+ (dense): Linear(in_features=3072, out_features=768, bias=True)
209
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
210
+ (dropout): Dropout(p=0.1, inplace=False)
211
+ )
212
+ )
213
+ )
214
+ )
215
+ )
216
+ (cls): BertOnlyMLMHead(
217
+ (predictions): BertLMPredictionHead(
218
+ (transform): BertPredictionHeadTransform(
219
+ (dense): Linear(in_features=768, out_features=768, bias=True)
220
+ (transform_act_fn): GELUActivation()
221
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
222
+ )
223
+ (decoder): Linear(in_features=768, out_features=30522, bias=True)
224
+ )
225
+ )
226
+ )
227
+ (visual_proj): Sequential(
228
+ (0): Linear(in_features=768, out_features=4096, bias=True)
229
+ (1): GELU(approximate='none')
230
+ (2): Linear(in_features=4096, out_features=4096, bias=True)
231
+ )
232
+ )
233
+ (audio_encoder): AudioEncoder(
234
+ (audio_encoder): BEATs(
235
+ (post_extract_proj): Linear(in_features=512, out_features=768, bias=True)
236
+ (patch_embedding): Conv2d(1, 512, kernel_size=(16, 16), stride=(16, 16), bias=False)
237
+ (dropout_input): Dropout(p=0.0, inplace=False)
238
+ (encoder): TransformerEncoder(
239
+ (pos_conv): Sequential(
240
+ (0): Conv1d(768, 768, kernel_size=(128,), stride=(1,), padding=(64,), groups=16)
241
+ (1): SamePad()
242
+ (2): GELU(approximate='none')
243
+ )
244
+ (layers): ModuleList(
245
+ (0): TransformerSentenceEncoderLayer(
246
+ (self_attn): MultiheadAttention(
247
+ (dropout_module): Dropout(p=0.0, inplace=False)
248
+ (relative_attention_bias): Embedding(320, 12)
249
+ (k_proj): Linear(in_features=768, out_features=768, bias=True)
250
+ (v_proj): Linear(in_features=768, out_features=768, bias=True)
251
+ (q_proj): Linear(in_features=768, out_features=768, bias=True)
252
+ (out_proj): Linear(in_features=768, out_features=768, bias=True)
253
+ (grep_linear): Linear(in_features=64, out_features=8, bias=True)
254
+ )
255
+ (dropout1): Dropout(p=0.0, inplace=False)
256
+ (dropout2): Dropout(p=0.0, inplace=False)
257
+ (dropout3): Dropout(p=0.0, inplace=False)
258
+ (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
259
+ (fc1): Linear(in_features=768, out_features=3072, bias=True)
260
+ (fc2): Linear(in_features=3072, out_features=768, bias=True)
261
+ (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
262
+ )
263
+ (1-11): 11 x TransformerSentenceEncoderLayer(
264
+ (self_attn): MultiheadAttention(
265
+ (dropout_module): Dropout(p=0.0, inplace=False)
266
+ (k_proj): Linear(in_features=768, out_features=768, bias=True)
267
+ (v_proj): Linear(in_features=768, out_features=768, bias=True)
268
+ (q_proj): Linear(in_features=768, out_features=768, bias=True)
269
+ (out_proj): Linear(in_features=768, out_features=768, bias=True)
270
+ (grep_linear): Linear(in_features=64, out_features=8, bias=True)
271
+ (relative_attention_bias): Embedding(320, 12)
272
+ )
273
+ (dropout1): Dropout(p=0.0, inplace=False)
274
+ (dropout2): Dropout(p=0.0, inplace=False)
275
+ (dropout3): Dropout(p=0.0, inplace=False)
276
+ (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
277
+ (fc1): Linear(in_features=768, out_features=3072, bias=True)
278
+ (fc2): Linear(in_features=3072, out_features=768, bias=True)
279
+ (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
280
+ )
281
+ )
282
+ (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
283
+ )
284
+ (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
285
+ (predictor_dropout): Dropout(p=0.0, inplace=False)
286
+ (predictor): Linear(in_features=768, out_features=527, bias=True)
287
+ )
288
+ )
289
+ (al_projector): ALProjector(
290
+ (audio_ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
291
+ (audio_Qformer): BertLMHeadModel(
292
+ (bert): BertModel(
293
+ (embeddings): BertEmbeddings(
294
+ (word_embeddings): Embedding(30522, 768, padding_idx=0)
295
+ (position_embeddings): Embedding(512, 768)
296
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
297
+ (dropout): Dropout(p=0.1, inplace=False)
298
+ )
299
+ (encoder): BertEncoder(
300
+ (layer): ModuleList(
301
+ (0-1): 2 x BertLayer(
302
+ (attention): BertAttention(
303
+ (self): BertSelfAttention(
304
+ (query): Linear(in_features=768, out_features=768, bias=True)
305
+ (key): Linear(in_features=768, out_features=768, bias=True)
306
+ (value): Linear(in_features=768, out_features=768, bias=True)
307
+ (dropout): Dropout(p=0.1, inplace=False)
308
+ )
309
+ (output): BertSelfOutput(
310
+ (dense): Linear(in_features=768, out_features=768, bias=True)
311
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
312
+ (dropout): Dropout(p=0.1, inplace=False)
313
+ )
314
+ )
315
+ (crossattention): BertAttention(
316
+ (self): BertSelfAttention(
317
+ (query): Linear(in_features=768, out_features=768, bias=True)
318
+ (key): Linear(in_features=768, out_features=768, bias=True)
319
+ (value): Linear(in_features=768, out_features=768, bias=True)
320
+ (dropout): Dropout(p=0.1, inplace=False)
321
+ )
322
+ (output): BertSelfOutput(
323
+ (dense): Linear(in_features=768, out_features=768, bias=True)
324
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
325
+ (dropout): Dropout(p=0.1, inplace=False)
326
+ )
327
+ )
328
+ (intermediate): BertIntermediate(
329
+ (dense): Linear(in_features=768, out_features=3072, bias=True)
330
+ (intermediate_act_fn): GELUActivation()
331
+ )
332
+ (output): BertOutput(
333
+ (dense): Linear(in_features=3072, out_features=768, bias=True)
334
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
335
+ (dropout): Dropout(p=0.1, inplace=False)
336
+ )
337
+ (intermediate_query): BertIntermediate(
338
+ (dense): Linear(in_features=768, out_features=3072, bias=True)
339
+ (intermediate_act_fn): GELUActivation()
340
+ )
341
+ (output_query): BertOutput(
342
+ (dense): Linear(in_features=3072, out_features=768, bias=True)
343
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
344
+ (dropout): Dropout(p=0.1, inplace=False)
345
+ )
346
+ )
347
+ )
348
+ )
349
+ )
350
+ (cls): BertOnlyMLMHead(
351
+ (predictions): BertLMPredictionHead(
352
+ (transform): BertPredictionHeadTransform(
353
+ (dense): Linear(in_features=768, out_features=768, bias=True)
354
+ (transform_act_fn): GELUActivation()
355
+ (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
356
+ )
357
+ (decoder): Linear(in_features=768, out_features=30522, bias=True)
358
+ )
359
+ )
360
+ )
361
+ (audio_proj): Sequential(
362
+ (0): Linear(in_features=768, out_features=4096, bias=True)
363
+ (1): GELU(approximate='none')
364
+ (2): Linear(in_features=4096, out_features=4096, bias=True)
365
+ )
366
+ )
367
+ )
368
+ (lm_head): Linear(in_features=4096, out_features=32028, bias=False)
369
+ )
370
+ )
371
+ )
model_trainable_params.txt ADDED
@@ -0,0 +1,612 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ base_model.model.model.embed_tokens.weight torch.Size([32028, 4096])
3
+ base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
4
+ base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
5
+ base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
6
+ base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
7
+ base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
8
+ base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
9
+ base_model.model.model.layers.0.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
10
+ base_model.model.model.layers.0.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
11
+ base_model.model.model.layers.0.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
12
+ base_model.model.model.layers.0.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
13
+ base_model.model.model.layers.0.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
14
+ base_model.model.model.layers.0.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
15
+ base_model.model.model.layers.0.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
16
+ base_model.model.model.layers.0.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
17
+ base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
18
+ base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
19
+ base_model.model.model.layers.1.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
20
+ base_model.model.model.layers.1.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
21
+ base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
22
+ base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
23
+ base_model.model.model.layers.1.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
24
+ base_model.model.model.layers.1.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
25
+ base_model.model.model.layers.1.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
26
+ base_model.model.model.layers.1.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
27
+ base_model.model.model.layers.1.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
28
+ base_model.model.model.layers.1.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
29
+ base_model.model.model.layers.1.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
30
+ base_model.model.model.layers.1.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
31
+ base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
32
+ base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
33
+ base_model.model.model.layers.2.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
34
+ base_model.model.model.layers.2.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
35
+ base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
36
+ base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
37
+ base_model.model.model.layers.2.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
38
+ base_model.model.model.layers.2.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
39
+ base_model.model.model.layers.2.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
40
+ base_model.model.model.layers.2.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
41
+ base_model.model.model.layers.2.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
42
+ base_model.model.model.layers.2.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
43
+ base_model.model.model.layers.2.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
44
+ base_model.model.model.layers.2.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
45
+ base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
46
+ base_model.model.model.layers.3.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
47
+ base_model.model.model.layers.3.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
48
+ base_model.model.model.layers.3.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
49
+ base_model.model.model.layers.3.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
50
+ base_model.model.model.layers.3.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
51
+ base_model.model.model.layers.3.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
52
+ base_model.model.model.layers.3.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
53
+ base_model.model.model.layers.3.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
54
+ base_model.model.model.layers.3.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
55
+ base_model.model.model.layers.3.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
56
+ base_model.model.model.layers.3.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
57
+ base_model.model.model.layers.3.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
58
+ base_model.model.model.layers.3.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
59
+ base_model.model.model.layers.4.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
60
+ base_model.model.model.layers.4.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
61
+ base_model.model.model.layers.4.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
62
+ base_model.model.model.layers.4.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
63
+ base_model.model.model.layers.4.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
64
+ base_model.model.model.layers.4.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
65
+ base_model.model.model.layers.4.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
66
+ base_model.model.model.layers.4.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
67
+ base_model.model.model.layers.4.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
68
+ base_model.model.model.layers.4.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
69
+ base_model.model.model.layers.4.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
70
+ base_model.model.model.layers.4.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
71
+ base_model.model.model.layers.4.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
72
+ base_model.model.model.layers.4.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
73
+ base_model.model.model.layers.5.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
74
+ base_model.model.model.layers.5.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
75
+ base_model.model.model.layers.5.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
76
+ base_model.model.model.layers.5.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
77
+ base_model.model.model.layers.5.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
78
+ base_model.model.model.layers.5.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
79
+ base_model.model.model.layers.5.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
80
+ base_model.model.model.layers.5.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
81
+ base_model.model.model.layers.5.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
82
+ base_model.model.model.layers.5.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
83
+ base_model.model.model.layers.5.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
84
+ base_model.model.model.layers.5.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
85
+ base_model.model.model.layers.5.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
86
+ base_model.model.model.layers.5.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
87
+ base_model.model.model.layers.6.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
88
+ base_model.model.model.layers.6.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
89
+ base_model.model.model.layers.6.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
90
+ base_model.model.model.layers.6.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
91
+ base_model.model.model.layers.6.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
92
+ base_model.model.model.layers.6.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
93
+ base_model.model.model.layers.6.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
94
+ base_model.model.model.layers.6.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
95
+ base_model.model.model.layers.6.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
96
+ base_model.model.model.layers.6.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
97
+ base_model.model.model.layers.6.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
98
+ base_model.model.model.layers.6.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
99
+ base_model.model.model.layers.6.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
100
+ base_model.model.model.layers.6.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
101
+ base_model.model.model.layers.7.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
102
+ base_model.model.model.layers.7.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
103
+ base_model.model.model.layers.7.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
104
+ base_model.model.model.layers.7.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
105
+ base_model.model.model.layers.7.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
106
+ base_model.model.model.layers.7.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
107
+ base_model.model.model.layers.7.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
108
+ base_model.model.model.layers.7.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
109
+ base_model.model.model.layers.7.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
110
+ base_model.model.model.layers.7.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
111
+ base_model.model.model.layers.7.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
112
+ base_model.model.model.layers.7.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
113
+ base_model.model.model.layers.7.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
114
+ base_model.model.model.layers.7.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
115
+ base_model.model.model.layers.8.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
116
+ base_model.model.model.layers.8.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
117
+ base_model.model.model.layers.8.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
118
+ base_model.model.model.layers.8.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
119
+ base_model.model.model.layers.8.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
120
+ base_model.model.model.layers.8.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
121
+ base_model.model.model.layers.8.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
122
+ base_model.model.model.layers.8.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
123
+ base_model.model.model.layers.8.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
124
+ base_model.model.model.layers.8.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
125
+ base_model.model.model.layers.8.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
126
+ base_model.model.model.layers.8.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
127
+ base_model.model.model.layers.8.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
128
+ base_model.model.model.layers.8.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
129
+ base_model.model.model.layers.9.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
130
+ base_model.model.model.layers.9.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
131
+ base_model.model.model.layers.9.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
132
+ base_model.model.model.layers.9.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
133
+ base_model.model.model.layers.9.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
134
+ base_model.model.model.layers.9.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
135
+ base_model.model.model.layers.9.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
136
+ base_model.model.model.layers.9.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
137
+ base_model.model.model.layers.9.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
138
+ base_model.model.model.layers.9.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
139
+ base_model.model.model.layers.9.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
140
+ base_model.model.model.layers.9.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
141
+ base_model.model.model.layers.9.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
142
+ base_model.model.model.layers.9.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
143
+ base_model.model.model.layers.10.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
144
+ base_model.model.model.layers.10.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
145
+ base_model.model.model.layers.10.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
146
+ base_model.model.model.layers.10.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
147
+ base_model.model.model.layers.10.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
148
+ base_model.model.model.layers.10.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
149
+ base_model.model.model.layers.10.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
150
+ base_model.model.model.layers.10.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
151
+ base_model.model.model.layers.10.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
152
+ base_model.model.model.layers.10.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
153
+ base_model.model.model.layers.10.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
154
+ base_model.model.model.layers.10.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
155
+ base_model.model.model.layers.10.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
156
+ base_model.model.model.layers.10.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
157
+ base_model.model.model.layers.11.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
158
+ base_model.model.model.layers.11.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
159
+ base_model.model.model.layers.11.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
160
+ base_model.model.model.layers.11.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
161
+ base_model.model.model.layers.11.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
162
+ base_model.model.model.layers.11.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
163
+ base_model.model.model.layers.11.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
164
+ base_model.model.model.layers.11.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
165
+ base_model.model.model.layers.11.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
166
+ base_model.model.model.layers.11.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
167
+ base_model.model.model.layers.11.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
168
+ base_model.model.model.layers.11.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
169
+ base_model.model.model.layers.11.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
170
+ base_model.model.model.layers.11.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
171
+ base_model.model.model.layers.12.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
172
+ base_model.model.model.layers.12.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
173
+ base_model.model.model.layers.12.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
174
+ base_model.model.model.layers.12.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
175
+ base_model.model.model.layers.12.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
176
+ base_model.model.model.layers.12.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
177
+ base_model.model.model.layers.12.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
178
+ base_model.model.model.layers.12.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
179
+ base_model.model.model.layers.12.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
180
+ base_model.model.model.layers.12.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
181
+ base_model.model.model.layers.12.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
182
+ base_model.model.model.layers.12.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
183
+ base_model.model.model.layers.12.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
184
+ base_model.model.model.layers.12.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
185
+ base_model.model.model.layers.13.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
186
+ base_model.model.model.layers.13.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
187
+ base_model.model.model.layers.13.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
188
+ base_model.model.model.layers.13.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
189
+ base_model.model.model.layers.13.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
190
+ base_model.model.model.layers.13.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
191
+ base_model.model.model.layers.13.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
192
+ base_model.model.model.layers.13.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
193
+ base_model.model.model.layers.13.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
194
+ base_model.model.model.layers.13.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
195
+ base_model.model.model.layers.13.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
196
+ base_model.model.model.layers.13.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
197
+ base_model.model.model.layers.13.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
198
+ base_model.model.model.layers.13.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
199
+ base_model.model.model.layers.14.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
200
+ base_model.model.model.layers.14.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
201
+ base_model.model.model.layers.14.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
202
+ base_model.model.model.layers.14.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
203
+ base_model.model.model.layers.14.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
204
+ base_model.model.model.layers.14.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
205
+ base_model.model.model.layers.14.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
206
+ base_model.model.model.layers.14.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
207
+ base_model.model.model.layers.14.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
208
+ base_model.model.model.layers.14.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
209
+ base_model.model.model.layers.14.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
210
+ base_model.model.model.layers.14.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
211
+ base_model.model.model.layers.14.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
212
+ base_model.model.model.layers.14.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
213
+ base_model.model.model.layers.15.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
214
+ base_model.model.model.layers.15.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
215
+ base_model.model.model.layers.15.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
216
+ base_model.model.model.layers.15.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
217
+ base_model.model.model.layers.15.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
218
+ base_model.model.model.layers.15.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
219
+ base_model.model.model.layers.15.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
220
+ base_model.model.model.layers.15.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
221
+ base_model.model.model.layers.15.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
222
+ base_model.model.model.layers.15.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
223
+ base_model.model.model.layers.15.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
224
+ base_model.model.model.layers.15.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
225
+ base_model.model.model.layers.15.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
226
+ base_model.model.model.layers.15.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
227
+ base_model.model.model.layers.16.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
228
+ base_model.model.model.layers.16.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
229
+ base_model.model.model.layers.16.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
230
+ base_model.model.model.layers.16.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
231
+ base_model.model.model.layers.16.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
232
+ base_model.model.model.layers.16.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
233
+ base_model.model.model.layers.16.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
234
+ base_model.model.model.layers.16.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
235
+ base_model.model.model.layers.16.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
236
+ base_model.model.model.layers.16.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
237
+ base_model.model.model.layers.16.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
238
+ base_model.model.model.layers.16.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
239
+ base_model.model.model.layers.16.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
240
+ base_model.model.model.layers.16.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
241
+ base_model.model.model.layers.17.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
242
+ base_model.model.model.layers.17.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
243
+ base_model.model.model.layers.17.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
244
+ base_model.model.model.layers.17.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
245
+ base_model.model.model.layers.17.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
246
+ base_model.model.model.layers.17.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
247
+ base_model.model.model.layers.17.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
248
+ base_model.model.model.layers.17.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
249
+ base_model.model.model.layers.17.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
250
+ base_model.model.model.layers.17.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
251
+ base_model.model.model.layers.17.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
252
+ base_model.model.model.layers.17.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
253
+ base_model.model.model.layers.17.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
254
+ base_model.model.model.layers.17.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
255
+ base_model.model.model.layers.18.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
256
+ base_model.model.model.layers.18.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
257
+ base_model.model.model.layers.18.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
258
+ base_model.model.model.layers.18.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
259
+ base_model.model.model.layers.18.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
260
+ base_model.model.model.layers.18.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
261
+ base_model.model.model.layers.18.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
262
+ base_model.model.model.layers.18.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
263
+ base_model.model.model.layers.18.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
264
+ base_model.model.model.layers.18.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
265
+ base_model.model.model.layers.18.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
266
+ base_model.model.model.layers.18.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
267
+ base_model.model.model.layers.18.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
268
+ base_model.model.model.layers.18.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
269
+ base_model.model.model.layers.19.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
270
+ base_model.model.model.layers.19.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
271
+ base_model.model.model.layers.19.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
272
+ base_model.model.model.layers.19.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
273
+ base_model.model.model.layers.19.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
274
+ base_model.model.model.layers.19.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
275
+ base_model.model.model.layers.19.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
276
+ base_model.model.model.layers.19.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
277
+ base_model.model.model.layers.19.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
278
+ base_model.model.model.layers.19.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
279
+ base_model.model.model.layers.19.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
280
+ base_model.model.model.layers.19.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
281
+ base_model.model.model.layers.19.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
282
+ base_model.model.model.layers.19.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
283
+ base_model.model.model.layers.20.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
284
+ base_model.model.model.layers.20.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
285
+ base_model.model.model.layers.20.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
286
+ base_model.model.model.layers.20.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
287
+ base_model.model.model.layers.20.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
288
+ base_model.model.model.layers.20.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
289
+ base_model.model.model.layers.20.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
290
+ base_model.model.model.layers.20.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
291
+ base_model.model.model.layers.20.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
292
+ base_model.model.model.layers.20.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
293
+ base_model.model.model.layers.20.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
294
+ base_model.model.model.layers.20.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
295
+ base_model.model.model.layers.20.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
296
+ base_model.model.model.layers.20.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
297
+ base_model.model.model.layers.21.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
298
+ base_model.model.model.layers.21.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
299
+ base_model.model.model.layers.21.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
300
+ base_model.model.model.layers.21.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
301
+ base_model.model.model.layers.21.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
302
+ base_model.model.model.layers.21.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
303
+ base_model.model.model.layers.21.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
304
+ base_model.model.model.layers.21.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
305
+ base_model.model.model.layers.21.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
306
+ base_model.model.model.layers.21.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
307
+ base_model.model.model.layers.21.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
308
+ base_model.model.model.layers.21.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
309
+ base_model.model.model.layers.21.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
310
+ base_model.model.model.layers.21.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
311
+ base_model.model.model.layers.22.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
312
+ base_model.model.model.layers.22.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
313
+ base_model.model.model.layers.22.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
314
+ base_model.model.model.layers.22.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
315
+ base_model.model.model.layers.22.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
316
+ base_model.model.model.layers.22.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
317
+ base_model.model.model.layers.22.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
318
+ base_model.model.model.layers.22.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
319
+ base_model.model.model.layers.22.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
320
+ base_model.model.model.layers.22.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
321
+ base_model.model.model.layers.22.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
322
+ base_model.model.model.layers.22.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
323
+ base_model.model.model.layers.22.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
324
+ base_model.model.model.layers.22.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
325
+ base_model.model.model.layers.23.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
326
+ base_model.model.model.layers.23.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
327
+ base_model.model.model.layers.23.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
328
+ base_model.model.model.layers.23.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
329
+ base_model.model.model.layers.23.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
330
+ base_model.model.model.layers.23.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
331
+ base_model.model.model.layers.23.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
332
+ base_model.model.model.layers.23.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
333
+ base_model.model.model.layers.23.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
334
+ base_model.model.model.layers.23.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
335
+ base_model.model.model.layers.23.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
336
+ base_model.model.model.layers.23.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
337
+ base_model.model.model.layers.23.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
338
+ base_model.model.model.layers.23.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
339
+ base_model.model.model.layers.24.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
340
+ base_model.model.model.layers.24.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
341
+ base_model.model.model.layers.24.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
342
+ base_model.model.model.layers.24.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
343
+ base_model.model.model.layers.24.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
344
+ base_model.model.model.layers.24.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
345
+ base_model.model.model.layers.24.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
346
+ base_model.model.model.layers.24.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
347
+ base_model.model.model.layers.24.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
348
+ base_model.model.model.layers.24.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
349
+ base_model.model.model.layers.24.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
350
+ base_model.model.model.layers.24.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
351
+ base_model.model.model.layers.24.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
352
+ base_model.model.model.layers.24.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
353
+ base_model.model.model.layers.25.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
354
+ base_model.model.model.layers.25.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
355
+ base_model.model.model.layers.25.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
356
+ base_model.model.model.layers.25.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
357
+ base_model.model.model.layers.25.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
358
+ base_model.model.model.layers.25.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
359
+ base_model.model.model.layers.25.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
360
+ base_model.model.model.layers.25.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
361
+ base_model.model.model.layers.25.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
362
+ base_model.model.model.layers.25.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
363
+ base_model.model.model.layers.25.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
364
+ base_model.model.model.layers.25.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
365
+ base_model.model.model.layers.25.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
366
+ base_model.model.model.layers.25.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
367
+ base_model.model.model.layers.26.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
368
+ base_model.model.model.layers.26.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
369
+ base_model.model.model.layers.26.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
370
+ base_model.model.model.layers.26.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
371
+ base_model.model.model.layers.26.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
372
+ base_model.model.model.layers.26.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
373
+ base_model.model.model.layers.26.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
374
+ base_model.model.model.layers.26.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
375
+ base_model.model.model.layers.26.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
376
+ base_model.model.model.layers.26.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
377
+ base_model.model.model.layers.26.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
378
+ base_model.model.model.layers.26.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
379
+ base_model.model.model.layers.26.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
380
+ base_model.model.model.layers.26.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
381
+ base_model.model.model.layers.27.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
382
+ base_model.model.model.layers.27.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
383
+ base_model.model.model.layers.27.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
384
+ base_model.model.model.layers.27.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
385
+ base_model.model.model.layers.27.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
386
+ base_model.model.model.layers.27.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
387
+ base_model.model.model.layers.27.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
388
+ base_model.model.model.layers.27.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
389
+ base_model.model.model.layers.27.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
390
+ base_model.model.model.layers.27.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
391
+ base_model.model.model.layers.27.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
392
+ base_model.model.model.layers.27.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
393
+ base_model.model.model.layers.27.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
394
+ base_model.model.model.layers.27.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
395
+ base_model.model.model.layers.28.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
396
+ base_model.model.model.layers.28.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
397
+ base_model.model.model.layers.28.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
398
+ base_model.model.model.layers.28.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
399
+ base_model.model.model.layers.28.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
400
+ base_model.model.model.layers.28.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
401
+ base_model.model.model.layers.28.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
402
+ base_model.model.model.layers.28.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
403
+ base_model.model.model.layers.28.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
404
+ base_model.model.model.layers.28.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
405
+ base_model.model.model.layers.28.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
406
+ base_model.model.model.layers.28.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
407
+ base_model.model.model.layers.28.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
408
+ base_model.model.model.layers.28.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
409
+ base_model.model.model.layers.29.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
410
+ base_model.model.model.layers.29.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
411
+ base_model.model.model.layers.29.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
412
+ base_model.model.model.layers.29.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
413
+ base_model.model.model.layers.29.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
414
+ base_model.model.model.layers.29.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
415
+ base_model.model.model.layers.29.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
416
+ base_model.model.model.layers.29.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
417
+ base_model.model.model.layers.29.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
418
+ base_model.model.model.layers.29.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
419
+ base_model.model.model.layers.29.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
420
+ base_model.model.model.layers.29.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
421
+ base_model.model.model.layers.29.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
422
+ base_model.model.model.layers.29.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
423
+ base_model.model.model.layers.30.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
424
+ base_model.model.model.layers.30.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
425
+ base_model.model.model.layers.30.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
426
+ base_model.model.model.layers.30.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
427
+ base_model.model.model.layers.30.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
428
+ base_model.model.model.layers.30.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
429
+ base_model.model.model.layers.30.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
430
+ base_model.model.model.layers.30.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
431
+ base_model.model.model.layers.30.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
432
+ base_model.model.model.layers.30.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
433
+ base_model.model.model.layers.30.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
434
+ base_model.model.model.layers.30.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
435
+ base_model.model.model.layers.30.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
436
+ base_model.model.model.layers.30.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
437
+ base_model.model.model.layers.31.self_attn.q_proj.lora_A.default.weight torch.Size([32, 4096])
438
+ base_model.model.model.layers.31.self_attn.q_proj.lora_B.default.weight torch.Size([4096, 32])
439
+ base_model.model.model.layers.31.self_attn.k_proj.lora_A.default.weight torch.Size([32, 4096])
440
+ base_model.model.model.layers.31.self_attn.k_proj.lora_B.default.weight torch.Size([4096, 32])
441
+ base_model.model.model.layers.31.self_attn.v_proj.lora_A.default.weight torch.Size([32, 4096])
442
+ base_model.model.model.layers.31.self_attn.v_proj.lora_B.default.weight torch.Size([4096, 32])
443
+ base_model.model.model.layers.31.self_attn.o_proj.lora_A.default.weight torch.Size([32, 4096])
444
+ base_model.model.model.layers.31.self_attn.o_proj.lora_B.default.weight torch.Size([4096, 32])
445
+ base_model.model.model.layers.31.mlp.gate_proj.lora_A.default.weight torch.Size([32, 4096])
446
+ base_model.model.model.layers.31.mlp.gate_proj.lora_B.default.weight torch.Size([11008, 32])
447
+ base_model.model.model.layers.31.mlp.up_proj.lora_A.default.weight torch.Size([32, 4096])
448
+ base_model.model.model.layers.31.mlp.up_proj.lora_B.default.weight torch.Size([11008, 32])
449
+ base_model.model.model.layers.31.mlp.down_proj.lora_A.default.weight torch.Size([32, 11008])
450
+ base_model.model.model.layers.31.mlp.down_proj.lora_B.default.weight torch.Size([4096, 32])
451
+ base_model.model.model.vl_projector.visual_query_tokens torch.Size([1, 32, 768])
452
+ base_model.model.model.vl_projector.visual_ln.weight torch.Size([1024])
453
+ base_model.model.model.vl_projector.visual_ln.bias torch.Size([1024])
454
+ base_model.model.model.vl_projector.visual_Qformer.bert.embeddings.word_embeddings.weight torch.Size([30522, 768])
455
+ base_model.model.model.vl_projector.visual_Qformer.bert.embeddings.position_embeddings.weight torch.Size([512, 768])
456
+ base_model.model.model.vl_projector.visual_Qformer.bert.embeddings.LayerNorm.weight torch.Size([768])
457
+ base_model.model.model.vl_projector.visual_Qformer.bert.embeddings.LayerNorm.bias torch.Size([768])
458
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
459
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.attention.self.query.bias torch.Size([768])
460
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
461
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.attention.self.key.bias torch.Size([768])
462
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
463
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.attention.self.value.bias torch.Size([768])
464
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
465
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.attention.output.dense.bias torch.Size([768])
466
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
467
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
468
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.crossattention.self.query.weight torch.Size([768, 768])
469
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.crossattention.self.query.bias torch.Size([768])
470
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.crossattention.self.key.weight torch.Size([768, 1024])
471
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.crossattention.self.key.bias torch.Size([768])
472
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.crossattention.self.value.weight torch.Size([768, 1024])
473
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.crossattention.self.value.bias torch.Size([768])
474
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.crossattention.output.dense.weight torch.Size([768, 768])
475
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.crossattention.output.dense.bias torch.Size([768])
476
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.weight torch.Size([768])
477
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.bias torch.Size([768])
478
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.intermediate.dense.weight torch.Size([3072, 768])
479
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.intermediate.dense.bias torch.Size([3072])
480
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.output.dense.weight torch.Size([768, 3072])
481
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.output.dense.bias torch.Size([768])
482
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.output.LayerNorm.weight torch.Size([768])
483
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.output.LayerNorm.bias torch.Size([768])
484
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.intermediate_query.dense.weight torch.Size([3072, 768])
485
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.intermediate_query.dense.bias torch.Size([3072])
486
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.output_query.dense.weight torch.Size([768, 3072])
487
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.output_query.dense.bias torch.Size([768])
488
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.output_query.LayerNorm.weight torch.Size([768])
489
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.0.output_query.LayerNorm.bias torch.Size([768])
490
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.attention.self.query.weight torch.Size([768, 768])
491
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.attention.self.query.bias torch.Size([768])
492
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.attention.self.key.weight torch.Size([768, 768])
493
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.attention.self.key.bias torch.Size([768])
494
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.attention.self.value.weight torch.Size([768, 768])
495
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.attention.self.value.bias torch.Size([768])
496
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.attention.output.dense.weight torch.Size([768, 768])
497
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.attention.output.dense.bias torch.Size([768])
498
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.attention.output.LayerNorm.weight torch.Size([768])
499
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.attention.output.LayerNorm.bias torch.Size([768])
500
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.crossattention.self.query.weight torch.Size([768, 768])
501
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.crossattention.self.query.bias torch.Size([768])
502
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.crossattention.self.key.weight torch.Size([768, 1024])
503
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.crossattention.self.key.bias torch.Size([768])
504
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.crossattention.self.value.weight torch.Size([768, 1024])
505
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.crossattention.self.value.bias torch.Size([768])
506
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.crossattention.output.dense.weight torch.Size([768, 768])
507
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.crossattention.output.dense.bias torch.Size([768])
508
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.crossattention.output.LayerNorm.weight torch.Size([768])
509
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.crossattention.output.LayerNorm.bias torch.Size([768])
510
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.intermediate.dense.weight torch.Size([3072, 768])
511
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.intermediate.dense.bias torch.Size([3072])
512
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.output.dense.weight torch.Size([768, 3072])
513
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.output.dense.bias torch.Size([768])
514
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.output.LayerNorm.weight torch.Size([768])
515
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.output.LayerNorm.bias torch.Size([768])
516
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.intermediate_query.dense.weight torch.Size([3072, 768])
517
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.intermediate_query.dense.bias torch.Size([3072])
518
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.output_query.dense.weight torch.Size([768, 3072])
519
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.output_query.dense.bias torch.Size([768])
520
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.output_query.LayerNorm.weight torch.Size([768])
521
+ base_model.model.model.vl_projector.visual_Qformer.bert.encoder.layer.1.output_query.LayerNorm.bias torch.Size([768])
522
+ base_model.model.model.vl_projector.visual_Qformer.cls.predictions.bias torch.Size([30522])
523
+ base_model.model.model.vl_projector.visual_Qformer.cls.predictions.transform.dense.weight torch.Size([768, 768])
524
+ base_model.model.model.vl_projector.visual_Qformer.cls.predictions.transform.dense.bias torch.Size([768])
525
+ base_model.model.model.vl_projector.visual_Qformer.cls.predictions.transform.LayerNorm.weight torch.Size([768])
526
+ base_model.model.model.vl_projector.visual_Qformer.cls.predictions.transform.LayerNorm.bias torch.Size([768])
527
+ base_model.model.model.vl_projector.visual_proj.0.weight torch.Size([4096, 768])
528
+ base_model.model.model.vl_projector.visual_proj.0.bias torch.Size([4096])
529
+ base_model.model.model.vl_projector.visual_proj.2.weight torch.Size([4096, 4096])
530
+ base_model.model.model.vl_projector.visual_proj.2.bias torch.Size([4096])
531
+ base_model.model.model.al_projector.audio_query_tokens torch.Size([1, 32, 768])
532
+ base_model.model.model.al_projector.audio_ln.weight torch.Size([768])
533
+ base_model.model.model.al_projector.audio_ln.bias torch.Size([768])
534
+ base_model.model.model.al_projector.audio_Qformer.bert.embeddings.word_embeddings.weight torch.Size([30522, 768])
535
+ base_model.model.model.al_projector.audio_Qformer.bert.embeddings.position_embeddings.weight torch.Size([512, 768])
536
+ base_model.model.model.al_projector.audio_Qformer.bert.embeddings.LayerNorm.weight torch.Size([768])
537
+ base_model.model.model.al_projector.audio_Qformer.bert.embeddings.LayerNorm.bias torch.Size([768])
538
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.attention.self.query.weight torch.Size([768, 768])
539
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.attention.self.query.bias torch.Size([768])
540
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.attention.self.key.weight torch.Size([768, 768])
541
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.attention.self.key.bias torch.Size([768])
542
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.attention.self.value.weight torch.Size([768, 768])
543
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.attention.self.value.bias torch.Size([768])
544
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.attention.output.dense.weight torch.Size([768, 768])
545
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.attention.output.dense.bias torch.Size([768])
546
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.attention.output.LayerNorm.weight torch.Size([768])
547
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.attention.output.LayerNorm.bias torch.Size([768])
548
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.crossattention.self.query.weight torch.Size([768, 768])
549
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.crossattention.self.query.bias torch.Size([768])
550
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.crossattention.self.key.weight torch.Size([768, 768])
551
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.crossattention.self.key.bias torch.Size([768])
552
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.crossattention.self.value.weight torch.Size([768, 768])
553
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.crossattention.self.value.bias torch.Size([768])
554
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.crossattention.output.dense.weight torch.Size([768, 768])
555
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.crossattention.output.dense.bias torch.Size([768])
556
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.weight torch.Size([768])
557
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.crossattention.output.LayerNorm.bias torch.Size([768])
558
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.intermediate.dense.weight torch.Size([3072, 768])
559
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.intermediate.dense.bias torch.Size([3072])
560
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.output.dense.weight torch.Size([768, 3072])
561
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.output.dense.bias torch.Size([768])
562
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.output.LayerNorm.weight torch.Size([768])
563
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.output.LayerNorm.bias torch.Size([768])
564
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.intermediate_query.dense.weight torch.Size([3072, 768])
565
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.intermediate_query.dense.bias torch.Size([3072])
566
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.output_query.dense.weight torch.Size([768, 3072])
567
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.output_query.dense.bias torch.Size([768])
568
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.output_query.LayerNorm.weight torch.Size([768])
569
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.0.output_query.LayerNorm.bias torch.Size([768])
570
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.attention.self.query.weight torch.Size([768, 768])
571
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.attention.self.query.bias torch.Size([768])
572
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.attention.self.key.weight torch.Size([768, 768])
573
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.attention.self.key.bias torch.Size([768])
574
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.attention.self.value.weight torch.Size([768, 768])
575
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.attention.self.value.bias torch.Size([768])
576
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.attention.output.dense.weight torch.Size([768, 768])
577
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.attention.output.dense.bias torch.Size([768])
578
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.attention.output.LayerNorm.weight torch.Size([768])
579
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.attention.output.LayerNorm.bias torch.Size([768])
580
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.crossattention.self.query.weight torch.Size([768, 768])
581
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.crossattention.self.query.bias torch.Size([768])
582
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.crossattention.self.key.weight torch.Size([768, 768])
583
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.crossattention.self.key.bias torch.Size([768])
584
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.crossattention.self.value.weight torch.Size([768, 768])
585
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.crossattention.self.value.bias torch.Size([768])
586
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.crossattention.output.dense.weight torch.Size([768, 768])
587
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.crossattention.output.dense.bias torch.Size([768])
588
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.crossattention.output.LayerNorm.weight torch.Size([768])
589
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.crossattention.output.LayerNorm.bias torch.Size([768])
590
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.intermediate.dense.weight torch.Size([3072, 768])
591
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.intermediate.dense.bias torch.Size([3072])
592
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.output.dense.weight torch.Size([768, 3072])
593
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.output.dense.bias torch.Size([768])
594
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.output.LayerNorm.weight torch.Size([768])
595
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.output.LayerNorm.bias torch.Size([768])
596
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.intermediate_query.dense.weight torch.Size([3072, 768])
597
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.intermediate_query.dense.bias torch.Size([3072])
598
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.output_query.dense.weight torch.Size([768, 3072])
599
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.output_query.dense.bias torch.Size([768])
600
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.output_query.LayerNorm.weight torch.Size([768])
601
+ base_model.model.model.al_projector.audio_Qformer.bert.encoder.layer.1.output_query.LayerNorm.bias torch.Size([768])
602
+ base_model.model.model.al_projector.audio_Qformer.cls.predictions.bias torch.Size([30522])
603
+ base_model.model.model.al_projector.audio_Qformer.cls.predictions.transform.dense.weight torch.Size([768, 768])
604
+ base_model.model.model.al_projector.audio_Qformer.cls.predictions.transform.dense.bias torch.Size([768])
605
+ base_model.model.model.al_projector.audio_Qformer.cls.predictions.transform.LayerNorm.weight torch.Size([768])
606
+ base_model.model.model.al_projector.audio_Qformer.cls.predictions.transform.LayerNorm.bias torch.Size([768])
607
+ base_model.model.model.al_projector.audio_proj.0.weight torch.Size([4096, 768])
608
+ base_model.model.model.al_projector.audio_proj.0.bias torch.Size([4096])
609
+ base_model.model.model.al_projector.audio_proj.2.weight torch.Size([4096, 4096])
610
+ base_model.model.model.al_projector.audio_proj.2.bias torch.Size([4096])
611
+ base_model.model.lm_head.weight torch.Size([32028, 4096])
612
+ trainable_params: 488.648MB
non_lora_trainables.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54f177714a6a06e6d5564e47a87dacb11359eef4fae90efe93a4d3efa044ae61
3
+ size 817450285
saved_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_args": {
3
+ "model_name_or_path": "/home/panwen.hu/workspace1/jinxing.zhou/mllm/Crab/pretrained_weights/Llama-2-7b-chat-hf",
4
+ "freeze_backbone": true,
5
+ "llm_name": "llama",
6
+ "vit_ckpt_path": "/home/panwen.hu/workspace1/jinxing.zhou/mllm/Crab/pretrained_weights/clip-vit-large-patch14",
7
+ "select_feature": "patch",
8
+ "image_size": 224,
9
+ "patch_size": 14,
10
+ "visual_query_token_nums": 32,
11
+ "BEATs_ckpt_path": "/home/panwen.hu/workspace1/jinxing.zhou/mllm/Crab/pretrained_weights/BEATs_iter3_plus_AS2M_finetuned_on_AS2M_cpt2.pt",
12
+ "audio_query_token_nums": 32,
13
+ "prompt_embed_dim": 256,
14
+ "mask_decoder_transformer_depth": 2,
15
+ "low_res_mask_size": 112,
16
+ "image_scale_nums": 2,
17
+ "token_nums_per_scale": 3,
18
+ "avs_query_num": 300,
19
+ "num_classes": 1,
20
+ "query_generator_num_layers": 2
21
+ },
22
+ "data_args": {
23
+ "video_frame_nums": 10,
24
+ "image_caption_task": false,
25
+ "video_caption_task": false,
26
+ "audio_caption_task": false,
27
+ "segmentation_task": false,
28
+ "avqa_task": false,
29
+ "ave_task": false,
30
+ "avvp_task": false,
31
+ "arig_task": false,
32
+ "ms3_task": false,
33
+ "s4_task": false,
34
+ "avss_task": false,
35
+ "avcap_task": false,
36
+ "ref_avs_task": true,
37
+ "refavs_meta_csv_path": "../MQ-RAVSBench/train_test_meta_files/metadata.csv",
38
+ "refavs_test_image_json_path": "../MQ-RAVSBench/train_test_meta_files/test_s_image_filtered.json",
39
+ "refavs_test_u_image_json_path": "../MQ-RAVSBench/train_test_meta_files/test_u_image_filtered.json",
40
+ "refavs_test_video_json_path": "../MQ-RAVSBench/train_test_meta_files/test_s_video_filtered.json",
41
+ "refavs_test_u_video_json_path": "../MQ-RAVSBench/train_test_meta_files/test_u_video_filtered.json",
42
+ "refavs_cot_json_path": "../MQ-RAVSBench/train_test_meta_files/train_audit_only_filtered.json",
43
+ "refavs_data_root": "../MQ-RAVSBench",
44
+ "refavs_eval_mode": "image",
45
+ "refavs_mask_type_filter": "all",
46
+ "refavs_mask_rank_filter": -1,
47
+ "refavs_mask_encode_mode": "mask_and_masked_frame",
48
+ "refavs_pos_ratio": 0.5,
49
+ "multi_frames": false,
50
+ "data_path": null,
51
+ "model_max_length": 512
52
+ },
53
+ "training_args": {
54
+ "output_dir": "results_epoch96/Auditonly_mask_and_masked_frame/epochs96_lr1e-4_bs4_gradacc8_lora_r32alpha64_pos0.5_ioulosswei0",
55
+ "overwrite_output_dir": false,
56
+ "do_train": false,
57
+ "do_eval": false,
58
+ "do_predict": false,
59
+ "evaluation_strategy": "no",
60
+ "prediction_loss_only": false,
61
+ "per_device_train_batch_size": 4,
62
+ "per_device_eval_batch_size": 4,
63
+ "per_gpu_train_batch_size": null,
64
+ "per_gpu_eval_batch_size": null,
65
+ "gradient_accumulation_steps": 8,
66
+ "eval_accumulation_steps": null,
67
+ "eval_delay": 0,
68
+ "learning_rate": 0.0001,
69
+ "weight_decay": 0.0,
70
+ "adam_beta1": 0.9,
71
+ "adam_beta2": 0.999,
72
+ "adam_epsilon": 1e-08,
73
+ "max_grad_norm": 1.0,
74
+ "num_train_epochs": 96.0,
75
+ "max_steps": -1,
76
+ "lr_scheduler_type": "cosine",
77
+ "lr_scheduler_kwargs": {},
78
+ "warmup_ratio": 0.03,
79
+ "warmup_steps": 0,
80
+ "log_level": "passive",
81
+ "log_level_replica": "warning",
82
+ "log_on_each_node": true,
83
+ "logging_dir": "results_epoch96/Auditonly_mask_and_masked_frame/epochs96_lr1e-4_bs4_gradacc8_lora_r32alpha64_pos0.5_ioulosswei0/runs/May18_17-20-46_gpu-17",
84
+ "logging_strategy": "steps",
85
+ "logging_first_step": false,
86
+ "logging_steps": 1.0,
87
+ "logging_nan_inf_filter": true,
88
+ "save_strategy": "epoch",
89
+ "save_steps": -1.0,
90
+ "save_total_limit": 96,
91
+ "save_safetensors": true,
92
+ "save_on_each_node": false,
93
+ "save_only_model": false,
94
+ "no_cuda": false,
95
+ "use_cpu": false,
96
+ "use_mps_device": false,
97
+ "seed": 42,
98
+ "data_seed": null,
99
+ "jit_mode_eval": false,
100
+ "use_ipex": false,
101
+ "bf16": true,
102
+ "fp16": false,
103
+ "fp16_opt_level": "O1",
104
+ "half_precision_backend": "auto",
105
+ "bf16_full_eval": false,
106
+ "fp16_full_eval": false,
107
+ "tf32": false,
108
+ "local_rank": 0,
109
+ "ddp_backend": null,
110
+ "tpu_num_cores": null,
111
+ "tpu_metrics_debug": false,
112
+ "debug": [],
113
+ "dataloader_drop_last": false,
114
+ "eval_steps": null,
115
+ "dataloader_num_workers": 4,
116
+ "past_index": -1,
117
+ "run_name": "results_epoch96/Auditonly_mask_and_masked_frame/epochs96_lr1e-4_bs4_gradacc8_lora_r32alpha64_pos0.5_ioulosswei0",
118
+ "disable_tqdm": false,
119
+ "remove_unused_columns": false,
120
+ "label_names": null,
121
+ "load_best_model_at_end": false,
122
+ "metric_for_best_model": null,
123
+ "greater_is_better": null,
124
+ "ignore_data_skip": false,
125
+ "fsdp": [],
126
+ "fsdp_min_num_params": 0,
127
+ "fsdp_config": {
128
+ "min_num_params": 0,
129
+ "xla": false,
130
+ "xla_fsdp_grad_ckpt": false
131
+ },
132
+ "fsdp_transformer_layer_cls_to_wrap": null,
133
+ "deepspeed": "deepspeed/stage2-offload.json",
134
+ "label_smoothing_factor": 0.0,
135
+ "optim": "adamw_torch",
136
+ "optim_args": null,
137
+ "adafactor": false,
138
+ "group_by_length": false,
139
+ "length_column_name": "length",
140
+ "report_to": [
141
+ "tensorboard",
142
+ "wandb"
143
+ ],
144
+ "ddp_find_unused_parameters": true,
145
+ "ddp_bucket_cap_mb": null,
146
+ "ddp_broadcast_buffers": null,
147
+ "dataloader_pin_memory": true,
148
+ "dataloader_persistent_workers": false,
149
+ "skip_memory_metrics": true,
150
+ "use_legacy_prediction_loop": false,
151
+ "push_to_hub": false,
152
+ "resume_from_checkpoint": null,
153
+ "hub_model_id": null,
154
+ "hub_strategy": "every_save",
155
+ "hub_token": null,
156
+ "hub_private_repo": false,
157
+ "hub_always_push": false,
158
+ "gradient_checkpointing": true,
159
+ "gradient_checkpointing_kwargs": null,
160
+ "include_inputs_for_metrics": false,
161
+ "fp16_backend": "auto",
162
+ "push_to_hub_model_id": null,
163
+ "push_to_hub_organization": null,
164
+ "push_to_hub_token": null,
165
+ "_n_gpu": 1,
166
+ "mp_parameters": "",
167
+ "auto_find_batch_size": false,
168
+ "full_determinism": false,
169
+ "torchdynamo": null,
170
+ "ray_scope": "last",
171
+ "ddp_timeout": 1800,
172
+ "torch_compile": false,
173
+ "torch_compile_backend": null,
174
+ "torch_compile_mode": null,
175
+ "dispatch_batches": null,
176
+ "split_batches": false,
177
+ "include_tokens_per_second": false,
178
+ "include_num_input_tokens_seen": false,
179
+ "neftune_noise_alpha": null,
180
+ "mm_projector_lr": null,
181
+ "freeze_mm_mlp_adapter": false,
182
+ "cache_dir": null,
183
+ "group_by_modality_length": false,
184
+ "double_quant": true,
185
+ "quant_type": "nf4",
186
+ "bits": 32,
187
+ "lora_enable": true,
188
+ "lora_r": 32,
189
+ "lora_alpha": 64,
190
+ "lora_dropout": 0.05,
191
+ "lora_weight_path": "",
192
+ "lora_bias": "none",
193
+ "ce_loss_weight": 1.0,
194
+ "dice_loss_weight": 0.5,
195
+ "bce_loss_weight": 1.0,
196
+ "iou_loss_weight": 0.0,
197
+ "audio_branch": true,
198
+ "visual_branch": true,
199
+ "seg_branch": false,
200
+ "pretrain_ckpt_dir": "/home/panwen.hu/workspace1/jinxing.zhou/mllm/Crab/pretrained_weights",
201
+ "finetune_ckpt_dir": "",
202
+ "save_modules": "vl_projector,al_projector,lora",
203
+ "exp_desc": "exp",
204
+ "use_process": true,
205
+ "use_hyper_lora": true
206
+ }
207
+ }
trainer_state.json ADDED
The diff for this file is too large to render. See raw diff