yiyangd committed
Commit b40c7c4 · verified · 1 parent: d07b5c7

Add files using upload-large-folder tool

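The commit message above refers to the Hub's large-folder upload flow. As a rough sketch (not the uploader's actual invocation), the same kind of resumable upload can be driven from Python via huggingface_hub; the repo id and local path below are placeholders:

from huggingface_hub import HfApi

api = HfApi()
api.upload_large_folder(
    repo_id="your-username/your-model-repo",  # placeholder, not this repo's id
    folder_path="./training_output",          # placeholder local checkpoint folder
    repo_type="model",
)

upload_large_folder splits the work into resumable chunks, which is why it is generally preferred over a plain folder upload for multi-GB checkpoints like the ones listed below.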
Files changed (46)
  1. .gitattributes +1 -0
  2. added_tokens.json +38 -0
  3. chat_template.jinja +6 -0
  4. config.json +117 -0
  5. generation_config.json +9 -0
  6. global_step6250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  7. global_step6250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  8. global_step6250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  9. global_step6250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  10. global_step6250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  11. global_step6250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  12. global_step6250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  13. global_step6250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  14. global_step6250/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
  15. global_step6250/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
  16. global_step6250/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
  17. global_step6250/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
  18. global_step6250/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
  19. global_step6250/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
  20. global_step6250/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
  21. global_step6250/zero_pp_rank_7_mp_rank_00_model_states.pt +3 -0
  22. latest +1 -0
  23. merges.txt +0 -0
  24. model.safetensors +3 -0
  25. output_dir_lr1e-6/model.safetensors +3 -0
  26. output_dir_lr1e-6/original_keys.txt +753 -0
  27. output_dir_lr1e-6/output_keys.txt +753 -0
  28. preprocessor_config.json +35 -0
  29. processor_config.json +4 -0
  30. rng_state_0.pth +3 -0
  31. rng_state_1.pth +3 -0
  32. rng_state_2.pth +3 -0
  33. rng_state_3.pth +3 -0
  34. rng_state_4.pth +3 -0
  35. rng_state_5.pth +3 -0
  36. rng_state_6.pth +3 -0
  37. rng_state_7.pth +3 -0
  38. scheduler.pt +3 -0
  39. special_tokens_map.json +44 -0
  40. tokenizer.json +3 -0
  41. tokenizer_config.json +340 -0
  42. trainer_state.json +468 -0
  43. training_args.bin +3 -0
  44. video_preprocessor_config.json +38 -0
  45. vocab.json +0 -0
  46. zero_to_fp32.py +760 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "</box>": 151677,
+ "</img>": 151670,
+ "</quad>": 151673,
+ "</ref>": 151675,
+ "</think>": 151668,
+ "</tool_call>": 151658,
+ "</tool_response>": 151666,
+ "<IMG_CONTEXT>": 151671,
+ "<box>": 151676,
+ "<img>": 151669,
+ "<quad>": 151672,
+ "<ref>": 151674,
+ "<think>": 151667,
+ "<tool_call>": 151657,
+ "<tool_response>": 151665,
+ "<video>": 151678,
+ "<|box_end|>": 151649,
+ "<|box_start|>": 151648,
+ "<|endoftext|>": 151643,
+ "<|file_sep|>": 151664,
+ "<|fim_middle|>": 151660,
+ "<|fim_pad|>": 151662,
+ "<|fim_prefix|>": 151659,
+ "<|fim_suffix|>": 151661,
+ "<|im_end|>": 151645,
+ "<|im_start|>": 151644,
+ "<|image_pad|>": 151655,
+ "<|object_ref_end|>": 151647,
+ "<|object_ref_start|>": 151646,
+ "<|quad_end|>": 151651,
+ "<|quad_start|>": 151650,
+ "<|repo_name|>": 151663,
+ "<|video_pad|>": 151656,
+ "<|vision_end|>": 151653,
+ "<|vision_pad|>": 151654,
+ "<|vision_start|>": 151652
+ }
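A quick way to sanity-check these additions is to load the tokenizer shipped in this repo and confirm a few of the ids above; a minimal sketch, with the repo path left as a placeholder:

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("path/to/this/repo")  # placeholder path
print(tok.convert_tokens_to_ids("<IMG_CONTEXT>"))  # expected 151671 per added_tokens.json
print(tok.convert_tokens_to_ids("<video>"))        # expected 151678
print(tok.convert_tokens_to_ids("<|im_end|>"))     # expected 151645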
chat_template.jinja ADDED
@@ -0,0 +1,6 @@
+ {% for message in messages %}{{'<|im_start|>' + message['role'] + '
+ '}}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<IMG_CONTEXT>
+ ' }}{% elif content['type'] == 'video' %}{{ '<video>
+ ' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{'<|im_end|>
+ '}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant
+ ' }}{% endif %}
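The template emits '<|im_start|>{role}', then <IMG_CONTEXT> for image items, <video> for video items, the raw text for text items, and '<|im_end|>', with an optional trailing assistant header. A minimal rendering sketch; the repo path is a placeholder, and it assumes the processor in this repo exposes apply_chat_template:

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/this/repo")  # placeholder path
messages = [
    {"role": "user", "content": [
        {"type": "image"},
        {"type": "text", "text": "Describe this image."},
    ]},
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
print(prompt)
# <|im_start|>user
# <IMG_CONTEXT>
# Describe this image.<|im_end|>
# <|im_start|>assistant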
config.json ADDED
@@ -0,0 +1,117 @@
+ {
+ "architectures": [
+ "InternVLForConditionalGeneration"
+ ],
+ "downsample_ratio": 0.5,
+ "dtype": "bfloat16",
+ "eos_token_id": 151645,
+ "hidden_size": 1024,
+ "image_seq_length": 256,
+ "image_token_id": 151671,
+ "model_type": "internvl",
+ "pad_token_id": 151643,
+ "projector_hidden_act": "gelu",
+ "text_config": {
+ "_name_or_path": "/root/codespace/checkpoints/Qwen3-0.6B",
+ "architectures": [
+ "Qwen3ForCausalLM"
+ ],
+ "attention_bias": false,
+ "attention_dropout": 0.0,
+ "bos_token_id": 151643,
+ "debug": false,
+ "dtype": "float32",
+ "eos_token_id": 151645,
+ "ep_size": 1,
+ "head_dim": 128,
+ "hidden_act": "silu",
+ "hidden_size": 1024,
+ "initializer_range": 0.02,
+ "intermediate_size": 3072,
+ "layer_types": [
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention",
+ "full_attention"
+ ],
+ "max_position_embeddings": 40960,
+ "max_window_layers": 28,
+ "micro_forward": false,
+ "model_type": "qwen3",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 28,
+ "num_key_value_heads": 8,
+ "rms_norm_eps": 1e-06,
+ "rope_scaling": null,
+ "rope_theta": 1000000,
+ "skip_checkpoint": false,
+ "sliding_window": null,
+ "use_cache": false,
+ "use_deepep": false,
+ "use_sliding_window": false,
+ "vocab_size": 151936
+ },
+ "transformers_version": "4.56.1",
+ "use_cache": false,
+ "vision_config": {
+ "architectures": [
+ "InternVisionModel"
+ ],
+ "attention_bias": true,
+ "attention_dropout": 0.0,
+ "dropout": 0.0,
+ "dtype": "float32",
+ "hidden_act": "gelu",
+ "hidden_dropout_prob": 0.0,
+ "hidden_size": 1024,
+ "image_size": [
+ 448,
+ 448
+ ],
+ "initializer_factor": 0.1,
+ "initializer_range": 1e-10,
+ "intermediate_size": 4096,
+ "layer_norm_eps": 1e-06,
+ "layer_scale_init_value": 0.1,
+ "model_type": "internvl_vision",
+ "norm_type": "layer_norm",
+ "num_attention_heads": 16,
+ "num_channels": 3,
+ "num_hidden_layers": 24,
+ "patch_size": [
+ 14,
+ 14
+ ],
+ "projection_dropout": 0.0,
+ "use_absolute_position_embeddings": true,
+ "use_mask_token": false,
+ "use_mean_pooling": true,
+ "use_qk_norm": false
+ },
+ "vision_feature_layer": -1,
+ "vision_feature_select_strategy": "default"
+ }
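The config describes a composite InternVL-style model: a 28-layer Qwen3 text backbone (hidden size 1024) plus a 24-layer InternViT-style vision tower on 448x448 inputs with 14x14 patches, joined by a GELU projector with downsample_ratio 0.5 (32x32 patches downsampled to 256 image tokens per tile, matching image_seq_length). A small inspection sketch, with the repo path as a placeholder:

from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("path/to/this/repo")  # placeholder path
print(cfg.model_type)                       # internvl
print(cfg.text_config.num_hidden_layers)    # 28 (Qwen3-0.6B backbone)
print(cfg.vision_config.num_hidden_layers)  # 24
print(cfg.image_seq_length)                 # 256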
generation_config.json ADDED
@@ -0,0 +1,9 @@
+ {
+ "_from_model_config": true,
+ "eos_token_id": [
+ 151645,
+ 151645
+ ],
+ "pad_token_id": 151643,
+ "transformers_version": "4.56.1"
+ }
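These defaults (eos 151645 = <|im_end|>, pad 151643 = <|endoftext|>) are what generation falls back to when no overrides are passed; a minimal sketch to read them back, with the repo path as a placeholder:

from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("path/to/this/repo")  # placeholder path
print(gen_cfg.eos_token_id)  # [151645, 151645] as stored above
print(gen_cfg.pad_token_id)  # 151643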
global_step6250/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fc4ea2b80afe9976cb29bd706b9686b715778fe2278451ef9dc303e7aee7cd7
+ size 1127453701
global_step6250/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:942fb9c5c5e107591a9fcbf2ad1191271dc488738a0076290b821c069c938ae3
+ size 1127453701
global_step6250/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c21df0eb354013f97854262c768bbc639b0410a9d6f16f4acd925c60dad758f3
+ size 1127453701
global_step6250/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9aaef84a3fa39fe8eacd70445721247eb4882d3000287f9186634623fffa2c13
+ size 1127453701
global_step6250/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e6c8b93a51896a00833d239a254b07a4b6d2bc0e42fe8cdcebb2e0c4cf3aa7b7
+ size 1127453701
global_step6250/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:c4e7a149a334bd8da281ff9d752a6e4103d501e69252d8778127ca6ee04d026e
+ size 1127453701
global_step6250/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3704c9cbe73637a988fd23126bdc6d61071cd0a627727393ab9de567321d0811
+ size 1127453701
global_step6250/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9d5fa1fe9f840712ec0437b9f366c040a218fd998dcec72774bda9bd3fbc744c
+ size 1127453701
global_step6250/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:155708de228bf3248d0418ff9236edfeececb42f44bdc780b478dc1f4e22e9d2
+ size 77858715
global_step6250/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:013755cef1819326c3fac18245cf9a5077ae803e5f469eede9ba54213568a504
+ size 77858715
global_step6250/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:62243e4211cc4f7fdd376fd46b753939c5cdef29ec70a24e29ac4c72177d6013
+ size 77858715
global_step6250/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:21fa1a5ea82501e3dddfa10890b08c46c004a2fda36b5388a01fe9e1bb8ca210
+ size 77858715
global_step6250/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f5e90be76a90ca797fb3df208cfd95c031eec79d8b6705aa24481e3ff494210c
+ size 77858715
global_step6250/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c54613f4e6ecefbdba2a6cccf7c95c9fb5335c1006482a249027ec0c25ae016
+ size 77858715
global_step6250/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:090f37f33d3e3153788e6992a2ba2128fa0a54545c359809236bb8faaaa39635
+ size 77858715
global_step6250/zero_pp_rank_7_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:db44dff0979f11724e143c149839e12e090c99bae6291904f4b8741301967628
+ size 77858715
latest ADDED
@@ -0,0 +1 @@
+ global_step6250
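`latest` is DeepSpeed's checkpoint tag pointer: it names the global_step6250/ directory that holds the ZeRO-sharded optimizer and model states listed above. The repo also ships zero_to_fp32.py for the command-line consolidation flow; a rough sketch of the equivalent Python call, assuming DeepSpeed is installed and with paths as placeholders:

from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint

# Consolidate the 8-way ZeRO shards under global_step6250/ into one fp32 state dict.
state_dict = get_fp32_state_dict_from_zero_checkpoint(
    "path/to/this/repo",    # checkpoint root containing `latest` and global_step6250/
    tag="global_step6250",  # optional; read from `latest` when omitted
)
print(len(state_dict), "consolidated tensors")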
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:68a24497177bdb216ad5eabe436b39014d974cfc4a1c49c1636481816ffb5bc2
+ size 2121890856
output_dir_lr1e-6/model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f24f8db4ae487d08b5f7379f5c637baccfffaf83351dea6f266cce7fc07f5efb
+ size 3625158224
output_dir_lr1e-6/original_keys.txt ADDED
@@ -0,0 +1,753 @@
1
+ language_model.lm_head.weight
2
+ language_model.model.embed_tokens.weight
3
+ language_model.model.layers.0.input_layernorm.weight
4
+ language_model.model.layers.0.mlp.down_proj.weight
5
+ language_model.model.layers.0.mlp.gate_proj.weight
6
+ language_model.model.layers.0.mlp.up_proj.weight
7
+ language_model.model.layers.0.post_attention_layernorm.weight
8
+ language_model.model.layers.0.self_attn.k_norm.weight
9
+ language_model.model.layers.0.self_attn.k_proj.weight
10
+ language_model.model.layers.0.self_attn.o_proj.weight
11
+ language_model.model.layers.0.self_attn.q_norm.weight
12
+ language_model.model.layers.0.self_attn.q_proj.weight
13
+ language_model.model.layers.0.self_attn.v_proj.weight
14
+ language_model.model.layers.1.input_layernorm.weight
15
+ language_model.model.layers.1.mlp.down_proj.weight
16
+ language_model.model.layers.1.mlp.gate_proj.weight
17
+ language_model.model.layers.1.mlp.up_proj.weight
18
+ language_model.model.layers.1.post_attention_layernorm.weight
19
+ language_model.model.layers.1.self_attn.k_norm.weight
20
+ language_model.model.layers.1.self_attn.k_proj.weight
21
+ language_model.model.layers.1.self_attn.o_proj.weight
22
+ language_model.model.layers.1.self_attn.q_norm.weight
23
+ language_model.model.layers.1.self_attn.q_proj.weight
24
+ language_model.model.layers.1.self_attn.v_proj.weight
25
+ language_model.model.layers.10.input_layernorm.weight
26
+ language_model.model.layers.10.mlp.down_proj.weight
27
+ language_model.model.layers.10.mlp.gate_proj.weight
28
+ language_model.model.layers.10.mlp.up_proj.weight
29
+ language_model.model.layers.10.post_attention_layernorm.weight
30
+ language_model.model.layers.10.self_attn.k_norm.weight
31
+ language_model.model.layers.10.self_attn.k_proj.weight
32
+ language_model.model.layers.10.self_attn.o_proj.weight
33
+ language_model.model.layers.10.self_attn.q_norm.weight
34
+ language_model.model.layers.10.self_attn.q_proj.weight
35
+ language_model.model.layers.10.self_attn.v_proj.weight
36
+ language_model.model.layers.11.input_layernorm.weight
37
+ language_model.model.layers.11.mlp.down_proj.weight
38
+ language_model.model.layers.11.mlp.gate_proj.weight
39
+ language_model.model.layers.11.mlp.up_proj.weight
40
+ language_model.model.layers.11.post_attention_layernorm.weight
41
+ language_model.model.layers.11.self_attn.k_norm.weight
42
+ language_model.model.layers.11.self_attn.k_proj.weight
43
+ language_model.model.layers.11.self_attn.o_proj.weight
44
+ language_model.model.layers.11.self_attn.q_norm.weight
45
+ language_model.model.layers.11.self_attn.q_proj.weight
46
+ language_model.model.layers.11.self_attn.v_proj.weight
47
+ language_model.model.layers.12.input_layernorm.weight
48
+ language_model.model.layers.12.mlp.down_proj.weight
49
+ language_model.model.layers.12.mlp.gate_proj.weight
50
+ language_model.model.layers.12.mlp.up_proj.weight
51
+ language_model.model.layers.12.post_attention_layernorm.weight
52
+ language_model.model.layers.12.self_attn.k_norm.weight
53
+ language_model.model.layers.12.self_attn.k_proj.weight
54
+ language_model.model.layers.12.self_attn.o_proj.weight
55
+ language_model.model.layers.12.self_attn.q_norm.weight
56
+ language_model.model.layers.12.self_attn.q_proj.weight
57
+ language_model.model.layers.12.self_attn.v_proj.weight
58
+ language_model.model.layers.13.input_layernorm.weight
59
+ language_model.model.layers.13.mlp.down_proj.weight
60
+ language_model.model.layers.13.mlp.gate_proj.weight
61
+ language_model.model.layers.13.mlp.up_proj.weight
62
+ language_model.model.layers.13.post_attention_layernorm.weight
63
+ language_model.model.layers.13.self_attn.k_norm.weight
64
+ language_model.model.layers.13.self_attn.k_proj.weight
65
+ language_model.model.layers.13.self_attn.o_proj.weight
66
+ language_model.model.layers.13.self_attn.q_norm.weight
67
+ language_model.model.layers.13.self_attn.q_proj.weight
68
+ language_model.model.layers.13.self_attn.v_proj.weight
69
+ language_model.model.layers.14.input_layernorm.weight
70
+ language_model.model.layers.14.mlp.down_proj.weight
71
+ language_model.model.layers.14.mlp.gate_proj.weight
72
+ language_model.model.layers.14.mlp.up_proj.weight
73
+ language_model.model.layers.14.post_attention_layernorm.weight
74
+ language_model.model.layers.14.self_attn.k_norm.weight
75
+ language_model.model.layers.14.self_attn.k_proj.weight
76
+ language_model.model.layers.14.self_attn.o_proj.weight
77
+ language_model.model.layers.14.self_attn.q_norm.weight
78
+ language_model.model.layers.14.self_attn.q_proj.weight
79
+ language_model.model.layers.14.self_attn.v_proj.weight
80
+ language_model.model.layers.15.input_layernorm.weight
81
+ language_model.model.layers.15.mlp.down_proj.weight
82
+ language_model.model.layers.15.mlp.gate_proj.weight
83
+ language_model.model.layers.15.mlp.up_proj.weight
84
+ language_model.model.layers.15.post_attention_layernorm.weight
85
+ language_model.model.layers.15.self_attn.k_norm.weight
86
+ language_model.model.layers.15.self_attn.k_proj.weight
87
+ language_model.model.layers.15.self_attn.o_proj.weight
88
+ language_model.model.layers.15.self_attn.q_norm.weight
89
+ language_model.model.layers.15.self_attn.q_proj.weight
90
+ language_model.model.layers.15.self_attn.v_proj.weight
91
+ language_model.model.layers.16.input_layernorm.weight
92
+ language_model.model.layers.16.mlp.down_proj.weight
93
+ language_model.model.layers.16.mlp.gate_proj.weight
94
+ language_model.model.layers.16.mlp.up_proj.weight
95
+ language_model.model.layers.16.post_attention_layernorm.weight
96
+ language_model.model.layers.16.self_attn.k_norm.weight
97
+ language_model.model.layers.16.self_attn.k_proj.weight
98
+ language_model.model.layers.16.self_attn.o_proj.weight
99
+ language_model.model.layers.16.self_attn.q_norm.weight
100
+ language_model.model.layers.16.self_attn.q_proj.weight
101
+ language_model.model.layers.16.self_attn.v_proj.weight
102
+ language_model.model.layers.17.input_layernorm.weight
103
+ language_model.model.layers.17.mlp.down_proj.weight
104
+ language_model.model.layers.17.mlp.gate_proj.weight
105
+ language_model.model.layers.17.mlp.up_proj.weight
106
+ language_model.model.layers.17.post_attention_layernorm.weight
107
+ language_model.model.layers.17.self_attn.k_norm.weight
108
+ language_model.model.layers.17.self_attn.k_proj.weight
109
+ language_model.model.layers.17.self_attn.o_proj.weight
110
+ language_model.model.layers.17.self_attn.q_norm.weight
111
+ language_model.model.layers.17.self_attn.q_proj.weight
112
+ language_model.model.layers.17.self_attn.v_proj.weight
113
+ language_model.model.layers.18.input_layernorm.weight
114
+ language_model.model.layers.18.mlp.down_proj.weight
115
+ language_model.model.layers.18.mlp.gate_proj.weight
116
+ language_model.model.layers.18.mlp.up_proj.weight
117
+ language_model.model.layers.18.post_attention_layernorm.weight
118
+ language_model.model.layers.18.self_attn.k_norm.weight
119
+ language_model.model.layers.18.self_attn.k_proj.weight
120
+ language_model.model.layers.18.self_attn.o_proj.weight
121
+ language_model.model.layers.18.self_attn.q_norm.weight
122
+ language_model.model.layers.18.self_attn.q_proj.weight
123
+ language_model.model.layers.18.self_attn.v_proj.weight
124
+ language_model.model.layers.19.input_layernorm.weight
125
+ language_model.model.layers.19.mlp.down_proj.weight
126
+ language_model.model.layers.19.mlp.gate_proj.weight
127
+ language_model.model.layers.19.mlp.up_proj.weight
128
+ language_model.model.layers.19.post_attention_layernorm.weight
129
+ language_model.model.layers.19.self_attn.k_norm.weight
130
+ language_model.model.layers.19.self_attn.k_proj.weight
131
+ language_model.model.layers.19.self_attn.o_proj.weight
132
+ language_model.model.layers.19.self_attn.q_norm.weight
133
+ language_model.model.layers.19.self_attn.q_proj.weight
134
+ language_model.model.layers.19.self_attn.v_proj.weight
135
+ language_model.model.layers.2.input_layernorm.weight
136
+ language_model.model.layers.2.mlp.down_proj.weight
137
+ language_model.model.layers.2.mlp.gate_proj.weight
138
+ language_model.model.layers.2.mlp.up_proj.weight
139
+ language_model.model.layers.2.post_attention_layernorm.weight
140
+ language_model.model.layers.2.self_attn.k_norm.weight
141
+ language_model.model.layers.2.self_attn.k_proj.weight
142
+ language_model.model.layers.2.self_attn.o_proj.weight
143
+ language_model.model.layers.2.self_attn.q_norm.weight
144
+ language_model.model.layers.2.self_attn.q_proj.weight
145
+ language_model.model.layers.2.self_attn.v_proj.weight
146
+ language_model.model.layers.20.input_layernorm.weight
147
+ language_model.model.layers.20.mlp.down_proj.weight
148
+ language_model.model.layers.20.mlp.gate_proj.weight
149
+ language_model.model.layers.20.mlp.up_proj.weight
150
+ language_model.model.layers.20.post_attention_layernorm.weight
151
+ language_model.model.layers.20.self_attn.k_norm.weight
152
+ language_model.model.layers.20.self_attn.k_proj.weight
153
+ language_model.model.layers.20.self_attn.o_proj.weight
154
+ language_model.model.layers.20.self_attn.q_norm.weight
155
+ language_model.model.layers.20.self_attn.q_proj.weight
156
+ language_model.model.layers.20.self_attn.v_proj.weight
157
+ language_model.model.layers.21.input_layernorm.weight
158
+ language_model.model.layers.21.mlp.down_proj.weight
159
+ language_model.model.layers.21.mlp.gate_proj.weight
160
+ language_model.model.layers.21.mlp.up_proj.weight
161
+ language_model.model.layers.21.post_attention_layernorm.weight
162
+ language_model.model.layers.21.self_attn.k_norm.weight
163
+ language_model.model.layers.21.self_attn.k_proj.weight
164
+ language_model.model.layers.21.self_attn.o_proj.weight
165
+ language_model.model.layers.21.self_attn.q_norm.weight
166
+ language_model.model.layers.21.self_attn.q_proj.weight
167
+ language_model.model.layers.21.self_attn.v_proj.weight
168
+ language_model.model.layers.22.input_layernorm.weight
169
+ language_model.model.layers.22.mlp.down_proj.weight
170
+ language_model.model.layers.22.mlp.gate_proj.weight
171
+ language_model.model.layers.22.mlp.up_proj.weight
172
+ language_model.model.layers.22.post_attention_layernorm.weight
173
+ language_model.model.layers.22.self_attn.k_norm.weight
174
+ language_model.model.layers.22.self_attn.k_proj.weight
175
+ language_model.model.layers.22.self_attn.o_proj.weight
176
+ language_model.model.layers.22.self_attn.q_norm.weight
177
+ language_model.model.layers.22.self_attn.q_proj.weight
178
+ language_model.model.layers.22.self_attn.v_proj.weight
179
+ language_model.model.layers.23.input_layernorm.weight
180
+ language_model.model.layers.23.mlp.down_proj.weight
181
+ language_model.model.layers.23.mlp.gate_proj.weight
182
+ language_model.model.layers.23.mlp.up_proj.weight
183
+ language_model.model.layers.23.post_attention_layernorm.weight
184
+ language_model.model.layers.23.self_attn.k_norm.weight
185
+ language_model.model.layers.23.self_attn.k_proj.weight
186
+ language_model.model.layers.23.self_attn.o_proj.weight
187
+ language_model.model.layers.23.self_attn.q_norm.weight
188
+ language_model.model.layers.23.self_attn.q_proj.weight
189
+ language_model.model.layers.23.self_attn.v_proj.weight
190
+ language_model.model.layers.24.input_layernorm.weight
191
+ language_model.model.layers.24.mlp.down_proj.weight
192
+ language_model.model.layers.24.mlp.gate_proj.weight
193
+ language_model.model.layers.24.mlp.up_proj.weight
194
+ language_model.model.layers.24.post_attention_layernorm.weight
195
+ language_model.model.layers.24.self_attn.k_norm.weight
196
+ language_model.model.layers.24.self_attn.k_proj.weight
197
+ language_model.model.layers.24.self_attn.o_proj.weight
198
+ language_model.model.layers.24.self_attn.q_norm.weight
199
+ language_model.model.layers.24.self_attn.q_proj.weight
200
+ language_model.model.layers.24.self_attn.v_proj.weight
201
+ language_model.model.layers.25.input_layernorm.weight
202
+ language_model.model.layers.25.mlp.down_proj.weight
203
+ language_model.model.layers.25.mlp.gate_proj.weight
204
+ language_model.model.layers.25.mlp.up_proj.weight
205
+ language_model.model.layers.25.post_attention_layernorm.weight
206
+ language_model.model.layers.25.self_attn.k_norm.weight
207
+ language_model.model.layers.25.self_attn.k_proj.weight
208
+ language_model.model.layers.25.self_attn.o_proj.weight
209
+ language_model.model.layers.25.self_attn.q_norm.weight
210
+ language_model.model.layers.25.self_attn.q_proj.weight
211
+ language_model.model.layers.25.self_attn.v_proj.weight
212
+ language_model.model.layers.26.input_layernorm.weight
213
+ language_model.model.layers.26.mlp.down_proj.weight
214
+ language_model.model.layers.26.mlp.gate_proj.weight
215
+ language_model.model.layers.26.mlp.up_proj.weight
216
+ language_model.model.layers.26.post_attention_layernorm.weight
217
+ language_model.model.layers.26.self_attn.k_norm.weight
218
+ language_model.model.layers.26.self_attn.k_proj.weight
219
+ language_model.model.layers.26.self_attn.o_proj.weight
220
+ language_model.model.layers.26.self_attn.q_norm.weight
221
+ language_model.model.layers.26.self_attn.q_proj.weight
222
+ language_model.model.layers.26.self_attn.v_proj.weight
223
+ language_model.model.layers.27.input_layernorm.weight
224
+ language_model.model.layers.27.mlp.down_proj.weight
225
+ language_model.model.layers.27.mlp.gate_proj.weight
226
+ language_model.model.layers.27.mlp.up_proj.weight
227
+ language_model.model.layers.27.post_attention_layernorm.weight
228
+ language_model.model.layers.27.self_attn.k_norm.weight
229
+ language_model.model.layers.27.self_attn.k_proj.weight
230
+ language_model.model.layers.27.self_attn.o_proj.weight
231
+ language_model.model.layers.27.self_attn.q_norm.weight
232
+ language_model.model.layers.27.self_attn.q_proj.weight
233
+ language_model.model.layers.27.self_attn.v_proj.weight
234
+ language_model.model.layers.3.input_layernorm.weight
235
+ language_model.model.layers.3.mlp.down_proj.weight
236
+ language_model.model.layers.3.mlp.gate_proj.weight
237
+ language_model.model.layers.3.mlp.up_proj.weight
238
+ language_model.model.layers.3.post_attention_layernorm.weight
239
+ language_model.model.layers.3.self_attn.k_norm.weight
240
+ language_model.model.layers.3.self_attn.k_proj.weight
241
+ language_model.model.layers.3.self_attn.o_proj.weight
242
+ language_model.model.layers.3.self_attn.q_norm.weight
243
+ language_model.model.layers.3.self_attn.q_proj.weight
244
+ language_model.model.layers.3.self_attn.v_proj.weight
245
+ language_model.model.layers.4.input_layernorm.weight
246
+ language_model.model.layers.4.mlp.down_proj.weight
247
+ language_model.model.layers.4.mlp.gate_proj.weight
248
+ language_model.model.layers.4.mlp.up_proj.weight
249
+ language_model.model.layers.4.post_attention_layernorm.weight
250
+ language_model.model.layers.4.self_attn.k_norm.weight
251
+ language_model.model.layers.4.self_attn.k_proj.weight
252
+ language_model.model.layers.4.self_attn.o_proj.weight
253
+ language_model.model.layers.4.self_attn.q_norm.weight
254
+ language_model.model.layers.4.self_attn.q_proj.weight
255
+ language_model.model.layers.4.self_attn.v_proj.weight
256
+ language_model.model.layers.5.input_layernorm.weight
257
+ language_model.model.layers.5.mlp.down_proj.weight
258
+ language_model.model.layers.5.mlp.gate_proj.weight
259
+ language_model.model.layers.5.mlp.up_proj.weight
260
+ language_model.model.layers.5.post_attention_layernorm.weight
261
+ language_model.model.layers.5.self_attn.k_norm.weight
262
+ language_model.model.layers.5.self_attn.k_proj.weight
263
+ language_model.model.layers.5.self_attn.o_proj.weight
264
+ language_model.model.layers.5.self_attn.q_norm.weight
265
+ language_model.model.layers.5.self_attn.q_proj.weight
266
+ language_model.model.layers.5.self_attn.v_proj.weight
267
+ language_model.model.layers.6.input_layernorm.weight
268
+ language_model.model.layers.6.mlp.down_proj.weight
269
+ language_model.model.layers.6.mlp.gate_proj.weight
270
+ language_model.model.layers.6.mlp.up_proj.weight
271
+ language_model.model.layers.6.post_attention_layernorm.weight
272
+ language_model.model.layers.6.self_attn.k_norm.weight
273
+ language_model.model.layers.6.self_attn.k_proj.weight
274
+ language_model.model.layers.6.self_attn.o_proj.weight
275
+ language_model.model.layers.6.self_attn.q_norm.weight
276
+ language_model.model.layers.6.self_attn.q_proj.weight
277
+ language_model.model.layers.6.self_attn.v_proj.weight
278
+ language_model.model.layers.7.input_layernorm.weight
279
+ language_model.model.layers.7.mlp.down_proj.weight
280
+ language_model.model.layers.7.mlp.gate_proj.weight
281
+ language_model.model.layers.7.mlp.up_proj.weight
282
+ language_model.model.layers.7.post_attention_layernorm.weight
283
+ language_model.model.layers.7.self_attn.k_norm.weight
284
+ language_model.model.layers.7.self_attn.k_proj.weight
285
+ language_model.model.layers.7.self_attn.o_proj.weight
286
+ language_model.model.layers.7.self_attn.q_norm.weight
287
+ language_model.model.layers.7.self_attn.q_proj.weight
288
+ language_model.model.layers.7.self_attn.v_proj.weight
289
+ language_model.model.layers.8.input_layernorm.weight
290
+ language_model.model.layers.8.mlp.down_proj.weight
291
+ language_model.model.layers.8.mlp.gate_proj.weight
292
+ language_model.model.layers.8.mlp.up_proj.weight
293
+ language_model.model.layers.8.post_attention_layernorm.weight
294
+ language_model.model.layers.8.self_attn.k_norm.weight
295
+ language_model.model.layers.8.self_attn.k_proj.weight
296
+ language_model.model.layers.8.self_attn.o_proj.weight
297
+ language_model.model.layers.8.self_attn.q_norm.weight
298
+ language_model.model.layers.8.self_attn.q_proj.weight
299
+ language_model.model.layers.8.self_attn.v_proj.weight
300
+ language_model.model.layers.9.input_layernorm.weight
301
+ language_model.model.layers.9.mlp.down_proj.weight
302
+ language_model.model.layers.9.mlp.gate_proj.weight
303
+ language_model.model.layers.9.mlp.up_proj.weight
304
+ language_model.model.layers.9.post_attention_layernorm.weight
305
+ language_model.model.layers.9.self_attn.k_norm.weight
306
+ language_model.model.layers.9.self_attn.k_proj.weight
307
+ language_model.model.layers.9.self_attn.o_proj.weight
308
+ language_model.model.layers.9.self_attn.q_norm.weight
309
+ language_model.model.layers.9.self_attn.q_proj.weight
310
+ language_model.model.layers.9.self_attn.v_proj.weight
311
+ language_model.model.norm.weight
312
+ multi_modal_projector.layer_norm.bias
313
+ multi_modal_projector.layer_norm.weight
314
+ multi_modal_projector.linear_1.bias
315
+ multi_modal_projector.linear_1.weight
316
+ multi_modal_projector.linear_2.bias
317
+ multi_modal_projector.linear_2.weight
318
+ vision_tower.embeddings.cls_token
319
+ vision_tower.embeddings.patch_embeddings.projection.bias
320
+ vision_tower.embeddings.patch_embeddings.projection.weight
321
+ vision_tower.embeddings.position_embeddings
322
+ vision_tower.encoder.layer.0.attention.k_proj.bias
323
+ vision_tower.encoder.layer.0.attention.k_proj.weight
324
+ vision_tower.encoder.layer.0.attention.projection_layer.bias
325
+ vision_tower.encoder.layer.0.attention.projection_layer.weight
326
+ vision_tower.encoder.layer.0.attention.q_proj.bias
327
+ vision_tower.encoder.layer.0.attention.q_proj.weight
328
+ vision_tower.encoder.layer.0.attention.v_proj.bias
329
+ vision_tower.encoder.layer.0.attention.v_proj.weight
330
+ vision_tower.encoder.layer.0.lambda_1
331
+ vision_tower.encoder.layer.0.lambda_2
332
+ vision_tower.encoder.layer.0.layernorm_after.bias
333
+ vision_tower.encoder.layer.0.layernorm_after.weight
334
+ vision_tower.encoder.layer.0.layernorm_before.bias
335
+ vision_tower.encoder.layer.0.layernorm_before.weight
336
+ vision_tower.encoder.layer.0.mlp.fc1.bias
337
+ vision_tower.encoder.layer.0.mlp.fc1.weight
338
+ vision_tower.encoder.layer.0.mlp.fc2.bias
339
+ vision_tower.encoder.layer.0.mlp.fc2.weight
340
+ vision_tower.encoder.layer.1.attention.k_proj.bias
341
+ vision_tower.encoder.layer.1.attention.k_proj.weight
342
+ vision_tower.encoder.layer.1.attention.projection_layer.bias
343
+ vision_tower.encoder.layer.1.attention.projection_layer.weight
344
+ vision_tower.encoder.layer.1.attention.q_proj.bias
345
+ vision_tower.encoder.layer.1.attention.q_proj.weight
346
+ vision_tower.encoder.layer.1.attention.v_proj.bias
347
+ vision_tower.encoder.layer.1.attention.v_proj.weight
348
+ vision_tower.encoder.layer.1.lambda_1
349
+ vision_tower.encoder.layer.1.lambda_2
350
+ vision_tower.encoder.layer.1.layernorm_after.bias
351
+ vision_tower.encoder.layer.1.layernorm_after.weight
352
+ vision_tower.encoder.layer.1.layernorm_before.bias
353
+ vision_tower.encoder.layer.1.layernorm_before.weight
354
+ vision_tower.encoder.layer.1.mlp.fc1.bias
355
+ vision_tower.encoder.layer.1.mlp.fc1.weight
356
+ vision_tower.encoder.layer.1.mlp.fc2.bias
357
+ vision_tower.encoder.layer.1.mlp.fc2.weight
358
+ vision_tower.encoder.layer.10.attention.k_proj.bias
359
+ vision_tower.encoder.layer.10.attention.k_proj.weight
360
+ vision_tower.encoder.layer.10.attention.projection_layer.bias
361
+ vision_tower.encoder.layer.10.attention.projection_layer.weight
362
+ vision_tower.encoder.layer.10.attention.q_proj.bias
363
+ vision_tower.encoder.layer.10.attention.q_proj.weight
364
+ vision_tower.encoder.layer.10.attention.v_proj.bias
365
+ vision_tower.encoder.layer.10.attention.v_proj.weight
366
+ vision_tower.encoder.layer.10.lambda_1
367
+ vision_tower.encoder.layer.10.lambda_2
368
+ vision_tower.encoder.layer.10.layernorm_after.bias
369
+ vision_tower.encoder.layer.10.layernorm_after.weight
370
+ vision_tower.encoder.layer.10.layernorm_before.bias
371
+ vision_tower.encoder.layer.10.layernorm_before.weight
372
+ vision_tower.encoder.layer.10.mlp.fc1.bias
373
+ vision_tower.encoder.layer.10.mlp.fc1.weight
374
+ vision_tower.encoder.layer.10.mlp.fc2.bias
375
+ vision_tower.encoder.layer.10.mlp.fc2.weight
376
+ vision_tower.encoder.layer.11.attention.k_proj.bias
377
+ vision_tower.encoder.layer.11.attention.k_proj.weight
378
+ vision_tower.encoder.layer.11.attention.projection_layer.bias
379
+ vision_tower.encoder.layer.11.attention.projection_layer.weight
380
+ vision_tower.encoder.layer.11.attention.q_proj.bias
381
+ vision_tower.encoder.layer.11.attention.q_proj.weight
382
+ vision_tower.encoder.layer.11.attention.v_proj.bias
383
+ vision_tower.encoder.layer.11.attention.v_proj.weight
384
+ vision_tower.encoder.layer.11.lambda_1
385
+ vision_tower.encoder.layer.11.lambda_2
386
+ vision_tower.encoder.layer.11.layernorm_after.bias
387
+ vision_tower.encoder.layer.11.layernorm_after.weight
388
+ vision_tower.encoder.layer.11.layernorm_before.bias
389
+ vision_tower.encoder.layer.11.layernorm_before.weight
390
+ vision_tower.encoder.layer.11.mlp.fc1.bias
391
+ vision_tower.encoder.layer.11.mlp.fc1.weight
392
+ vision_tower.encoder.layer.11.mlp.fc2.bias
393
+ vision_tower.encoder.layer.11.mlp.fc2.weight
394
+ vision_tower.encoder.layer.12.attention.k_proj.bias
395
+ vision_tower.encoder.layer.12.attention.k_proj.weight
396
+ vision_tower.encoder.layer.12.attention.projection_layer.bias
397
+ vision_tower.encoder.layer.12.attention.projection_layer.weight
398
+ vision_tower.encoder.layer.12.attention.q_proj.bias
399
+ vision_tower.encoder.layer.12.attention.q_proj.weight
400
+ vision_tower.encoder.layer.12.attention.v_proj.bias
401
+ vision_tower.encoder.layer.12.attention.v_proj.weight
402
+ vision_tower.encoder.layer.12.lambda_1
403
+ vision_tower.encoder.layer.12.lambda_2
404
+ vision_tower.encoder.layer.12.layernorm_after.bias
405
+ vision_tower.encoder.layer.12.layernorm_after.weight
406
+ vision_tower.encoder.layer.12.layernorm_before.bias
407
+ vision_tower.encoder.layer.12.layernorm_before.weight
408
+ vision_tower.encoder.layer.12.mlp.fc1.bias
409
+ vision_tower.encoder.layer.12.mlp.fc1.weight
410
+ vision_tower.encoder.layer.12.mlp.fc2.bias
411
+ vision_tower.encoder.layer.12.mlp.fc2.weight
412
+ vision_tower.encoder.layer.13.attention.k_proj.bias
413
+ vision_tower.encoder.layer.13.attention.k_proj.weight
414
+ vision_tower.encoder.layer.13.attention.projection_layer.bias
415
+ vision_tower.encoder.layer.13.attention.projection_layer.weight
416
+ vision_tower.encoder.layer.13.attention.q_proj.bias
417
+ vision_tower.encoder.layer.13.attention.q_proj.weight
418
+ vision_tower.encoder.layer.13.attention.v_proj.bias
419
+ vision_tower.encoder.layer.13.attention.v_proj.weight
420
+ vision_tower.encoder.layer.13.lambda_1
421
+ vision_tower.encoder.layer.13.lambda_2
422
+ vision_tower.encoder.layer.13.layernorm_after.bias
423
+ vision_tower.encoder.layer.13.layernorm_after.weight
424
+ vision_tower.encoder.layer.13.layernorm_before.bias
425
+ vision_tower.encoder.layer.13.layernorm_before.weight
426
+ vision_tower.encoder.layer.13.mlp.fc1.bias
427
+ vision_tower.encoder.layer.13.mlp.fc1.weight
428
+ vision_tower.encoder.layer.13.mlp.fc2.bias
429
+ vision_tower.encoder.layer.13.mlp.fc2.weight
430
+ vision_tower.encoder.layer.14.attention.k_proj.bias
431
+ vision_tower.encoder.layer.14.attention.k_proj.weight
432
+ vision_tower.encoder.layer.14.attention.projection_layer.bias
433
+ vision_tower.encoder.layer.14.attention.projection_layer.weight
434
+ vision_tower.encoder.layer.14.attention.q_proj.bias
435
+ vision_tower.encoder.layer.14.attention.q_proj.weight
436
+ vision_tower.encoder.layer.14.attention.v_proj.bias
437
+ vision_tower.encoder.layer.14.attention.v_proj.weight
438
+ vision_tower.encoder.layer.14.lambda_1
439
+ vision_tower.encoder.layer.14.lambda_2
440
+ vision_tower.encoder.layer.14.layernorm_after.bias
441
+ vision_tower.encoder.layer.14.layernorm_after.weight
442
+ vision_tower.encoder.layer.14.layernorm_before.bias
443
+ vision_tower.encoder.layer.14.layernorm_before.weight
444
+ vision_tower.encoder.layer.14.mlp.fc1.bias
445
+ vision_tower.encoder.layer.14.mlp.fc1.weight
446
+ vision_tower.encoder.layer.14.mlp.fc2.bias
447
+ vision_tower.encoder.layer.14.mlp.fc2.weight
448
+ vision_tower.encoder.layer.15.attention.k_proj.bias
449
+ vision_tower.encoder.layer.15.attention.k_proj.weight
450
+ vision_tower.encoder.layer.15.attention.projection_layer.bias
451
+ vision_tower.encoder.layer.15.attention.projection_layer.weight
452
+ vision_tower.encoder.layer.15.attention.q_proj.bias
453
+ vision_tower.encoder.layer.15.attention.q_proj.weight
454
+ vision_tower.encoder.layer.15.attention.v_proj.bias
455
+ vision_tower.encoder.layer.15.attention.v_proj.weight
456
+ vision_tower.encoder.layer.15.lambda_1
457
+ vision_tower.encoder.layer.15.lambda_2
458
+ vision_tower.encoder.layer.15.layernorm_after.bias
459
+ vision_tower.encoder.layer.15.layernorm_after.weight
460
+ vision_tower.encoder.layer.15.layernorm_before.bias
461
+ vision_tower.encoder.layer.15.layernorm_before.weight
462
+ vision_tower.encoder.layer.15.mlp.fc1.bias
463
+ vision_tower.encoder.layer.15.mlp.fc1.weight
464
+ vision_tower.encoder.layer.15.mlp.fc2.bias
465
+ vision_tower.encoder.layer.15.mlp.fc2.weight
466
+ vision_tower.encoder.layer.16.attention.k_proj.bias
467
+ vision_tower.encoder.layer.16.attention.k_proj.weight
468
+ vision_tower.encoder.layer.16.attention.projection_layer.bias
469
+ vision_tower.encoder.layer.16.attention.projection_layer.weight
470
+ vision_tower.encoder.layer.16.attention.q_proj.bias
471
+ vision_tower.encoder.layer.16.attention.q_proj.weight
472
+ vision_tower.encoder.layer.16.attention.v_proj.bias
473
+ vision_tower.encoder.layer.16.attention.v_proj.weight
474
+ vision_tower.encoder.layer.16.lambda_1
475
+ vision_tower.encoder.layer.16.lambda_2
476
+ vision_tower.encoder.layer.16.layernorm_after.bias
477
+ vision_tower.encoder.layer.16.layernorm_after.weight
478
+ vision_tower.encoder.layer.16.layernorm_before.bias
479
+ vision_tower.encoder.layer.16.layernorm_before.weight
480
+ vision_tower.encoder.layer.16.mlp.fc1.bias
481
+ vision_tower.encoder.layer.16.mlp.fc1.weight
482
+ vision_tower.encoder.layer.16.mlp.fc2.bias
483
+ vision_tower.encoder.layer.16.mlp.fc2.weight
484
+ vision_tower.encoder.layer.17.attention.k_proj.bias
485
+ vision_tower.encoder.layer.17.attention.k_proj.weight
486
+ vision_tower.encoder.layer.17.attention.projection_layer.bias
487
+ vision_tower.encoder.layer.17.attention.projection_layer.weight
488
+ vision_tower.encoder.layer.17.attention.q_proj.bias
489
+ vision_tower.encoder.layer.17.attention.q_proj.weight
490
+ vision_tower.encoder.layer.17.attention.v_proj.bias
491
+ vision_tower.encoder.layer.17.attention.v_proj.weight
492
+ vision_tower.encoder.layer.17.lambda_1
493
+ vision_tower.encoder.layer.17.lambda_2
494
+ vision_tower.encoder.layer.17.layernorm_after.bias
495
+ vision_tower.encoder.layer.17.layernorm_after.weight
496
+ vision_tower.encoder.layer.17.layernorm_before.bias
497
+ vision_tower.encoder.layer.17.layernorm_before.weight
498
+ vision_tower.encoder.layer.17.mlp.fc1.bias
499
+ vision_tower.encoder.layer.17.mlp.fc1.weight
500
+ vision_tower.encoder.layer.17.mlp.fc2.bias
501
+ vision_tower.encoder.layer.17.mlp.fc2.weight
502
+ vision_tower.encoder.layer.18.attention.k_proj.bias
503
+ vision_tower.encoder.layer.18.attention.k_proj.weight
504
+ vision_tower.encoder.layer.18.attention.projection_layer.bias
505
+ vision_tower.encoder.layer.18.attention.projection_layer.weight
506
+ vision_tower.encoder.layer.18.attention.q_proj.bias
507
+ vision_tower.encoder.layer.18.attention.q_proj.weight
508
+ vision_tower.encoder.layer.18.attention.v_proj.bias
509
+ vision_tower.encoder.layer.18.attention.v_proj.weight
510
+ vision_tower.encoder.layer.18.lambda_1
511
+ vision_tower.encoder.layer.18.lambda_2
512
+ vision_tower.encoder.layer.18.layernorm_after.bias
513
+ vision_tower.encoder.layer.18.layernorm_after.weight
514
+ vision_tower.encoder.layer.18.layernorm_before.bias
515
+ vision_tower.encoder.layer.18.layernorm_before.weight
516
+ vision_tower.encoder.layer.18.mlp.fc1.bias
517
+ vision_tower.encoder.layer.18.mlp.fc1.weight
518
+ vision_tower.encoder.layer.18.mlp.fc2.bias
519
+ vision_tower.encoder.layer.18.mlp.fc2.weight
520
+ vision_tower.encoder.layer.19.attention.k_proj.bias
521
+ vision_tower.encoder.layer.19.attention.k_proj.weight
522
+ vision_tower.encoder.layer.19.attention.projection_layer.bias
523
+ vision_tower.encoder.layer.19.attention.projection_layer.weight
524
+ vision_tower.encoder.layer.19.attention.q_proj.bias
525
+ vision_tower.encoder.layer.19.attention.q_proj.weight
526
+ vision_tower.encoder.layer.19.attention.v_proj.bias
527
+ vision_tower.encoder.layer.19.attention.v_proj.weight
528
+ vision_tower.encoder.layer.19.lambda_1
529
+ vision_tower.encoder.layer.19.lambda_2
530
+ vision_tower.encoder.layer.19.layernorm_after.bias
531
+ vision_tower.encoder.layer.19.layernorm_after.weight
532
+ vision_tower.encoder.layer.19.layernorm_before.bias
533
+ vision_tower.encoder.layer.19.layernorm_before.weight
534
+ vision_tower.encoder.layer.19.mlp.fc1.bias
535
+ vision_tower.encoder.layer.19.mlp.fc1.weight
536
+ vision_tower.encoder.layer.19.mlp.fc2.bias
537
+ vision_tower.encoder.layer.19.mlp.fc2.weight
538
+ vision_tower.encoder.layer.2.attention.k_proj.bias
539
+ vision_tower.encoder.layer.2.attention.k_proj.weight
540
+ vision_tower.encoder.layer.2.attention.projection_layer.bias
541
+ vision_tower.encoder.layer.2.attention.projection_layer.weight
542
+ vision_tower.encoder.layer.2.attention.q_proj.bias
543
+ vision_tower.encoder.layer.2.attention.q_proj.weight
544
+ vision_tower.encoder.layer.2.attention.v_proj.bias
545
+ vision_tower.encoder.layer.2.attention.v_proj.weight
546
+ vision_tower.encoder.layer.2.lambda_1
547
+ vision_tower.encoder.layer.2.lambda_2
548
+ vision_tower.encoder.layer.2.layernorm_after.bias
549
+ vision_tower.encoder.layer.2.layernorm_after.weight
550
+ vision_tower.encoder.layer.2.layernorm_before.bias
551
+ vision_tower.encoder.layer.2.layernorm_before.weight
552
+ vision_tower.encoder.layer.2.mlp.fc1.bias
553
+ vision_tower.encoder.layer.2.mlp.fc1.weight
554
+ vision_tower.encoder.layer.2.mlp.fc2.bias
555
+ vision_tower.encoder.layer.2.mlp.fc2.weight
556
+ vision_tower.encoder.layer.20.attention.k_proj.bias
557
+ vision_tower.encoder.layer.20.attention.k_proj.weight
558
+ vision_tower.encoder.layer.20.attention.projection_layer.bias
559
+ vision_tower.encoder.layer.20.attention.projection_layer.weight
560
+ vision_tower.encoder.layer.20.attention.q_proj.bias
561
+ vision_tower.encoder.layer.20.attention.q_proj.weight
562
+ vision_tower.encoder.layer.20.attention.v_proj.bias
563
+ vision_tower.encoder.layer.20.attention.v_proj.weight
564
+ vision_tower.encoder.layer.20.lambda_1
565
+ vision_tower.encoder.layer.20.lambda_2
566
+ vision_tower.encoder.layer.20.layernorm_after.bias
567
+ vision_tower.encoder.layer.20.layernorm_after.weight
568
+ vision_tower.encoder.layer.20.layernorm_before.bias
569
+ vision_tower.encoder.layer.20.layernorm_before.weight
570
+ vision_tower.encoder.layer.20.mlp.fc1.bias
571
+ vision_tower.encoder.layer.20.mlp.fc1.weight
572
+ vision_tower.encoder.layer.20.mlp.fc2.bias
573
+ vision_tower.encoder.layer.20.mlp.fc2.weight
574
+ vision_tower.encoder.layer.21.attention.k_proj.bias
575
+ vision_tower.encoder.layer.21.attention.k_proj.weight
576
+ vision_tower.encoder.layer.21.attention.projection_layer.bias
577
+ vision_tower.encoder.layer.21.attention.projection_layer.weight
578
+ vision_tower.encoder.layer.21.attention.q_proj.bias
579
+ vision_tower.encoder.layer.21.attention.q_proj.weight
580
+ vision_tower.encoder.layer.21.attention.v_proj.bias
581
+ vision_tower.encoder.layer.21.attention.v_proj.weight
582
+ vision_tower.encoder.layer.21.lambda_1
583
+ vision_tower.encoder.layer.21.lambda_2
584
+ vision_tower.encoder.layer.21.layernorm_after.bias
585
+ vision_tower.encoder.layer.21.layernorm_after.weight
586
+ vision_tower.encoder.layer.21.layernorm_before.bias
587
+ vision_tower.encoder.layer.21.layernorm_before.weight
588
+ vision_tower.encoder.layer.21.mlp.fc1.bias
589
+ vision_tower.encoder.layer.21.mlp.fc1.weight
590
+ vision_tower.encoder.layer.21.mlp.fc2.bias
591
+ vision_tower.encoder.layer.21.mlp.fc2.weight
592
+ vision_tower.encoder.layer.22.attention.k_proj.bias
593
+ vision_tower.encoder.layer.22.attention.k_proj.weight
594
+ vision_tower.encoder.layer.22.attention.projection_layer.bias
595
+ vision_tower.encoder.layer.22.attention.projection_layer.weight
596
+ vision_tower.encoder.layer.22.attention.q_proj.bias
597
+ vision_tower.encoder.layer.22.attention.q_proj.weight
598
+ vision_tower.encoder.layer.22.attention.v_proj.bias
599
+ vision_tower.encoder.layer.22.attention.v_proj.weight
600
+ vision_tower.encoder.layer.22.lambda_1
601
+ vision_tower.encoder.layer.22.lambda_2
602
+ vision_tower.encoder.layer.22.layernorm_after.bias
603
+ vision_tower.encoder.layer.22.layernorm_after.weight
604
+ vision_tower.encoder.layer.22.layernorm_before.bias
605
+ vision_tower.encoder.layer.22.layernorm_before.weight
606
+ vision_tower.encoder.layer.22.mlp.fc1.bias
607
+ vision_tower.encoder.layer.22.mlp.fc1.weight
608
+ vision_tower.encoder.layer.22.mlp.fc2.bias
609
+ vision_tower.encoder.layer.22.mlp.fc2.weight
610
+ vision_tower.encoder.layer.23.attention.k_proj.bias
611
+ vision_tower.encoder.layer.23.attention.k_proj.weight
612
+ vision_tower.encoder.layer.23.attention.projection_layer.bias
613
+ vision_tower.encoder.layer.23.attention.projection_layer.weight
614
+ vision_tower.encoder.layer.23.attention.q_proj.bias
615
+ vision_tower.encoder.layer.23.attention.q_proj.weight
616
+ vision_tower.encoder.layer.23.attention.v_proj.bias
617
+ vision_tower.encoder.layer.23.attention.v_proj.weight
618
+ vision_tower.encoder.layer.23.lambda_1
619
+ vision_tower.encoder.layer.23.lambda_2
620
+ vision_tower.encoder.layer.23.layernorm_after.bias
621
+ vision_tower.encoder.layer.23.layernorm_after.weight
622
+ vision_tower.encoder.layer.23.layernorm_before.bias
623
+ vision_tower.encoder.layer.23.layernorm_before.weight
624
+ vision_tower.encoder.layer.23.mlp.fc1.bias
625
+ vision_tower.encoder.layer.23.mlp.fc1.weight
626
+ vision_tower.encoder.layer.23.mlp.fc2.bias
627
+ vision_tower.encoder.layer.23.mlp.fc2.weight
628
+ vision_tower.encoder.layer.3.attention.k_proj.bias
629
+ vision_tower.encoder.layer.3.attention.k_proj.weight
630
+ vision_tower.encoder.layer.3.attention.projection_layer.bias
631
+ vision_tower.encoder.layer.3.attention.projection_layer.weight
632
+ vision_tower.encoder.layer.3.attention.q_proj.bias
633
+ vision_tower.encoder.layer.3.attention.q_proj.weight
634
+ vision_tower.encoder.layer.3.attention.v_proj.bias
635
+ vision_tower.encoder.layer.3.attention.v_proj.weight
636
+ vision_tower.encoder.layer.3.lambda_1
637
+ vision_tower.encoder.layer.3.lambda_2
638
+ vision_tower.encoder.layer.3.layernorm_after.bias
639
+ vision_tower.encoder.layer.3.layernorm_after.weight
640
+ vision_tower.encoder.layer.3.layernorm_before.bias
641
+ vision_tower.encoder.layer.3.layernorm_before.weight
642
+ vision_tower.encoder.layer.3.mlp.fc1.bias
643
+ vision_tower.encoder.layer.3.mlp.fc1.weight
644
+ vision_tower.encoder.layer.3.mlp.fc2.bias
645
+ vision_tower.encoder.layer.3.mlp.fc2.weight
646
+ vision_tower.encoder.layer.4.attention.k_proj.bias
647
+ vision_tower.encoder.layer.4.attention.k_proj.weight
648
+ vision_tower.encoder.layer.4.attention.projection_layer.bias
649
+ vision_tower.encoder.layer.4.attention.projection_layer.weight
650
+ vision_tower.encoder.layer.4.attention.q_proj.bias
651
+ vision_tower.encoder.layer.4.attention.q_proj.weight
652
+ vision_tower.encoder.layer.4.attention.v_proj.bias
653
+ vision_tower.encoder.layer.4.attention.v_proj.weight
654
+ vision_tower.encoder.layer.4.lambda_1
655
+ vision_tower.encoder.layer.4.lambda_2
656
+ vision_tower.encoder.layer.4.layernorm_after.bias
657
+ vision_tower.encoder.layer.4.layernorm_after.weight
658
+ vision_tower.encoder.layer.4.layernorm_before.bias
659
+ vision_tower.encoder.layer.4.layernorm_before.weight
660
+ vision_tower.encoder.layer.4.mlp.fc1.bias
661
+ vision_tower.encoder.layer.4.mlp.fc1.weight
662
+ vision_tower.encoder.layer.4.mlp.fc2.bias
663
+ vision_tower.encoder.layer.4.mlp.fc2.weight
664
+ vision_tower.encoder.layer.5.attention.k_proj.bias
665
+ vision_tower.encoder.layer.5.attention.k_proj.weight
666
+ vision_tower.encoder.layer.5.attention.projection_layer.bias
667
+ vision_tower.encoder.layer.5.attention.projection_layer.weight
668
+ vision_tower.encoder.layer.5.attention.q_proj.bias
669
+ vision_tower.encoder.layer.5.attention.q_proj.weight
670
+ vision_tower.encoder.layer.5.attention.v_proj.bias
671
+ vision_tower.encoder.layer.5.attention.v_proj.weight
672
+ vision_tower.encoder.layer.5.lambda_1
673
+ vision_tower.encoder.layer.5.lambda_2
674
+ vision_tower.encoder.layer.5.layernorm_after.bias
675
+ vision_tower.encoder.layer.5.layernorm_after.weight
676
+ vision_tower.encoder.layer.5.layernorm_before.bias
677
+ vision_tower.encoder.layer.5.layernorm_before.weight
678
+ vision_tower.encoder.layer.5.mlp.fc1.bias
679
+ vision_tower.encoder.layer.5.mlp.fc1.weight
680
+ vision_tower.encoder.layer.5.mlp.fc2.bias
681
+ vision_tower.encoder.layer.5.mlp.fc2.weight
682
+ vision_tower.encoder.layer.6.attention.k_proj.bias
683
+ vision_tower.encoder.layer.6.attention.k_proj.weight
684
+ vision_tower.encoder.layer.6.attention.projection_layer.bias
685
+ vision_tower.encoder.layer.6.attention.projection_layer.weight
686
+ vision_tower.encoder.layer.6.attention.q_proj.bias
687
+ vision_tower.encoder.layer.6.attention.q_proj.weight
688
+ vision_tower.encoder.layer.6.attention.v_proj.bias
689
+ vision_tower.encoder.layer.6.attention.v_proj.weight
690
+ vision_tower.encoder.layer.6.lambda_1
691
+ vision_tower.encoder.layer.6.lambda_2
692
+ vision_tower.encoder.layer.6.layernorm_after.bias
693
+ vision_tower.encoder.layer.6.layernorm_after.weight
694
+ vision_tower.encoder.layer.6.layernorm_before.bias
695
+ vision_tower.encoder.layer.6.layernorm_before.weight
696
+ vision_tower.encoder.layer.6.mlp.fc1.bias
697
+ vision_tower.encoder.layer.6.mlp.fc1.weight
698
+ vision_tower.encoder.layer.6.mlp.fc2.bias
699
+ vision_tower.encoder.layer.6.mlp.fc2.weight
700
+ vision_tower.encoder.layer.7.attention.k_proj.bias
701
+ vision_tower.encoder.layer.7.attention.k_proj.weight
702
+ vision_tower.encoder.layer.7.attention.projection_layer.bias
703
+ vision_tower.encoder.layer.7.attention.projection_layer.weight
704
+ vision_tower.encoder.layer.7.attention.q_proj.bias
705
+ vision_tower.encoder.layer.7.attention.q_proj.weight
706
+ vision_tower.encoder.layer.7.attention.v_proj.bias
707
+ vision_tower.encoder.layer.7.attention.v_proj.weight
708
+ vision_tower.encoder.layer.7.lambda_1
709
+ vision_tower.encoder.layer.7.lambda_2
710
+ vision_tower.encoder.layer.7.layernorm_after.bias
711
+ vision_tower.encoder.layer.7.layernorm_after.weight
712
+ vision_tower.encoder.layer.7.layernorm_before.bias
713
+ vision_tower.encoder.layer.7.layernorm_before.weight
714
+ vision_tower.encoder.layer.7.mlp.fc1.bias
715
+ vision_tower.encoder.layer.7.mlp.fc1.weight
716
+ vision_tower.encoder.layer.7.mlp.fc2.bias
717
+ vision_tower.encoder.layer.7.mlp.fc2.weight
718
+ vision_tower.encoder.layer.8.attention.k_proj.bias
719
+ vision_tower.encoder.layer.8.attention.k_proj.weight
720
+ vision_tower.encoder.layer.8.attention.projection_layer.bias
721
+ vision_tower.encoder.layer.8.attention.projection_layer.weight
722
+ vision_tower.encoder.layer.8.attention.q_proj.bias
723
+ vision_tower.encoder.layer.8.attention.q_proj.weight
724
+ vision_tower.encoder.layer.8.attention.v_proj.bias
725
+ vision_tower.encoder.layer.8.attention.v_proj.weight
726
+ vision_tower.encoder.layer.8.lambda_1
727
+ vision_tower.encoder.layer.8.lambda_2
728
+ vision_tower.encoder.layer.8.layernorm_after.bias
729
+ vision_tower.encoder.layer.8.layernorm_after.weight
730
+ vision_tower.encoder.layer.8.layernorm_before.bias
731
+ vision_tower.encoder.layer.8.layernorm_before.weight
732
+ vision_tower.encoder.layer.8.mlp.fc1.bias
733
+ vision_tower.encoder.layer.8.mlp.fc1.weight
734
+ vision_tower.encoder.layer.8.mlp.fc2.bias
735
+ vision_tower.encoder.layer.8.mlp.fc2.weight
736
+ vision_tower.encoder.layer.9.attention.k_proj.bias
737
+ vision_tower.encoder.layer.9.attention.k_proj.weight
738
+ vision_tower.encoder.layer.9.attention.projection_layer.bias
739
+ vision_tower.encoder.layer.9.attention.projection_layer.weight
740
+ vision_tower.encoder.layer.9.attention.q_proj.bias
741
+ vision_tower.encoder.layer.9.attention.q_proj.weight
742
+ vision_tower.encoder.layer.9.attention.v_proj.bias
743
+ vision_tower.encoder.layer.9.attention.v_proj.weight
744
+ vision_tower.encoder.layer.9.lambda_1
745
+ vision_tower.encoder.layer.9.lambda_2
746
+ vision_tower.encoder.layer.9.layernorm_after.bias
747
+ vision_tower.encoder.layer.9.layernorm_after.weight
748
+ vision_tower.encoder.layer.9.layernorm_before.bias
749
+ vision_tower.encoder.layer.9.layernorm_before.weight
750
+ vision_tower.encoder.layer.9.mlp.fc1.bias
751
+ vision_tower.encoder.layer.9.mlp.fc1.weight
752
+ vision_tower.encoder.layer.9.mlp.fc2.bias
753
+ vision_tower.encoder.layer.9.mlp.fc2.weight
output_dir_lr1e-6/output_keys.txt ADDED
@@ -0,0 +1,753 @@
1
+ lm_head.weight
2
+ model.language_model.embed_tokens.weight
3
+ model.language_model.layers.0.input_layernorm.weight
4
+ model.language_model.layers.0.mlp.down_proj.weight
5
+ model.language_model.layers.0.mlp.gate_proj.weight
6
+ model.language_model.layers.0.mlp.up_proj.weight
7
+ model.language_model.layers.0.post_attention_layernorm.weight
8
+ model.language_model.layers.0.self_attn.k_norm.weight
9
+ model.language_model.layers.0.self_attn.k_proj.weight
10
+ model.language_model.layers.0.self_attn.o_proj.weight
11
+ model.language_model.layers.0.self_attn.q_norm.weight
12
+ model.language_model.layers.0.self_attn.q_proj.weight
13
+ model.language_model.layers.0.self_attn.v_proj.weight
14
+ model.language_model.layers.1.input_layernorm.weight
15
+ model.language_model.layers.1.mlp.down_proj.weight
16
+ model.language_model.layers.1.mlp.gate_proj.weight
17
+ model.language_model.layers.1.mlp.up_proj.weight
18
+ model.language_model.layers.1.post_attention_layernorm.weight
19
+ model.language_model.layers.1.self_attn.k_norm.weight
20
+ model.language_model.layers.1.self_attn.k_proj.weight
21
+ model.language_model.layers.1.self_attn.o_proj.weight
22
+ model.language_model.layers.1.self_attn.q_norm.weight
23
+ model.language_model.layers.1.self_attn.q_proj.weight
24
+ model.language_model.layers.1.self_attn.v_proj.weight
25
+ model.language_model.layers.10.input_layernorm.weight
26
+ model.language_model.layers.10.mlp.down_proj.weight
27
+ model.language_model.layers.10.mlp.gate_proj.weight
28
+ model.language_model.layers.10.mlp.up_proj.weight
29
+ model.language_model.layers.10.post_attention_layernorm.weight
30
+ model.language_model.layers.10.self_attn.k_norm.weight
31
+ model.language_model.layers.10.self_attn.k_proj.weight
32
+ model.language_model.layers.10.self_attn.o_proj.weight
33
+ model.language_model.layers.10.self_attn.q_norm.weight
34
+ model.language_model.layers.10.self_attn.q_proj.weight
35
+ model.language_model.layers.10.self_attn.v_proj.weight
36
+ model.language_model.layers.11.input_layernorm.weight
37
+ model.language_model.layers.11.mlp.down_proj.weight
38
+ model.language_model.layers.11.mlp.gate_proj.weight
39
+ model.language_model.layers.11.mlp.up_proj.weight
40
+ model.language_model.layers.11.post_attention_layernorm.weight
41
+ model.language_model.layers.11.self_attn.k_norm.weight
42
+ model.language_model.layers.11.self_attn.k_proj.weight
43
+ model.language_model.layers.11.self_attn.o_proj.weight
44
+ model.language_model.layers.11.self_attn.q_norm.weight
45
+ model.language_model.layers.11.self_attn.q_proj.weight
46
+ model.language_model.layers.11.self_attn.v_proj.weight
47
+ model.language_model.layers.12.input_layernorm.weight
48
+ model.language_model.layers.12.mlp.down_proj.weight
49
+ model.language_model.layers.12.mlp.gate_proj.weight
50
+ model.language_model.layers.12.mlp.up_proj.weight
51
+ model.language_model.layers.12.post_attention_layernorm.weight
52
+ model.language_model.layers.12.self_attn.k_norm.weight
53
+ model.language_model.layers.12.self_attn.k_proj.weight
54
+ model.language_model.layers.12.self_attn.o_proj.weight
55
+ model.language_model.layers.12.self_attn.q_norm.weight
56
+ model.language_model.layers.12.self_attn.q_proj.weight
57
+ model.language_model.layers.12.self_attn.v_proj.weight
58
+ model.language_model.layers.13.input_layernorm.weight
59
+ model.language_model.layers.13.mlp.down_proj.weight
60
+ model.language_model.layers.13.mlp.gate_proj.weight
61
+ model.language_model.layers.13.mlp.up_proj.weight
62
+ model.language_model.layers.13.post_attention_layernorm.weight
63
+ model.language_model.layers.13.self_attn.k_norm.weight
64
+ model.language_model.layers.13.self_attn.k_proj.weight
65
+ model.language_model.layers.13.self_attn.o_proj.weight
66
+ model.language_model.layers.13.self_attn.q_norm.weight
67
+ model.language_model.layers.13.self_attn.q_proj.weight
68
+ model.language_model.layers.13.self_attn.v_proj.weight
69
+ model.language_model.layers.14.input_layernorm.weight
70
+ model.language_model.layers.14.mlp.down_proj.weight
71
+ model.language_model.layers.14.mlp.gate_proj.weight
72
+ model.language_model.layers.14.mlp.up_proj.weight
73
+ model.language_model.layers.14.post_attention_layernorm.weight
74
+ model.language_model.layers.14.self_attn.k_norm.weight
75
+ model.language_model.layers.14.self_attn.k_proj.weight
76
+ model.language_model.layers.14.self_attn.o_proj.weight
77
+ model.language_model.layers.14.self_attn.q_norm.weight
78
+ model.language_model.layers.14.self_attn.q_proj.weight
79
+ model.language_model.layers.14.self_attn.v_proj.weight
80
+ model.language_model.layers.15.input_layernorm.weight
81
+ model.language_model.layers.15.mlp.down_proj.weight
82
+ model.language_model.layers.15.mlp.gate_proj.weight
83
+ model.language_model.layers.15.mlp.up_proj.weight
84
+ model.language_model.layers.15.post_attention_layernorm.weight
85
+ model.language_model.layers.15.self_attn.k_norm.weight
86
+ model.language_model.layers.15.self_attn.k_proj.weight
87
+ model.language_model.layers.15.self_attn.o_proj.weight
88
+ model.language_model.layers.15.self_attn.q_norm.weight
89
+ model.language_model.layers.15.self_attn.q_proj.weight
90
+ model.language_model.layers.15.self_attn.v_proj.weight
91
+ model.language_model.layers.16.input_layernorm.weight
92
+ model.language_model.layers.16.mlp.down_proj.weight
93
+ model.language_model.layers.16.mlp.gate_proj.weight
94
+ model.language_model.layers.16.mlp.up_proj.weight
95
+ model.language_model.layers.16.post_attention_layernorm.weight
96
+ model.language_model.layers.16.self_attn.k_norm.weight
97
+ model.language_model.layers.16.self_attn.k_proj.weight
98
+ model.language_model.layers.16.self_attn.o_proj.weight
99
+ model.language_model.layers.16.self_attn.q_norm.weight
100
+ model.language_model.layers.16.self_attn.q_proj.weight
101
+ model.language_model.layers.16.self_attn.v_proj.weight
102
+ model.language_model.layers.17.input_layernorm.weight
103
+ model.language_model.layers.17.mlp.down_proj.weight
104
+ model.language_model.layers.17.mlp.gate_proj.weight
105
+ model.language_model.layers.17.mlp.up_proj.weight
106
+ model.language_model.layers.17.post_attention_layernorm.weight
107
+ model.language_model.layers.17.self_attn.k_norm.weight
108
+ model.language_model.layers.17.self_attn.k_proj.weight
109
+ model.language_model.layers.17.self_attn.o_proj.weight
110
+ model.language_model.layers.17.self_attn.q_norm.weight
111
+ model.language_model.layers.17.self_attn.q_proj.weight
112
+ model.language_model.layers.17.self_attn.v_proj.weight
113
+ model.language_model.layers.18.input_layernorm.weight
114
+ model.language_model.layers.18.mlp.down_proj.weight
115
+ model.language_model.layers.18.mlp.gate_proj.weight
116
+ model.language_model.layers.18.mlp.up_proj.weight
117
+ model.language_model.layers.18.post_attention_layernorm.weight
118
+ model.language_model.layers.18.self_attn.k_norm.weight
119
+ model.language_model.layers.18.self_attn.k_proj.weight
120
+ model.language_model.layers.18.self_attn.o_proj.weight
121
+ model.language_model.layers.18.self_attn.q_norm.weight
122
+ model.language_model.layers.18.self_attn.q_proj.weight
123
+ model.language_model.layers.18.self_attn.v_proj.weight
124
+ model.language_model.layers.19.input_layernorm.weight
125
+ model.language_model.layers.19.mlp.down_proj.weight
126
+ model.language_model.layers.19.mlp.gate_proj.weight
127
+ model.language_model.layers.19.mlp.up_proj.weight
128
+ model.language_model.layers.19.post_attention_layernorm.weight
129
+ model.language_model.layers.19.self_attn.k_norm.weight
130
+ model.language_model.layers.19.self_attn.k_proj.weight
131
+ model.language_model.layers.19.self_attn.o_proj.weight
132
+ model.language_model.layers.19.self_attn.q_norm.weight
133
+ model.language_model.layers.19.self_attn.q_proj.weight
134
+ model.language_model.layers.19.self_attn.v_proj.weight
135
+ model.language_model.layers.2.input_layernorm.weight
136
+ model.language_model.layers.2.mlp.down_proj.weight
137
+ model.language_model.layers.2.mlp.gate_proj.weight
138
+ model.language_model.layers.2.mlp.up_proj.weight
139
+ model.language_model.layers.2.post_attention_layernorm.weight
140
+ model.language_model.layers.2.self_attn.k_norm.weight
141
+ model.language_model.layers.2.self_attn.k_proj.weight
142
+ model.language_model.layers.2.self_attn.o_proj.weight
143
+ model.language_model.layers.2.self_attn.q_norm.weight
144
+ model.language_model.layers.2.self_attn.q_proj.weight
145
+ model.language_model.layers.2.self_attn.v_proj.weight
146
+ model.language_model.layers.20.input_layernorm.weight
147
+ model.language_model.layers.20.mlp.down_proj.weight
148
+ model.language_model.layers.20.mlp.gate_proj.weight
149
+ model.language_model.layers.20.mlp.up_proj.weight
150
+ model.language_model.layers.20.post_attention_layernorm.weight
151
+ model.language_model.layers.20.self_attn.k_norm.weight
152
+ model.language_model.layers.20.self_attn.k_proj.weight
153
+ model.language_model.layers.20.self_attn.o_proj.weight
154
+ model.language_model.layers.20.self_attn.q_norm.weight
155
+ model.language_model.layers.20.self_attn.q_proj.weight
156
+ model.language_model.layers.20.self_attn.v_proj.weight
157
+ model.language_model.layers.21.input_layernorm.weight
158
+ model.language_model.layers.21.mlp.down_proj.weight
159
+ model.language_model.layers.21.mlp.gate_proj.weight
160
+ model.language_model.layers.21.mlp.up_proj.weight
161
+ model.language_model.layers.21.post_attention_layernorm.weight
162
+ model.language_model.layers.21.self_attn.k_norm.weight
163
+ model.language_model.layers.21.self_attn.k_proj.weight
164
+ model.language_model.layers.21.self_attn.o_proj.weight
165
+ model.language_model.layers.21.self_attn.q_norm.weight
166
+ model.language_model.layers.21.self_attn.q_proj.weight
167
+ model.language_model.layers.21.self_attn.v_proj.weight
168
+ model.language_model.layers.22.input_layernorm.weight
169
+ model.language_model.layers.22.mlp.down_proj.weight
170
+ model.language_model.layers.22.mlp.gate_proj.weight
171
+ model.language_model.layers.22.mlp.up_proj.weight
172
+ model.language_model.layers.22.post_attention_layernorm.weight
173
+ model.language_model.layers.22.self_attn.k_norm.weight
174
+ model.language_model.layers.22.self_attn.k_proj.weight
175
+ model.language_model.layers.22.self_attn.o_proj.weight
176
+ model.language_model.layers.22.self_attn.q_norm.weight
177
+ model.language_model.layers.22.self_attn.q_proj.weight
178
+ model.language_model.layers.22.self_attn.v_proj.weight
179
+ model.language_model.layers.23.input_layernorm.weight
180
+ model.language_model.layers.23.mlp.down_proj.weight
181
+ model.language_model.layers.23.mlp.gate_proj.weight
182
+ model.language_model.layers.23.mlp.up_proj.weight
183
+ model.language_model.layers.23.post_attention_layernorm.weight
184
+ model.language_model.layers.23.self_attn.k_norm.weight
185
+ model.language_model.layers.23.self_attn.k_proj.weight
186
+ model.language_model.layers.23.self_attn.o_proj.weight
187
+ model.language_model.layers.23.self_attn.q_norm.weight
188
+ model.language_model.layers.23.self_attn.q_proj.weight
189
+ model.language_model.layers.23.self_attn.v_proj.weight
190
+ model.language_model.layers.24.input_layernorm.weight
191
+ model.language_model.layers.24.mlp.down_proj.weight
192
+ model.language_model.layers.24.mlp.gate_proj.weight
193
+ model.language_model.layers.24.mlp.up_proj.weight
194
+ model.language_model.layers.24.post_attention_layernorm.weight
195
+ model.language_model.layers.24.self_attn.k_norm.weight
196
+ model.language_model.layers.24.self_attn.k_proj.weight
197
+ model.language_model.layers.24.self_attn.o_proj.weight
198
+ model.language_model.layers.24.self_attn.q_norm.weight
199
+ model.language_model.layers.24.self_attn.q_proj.weight
200
+ model.language_model.layers.24.self_attn.v_proj.weight
201
+ model.language_model.layers.25.input_layernorm.weight
202
+ model.language_model.layers.25.mlp.down_proj.weight
203
+ model.language_model.layers.25.mlp.gate_proj.weight
204
+ model.language_model.layers.25.mlp.up_proj.weight
205
+ model.language_model.layers.25.post_attention_layernorm.weight
206
+ model.language_model.layers.25.self_attn.k_norm.weight
207
+ model.language_model.layers.25.self_attn.k_proj.weight
208
+ model.language_model.layers.25.self_attn.o_proj.weight
209
+ model.language_model.layers.25.self_attn.q_norm.weight
210
+ model.language_model.layers.25.self_attn.q_proj.weight
211
+ model.language_model.layers.25.self_attn.v_proj.weight
212
+ model.language_model.layers.26.input_layernorm.weight
213
+ model.language_model.layers.26.mlp.down_proj.weight
214
+ model.language_model.layers.26.mlp.gate_proj.weight
215
+ model.language_model.layers.26.mlp.up_proj.weight
216
+ model.language_model.layers.26.post_attention_layernorm.weight
217
+ model.language_model.layers.26.self_attn.k_norm.weight
218
+ model.language_model.layers.26.self_attn.k_proj.weight
219
+ model.language_model.layers.26.self_attn.o_proj.weight
220
+ model.language_model.layers.26.self_attn.q_norm.weight
221
+ model.language_model.layers.26.self_attn.q_proj.weight
222
+ model.language_model.layers.26.self_attn.v_proj.weight
223
+ model.language_model.layers.27.input_layernorm.weight
224
+ model.language_model.layers.27.mlp.down_proj.weight
225
+ model.language_model.layers.27.mlp.gate_proj.weight
226
+ model.language_model.layers.27.mlp.up_proj.weight
227
+ model.language_model.layers.27.post_attention_layernorm.weight
228
+ model.language_model.layers.27.self_attn.k_norm.weight
229
+ model.language_model.layers.27.self_attn.k_proj.weight
230
+ model.language_model.layers.27.self_attn.o_proj.weight
231
+ model.language_model.layers.27.self_attn.q_norm.weight
232
+ model.language_model.layers.27.self_attn.q_proj.weight
233
+ model.language_model.layers.27.self_attn.v_proj.weight
234
+ model.language_model.layers.3.input_layernorm.weight
235
+ model.language_model.layers.3.mlp.down_proj.weight
236
+ model.language_model.layers.3.mlp.gate_proj.weight
237
+ model.language_model.layers.3.mlp.up_proj.weight
238
+ model.language_model.layers.3.post_attention_layernorm.weight
239
+ model.language_model.layers.3.self_attn.k_norm.weight
240
+ model.language_model.layers.3.self_attn.k_proj.weight
241
+ model.language_model.layers.3.self_attn.o_proj.weight
242
+ model.language_model.layers.3.self_attn.q_norm.weight
243
+ model.language_model.layers.3.self_attn.q_proj.weight
244
+ model.language_model.layers.3.self_attn.v_proj.weight
245
+ model.language_model.layers.4.input_layernorm.weight
246
+ model.language_model.layers.4.mlp.down_proj.weight
247
+ model.language_model.layers.4.mlp.gate_proj.weight
248
+ model.language_model.layers.4.mlp.up_proj.weight
249
+ model.language_model.layers.4.post_attention_layernorm.weight
250
+ model.language_model.layers.4.self_attn.k_norm.weight
251
+ model.language_model.layers.4.self_attn.k_proj.weight
252
+ model.language_model.layers.4.self_attn.o_proj.weight
253
+ model.language_model.layers.4.self_attn.q_norm.weight
254
+ model.language_model.layers.4.self_attn.q_proj.weight
255
+ model.language_model.layers.4.self_attn.v_proj.weight
256
+ model.language_model.layers.5.input_layernorm.weight
257
+ model.language_model.layers.5.mlp.down_proj.weight
258
+ model.language_model.layers.5.mlp.gate_proj.weight
259
+ model.language_model.layers.5.mlp.up_proj.weight
260
+ model.language_model.layers.5.post_attention_layernorm.weight
261
+ model.language_model.layers.5.self_attn.k_norm.weight
262
+ model.language_model.layers.5.self_attn.k_proj.weight
263
+ model.language_model.layers.5.self_attn.o_proj.weight
264
+ model.language_model.layers.5.self_attn.q_norm.weight
265
+ model.language_model.layers.5.self_attn.q_proj.weight
266
+ model.language_model.layers.5.self_attn.v_proj.weight
267
+ model.language_model.layers.6.input_layernorm.weight
268
+ model.language_model.layers.6.mlp.down_proj.weight
269
+ model.language_model.layers.6.mlp.gate_proj.weight
270
+ model.language_model.layers.6.mlp.up_proj.weight
271
+ model.language_model.layers.6.post_attention_layernorm.weight
272
+ model.language_model.layers.6.self_attn.k_norm.weight
273
+ model.language_model.layers.6.self_attn.k_proj.weight
274
+ model.language_model.layers.6.self_attn.o_proj.weight
275
+ model.language_model.layers.6.self_attn.q_norm.weight
276
+ model.language_model.layers.6.self_attn.q_proj.weight
277
+ model.language_model.layers.6.self_attn.v_proj.weight
278
+ model.language_model.layers.7.input_layernorm.weight
279
+ model.language_model.layers.7.mlp.down_proj.weight
280
+ model.language_model.layers.7.mlp.gate_proj.weight
281
+ model.language_model.layers.7.mlp.up_proj.weight
282
+ model.language_model.layers.7.post_attention_layernorm.weight
283
+ model.language_model.layers.7.self_attn.k_norm.weight
284
+ model.language_model.layers.7.self_attn.k_proj.weight
285
+ model.language_model.layers.7.self_attn.o_proj.weight
286
+ model.language_model.layers.7.self_attn.q_norm.weight
287
+ model.language_model.layers.7.self_attn.q_proj.weight
288
+ model.language_model.layers.7.self_attn.v_proj.weight
289
+ model.language_model.layers.8.input_layernorm.weight
290
+ model.language_model.layers.8.mlp.down_proj.weight
291
+ model.language_model.layers.8.mlp.gate_proj.weight
292
+ model.language_model.layers.8.mlp.up_proj.weight
293
+ model.language_model.layers.8.post_attention_layernorm.weight
294
+ model.language_model.layers.8.self_attn.k_norm.weight
295
+ model.language_model.layers.8.self_attn.k_proj.weight
296
+ model.language_model.layers.8.self_attn.o_proj.weight
297
+ model.language_model.layers.8.self_attn.q_norm.weight
298
+ model.language_model.layers.8.self_attn.q_proj.weight
299
+ model.language_model.layers.8.self_attn.v_proj.weight
300
+ model.language_model.layers.9.input_layernorm.weight
301
+ model.language_model.layers.9.mlp.down_proj.weight
302
+ model.language_model.layers.9.mlp.gate_proj.weight
303
+ model.language_model.layers.9.mlp.up_proj.weight
304
+ model.language_model.layers.9.post_attention_layernorm.weight
305
+ model.language_model.layers.9.self_attn.k_norm.weight
306
+ model.language_model.layers.9.self_attn.k_proj.weight
307
+ model.language_model.layers.9.self_attn.o_proj.weight
308
+ model.language_model.layers.9.self_attn.q_norm.weight
309
+ model.language_model.layers.9.self_attn.q_proj.weight
310
+ model.language_model.layers.9.self_attn.v_proj.weight
311
+ model.language_model.norm.weight
312
+ model.multi_modal_projector.layer_norm.bias
313
+ model.multi_modal_projector.layer_norm.weight
314
+ model.multi_modal_projector.linear_1.bias
315
+ model.multi_modal_projector.linear_1.weight
316
+ model.multi_modal_projector.linear_2.bias
317
+ model.multi_modal_projector.linear_2.weight
318
+ model.vision_tower.embeddings.cls_token
319
+ model.vision_tower.embeddings.patch_embeddings.projection.bias
320
+ model.vision_tower.embeddings.patch_embeddings.projection.weight
321
+ model.vision_tower.embeddings.position_embeddings
322
+ model.vision_tower.encoder.layer.0.attention.k_proj.bias
323
+ model.vision_tower.encoder.layer.0.attention.k_proj.weight
324
+ model.vision_tower.encoder.layer.0.attention.projection_layer.bias
325
+ model.vision_tower.encoder.layer.0.attention.projection_layer.weight
326
+ model.vision_tower.encoder.layer.0.attention.q_proj.bias
327
+ model.vision_tower.encoder.layer.0.attention.q_proj.weight
328
+ model.vision_tower.encoder.layer.0.attention.v_proj.bias
329
+ model.vision_tower.encoder.layer.0.attention.v_proj.weight
330
+ model.vision_tower.encoder.layer.0.lambda_1
331
+ model.vision_tower.encoder.layer.0.lambda_2
332
+ model.vision_tower.encoder.layer.0.layernorm_after.bias
333
+ model.vision_tower.encoder.layer.0.layernorm_after.weight
334
+ model.vision_tower.encoder.layer.0.layernorm_before.bias
335
+ model.vision_tower.encoder.layer.0.layernorm_before.weight
336
+ model.vision_tower.encoder.layer.0.mlp.fc1.bias
337
+ model.vision_tower.encoder.layer.0.mlp.fc1.weight
338
+ model.vision_tower.encoder.layer.0.mlp.fc2.bias
339
+ model.vision_tower.encoder.layer.0.mlp.fc2.weight
340
+ model.vision_tower.encoder.layer.1.attention.k_proj.bias
341
+ model.vision_tower.encoder.layer.1.attention.k_proj.weight
342
+ model.vision_tower.encoder.layer.1.attention.projection_layer.bias
343
+ model.vision_tower.encoder.layer.1.attention.projection_layer.weight
344
+ model.vision_tower.encoder.layer.1.attention.q_proj.bias
345
+ model.vision_tower.encoder.layer.1.attention.q_proj.weight
346
+ model.vision_tower.encoder.layer.1.attention.v_proj.bias
347
+ model.vision_tower.encoder.layer.1.attention.v_proj.weight
348
+ model.vision_tower.encoder.layer.1.lambda_1
349
+ model.vision_tower.encoder.layer.1.lambda_2
350
+ model.vision_tower.encoder.layer.1.layernorm_after.bias
351
+ model.vision_tower.encoder.layer.1.layernorm_after.weight
352
+ model.vision_tower.encoder.layer.1.layernorm_before.bias
353
+ model.vision_tower.encoder.layer.1.layernorm_before.weight
354
+ model.vision_tower.encoder.layer.1.mlp.fc1.bias
355
+ model.vision_tower.encoder.layer.1.mlp.fc1.weight
356
+ model.vision_tower.encoder.layer.1.mlp.fc2.bias
357
+ model.vision_tower.encoder.layer.1.mlp.fc2.weight
358
+ model.vision_tower.encoder.layer.10.attention.k_proj.bias
359
+ model.vision_tower.encoder.layer.10.attention.k_proj.weight
360
+ model.vision_tower.encoder.layer.10.attention.projection_layer.bias
361
+ model.vision_tower.encoder.layer.10.attention.projection_layer.weight
362
+ model.vision_tower.encoder.layer.10.attention.q_proj.bias
363
+ model.vision_tower.encoder.layer.10.attention.q_proj.weight
364
+ model.vision_tower.encoder.layer.10.attention.v_proj.bias
365
+ model.vision_tower.encoder.layer.10.attention.v_proj.weight
366
+ model.vision_tower.encoder.layer.10.lambda_1
367
+ model.vision_tower.encoder.layer.10.lambda_2
368
+ model.vision_tower.encoder.layer.10.layernorm_after.bias
369
+ model.vision_tower.encoder.layer.10.layernorm_after.weight
370
+ model.vision_tower.encoder.layer.10.layernorm_before.bias
371
+ model.vision_tower.encoder.layer.10.layernorm_before.weight
372
+ model.vision_tower.encoder.layer.10.mlp.fc1.bias
373
+ model.vision_tower.encoder.layer.10.mlp.fc1.weight
374
+ model.vision_tower.encoder.layer.10.mlp.fc2.bias
375
+ model.vision_tower.encoder.layer.10.mlp.fc2.weight
376
+ model.vision_tower.encoder.layer.11.attention.k_proj.bias
377
+ model.vision_tower.encoder.layer.11.attention.k_proj.weight
378
+ model.vision_tower.encoder.layer.11.attention.projection_layer.bias
379
+ model.vision_tower.encoder.layer.11.attention.projection_layer.weight
380
+ model.vision_tower.encoder.layer.11.attention.q_proj.bias
381
+ model.vision_tower.encoder.layer.11.attention.q_proj.weight
382
+ model.vision_tower.encoder.layer.11.attention.v_proj.bias
383
+ model.vision_tower.encoder.layer.11.attention.v_proj.weight
384
+ model.vision_tower.encoder.layer.11.lambda_1
385
+ model.vision_tower.encoder.layer.11.lambda_2
386
+ model.vision_tower.encoder.layer.11.layernorm_after.bias
387
+ model.vision_tower.encoder.layer.11.layernorm_after.weight
388
+ model.vision_tower.encoder.layer.11.layernorm_before.bias
389
+ model.vision_tower.encoder.layer.11.layernorm_before.weight
390
+ model.vision_tower.encoder.layer.11.mlp.fc1.bias
391
+ model.vision_tower.encoder.layer.11.mlp.fc1.weight
392
+ model.vision_tower.encoder.layer.11.mlp.fc2.bias
393
+ model.vision_tower.encoder.layer.11.mlp.fc2.weight
394
+ model.vision_tower.encoder.layer.12.attention.k_proj.bias
395
+ model.vision_tower.encoder.layer.12.attention.k_proj.weight
396
+ model.vision_tower.encoder.layer.12.attention.projection_layer.bias
397
+ model.vision_tower.encoder.layer.12.attention.projection_layer.weight
398
+ model.vision_tower.encoder.layer.12.attention.q_proj.bias
399
+ model.vision_tower.encoder.layer.12.attention.q_proj.weight
400
+ model.vision_tower.encoder.layer.12.attention.v_proj.bias
401
+ model.vision_tower.encoder.layer.12.attention.v_proj.weight
402
+ model.vision_tower.encoder.layer.12.lambda_1
403
+ model.vision_tower.encoder.layer.12.lambda_2
404
+ model.vision_tower.encoder.layer.12.layernorm_after.bias
405
+ model.vision_tower.encoder.layer.12.layernorm_after.weight
406
+ model.vision_tower.encoder.layer.12.layernorm_before.bias
407
+ model.vision_tower.encoder.layer.12.layernorm_before.weight
408
+ model.vision_tower.encoder.layer.12.mlp.fc1.bias
409
+ model.vision_tower.encoder.layer.12.mlp.fc1.weight
410
+ model.vision_tower.encoder.layer.12.mlp.fc2.bias
411
+ model.vision_tower.encoder.layer.12.mlp.fc2.weight
412
+ model.vision_tower.encoder.layer.13.attention.k_proj.bias
413
+ model.vision_tower.encoder.layer.13.attention.k_proj.weight
414
+ model.vision_tower.encoder.layer.13.attention.projection_layer.bias
415
+ model.vision_tower.encoder.layer.13.attention.projection_layer.weight
416
+ model.vision_tower.encoder.layer.13.attention.q_proj.bias
417
+ model.vision_tower.encoder.layer.13.attention.q_proj.weight
418
+ model.vision_tower.encoder.layer.13.attention.v_proj.bias
419
+ model.vision_tower.encoder.layer.13.attention.v_proj.weight
420
+ model.vision_tower.encoder.layer.13.lambda_1
421
+ model.vision_tower.encoder.layer.13.lambda_2
422
+ model.vision_tower.encoder.layer.13.layernorm_after.bias
423
+ model.vision_tower.encoder.layer.13.layernorm_after.weight
424
+ model.vision_tower.encoder.layer.13.layernorm_before.bias
425
+ model.vision_tower.encoder.layer.13.layernorm_before.weight
426
+ model.vision_tower.encoder.layer.13.mlp.fc1.bias
427
+ model.vision_tower.encoder.layer.13.mlp.fc1.weight
428
+ model.vision_tower.encoder.layer.13.mlp.fc2.bias
429
+ model.vision_tower.encoder.layer.13.mlp.fc2.weight
430
+ model.vision_tower.encoder.layer.14.attention.k_proj.bias
431
+ model.vision_tower.encoder.layer.14.attention.k_proj.weight
432
+ model.vision_tower.encoder.layer.14.attention.projection_layer.bias
433
+ model.vision_tower.encoder.layer.14.attention.projection_layer.weight
434
+ model.vision_tower.encoder.layer.14.attention.q_proj.bias
435
+ model.vision_tower.encoder.layer.14.attention.q_proj.weight
436
+ model.vision_tower.encoder.layer.14.attention.v_proj.bias
437
+ model.vision_tower.encoder.layer.14.attention.v_proj.weight
438
+ model.vision_tower.encoder.layer.14.lambda_1
439
+ model.vision_tower.encoder.layer.14.lambda_2
440
+ model.vision_tower.encoder.layer.14.layernorm_after.bias
441
+ model.vision_tower.encoder.layer.14.layernorm_after.weight
442
+ model.vision_tower.encoder.layer.14.layernorm_before.bias
443
+ model.vision_tower.encoder.layer.14.layernorm_before.weight
444
+ model.vision_tower.encoder.layer.14.mlp.fc1.bias
445
+ model.vision_tower.encoder.layer.14.mlp.fc1.weight
446
+ model.vision_tower.encoder.layer.14.mlp.fc2.bias
447
+ model.vision_tower.encoder.layer.14.mlp.fc2.weight
448
+ model.vision_tower.encoder.layer.15.attention.k_proj.bias
449
+ model.vision_tower.encoder.layer.15.attention.k_proj.weight
450
+ model.vision_tower.encoder.layer.15.attention.projection_layer.bias
451
+ model.vision_tower.encoder.layer.15.attention.projection_layer.weight
452
+ model.vision_tower.encoder.layer.15.attention.q_proj.bias
453
+ model.vision_tower.encoder.layer.15.attention.q_proj.weight
454
+ model.vision_tower.encoder.layer.15.attention.v_proj.bias
455
+ model.vision_tower.encoder.layer.15.attention.v_proj.weight
456
+ model.vision_tower.encoder.layer.15.lambda_1
457
+ model.vision_tower.encoder.layer.15.lambda_2
458
+ model.vision_tower.encoder.layer.15.layernorm_after.bias
459
+ model.vision_tower.encoder.layer.15.layernorm_after.weight
460
+ model.vision_tower.encoder.layer.15.layernorm_before.bias
461
+ model.vision_tower.encoder.layer.15.layernorm_before.weight
462
+ model.vision_tower.encoder.layer.15.mlp.fc1.bias
463
+ model.vision_tower.encoder.layer.15.mlp.fc1.weight
464
+ model.vision_tower.encoder.layer.15.mlp.fc2.bias
465
+ model.vision_tower.encoder.layer.15.mlp.fc2.weight
466
+ model.vision_tower.encoder.layer.16.attention.k_proj.bias
467
+ model.vision_tower.encoder.layer.16.attention.k_proj.weight
468
+ model.vision_tower.encoder.layer.16.attention.projection_layer.bias
469
+ model.vision_tower.encoder.layer.16.attention.projection_layer.weight
470
+ model.vision_tower.encoder.layer.16.attention.q_proj.bias
471
+ model.vision_tower.encoder.layer.16.attention.q_proj.weight
472
+ model.vision_tower.encoder.layer.16.attention.v_proj.bias
473
+ model.vision_tower.encoder.layer.16.attention.v_proj.weight
474
+ model.vision_tower.encoder.layer.16.lambda_1
475
+ model.vision_tower.encoder.layer.16.lambda_2
476
+ model.vision_tower.encoder.layer.16.layernorm_after.bias
477
+ model.vision_tower.encoder.layer.16.layernorm_after.weight
478
+ model.vision_tower.encoder.layer.16.layernorm_before.bias
479
+ model.vision_tower.encoder.layer.16.layernorm_before.weight
480
+ model.vision_tower.encoder.layer.16.mlp.fc1.bias
481
+ model.vision_tower.encoder.layer.16.mlp.fc1.weight
482
+ model.vision_tower.encoder.layer.16.mlp.fc2.bias
483
+ model.vision_tower.encoder.layer.16.mlp.fc2.weight
484
+ model.vision_tower.encoder.layer.17.attention.k_proj.bias
485
+ model.vision_tower.encoder.layer.17.attention.k_proj.weight
486
+ model.vision_tower.encoder.layer.17.attention.projection_layer.bias
487
+ model.vision_tower.encoder.layer.17.attention.projection_layer.weight
488
+ model.vision_tower.encoder.layer.17.attention.q_proj.bias
489
+ model.vision_tower.encoder.layer.17.attention.q_proj.weight
490
+ model.vision_tower.encoder.layer.17.attention.v_proj.bias
491
+ model.vision_tower.encoder.layer.17.attention.v_proj.weight
492
+ model.vision_tower.encoder.layer.17.lambda_1
493
+ model.vision_tower.encoder.layer.17.lambda_2
494
+ model.vision_tower.encoder.layer.17.layernorm_after.bias
495
+ model.vision_tower.encoder.layer.17.layernorm_after.weight
496
+ model.vision_tower.encoder.layer.17.layernorm_before.bias
497
+ model.vision_tower.encoder.layer.17.layernorm_before.weight
498
+ model.vision_tower.encoder.layer.17.mlp.fc1.bias
499
+ model.vision_tower.encoder.layer.17.mlp.fc1.weight
500
+ model.vision_tower.encoder.layer.17.mlp.fc2.bias
501
+ model.vision_tower.encoder.layer.17.mlp.fc2.weight
502
+ model.vision_tower.encoder.layer.18.attention.k_proj.bias
503
+ model.vision_tower.encoder.layer.18.attention.k_proj.weight
504
+ model.vision_tower.encoder.layer.18.attention.projection_layer.bias
505
+ model.vision_tower.encoder.layer.18.attention.projection_layer.weight
506
+ model.vision_tower.encoder.layer.18.attention.q_proj.bias
507
+ model.vision_tower.encoder.layer.18.attention.q_proj.weight
508
+ model.vision_tower.encoder.layer.18.attention.v_proj.bias
509
+ model.vision_tower.encoder.layer.18.attention.v_proj.weight
510
+ model.vision_tower.encoder.layer.18.lambda_1
511
+ model.vision_tower.encoder.layer.18.lambda_2
512
+ model.vision_tower.encoder.layer.18.layernorm_after.bias
513
+ model.vision_tower.encoder.layer.18.layernorm_after.weight
514
+ model.vision_tower.encoder.layer.18.layernorm_before.bias
515
+ model.vision_tower.encoder.layer.18.layernorm_before.weight
516
+ model.vision_tower.encoder.layer.18.mlp.fc1.bias
517
+ model.vision_tower.encoder.layer.18.mlp.fc1.weight
518
+ model.vision_tower.encoder.layer.18.mlp.fc2.bias
519
+ model.vision_tower.encoder.layer.18.mlp.fc2.weight
520
+ model.vision_tower.encoder.layer.19.attention.k_proj.bias
521
+ model.vision_tower.encoder.layer.19.attention.k_proj.weight
522
+ model.vision_tower.encoder.layer.19.attention.projection_layer.bias
523
+ model.vision_tower.encoder.layer.19.attention.projection_layer.weight
524
+ model.vision_tower.encoder.layer.19.attention.q_proj.bias
525
+ model.vision_tower.encoder.layer.19.attention.q_proj.weight
526
+ model.vision_tower.encoder.layer.19.attention.v_proj.bias
527
+ model.vision_tower.encoder.layer.19.attention.v_proj.weight
528
+ model.vision_tower.encoder.layer.19.lambda_1
529
+ model.vision_tower.encoder.layer.19.lambda_2
530
+ model.vision_tower.encoder.layer.19.layernorm_after.bias
531
+ model.vision_tower.encoder.layer.19.layernorm_after.weight
532
+ model.vision_tower.encoder.layer.19.layernorm_before.bias
533
+ model.vision_tower.encoder.layer.19.layernorm_before.weight
534
+ model.vision_tower.encoder.layer.19.mlp.fc1.bias
535
+ model.vision_tower.encoder.layer.19.mlp.fc1.weight
536
+ model.vision_tower.encoder.layer.19.mlp.fc2.bias
537
+ model.vision_tower.encoder.layer.19.mlp.fc2.weight
538
+ model.vision_tower.encoder.layer.2.attention.k_proj.bias
539
+ model.vision_tower.encoder.layer.2.attention.k_proj.weight
540
+ model.vision_tower.encoder.layer.2.attention.projection_layer.bias
541
+ model.vision_tower.encoder.layer.2.attention.projection_layer.weight
542
+ model.vision_tower.encoder.layer.2.attention.q_proj.bias
543
+ model.vision_tower.encoder.layer.2.attention.q_proj.weight
544
+ model.vision_tower.encoder.layer.2.attention.v_proj.bias
545
+ model.vision_tower.encoder.layer.2.attention.v_proj.weight
546
+ model.vision_tower.encoder.layer.2.lambda_1
547
+ model.vision_tower.encoder.layer.2.lambda_2
548
+ model.vision_tower.encoder.layer.2.layernorm_after.bias
549
+ model.vision_tower.encoder.layer.2.layernorm_after.weight
550
+ model.vision_tower.encoder.layer.2.layernorm_before.bias
551
+ model.vision_tower.encoder.layer.2.layernorm_before.weight
552
+ model.vision_tower.encoder.layer.2.mlp.fc1.bias
553
+ model.vision_tower.encoder.layer.2.mlp.fc1.weight
554
+ model.vision_tower.encoder.layer.2.mlp.fc2.bias
555
+ model.vision_tower.encoder.layer.2.mlp.fc2.weight
556
+ model.vision_tower.encoder.layer.20.attention.k_proj.bias
557
+ model.vision_tower.encoder.layer.20.attention.k_proj.weight
558
+ model.vision_tower.encoder.layer.20.attention.projection_layer.bias
559
+ model.vision_tower.encoder.layer.20.attention.projection_layer.weight
560
+ model.vision_tower.encoder.layer.20.attention.q_proj.bias
561
+ model.vision_tower.encoder.layer.20.attention.q_proj.weight
562
+ model.vision_tower.encoder.layer.20.attention.v_proj.bias
563
+ model.vision_tower.encoder.layer.20.attention.v_proj.weight
564
+ model.vision_tower.encoder.layer.20.lambda_1
565
+ model.vision_tower.encoder.layer.20.lambda_2
566
+ model.vision_tower.encoder.layer.20.layernorm_after.bias
567
+ model.vision_tower.encoder.layer.20.layernorm_after.weight
568
+ model.vision_tower.encoder.layer.20.layernorm_before.bias
569
+ model.vision_tower.encoder.layer.20.layernorm_before.weight
570
+ model.vision_tower.encoder.layer.20.mlp.fc1.bias
571
+ model.vision_tower.encoder.layer.20.mlp.fc1.weight
572
+ model.vision_tower.encoder.layer.20.mlp.fc2.bias
573
+ model.vision_tower.encoder.layer.20.mlp.fc2.weight
574
+ model.vision_tower.encoder.layer.21.attention.k_proj.bias
575
+ model.vision_tower.encoder.layer.21.attention.k_proj.weight
576
+ model.vision_tower.encoder.layer.21.attention.projection_layer.bias
577
+ model.vision_tower.encoder.layer.21.attention.projection_layer.weight
578
+ model.vision_tower.encoder.layer.21.attention.q_proj.bias
579
+ model.vision_tower.encoder.layer.21.attention.q_proj.weight
580
+ model.vision_tower.encoder.layer.21.attention.v_proj.bias
581
+ model.vision_tower.encoder.layer.21.attention.v_proj.weight
582
+ model.vision_tower.encoder.layer.21.lambda_1
583
+ model.vision_tower.encoder.layer.21.lambda_2
584
+ model.vision_tower.encoder.layer.21.layernorm_after.bias
585
+ model.vision_tower.encoder.layer.21.layernorm_after.weight
586
+ model.vision_tower.encoder.layer.21.layernorm_before.bias
587
+ model.vision_tower.encoder.layer.21.layernorm_before.weight
588
+ model.vision_tower.encoder.layer.21.mlp.fc1.bias
589
+ model.vision_tower.encoder.layer.21.mlp.fc1.weight
590
+ model.vision_tower.encoder.layer.21.mlp.fc2.bias
591
+ model.vision_tower.encoder.layer.21.mlp.fc2.weight
592
+ model.vision_tower.encoder.layer.22.attention.k_proj.bias
593
+ model.vision_tower.encoder.layer.22.attention.k_proj.weight
594
+ model.vision_tower.encoder.layer.22.attention.projection_layer.bias
595
+ model.vision_tower.encoder.layer.22.attention.projection_layer.weight
596
+ model.vision_tower.encoder.layer.22.attention.q_proj.bias
597
+ model.vision_tower.encoder.layer.22.attention.q_proj.weight
598
+ model.vision_tower.encoder.layer.22.attention.v_proj.bias
599
+ model.vision_tower.encoder.layer.22.attention.v_proj.weight
600
+ model.vision_tower.encoder.layer.22.lambda_1
601
+ model.vision_tower.encoder.layer.22.lambda_2
602
+ model.vision_tower.encoder.layer.22.layernorm_after.bias
603
+ model.vision_tower.encoder.layer.22.layernorm_after.weight
604
+ model.vision_tower.encoder.layer.22.layernorm_before.bias
605
+ model.vision_tower.encoder.layer.22.layernorm_before.weight
606
+ model.vision_tower.encoder.layer.22.mlp.fc1.bias
607
+ model.vision_tower.encoder.layer.22.mlp.fc1.weight
608
+ model.vision_tower.encoder.layer.22.mlp.fc2.bias
609
+ model.vision_tower.encoder.layer.22.mlp.fc2.weight
610
+ model.vision_tower.encoder.layer.23.attention.k_proj.bias
611
+ model.vision_tower.encoder.layer.23.attention.k_proj.weight
612
+ model.vision_tower.encoder.layer.23.attention.projection_layer.bias
613
+ model.vision_tower.encoder.layer.23.attention.projection_layer.weight
614
+ model.vision_tower.encoder.layer.23.attention.q_proj.bias
615
+ model.vision_tower.encoder.layer.23.attention.q_proj.weight
616
+ model.vision_tower.encoder.layer.23.attention.v_proj.bias
617
+ model.vision_tower.encoder.layer.23.attention.v_proj.weight
618
+ model.vision_tower.encoder.layer.23.lambda_1
619
+ model.vision_tower.encoder.layer.23.lambda_2
620
+ model.vision_tower.encoder.layer.23.layernorm_after.bias
621
+ model.vision_tower.encoder.layer.23.layernorm_after.weight
622
+ model.vision_tower.encoder.layer.23.layernorm_before.bias
623
+ model.vision_tower.encoder.layer.23.layernorm_before.weight
624
+ model.vision_tower.encoder.layer.23.mlp.fc1.bias
625
+ model.vision_tower.encoder.layer.23.mlp.fc1.weight
626
+ model.vision_tower.encoder.layer.23.mlp.fc2.bias
627
+ model.vision_tower.encoder.layer.23.mlp.fc2.weight
628
+ model.vision_tower.encoder.layer.3.attention.k_proj.bias
629
+ model.vision_tower.encoder.layer.3.attention.k_proj.weight
630
+ model.vision_tower.encoder.layer.3.attention.projection_layer.bias
631
+ model.vision_tower.encoder.layer.3.attention.projection_layer.weight
632
+ model.vision_tower.encoder.layer.3.attention.q_proj.bias
633
+ model.vision_tower.encoder.layer.3.attention.q_proj.weight
634
+ model.vision_tower.encoder.layer.3.attention.v_proj.bias
635
+ model.vision_tower.encoder.layer.3.attention.v_proj.weight
636
+ model.vision_tower.encoder.layer.3.lambda_1
637
+ model.vision_tower.encoder.layer.3.lambda_2
638
+ model.vision_tower.encoder.layer.3.layernorm_after.bias
639
+ model.vision_tower.encoder.layer.3.layernorm_after.weight
640
+ model.vision_tower.encoder.layer.3.layernorm_before.bias
641
+ model.vision_tower.encoder.layer.3.layernorm_before.weight
642
+ model.vision_tower.encoder.layer.3.mlp.fc1.bias
643
+ model.vision_tower.encoder.layer.3.mlp.fc1.weight
644
+ model.vision_tower.encoder.layer.3.mlp.fc2.bias
645
+ model.vision_tower.encoder.layer.3.mlp.fc2.weight
646
+ model.vision_tower.encoder.layer.4.attention.k_proj.bias
647
+ model.vision_tower.encoder.layer.4.attention.k_proj.weight
648
+ model.vision_tower.encoder.layer.4.attention.projection_layer.bias
649
+ model.vision_tower.encoder.layer.4.attention.projection_layer.weight
650
+ model.vision_tower.encoder.layer.4.attention.q_proj.bias
651
+ model.vision_tower.encoder.layer.4.attention.q_proj.weight
652
+ model.vision_tower.encoder.layer.4.attention.v_proj.bias
653
+ model.vision_tower.encoder.layer.4.attention.v_proj.weight
654
+ model.vision_tower.encoder.layer.4.lambda_1
655
+ model.vision_tower.encoder.layer.4.lambda_2
656
+ model.vision_tower.encoder.layer.4.layernorm_after.bias
657
+ model.vision_tower.encoder.layer.4.layernorm_after.weight
658
+ model.vision_tower.encoder.layer.4.layernorm_before.bias
659
+ model.vision_tower.encoder.layer.4.layernorm_before.weight
660
+ model.vision_tower.encoder.layer.4.mlp.fc1.bias
661
+ model.vision_tower.encoder.layer.4.mlp.fc1.weight
662
+ model.vision_tower.encoder.layer.4.mlp.fc2.bias
663
+ model.vision_tower.encoder.layer.4.mlp.fc2.weight
664
+ model.vision_tower.encoder.layer.5.attention.k_proj.bias
665
+ model.vision_tower.encoder.layer.5.attention.k_proj.weight
666
+ model.vision_tower.encoder.layer.5.attention.projection_layer.bias
667
+ model.vision_tower.encoder.layer.5.attention.projection_layer.weight
668
+ model.vision_tower.encoder.layer.5.attention.q_proj.bias
669
+ model.vision_tower.encoder.layer.5.attention.q_proj.weight
670
+ model.vision_tower.encoder.layer.5.attention.v_proj.bias
671
+ model.vision_tower.encoder.layer.5.attention.v_proj.weight
672
+ model.vision_tower.encoder.layer.5.lambda_1
673
+ model.vision_tower.encoder.layer.5.lambda_2
674
+ model.vision_tower.encoder.layer.5.layernorm_after.bias
675
+ model.vision_tower.encoder.layer.5.layernorm_after.weight
676
+ model.vision_tower.encoder.layer.5.layernorm_before.bias
677
+ model.vision_tower.encoder.layer.5.layernorm_before.weight
678
+ model.vision_tower.encoder.layer.5.mlp.fc1.bias
679
+ model.vision_tower.encoder.layer.5.mlp.fc1.weight
680
+ model.vision_tower.encoder.layer.5.mlp.fc2.bias
681
+ model.vision_tower.encoder.layer.5.mlp.fc2.weight
682
+ model.vision_tower.encoder.layer.6.attention.k_proj.bias
683
+ model.vision_tower.encoder.layer.6.attention.k_proj.weight
684
+ model.vision_tower.encoder.layer.6.attention.projection_layer.bias
685
+ model.vision_tower.encoder.layer.6.attention.projection_layer.weight
686
+ model.vision_tower.encoder.layer.6.attention.q_proj.bias
687
+ model.vision_tower.encoder.layer.6.attention.q_proj.weight
688
+ model.vision_tower.encoder.layer.6.attention.v_proj.bias
689
+ model.vision_tower.encoder.layer.6.attention.v_proj.weight
690
+ model.vision_tower.encoder.layer.6.lambda_1
691
+ model.vision_tower.encoder.layer.6.lambda_2
692
+ model.vision_tower.encoder.layer.6.layernorm_after.bias
693
+ model.vision_tower.encoder.layer.6.layernorm_after.weight
694
+ model.vision_tower.encoder.layer.6.layernorm_before.bias
695
+ model.vision_tower.encoder.layer.6.layernorm_before.weight
696
+ model.vision_tower.encoder.layer.6.mlp.fc1.bias
697
+ model.vision_tower.encoder.layer.6.mlp.fc1.weight
698
+ model.vision_tower.encoder.layer.6.mlp.fc2.bias
699
+ model.vision_tower.encoder.layer.6.mlp.fc2.weight
700
+ model.vision_tower.encoder.layer.7.attention.k_proj.bias
701
+ model.vision_tower.encoder.layer.7.attention.k_proj.weight
702
+ model.vision_tower.encoder.layer.7.attention.projection_layer.bias
703
+ model.vision_tower.encoder.layer.7.attention.projection_layer.weight
704
+ model.vision_tower.encoder.layer.7.attention.q_proj.bias
705
+ model.vision_tower.encoder.layer.7.attention.q_proj.weight
706
+ model.vision_tower.encoder.layer.7.attention.v_proj.bias
707
+ model.vision_tower.encoder.layer.7.attention.v_proj.weight
708
+ model.vision_tower.encoder.layer.7.lambda_1
709
+ model.vision_tower.encoder.layer.7.lambda_2
710
+ model.vision_tower.encoder.layer.7.layernorm_after.bias
711
+ model.vision_tower.encoder.layer.7.layernorm_after.weight
712
+ model.vision_tower.encoder.layer.7.layernorm_before.bias
713
+ model.vision_tower.encoder.layer.7.layernorm_before.weight
714
+ model.vision_tower.encoder.layer.7.mlp.fc1.bias
715
+ model.vision_tower.encoder.layer.7.mlp.fc1.weight
716
+ model.vision_tower.encoder.layer.7.mlp.fc2.bias
717
+ model.vision_tower.encoder.layer.7.mlp.fc2.weight
718
+ model.vision_tower.encoder.layer.8.attention.k_proj.bias
719
+ model.vision_tower.encoder.layer.8.attention.k_proj.weight
720
+ model.vision_tower.encoder.layer.8.attention.projection_layer.bias
721
+ model.vision_tower.encoder.layer.8.attention.projection_layer.weight
722
+ model.vision_tower.encoder.layer.8.attention.q_proj.bias
723
+ model.vision_tower.encoder.layer.8.attention.q_proj.weight
724
+ model.vision_tower.encoder.layer.8.attention.v_proj.bias
725
+ model.vision_tower.encoder.layer.8.attention.v_proj.weight
726
+ model.vision_tower.encoder.layer.8.lambda_1
727
+ model.vision_tower.encoder.layer.8.lambda_2
728
+ model.vision_tower.encoder.layer.8.layernorm_after.bias
729
+ model.vision_tower.encoder.layer.8.layernorm_after.weight
730
+ model.vision_tower.encoder.layer.8.layernorm_before.bias
731
+ model.vision_tower.encoder.layer.8.layernorm_before.weight
732
+ model.vision_tower.encoder.layer.8.mlp.fc1.bias
733
+ model.vision_tower.encoder.layer.8.mlp.fc1.weight
734
+ model.vision_tower.encoder.layer.8.mlp.fc2.bias
735
+ model.vision_tower.encoder.layer.8.mlp.fc2.weight
736
+ model.vision_tower.encoder.layer.9.attention.k_proj.bias
737
+ model.vision_tower.encoder.layer.9.attention.k_proj.weight
738
+ model.vision_tower.encoder.layer.9.attention.projection_layer.bias
739
+ model.vision_tower.encoder.layer.9.attention.projection_layer.weight
740
+ model.vision_tower.encoder.layer.9.attention.q_proj.bias
741
+ model.vision_tower.encoder.layer.9.attention.q_proj.weight
742
+ model.vision_tower.encoder.layer.9.attention.v_proj.bias
743
+ model.vision_tower.encoder.layer.9.attention.v_proj.weight
744
+ model.vision_tower.encoder.layer.9.lambda_1
745
+ model.vision_tower.encoder.layer.9.lambda_2
746
+ model.vision_tower.encoder.layer.9.layernorm_after.bias
747
+ model.vision_tower.encoder.layer.9.layernorm_after.weight
748
+ model.vision_tower.encoder.layer.9.layernorm_before.bias
749
+ model.vision_tower.encoder.layer.9.layernorm_before.weight
750
+ model.vision_tower.encoder.layer.9.mlp.fc1.bias
751
+ model.vision_tower.encoder.layer.9.mlp.fc1.weight
752
+ model.vision_tower.encoder.layer.9.mlp.fc2.bias
753
+ model.vision_tower.encoder.layer.9.mlp.fc2.weight
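Note: the keys visible in the tail of output_dir_lr1e-6/original_keys.txt (e.g. vision_tower.encoder.layer.9.mlp.fc2.weight) reappear in output_dir_lr1e-6/output_keys.txt under a "model." prefix. A minimal sketch for checking that remapping locally, assuming the repo has been downloaded and the paths are relative to its root (the script is illustrative only, not part of the upload):

from pathlib import Path

base = Path("output_dir_lr1e-6")
original = set(base.joinpath("original_keys.txt").read_text().split())
output = set(base.joinpath("output_keys.txt").read_text().split())

# Output keys that are "model."-prefixed copies of original keys.
prefixed = {k for k in output if k.startswith("model.") and k.removeprefix("model.") in original}
# Output keys with no counterpart in the original listing, even after un-prefixing.
new_only = {k for k in output if k not in original and k.removeprefix("model.") not in original}

print(len(original), len(output), len(prefixed), sorted(new_only)[:5])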
preprocessor_config.json ADDED
@@ -0,0 +1,35 @@
1
+ {
2
+ "crop_size": null,
3
+ "crop_to_patches": false,
4
+ "data_format": "channels_first",
5
+ "default_to_square": true,
6
+ "device": null,
7
+ "disable_grouping": null,
8
+ "do_center_crop": null,
9
+ "do_convert_rgb": true,
10
+ "do_normalize": true,
11
+ "do_rescale": true,
12
+ "do_resize": true,
13
+ "image_mean": [
14
+ 0.485,
15
+ 0.456,
16
+ 0.406
17
+ ],
18
+ "image_processor_type": "GotOcr2ImageProcessorFast",
19
+ "image_std": [
20
+ 0.229,
21
+ 0.224,
22
+ 0.225
23
+ ],
24
+ "input_data_format": null,
25
+ "max_patches": 12,
26
+ "min_patches": 1,
27
+ "processor_class": "InternVLProcessor",
28
+ "resample": 3,
29
+ "rescale_factor": 0.00392156862745098,
30
+ "return_tensors": null,
31
+ "size": {
32
+ "height": 448,
33
+ "width": 448
34
+ }
35
+ }
processor_config.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "image_seq_length": 256,
3
+ "processor_class": "InternVLProcessor"
4
+ }
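Note: preprocessor_config.json and processor_config.json above declare a GotOcr2ImageProcessorFast image processor wrapped by an InternVLProcessor. A minimal sketch of loading them with transformers' AutoProcessor, assuming a recent transformers release with InternVL support; "path/to/checkpoint" is a hypothetical local download of this repo:

from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("path/to/checkpoint")
print(type(processor).__name__)    # expected: InternVLProcessor, per "processor_class"
print(processor.image_seq_length)  # 256, per processor_config.json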
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:478b41e9f26d338fd8f896e08cad1adab7c423b61f1b45754113bc78d256a3f9
3
+ size 16389
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce29a8767a7d907dd24987aa2c3e654d4317f3042fbc13b5b72cadb46d43311a
3
+ size 16389
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a48db011646b4e9a867bf12f4a233cad5dfbfe309686f8996c250196d3783a
3
+ size 16389
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9562ee822472a4f01dcd6349ab3d1ef42a48915fe3b92e843a0c37db53c8421
3
+ size 16389
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7d2767d83c3bf27f12db022b0632e2c4f8c164274ba75e380cf18f9d5f21819
3
+ size 16389
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76816358d4e5db8149d60d55234db658d67a13c0c1ce05d7404cf7125a676a5c
3
+ size 16389
rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1562e7520c977d178183d641f70abcf3f57da2489938756cfbebf9b6e6c1a9fd
3
+ size 16389
rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b6cabaed045c5398cd1b732f7ec48bd363f3b43cd24e0e70e641a42bd00c28
3
+ size 16389
scheduler.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d3e21813a36ff9c57fdbac9a05399de3393d169c07b37d946fbfd7e30605568c
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,44 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>",
16
+ "<img>",
17
+ "</img>",
18
+ "<IMG_CONTEXT>",
19
+ "<quad>",
20
+ "</quad>",
21
+ "<ref>",
22
+ "</ref>",
23
+ "<box>",
24
+ "</box>"
25
+ ],
26
+ "context_image_token": "<IMG_CONTEXT>",
27
+ "end_image_token": "</img>",
28
+ "eos_token": {
29
+ "content": "<|im_end|>",
30
+ "lstrip": false,
31
+ "normalized": false,
32
+ "rstrip": false,
33
+ "single_word": false
34
+ },
35
+ "pad_token": {
36
+ "content": "<|endoftext|>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false
41
+ },
42
+ "start_image_token": "<img>",
43
+ "video_token": "<video>"
44
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7b9d18660f656ae5a87df2d5d6ed990e80f292d3473c1a35cae8259a5d28cd67
3
+ size 11424484
tokenizer_config.json ADDED
@@ -0,0 +1,340 @@
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "151643": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "151644": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "151645": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "151646": {
31
+ "content": "<|object_ref_start|>",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ },
38
+ "151647": {
39
+ "content": "<|object_ref_end|>",
40
+ "lstrip": false,
41
+ "normalized": false,
42
+ "rstrip": false,
43
+ "single_word": false,
44
+ "special": true
45
+ },
46
+ "151648": {
47
+ "content": "<|box_start|>",
48
+ "lstrip": false,
49
+ "normalized": false,
50
+ "rstrip": false,
51
+ "single_word": false,
52
+ "special": true
53
+ },
54
+ "151649": {
55
+ "content": "<|box_end|>",
56
+ "lstrip": false,
57
+ "normalized": false,
58
+ "rstrip": false,
59
+ "single_word": false,
60
+ "special": true
61
+ },
62
+ "151650": {
63
+ "content": "<|quad_start|>",
64
+ "lstrip": false,
65
+ "normalized": false,
66
+ "rstrip": false,
67
+ "single_word": false,
68
+ "special": true
69
+ },
70
+ "151651": {
71
+ "content": "<|quad_end|>",
72
+ "lstrip": false,
73
+ "normalized": false,
74
+ "rstrip": false,
75
+ "single_word": false,
76
+ "special": true
77
+ },
78
+ "151652": {
79
+ "content": "<|vision_start|>",
80
+ "lstrip": false,
81
+ "normalized": false,
82
+ "rstrip": false,
83
+ "single_word": false,
84
+ "special": true
85
+ },
86
+ "151653": {
87
+ "content": "<|vision_end|>",
88
+ "lstrip": false,
89
+ "normalized": false,
90
+ "rstrip": false,
91
+ "single_word": false,
92
+ "special": true
93
+ },
94
+ "151654": {
95
+ "content": "<|vision_pad|>",
96
+ "lstrip": false,
97
+ "normalized": false,
98
+ "rstrip": false,
99
+ "single_word": false,
100
+ "special": true
101
+ },
102
+ "151655": {
103
+ "content": "<|image_pad|>",
104
+ "lstrip": false,
105
+ "normalized": false,
106
+ "rstrip": false,
107
+ "single_word": false,
108
+ "special": true
109
+ },
110
+ "151656": {
111
+ "content": "<|video_pad|>",
112
+ "lstrip": false,
113
+ "normalized": false,
114
+ "rstrip": false,
115
+ "single_word": false,
116
+ "special": true
117
+ },
118
+ "151657": {
119
+ "content": "<tool_call>",
120
+ "lstrip": false,
121
+ "normalized": false,
122
+ "rstrip": false,
123
+ "single_word": false,
124
+ "special": false
125
+ },
126
+ "151658": {
127
+ "content": "</tool_call>",
128
+ "lstrip": false,
129
+ "normalized": false,
130
+ "rstrip": false,
131
+ "single_word": false,
132
+ "special": false
133
+ },
134
+ "151659": {
135
+ "content": "<|fim_prefix|>",
136
+ "lstrip": false,
137
+ "normalized": false,
138
+ "rstrip": false,
139
+ "single_word": false,
140
+ "special": false
141
+ },
142
+ "151660": {
143
+ "content": "<|fim_middle|>",
144
+ "lstrip": false,
145
+ "normalized": false,
146
+ "rstrip": false,
147
+ "single_word": false,
148
+ "special": false
149
+ },
150
+ "151661": {
151
+ "content": "<|fim_suffix|>",
152
+ "lstrip": false,
153
+ "normalized": false,
154
+ "rstrip": false,
155
+ "single_word": false,
156
+ "special": false
157
+ },
158
+ "151662": {
159
+ "content": "<|fim_pad|>",
160
+ "lstrip": false,
161
+ "normalized": false,
162
+ "rstrip": false,
163
+ "single_word": false,
164
+ "special": false
165
+ },
166
+ "151663": {
167
+ "content": "<|repo_name|>",
168
+ "lstrip": false,
169
+ "normalized": false,
170
+ "rstrip": false,
171
+ "single_word": false,
172
+ "special": false
173
+ },
174
+ "151664": {
175
+ "content": "<|file_sep|>",
176
+ "lstrip": false,
177
+ "normalized": false,
178
+ "rstrip": false,
179
+ "single_word": false,
180
+ "special": false
181
+ },
182
+ "151665": {
183
+ "content": "<tool_response>",
184
+ "lstrip": false,
185
+ "normalized": false,
186
+ "rstrip": false,
187
+ "single_word": false,
188
+ "special": false
189
+ },
190
+ "151666": {
191
+ "content": "</tool_response>",
192
+ "lstrip": false,
193
+ "normalized": false,
194
+ "rstrip": false,
195
+ "single_word": false,
196
+ "special": false
197
+ },
198
+ "151667": {
199
+ "content": "<think>",
200
+ "lstrip": false,
201
+ "normalized": false,
202
+ "rstrip": false,
203
+ "single_word": false,
204
+ "special": false
205
+ },
206
+ "151668": {
207
+ "content": "</think>",
208
+ "lstrip": false,
209
+ "normalized": false,
210
+ "rstrip": false,
211
+ "single_word": false,
212
+ "special": false
213
+ },
214
+ "151669": {
215
+ "content": "<img>",
216
+ "lstrip": false,
217
+ "normalized": false,
218
+ "rstrip": false,
219
+ "single_word": false,
220
+ "special": true
221
+ },
222
+ "151670": {
223
+ "content": "</img>",
224
+ "lstrip": false,
225
+ "normalized": false,
226
+ "rstrip": false,
227
+ "single_word": false,
228
+ "special": true
229
+ },
230
+ "151671": {
231
+ "content": "<IMG_CONTEXT>",
232
+ "lstrip": false,
233
+ "normalized": false,
234
+ "rstrip": false,
235
+ "single_word": false,
236
+ "special": true
237
+ },
238
+ "151672": {
239
+ "content": "<quad>",
240
+ "lstrip": false,
241
+ "normalized": false,
242
+ "rstrip": false,
243
+ "single_word": false,
244
+ "special": true
245
+ },
246
+ "151673": {
247
+ "content": "</quad>",
248
+ "lstrip": false,
249
+ "normalized": false,
250
+ "rstrip": false,
251
+ "single_word": false,
252
+ "special": true
253
+ },
254
+ "151674": {
255
+ "content": "<ref>",
256
+ "lstrip": false,
257
+ "normalized": false,
258
+ "rstrip": false,
259
+ "single_word": false,
260
+ "special": true
261
+ },
262
+ "151675": {
263
+ "content": "</ref>",
264
+ "lstrip": false,
265
+ "normalized": false,
266
+ "rstrip": false,
267
+ "single_word": false,
268
+ "special": true
269
+ },
270
+ "151676": {
271
+ "content": "<box>",
272
+ "lstrip": false,
273
+ "normalized": false,
274
+ "rstrip": false,
275
+ "single_word": false,
276
+ "special": true
277
+ },
278
+ "151677": {
279
+ "content": "</box>",
280
+ "lstrip": false,
281
+ "normalized": false,
282
+ "rstrip": false,
283
+ "single_word": false,
284
+ "special": true
285
+ },
286
+ "151678": {
287
+ "content": "<video>",
288
+ "lstrip": false,
289
+ "normalized": false,
290
+ "rstrip": false,
291
+ "single_word": false,
292
+ "special": true
293
+ }
294
+ },
295
+ "additional_special_tokens": [
296
+ "<|im_start|>",
297
+ "<|im_end|>",
298
+ "<|object_ref_start|>",
299
+ "<|object_ref_end|>",
300
+ "<|box_start|>",
301
+ "<|box_end|>",
302
+ "<|quad_start|>",
303
+ "<|quad_end|>",
304
+ "<|vision_start|>",
305
+ "<|vision_end|>",
306
+ "<|vision_pad|>",
307
+ "<|image_pad|>",
308
+ "<|video_pad|>",
309
+ "<img>",
310
+ "</img>",
311
+ "<IMG_CONTEXT>",
312
+ "<quad>",
313
+ "</quad>",
314
+ "<ref>",
315
+ "</ref>",
316
+ "<box>",
317
+ "</box>"
318
+ ],
319
+ "bos_token": null,
320
+ "clean_up_tokenization_spaces": false,
321
+ "context_image_token": "<IMG_CONTEXT>",
322
+ "end_image_token": "</img>",
323
+ "eos_token": "<|im_end|>",
324
+ "errors": "replace",
325
+ "extra_special_tokens": {
326
+ "context_image_token": "<IMG_CONTEXT>",
327
+ "end_image_token": "</img>",
328
+ "start_image_token": "<img>",
329
+ "video_token": "<video>"
330
+ },
331
+ "model_max_length": 40960,
332
+ "pad_token": "<|endoftext|>",
333
+ "padding_side": "right",
334
+ "processor_class": "InternVLProcessor",
335
+ "split_special_tokens": false,
336
+ "start_image_token": "<img>",
337
+ "tokenizer_class": "Qwen2Tokenizer",
338
+ "unk_token": null,
339
+ "video_token": "<video>"
340
+ }
trainer_state.json ADDED
@@ -0,0 +1,468 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 6250,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016,
14
+ "grad_norm": 5.986245155334473,
15
+ "learning_rate": 1.5840000000000002e-06,
16
+ "loss": 1.9745,
17
+ "step": 100
18
+ },
19
+ {
20
+ "epoch": 0.032,
21
+ "grad_norm": 5.599625110626221,
22
+ "learning_rate": 3.1840000000000003e-06,
23
+ "loss": 1.125,
24
+ "step": 200
25
+ },
26
+ {
27
+ "epoch": 0.048,
28
+ "grad_norm": 4.610442161560059,
29
+ "learning_rate": 4.784e-06,
30
+ "loss": 1.025,
31
+ "step": 300
32
+ },
33
+ {
34
+ "epoch": 0.064,
35
+ "grad_norm": 5.483785152435303,
36
+ "learning_rate": 6.384e-06,
37
+ "loss": 0.9741,
38
+ "step": 400
39
+ },
40
+ {
41
+ "epoch": 0.08,
42
+ "grad_norm": 4.7527313232421875,
43
+ "learning_rate": 7.984e-06,
44
+ "loss": 0.9544,
45
+ "step": 500
46
+ },
47
+ {
48
+ "epoch": 0.096,
49
+ "grad_norm": 4.325509548187256,
50
+ "learning_rate": 9.584000000000002e-06,
51
+ "loss": 0.9134,
52
+ "step": 600
53
+ },
54
+ {
55
+ "epoch": 0.112,
56
+ "grad_norm": 4.1613898277282715,
57
+ "learning_rate": 9.995730310237113e-06,
58
+ "loss": 0.8952,
59
+ "step": 700
60
+ },
61
+ {
62
+ "epoch": 0.128,
63
+ "grad_norm": 5.373358726501465,
64
+ "learning_rate": 9.976408726659296e-06,
65
+ "loss": 0.8752,
66
+ "step": 800
67
+ },
68
+ {
69
+ "epoch": 0.144,
70
+ "grad_norm": 5.119192600250244,
71
+ "learning_rate": 9.941568353618064e-06,
72
+ "loss": 0.8654,
73
+ "step": 900
74
+ },
75
+ {
76
+ "epoch": 0.16,
77
+ "grad_norm": 3.383653402328491,
78
+ "learning_rate": 9.891317839828527e-06,
79
+ "loss": 0.845,
80
+ "step": 1000
81
+ },
82
+ {
83
+ "epoch": 0.176,
84
+ "grad_norm": 3.9472246170043945,
85
+ "learning_rate": 9.825813890092639e-06,
86
+ "loss": 0.8178,
87
+ "step": 1100
88
+ },
89
+ {
90
+ "epoch": 0.192,
91
+ "grad_norm": 3.940248489379883,
92
+ "learning_rate": 9.745260776619698e-06,
93
+ "loss": 0.8142,
94
+ "step": 1200
95
+ },
96
+ {
97
+ "epoch": 0.208,
98
+ "grad_norm": 3.9536900520324707,
99
+ "learning_rate": 9.649909702009265e-06,
100
+ "loss": 0.8028,
101
+ "step": 1300
102
+ },
103
+ {
104
+ "epoch": 0.224,
105
+ "grad_norm": 4.985702037811279,
106
+ "learning_rate": 9.54005801588298e-06,
107
+ "loss": 0.795,
108
+ "step": 1400
109
+ },
110
+ {
111
+ "epoch": 0.24,
112
+ "grad_norm": 5.107704162597656,
113
+ "learning_rate": 9.416048287608195e-06,
114
+ "loss": 0.7805,
115
+ "step": 1500
116
+ },
117
+ {
118
+ "epoch": 0.256,
119
+ "grad_norm": 3.63683819770813,
120
+ "learning_rate": 9.27826723800513e-06,
121
+ "loss": 0.7734,
122
+ "step": 1600
123
+ },
124
+ {
125
+ "epoch": 0.272,
126
+ "grad_norm": 5.008887767791748,
127
+ "learning_rate": 9.127144533368956e-06,
128
+ "loss": 0.7681,
129
+ "step": 1700
130
+ },
131
+ {
132
+ "epoch": 0.288,
133
+ "grad_norm": 4.119167327880859,
134
+ "learning_rate": 8.963151445567642e-06,
135
+ "loss": 0.7479,
136
+ "step": 1800
137
+ },
138
+ {
139
+ "epoch": 0.304,
140
+ "grad_norm": 4.297443866729736,
141
+ "learning_rate": 8.786799382394e-06,
142
+ "loss": 0.7478,
143
+ "step": 1900
144
+ },
145
+ {
146
+ "epoch": 0.32,
147
+ "grad_norm": 4.435776710510254,
148
+ "learning_rate": 8.598638292755e-06,
149
+ "loss": 0.7389,
150
+ "step": 2000
151
+ },
152
+ {
153
+ "epoch": 0.336,
154
+ "grad_norm": 3.9187700748443604,
155
+ "learning_rate": 8.399254951671681e-06,
156
+ "loss": 0.7226,
157
+ "step": 2100
158
+ },
159
+ {
160
+ "epoch": 0.352,
161
+ "grad_norm": 3.695847988128662,
162
+ "learning_rate": 8.18927113043791e-06,
163
+ "loss": 0.7138,
164
+ "step": 2200
165
+ },
166
+ {
167
+ "epoch": 0.368,
168
+ "grad_norm": 4.540302753448486,
169
+ "learning_rate": 7.969341657644236e-06,
170
+ "loss": 0.7126,
171
+ "step": 2300
172
+ },
173
+ {
174
+ "epoch": 0.384,
175
+ "grad_norm": 4.550365447998047,
176
+ "learning_rate": 7.740152377113493e-06,
177
+ "loss": 0.7063,
178
+ "step": 2400
179
+ },
180
+ {
181
+ "epoch": 0.4,
182
+ "grad_norm": 3.8343756198883057,
183
+ "learning_rate": 7.5024180091162976e-06,
184
+ "loss": 0.6911,
185
+ "step": 2500
186
+ },
187
+ {
188
+ "epoch": 0.416,
189
+ "grad_norm": 4.098830223083496,
190
+ "learning_rate": 7.256879921536164e-06,
191
+ "loss": 0.6991,
192
+ "step": 2600
193
+ },
194
+ {
195
+ "epoch": 0.432,
196
+ "grad_norm": 3.9230875968933105,
197
+ "learning_rate": 7.004303817934775e-06,
198
+ "loss": 0.6848,
199
+ "step": 2700
200
+ },
201
+ {
202
+ "epoch": 0.448,
203
+ "grad_norm": 4.32880163192749,
204
+ "learning_rate": 6.745477349727154e-06,
205
+ "loss": 0.6643,
206
+ "step": 2800
207
+ },
208
+ {
209
+ "epoch": 0.464,
210
+ "grad_norm": 4.100039482116699,
211
+ "learning_rate": 6.481207659913062e-06,
212
+ "loss": 0.6791,
213
+ "step": 2900
214
+ },
215
+ {
216
+ "epoch": 0.48,
217
+ "grad_norm": 4.455363750457764,
218
+ "learning_rate": 6.212318866024449e-06,
219
+ "loss": 0.6568,
220
+ "step": 3000
221
+ },
222
+ {
223
+ "epoch": 0.496,
224
+ "grad_norm": 3.64780330657959,
225
+ "learning_rate": 5.939649490138305e-06,
226
+ "loss": 0.6609,
227
+ "step": 3100
228
+ },
229
+ {
230
+ "epoch": 0.512,
231
+ "grad_norm": 4.5561418533325195,
232
+ "learning_rate": 5.664049843969348e-06,
233
+ "loss": 0.6598,
234
+ "step": 3200
235
+ },
236
+ {
237
+ "epoch": 0.528,
238
+ "grad_norm": 4.1221747398376465,
239
+ "learning_rate": 5.386379377197056e-06,
240
+ "loss": 0.6499,
241
+ "step": 3300
242
+ },
243
+ {
244
+ "epoch": 0.544,
245
+ "grad_norm": 4.2754106521606445,
246
+ "learning_rate": 5.107503997296225e-06,
247
+ "loss": 0.6534,
248
+ "step": 3400
249
+ },
250
+ {
251
+ "epoch": 0.56,
252
+ "grad_norm": 3.418328285217285,
253
+ "learning_rate": 4.8282933692290665e-06,
254
+ "loss": 0.6511,
255
+ "step": 3500
256
+ },
257
+ {
258
+ "epoch": 0.576,
259
+ "grad_norm": 4.334653854370117,
260
+ "learning_rate": 4.549618203419684e-06,
261
+ "loss": 0.6388,
262
+ "step": 3600
263
+ },
264
+ {
265
+ "epoch": 0.592,
266
+ "grad_norm": 3.9164488315582275,
267
+ "learning_rate": 4.272347540468327e-06,
268
+ "loss": 0.6327,
269
+ "step": 3700
270
+ },
271
+ {
272
+ "epoch": 0.608,
273
+ "grad_norm": 4.421480655670166,
274
+ "learning_rate": 3.997346041072912e-06,
275
+ "loss": 0.6378,
276
+ "step": 3800
277
+ },
278
+ {
279
+ "epoch": 0.624,
280
+ "grad_norm": 4.170716762542725,
281
+ "learning_rate": 3.725471289609174e-06,
282
+ "loss": 0.6336,
283
+ "step": 3900
284
+ },
285
+ {
286
+ "epoch": 0.64,
287
+ "grad_norm": 5.108222961425781,
288
+ "learning_rate": 3.457571119778104e-06,
289
+ "loss": 0.613,
290
+ "step": 4000
291
+ },
292
+ {
293
+ "epoch": 0.656,
294
+ "grad_norm": 4.666893005371094,
295
+ "learning_rate": 3.1944809706606123e-06,
296
+ "loss": 0.6106,
297
+ "step": 4100
298
+ },
299
+ {
300
+ "epoch": 0.672,
301
+ "grad_norm": 4.099169731140137,
302
+ "learning_rate": 2.9370212814244436e-06,
303
+ "loss": 0.5947,
304
+ "step": 4200
305
+ },
306
+ {
307
+ "epoch": 0.688,
308
+ "grad_norm": 3.848003625869751,
309
+ "learning_rate": 2.6859949328079005e-06,
310
+ "loss": 0.5981,
311
+ "step": 4300
312
+ },
313
+ {
314
+ "epoch": 0.704,
315
+ "grad_norm": 4.583881378173828,
316
+ "learning_rate": 2.4421847433590466e-06,
317
+ "loss": 0.6008,
318
+ "step": 4400
319
+ },
320
+ {
321
+ "epoch": 0.72,
322
+ "grad_norm": 4.723909378051758,
323
+ "learning_rate": 2.2063510282382517e-06,
324
+ "loss": 0.5932,
325
+ "step": 4500
326
+ },
327
+ {
328
+ "epoch": 0.736,
329
+ "grad_norm": 4.5427045822143555,
330
+ "learning_rate": 1.979229228196942e-06,
331
+ "loss": 0.5972,
332
+ "step": 4600
333
+ },
334
+ {
335
+ "epoch": 0.752,
336
+ "grad_norm": 5.033416748046875,
337
+ "learning_rate": 1.761527616126475e-06,
338
+ "loss": 0.5964,
339
+ "step": 4700
340
+ },
341
+ {
342
+ "epoch": 0.768,
343
+ "grad_norm": 5.443480491638184,
344
+ "learning_rate": 1.5539250883292078e-06,
345
+ "loss": 0.589,
346
+ "step": 4800
347
+ },
348
+ {
349
+ "epoch": 0.784,
350
+ "grad_norm": 4.804011821746826,
351
+ "learning_rate": 1.3570690473996483e-06,
352
+ "loss": 0.5812,
353
+ "step": 4900
354
+ },
355
+ {
356
+ "epoch": 0.8,
357
+ "grad_norm": 5.263527870178223,
358
+ "learning_rate": 1.1715733833178178e-06,
359
+ "loss": 0.5747,
360
+ "step": 5000
361
+ },
362
+ {
363
+ "epoch": 0.816,
364
+ "grad_norm": 5.167060852050781,
365
+ "learning_rate": 9.98016559050765e-07,
366
+ "loss": 0.5652,
367
+ "step": 5100
368
+ },
369
+ {
370
+ "epoch": 0.832,
371
+ "grad_norm": 5.249851226806641,
372
+ "learning_rate": 8.369398066322049e-07,
373
+ "loss": 0.5808,
374
+ "step": 5200
375
+ },
376
+ {
377
+ "epoch": 0.848,
378
+ "grad_norm": 5.08359956741333,
379
+ "learning_rate": 6.888454393457817e-07,
380
+ "loss": 0.5656,
381
+ "step": 5300
382
+ },
383
+ {
384
+ "epoch": 0.864,
385
+ "grad_norm": 4.25005578994751,
386
+ "learning_rate": 5.541952852753341e-07,
387
+ "loss": 0.5745,
388
+ "step": 5400
389
+ },
390
+ {
391
+ "epoch": 0.88,
392
+ "grad_norm": 4.67172908782959,
393
+ "learning_rate": 4.334092471071194e-07,
394
+ "loss": 0.5695,
395
+ "step": 5500
396
+ },
397
+ {
398
+ "epoch": 0.896,
399
+ "grad_norm": 3.975259780883789,
400
+ "learning_rate": 3.268639926751943e-07,
401
+ "loss": 0.5632,
402
+ "step": 5600
403
+ },
404
+ {
405
+ "epoch": 0.912,
406
+ "grad_norm": 4.989463806152344,
407
+ "learning_rate": 2.3489178033345994e-07,
408
+ "loss": 0.5807,
409
+ "step": 5700
410
+ },
411
+ {
412
+ "epoch": 0.928,
413
+ "grad_norm": 4.873440742492676,
414
+ "learning_rate": 1.5777942281740789e-07,
415
+ "loss": 0.5645,
416
+ "step": 5800
417
+ },
418
+ {
419
+ "epoch": 0.944,
420
+ "grad_norm": 4.344705581665039,
421
+ "learning_rate": 9.576739282673886e-08,
422
+ "loss": 0.5681,
423
+ "step": 5900
424
+ },
425
+ {
426
+ "epoch": 0.96,
427
+ "grad_norm": 4.521224498748779,
428
+ "learning_rate": 4.9049073118072057e-08,
429
+ "loss": 0.5692,
430
+ "step": 6000
431
+ },
432
+ {
433
+ "epoch": 0.976,
434
+ "grad_norm": 4.890485763549805,
435
+ "learning_rate": 1.7770153446302618e-08,
436
+ "loss": 0.5742,
437
+ "step": 6100
438
+ },
439
+ {
440
+ "epoch": 0.992,
441
+ "grad_norm": 5.1736626625061035,
442
+ "learning_rate": 2.0281762352331034e-09,
443
+ "loss": 0.5735,
444
+ "step": 6200
445
+ }
446
+ ],
447
+ "logging_steps": 100,
448
+ "max_steps": 6250,
449
+ "num_input_tokens_seen": 0,
450
+ "num_train_epochs": 1,
451
+ "save_steps": 1500,
452
+ "stateful_callbacks": {
453
+ "TrainerControl": {
454
+ "args": {
455
+ "should_epoch_stop": false,
456
+ "should_evaluate": false,
457
+ "should_log": false,
458
+ "should_save": true,
459
+ "should_training_stop": true
460
+ },
461
+ "attributes": {}
462
+ }
463
+ },
464
+ "total_flos": 2582174620450816.0,
465
+ "train_batch_size": 16,
466
+ "trial_name": null,
467
+ "trial_params": null
468
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:056230b5bd08226120577c25e8f3415e93d8b4823d3219c6c9b4c4d8033ccee1
3
+ size 8273
video_preprocessor_config.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "do_center_crop": null,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_pad": null,
10
+ "do_rescale": true,
11
+ "do_resize": true,
12
+ "do_sample_frames": false,
13
+ "fps": null,
14
+ "image_mean": [
15
+ 0.48145466,
16
+ 0.4578275,
17
+ 0.40821073
18
+ ],
19
+ "image_std": [
20
+ 0.26862954,
21
+ 0.26130258,
22
+ 0.27577711
23
+ ],
24
+ "initial_shift": true,
25
+ "input_data_format": null,
26
+ "num_frames": null,
27
+ "processor_class": "InternVLProcessor",
28
+ "resample": 3,
29
+ "rescale_factor": 0.00392156862745098,
30
+ "return_metadata": false,
31
+ "size": {
32
+ "height": 384,
33
+ "width": 384
34
+ },
35
+ "size_divisor": null,
36
+ "video_metadata": null,
37
+ "video_processor_type": "InternVLVideoProcessor"
38
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff
zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from ZeRO 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
+ @dataclass
40
+ class zero_model_state:
41
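+ # Field notes (inferred from how these fields are used below): `buffers` holds the
+ # non-parameter tensors restored to fp32, `param_shapes` is a list of per-group
+ # {name: shape} dicts, `shared_params` holds [alias, source] name pairs for tied
+ # weights, and the frozen_* fields carry the shapes/fragments of parameters that
+ # were excluded from optimizer partitioning.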
+ buffers: dict()
42
+ param_shapes: dict()
43
+ shared_params: list
44
+ ds_version: int
45
+ frozen_param_shapes: dict()
46
+ frozen_param_fragments: dict()
47
+
48
+
49
+ debug = 0
50
+
51
+ # load to cpu
52
+ device = torch.device('cpu')
53
+
54
+
55
+ def atoi(text):
56
+ return int(text) if text.isdigit() else text
57
+
58
+
59
+ def natural_keys(text):
60
+ '''
61
+ alist.sort(key=natural_keys) sorts in human order
62
+ http://nedbatchelder.com/blog/200712/human_sorting.html
63
+ (See Toothy's implementation in the comments)
64
+ '''
65
+ return [atoi(c) for c in re.split(r'(\d+)', text)]
66
+
67
+
68
+ def get_model_state_file(checkpoint_dir, zero_stage):
69
+ if not os.path.isdir(checkpoint_dir):
70
+ raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")
71
+
72
+ # there should be only one file
73
+ if zero_stage <= 2:
74
+ file = os.path.join(checkpoint_dir, "mp_rank_00_model_states.pt")
75
+ elif zero_stage == 3:
76
+ file = os.path.join(checkpoint_dir, "zero_pp_rank_0_mp_rank_00_model_states.pt")
77
+
78
+ if not os.path.exists(file):
79
+ raise FileNotFoundError(f"can't find model states file at '{file}'")
80
+
81
+ return file
82
+
83
+
84
+ def get_checkpoint_files(checkpoint_dir, glob_pattern):
85
+ # XXX: need to test that this simple glob rule works for multi-node setup too
86
+ ckpt_files = sorted(glob.glob(os.path.join(checkpoint_dir, glob_pattern)), key=natural_keys)
87
+
88
+ if len(ckpt_files) == 0:
89
+ raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")
90
+
91
+ return ckpt_files
92
+
93
+
94
+ def get_optim_files(checkpoint_dir):
95
+ return get_checkpoint_files(checkpoint_dir, "*_optim_states.pt")
96
+
97
+
98
+ def get_model_state_files(checkpoint_dir):
99
+ return get_checkpoint_files(checkpoint_dir, "*_model_states.pt")
100
+
101
+
102
+ def parse_model_states(files):
103
+ zero_model_states = []
104
+ for file in files:
105
+ state_dict = torch.load(file, map_location=device, weights_only=False)
106
+
107
+ if BUFFER_NAMES not in state_dict:
108
+ raise ValueError(f"{file} is not a model state checkpoint")
109
+ buffer_names = state_dict[BUFFER_NAMES]
110
+ if debug:
111
+ print("Found buffers:", buffer_names)
112
+
113
+ # recover just the buffers while restoring them to fp32 if they were saved in fp16
114
+ buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
115
+ param_shapes = state_dict[PARAM_SHAPES]
116
+
117
+ # collect parameters that are included in param_shapes
118
+ param_names = []
119
+ for s in param_shapes:
120
+ for name in s.keys():
121
+ param_names.append(name)
122
+
123
+ # update with frozen parameters
124
+ frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
125
+ if frozen_param_shapes is not None:
126
+ if debug:
127
+ print(f"Found frozen_param_shapes: {frozen_param_shapes}")
128
+ param_names += list(frozen_param_shapes.keys())
129
+
130
+ # handle shared params
131
+ shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]
132
+
133
+ ds_version = state_dict.get(DS_VERSION, None)
134
+
135
+ frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)
136
+
137
+ z_model_state = zero_model_state(buffers=buffers,
138
+ param_shapes=param_shapes,
139
+ shared_params=shared_params,
140
+ ds_version=ds_version,
141
+ frozen_param_shapes=frozen_param_shapes,
142
+ frozen_param_fragments=frozen_param_fragments)
143
+ zero_model_states.append(z_model_state)
144
+
145
+ return zero_model_states
146
+
147
+
148
+ def parse_optim_states(files, ds_checkpoint_dir):
149
+ total_files = len(files)
150
+ state_dicts = []
151
+ for f in tqdm(files, desc='Loading checkpoint shards'):
152
+ state_dict = torch.load(f, map_location=device, mmap=True, weights_only=False)
153
+ # immediately discard the potentially huge 2 optimizer states as we only care for fp32 master weights
154
+ # and also handle the case where it was already removed by another helper script
155
+ state_dict["optimizer_state_dict"].pop("optimizer_state_dict", None)
156
+ state_dicts.append(state_dict)
157
+
158
+ if not ZERO_STAGE in state_dicts[0][OPTIMIZER_STATE_DICT]:
159
+ raise ValueError(f"{files[0]} is not a zero checkpoint")
160
+ zero_stage = state_dicts[0][OPTIMIZER_STATE_DICT][ZERO_STAGE]
161
+ world_size = state_dicts[0][OPTIMIZER_STATE_DICT][PARTITION_COUNT]
162
+
163
+ # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
164
+ # parameters can be different from data parallelism for non-expert parameters. So we can just
165
+ # use the max of the partition_count to get the dp world_size.
166
+
167
+ if type(world_size) is list:
168
+ world_size = max(world_size)
169
+
170
+ if world_size != total_files:
171
+ raise ValueError(
172
+ f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
173
+ "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
174
+ )
175
+
176
+ # the groups are named differently in each stage
177
+ if zero_stage <= 2:
178
+ fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
179
+ elif zero_stage == 3:
180
+ fp32_groups_key = FP32_FLAT_GROUPS
181
+ else:
182
+ raise ValueError(f"unknown zero stage {zero_stage}")
183
+
184
+ fp32_flat_groups = [state_dicts[i][OPTIMIZER_STATE_DICT][fp32_groups_key] for i in range(len(state_dicts))]
185
+ return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
+ def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
189
+ """
190
+ Returns fp32 state_dict reconstructed from ds checkpoint
191
+
192
+ Args:
193
+ - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
194
+
195
+ """
196
+ print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")
197
+
198
+ optim_files = get_optim_files(ds_checkpoint_dir)
199
+ zero_stage, world_size, fp32_flat_groups = parse_optim_states(optim_files, ds_checkpoint_dir)
200
+ print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")
201
+
202
+ model_files = get_model_state_files(ds_checkpoint_dir)
203
+
204
+ zero_model_states = parse_model_states(model_files)
205
+ print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')
206
+
207
+ if zero_stage <= 2:
208
+ return _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
209
+ exclude_frozen_parameters)
210
+ elif zero_stage == 3:
211
+ return _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
212
+ exclude_frozen_parameters)
213
+
214
+
215
+ def _zero2_merge_frozen_params(state_dict, zero_model_states):
216
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
217
+ return
218
+
219
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
220
+ frozen_param_fragments = zero_model_states[0].frozen_param_fragments
221
+
222
+ if debug:
223
+ num_elem = sum(s.numel() for s in frozen_param_shapes.values())
224
+ print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
225
+
226
+ wanted_params = len(frozen_param_shapes)
227
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
228
+ avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
229
+ print(f'Frozen params: Have {avail_numel} numels to process.')
230
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
231
+
232
+ total_params = 0
233
+ total_numel = 0
234
+ for name, shape in frozen_param_shapes.items():
235
+ total_params += 1
236
+ unpartitioned_numel = shape.numel()
237
+ total_numel += unpartitioned_numel
238
+
239
+ state_dict[name] = frozen_param_fragments[name]
240
+
241
+ if debug:
242
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
243
+
244
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
+ def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
253
+ param_shapes = zero_model_states[0].param_shapes
254
+
255
+ # Reconstruction protocol:
256
+ #
257
+ # Each rank saved its own flat partition of every param group's fp32 master
+ # weights. Concatenating the per-rank partitions of a group (in rank order)
+ # rebuilds that group's full flat vector; each param is then carved out of it
+ # with narrow() at the offsets implied by param_shapes, allowing for the
+ # trailing 2*world_size alignment padding handled further below.
258
+
259
+ if debug:
260
+ for i in range(world_size):
261
+ for j in range(len(fp32_flat_groups[0])):
262
+ print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={fp32_flat_groups[i][j].shape}")
263
+
264
+ # XXX: memory usage doubles here (zero2)
265
+ num_param_groups = len(fp32_flat_groups[0])
266
+ merged_single_partition_of_fp32_groups = []
267
+ for i in range(num_param_groups):
268
+ merged_partitions = [sd[i] for sd in fp32_flat_groups]
269
+ full_single_fp32_vector = torch.cat(merged_partitions, 0)
270
+ merged_single_partition_of_fp32_groups.append(full_single_fp32_vector)
271
+ avail_numel = sum(
272
+ [full_single_fp32_vector.numel() for full_single_fp32_vector in merged_single_partition_of_fp32_groups])
273
+
274
+ if debug:
275
+ wanted_params = sum([len(shapes) for shapes in param_shapes])
276
+ wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
277
+ # not asserting if there is a mismatch due to possible padding
278
+ print(f"Have {avail_numel} numels to process.")
279
+ print(f"Need {wanted_numel} numels in {wanted_params} params.")
280
+
281
+ # params
282
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
283
+ # out-of-core computing solution
284
+ total_numel = 0
285
+ total_params = 0
286
+ for shapes, full_single_fp32_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
287
+ offset = 0
288
+ avail_numel = full_single_fp32_vector.numel()
289
+ for name, shape in shapes.items():
290
+
291
+ unpartitioned_numel = shape.numel() if _has_callable(shape, 'numel') else math.prod(shape)
292
+ total_numel += unpartitioned_numel
293
+ total_params += 1
294
+
295
+ if debug:
296
+ print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
297
+ state_dict[name] = full_single_fp32_vector.narrow(0, offset, unpartitioned_numel).view(shape)
298
+ offset += unpartitioned_numel
299
+
300
+ # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
301
+ # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
302
+ # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
303
+ # live optimizer object, so we are checking that the numbers are within the right range
304
+ align_to = 2 * world_size
305
+
306
+ def zero2_align(x):
307
+ return align_to * math.ceil(x / align_to)
308
+
309
+ if debug:
310
+ print(f"original offset={offset}, avail_numel={avail_numel}")
311
+
312
+ offset = zero2_align(offset)
313
+ avail_numel = zero2_align(avail_numel)
314
+
315
+ if debug:
316
+ print(f"aligned offset={offset}, avail_numel={avail_numel}")
317
+
318
+ # Sanity check
319
+ if offset != avail_numel:
320
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
321
+
322
+ print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
+ def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
326
+ exclude_frozen_parameters):
327
+ state_dict = OrderedDict()
328
+
329
+ # buffers
330
+ buffers = zero_model_states[0].buffers
331
+ state_dict.update(buffers)
332
+ if debug:
333
+ print(f"added {len(buffers)} buffers")
334
+
335
+ if not exclude_frozen_parameters:
336
+ _zero2_merge_frozen_params(state_dict, zero_model_states)
337
+
338
+ _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
339
+
340
+ # recover shared parameters
341
+ for pair in zero_model_states[0].shared_params:
342
+ if pair[1] in state_dict:
343
+ state_dict[pair[0]] = state_dict[pair[1]]
344
+
345
+ return state_dict
346
+
347
+
348
+ def zero3_partitioned_param_info(unpartitioned_numel, world_size):
349
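+ # ZeRO-3 pads each param so it splits evenly across ranks: every rank holds
+ # ceil(numel / world_size) elements, with `padding_numel` padding elements in total.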
+ remainder = unpartitioned_numel % world_size
350
+ padding_numel = (world_size - remainder) if remainder else 0
351
+ partitioned_numel = math.ceil(unpartitioned_numel / world_size)
352
+ return partitioned_numel, padding_numel
353
+
354
+
355
+ def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
356
+ if zero_model_states[0].frozen_param_shapes is None or len(zero_model_states[0].frozen_param_shapes) == 0:
357
+ return
358
+
359
+ if debug:
360
+ for i in range(world_size):
361
+ num_elem = sum(s.numel() for s in zero_model_states[i].frozen_param_fragments.values())
362
+ print(f'rank {i}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')
363
+
364
+ frozen_param_shapes = zero_model_states[0].frozen_param_shapes
365
+ wanted_params = len(frozen_param_shapes)
366
+ wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
367
+ avail_numel = sum([p.numel() for p in zero_model_states[0].frozen_param_fragments.values()]) * world_size
368
+ print(f'Frozen params: Have {avail_numel} numels to process.')
369
+ print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')
370
+
371
+ total_params = 0
372
+ total_numel = 0
373
+ for name, shape in zero_model_states[0].frozen_param_shapes.items():
374
+ total_params += 1
375
+ unpartitioned_numel = shape.numel()
376
+ total_numel += unpartitioned_numel
377
+
378
+ param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
379
+ state_dict[name] = torch.cat(param_frags, 0).narrow(0, 0, unpartitioned_numel).view(shape)
380
+
381
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
382
+
383
+ if debug:
384
+ print(
385
+ f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
386
+ )
387
+
388
+ print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
+ class GatheredTensor:
392
+ """
393
+ A pseudo tensor that collects partitioned weights.
394
+ It is more memory efficient when there are multiple groups.
395
+ """
396
+
397
+ def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
398
+ self.flat_groups = flat_groups
399
+ self.flat_groups_offset = flat_groups_offset
400
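+ # flat_groups_offset is the prefix sum of per-group flat sizes, so a param's
+ # [offset, offset + partitioned_numel) range may straddle group boundaries.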
+ self.offset = offset
401
+ self.partitioned_numel = partitioned_numel
402
+ self.shape = shape
403
+ self.dtype = self.flat_groups[0][0].dtype
404
+
405
+ def contiguous(self):
406
+ """
407
+ Merge partitioned weights from flat_groups into a single tensor.
408
+ """
409
+ end_idx = self.offset + self.partitioned_numel
410
+ world_size = len(self.flat_groups)
411
+ pad_flat_param_chunks = []
412
+
413
+ for rank_i in range(world_size):
414
+ # for each rank, we need to collect weights from related group/groups
415
+ flat_groups_at_rank_i = self.flat_groups[rank_i]
416
+ start_group_id = None
417
+ end_group_id = None
418
+ for group_id in range(len(self.flat_groups_offset)):
419
+ if self.flat_groups_offset[group_id] <= self.offset < self.flat_groups_offset[group_id + 1]:
420
+ start_group_id = group_id
421
+ if self.flat_groups_offset[group_id] < end_idx <= self.flat_groups_offset[group_id + 1]:
422
+ end_group_id = group_id
423
+ break
424
+ # collect weights from related group/groups
425
+ for group_id in range(start_group_id, end_group_id + 1):
426
+ flat_tensor = flat_groups_at_rank_i[group_id]
427
+ start_offset = self.offset - self.flat_groups_offset[group_id]
428
+ end_offset = min(end_idx, self.flat_groups_offset[group_id + 1]) - self.flat_groups_offset[group_id]
429
+ pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])
430
+
431
+ # collect weights from all ranks
432
+ pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
433
+ param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
434
+ return param
435
+
436
+
437
+ def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
438
+ param_shapes = zero_model_states[0].param_shapes
439
+ avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size
440
+
441
+ # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
442
+ # param, re-consolidating each param, while dealing with padding if any
443
+
444
+ # merge list of dicts, preserving order
445
+ param_shapes = {k: v for d in param_shapes for k, v in d.items()}
446
+
447
+ if debug:
448
+ for i in range(world_size):
449
+ print(f"{FP32_FLAT_GROUPS}[{i}].shape={fp32_flat_groups[i].shape}")
450
+
451
+ wanted_params = len(param_shapes)
452
+ wanted_numel = sum(shape.numel() for shape in param_shapes.values())
453
+ # not asserting if there is a mismatch due to possible padding
454
+ avail_numel = fp32_flat_groups[0].numel() * world_size
455
+ print(f"Trainable params: Have {avail_numel} numels to process.")
456
+ print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")
457
+
458
+ # params
459
+ # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
460
+ # out-of-core computing solution
461
+ offset = 0
462
+ total_numel = 0
463
+ total_params = 0
464
+ flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
465
+ for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
466
+ unpartitioned_numel = shape.numel()
467
+ total_numel += unpartitioned_numel
468
+ total_params += 1
469
+ partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)
470
+
471
+ if debug:
472
+ print(
473
+ f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
474
+ )
475
+
476
+ # memory efficient tensor
477
+ tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
478
+ state_dict[name] = tensor
479
+ offset += partitioned_numel
480
+
481
+ offset *= world_size
482
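+ # `offset` accumulated per-rank partitioned numels, so scale it by world_size
+ # before comparing against the total numel available across all ranks.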
+
483
+ # Sanity check
484
+ if offset != avail_numel:
485
+ raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")
486
+
487
+ print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
+
489
+
490
+ def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
491
+ exclude_frozen_parameters):
492
+ state_dict = OrderedDict()
493
+
494
+ # buffers
495
+ buffers = zero_model_states[0].buffers
496
+ state_dict.update(buffers)
497
+ if debug:
498
+ print(f"added {len(buffers)} buffers")
499
+
500
+ if not exclude_frozen_parameters:
501
+ _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)
502
+
503
+ _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)
504
+
505
+ # recover shared parameters
506
+ for pair in zero_model_states[0].shared_params:
507
+ if pair[1] in state_dict:
508
+ state_dict[pair[0]] = state_dict[pair[1]]
509
+
510
+ return state_dict
511
+
512
+
513
+ def to_torch_tensor(state_dict, return_empty_tensor=False):
514
+ """
515
+ Convert state_dict of GatheredTensor to torch tensor
516
+ """
517
+ torch_state_dict = {}
518
+ converted_tensors = {}
519
+ for name, tensor in state_dict.items():
520
+ tensor_id = id(tensor)
521
+ if tensor_id in converted_tensors: # shared tensors
522
+ shared_tensor = torch_state_dict[converted_tensors[tensor_id]]
523
+ torch_state_dict[name] = shared_tensor
524
+ else:
525
+ converted_tensors[tensor_id] = name
526
+ if return_empty_tensor:
527
+ torch_state_dict[name] = torch.empty(tensor.shape, dtype=tensor.dtype)
528
+ else:
529
+ torch_state_dict[name] = tensor.contiguous()
530
+ return torch_state_dict
531
+
532
+
533
+ def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
534
+ tag=None,
535
+ exclude_frozen_parameters=False,
536
+ lazy_mode=False):
537
+ """
538
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
539
+ ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
540
+ via a model hub.
541
+
542
+ Args:
543
+ - ``checkpoint_dir``: path to the desired checkpoint folder
544
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
545
+ - ``exclude_frozen_parameters``: exclude frozen parameters
546
+ - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
547
+ Convert a pseudo tensor to a torch tensor by calling ``.contiguous()`` on it.
548
+
549
+ Returns:
550
+ - pytorch ``state_dict``
551
+
552
+ A typical usage might be ::
553
+
554
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
555
+ # do the training and checkpoint saving
556
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
557
+ model = model.cpu() # move to cpu
558
+ model.load_state_dict(state_dict)
559
+ # submit to model hub or save the model to share with others
560
+
561
+ In this example the ``model`` will no longer be usable in the deepspeed context of the same
562
+ application. i.e. you will need to re-initialize the deepspeed engine, since
563
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
564
+
565
+ If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.
566
+
567
+ Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
568
+ You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
569
+ the checkpoint. Or you can load state_dict in lazy mode ::
570
+
571
+ from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
572
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
573
+ for name, lazy_tensor in state_dict.items():
574
+ tensor = lazy_tensor.contiguous() # to cpu
575
+ print(name, tensor)
576
+ # del the tensor to release memory once it is no longer in use
577
+ """
578
+ if tag is None:
579
+ latest_path = os.path.join(checkpoint_dir, 'latest')
580
+ if os.path.isfile(latest_path):
581
+ with open(latest_path, 'r') as fd:
582
+ tag = fd.read().strip()
583
+ else:
584
+ raise ValueError(f"Unable to find 'latest' file at {latest_path}")
585
+
586
+ ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
587
+
588
+ if not os.path.isdir(ds_checkpoint_dir):
589
+ raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")
590
+
591
+ state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
592
+ if lazy_mode:
593
+ return state_dict
594
+ else:
595
+ return to_torch_tensor(state_dict)
596
+
597
+
598
+ def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
599
+ output_dir,
600
+ max_shard_size="5GB",
601
+ safe_serialization=False,
602
+ tag=None,
603
+ exclude_frozen_parameters=False):
604
+ """
605
+ Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict`` file that can be
606
+ loaded with ``torch.load(file)`` + ``load_state_dict()`` and used for training without DeepSpeed.
607
+
608
+ Args:
609
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
610
+ - ``output_dir``: directory to the pytorch fp32 state_dict output files
611
+ - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB
612
+ - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
613
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
614
+ - ``exclude_frozen_parameters``: exclude frozen parameters
615
+ """
616
+
617
+ # Dependency pre-check
618
+ if safe_serialization:
619
+ try:
620
+ from safetensors.torch import save_file
621
+ except ImportError:
622
+ print('If you want to use `safe_serialization`, please `pip install safetensors`')
623
+ raise
624
+ if max_shard_size is not None:
625
+ try:
626
+ from huggingface_hub import split_torch_state_dict_into_shards
627
+ except ImportError:
628
+ print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
629
+ raise
630
+
631
+ # Convert zero checkpoint to state_dict
632
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
633
+ tag,
634
+ exclude_frozen_parameters,
635
+ lazy_mode=True)
636
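+ # For ZeRO-3, lazy_mode=True returns GatheredTensor placeholders; the real tensors
+ # are only materialized one shard at a time in the save loop below, bounding peak CPU memory.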
+
637
+ # Shard the model if it is too big.
638
+ weights_name = "model.safetensors" if safe_serialization else "pytorch_model.bin"
639
+ if max_shard_size is not None:
640
+ filename_pattern = weights_name.replace(".bin", "{suffix}.bin").replace(".safetensors", "{suffix}.safetensors")
641
+ # a memory-efficient approach for sharding
642
+ empty_state_dict = to_torch_tensor(state_dict, return_empty_tensor=True)
643
+ state_dict_split = split_torch_state_dict_into_shards(empty_state_dict,
644
+ filename_pattern=filename_pattern,
645
+ max_shard_size=max_shard_size)
646
+ else:
647
+ from collections import namedtuple
648
+ StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
649
+ state_dict_split = StateDictSplit(is_sharded=False,
650
+ filename_to_tensors={weights_name: list(state_dict.keys())})
651
+
652
+ # Save the model by shard
653
+ os.makedirs(output_dir, exist_ok=True)
654
+ filename_to_tensors = state_dict_split.filename_to_tensors.items()
655
+ for shard_file, tensors in tqdm(filename_to_tensors, desc="Saving checkpoint shards"):
656
+ shard_state_dict = {tensor_name: state_dict[tensor_name] for tensor_name in tensors}
657
+ shard_state_dict = to_torch_tensor(shard_state_dict)
658
+ output_path = os.path.join(output_dir, shard_file)
659
+ if safe_serialization:
660
+ save_file(shard_state_dict, output_path, metadata={"format": "pt"})
661
+ else:
662
+ torch.save(shard_state_dict, output_path)
663
+ # release the memory of current shard
664
+ for tensor_name in list(shard_state_dict.keys()):
665
+ del state_dict[tensor_name]
666
+ del shard_state_dict[tensor_name]
667
+ del shard_state_dict
668
+ gc.collect()
669
+
670
+ # Save index if sharded
671
+ if state_dict_split.is_sharded:
672
+ index = {
673
+ "metadata": state_dict_split.metadata,
674
+ "weight_map": state_dict_split.tensor_to_filename,
675
+ }
676
+ save_index_file = "model.safetensors.index.json" if safe_serialization else "pytorch_model.bin.index.json"
677
+ save_index_file = os.path.join(output_dir, save_index_file)
678
+ with open(save_index_file, "w", encoding="utf-8") as f:
679
+ content = json.dumps(index, indent=2, sort_keys=True) + "\n"
680
+ f.write(content)
681
+
682
+
683
+ def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
684
+ """
685
+ 1. Put the provided model to cpu
686
+ 2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
687
+ 3. Load it into the provided model
688
+
689
+ Args:
690
+ - ``model``: the model object to update
691
+ - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
692
+ - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
693
+
694
+ Returns:
695
+ - ``model``: the modified model
696
+
697
+ Make sure you have plenty of CPU memory available before you call this function. If you don't
698
+ have enough, use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
699
+ conveniently placed for you in the checkpoint folder.
700
+
701
+ A typical usage might be ::
702
+
703
+ from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
704
+ model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
705
+ # submit to model hub or save the model to share with others
706
+
707
+ Note that once this has been run, the ``model`` will no longer be usable in the deepspeed context
708
+ of the same application. i.e. you will need to re-initialize the deepspeed engine, since
709
+ ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.
710
+
711
+ """
712
+ logger.info(f"Extracting fp32 weights")
713
+ state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)
714
+
715
+ logger.info(f"Overwriting model with fp32 weights")
716
+ model = model.cpu()
717
+ model.load_state_dict(state_dict, strict=False)
718
+
719
+ return model
720
+
721
+
722
+ if __name__ == "__main__":
723
+ parser = argparse.ArgumentParser()
724
+ parser.add_argument("checkpoint_dir",
725
+ type=str,
726
+ help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
727
+ parser.add_argument("output_dir",
728
+ type=str,
729
+ help="directory to the pytorch fp32 state_dict output files"
730
+ "(e.g. path/checkpoint-12-output/)")
731
+ parser.add_argument(
732
+ "--max_shard_size",
733
+ type=str,
734
+ default="5GB",
735
+ help="The maximum size for a checkpoint before being sharded. Each checkpoint shard will then be of a size "
736
+ "lower than this. If expressed as a string, it needs to be digits followed by a unit (like `5MB`). "
737
+ "We default to 5GB so that models can run easily on free-tier Google Colab instances "
738
+ "without CPU OOM issues.")
739
+ parser.add_argument(
740
+ "--safe_serialization",
741
+ default=False,
742
+ action='store_true',
743
+ help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
744
+ parser.add_argument("-t",
745
+ "--tag",
746
+ type=str,
747
+ default=None,
748
+ help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
749
+ parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
750
+ parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
751
+ args = parser.parse_args()
752
+
753
+ debug = args.debug
754
+
755
+ convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
756
+ args.output_dir,
757
+ max_shard_size=args.max_shard_size,
758
+ safe_serialization=args.safe_serialization,
759
+ tag=args.tag,
760
+ exclude_frozen_parameters=args.exclude_frozen_parameters)