HappyCorpse commited on
Commit
f3a3b78
·
verified ·
1 Parent(s): 4fa96f0

Upload folder using huggingface_hub

Browse files
Files changed (46) hide show
  1. .gitattributes +1 -0
  2. added_tokens.json +24 -0
  3. chat_template.jinja +7 -0
  4. config.json +135 -0
  5. generation_config.json +12 -0
  6. global_step937/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt +3 -0
  7. global_step937/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt +3 -0
  8. global_step937/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt +3 -0
  9. global_step937/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt +3 -0
  10. global_step937/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt +3 -0
  11. global_step937/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt +3 -0
  12. global_step937/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt +3 -0
  13. global_step937/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt +3 -0
  14. global_step937/zero_pp_rank_0_mp_rank_00_model_states.pt +3 -0
  15. global_step937/zero_pp_rank_1_mp_rank_00_model_states.pt +3 -0
  16. global_step937/zero_pp_rank_2_mp_rank_00_model_states.pt +3 -0
  17. global_step937/zero_pp_rank_3_mp_rank_00_model_states.pt +3 -0
  18. global_step937/zero_pp_rank_4_mp_rank_00_model_states.pt +3 -0
  19. global_step937/zero_pp_rank_5_mp_rank_00_model_states.pt +3 -0
  20. global_step937/zero_pp_rank_6_mp_rank_00_model_states.pt +3 -0
  21. global_step937/zero_pp_rank_7_mp_rank_00_model_states.pt +3 -0
  22. latest +1 -0
  23. merges.txt +0 -0
  24. model-00001-of-00004.safetensors +3 -0
  25. model-00002-of-00004.safetensors +3 -0
  26. model-00003-of-00004.safetensors +3 -0
  27. model-00004-of-00004.safetensors +3 -0
  28. model.safetensors.index.json +737 -0
  29. preprocessor_config.json +37 -0
  30. rng_state_0.pth +3 -0
  31. rng_state_1.pth +3 -0
  32. rng_state_2.pth +3 -0
  33. rng_state_3.pth +3 -0
  34. rng_state_4.pth +3 -0
  35. rng_state_5.pth +3 -0
  36. rng_state_6.pth +3 -0
  37. rng_state_7.pth +3 -0
  38. scheduler.pt +3 -0
  39. special_tokens_map.json +31 -0
  40. tokenizer.json +3 -0
  41. tokenizer_config.json +209 -0
  42. trainer_state.json +1429 -0
  43. training_args.bin +3 -0
  44. video_preprocessor_config.json +43 -0
  45. vocab.json +0 -0
  46. zero_to_fp32.py +760 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
chat_template.jinja ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
2
+ You are a helpful assistant.<|im_end|>
3
+ {% endif %}<|im_start|>{{ message['role'] }}
4
+ {% if message['content'] is string %}{{ message['content'] }}<|im_end|>
5
+ {% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
6
+ {% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
7
+ {% endif %}
config.json ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2_5_VLForConditionalGeneration"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "eos_token_id": 151645,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 3584,
10
+ "image_token_id": 151655,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 18944,
13
+ "max_position_embeddings": 128000,
14
+ "max_window_layers": 28,
15
+ "model_type": "qwen2_5_vl",
16
+ "num_attention_heads": 28,
17
+ "num_hidden_layers": 28,
18
+ "num_key_value_heads": 4,
19
+ "rms_norm_eps": 1e-06,
20
+ "rope_scaling": {
21
+ "mrope_section": [
22
+ 16,
23
+ 24,
24
+ 24
25
+ ],
26
+ "rope_type": "default",
27
+ "type": "default"
28
+ },
29
+ "rope_theta": 1000000.0,
30
+ "sliding_window": 32768,
31
+ "text_config": {
32
+ "architectures": [
33
+ "Qwen2_5_VLForConditionalGeneration"
34
+ ],
35
+ "attention_dropout": 0.0,
36
+ "bos_token_id": 151643,
37
+ "eos_token_id": 151645,
38
+ "hidden_act": "silu",
39
+ "hidden_size": 3584,
40
+ "image_token_id": null,
41
+ "initializer_range": 0.02,
42
+ "intermediate_size": 18944,
43
+ "layer_types": [
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention",
64
+ "full_attention",
65
+ "full_attention",
66
+ "full_attention",
67
+ "full_attention",
68
+ "full_attention",
69
+ "full_attention",
70
+ "full_attention",
71
+ "full_attention"
72
+ ],
73
+ "max_position_embeddings": 128000,
74
+ "max_window_layers": 28,
75
+ "model_type": "qwen2_5_vl_text",
76
+ "num_attention_heads": 28,
77
+ "num_hidden_layers": 28,
78
+ "num_key_value_heads": 4,
79
+ "rms_norm_eps": 1e-06,
80
+ "rope_scaling": {
81
+ "mrope_section": [
82
+ 16,
83
+ 24,
84
+ 24
85
+ ],
86
+ "rope_type": "default",
87
+ "type": "default"
88
+ },
89
+ "rope_theta": 1000000.0,
90
+ "sliding_window": null,
91
+ "torch_dtype": "float32",
92
+ "use_cache": false,
93
+ "use_sliding_window": false,
94
+ "video_token_id": null,
95
+ "vision_end_token_id": 151653,
96
+ "vision_start_token_id": 151652,
97
+ "vision_token_id": 151654,
98
+ "vocab_size": 152064
99
+ },
100
+ "tie_word_embeddings": false,
101
+ "torch_dtype": "bfloat16",
102
+ "transformers_version": "4.55.0",
103
+ "use_cache": false,
104
+ "use_sliding_window": false,
105
+ "video_token_id": 151656,
106
+ "vision_config": {
107
+ "depth": 32,
108
+ "fullatt_block_indexes": [
109
+ 7,
110
+ 15,
111
+ 23,
112
+ 31
113
+ ],
114
+ "hidden_act": "silu",
115
+ "hidden_size": 1280,
116
+ "in_channels": 3,
117
+ "in_chans": 3,
118
+ "initializer_range": 0.02,
119
+ "intermediate_size": 3420,
120
+ "model_type": "qwen2_5_vl",
121
+ "num_heads": 16,
122
+ "out_hidden_size": 3584,
123
+ "patch_size": 14,
124
+ "spatial_merge_size": 2,
125
+ "spatial_patch_size": 14,
126
+ "temporal_patch_size": 2,
127
+ "tokens_per_second": 2,
128
+ "torch_dtype": "float32",
129
+ "window_size": 112
130
+ },
131
+ "vision_end_token_id": 151653,
132
+ "vision_start_token_id": 151652,
133
+ "vision_token_id": 151654,
134
+ "vocab_size": 152064
135
+ }
generation_config.json ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.05,
10
+ "temperature": 1e-06,
11
+ "transformers_version": "4.55.0"
12
+ }
global_step937/bf16_zero_pp_rank_0_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:872572a993aab88b8e2ecdc60d6ef665c0cad364968d2f21cfa0608366abbd05
3
+ size 11423430497
global_step937/bf16_zero_pp_rank_1_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81b1956fa4c79690df4e64e3e9f5960d3f6f8e125314fe84cd2c2584f3a46c4d
3
+ size 11423430497
global_step937/bf16_zero_pp_rank_2_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42a3995f2fbf20339153870295c7f7f37f60f9974d5535d4ef4fec78c09d18e2
3
+ size 11423430497
global_step937/bf16_zero_pp_rank_3_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0a2d412ece77474bd8177d797c8d41d4ee7ae86f2eb3588221a967934a8cac6e
3
+ size 11423430497
global_step937/bf16_zero_pp_rank_4_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:401c2d17a34372c0051b393a620962b934a833b817586ad03e65a955b1ff45f8
3
+ size 11423430497
global_step937/bf16_zero_pp_rank_5_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2ecb2fcd8b69f3e40204420c666baffffcfa6a6626cdda32b97ecaff24c9da3c
3
+ size 11423430497
global_step937/bf16_zero_pp_rank_6_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cd0d3da9cf81565eb386885bd74b443b9bc9ef43026a2452dc4272a2bf5bc663
3
+ size 11423430497
global_step937/bf16_zero_pp_rank_7_mp_rank_00_optim_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7a179c5adc0863faffec99b32d1ec23d851d804518dc60af50bb8357441ec051
3
+ size 11423430497
global_step937/zero_pp_rank_0_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40df9f4bcc85930d945fa0678ff88ae1a9c69d4a157b8ebe16f505f6f83d3cb5
3
+ size 169627507
global_step937/zero_pp_rank_1_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab59817f4453589263aa5e7a01959517629a6a861a4b16cea28ff354a4d4c56a
3
+ size 169627507
global_step937/zero_pp_rank_2_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0ece99d0e4757353f0570d48534a2ed9a98eadfbd1ba55094374614177c8d2f9
3
+ size 169627507
global_step937/zero_pp_rank_3_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:570ae7bd9ff29d075bcda4b86de24c596b339fa69f2f0d9485f6de5bf1d965b0
3
+ size 169627507
global_step937/zero_pp_rank_4_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9224fee2222ca76d62aa1f75cb78079f9200d5e9e5e8dcd08174cc2b1bb303a4
3
+ size 169627507
global_step937/zero_pp_rank_5_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2150a4e8bec55181049d5d6affeb0a2e6fe1c6b62cfea616ebc1d6cce0501290
3
+ size 169627507
global_step937/zero_pp_rank_6_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88cbc3fd1c76cce48ca91d6ddf26fa1cddb6f77cfeea7807aea6e4d442a72078
3
+ size 169627507
global_step937/zero_pp_rank_7_mp_rank_00_model_states.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:417157661063ba254457db511ae24999b611f2e465f68c45b7f2fe362c29e035
3
+ size 169627507
latest ADDED
@@ -0,0 +1 @@
 
 
1
+ global_step937
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
model-00001-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eb9eaa6c375b0df0426e58437d792fd53fd06418dad292784dcc6bccb9c188d
3
+ size 4968243304
model-00002-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:59657b4548cc7d16af3798f32fab6464d7022cc7e790098af37132543bfb7b1e
3
+ size 4991495816
model-00003-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e42eed8a45b1c3a3bbaf4732b98ae267c528e7b321ae085dfd9068546e4f3943
3
+ size 4932751040
model-00004-of-00004.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ab3356ac68d601e410d185e2b78d2ac12f2259aa7f6d7eb25b9027e3ec610f01
3
+ size 1691924384
model.safetensors.index.json ADDED
@@ -0,0 +1,737 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "total_parameters": 848896,
4
+ "total_size": 16584333312
5
+ },
6
+ "weight_map": {
7
+ "lm_head.weight": "model-00004-of-00004.safetensors",
8
+ "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
9
+ "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
10
+ "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
11
+ "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
12
+ "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
13
+ "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
14
+ "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
15
+ "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
16
+ "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
17
+ "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
18
+ "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
19
+ "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
20
+ "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
21
+ "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
22
+ "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
23
+ "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
24
+ "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
25
+ "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
26
+ "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
27
+ "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
28
+ "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
29
+ "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
30
+ "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
31
+ "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
32
+ "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
33
+ "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
34
+ "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
35
+ "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
36
+ "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
37
+ "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
38
+ "model.layers.10.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
39
+ "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
40
+ "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
41
+ "model.layers.10.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
42
+ "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
43
+ "model.layers.10.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
44
+ "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
45
+ "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
46
+ "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
47
+ "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
48
+ "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
49
+ "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
50
+ "model.layers.11.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
51
+ "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
52
+ "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
53
+ "model.layers.11.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
54
+ "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
55
+ "model.layers.11.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
56
+ "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
57
+ "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
58
+ "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
59
+ "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
60
+ "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
61
+ "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
62
+ "model.layers.12.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
63
+ "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
64
+ "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
65
+ "model.layers.12.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
66
+ "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
67
+ "model.layers.12.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
68
+ "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
69
+ "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
70
+ "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
71
+ "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
72
+ "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
73
+ "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
74
+ "model.layers.13.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
75
+ "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
76
+ "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
77
+ "model.layers.13.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
78
+ "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
79
+ "model.layers.13.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
80
+ "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
81
+ "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
82
+ "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
83
+ "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
84
+ "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
85
+ "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
86
+ "model.layers.14.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
87
+ "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
88
+ "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
89
+ "model.layers.14.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
90
+ "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
91
+ "model.layers.14.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
92
+ "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
93
+ "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
94
+ "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
95
+ "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
96
+ "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
97
+ "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
98
+ "model.layers.15.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
99
+ "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
100
+ "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
101
+ "model.layers.15.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
102
+ "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
103
+ "model.layers.15.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
104
+ "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
105
+ "model.layers.16.input_layernorm.weight": "model-00003-of-00004.safetensors",
106
+ "model.layers.16.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
107
+ "model.layers.16.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
108
+ "model.layers.16.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
109
+ "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
110
+ "model.layers.16.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
111
+ "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
112
+ "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
113
+ "model.layers.16.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
114
+ "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
115
+ "model.layers.16.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
116
+ "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
117
+ "model.layers.17.input_layernorm.weight": "model-00003-of-00004.safetensors",
118
+ "model.layers.17.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
119
+ "model.layers.17.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
120
+ "model.layers.17.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
121
+ "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
122
+ "model.layers.17.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
123
+ "model.layers.17.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
124
+ "model.layers.17.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
125
+ "model.layers.17.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
126
+ "model.layers.17.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
127
+ "model.layers.17.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
128
+ "model.layers.17.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
129
+ "model.layers.18.input_layernorm.weight": "model-00003-of-00004.safetensors",
130
+ "model.layers.18.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
131
+ "model.layers.18.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
132
+ "model.layers.18.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
133
+ "model.layers.18.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
134
+ "model.layers.18.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
135
+ "model.layers.18.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
136
+ "model.layers.18.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
137
+ "model.layers.18.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
138
+ "model.layers.18.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
139
+ "model.layers.18.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
140
+ "model.layers.18.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
141
+ "model.layers.19.input_layernorm.weight": "model-00003-of-00004.safetensors",
142
+ "model.layers.19.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
143
+ "model.layers.19.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
144
+ "model.layers.19.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
145
+ "model.layers.19.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
146
+ "model.layers.19.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
147
+ "model.layers.19.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
148
+ "model.layers.19.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
149
+ "model.layers.19.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
150
+ "model.layers.19.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
151
+ "model.layers.19.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
152
+ "model.layers.19.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
153
+ "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
154
+ "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
155
+ "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
156
+ "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
157
+ "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
158
+ "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
159
+ "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
160
+ "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
161
+ "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
162
+ "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
163
+ "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
164
+ "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
165
+ "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
166
+ "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
167
+ "model.layers.20.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
168
+ "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
169
+ "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
170
+ "model.layers.20.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
171
+ "model.layers.20.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
172
+ "model.layers.20.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
173
+ "model.layers.20.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
174
+ "model.layers.20.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
175
+ "model.layers.20.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
176
+ "model.layers.20.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
177
+ "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
178
+ "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
179
+ "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
180
+ "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
181
+ "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
182
+ "model.layers.21.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
183
+ "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
184
+ "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
185
+ "model.layers.21.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
186
+ "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
187
+ "model.layers.21.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
188
+ "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
189
+ "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
190
+ "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
191
+ "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
192
+ "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
193
+ "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
194
+ "model.layers.22.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
195
+ "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
196
+ "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
197
+ "model.layers.22.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
198
+ "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
199
+ "model.layers.22.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
200
+ "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
201
+ "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
202
+ "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
203
+ "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
204
+ "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
205
+ "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
206
+ "model.layers.23.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
207
+ "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
208
+ "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
209
+ "model.layers.23.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
210
+ "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
211
+ "model.layers.23.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
212
+ "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
213
+ "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
214
+ "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
215
+ "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
216
+ "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
217
+ "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
218
+ "model.layers.24.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
219
+ "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
220
+ "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
221
+ "model.layers.24.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
222
+ "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
223
+ "model.layers.24.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
224
+ "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
225
+ "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
226
+ "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
227
+ "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
228
+ "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
229
+ "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
230
+ "model.layers.25.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
231
+ "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
232
+ "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
233
+ "model.layers.25.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
234
+ "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
235
+ "model.layers.25.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
236
+ "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
237
+ "model.layers.26.input_layernorm.weight": "model-00004-of-00004.safetensors",
238
+ "model.layers.26.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
239
+ "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
240
+ "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
241
+ "model.layers.26.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
242
+ "model.layers.26.self_attn.k_proj.bias": "model-00003-of-00004.safetensors",
243
+ "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
244
+ "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
245
+ "model.layers.26.self_attn.q_proj.bias": "model-00003-of-00004.safetensors",
246
+ "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
247
+ "model.layers.26.self_attn.v_proj.bias": "model-00003-of-00004.safetensors",
248
+ "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
249
+ "model.layers.27.input_layernorm.weight": "model-00004-of-00004.safetensors",
250
+ "model.layers.27.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
251
+ "model.layers.27.mlp.gate_proj.weight": "model-00004-of-00004.safetensors",
252
+ "model.layers.27.mlp.up_proj.weight": "model-00004-of-00004.safetensors",
253
+ "model.layers.27.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
254
+ "model.layers.27.self_attn.k_proj.bias": "model-00004-of-00004.safetensors",
255
+ "model.layers.27.self_attn.k_proj.weight": "model-00004-of-00004.safetensors",
256
+ "model.layers.27.self_attn.o_proj.weight": "model-00004-of-00004.safetensors",
257
+ "model.layers.27.self_attn.q_proj.bias": "model-00004-of-00004.safetensors",
258
+ "model.layers.27.self_attn.q_proj.weight": "model-00004-of-00004.safetensors",
259
+ "model.layers.27.self_attn.v_proj.bias": "model-00004-of-00004.safetensors",
260
+ "model.layers.27.self_attn.v_proj.weight": "model-00004-of-00004.safetensors",
261
+ "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
262
+ "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
263
+ "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
264
+ "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
265
+ "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
266
+ "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
267
+ "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
268
+ "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
269
+ "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
270
+ "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
271
+ "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
272
+ "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
273
+ "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
274
+ "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
275
+ "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
276
+ "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
277
+ "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
278
+ "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
279
+ "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
280
+ "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
281
+ "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
282
+ "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
283
+ "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
284
+ "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
285
+ "model.layers.5.input_layernorm.weight": "model-00002-of-00004.safetensors",
286
+ "model.layers.5.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
287
+ "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
288
+ "model.layers.5.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
289
+ "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
290
+ "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00004.safetensors",
291
+ "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
292
+ "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
293
+ "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00004.safetensors",
294
+ "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
295
+ "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00004.safetensors",
296
+ "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
297
+ "model.layers.6.input_layernorm.weight": "model-00002-of-00004.safetensors",
298
+ "model.layers.6.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
299
+ "model.layers.6.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
300
+ "model.layers.6.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
301
+ "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
302
+ "model.layers.6.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
303
+ "model.layers.6.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
304
+ "model.layers.6.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
305
+ "model.layers.6.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
306
+ "model.layers.6.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
307
+ "model.layers.6.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
308
+ "model.layers.6.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
309
+ "model.layers.7.input_layernorm.weight": "model-00002-of-00004.safetensors",
310
+ "model.layers.7.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
311
+ "model.layers.7.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
312
+ "model.layers.7.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
313
+ "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
314
+ "model.layers.7.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
315
+ "model.layers.7.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
316
+ "model.layers.7.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
317
+ "model.layers.7.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
318
+ "model.layers.7.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
319
+ "model.layers.7.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
320
+ "model.layers.7.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
321
+ "model.layers.8.input_layernorm.weight": "model-00002-of-00004.safetensors",
322
+ "model.layers.8.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
323
+ "model.layers.8.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
324
+ "model.layers.8.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
325
+ "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
326
+ "model.layers.8.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
327
+ "model.layers.8.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
328
+ "model.layers.8.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
329
+ "model.layers.8.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
330
+ "model.layers.8.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
331
+ "model.layers.8.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
332
+ "model.layers.8.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
333
+ "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
334
+ "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
335
+ "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
336
+ "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
337
+ "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
338
+ "model.layers.9.self_attn.k_proj.bias": "model-00002-of-00004.safetensors",
339
+ "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
340
+ "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
341
+ "model.layers.9.self_attn.q_proj.bias": "model-00002-of-00004.safetensors",
342
+ "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
343
+ "model.layers.9.self_attn.v_proj.bias": "model-00002-of-00004.safetensors",
344
+ "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
345
+ "model.norm.weight": "model-00004-of-00004.safetensors",
346
+ "visual.blocks.0.attn.proj.bias": "model-00001-of-00004.safetensors",
347
+ "visual.blocks.0.attn.proj.weight": "model-00001-of-00004.safetensors",
348
+ "visual.blocks.0.attn.qkv.bias": "model-00001-of-00004.safetensors",
349
+ "visual.blocks.0.attn.qkv.weight": "model-00001-of-00004.safetensors",
350
+ "visual.blocks.0.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
351
+ "visual.blocks.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
352
+ "visual.blocks.0.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
353
+ "visual.blocks.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
354
+ "visual.blocks.0.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
355
+ "visual.blocks.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
356
+ "visual.blocks.0.norm1.weight": "model-00001-of-00004.safetensors",
357
+ "visual.blocks.0.norm2.weight": "model-00001-of-00004.safetensors",
358
+ "visual.blocks.1.attn.proj.bias": "model-00001-of-00004.safetensors",
359
+ "visual.blocks.1.attn.proj.weight": "model-00001-of-00004.safetensors",
360
+ "visual.blocks.1.attn.qkv.bias": "model-00001-of-00004.safetensors",
361
+ "visual.blocks.1.attn.qkv.weight": "model-00001-of-00004.safetensors",
362
+ "visual.blocks.1.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
363
+ "visual.blocks.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
364
+ "visual.blocks.1.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
365
+ "visual.blocks.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
366
+ "visual.blocks.1.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
367
+ "visual.blocks.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
368
+ "visual.blocks.1.norm1.weight": "model-00001-of-00004.safetensors",
369
+ "visual.blocks.1.norm2.weight": "model-00001-of-00004.safetensors",
370
+ "visual.blocks.10.attn.proj.bias": "model-00001-of-00004.safetensors",
371
+ "visual.blocks.10.attn.proj.weight": "model-00001-of-00004.safetensors",
372
+ "visual.blocks.10.attn.qkv.bias": "model-00001-of-00004.safetensors",
373
+ "visual.blocks.10.attn.qkv.weight": "model-00001-of-00004.safetensors",
374
+ "visual.blocks.10.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
375
+ "visual.blocks.10.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
376
+ "visual.blocks.10.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
377
+ "visual.blocks.10.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
378
+ "visual.blocks.10.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
379
+ "visual.blocks.10.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
380
+ "visual.blocks.10.norm1.weight": "model-00001-of-00004.safetensors",
381
+ "visual.blocks.10.norm2.weight": "model-00001-of-00004.safetensors",
382
+ "visual.blocks.11.attn.proj.bias": "model-00001-of-00004.safetensors",
383
+ "visual.blocks.11.attn.proj.weight": "model-00001-of-00004.safetensors",
384
+ "visual.blocks.11.attn.qkv.bias": "model-00001-of-00004.safetensors",
385
+ "visual.blocks.11.attn.qkv.weight": "model-00001-of-00004.safetensors",
386
+ "visual.blocks.11.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
387
+ "visual.blocks.11.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
388
+ "visual.blocks.11.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
389
+ "visual.blocks.11.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
390
+ "visual.blocks.11.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
391
+ "visual.blocks.11.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
392
+ "visual.blocks.11.norm1.weight": "model-00001-of-00004.safetensors",
393
+ "visual.blocks.11.norm2.weight": "model-00001-of-00004.safetensors",
394
+ "visual.blocks.12.attn.proj.bias": "model-00001-of-00004.safetensors",
395
+ "visual.blocks.12.attn.proj.weight": "model-00001-of-00004.safetensors",
396
+ "visual.blocks.12.attn.qkv.bias": "model-00001-of-00004.safetensors",
397
+ "visual.blocks.12.attn.qkv.weight": "model-00001-of-00004.safetensors",
398
+ "visual.blocks.12.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
399
+ "visual.blocks.12.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
400
+ "visual.blocks.12.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
401
+ "visual.blocks.12.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
402
+ "visual.blocks.12.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
403
+ "visual.blocks.12.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
404
+ "visual.blocks.12.norm1.weight": "model-00001-of-00004.safetensors",
405
+ "visual.blocks.12.norm2.weight": "model-00001-of-00004.safetensors",
406
+ "visual.blocks.13.attn.proj.bias": "model-00001-of-00004.safetensors",
407
+ "visual.blocks.13.attn.proj.weight": "model-00001-of-00004.safetensors",
408
+ "visual.blocks.13.attn.qkv.bias": "model-00001-of-00004.safetensors",
409
+ "visual.blocks.13.attn.qkv.weight": "model-00001-of-00004.safetensors",
410
+ "visual.blocks.13.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
411
+ "visual.blocks.13.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
412
+ "visual.blocks.13.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
413
+ "visual.blocks.13.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
414
+ "visual.blocks.13.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
415
+ "visual.blocks.13.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
416
+ "visual.blocks.13.norm1.weight": "model-00001-of-00004.safetensors",
417
+ "visual.blocks.13.norm2.weight": "model-00001-of-00004.safetensors",
418
+ "visual.blocks.14.attn.proj.bias": "model-00001-of-00004.safetensors",
419
+ "visual.blocks.14.attn.proj.weight": "model-00001-of-00004.safetensors",
420
+ "visual.blocks.14.attn.qkv.bias": "model-00001-of-00004.safetensors",
421
+ "visual.blocks.14.attn.qkv.weight": "model-00001-of-00004.safetensors",
422
+ "visual.blocks.14.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
423
+ "visual.blocks.14.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
424
+ "visual.blocks.14.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
425
+ "visual.blocks.14.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
426
+ "visual.blocks.14.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
427
+ "visual.blocks.14.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
428
+ "visual.blocks.14.norm1.weight": "model-00001-of-00004.safetensors",
429
+ "visual.blocks.14.norm2.weight": "model-00001-of-00004.safetensors",
430
+ "visual.blocks.15.attn.proj.bias": "model-00001-of-00004.safetensors",
431
+ "visual.blocks.15.attn.proj.weight": "model-00001-of-00004.safetensors",
432
+ "visual.blocks.15.attn.qkv.bias": "model-00001-of-00004.safetensors",
433
+ "visual.blocks.15.attn.qkv.weight": "model-00001-of-00004.safetensors",
434
+ "visual.blocks.15.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
435
+ "visual.blocks.15.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
436
+ "visual.blocks.15.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
437
+ "visual.blocks.15.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
438
+ "visual.blocks.15.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
439
+ "visual.blocks.15.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
440
+ "visual.blocks.15.norm1.weight": "model-00001-of-00004.safetensors",
441
+ "visual.blocks.15.norm2.weight": "model-00001-of-00004.safetensors",
442
+ "visual.blocks.16.attn.proj.bias": "model-00001-of-00004.safetensors",
443
+ "visual.blocks.16.attn.proj.weight": "model-00001-of-00004.safetensors",
444
+ "visual.blocks.16.attn.qkv.bias": "model-00001-of-00004.safetensors",
445
+ "visual.blocks.16.attn.qkv.weight": "model-00001-of-00004.safetensors",
446
+ "visual.blocks.16.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
447
+ "visual.blocks.16.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
448
+ "visual.blocks.16.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
449
+ "visual.blocks.16.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
450
+ "visual.blocks.16.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
451
+ "visual.blocks.16.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
452
+ "visual.blocks.16.norm1.weight": "model-00001-of-00004.safetensors",
453
+ "visual.blocks.16.norm2.weight": "model-00001-of-00004.safetensors",
454
+ "visual.blocks.17.attn.proj.bias": "model-00001-of-00004.safetensors",
455
+ "visual.blocks.17.attn.proj.weight": "model-00001-of-00004.safetensors",
456
+ "visual.blocks.17.attn.qkv.bias": "model-00001-of-00004.safetensors",
457
+ "visual.blocks.17.attn.qkv.weight": "model-00001-of-00004.safetensors",
458
+ "visual.blocks.17.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
459
+ "visual.blocks.17.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
460
+ "visual.blocks.17.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
461
+ "visual.blocks.17.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
462
+ "visual.blocks.17.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
463
+ "visual.blocks.17.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
464
+ "visual.blocks.17.norm1.weight": "model-00001-of-00004.safetensors",
465
+ "visual.blocks.17.norm2.weight": "model-00001-of-00004.safetensors",
466
+ "visual.blocks.18.attn.proj.bias": "model-00001-of-00004.safetensors",
467
+ "visual.blocks.18.attn.proj.weight": "model-00001-of-00004.safetensors",
468
+ "visual.blocks.18.attn.qkv.bias": "model-00001-of-00004.safetensors",
469
+ "visual.blocks.18.attn.qkv.weight": "model-00001-of-00004.safetensors",
470
+ "visual.blocks.18.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
471
+ "visual.blocks.18.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
472
+ "visual.blocks.18.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
473
+ "visual.blocks.18.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
474
+ "visual.blocks.18.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
475
+ "visual.blocks.18.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
476
+ "visual.blocks.18.norm1.weight": "model-00001-of-00004.safetensors",
477
+ "visual.blocks.18.norm2.weight": "model-00001-of-00004.safetensors",
478
+ "visual.blocks.19.attn.proj.bias": "model-00001-of-00004.safetensors",
479
+ "visual.blocks.19.attn.proj.weight": "model-00001-of-00004.safetensors",
480
+ "visual.blocks.19.attn.qkv.bias": "model-00001-of-00004.safetensors",
481
+ "visual.blocks.19.attn.qkv.weight": "model-00001-of-00004.safetensors",
482
+ "visual.blocks.19.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
483
+ "visual.blocks.19.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
484
+ "visual.blocks.19.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
485
+ "visual.blocks.19.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
486
+ "visual.blocks.19.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
487
+ "visual.blocks.19.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
488
+ "visual.blocks.19.norm1.weight": "model-00001-of-00004.safetensors",
489
+ "visual.blocks.19.norm2.weight": "model-00001-of-00004.safetensors",
490
+ "visual.blocks.2.attn.proj.bias": "model-00001-of-00004.safetensors",
491
+ "visual.blocks.2.attn.proj.weight": "model-00001-of-00004.safetensors",
492
+ "visual.blocks.2.attn.qkv.bias": "model-00001-of-00004.safetensors",
493
+ "visual.blocks.2.attn.qkv.weight": "model-00001-of-00004.safetensors",
494
+ "visual.blocks.2.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
495
+ "visual.blocks.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
496
+ "visual.blocks.2.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
497
+ "visual.blocks.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
498
+ "visual.blocks.2.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
499
+ "visual.blocks.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
500
+ "visual.blocks.2.norm1.weight": "model-00001-of-00004.safetensors",
501
+ "visual.blocks.2.norm2.weight": "model-00001-of-00004.safetensors",
502
+ "visual.blocks.20.attn.proj.bias": "model-00001-of-00004.safetensors",
503
+ "visual.blocks.20.attn.proj.weight": "model-00001-of-00004.safetensors",
504
+ "visual.blocks.20.attn.qkv.bias": "model-00001-of-00004.safetensors",
505
+ "visual.blocks.20.attn.qkv.weight": "model-00001-of-00004.safetensors",
506
+ "visual.blocks.20.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
507
+ "visual.blocks.20.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
508
+ "visual.blocks.20.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
509
+ "visual.blocks.20.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
510
+ "visual.blocks.20.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
511
+ "visual.blocks.20.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
512
+ "visual.blocks.20.norm1.weight": "model-00001-of-00004.safetensors",
513
+ "visual.blocks.20.norm2.weight": "model-00001-of-00004.safetensors",
514
+ "visual.blocks.21.attn.proj.bias": "model-00001-of-00004.safetensors",
515
+ "visual.blocks.21.attn.proj.weight": "model-00001-of-00004.safetensors",
516
+ "visual.blocks.21.attn.qkv.bias": "model-00001-of-00004.safetensors",
517
+ "visual.blocks.21.attn.qkv.weight": "model-00001-of-00004.safetensors",
518
+ "visual.blocks.21.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
519
+ "visual.blocks.21.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
520
+ "visual.blocks.21.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
521
+ "visual.blocks.21.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
522
+ "visual.blocks.21.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
523
+ "visual.blocks.21.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
524
+ "visual.blocks.21.norm1.weight": "model-00001-of-00004.safetensors",
525
+ "visual.blocks.21.norm2.weight": "model-00001-of-00004.safetensors",
526
+ "visual.blocks.22.attn.proj.bias": "model-00001-of-00004.safetensors",
527
+ "visual.blocks.22.attn.proj.weight": "model-00001-of-00004.safetensors",
528
+ "visual.blocks.22.attn.qkv.bias": "model-00001-of-00004.safetensors",
529
+ "visual.blocks.22.attn.qkv.weight": "model-00001-of-00004.safetensors",
530
+ "visual.blocks.22.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
531
+ "visual.blocks.22.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
532
+ "visual.blocks.22.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
533
+ "visual.blocks.22.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
534
+ "visual.blocks.22.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
535
+ "visual.blocks.22.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
536
+ "visual.blocks.22.norm1.weight": "model-00001-of-00004.safetensors",
537
+ "visual.blocks.22.norm2.weight": "model-00001-of-00004.safetensors",
538
+ "visual.blocks.23.attn.proj.bias": "model-00001-of-00004.safetensors",
539
+ "visual.blocks.23.attn.proj.weight": "model-00001-of-00004.safetensors",
540
+ "visual.blocks.23.attn.qkv.bias": "model-00001-of-00004.safetensors",
541
+ "visual.blocks.23.attn.qkv.weight": "model-00001-of-00004.safetensors",
542
+ "visual.blocks.23.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
543
+ "visual.blocks.23.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
544
+ "visual.blocks.23.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
545
+ "visual.blocks.23.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
546
+ "visual.blocks.23.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
547
+ "visual.blocks.23.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
548
+ "visual.blocks.23.norm1.weight": "model-00001-of-00004.safetensors",
549
+ "visual.blocks.23.norm2.weight": "model-00001-of-00004.safetensors",
550
+ "visual.blocks.24.attn.proj.bias": "model-00001-of-00004.safetensors",
551
+ "visual.blocks.24.attn.proj.weight": "model-00001-of-00004.safetensors",
552
+ "visual.blocks.24.attn.qkv.bias": "model-00001-of-00004.safetensors",
553
+ "visual.blocks.24.attn.qkv.weight": "model-00001-of-00004.safetensors",
554
+ "visual.blocks.24.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
555
+ "visual.blocks.24.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
556
+ "visual.blocks.24.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
557
+ "visual.blocks.24.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
558
+ "visual.blocks.24.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
559
+ "visual.blocks.24.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
560
+ "visual.blocks.24.norm1.weight": "model-00001-of-00004.safetensors",
561
+ "visual.blocks.24.norm2.weight": "model-00001-of-00004.safetensors",
562
+ "visual.blocks.25.attn.proj.bias": "model-00001-of-00004.safetensors",
563
+ "visual.blocks.25.attn.proj.weight": "model-00001-of-00004.safetensors",
564
+ "visual.blocks.25.attn.qkv.bias": "model-00001-of-00004.safetensors",
565
+ "visual.blocks.25.attn.qkv.weight": "model-00001-of-00004.safetensors",
566
+ "visual.blocks.25.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
567
+ "visual.blocks.25.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
568
+ "visual.blocks.25.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
569
+ "visual.blocks.25.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
570
+ "visual.blocks.25.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
571
+ "visual.blocks.25.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
572
+ "visual.blocks.25.norm1.weight": "model-00001-of-00004.safetensors",
573
+ "visual.blocks.25.norm2.weight": "model-00001-of-00004.safetensors",
574
+ "visual.blocks.26.attn.proj.bias": "model-00001-of-00004.safetensors",
575
+ "visual.blocks.26.attn.proj.weight": "model-00001-of-00004.safetensors",
576
+ "visual.blocks.26.attn.qkv.bias": "model-00001-of-00004.safetensors",
577
+ "visual.blocks.26.attn.qkv.weight": "model-00001-of-00004.safetensors",
578
+ "visual.blocks.26.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
579
+ "visual.blocks.26.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
580
+ "visual.blocks.26.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
581
+ "visual.blocks.26.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
582
+ "visual.blocks.26.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
583
+ "visual.blocks.26.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
584
+ "visual.blocks.26.norm1.weight": "model-00001-of-00004.safetensors",
585
+ "visual.blocks.26.norm2.weight": "model-00001-of-00004.safetensors",
586
+ "visual.blocks.27.attn.proj.bias": "model-00001-of-00004.safetensors",
587
+ "visual.blocks.27.attn.proj.weight": "model-00001-of-00004.safetensors",
588
+ "visual.blocks.27.attn.qkv.bias": "model-00001-of-00004.safetensors",
589
+ "visual.blocks.27.attn.qkv.weight": "model-00001-of-00004.safetensors",
590
+ "visual.blocks.27.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
591
+ "visual.blocks.27.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
592
+ "visual.blocks.27.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
593
+ "visual.blocks.27.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
594
+ "visual.blocks.27.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
595
+ "visual.blocks.27.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
596
+ "visual.blocks.27.norm1.weight": "model-00001-of-00004.safetensors",
597
+ "visual.blocks.27.norm2.weight": "model-00001-of-00004.safetensors",
598
+ "visual.blocks.28.attn.proj.bias": "model-00001-of-00004.safetensors",
599
+ "visual.blocks.28.attn.proj.weight": "model-00001-of-00004.safetensors",
600
+ "visual.blocks.28.attn.qkv.bias": "model-00001-of-00004.safetensors",
601
+ "visual.blocks.28.attn.qkv.weight": "model-00001-of-00004.safetensors",
602
+ "visual.blocks.28.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
603
+ "visual.blocks.28.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
604
+ "visual.blocks.28.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
605
+ "visual.blocks.28.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
606
+ "visual.blocks.28.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
607
+ "visual.blocks.28.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
608
+ "visual.blocks.28.norm1.weight": "model-00001-of-00004.safetensors",
609
+ "visual.blocks.28.norm2.weight": "model-00001-of-00004.safetensors",
610
+ "visual.blocks.29.attn.proj.bias": "model-00001-of-00004.safetensors",
611
+ "visual.blocks.29.attn.proj.weight": "model-00001-of-00004.safetensors",
612
+ "visual.blocks.29.attn.qkv.bias": "model-00001-of-00004.safetensors",
613
+ "visual.blocks.29.attn.qkv.weight": "model-00001-of-00004.safetensors",
614
+ "visual.blocks.29.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
615
+ "visual.blocks.29.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
616
+ "visual.blocks.29.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
617
+ "visual.blocks.29.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
618
+ "visual.blocks.29.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
619
+ "visual.blocks.29.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
620
+ "visual.blocks.29.norm1.weight": "model-00001-of-00004.safetensors",
621
+ "visual.blocks.29.norm2.weight": "model-00001-of-00004.safetensors",
622
+ "visual.blocks.3.attn.proj.bias": "model-00001-of-00004.safetensors",
623
+ "visual.blocks.3.attn.proj.weight": "model-00001-of-00004.safetensors",
624
+ "visual.blocks.3.attn.qkv.bias": "model-00001-of-00004.safetensors",
625
+ "visual.blocks.3.attn.qkv.weight": "model-00001-of-00004.safetensors",
626
+ "visual.blocks.3.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
627
+ "visual.blocks.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
628
+ "visual.blocks.3.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
629
+ "visual.blocks.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
630
+ "visual.blocks.3.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
631
+ "visual.blocks.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
632
+ "visual.blocks.3.norm1.weight": "model-00001-of-00004.safetensors",
633
+ "visual.blocks.3.norm2.weight": "model-00001-of-00004.safetensors",
634
+ "visual.blocks.30.attn.proj.bias": "model-00001-of-00004.safetensors",
635
+ "visual.blocks.30.attn.proj.weight": "model-00001-of-00004.safetensors",
636
+ "visual.blocks.30.attn.qkv.bias": "model-00001-of-00004.safetensors",
637
+ "visual.blocks.30.attn.qkv.weight": "model-00001-of-00004.safetensors",
638
+ "visual.blocks.30.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
639
+ "visual.blocks.30.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
640
+ "visual.blocks.30.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
641
+ "visual.blocks.30.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
642
+ "visual.blocks.30.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
643
+ "visual.blocks.30.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
644
+ "visual.blocks.30.norm1.weight": "model-00001-of-00004.safetensors",
645
+ "visual.blocks.30.norm2.weight": "model-00001-of-00004.safetensors",
646
+ "visual.blocks.31.attn.proj.bias": "model-00001-of-00004.safetensors",
647
+ "visual.blocks.31.attn.proj.weight": "model-00001-of-00004.safetensors",
648
+ "visual.blocks.31.attn.qkv.bias": "model-00001-of-00004.safetensors",
649
+ "visual.blocks.31.attn.qkv.weight": "model-00001-of-00004.safetensors",
650
+ "visual.blocks.31.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
651
+ "visual.blocks.31.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
652
+ "visual.blocks.31.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
653
+ "visual.blocks.31.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
654
+ "visual.blocks.31.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
655
+ "visual.blocks.31.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
656
+ "visual.blocks.31.norm1.weight": "model-00001-of-00004.safetensors",
657
+ "visual.blocks.31.norm2.weight": "model-00001-of-00004.safetensors",
658
+ "visual.blocks.4.attn.proj.bias": "model-00001-of-00004.safetensors",
659
+ "visual.blocks.4.attn.proj.weight": "model-00001-of-00004.safetensors",
660
+ "visual.blocks.4.attn.qkv.bias": "model-00001-of-00004.safetensors",
661
+ "visual.blocks.4.attn.qkv.weight": "model-00001-of-00004.safetensors",
662
+ "visual.blocks.4.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
663
+ "visual.blocks.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
664
+ "visual.blocks.4.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
665
+ "visual.blocks.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
666
+ "visual.blocks.4.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
667
+ "visual.blocks.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
668
+ "visual.blocks.4.norm1.weight": "model-00001-of-00004.safetensors",
669
+ "visual.blocks.4.norm2.weight": "model-00001-of-00004.safetensors",
670
+ "visual.blocks.5.attn.proj.bias": "model-00001-of-00004.safetensors",
671
+ "visual.blocks.5.attn.proj.weight": "model-00001-of-00004.safetensors",
672
+ "visual.blocks.5.attn.qkv.bias": "model-00001-of-00004.safetensors",
673
+ "visual.blocks.5.attn.qkv.weight": "model-00001-of-00004.safetensors",
674
+ "visual.blocks.5.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
675
+ "visual.blocks.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
676
+ "visual.blocks.5.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
677
+ "visual.blocks.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
678
+ "visual.blocks.5.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
679
+ "visual.blocks.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
680
+ "visual.blocks.5.norm1.weight": "model-00001-of-00004.safetensors",
681
+ "visual.blocks.5.norm2.weight": "model-00001-of-00004.safetensors",
682
+ "visual.blocks.6.attn.proj.bias": "model-00001-of-00004.safetensors",
683
+ "visual.blocks.6.attn.proj.weight": "model-00001-of-00004.safetensors",
684
+ "visual.blocks.6.attn.qkv.bias": "model-00001-of-00004.safetensors",
685
+ "visual.blocks.6.attn.qkv.weight": "model-00001-of-00004.safetensors",
686
+ "visual.blocks.6.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
687
+ "visual.blocks.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
688
+ "visual.blocks.6.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
689
+ "visual.blocks.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
690
+ "visual.blocks.6.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
691
+ "visual.blocks.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
692
+ "visual.blocks.6.norm1.weight": "model-00001-of-00004.safetensors",
693
+ "visual.blocks.6.norm2.weight": "model-00001-of-00004.safetensors",
694
+ "visual.blocks.7.attn.proj.bias": "model-00001-of-00004.safetensors",
695
+ "visual.blocks.7.attn.proj.weight": "model-00001-of-00004.safetensors",
696
+ "visual.blocks.7.attn.qkv.bias": "model-00001-of-00004.safetensors",
697
+ "visual.blocks.7.attn.qkv.weight": "model-00001-of-00004.safetensors",
698
+ "visual.blocks.7.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
699
+ "visual.blocks.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
700
+ "visual.blocks.7.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
701
+ "visual.blocks.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
702
+ "visual.blocks.7.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
703
+ "visual.blocks.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
704
+ "visual.blocks.7.norm1.weight": "model-00001-of-00004.safetensors",
705
+ "visual.blocks.7.norm2.weight": "model-00001-of-00004.safetensors",
706
+ "visual.blocks.8.attn.proj.bias": "model-00001-of-00004.safetensors",
707
+ "visual.blocks.8.attn.proj.weight": "model-00001-of-00004.safetensors",
708
+ "visual.blocks.8.attn.qkv.bias": "model-00001-of-00004.safetensors",
709
+ "visual.blocks.8.attn.qkv.weight": "model-00001-of-00004.safetensors",
710
+ "visual.blocks.8.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
711
+ "visual.blocks.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
712
+ "visual.blocks.8.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
713
+ "visual.blocks.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
714
+ "visual.blocks.8.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
715
+ "visual.blocks.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
716
+ "visual.blocks.8.norm1.weight": "model-00001-of-00004.safetensors",
717
+ "visual.blocks.8.norm2.weight": "model-00001-of-00004.safetensors",
718
+ "visual.blocks.9.attn.proj.bias": "model-00001-of-00004.safetensors",
719
+ "visual.blocks.9.attn.proj.weight": "model-00001-of-00004.safetensors",
720
+ "visual.blocks.9.attn.qkv.bias": "model-00001-of-00004.safetensors",
721
+ "visual.blocks.9.attn.qkv.weight": "model-00001-of-00004.safetensors",
722
+ "visual.blocks.9.mlp.down_proj.bias": "model-00001-of-00004.safetensors",
723
+ "visual.blocks.9.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
724
+ "visual.blocks.9.mlp.gate_proj.bias": "model-00001-of-00004.safetensors",
725
+ "visual.blocks.9.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
726
+ "visual.blocks.9.mlp.up_proj.bias": "model-00001-of-00004.safetensors",
727
+ "visual.blocks.9.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
728
+ "visual.blocks.9.norm1.weight": "model-00001-of-00004.safetensors",
729
+ "visual.blocks.9.norm2.weight": "model-00001-of-00004.safetensors",
730
+ "visual.merger.ln_q.weight": "model-00001-of-00004.safetensors",
731
+ "visual.merger.mlp.0.bias": "model-00001-of-00004.safetensors",
732
+ "visual.merger.mlp.0.weight": "model-00001-of-00004.safetensors",
733
+ "visual.merger.mlp.2.bias": "model-00001-of-00004.safetensors",
734
+ "visual.merger.mlp.2.weight": "model-00001-of-00004.safetensors",
735
+ "visual.patch_embed.proj.weight": "model-00001-of-00004.safetensors"
736
+ }
737
+ }
preprocessor_config.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "disable_grouping": null,
7
+ "do_center_crop": null,
8
+ "do_convert_rgb": true,
9
+ "do_normalize": true,
10
+ "do_rescale": true,
11
+ "do_resize": true,
12
+ "image_mean": [
13
+ 0.48145466,
14
+ 0.4578275,
15
+ 0.40821073
16
+ ],
17
+ "image_processor_type": "Qwen2VLImageProcessorFast",
18
+ "image_std": [
19
+ 0.26862954,
20
+ 0.26130258,
21
+ 0.27577711
22
+ ],
23
+ "input_data_format": null,
24
+ "max_pixels": 12845056,
25
+ "merge_size": 2,
26
+ "min_pixels": 3136,
27
+ "patch_size": 14,
28
+ "processor_class": "Qwen2_5_VLProcessor",
29
+ "resample": 3,
30
+ "rescale_factor": 0.00392156862745098,
31
+ "return_tensors": null,
32
+ "size": {
33
+ "longest_edge": 12845056,
34
+ "shortest_edge": 3136
35
+ },
36
+ "temporal_patch_size": 2
37
+ }
rng_state_0.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:478b41e9f26d338fd8f896e08cad1adab7c423b61f1b45754113bc78d256a3f9
3
+ size 16389
rng_state_1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ce29a8767a7d907dd24987aa2c3e654d4317f3042fbc13b5b72cadb46d43311a
3
+ size 16389
rng_state_2.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:61a48db011646b4e9a867bf12f4a233cad5dfbfe309686f8996c250196d3783a
3
+ size 16389
rng_state_3.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b9562ee822472a4f01dcd6349ab3d1ef42a48915fe3b92e843a0c37db53c8421
3
+ size 16389
rng_state_4.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7d2767d83c3bf27f12db022b0632e2c4f8c164274ba75e380cf18f9d5f21819
3
+ size 16389
rng_state_5.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76816358d4e5db8149d60d55234db658d67a13c0c1ce05d7404cf7125a676a5c
3
+ size 16389
rng_state_6.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1562e7520c977d178183d641f70abcf3f57da2489938756cfbebf9b6e6c1a9fd
3
+ size 16389
rng_state_7.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6b6cabaed045c5398cd1b732f7ec48bd363f3b43cd24e0e70e641a42bd00c28
3
+ size 16389
scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:35277d637a1704fbdf5c960b22475478a524394a3809ddf0f6c2eb198b7cd21c
3
+ size 1465
special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
tokenizer_config.json ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "right",
205
+ "processor_class": "Qwen2_5_VLProcessor",
206
+ "split_special_tokens": false,
207
+ "tokenizer_class": "Qwen2Tokenizer",
208
+ "unk_token": null
209
+ }
trainer_state.json ADDED
@@ -0,0 +1,1429 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 938,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.010666666666666666,
14
+ "grad_norm": 46.42798146527443,
15
+ "learning_rate": 9.574468085106382e-08,
16
+ "logits/chosen": 2.382258415222168,
17
+ "logits/rejected": 2.7442336082458496,
18
+ "logps/chosen": -138.31182861328125,
19
+ "logps/rejected": -147.63272094726562,
20
+ "loss": 0.6936,
21
+ "rewards/accuracies": 0.4281249940395355,
22
+ "rewards/chosen": -0.0015843239380046725,
23
+ "rewards/margins": 3.248453140258789e-06,
24
+ "rewards/rejected": -0.0015875725075602531,
25
+ "step": 10
26
+ },
27
+ {
28
+ "epoch": 0.021333333333333333,
29
+ "grad_norm": 48.61255110254263,
30
+ "learning_rate": 2.0212765957446807e-07,
31
+ "logits/chosen": 2.313032627105713,
32
+ "logits/rejected": 2.844193458557129,
33
+ "logps/chosen": -138.99807739257812,
34
+ "logps/rejected": -135.5998992919922,
35
+ "loss": 0.6931,
36
+ "rewards/accuracies": 0.512499988079071,
37
+ "rewards/chosen": -0.008929151110351086,
38
+ "rewards/margins": 0.0014812585432082415,
39
+ "rewards/rejected": -0.010410408489406109,
40
+ "step": 20
41
+ },
42
+ {
43
+ "epoch": 0.032,
44
+ "grad_norm": 41.089142727664616,
45
+ "learning_rate": 3.085106382978723e-07,
46
+ "logits/chosen": 2.291680097579956,
47
+ "logits/rejected": 2.7390356063842773,
48
+ "logps/chosen": -126.77516174316406,
49
+ "logps/rejected": -131.89663696289062,
50
+ "loss": 0.6804,
51
+ "rewards/accuracies": 0.6312499642372131,
52
+ "rewards/chosen": -0.013881472870707512,
53
+ "rewards/margins": 0.028002463281154633,
54
+ "rewards/rejected": -0.041883938014507294,
55
+ "step": 30
56
+ },
57
+ {
58
+ "epoch": 0.042666666666666665,
59
+ "grad_norm": 45.92788214228633,
60
+ "learning_rate": 4.148936170212766e-07,
61
+ "logits/chosen": 2.523430585861206,
62
+ "logits/rejected": 2.84600567817688,
63
+ "logps/chosen": -134.11170959472656,
64
+ "logps/rejected": -140.52801513671875,
65
+ "loss": 0.6492,
66
+ "rewards/accuracies": 0.6937500238418579,
67
+ "rewards/chosen": -0.009366204962134361,
68
+ "rewards/margins": 0.1010168194770813,
69
+ "rewards/rejected": -0.11038301885128021,
70
+ "step": 40
71
+ },
72
+ {
73
+ "epoch": 0.05333333333333334,
74
+ "grad_norm": 39.55493674378011,
75
+ "learning_rate": 5.212765957446809e-07,
76
+ "logits/chosen": 2.463056802749634,
77
+ "logits/rejected": 3.003122329711914,
78
+ "logps/chosen": -134.13511657714844,
79
+ "logps/rejected": -153.1929473876953,
80
+ "loss": 0.6023,
81
+ "rewards/accuracies": 0.7468750476837158,
82
+ "rewards/chosen": -0.04466788098216057,
83
+ "rewards/margins": 0.24204480648040771,
84
+ "rewards/rejected": -0.2867127060890198,
85
+ "step": 50
86
+ },
87
+ {
88
+ "epoch": 0.064,
89
+ "grad_norm": 30.3497904000491,
90
+ "learning_rate": 6.276595744680851e-07,
91
+ "logits/chosen": 2.2184526920318604,
92
+ "logits/rejected": 2.7464191913604736,
93
+ "logps/chosen": -125.64830017089844,
94
+ "logps/rejected": -129.73838806152344,
95
+ "loss": 0.5494,
96
+ "rewards/accuracies": 0.7531250715255737,
97
+ "rewards/chosen": 0.06083741411566734,
98
+ "rewards/margins": 0.429790735244751,
99
+ "rewards/rejected": -0.3689533472061157,
100
+ "step": 60
101
+ },
102
+ {
103
+ "epoch": 0.07466666666666667,
104
+ "grad_norm": 38.90457231803085,
105
+ "learning_rate": 7.340425531914893e-07,
106
+ "logits/chosen": 2.197575569152832,
107
+ "logits/rejected": 2.578211784362793,
108
+ "logps/chosen": -135.662353515625,
109
+ "logps/rejected": -136.03961181640625,
110
+ "loss": 0.5285,
111
+ "rewards/accuracies": 0.7562500238418579,
112
+ "rewards/chosen": 0.29434823989868164,
113
+ "rewards/margins": 0.5787851810455322,
114
+ "rewards/rejected": -0.28443700075149536,
115
+ "step": 70
116
+ },
117
+ {
118
+ "epoch": 0.08533333333333333,
119
+ "grad_norm": 43.610746377169825,
120
+ "learning_rate": 8.404255319148936e-07,
121
+ "logits/chosen": 2.2334251403808594,
122
+ "logits/rejected": 2.8087658882141113,
123
+ "logps/chosen": -132.78274536132812,
124
+ "logps/rejected": -142.40440368652344,
125
+ "loss": 0.4869,
126
+ "rewards/accuracies": 0.7718750834465027,
127
+ "rewards/chosen": 0.16994354128837585,
128
+ "rewards/margins": 0.8923333287239075,
129
+ "rewards/rejected": -0.722389817237854,
130
+ "step": 80
131
+ },
132
+ {
133
+ "epoch": 0.096,
134
+ "grad_norm": 26.982449978848148,
135
+ "learning_rate": 9.468085106382978e-07,
136
+ "logits/chosen": 2.2855160236358643,
137
+ "logits/rejected": 2.6503257751464844,
138
+ "logps/chosen": -136.7552947998047,
139
+ "logps/rejected": -148.2338104248047,
140
+ "loss": 0.4819,
141
+ "rewards/accuracies": 0.778124988079071,
142
+ "rewards/chosen": -0.1698712259531021,
143
+ "rewards/margins": 1.1215158700942993,
144
+ "rewards/rejected": -1.2913870811462402,
145
+ "step": 90
146
+ },
147
+ {
148
+ "epoch": 0.10666666666666667,
149
+ "grad_norm": 29.896207171331433,
150
+ "learning_rate": 9.999134070902206e-07,
151
+ "logits/chosen": 2.382267951965332,
152
+ "logits/rejected": 2.517061948776245,
153
+ "logps/chosen": -126.52420806884766,
154
+ "logps/rejected": -158.33543395996094,
155
+ "loss": 0.4183,
156
+ "rewards/accuracies": 0.778124988079071,
157
+ "rewards/chosen": -0.3784619867801666,
158
+ "rewards/margins": 1.4470703601837158,
159
+ "rewards/rejected": -1.82553231716156,
160
+ "step": 100
161
+ },
162
+ {
163
+ "epoch": 0.11733333333333333,
164
+ "grad_norm": 22.595589720258207,
165
+ "learning_rate": 9.99220843761565e-07,
166
+ "logits/chosen": 2.2189204692840576,
167
+ "logits/rejected": 2.7971031665802,
168
+ "logps/chosen": -145.4073486328125,
169
+ "logps/rejected": -148.38609313964844,
170
+ "loss": 0.3777,
171
+ "rewards/accuracies": 0.8343750238418579,
172
+ "rewards/chosen": -0.6497014760971069,
173
+ "rewards/margins": 1.5745065212249756,
174
+ "rewards/rejected": -2.224207878112793,
175
+ "step": 110
176
+ },
177
+ {
178
+ "epoch": 0.128,
179
+ "grad_norm": 30.721800605452906,
180
+ "learning_rate": 9.97836676558346e-07,
181
+ "logits/chosen": 2.174267292022705,
182
+ "logits/rejected": 2.853877544403076,
183
+ "logps/chosen": -142.8085479736328,
184
+ "logps/rejected": -149.57443237304688,
185
+ "loss": 0.4048,
186
+ "rewards/accuracies": 0.796875,
187
+ "rewards/chosen": -0.5126703381538391,
188
+ "rewards/margins": 1.7328587770462036,
189
+ "rewards/rejected": -2.2455291748046875,
190
+ "step": 120
191
+ },
192
+ {
193
+ "epoch": 0.13866666666666666,
194
+ "grad_norm": 34.136357440276,
195
+ "learning_rate": 9.957628230595525e-07,
196
+ "logits/chosen": 2.1472320556640625,
197
+ "logits/rejected": 2.640443801879883,
198
+ "logps/chosen": -141.8199005126953,
199
+ "logps/rejected": -155.64419555664062,
200
+ "loss": 0.4673,
201
+ "rewards/accuracies": 0.746874988079071,
202
+ "rewards/chosen": -0.25329455733299255,
203
+ "rewards/margins": 1.7003322839736938,
204
+ "rewards/rejected": -1.9536267518997192,
205
+ "step": 130
206
+ },
207
+ {
208
+ "epoch": 0.14933333333333335,
209
+ "grad_norm": 42.43483647388403,
210
+ "learning_rate": 9.9300215631252e-07,
211
+ "logits/chosen": 2.1069414615631104,
212
+ "logits/rejected": 2.291423797607422,
213
+ "logps/chosen": -125.19463348388672,
214
+ "logps/rejected": -158.3802032470703,
215
+ "loss": 0.3758,
216
+ "rewards/accuracies": 0.8437499403953552,
217
+ "rewards/chosen": 0.07672759145498276,
218
+ "rewards/margins": 2.0006155967712402,
219
+ "rewards/rejected": -1.9238877296447754,
220
+ "step": 140
221
+ },
222
+ {
223
+ "epoch": 0.16,
224
+ "grad_norm": 36.02384513576028,
225
+ "learning_rate": 9.895585008527075e-07,
226
+ "logits/chosen": 2.3169870376586914,
227
+ "logits/rejected": 2.500821590423584,
228
+ "logps/chosen": -138.5598907470703,
229
+ "logps/rejected": -168.0284881591797,
230
+ "loss": 0.4251,
231
+ "rewards/accuracies": 0.8218750953674316,
232
+ "rewards/chosen": -0.4848916530609131,
233
+ "rewards/margins": 2.033202886581421,
234
+ "rewards/rejected": -2.518094301223755,
235
+ "step": 150
236
+ },
237
+ {
238
+ "epoch": 0.17066666666666666,
239
+ "grad_norm": 15.042416822714568,
240
+ "learning_rate": 9.854366274053124e-07,
241
+ "logits/chosen": 2.0532939434051514,
242
+ "logits/rejected": 2.4741499423980713,
243
+ "logps/chosen": -135.32594299316406,
244
+ "logps/rejected": -155.9827880859375,
245
+ "loss": 0.3675,
246
+ "rewards/accuracies": 0.8187500238418579,
247
+ "rewards/chosen": -0.19416652619838715,
248
+ "rewards/margins": 2.0035641193389893,
249
+ "rewards/rejected": -2.197730541229248,
250
+ "step": 160
251
+ },
252
+ {
253
+ "epoch": 0.18133333333333335,
254
+ "grad_norm": 29.38433081829733,
255
+ "learning_rate": 9.806422462760687e-07,
256
+ "logits/chosen": 2.0831096172332764,
257
+ "logits/rejected": 2.4652578830718994,
258
+ "logps/chosen": -139.71656799316406,
259
+ "logps/rejected": -160.39244079589844,
260
+ "loss": 0.4298,
261
+ "rewards/accuracies": 0.8062499761581421,
262
+ "rewards/chosen": -0.6061689853668213,
263
+ "rewards/margins": 2.0064656734466553,
264
+ "rewards/rejected": -2.6126346588134766,
265
+ "step": 170
266
+ },
267
+ {
268
+ "epoch": 0.192,
269
+ "grad_norm": 30.944868310764818,
270
+ "learning_rate": 9.7518199944038e-07,
271
+ "logits/chosen": 2.0607786178588867,
272
+ "logits/rejected": 2.363945245742798,
273
+ "logps/chosen": -136.1494598388672,
274
+ "logps/rejected": -157.36947631835938,
275
+ "loss": 0.4231,
276
+ "rewards/accuracies": 0.8125,
277
+ "rewards/chosen": -0.39128583669662476,
278
+ "rewards/margins": 1.969771385192871,
279
+ "rewards/rejected": -2.3610572814941406,
280
+ "step": 180
281
+ },
282
+ {
283
+ "epoch": 0.20266666666666666,
284
+ "grad_norm": 42.05053142859476,
285
+ "learning_rate": 9.690634513417486e-07,
286
+ "logits/chosen": 2.169426918029785,
287
+ "logits/rejected": 2.328373670578003,
288
+ "logps/chosen": -135.956298828125,
289
+ "logps/rejected": -170.6649169921875,
290
+ "loss": 0.3561,
291
+ "rewards/accuracies": 0.859375,
292
+ "rewards/chosen": -0.7195563912391663,
293
+ "rewards/margins": 2.492656946182251,
294
+ "rewards/rejected": -3.2122128009796143,
295
+ "step": 190
296
+ },
297
+ {
298
+ "epoch": 0.21333333333333335,
299
+ "grad_norm": 35.520349085403645,
300
+ "learning_rate": 9.622950784122471e-07,
301
+ "logits/chosen": 2.020885944366455,
302
+ "logits/rejected": 2.4902377128601074,
303
+ "logps/chosen": -139.86936950683594,
304
+ "logps/rejected": -165.3739013671875,
305
+ "loss": 0.3948,
306
+ "rewards/accuracies": 0.8093750476837158,
307
+ "rewards/chosen": -0.8580716252326965,
308
+ "rewards/margins": 2.244415283203125,
309
+ "rewards/rejected": -3.1024868488311768,
310
+ "step": 200
311
+ },
312
+ {
313
+ "epoch": 0.224,
314
+ "grad_norm": 30.914393418212303,
315
+ "learning_rate": 9.54886257329555e-07,
316
+ "logits/chosen": 2.032724380493164,
317
+ "logits/rejected": 2.399937152862549,
318
+ "logps/chosen": -131.96702575683594,
319
+ "logps/rejected": -155.24063110351562,
320
+ "loss": 0.4144,
321
+ "rewards/accuracies": 0.8125000596046448,
322
+ "rewards/chosen": -0.2814589738845825,
323
+ "rewards/margins": 2.195185422897339,
324
+ "rewards/rejected": -2.476644515991211,
325
+ "step": 210
326
+ },
327
+ {
328
+ "epoch": 0.23466666666666666,
329
+ "grad_norm": 33.70657801293605,
330
+ "learning_rate": 9.468472520268205e-07,
331
+ "logits/chosen": 2.0024030208587646,
332
+ "logits/rejected": 2.4853296279907227,
333
+ "logps/chosen": -137.35540771484375,
334
+ "logps/rejected": -155.17100524902344,
335
+ "loss": 0.38,
336
+ "rewards/accuracies": 0.8218749761581421,
337
+ "rewards/chosen": -0.004300939850509167,
338
+ "rewards/margins": 2.1314268112182617,
339
+ "rewards/rejected": -2.135727643966675,
340
+ "step": 220
341
+ },
342
+ {
343
+ "epoch": 0.24533333333333332,
344
+ "grad_norm": 37.495577601362356,
345
+ "learning_rate": 9.381891994733519e-07,
346
+ "logits/chosen": 2.243539810180664,
347
+ "logits/rejected": 2.352928400039673,
348
+ "logps/chosen": -129.24896240234375,
349
+ "logps/rejected": -169.98135375976562,
350
+ "loss": 0.3679,
351
+ "rewards/accuracies": 0.8218750357627869,
352
+ "rewards/chosen": -0.28722256422042847,
353
+ "rewards/margins": 2.4391627311706543,
354
+ "rewards/rejected": -2.7263851165771484,
355
+ "step": 230
356
+ },
357
+ {
358
+ "epoch": 0.256,
359
+ "grad_norm": 25.343951064869156,
360
+ "learning_rate": 9.289240942458321e-07,
361
+ "logits/chosen": 2.0728087425231934,
362
+ "logits/rejected": 2.4773340225219727,
363
+ "logps/chosen": -135.0269775390625,
364
+ "logps/rejected": -168.2124481201172,
365
+ "loss": 0.3123,
366
+ "rewards/accuracies": 0.846875011920929,
367
+ "rewards/chosen": -0.4651724398136139,
368
+ "rewards/margins": 2.803205966949463,
369
+ "rewards/rejected": -3.268378734588623,
370
+ "step": 240
371
+ },
372
+ {
373
+ "epoch": 0.26666666666666666,
374
+ "grad_norm": 29.0620379587301,
375
+ "learning_rate": 9.190647719114326e-07,
376
+ "logits/chosen": 1.8383233547210693,
377
+ "logits/rejected": 2.2059688568115234,
378
+ "logps/chosen": -136.12245178222656,
379
+ "logps/rejected": -174.0221405029297,
380
+ "loss": 0.4021,
381
+ "rewards/accuracies": 0.8250000476837158,
382
+ "rewards/chosen": -0.4718845784664154,
383
+ "rewards/margins": 2.437692642211914,
384
+ "rewards/rejected": -2.9095773696899414,
385
+ "step": 250
386
+ },
387
+ {
388
+ "epoch": 0.2773333333333333,
389
+ "grad_norm": 32.206451154260584,
390
+ "learning_rate": 9.086248912458483e-07,
391
+ "logits/chosen": 1.901822805404663,
392
+ "logits/rejected": 2.3750545978546143,
393
+ "logps/chosen": -137.2100830078125,
394
+ "logps/rejected": -160.05950927734375,
395
+ "loss": 0.3368,
396
+ "rewards/accuracies": 0.8374999761581421,
397
+ "rewards/chosen": -0.45527350902557373,
398
+ "rewards/margins": 2.3181633949279785,
399
+ "rewards/rejected": -2.7734367847442627,
400
+ "step": 260
401
+ },
402
+ {
403
+ "epoch": 0.288,
404
+ "grad_norm": 22.28437598082458,
405
+ "learning_rate": 8.976189153108852e-07,
406
+ "logits/chosen": 1.8406670093536377,
407
+ "logits/rejected": 2.231459617614746,
408
+ "logps/chosen": -136.8876495361328,
409
+ "logps/rejected": -162.5147247314453,
410
+ "loss": 0.3545,
411
+ "rewards/accuracies": 0.8468750715255737,
412
+ "rewards/chosen": -0.5715955495834351,
413
+ "rewards/margins": 2.6194005012512207,
414
+ "rewards/rejected": -3.1909961700439453,
415
+ "step": 270
416
+ },
417
+ {
418
+ "epoch": 0.2986666666666667,
419
+ "grad_norm": 26.163035485727516,
420
+ "learning_rate": 8.860620914178187e-07,
421
+ "logits/chosen": 1.7282989025115967,
422
+ "logits/rejected": 2.090369462966919,
423
+ "logps/chosen": -133.57354736328125,
424
+ "logps/rejected": -169.84217834472656,
425
+ "loss": 0.3629,
426
+ "rewards/accuracies": 0.8374999761581421,
427
+ "rewards/chosen": -0.4477410912513733,
428
+ "rewards/margins": 2.430133581161499,
429
+ "rewards/rejected": -2.8778748512268066,
430
+ "step": 280
431
+ },
432
+ {
433
+ "epoch": 0.30933333333333335,
434
+ "grad_norm": 27.544558000777418,
435
+ "learning_rate": 8.739704300042778e-07,
436
+ "logits/chosen": 1.6349306106567383,
437
+ "logits/rejected": 2.2942090034484863,
438
+ "logps/chosen": -137.38523864746094,
439
+ "logps/rejected": -158.7125244140625,
440
+ "loss": 0.3788,
441
+ "rewards/accuracies": 0.8500000238418579,
442
+ "rewards/chosen": -0.5446420907974243,
443
+ "rewards/margins": 2.5425944328308105,
444
+ "rewards/rejected": -3.0872364044189453,
445
+ "step": 290
446
+ },
447
+ {
448
+ "epoch": 0.32,
449
+ "grad_norm": 28.15835022856019,
450
+ "learning_rate": 8.613606824539197e-07,
451
+ "logits/chosen": 1.611955165863037,
452
+ "logits/rejected": 2.1477112770080566,
453
+ "logps/chosen": -137.20155334472656,
454
+ "logps/rejected": -163.80764770507812,
455
+ "loss": 0.3507,
456
+ "rewards/accuracies": 0.8500000238418579,
457
+ "rewards/chosen": -0.41365841031074524,
458
+ "rewards/margins": 2.4483883380889893,
459
+ "rewards/rejected": -2.8620471954345703,
460
+ "step": 300
461
+ },
462
+ {
463
+ "epoch": 0.33066666666666666,
464
+ "grad_norm": 26.9394473330418,
465
+ "learning_rate": 8.482503178896226e-07,
466
+ "logits/chosen": 1.5950888395309448,
467
+ "logits/rejected": 1.8947237730026245,
468
+ "logps/chosen": -138.35462951660156,
469
+ "logps/rejected": -167.69569396972656,
470
+ "loss": 0.3696,
471
+ "rewards/accuracies": 0.831250011920929,
472
+ "rewards/chosen": -0.8275109529495239,
473
+ "rewards/margins": 2.685105323791504,
474
+ "rewards/rejected": -3.5126163959503174,
475
+ "step": 310
476
+ },
477
+ {
478
+ "epoch": 0.3413333333333333,
479
+ "grad_norm": 23.332637225870037,
480
+ "learning_rate": 8.346574989723469e-07,
481
+ "logits/chosen": 1.7531938552856445,
482
+ "logits/rejected": 2.2487497329711914,
483
+ "logps/chosen": -143.16526794433594,
484
+ "logps/rejected": -166.20895385742188,
485
+ "loss": 0.3582,
486
+ "rewards/accuracies": 0.8437500596046448,
487
+ "rewards/chosen": -0.5256407260894775,
488
+ "rewards/margins": 2.5411202907562256,
489
+ "rewards/rejected": -3.0667612552642822,
490
+ "step": 320
491
+ },
492
+ {
493
+ "epoch": 0.352,
494
+ "grad_norm": 21.87942904244536,
495
+ "learning_rate": 8.206010567391916e-07,
496
+ "logits/chosen": 1.593059778213501,
497
+ "logits/rejected": 2.1650094985961914,
498
+ "logps/chosen": -131.16326904296875,
499
+ "logps/rejected": -158.94223022460938,
500
+ "loss": 0.3461,
501
+ "rewards/accuracies": 0.8468750715255737,
502
+ "rewards/chosen": -0.4646984934806824,
503
+ "rewards/margins": 2.3126156330108643,
504
+ "rewards/rejected": -2.7773139476776123,
505
+ "step": 330
506
+ },
507
+ {
508
+ "epoch": 0.3626666666666667,
509
+ "grad_norm": 31.101098463657916,
510
+ "learning_rate": 8.061004645155048e-07,
511
+ "logits/chosen": 1.5677554607391357,
512
+ "logits/rejected": 1.9733402729034424,
513
+ "logps/chosen": -146.71609497070312,
514
+ "logps/rejected": -171.69955444335938,
515
+ "loss": 0.3235,
516
+ "rewards/accuracies": 0.8656250238418579,
517
+ "rewards/chosen": -0.48211732506752014,
518
+ "rewards/margins": 2.4466898441314697,
519
+ "rewards/rejected": -2.928807020187378,
520
+ "step": 340
521
+ },
522
+ {
523
+ "epoch": 0.37333333333333335,
524
+ "grad_norm": 30.645699097464068,
525
+ "learning_rate": 7.911758109371889e-07,
526
+ "logits/chosen": 1.6403833627700806,
527
+ "logits/rejected": 1.8808367252349854,
528
+ "logps/chosen": -140.71217346191406,
529
+ "logps/rejected": -175.47921752929688,
530
+ "loss": 0.3459,
531
+ "rewards/accuracies": 0.8437500596046448,
532
+ "rewards/chosen": -0.7281967401504517,
533
+ "rewards/margins": 2.3733932971954346,
534
+ "rewards/rejected": -3.1015896797180176,
535
+ "step": 350
536
+ },
537
+ {
538
+ "epoch": 0.384,
539
+ "grad_norm": 24.51134274765049,
540
+ "learning_rate": 7.758477721205765e-07,
541
+ "logits/chosen": 1.7783101797103882,
542
+ "logits/rejected": 2.1137070655822754,
543
+ "logps/chosen": -137.43540954589844,
544
+ "logps/rejected": -163.09820556640625,
545
+ "loss": 0.3483,
546
+ "rewards/accuracies": 0.8375000357627869,
547
+ "rewards/chosen": -0.7030624747276306,
548
+ "rewards/margins": 2.4687132835388184,
549
+ "rewards/rejected": -3.1717755794525146,
550
+ "step": 360
551
+ },
552
+ {
553
+ "epoch": 0.39466666666666667,
554
+ "grad_norm": 18.074083008686905,
555
+ "learning_rate": 7.601375830184295e-07,
556
+ "logits/chosen": 1.5980160236358643,
557
+ "logits/rejected": 1.8996552228927612,
558
+ "logps/chosen": -140.5148162841797,
559
+ "logps/rejected": -171.46630859375,
560
+ "loss": 0.313,
561
+ "rewards/accuracies": 0.8875000476837158,
562
+ "rewards/chosen": -0.7159416675567627,
563
+ "rewards/margins": 2.979588508605957,
564
+ "rewards/rejected": -3.695530414581299,
565
+ "step": 370
566
+ },
567
+ {
568
+ "epoch": 0.4053333333333333,
569
+ "grad_norm": 29.247823050349254,
570
+ "learning_rate": 7.440670080017454e-07,
571
+ "logits/chosen": 1.5842480659484863,
572
+ "logits/rejected": 2.019357919692993,
573
+ "logps/chosen": -132.16673278808594,
574
+ "logps/rejected": -162.817138671875,
575
+ "loss": 0.3415,
576
+ "rewards/accuracies": 0.8374999761581421,
577
+ "rewards/chosen": -0.69346022605896,
578
+ "rewards/margins": 2.366546392440796,
579
+ "rewards/rejected": -3.060007095336914,
580
+ "step": 380
581
+ },
582
+ {
583
+ "epoch": 0.416,
584
+ "grad_norm": 31.9631460774584,
585
+ "learning_rate": 7.276583107081242e-07,
586
+ "logits/chosen": 1.6213833093643188,
587
+ "logits/rejected": 2.0778937339782715,
588
+ "logps/chosen": -141.9418487548828,
589
+ "logps/rejected": -174.61468505859375,
590
+ "loss": 0.3329,
591
+ "rewards/accuracies": 0.8624999523162842,
592
+ "rewards/chosen": -1.060402750968933,
593
+ "rewards/margins": 2.9680538177490234,
594
+ "rewards/rejected": -4.028456211090088,
595
+ "step": 390
596
+ },
597
+ {
598
+ "epoch": 0.4266666666666667,
599
+ "grad_norm": 26.16381075907044,
600
+ "learning_rate": 7.109342231984698e-07,
601
+ "logits/chosen": 1.3554099798202515,
602
+ "logits/rejected": 2.0283570289611816,
603
+ "logps/chosen": -138.100341796875,
604
+ "logps/rejected": -159.7268829345703,
605
+ "loss": 0.3786,
606
+ "rewards/accuracies": 0.8343749642372131,
607
+ "rewards/chosen": -1.067877173423767,
608
+ "rewards/margins": 2.4551329612731934,
609
+ "rewards/rejected": -3.52301025390625,
610
+ "step": 400
611
+ },
612
+ {
613
+ "epoch": 0.43733333333333335,
614
+ "grad_norm": 11.140411163657106,
615
+ "learning_rate": 6.939179144647515e-07,
616
+ "logits/chosen": 1.385094165802002,
617
+ "logits/rejected": 1.9930095672607422,
618
+ "logps/chosen": -129.92941284179688,
619
+ "logps/rejected": -155.423095703125,
620
+ "loss": 0.2945,
621
+ "rewards/accuracies": 0.878125011920929,
622
+ "rewards/chosen": -0.29770562052726746,
623
+ "rewards/margins": 2.735492706298828,
624
+ "rewards/rejected": -3.033198118209839,
625
+ "step": 410
626
+ },
627
+ {
628
+ "epoch": 0.448,
629
+ "grad_norm": 28.56957418913061,
630
+ "learning_rate": 6.766329583324581e-07,
631
+ "logits/chosen": 1.437608003616333,
632
+ "logits/rejected": 1.9395537376403809,
633
+ "logps/chosen": -131.2879638671875,
634
+ "logps/rejected": -159.0998992919922,
635
+ "loss": 0.3127,
636
+ "rewards/accuracies": 0.856249988079071,
637
+ "rewards/chosen": -0.49805647134780884,
638
+ "rewards/margins": 2.613166570663452,
639
+ "rewards/rejected": -3.111222743988037,
640
+ "step": 420
641
+ },
642
+ {
643
+ "epoch": 0.45866666666666667,
644
+ "grad_norm": 33.23437534680263,
645
+ "learning_rate": 6.591033008022067e-07,
646
+ "logits/chosen": 1.3858729600906372,
647
+ "logits/rejected": 1.6989398002624512,
648
+ "logps/chosen": -134.23770141601562,
649
+ "logps/rejected": -176.20919799804688,
650
+ "loss": 0.3096,
651
+ "rewards/accuracies": 0.871874988079071,
652
+ "rewards/chosen": -0.535869836807251,
653
+ "rewards/margins": 3.0371181964874268,
654
+ "rewards/rejected": -3.572988510131836,
655
+ "step": 430
656
+ },
657
+ {
658
+ "epoch": 0.4693333333333333,
659
+ "grad_norm": 28.346897945132458,
660
+ "learning_rate": 6.413532268757537e-07,
661
+ "logits/chosen": 1.4444637298583984,
662
+ "logits/rejected": 1.6581867933273315,
663
+ "logps/chosen": -138.23431396484375,
664
+ "logps/rejected": -168.9981689453125,
665
+ "loss": 0.4194,
666
+ "rewards/accuracies": 0.796875,
667
+ "rewards/chosen": -0.7419812083244324,
668
+ "rewards/margins": 2.335297107696533,
669
+ "rewards/rejected": -3.0772783756256104,
670
+ "step": 440
671
+ },
672
+ {
673
+ "epoch": 0.48,
674
+ "grad_norm": 29.164626019209308,
675
+ "learning_rate": 6.234073269123653e-07,
676
+ "logits/chosen": 1.393236756324768,
677
+ "logits/rejected": 1.7022786140441895,
678
+ "logps/chosen": -142.96592712402344,
679
+ "logps/rejected": -173.4695587158203,
680
+ "loss": 0.3408,
681
+ "rewards/accuracies": 0.8375000357627869,
682
+ "rewards/chosen": -1.0506477355957031,
683
+ "rewards/margins": 2.8269805908203125,
684
+ "rewards/rejected": -3.8776283264160156,
685
+ "step": 450
686
+ },
687
+ {
688
+ "epoch": 0.49066666666666664,
689
+ "grad_norm": 27.21924736505641,
690
+ "learning_rate": 6.052904625621555e-07,
691
+ "logits/chosen": 1.4608548879623413,
692
+ "logits/rejected": 1.7432273626327515,
693
+ "logps/chosen": -138.21566772460938,
694
+ "logps/rejected": -167.56101989746094,
695
+ "loss": 0.3906,
696
+ "rewards/accuracies": 0.8343750238418579,
697
+ "rewards/chosen": -0.971255362033844,
698
+ "rewards/margins": 2.769684314727783,
699
+ "rewards/rejected": -3.7409396171569824,
700
+ "step": 460
701
+ },
702
+ {
703
+ "epoch": 0.5013333333333333,
704
+ "grad_norm": 23.667407037284928,
705
+ "learning_rate": 5.870277323235871e-07,
706
+ "logits/chosen": 1.3296135663986206,
707
+ "logits/rejected": 1.8589414358139038,
708
+ "logps/chosen": -145.2051239013672,
709
+ "logps/rejected": -172.4295196533203,
710
+ "loss": 0.3328,
711
+ "rewards/accuracies": 0.8593749403953552,
712
+ "rewards/chosen": -0.9092783331871033,
713
+ "rewards/margins": 2.5038325786590576,
714
+ "rewards/rejected": -3.4131109714508057,
715
+ "step": 470
716
+ },
717
+ {
718
+ "epoch": 0.512,
719
+ "grad_norm": 32.4308081410924,
720
+ "learning_rate": 5.686444367728494e-07,
721
+ "logits/chosen": 1.3602862358093262,
722
+ "logits/rejected": 1.752657413482666,
723
+ "logps/chosen": -137.1173858642578,
724
+ "logps/rejected": -167.48741149902344,
725
+ "loss": 0.3152,
726
+ "rewards/accuracies": 0.8812499642372131,
727
+ "rewards/chosen": -0.6123771071434021,
728
+ "rewards/margins": 2.6645078659057617,
729
+ "rewards/rejected": -3.2768850326538086,
730
+ "step": 480
731
+ },
732
+ {
733
+ "epoch": 0.5226666666666666,
734
+ "grad_norm": 22.650259255481703,
735
+ "learning_rate": 5.50166043513287e-07,
736
+ "logits/chosen": 1.3106579780578613,
737
+ "logits/rejected": 1.7208105325698853,
738
+ "logps/chosen": -132.9499969482422,
739
+ "logps/rejected": -168.179443359375,
740
+ "loss": 0.3225,
741
+ "rewards/accuracies": 0.8250000476837158,
742
+ "rewards/chosen": -0.5453524589538574,
743
+ "rewards/margins": 2.830275058746338,
744
+ "rewards/rejected": -3.3756275177001953,
745
+ "step": 490
746
+ },
747
+ {
748
+ "epoch": 0.5333333333333333,
749
+ "grad_norm": 34.87144459717604,
750
+ "learning_rate": 5.316181518934318e-07,
751
+ "logits/chosen": 1.4427666664123535,
752
+ "logits/rejected": 1.6337779760360718,
753
+ "logps/chosen": -131.83047485351562,
754
+ "logps/rejected": -163.92172241210938,
755
+ "loss": 0.3208,
756
+ "rewards/accuracies": 0.8531249761581421,
757
+ "rewards/chosen": -0.47837477922439575,
758
+ "rewards/margins": 2.7933568954467773,
759
+ "rewards/rejected": -3.2717316150665283,
760
+ "step": 500
761
+ },
762
+ {
763
+ "epoch": 0.544,
764
+ "grad_norm": 31.52951767510459,
765
+ "learning_rate": 5.130264575425224e-07,
766
+ "logits/chosen": 1.4013196229934692,
767
+ "logits/rejected": 1.6610275506973267,
768
+ "logps/chosen": -132.53924560546875,
769
+ "logps/rejected": -167.6267547607422,
770
+ "loss": 0.272,
771
+ "rewards/accuracies": 0.8593749403953552,
772
+ "rewards/chosen": -0.5326389074325562,
773
+ "rewards/margins": 2.8615810871124268,
774
+ "rewards/rejected": -3.3942196369171143,
775
+ "step": 510
776
+ },
777
+ {
778
+ "epoch": 0.5546666666666666,
779
+ "grad_norm": 29.37500979396824,
780
+ "learning_rate": 4.944167167726367e-07,
781
+ "logits/chosen": 1.4267271757125854,
782
+ "logits/rejected": 1.6526139974594116,
783
+ "logps/chosen": -133.26963806152344,
784
+ "logps/rejected": -173.13449096679688,
785
+ "loss": 0.3273,
786
+ "rewards/accuracies": 0.8375000357627869,
787
+ "rewards/chosen": -0.6281304955482483,
788
+ "rewards/margins": 3.023805856704712,
789
+ "rewards/rejected": -3.6519365310668945,
790
+ "step": 520
791
+ },
792
+ {
793
+ "epoch": 0.5653333333333334,
794
+ "grad_norm": 45.4295336560468,
795
+ "learning_rate": 4.758147108967584e-07,
796
+ "logits/chosen": 1.5470153093338013,
797
+ "logits/rejected": 1.7743972539901733,
798
+ "logps/chosen": -150.69190979003906,
799
+ "logps/rejected": -186.68701171875,
800
+ "loss": 0.3734,
801
+ "rewards/accuracies": 0.8281250596046448,
802
+ "rewards/chosen": -1.2518389225006104,
803
+ "rewards/margins": 2.885488271713257,
804
+ "rewards/rejected": -4.137327194213867,
805
+ "step": 530
806
+ },
807
+ {
808
+ "epoch": 0.576,
809
+ "grad_norm": 21.855665145983874,
810
+ "learning_rate": 4.572462105122077e-07,
811
+ "logits/chosen": 1.3561698198318481,
812
+ "logits/rejected": 1.7270151376724243,
813
+ "logps/chosen": -139.5657501220703,
814
+ "logps/rejected": -175.99508666992188,
815
+ "loss": 0.2833,
816
+ "rewards/accuracies": 0.8812500238418579,
817
+ "rewards/chosen": -1.112634301185608,
818
+ "rewards/margins": 3.2117066383361816,
819
+ "rewards/rejected": -4.3243408203125,
820
+ "step": 540
821
+ },
822
+ {
823
+ "epoch": 0.5866666666666667,
824
+ "grad_norm": 24.367461053001698,
825
+ "learning_rate": 4.3873693979891696e-07,
826
+ "logits/chosen": 1.4616541862487793,
827
+ "logits/rejected": 1.7883496284484863,
828
+ "logps/chosen": -140.05160522460938,
829
+ "logps/rejected": -181.77622985839844,
830
+ "loss": 0.2513,
831
+ "rewards/accuracies": 0.9062500596046448,
832
+ "rewards/chosen": -0.8183116912841797,
833
+ "rewards/margins": 3.3123300075531006,
834
+ "rewards/rejected": -4.130641937255859,
835
+ "step": 550
836
+ },
837
+ {
838
+ "epoch": 0.5973333333333334,
839
+ "grad_norm": 23.757622976541885,
840
+ "learning_rate": 4.203125408820105e-07,
841
+ "logits/chosen": 1.4851560592651367,
842
+ "logits/rejected": 1.794302225112915,
843
+ "logps/chosen": -134.1141357421875,
844
+ "logps/rejected": -170.49234008789062,
845
+ "loss": 0.3149,
846
+ "rewards/accuracies": 0.8843750357627869,
847
+ "rewards/chosen": -0.9360587000846863,
848
+ "rewards/margins": 2.853070020675659,
849
+ "rewards/rejected": -3.7891287803649902,
850
+ "step": 560
851
+ },
852
+ {
853
+ "epoch": 0.608,
854
+ "grad_norm": 21.213296976158986,
855
+ "learning_rate": 4.019985383080632e-07,
856
+ "logits/chosen": 1.237152338027954,
857
+ "logits/rejected": 1.7590258121490479,
858
+ "logps/chosen": -138.93270874023438,
859
+ "logps/rejected": -160.0484619140625,
860
+ "loss": 0.3722,
861
+ "rewards/accuracies": 0.856249988079071,
862
+ "rewards/chosen": -0.7550384998321533,
863
+ "rewards/margins": 2.6985340118408203,
864
+ "rewards/rejected": -3.4535727500915527,
865
+ "step": 570
866
+ },
867
+ {
868
+ "epoch": 0.6186666666666667,
869
+ "grad_norm": 26.294885755983195,
870
+ "learning_rate": 3.8382030368424454e-07,
871
+ "logits/chosen": 1.3558965921401978,
872
+ "logits/rejected": 1.6953362226486206,
873
+ "logps/chosen": -137.65817260742188,
874
+ "logps/rejected": -170.1033477783203,
875
+ "loss": 0.3269,
876
+ "rewards/accuracies": 0.8531249761581421,
877
+ "rewards/chosen": -0.6933549046516418,
878
+ "rewards/margins": 2.7456483840942383,
879
+ "rewards/rejected": -3.4390032291412354,
880
+ "step": 580
881
+ },
882
+ {
883
+ "epoch": 0.6293333333333333,
884
+ "grad_norm": 33.07924807089865,
885
+ "learning_rate": 3.6580302052934297e-07,
886
+ "logits/chosen": 1.2663443088531494,
887
+ "logits/rejected": 1.5939358472824097,
888
+ "logps/chosen": -132.91456604003906,
889
+ "logps/rejected": -162.4006805419922,
890
+ "loss": 0.331,
891
+ "rewards/accuracies": 0.875,
892
+ "rewards/chosen": -0.6310861706733704,
893
+ "rewards/margins": 2.8310067653656006,
894
+ "rewards/rejected": -3.4620931148529053,
895
+ "step": 590
896
+ },
897
+ {
898
+ "epoch": 0.64,
899
+ "grad_norm": 20.115198065979413,
900
+ "learning_rate": 3.479716493853611e-07,
901
+ "logits/chosen": 1.18405020236969,
902
+ "logits/rejected": 1.536871075630188,
903
+ "logps/chosen": -131.33584594726562,
904
+ "logps/rejected": -168.03636169433594,
905
+ "loss": 0.2805,
906
+ "rewards/accuracies": 0.8812500238418579,
907
+ "rewards/chosen": -0.6372289061546326,
908
+ "rewards/margins": 3.1736860275268555,
909
+ "rewards/rejected": -3.810914993286133,
910
+ "step": 600
911
+ },
912
+ {
913
+ "epoch": 0.6506666666666666,
914
+ "grad_norm": 19.273605121929624,
915
+ "learning_rate": 3.303508932380132e-07,
916
+ "logits/chosen": 1.2737094163894653,
917
+ "logits/rejected": 1.6445541381835938,
918
+ "logps/chosen": -136.96719360351562,
919
+ "logps/rejected": -174.71083068847656,
920
+ "loss": 0.3445,
921
+ "rewards/accuracies": 0.8562500476837158,
922
+ "rewards/chosen": -0.6345027685165405,
923
+ "rewards/margins": 2.950732946395874,
924
+ "rewards/rejected": -3.585235834121704,
925
+ "step": 610
926
+ },
927
+ {
928
+ "epoch": 0.6613333333333333,
929
+ "grad_norm": 30.667064690871456,
930
+ "learning_rate": 3.129651632940362e-07,
931
+ "logits/chosen": 1.292923927307129,
932
+ "logits/rejected": 1.5391473770141602,
933
+ "logps/chosen": -141.96238708496094,
934
+ "logps/rejected": -173.04873657226562,
935
+ "loss": 0.2906,
936
+ "rewards/accuracies": 0.887499988079071,
937
+ "rewards/chosen": -0.597599983215332,
938
+ "rewards/margins": 3.03706955909729,
939
+ "rewards/rejected": -3.634669303894043,
940
+ "step": 620
941
+ },
942
+ {
943
+ "epoch": 0.672,
944
+ "grad_norm": 25.676210849380702,
945
+ "learning_rate": 2.958385451627181e-07,
946
+ "logits/chosen": 1.231809139251709,
947
+ "logits/rejected": 1.600536823272705,
948
+ "logps/chosen": -139.60745239257812,
949
+ "logps/rejected": -178.7290496826172,
950
+ "loss": 0.2972,
951
+ "rewards/accuracies": 0.871874988079071,
952
+ "rewards/chosen": -0.7131984829902649,
953
+ "rewards/margins": 3.2930028438568115,
954
+ "rewards/rejected": -4.006201267242432,
955
+ "step": 630
956
+ },
957
+ {
958
+ "epoch": 0.6826666666666666,
959
+ "grad_norm": 30.707834181954617,
960
+ "learning_rate": 2.7899476548850043e-07,
961
+ "logits/chosen": 1.299914836883545,
962
+ "logits/rejected": 1.624165415763855,
963
+ "logps/chosen": -139.61639404296875,
964
+ "logps/rejected": -172.74610900878906,
965
+ "loss": 0.3445,
966
+ "rewards/accuracies": 0.8500000834465027,
967
+ "rewards/chosen": -0.7801929116249084,
968
+ "rewards/margins": 2.922720432281494,
969
+ "rewards/rejected": -3.702913284301758,
970
+ "step": 640
971
+ },
972
+ {
973
+ "epoch": 0.6933333333333334,
974
+ "grad_norm": 26.04525358558716,
975
+ "learning_rate": 2.6245715908087804e-07,
976
+ "logits/chosen": 1.3557870388031006,
977
+ "logits/rejected": 1.6440367698669434,
978
+ "logps/chosen": -138.72109985351562,
979
+ "logps/rejected": -166.85986328125,
980
+ "loss": 0.3891,
981
+ "rewards/accuracies": 0.8375000357627869,
982
+ "rewards/chosen": -0.8479130864143372,
983
+ "rewards/margins": 2.6863441467285156,
984
+ "rewards/rejected": -3.534257173538208,
985
+ "step": 650
986
+ },
987
+ {
988
+ "epoch": 0.704,
989
+ "grad_norm": 38.60539175189378,
990
+ "learning_rate": 2.462486365871338e-07,
991
+ "logits/chosen": 1.117995262145996,
992
+ "logits/rejected": 1.5660450458526611,
993
+ "logps/chosen": -139.3802490234375,
994
+ "logps/rejected": -170.3036346435547,
995
+ "loss": 0.3707,
996
+ "rewards/accuracies": 0.84375,
997
+ "rewards/chosen": -0.7362462878227234,
998
+ "rewards/margins": 2.719696521759033,
999
+ "rewards/rejected": -3.4559431076049805,
1000
+ "step": 660
1001
+ },
1002
+ {
1003
+ "epoch": 0.7146666666666667,
1004
+ "grad_norm": 29.88817411661078,
1005
+ "learning_rate": 2.3039165275269214e-07,
1006
+ "logits/chosen": 1.2235151529312134,
1007
+ "logits/rejected": 1.5882010459899902,
1008
+ "logps/chosen": -133.53123474121094,
1009
+ "logps/rejected": -165.77561950683594,
1010
+ "loss": 0.3481,
1011
+ "rewards/accuracies": 0.84375,
1012
+ "rewards/chosen": -0.7423657178878784,
1013
+ "rewards/margins": 2.9302687644958496,
1014
+ "rewards/rejected": -3.6726343631744385,
1015
+ "step": 670
1016
+ },
1017
+ {
1018
+ "epoch": 0.7253333333333334,
1019
+ "grad_norm": 26.689941208436807,
1020
+ "learning_rate": 2.1490817531306775e-07,
1021
+ "logits/chosen": 1.199167013168335,
1022
+ "logits/rejected": 1.4307794570922852,
1023
+ "logps/chosen": -138.82012939453125,
1024
+ "logps/rejected": -172.9408416748047,
1025
+ "loss": 0.3102,
1026
+ "rewards/accuracies": 0.8812500238418579,
1027
+ "rewards/chosen": -0.6661445498466492,
1028
+ "rewards/margins": 3.147897958755493,
1029
+ "rewards/rejected": -3.814042329788208,
1030
+ "step": 680
1031
+ },
1032
+ {
1033
+ "epoch": 0.736,
1034
+ "grad_norm": 26.51411323153538,
1035
+ "learning_rate": 1.9981965456049598e-07,
1036
+ "logits/chosen": 1.2847270965576172,
1037
+ "logits/rejected": 1.527156949043274,
1038
+ "logps/chosen": -144.3943634033203,
1039
+ "logps/rejected": -172.289306640625,
1040
+ "loss": 0.3899,
1041
+ "rewards/accuracies": 0.831250011920929,
1042
+ "rewards/chosen": -0.9222854971885681,
1043
+ "rewards/margins": 2.454240322113037,
1044
+ "rewards/rejected": -3.376525640487671,
1045
+ "step": 690
1046
+ },
1047
+ {
1048
+ "epoch": 0.7466666666666667,
1049
+ "grad_norm": 28.227088451077186,
1050
+ "learning_rate": 1.8514699362741738e-07,
1051
+ "logits/chosen": 1.2821115255355835,
1052
+ "logits/rejected": 1.5955793857574463,
1053
+ "logps/chosen": -143.79371643066406,
1054
+ "logps/rejected": -168.42379760742188,
1055
+ "loss": 0.2676,
1056
+ "rewards/accuracies": 0.890625,
1057
+ "rewards/chosen": -0.8676904439926147,
1058
+ "rewards/margins": 3.0038087368011475,
1059
+ "rewards/rejected": -3.8714990615844727,
1060
+ "step": 700
1061
+ },
1062
+ {
1063
+ "epoch": 0.7573333333333333,
1064
+ "grad_norm": 33.39329635319409,
1065
+ "learning_rate": 1.7091051952797402e-07,
1066
+ "logits/chosen": 1.2979816198349,
1067
+ "logits/rejected": 1.5841856002807617,
1068
+ "logps/chosen": -140.95835876464844,
1069
+ "logps/rejected": -180.0380401611328,
1070
+ "loss": 0.2788,
1071
+ "rewards/accuracies": 0.878125011920929,
1072
+ "rewards/chosen": -0.6436134576797485,
1073
+ "rewards/margins": 3.160693645477295,
1074
+ "rewards/rejected": -3.804307222366333,
1075
+ "step": 710
1076
+ },
1077
+ {
1078
+ "epoch": 0.768,
1079
+ "grad_norm": 29.663583912080895,
1080
+ "learning_rate": 1.571299549976456e-07,
1081
+ "logits/chosen": 1.376884937286377,
1082
+ "logits/rejected": 1.7266371250152588,
1083
+ "logps/chosen": -134.19406127929688,
1084
+ "logps/rejected": -173.26148986816406,
1085
+ "loss": 0.3333,
1086
+ "rewards/accuracies": 0.846875011920929,
1087
+ "rewards/chosen": -0.7068169116973877,
1088
+ "rewards/margins": 2.9150471687316895,
1089
+ "rewards/rejected": -3.6218643188476562,
1090
+ "step": 720
1091
+ },
1092
+ {
1093
+ "epoch": 0.7786666666666666,
1094
+ "grad_norm": 23.891419469257354,
1095
+ "learning_rate": 1.4382439117002936e-07,
1096
+ "logits/chosen": 1.2695218324661255,
1097
+ "logits/rejected": 1.564518928527832,
1098
+ "logps/chosen": -136.83729553222656,
1099
+ "logps/rejected": -178.53765869140625,
1100
+ "loss": 0.3344,
1101
+ "rewards/accuracies": 0.8281250596046448,
1102
+ "rewards/chosen": -0.796572744846344,
1103
+ "rewards/margins": 2.8927884101867676,
1104
+ "rewards/rejected": -3.689361095428467,
1105
+ "step": 730
1106
+ },
1107
+ {
1108
+ "epoch": 0.7893333333333333,
1109
+ "grad_norm": 23.85877560834035,
1110
+ "learning_rate": 1.310122611286223e-07,
1111
+ "logits/chosen": 1.2756718397140503,
1112
+ "logits/rejected": 1.7924617528915405,
1113
+ "logps/chosen": -140.0908203125,
1114
+ "logps/rejected": -170.6318817138672,
1115
+ "loss": 0.3521,
1116
+ "rewards/accuracies": 0.831250011920929,
1117
+ "rewards/chosen": -0.7002599239349365,
1118
+ "rewards/margins": 3.0346288681030273,
1119
+ "rewards/rejected": -3.734888792037964,
1120
+ "step": 740
1121
+ },
1122
+ {
1123
+ "epoch": 0.8,
1124
+ "grad_norm": 31.266912743625518,
1125
+ "learning_rate": 1.187113143702429e-07,
1126
+ "logits/chosen": 1.1684255599975586,
1127
+ "logits/rejected": 1.6504334211349487,
1128
+ "logps/chosen": -138.86920166015625,
1129
+ "logps/rejected": -172.11294555664062,
1130
+ "loss": 0.3586,
1131
+ "rewards/accuracies": 0.8625000715255737,
1132
+ "rewards/chosen": -0.6606683135032654,
1133
+ "rewards/margins": 2.9746954441070557,
1134
+ "rewards/rejected": -3.6353635787963867,
1135
+ "step": 750
1136
+ },
1137
+ {
1138
+ "epoch": 0.8106666666666666,
1139
+ "grad_norm": 27.45969382970204,
1140
+ "learning_rate": 1.0693859221547113e-07,
1141
+ "logits/chosen": 1.2002224922180176,
1142
+ "logits/rejected": 1.4490042924880981,
1143
+ "logps/chosen": -135.4897918701172,
1144
+ "logps/rejected": -168.1680908203125,
1145
+ "loss": 0.3814,
1146
+ "rewards/accuracies": 0.862500011920929,
1147
+ "rewards/chosen": -0.8046519160270691,
1148
+ "rewards/margins": 2.8222813606262207,
1149
+ "rewards/rejected": -3.6269333362579346,
1150
+ "step": 760
1151
+ },
1152
+ {
1153
+ "epoch": 0.8213333333333334,
1154
+ "grad_norm": 27.34194048181446,
1155
+ "learning_rate": 9.571040420017323e-08,
1156
+ "logits/chosen": 1.2844430208206177,
1157
+ "logits/rejected": 1.5368437767028809,
1158
+ "logps/chosen": -137.79006958007812,
1159
+ "logps/rejected": -180.3217010498047,
1160
+ "loss": 0.2863,
1161
+ "rewards/accuracies": 0.8718750476837158,
1162
+ "rewards/chosen": -0.5950286388397217,
1163
+ "rewards/margins": 3.1103334426879883,
1164
+ "rewards/rejected": -3.705361843109131,
1165
+ "step": 770
1166
+ },
1167
+ {
1168
+ "epoch": 0.832,
1169
+ "grad_norm": 22.71500177829076,
1170
+ "learning_rate": 8.504230548081498e-08,
1171
+ "logits/chosen": 1.1534736156463623,
1172
+ "logits/rejected": 1.5489188432693481,
1173
+ "logps/chosen": -135.84356689453125,
1174
+ "logps/rejected": -163.74789428710938,
1175
+ "loss": 0.3245,
1176
+ "rewards/accuracies": 0.8406250476837158,
1177
+ "rewards/chosen": -0.6919512152671814,
1178
+ "rewards/margins": 2.70729398727417,
1179
+ "rewards/rejected": -3.399244785308838,
1180
+ "step": 780
1181
+ },
1182
+ {
1183
+ "epoch": 0.8426666666666667,
1184
+ "grad_norm": 33.585488423583136,
1185
+ "learning_rate": 7.494907528486799e-08,
1186
+ "logits/chosen": 1.1899046897888184,
1187
+ "logits/rejected": 1.558382272720337,
1188
+ "logps/chosen": -144.0318603515625,
1189
+ "logps/rejected": -173.2529754638672,
1190
+ "loss": 0.2885,
1191
+ "rewards/accuracies": 0.8687500357627869,
1192
+ "rewards/chosen": -0.45610812306404114,
1193
+ "rewards/margins": 2.986624240875244,
1194
+ "rewards/rejected": -3.442732572555542,
1195
+ "step": 790
1196
+ },
1197
+ {
1198
+ "epoch": 0.8533333333333334,
1199
+ "grad_norm": 23.51906755223801,
1200
+ "learning_rate": 6.54446964361619e-08,
1201
+ "logits/chosen": 1.3951364755630493,
1202
+ "logits/rejected": 1.5346930027008057,
1203
+ "logps/chosen": -135.73672485351562,
1204
+ "logps/rejected": -177.74282836914062,
1205
+ "loss": 0.3352,
1206
+ "rewards/accuracies": 0.84375,
1207
+ "rewards/chosen": -0.6097190380096436,
1208
+ "rewards/margins": 2.9158732891082764,
1209
+ "rewards/rejected": -3.52559232711792,
1210
+ "step": 800
1211
+ },
1212
+ {
1213
+ "epoch": 0.864,
1214
+ "grad_norm": 18.099767999889643,
1215
+ "learning_rate": 5.6542335983547515e-08,
1216
+ "logits/chosen": 1.0958290100097656,
1217
+ "logits/rejected": 1.7642685174942017,
1218
+ "logps/chosen": -137.59483337402344,
1219
+ "logps/rejected": -159.05628967285156,
1220
+ "loss": 0.3021,
1221
+ "rewards/accuracies": 0.8468749523162842,
1222
+ "rewards/chosen": -0.548262357711792,
1223
+ "rewards/margins": 2.8460323810577393,
1224
+ "rewards/rejected": -3.3942949771881104,
1225
+ "step": 810
1226
+ },
1227
+ {
1228
+ "epoch": 0.8746666666666667,
1229
+ "grad_norm": 22.736224970421958,
1230
+ "learning_rate": 4.8254326959706714e-08,
1231
+ "logits/chosen": 1.1229872703552246,
1232
+ "logits/rejected": 1.6853474378585815,
1233
+ "logps/chosen": -139.9053497314453,
1234
+ "logps/rejected": -174.4130401611328,
1235
+ "loss": 0.3367,
1236
+ "rewards/accuracies": 0.856249988079071,
1237
+ "rewards/chosen": -0.6694452166557312,
1238
+ "rewards/margins": 2.701251268386841,
1239
+ "rewards/rejected": -3.370696544647217,
1240
+ "step": 820
1241
+ },
1242
+ {
1243
+ "epoch": 0.8853333333333333,
1244
+ "grad_norm": 28.380431064935998,
1245
+ "learning_rate": 4.059215129538246e-08,
1246
+ "logits/chosen": 1.2847788333892822,
1247
+ "logits/rejected": 1.3838608264923096,
1248
+ "logps/chosen": -139.0540313720703,
1249
+ "logps/rejected": -172.7044677734375,
1250
+ "loss": 0.3149,
1251
+ "rewards/accuracies": 0.862500011920929,
1252
+ "rewards/chosen": -0.6882542371749878,
1253
+ "rewards/margins": 2.8262672424316406,
1254
+ "rewards/rejected": -3.514521360397339,
1255
+ "step": 830
1256
+ },
1257
+ {
1258
+ "epoch": 0.896,
1259
+ "grad_norm": 24.47450186333891,
1260
+ "learning_rate": 3.3566423912694045e-08,
1261
+ "logits/chosen": 1.3405706882476807,
1262
+ "logits/rejected": 1.4811562299728394,
1263
+ "logps/chosen": -142.65008544921875,
1264
+ "logps/rejected": -186.91891479492188,
1265
+ "loss": 0.3103,
1266
+ "rewards/accuracies": 0.8750000596046448,
1267
+ "rewards/chosen": -0.6546552181243896,
1268
+ "rewards/margins": 3.121577024459839,
1269
+ "rewards/rejected": -3.7762320041656494,
1270
+ "step": 840
1271
+ },
1272
+ {
1273
+ "epoch": 0.9066666666666666,
1274
+ "grad_norm": 26.10582659575781,
1275
+ "learning_rate": 2.7186878019580194e-08,
1276
+ "logits/chosen": 1.2894738912582397,
1277
+ "logits/rejected": 1.5952208042144775,
1278
+ "logps/chosen": -134.9425048828125,
1279
+ "logps/rejected": -180.87374877929688,
1280
+ "loss": 0.2955,
1281
+ "rewards/accuracies": 0.887499988079071,
1282
+ "rewards/chosen": -0.5013420581817627,
1283
+ "rewards/margins": 2.893864154815674,
1284
+ "rewards/rejected": -3.3952059745788574,
1285
+ "step": 850
1286
+ },
1287
+ {
1288
+ "epoch": 0.9173333333333333,
1289
+ "grad_norm": 16.69269334633985,
1290
+ "learning_rate": 2.1462351625736673e-08,
1291
+ "logits/chosen": 1.1675758361816406,
1292
+ "logits/rejected": 1.6442891359329224,
1293
+ "logps/chosen": -130.08786010742188,
1294
+ "logps/rejected": -170.487548828125,
1295
+ "loss": 0.2872,
1296
+ "rewards/accuracies": 0.871874988079071,
1297
+ "rewards/chosen": -0.6101741790771484,
1298
+ "rewards/margins": 3.180131196975708,
1299
+ "rewards/rejected": -3.7903053760528564,
1300
+ "step": 860
1301
+ },
1302
+ {
1303
+ "epoch": 0.928,
1304
+ "grad_norm": 27.109723201805032,
1305
+ "learning_rate": 1.6400775298734015e-08,
1306
+ "logits/chosen": 1.1326353549957275,
1307
+ "logits/rejected": 1.4924323558807373,
1308
+ "logps/chosen": -130.98184204101562,
1309
+ "logps/rejected": -169.8687286376953,
1310
+ "loss": 0.2554,
1311
+ "rewards/accuracies": 0.890625,
1312
+ "rewards/chosen": -0.5216690301895142,
1313
+ "rewards/margins": 3.1709208488464355,
1314
+ "rewards/rejected": -3.6925899982452393,
1315
+ "step": 870
1316
+ },
1317
+ {
1318
+ "epoch": 0.9386666666666666,
1319
+ "grad_norm": 14.828413315706442,
1320
+ "learning_rate": 1.200916117727374e-08,
1321
+ "logits/chosen": 0.9768412113189697,
1322
+ "logits/rejected": 1.5602909326553345,
1323
+ "logps/chosen": -140.74815368652344,
1324
+ "logps/rejected": -172.26649475097656,
1325
+ "loss": 0.2573,
1326
+ "rewards/accuracies": 0.9000000357627869,
1327
+ "rewards/chosen": -0.5175585746765137,
1328
+ "rewards/margins": 3.088252544403076,
1329
+ "rewards/rejected": -3.605811595916748,
1330
+ "step": 880
1331
+ },
1332
+ {
1333
+ "epoch": 0.9493333333333334,
1334
+ "grad_norm": 26.86192866644961,
1335
+ "learning_rate": 8.293593256805842e-09,
1336
+ "logits/chosen": 1.1421420574188232,
1337
+ "logits/rejected": 1.6538828611373901,
1338
+ "logps/chosen": -141.6370086669922,
1339
+ "logps/rejected": -169.20147705078125,
1340
+ "loss": 0.3091,
1341
+ "rewards/accuracies": 0.8593750596046448,
1342
+ "rewards/chosen": -0.536719799041748,
1343
+ "rewards/margins": 3.0636281967163086,
1344
+ "rewards/rejected": -3.6003482341766357,
1345
+ "step": 890
1346
+ },
1347
+ {
1348
+ "epoch": 0.96,
1349
+ "grad_norm": 19.615598554875728,
1350
+ "learning_rate": 5.2592189609648726e-09,
1351
+ "logits/chosen": 1.0914634466171265,
1352
+ "logits/rejected": 1.4714587926864624,
1353
+ "logps/chosen": -131.58399963378906,
1354
+ "logps/rejected": -164.37319946289062,
1355
+ "loss": 0.3104,
1356
+ "rewards/accuracies": 0.856249988079071,
1357
+ "rewards/chosen": -0.6866143345832825,
1358
+ "rewards/margins": 2.836606025695801,
1359
+ "rewards/rejected": -3.5232203006744385,
1360
+ "step": 900
1361
+ },
1362
+ {
1363
+ "epoch": 0.9706666666666667,
1364
+ "grad_norm": 16.989834579672234,
1365
+ "learning_rate": 2.910242010500996e-09,
1366
+ "logits/chosen": 1.3143634796142578,
1367
+ "logits/rejected": 1.5896141529083252,
1368
+ "logps/chosen": -136.51361083984375,
1369
+ "logps/rejected": -165.6531524658203,
1370
+ "loss": 0.3146,
1371
+ "rewards/accuracies": 0.8343749642372131,
1372
+ "rewards/chosen": -0.6686439514160156,
1373
+ "rewards/margins": 2.7611334323883057,
1374
+ "rewards/rejected": -3.4297773838043213,
1375
+ "step": 910
1376
+ },
1377
+ {
1378
+ "epoch": 0.9813333333333333,
1379
+ "grad_norm": 27.194483822978505,
1380
+ "learning_rate": 1.249916599585954e-09,
1381
+ "logits/chosen": 1.2227872610092163,
1382
+ "logits/rejected": 1.5990573167800903,
1383
+ "logps/chosen": -140.06248474121094,
1384
+ "logps/rejected": -176.98333740234375,
1385
+ "loss": 0.2738,
1386
+ "rewards/accuracies": 0.875,
1387
+ "rewards/chosen": -0.6044124960899353,
1388
+ "rewards/margins": 3.01471209526062,
1389
+ "rewards/rejected": -3.619124412536621,
1390
+ "step": 920
1391
+ },
1392
+ {
1393
+ "epoch": 0.992,
1394
+ "grad_norm": 18.707222551914008,
1395
+ "learning_rate": 2.8054288756129696e-10,
1396
+ "logits/chosen": 1.316845417022705,
1397
+ "logits/rejected": 1.5853126049041748,
1398
+ "logps/chosen": -131.83575439453125,
1399
+ "logps/rejected": -169.5233917236328,
1400
+ "loss": 0.3086,
1401
+ "rewards/accuracies": 0.8687500357627869,
1402
+ "rewards/chosen": -0.5091441869735718,
1403
+ "rewards/margins": 2.9370408058166504,
1404
+ "rewards/rejected": -3.4461851119995117,
1405
+ "step": 930
1406
+ }
1407
+ ],
1408
+ "logging_steps": 10,
1409
+ "max_steps": 938,
1410
+ "num_input_tokens_seen": 0,
1411
+ "num_train_epochs": 1,
1412
+ "save_steps": 1000,
1413
+ "stateful_callbacks": {
1414
+ "TrainerControl": {
1415
+ "args": {
1416
+ "should_epoch_stop": false,
1417
+ "should_evaluate": false,
1418
+ "should_log": false,
1419
+ "should_save": true,
1420
+ "should_training_stop": true
1421
+ },
1422
+ "attributes": {}
1423
+ }
1424
+ },
1425
+ "total_flos": 111524967546880.0,
1426
+ "train_batch_size": 1,
1427
+ "trial_name": null,
1428
+ "trial_params": null
1429
+ }
training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1d0e758b07493d4ad7dfbfa864847bd388c1eebb974a6656aa3ab444c0baa82f
3
+ size 8017
video_preprocessor_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "crop_size": null,
3
+ "data_format": "channels_first",
4
+ "default_to_square": true,
5
+ "device": null,
6
+ "do_center_crop": null,
7
+ "do_convert_rgb": true,
8
+ "do_normalize": true,
9
+ "do_pad": null,
10
+ "do_rescale": true,
11
+ "do_resize": true,
12
+ "do_sample_frames": false,
13
+ "fps": null,
14
+ "image_mean": [
15
+ 0.48145466,
16
+ 0.4578275,
17
+ 0.40821073
18
+ ],
19
+ "image_std": [
20
+ 0.26862954,
21
+ 0.26130258,
22
+ 0.27577711
23
+ ],
24
+ "input_data_format": null,
25
+ "max_frames": 768,
26
+ "max_pixels": 12845056,
27
+ "merge_size": 2,
28
+ "min_frames": 4,
29
+ "min_pixels": 3136,
30
+ "num_frames": null,
31
+ "patch_size": 14,
32
+ "processor_class": "Qwen2_5_VLProcessor",
33
+ "resample": 3,
34
+ "rescale_factor": 0.00392156862745098,
35
+ "size": {
36
+ "longest_edge": 12845056,
37
+ "shortest_edge": 3136
38
+ },
39
+ "size_divisor": null,
40
+ "temporal_patch_size": 2,
41
+ "video_metadata": null,
42
+ "video_processor_type": "Qwen2VLVideoProcessor"
43
+ }
vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
zero_to_fp32.py ADDED
@@ -0,0 +1,760 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+
3
+ # Copyright (c) Microsoft Corporation.
4
+ # SPDX-License-Identifier: Apache-2.0
5
+
6
+ # DeepSpeed Team
7
+
8
+ # This script extracts fp32 consolidated weights from a zero 1, 2 and 3 DeepSpeed checkpoints. It gets
9
+ # copied into the top level checkpoint dir, so the user can easily do the conversion at any point in
10
+ # the future. Once extracted, the weights don't require DeepSpeed and can be used in any
11
+ # application.
12
+ #
13
+ # example:
14
+ # python zero_to_fp32.py . output_dir/
15
+ # or
16
+ # python zero_to_fp32.py . output_dir/ --safe_serialization
17
+
18
+ import argparse
19
+ import torch
20
+ import glob
21
+ import math
22
+ import os
23
+ import re
24
+ import gc
25
+ import json
26
+ import numpy as np
27
+ from tqdm import tqdm
28
+ from collections import OrderedDict
29
+ from dataclasses import dataclass
30
+
31
+ # while this script doesn't use deepspeed to recover data, since the checkpoints are pickled with
32
+ # DeepSpeed data structures it has to be available in the current python environment.
33
+ from deepspeed.utils import logger
34
+ from deepspeed.checkpoint.constants import (DS_VERSION, OPTIMIZER_STATE_DICT, SINGLE_PARTITION_OF_FP32_GROUPS,
35
+ FP32_FLAT_GROUPS, ZERO_STAGE, PARTITION_COUNT, PARAM_SHAPES, BUFFER_NAMES,
36
+ FROZEN_PARAM_SHAPES, FROZEN_PARAM_FRAGMENTS)
37
+
38
+
39
@dataclass
class zero_model_state:
    """Metadata recovered from a single ``*_model_states.pt`` checkpoint file.

    Instances are built by ``parse_model_states``; the annotations are
    informational only (dataclasses do not enforce them).
    """
    # buffer tensors keyed by name, already cast to fp32 by the parser
    buffers: dict
    # one ``{param_name: shape}`` mapping per optimizer param group
    param_shapes: list
    # ``[key, value]`` pairs copied from the checkpoint's "shared_params" mapping
    shared_params: list
    # value stored under DS_VERSION in the checkpoint; None when the key is absent
    ds_version: int
    # ``{param_name: shape}`` for frozen params; None when the checkpoint has none
    frozen_param_shapes: dict
    # ``{param_name: tensor}`` for frozen params; None when the checkpoint has none
    frozen_param_fragments: dict
47
+
48
+
49
+ debug = 0
50
+
51
+ # load to cpu
52
+ device = torch.device('cpu')
53
+
54
+
55
def atoi(text):
    """Return ``int(text)`` when ``text`` is a run of decimal digits, else ``text`` unchanged.

    ``str.isdecimal`` is used rather than ``str.isdigit`` because the latter is
    also true for characters such as superscript digits, which ``int()``
    rejects with ``ValueError``.
    """
    return int(text) if text.isdecimal() else text
57
+
58
+
59
def natural_keys(text):
    '''
    alist.sort(key=natural_keys) sorts in human order
    http://nedbatchelder.com/blog/200712/human_sorting.html
    (See Toothy's implementation in the comments)

    Returns a list alternating between non-digit strings and ints, e.g.
    ``"rank_10_x" -> ["rank_", 10, "_x"]``.
    '''
    # re.split with one capturing group puts the captured digit runs at the odd
    # indices, so only those are converted. Everything else is kept verbatim,
    # including digit-like characters that int() cannot parse (e.g. superscripts).
    pieces = re.split(r'(\d+)', text)
    return [int(piece) if index % 2 else piece for index, piece in enumerate(pieces)]
66
+
67
+
68
def get_model_state_file(checkpoint_dir, zero_stage):
    """Return the path of the rank-0 model-states file inside ``checkpoint_dir``.

    Args:
        - ``checkpoint_dir``: directory holding the checkpoint files
        - ``zero_stage``: ZeRO stage of the checkpoint; selects the expected file name

    Raises:
        - ``FileNotFoundError``: the directory or the expected file does not exist
        - ``ValueError``: ``zero_stage`` is greater than 3
    """
    if not os.path.isdir(checkpoint_dir):
        raise FileNotFoundError(f"Directory '{checkpoint_dir}' doesn't exist")

    # there should be only one file
    if zero_stage <= 2:
        file_name = "mp_rank_00_model_states.pt"
    elif zero_stage == 3:
        file_name = "zero_pp_rank_0_mp_rank_00_model_states.pt"
    else:
        # without this branch the path variable would be unbound below
        raise ValueError(f"unknown zero stage {zero_stage}")

    file = os.path.join(checkpoint_dir, file_name)
    if not os.path.exists(file):
        raise FileNotFoundError(f"can't find model states file at '{file}'")

    return file
82
+
83
+
84
def get_checkpoint_files(checkpoint_dir, glob_pattern):
    """Return the files in ``checkpoint_dir`` matching ``glob_pattern``, sorted with ``natural_keys``.

    Raises:
        - ``FileNotFoundError``: no file matches the pattern
    """
    # Only ``glob_pattern`` is meant to carry wildcards; escape the directory so
    # that a path containing glob metacharacters ('[', '*', '?') is matched literally.
    # XXX: need to test that this simple glob rule works for multi-node setup too
    search_path = os.path.join(glob.escape(checkpoint_dir), glob_pattern)
    ckpt_files = sorted(glob.glob(search_path), key=natural_keys)

    if not ckpt_files:
        raise FileNotFoundError(f"can't find {glob_pattern} files in directory '{checkpoint_dir}'")

    return ckpt_files
92
+
93
+
94
def get_optim_files(checkpoint_dir):
    """Return the ``*_optim_states.pt`` files that ``get_checkpoint_files`` finds in ``checkpoint_dir``."""
    optim_pattern = "*_optim_states.pt"
    return get_checkpoint_files(checkpoint_dir, optim_pattern)
96
+
97
+
98
def get_model_state_files(checkpoint_dir):
    """Return the ``*_model_states.pt`` files that ``get_checkpoint_files`` finds in ``checkpoint_dir``."""
    model_states_pattern = "*_model_states.pt"
    return get_checkpoint_files(checkpoint_dir, model_states_pattern)
100
+
101
+
102
def parse_model_states(files):
    """Load every model-states file and extract the metadata needed for reconstruction.

    Args:
        - ``files``: paths of the ``*_model_states.pt`` files to read

    Returns:
        - list with one ``zero_model_state`` per input file, in input order

    Raises:
        - ``ValueError``: a file does not contain the ``BUFFER_NAMES`` entry
    """
    zero_model_states = []
    for file in files:
        # NOTE: weights_only=False unpickles arbitrary objects - only run this on
        # checkpoints that come from a trusted source.
        state_dict = torch.load(file, map_location=device, weights_only=False)

        if BUFFER_NAMES not in state_dict:
            raise ValueError(f"{file} is not a model state checkpoint")
        buffer_names = state_dict[BUFFER_NAMES]
        if debug:
            print("Found buffers:", buffer_names)

        # recover just the buffers while restoring them to fp32 if they were saved in fp16
        buffers = {k: v.float() for k, v in state_dict["module"].items() if k in buffer_names}
        param_shapes = state_dict[PARAM_SHAPES]

        # frozen parameters are optional; both entries stay None when absent
        frozen_param_shapes = state_dict.get(FROZEN_PARAM_SHAPES, None)
        if debug and frozen_param_shapes is not None:
            print(f"Found frozen_param_shapes: {frozen_param_shapes}")
        frozen_param_fragments = state_dict.get(FROZEN_PARAM_FRAGMENTS, None)

        # handle shared params
        shared_params = [[k, v] for k, v in state_dict["shared_params"].items()]

        ds_version = state_dict.get(DS_VERSION, None)

        zero_model_states.append(
            zero_model_state(buffers=buffers,
                             param_shapes=param_shapes,
                             shared_params=shared_params,
                             ds_version=ds_version,
                             frozen_param_shapes=frozen_param_shapes,
                             frozen_param_fragments=frozen_param_fragments))

    return zero_model_states
146
+
147
+
148
def parse_optim_states(files, ds_checkpoint_dir):
    """Load the optimizer-state shards and pull out the flat fp32 weight groups.

    Args:
        - ``files``: paths of the ``*_optim_states.pt`` shards
        - ``ds_checkpoint_dir``: checkpoint folder, used only in the error message

    Returns:
        - tuple ``(zero_stage, world_size, fp32_flat_groups)`` where
          ``fp32_flat_groups`` holds one entry per shard, in ``files`` order

    Raises:
        - ``ValueError``: the first shard has no ``ZERO_STAGE`` entry, the number
          of shards differs from the recorded partition count, or the stage is
          not one of the supported ones
    """
    total_files = len(files)
    state_dicts = []
    for shard_path in tqdm(files, desc='Loading checkpoint shards'):
        shard = torch.load(shard_path, map_location=device, mmap=True, weights_only=False)
        # only the fp32 master weights are needed, so drop the potentially huge nested
        # optimizer states right away; tolerate them having been removed already
        shard["optimizer_state_dict"].pop("optimizer_state_dict", None)
        state_dicts.append(shard)

    first_optim_state = state_dicts[0][OPTIMIZER_STATE_DICT]
    if ZERO_STAGE not in first_optim_state:
        raise ValueError(f"{files[0]} is not a zero checkpoint")
    zero_stage = first_optim_state[ZERO_STAGE]
    world_size = first_optim_state[PARTITION_COUNT]

    # For ZeRO-2 each param group can have different partition_count as data parallelism for expert
    # parameters can be different from data parallelism for non-expert parameters. So we can just
    # use the max of the partition_count to get the dp world_size.
    if type(world_size) is list:
        world_size = max(world_size)

    if world_size != total_files:
        raise ValueError(
            f"Expected {world_size} of '*_optim_states.pt' under '{ds_checkpoint_dir}' but found {total_files} files. "
            "Possibly due to an overwrite of an old checkpoint, or a checkpoint didn't get saved by one or more processes."
        )

    # the groups are named differently in each stage
    if zero_stage <= 2:
        fp32_groups_key = SINGLE_PARTITION_OF_FP32_GROUPS
    elif zero_stage == 3:
        fp32_groups_key = FP32_FLAT_GROUPS
    else:
        raise ValueError(f"unknown zero stage {zero_stage}")

    fp32_flat_groups = [shard[OPTIMIZER_STATE_DICT][fp32_groups_key] for shard in state_dicts]
    return zero_stage, world_size, fp32_flat_groups
186
+
187
+
188
def _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters):
    """
    Returns fp32 state_dict reconstructed from ds checkpoint

    Args:
        - ``ds_checkpoint_dir``: path to the deepspeed checkpoint folder (where the optimizer files are)
        - ``exclude_frozen_parameters``: forwarded to the stage-specific merge routine; when true the
          frozen parameters are not merged into the result

    """
    print(f"Processing zero checkpoint '{ds_checkpoint_dir}'")

    # optimizer shards carry the flat fp32 weights plus the zero stage / partition count
    zero_stage, world_size, fp32_flat_groups = parse_optim_states(get_optim_files(ds_checkpoint_dir),
                                                                  ds_checkpoint_dir)
    print(f"Detected checkpoint of type zero stage {zero_stage}, world_size: {world_size}")

    # model-state files carry the buffers, shapes and shared-param info
    zero_model_states = parse_model_states(get_model_state_files(ds_checkpoint_dir))
    print(f'Parsing checkpoint created by deepspeed=={zero_model_states[0].ds_version}')

    if zero_stage <= 2:
        merge_checkpoint = _get_fp32_state_dict_from_zero2_checkpoint
    elif zero_stage == 3:
        merge_checkpoint = _get_fp32_state_dict_from_zero3_checkpoint
    else:
        return None
    return merge_checkpoint(world_size, fp32_flat_groups, zero_model_states, exclude_frozen_parameters)
213
+
214
+
215
def _zero2_merge_frozen_params(state_dict, zero_model_states):
    """Copy the frozen parameters recorded by rank 0 into ``state_dict`` (in place).

    Does nothing when rank 0 recorded no frozen parameter shapes.
    """
    rank0_state = zero_model_states[0]
    frozen_param_shapes = rank0_state.frozen_param_shapes
    if frozen_param_shapes is None or len(frozen_param_shapes) == 0:
        return
    frozen_param_fragments = rank0_state.frozen_param_fragments

    if debug:
        num_elem = sum(s.numel() for s in frozen_param_shapes.values())
        print(f'rank 0: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum([p.numel() for p in frozen_param_fragments.values()])
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # the fragment stored by rank 0 is used as-is
        state_dict[name] = frozen_param_fragments[name]

        if debug:
            print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
245
+
246
+
247
+ def _has_callable(obj, fn):
248
+ attr = getattr(obj, fn, None)
249
+ return callable(attr)
250
+
251
+
252
def _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Rebuild every trainable parameter from the per-rank flat fp32 groups into ``state_dict``.

    For each param group the rank partitions are concatenated into one flat
    vector, which is then carved up according to the shapes recorded by rank 0.

    Raises:
        - ``ValueError``: after alignment, the number of consumed elements of a
          group does not match the number of available elements
    """
    param_shapes = zero_model_states[0].param_shapes

    # Reconstruction protocol:
    #
    # XXX: document this

    if debug:
        for rank in range(world_size):
            for group_idx in range(len(fp32_flat_groups[0])):
                print(f"{FP32_FLAT_GROUPS}[{rank}][{group_idx}].shape={fp32_flat_groups[rank][group_idx].shape}")

    # XXX: memory usage doubles here (zero2)
    num_param_groups = len(fp32_flat_groups[0])
    merged_single_partition_of_fp32_groups = [
        torch.cat([rank_groups[group_idx] for rank_groups in fp32_flat_groups], 0)
        for group_idx in range(num_param_groups)
    ]
    avail_numel = sum([merged.numel() for merged in merged_single_partition_of_fp32_groups])

    if debug:
        wanted_params = sum([len(shapes) for shapes in param_shapes])
        wanted_numel = sum([sum(shape.numel() for shape in shapes.values()) for shapes in param_shapes])
        # not asserting if there is a mismatch due to possible padding
        print(f"Have {avail_numel} numels to process.")
        print(f"Need {wanted_numel} numels in {wanted_params} params.")

    # Z2 started to align to 2*world_size to improve nccl performance. Therefore both offset and
    # avail_numel can differ by anywhere between 0..2*world_size. Due to two unrelated complex
    # paddings performed in the code it's almost impossible to predict the exact numbers w/o the
    # live optimizer object, so we are checking that the numbers are within the right range
    align_to = 2 * world_size

    def zero2_align(x):
        return align_to * math.ceil(x / align_to)

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    total_numel = 0
    total_params = 0
    for shapes, flat_vector in zip(param_shapes, merged_single_partition_of_fp32_groups):
        offset = 0
        avail_numel = flat_vector.numel()
        for name, shape in shapes.items():
            # shapes may be objects exposing numel() or plain sequences of ints
            if _has_callable(shape, 'numel'):
                unpartitioned_numel = shape.numel()
            else:
                unpartitioned_numel = math.prod(shape)
            total_numel += unpartitioned_numel
            total_params += 1

            if debug:
                print(f"{name} full shape: {shape} unpartitioned numel {unpartitioned_numel} ")
            state_dict[name] = flat_vector.narrow(0, offset, unpartitioned_numel).view(shape)
            offset += unpartitioned_numel

        if debug:
            print(f"original offset={offset}, avail_numel={avail_numel}")

        offset = zero2_align(offset)
        avail_numel = zero2_align(avail_numel)

        if debug:
            print(f"aligned offset={offset}, avail_numel={avail_numel}")

        # Sanity check
        if offset != avail_numel:
            raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed fp32 state dict with {total_params} params {total_numel} elements")
323
+
324
+
325
def _get_fp32_state_dict_from_zero2_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the state dict for a ZeRO stage <= 2 checkpoint.

    Entries are added in this order: buffers, frozen parameters (unless
    ``exclude_frozen_parameters`` is true), trainable parameters, and finally
    aliases for shared parameters.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero2_merge_frozen_params(state_dict, zero_model_states)

    _zero2_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: each pair maps a name onto an entry that must already exist
    for pair in rank0_state.shared_params:
        source_name = pair[1]
        if source_name in state_dict:
            state_dict[pair[0]] = state_dict[source_name]

    return state_dict
346
+
347
+
348
def zero3_partitioned_param_info(unpartitioned_numel, world_size):
    """Return ``(partitioned_numel, padding_numel)`` for a parameter split across ranks.

    ``partitioned_numel`` is ``ceil(unpartitioned_numel / world_size)`` and
    ``padding_numel`` is the number of elements needed to round
    ``unpartitioned_numel`` up to a multiple of ``world_size``.
    """
    # divmod keeps everything in exact integer arithmetic; going through float
    # division would lose precision for element counts above 2**53.
    full_chunks, remainder = divmod(unpartitioned_numel, world_size)
    if remainder:
        return full_chunks + 1, world_size - remainder
    return full_chunks, 0
353
+
354
+
355
def _zero3_merge_frozen_params(state_dict, world_size, zero_model_states):
    """Rebuild the frozen parameters of a ZeRO-3 checkpoint into ``state_dict`` (in place).

    Each parameter is obtained by concatenating its fragment from every model
    state, trimming to the unpartitioned element count and reshaping. Does
    nothing when rank 0 recorded no frozen parameter shapes.
    """
    rank0_state = zero_model_states[0]
    if rank0_state.frozen_param_shapes is None or len(rank0_state.frozen_param_shapes) == 0:
        return

    if debug:
        for rank in range(world_size):
            num_elem = sum(s.numel() for s in zero_model_states[rank].frozen_param_fragments.values())
            print(f'rank {rank}: {FROZEN_PARAM_SHAPES}.numel = {num_elem}')

        frozen_param_shapes = rank0_state.frozen_param_shapes
        wanted_params = len(frozen_param_shapes)
        wanted_numel = sum(s.numel() for s in frozen_param_shapes.values())
        avail_numel = sum([p.numel() for p in rank0_state.frozen_param_fragments.values()]) * world_size
        print(f'Frozen params: Have {avail_numel} numels to process.')
        print(f'Frozen params: Need {wanted_numel} numels in {wanted_params} params')

    total_params = 0
    total_numel = 0
    for name, shape in rank0_state.frozen_param_shapes.items():
        unpartitioned_numel = shape.numel()
        total_params += 1
        total_numel += unpartitioned_numel

        # concatenate the per-rank fragments, then cut off whatever exceeds the real size
        param_frags = tuple(model_state.frozen_param_fragments[name] for model_state in zero_model_states)
        gathered = torch.cat(param_frags, 0)
        state_dict[name] = gathered.narrow(0, 0, unpartitioned_numel).view(shape)

        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Frozen params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

    print(f"Reconstructed Frozen fp32 state dict with {total_params} params {total_numel} elements")
389
+
390
+
391
class GatheredTensor:
    """
    A pseudo tensor that collects partitioned weights.
    It is more memory efficient when there are multiple groups.

    ``flat_groups[rank][group]`` is the flat tensor of one param group on one
    rank and ``flat_groups_offset`` holds the cumulative group boundaries
    (starting with 0) that apply to every rank. ``offset`` and
    ``partitioned_numel`` select this parameter's slice inside each rank's
    groups; ``shape`` is the full, unpartitioned shape and must provide
    ``numel()``.
    """

    def __init__(self, flat_groups, flat_groups_offset, offset, partitioned_numel, shape):
        self.flat_groups = flat_groups
        self.flat_groups_offset = flat_groups_offset
        self.offset = offset
        self.partitioned_numel = partitioned_numel
        self.shape = shape
        self.dtype = self.flat_groups[0][0].dtype

    def _locate_groups(self, end_idx):
        """Return ``(first, last)`` ids of the groups overlapped by ``[self.offset, end_idx)``."""
        boundaries = self.flat_groups_offset
        start_group_id = None
        end_group_id = None
        for group_id in range(len(boundaries) - 1):
            if boundaries[group_id] <= self.offset < boundaries[group_id + 1]:
                start_group_id = group_id
            if boundaries[group_id] < end_idx <= boundaries[group_id + 1]:
                end_group_id = group_id
                break
        if start_group_id is None or end_group_id is None:
            raise ValueError(f"range [{self.offset}, {end_idx}) is outside of the flat groups {list(boundaries)}")
        return start_group_id, end_group_id

    def contiguous(self):
        """
        Merge partitioned weights from flat_groups into a single tensor.

        Returns a new tensor of shape ``self.shape``; any padding past
        ``self.shape.numel()`` elements is dropped.
        """
        if self.partitioned_numel == 0:
            # an empty slice overlaps no group, so there is nothing to look up
            return torch.empty(self.shape, dtype=self.dtype)

        end_idx = self.offset + self.partitioned_numel
        boundaries = self.flat_groups_offset

        # the group boundaries are shared by all ranks, so resolve them only once
        start_group_id, end_group_id = self._locate_groups(end_idx)

        pad_flat_param_chunks = []
        for flat_groups_at_rank_i in self.flat_groups:
            # collect this rank's piece from the related group/groups
            for group_id in range(start_group_id, end_group_id + 1):
                flat_tensor = flat_groups_at_rank_i[group_id]
                group_start = boundaries[group_id]
                # clamp to the group start: for every group after the first one
                # the slice begins at the group's first element
                start_offset = max(self.offset, group_start) - group_start
                end_offset = min(end_idx, boundaries[group_id + 1]) - group_start
                pad_flat_param_chunks.append(flat_tensor[start_offset:end_offset])

        # collect weights from all ranks
        pad_flat_param = torch.cat(pad_flat_param_chunks, dim=0)
        param = pad_flat_param[:self.shape.numel()].view(self.shape).contiguous()
        return param
435
+
436
+
437
def _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states):
    """Register every trainable parameter of a ZeRO-3 checkpoint in ``state_dict`` (in place).

    No data is copied here: each entry is a ``GatheredTensor`` that remembers
    where the parameter's partitions live inside ``fp32_flat_groups``.

    Raises:
        - ``ValueError``: the element count consumed by all parameters differs
          from the element count available in the flat groups
    """
    param_shapes = zero_model_states[0].param_shapes
    avail_numel = sum([flat_group.numel() for flat_group in fp32_flat_groups[0]]) * world_size

    # Reconstruction protocol: For zero3 we need to zip the partitions together at boundary of each
    # param, re-consolidating each param, while dealing with padding if any

    # merge list of dicts, preserving order
    param_shapes = {k: v for d in param_shapes for k, v in d.items()}

    if debug:
        # every rank holds a list of flat tensors (one per param group), so the
        # shapes have to be reported per group
        for i in range(world_size):
            for j, flat_group in enumerate(fp32_flat_groups[i]):
                print(f"{FP32_FLAT_GROUPS}[{i}][{j}].shape={flat_group.shape}")

        wanted_params = len(param_shapes)
        wanted_numel = sum(shape.numel() for shape in param_shapes.values())
        # not asserting if there is a mismatch due to possible padding
        print(f"Trainable params: Have {avail_numel} numels to process.")
        print(f"Trainable params: Need {wanted_numel} numels in {wanted_params} params.")

    # params
    # XXX: for huge models that can't fit into the host's RAM we will have to recode this to support
    # out-of-core computing solution
    offset = 0
    total_numel = 0
    total_params = 0
    flat_groups_offset = [0] + list(np.cumsum([flat_tensor.numel() for flat_tensor in fp32_flat_groups[0]]))
    for name, shape in tqdm(param_shapes.items(), desc='Gathering sharded weights'):
        unpartitioned_numel = shape.numel()
        total_numel += unpartitioned_numel
        total_params += 1
        partitioned_numel, partitioned_padding_numel = zero3_partitioned_param_info(unpartitioned_numel, world_size)

        if debug:
            print(
                f"Trainable params: {total_params} {name} full shape: {shape} partition0 numel={partitioned_numel} partitioned_padding_numel={partitioned_padding_numel}"
            )

        # memory efficient tensor
        tensor = GatheredTensor(fp32_flat_groups, flat_groups_offset, offset, partitioned_numel, shape)
        state_dict[name] = tensor
        offset += partitioned_numel

    # offset was tracked per rank; scale it up to compare against the all-rank total
    offset *= world_size

    # Sanity check
    if offset != avail_numel:
        raise ValueError(f"consumed {offset} numels out of {avail_numel} - something is wrong")

    print(f"Reconstructed Trainable fp32 state dict with {total_params} params {total_numel} elements")
488
+
489
+
490
def _get_fp32_state_dict_from_zero3_checkpoint(world_size, fp32_flat_groups, zero_model_states,
                                               exclude_frozen_parameters):
    """Assemble the state dict for a ZeRO stage 3 checkpoint.

    Entries are added in this order: buffers, frozen parameters (unless
    ``exclude_frozen_parameters`` is true), trainable parameters, and finally
    aliases for shared parameters.
    """
    rank0_state = zero_model_states[0]
    state_dict = OrderedDict()

    # buffers
    buffers = rank0_state.buffers
    state_dict.update(buffers)
    if debug:
        print(f"added {len(buffers)} buffers")

    if not exclude_frozen_parameters:
        _zero3_merge_frozen_params(state_dict, world_size, zero_model_states)

    _zero3_merge_trainable_params(state_dict, world_size, fp32_flat_groups, zero_model_states)

    # recover shared parameters: each pair maps a name onto an entry that must already exist
    for pair in rank0_state.shared_params:
        source_name = pair[1]
        if source_name in state_dict:
            state_dict[pair[0]] = state_dict[source_name]

    return state_dict
511
+
512
+
513
def to_torch_tensor(state_dict, return_empty_tensor=False):
    """
    Convert state_dict of GatheredTensor to torch tensor

    Entries that refer to the same source object end up referring to the same
    converted tensor. With ``return_empty_tensor`` set, uninitialized tensors of
    matching shape and dtype are created instead of converting the data.
    """
    torch_state_dict = {}
    converted_by_id = {}  # id(source object) -> tensor produced for it
    for name, tensor in state_dict.items():
        source_id = id(tensor)
        if source_id not in converted_by_id:
            if return_empty_tensor:
                converted_by_id[source_id] = torch.empty(tensor.shape, dtype=tensor.dtype)
            else:
                converted_by_id[source_id] = tensor.contiguous()
        # shared tensors reuse the tensor converted on first sight
        torch_state_dict[name] = converted_by_id[source_id]
    return torch_state_dict
531
+
532
+
533
def get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                             tag=None,
                                             exclude_frozen_parameters=False,
                                             lazy_mode=False):
    """
    Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated state_dict that can be loaded with
    ``load_state_dict()`` and used for training without DeepSpeed or shared with others, for example
    via a model hub.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in 'latest' file. e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters
        - ``lazy_mode``: get state_dict in lazy mode. It returns a dict of pseudo tensors instead of torch tensors, which is more memory efficient.
          Convert a pseudo tensor to a torch tensor with ``.contiguous()``

    Returns:
        - pytorch ``state_dict``

    Raises:
        - ``ValueError``: ``tag`` is ``None`` and there is no ``latest`` file in ``checkpoint_dir``
        - ``FileNotFoundError``: the ``<checkpoint_dir>/<tag>`` folder does not exist

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        # do the training and checkpoint saving
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
        model = model.cpu() # move to cpu
        model.load_state_dict(state_dict)
        # submit to model hub or save the model to share with others

    In this example the ``model`` will no longer be usable in the deepspeed context of the same
    application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    If you want it all done for you, use ``load_state_dict_from_zero_checkpoint`` instead.

    Note: the above usage may not work if your application doesn't have sufficient free CPU memory.
    You may need to use the offline approach using the ``zero_to_fp32.py`` script that is saved with
    the checkpoint. Or you can load state_dict in lazy mode ::

        from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
        state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, lazy_mode=True) # not on cpu
        for name, lazy_tensor in state_dict.items():
            tensor = lazy_tensor.contiguous()  # to cpu
            print(name, tensor)
            # del tensor to release memory if it is no longer in use
    """
    if tag is None:
        # No explicit tag: fall back to the tag name stored in the 'latest' marker file.
        latest_path = os.path.join(checkpoint_dir, 'latest')
        if not os.path.isfile(latest_path):
            raise ValueError(f"Unable to find 'latest' file at {latest_path}")
        with open(latest_path, 'r') as fd:
            tag = fd.read().strip()

    ds_checkpoint_dir = os.path.join(checkpoint_dir, tag)
    if not os.path.isdir(ds_checkpoint_dir):
        raise FileNotFoundError(f"Directory '{ds_checkpoint_dir}' doesn't exist")

    state_dict = _get_fp32_state_dict_from_zero_checkpoint(ds_checkpoint_dir, exclude_frozen_parameters)
    if lazy_mode:
        # Caller converts each entry on demand via ``.contiguous()``.
        return state_dict
    return to_torch_tensor(state_dict)
596
+
597
+
598
def convert_zero_checkpoint_to_fp32_state_dict(checkpoint_dir,
                                               output_dir,
                                               max_shard_size="5GB",
                                               safe_serialization=False,
                                               tag=None,
                                               exclude_frozen_parameters=False):
    """
    Convert ZeRO 2 or 3 checkpoint into fp32 consolidated ``state_dict`` file(s) written to
    ``output_dir``, which can be loaded with ``torch.load(file)`` + ``load_state_dict()`` and used
    for training without DeepSpeed.

    The weights are written shard by shard; each shard is materialized, saved, and then released
    before the next one is processed. When the result is sharded, an index JSON file mapping
    every tensor name to its shard file is written next to the shards.

    Args:
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``output_dir``: directory to the pytorch fp32 state_dict output files
        - ``max_shard_size``: the maximum size for a checkpoint before being sharded, default value is 5GB. ``None`` writes a single file
        - ``safe_serialization``: whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``
        - ``exclude_frozen_parameters``: exclude frozen parameters

    Raises:
        - ``ImportError``: re-raised when ``safetensors`` (for ``safe_serialization``) or
          ``huggingface_hub`` (for ``max_shard_size``) is requested but not importable
    """
    want_shards = max_shard_size is not None

    # Dependency pre-check
    if safe_serialization:
        try:
            from safetensors.torch import save_file
        except ImportError:
            print('If you want to use `safe_serialization`, please `pip install safetensors`')
            raise
    if want_shards:
        try:
            from huggingface_hub import split_torch_state_dict_into_shards
        except ImportError:
            print('If you want to use `max_shard_size`, please `pip install huggingface_hub`')
            raise

    # Convert zero checkpoint to a lazy state_dict (entries are materialized per shard below)
    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir,
                                                          tag,
                                                          exclude_frozen_parameters,
                                                          lazy_mode=True)

    # Output file names depend on the serialization format.
    if safe_serialization:
        weights_name = "model.safetensors"
        index_name = "model.safetensors.index.json"
    else:
        weights_name = "pytorch_model.bin"
        index_name = "pytorch_model.bin.index.json"

    # Shard the model if it is too big.
    if want_shards:
        filename_pattern = weights_name.replace(".bin", "{suffix}.bin")
        filename_pattern = filename_pattern.replace(".safetensors", "{suffix}.safetensors")
        # a memory-efficient approach for sharding: plan the split on empty placeholders
        placeholders = to_torch_tensor(state_dict, return_empty_tensor=True)
        state_dict_split = split_torch_state_dict_into_shards(placeholders,
                                                              filename_pattern=filename_pattern,
                                                              max_shard_size=max_shard_size)
    else:
        from collections import namedtuple
        StateDictSplit = namedtuple("StateDictSplit", ["is_sharded", "filename_to_tensors"])
        state_dict_split = StateDictSplit(is_sharded=False,
                                          filename_to_tensors={weights_name: list(state_dict.keys())})

    # Save the model by shard
    os.makedirs(output_dir, exist_ok=True)
    shard_plan = state_dict_split.filename_to_tensors.items()
    for shard_file, tensor_names in tqdm(shard_plan, desc="Saving checkpoint shards"):
        shard = to_torch_tensor({key: state_dict[key] for key in tensor_names})
        shard_path = os.path.join(output_dir, shard_file)
        if safe_serialization:
            save_file(shard, shard_path, metadata={"format": "pt"})
        else:
            torch.save(shard, shard_path)

        # release the memory of current shard
        for key in list(shard.keys()):
            del state_dict[key]
            del shard[key]
        del shard
        gc.collect()

    # Save index if sharded
    if not state_dict_split.is_sharded:
        return

    index = {
        "metadata": state_dict_split.metadata,
        "weight_map": state_dict_split.tensor_to_filename,
    }
    index_path = os.path.join(output_dir, index_name)
    with open(index_path, "w", encoding="utf-8") as f:
        f.write(json.dumps(index, indent=2, sort_keys=True) + "\n")
681
+
682
+
683
def load_state_dict_from_zero_checkpoint(model, checkpoint_dir, tag=None):
    """
    1. Put the provided model to cpu
    2. Convert ZeRO 2 or 3 checkpoint into a single fp32 consolidated ``state_dict``
    3. Load it into the provided model

    Args:
        - ``model``: the model object to update
        - ``checkpoint_dir``: path to the desired checkpoint folder. (one that contains the tag-folder, like ``global_step14``)
        - ``tag``: checkpoint tag used as a unique identifier for checkpoint. If not provided will attempt to load tag in the file named ``latest`` in the checkpoint folder, e.g., ``global_step14``

    Returns:
        - ``model``: modified model

    The weights are loaded with ``strict=False``.

    Make sure you have plenty of CPU memory available before you call this function. If you don't
    have enough use the ``zero_to_fp32.py`` utility to do the conversion. You will find it
    conveniently placed for you in the checkpoint folder.

    A typical usage might be ::

        from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
        model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
        # submit to model hub or save the model to share with others

    Note, that once this was run, the ``model`` will no longer be usable in the deepspeed context
    of the same application. i.e. you will need to re-initialize the deepspeed engine, since
    ``model.load_state_dict(state_dict)`` will remove all the deepspeed magic from it.

    """
    logger.info("Extracting fp32 weights")
    fp32_weights = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir, tag)

    logger.info("Overwriting model with fp32 weights")
    cpu_model = model.cpu()
    cpu_model.load_state_dict(fp32_weights, strict=False)

    return cpu_model
720
+
721
+
722
if __name__ == "__main__":
    # Command-line entry point: parse the arguments and run the checkpoint conversion.
    parser = argparse.ArgumentParser()
    parser.add_argument("checkpoint_dir",
                        type=str,
                        help="path to the desired checkpoint folder, e.g., path/checkpoint-12")
    # NOTE: adjacent string literals are concatenated verbatim, so each fragment below ends with
    # an explicit space to keep the rendered --help text readable.
    parser.add_argument("output_dir",
                        type=str,
                        help="directory to the pytorch fp32 state_dict output files "
                        "(e.g. path/checkpoint-12-output/)")
    parser.add_argument(
        "--max_shard_size",
        type=str,
        default="5GB",
        help="The maximum size for a checkpoint before being sharded. Checkpoints shard will then be each of size "
        "lower than this size. If expressed as a string, needs to be digits followed by a unit (like `5MB`). "
        "We default it to 5GB in order for models to be able to run easily on free-tier google colab instances "
        "without CPU OOM issues.")
    parser.add_argument(
        "--safe_serialization",
        default=False,
        action='store_true',
        help="Whether to save the model using `safetensors` or the traditional PyTorch way (that uses `pickle`).")
    parser.add_argument("-t",
                        "--tag",
                        type=str,
                        default=None,
                        help="checkpoint tag used as a unique identifier for checkpoint. e.g., global_step1")
    parser.add_argument("--exclude_frozen_parameters", action='store_true', help="exclude frozen parameters")
    parser.add_argument("-d", "--debug", action='store_true', help="enable debug")
    args = parser.parse_args()

    # Module-level flag: the conversion helpers read the global ``debug`` to decide whether to print.
    debug = args.debug

    convert_zero_checkpoint_to_fp32_state_dict(args.checkpoint_dir,
                                               args.output_dir,
                                               max_shard_size=args.max_shard_size,
                                               safe_serialization=args.safe_serialization,
                                               tag=args.tag,
                                               exclude_frozen_parameters=args.exclude_frozen_parameters)