sirui6011 committed on
Commit
139c2b2
·
verified ·
1 Parent(s): 97d47c5

add checkpoints/codi-single-1.5b

Browse files
This view is limited to 50 files because the commit contains too many changes. See raw diff
Files changed (50) hide show
  1. .gitattributes +8 -0
  2. checkpoints/codi-single-1.5b/checkpoint-1000/added_tokens.json +35 -0
  3. checkpoints/codi-single-1.5b/checkpoint-1000/chat_template.jinja +54 -0
  4. checkpoints/codi-single-1.5b/checkpoint-1000/config.json +58 -0
  5. checkpoints/codi-single-1.5b/checkpoint-1000/merges.txt +0 -0
  6. checkpoints/codi-single-1.5b/checkpoint-1000/pytorch_model.bin +3 -0
  7. checkpoints/codi-single-1.5b/checkpoint-1000/special_tokens_map.json +31 -0
  8. checkpoints/codi-single-1.5b/checkpoint-1000/thought_projector.pt +3 -0
  9. checkpoints/codi-single-1.5b/checkpoint-1000/tokenizer.json +3 -0
  10. checkpoints/codi-single-1.5b/checkpoint-1000/tokenizer_config.json +295 -0
  11. checkpoints/codi-single-1.5b/checkpoint-1000/trainer_state.json +2046 -0
  12. checkpoints/codi-single-1.5b/checkpoint-1000/vocab.json +0 -0
  13. checkpoints/codi-single-1.5b/checkpoint-1500/added_tokens.json +35 -0
  14. checkpoints/codi-single-1.5b/checkpoint-1500/chat_template.jinja +54 -0
  15. checkpoints/codi-single-1.5b/checkpoint-1500/config.json +58 -0
  16. checkpoints/codi-single-1.5b/checkpoint-1500/merges.txt +0 -0
  17. checkpoints/codi-single-1.5b/checkpoint-1500/pytorch_model.bin +3 -0
  18. checkpoints/codi-single-1.5b/checkpoint-1500/special_tokens_map.json +31 -0
  19. checkpoints/codi-single-1.5b/checkpoint-1500/thought_projector.pt +3 -0
  20. checkpoints/codi-single-1.5b/checkpoint-1500/tokenizer.json +3 -0
  21. checkpoints/codi-single-1.5b/checkpoint-1500/tokenizer_config.json +295 -0
  22. checkpoints/codi-single-1.5b/checkpoint-1500/trainer_state.json +3046 -0
  23. checkpoints/codi-single-1.5b/checkpoint-1500/vocab.json +0 -0
  24. checkpoints/codi-single-1.5b/checkpoint-2000/added_tokens.json +35 -0
  25. checkpoints/codi-single-1.5b/checkpoint-2000/chat_template.jinja +54 -0
  26. checkpoints/codi-single-1.5b/checkpoint-2000/config.json +58 -0
  27. checkpoints/codi-single-1.5b/checkpoint-2000/merges.txt +0 -0
  28. checkpoints/codi-single-1.5b/checkpoint-2000/pytorch_model.bin +3 -0
  29. checkpoints/codi-single-1.5b/checkpoint-2000/special_tokens_map.json +31 -0
  30. checkpoints/codi-single-1.5b/checkpoint-2000/thought_projector.pt +3 -0
  31. checkpoints/codi-single-1.5b/checkpoint-2000/tokenizer.json +3 -0
  32. checkpoints/codi-single-1.5b/checkpoint-2000/tokenizer_config.json +295 -0
  33. checkpoints/codi-single-1.5b/checkpoint-2000/trainer_state.json +0 -0
  34. checkpoints/codi-single-1.5b/checkpoint-2000/vocab.json +0 -0
  35. checkpoints/codi-single-1.5b/checkpoint-2500/added_tokens.json +35 -0
  36. checkpoints/codi-single-1.5b/checkpoint-2500/chat_template.jinja +54 -0
  37. checkpoints/codi-single-1.5b/checkpoint-2500/config.json +58 -0
  38. checkpoints/codi-single-1.5b/checkpoint-2500/merges.txt +0 -0
  39. checkpoints/codi-single-1.5b/checkpoint-2500/pytorch_model.bin +3 -0
  40. checkpoints/codi-single-1.5b/checkpoint-2500/special_tokens_map.json +31 -0
  41. checkpoints/codi-single-1.5b/checkpoint-2500/thought_projector.pt +3 -0
  42. checkpoints/codi-single-1.5b/checkpoint-2500/tokenizer.json +3 -0
  43. checkpoints/codi-single-1.5b/checkpoint-2500/tokenizer_config.json +295 -0
  44. checkpoints/codi-single-1.5b/checkpoint-2500/trainer_state.json +0 -0
  45. checkpoints/codi-single-1.5b/checkpoint-2500/vocab.json +0 -0
  46. checkpoints/codi-single-1.5b/checkpoint-3000/added_tokens.json +35 -0
  47. checkpoints/codi-single-1.5b/checkpoint-3000/chat_template.jinja +54 -0
  48. checkpoints/codi-single-1.5b/checkpoint-3000/config.json +58 -0
  49. checkpoints/codi-single-1.5b/checkpoint-3000/merges.txt +0 -0
  50. checkpoints/codi-single-1.5b/checkpoint-3000/pytorch_model.bin +3 -0
.gitattributes CHANGED
@@ -103,3 +103,11 @@ checkpoints/codi_frozen_3b_hidden/wandb/offline-run-20260624_182251-t1mc3kud/run
103
  checkpoints/codi_frozen_3b_logit/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
104
  checkpoints/codi_frozen_3b_logit/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
105
  checkpoints/codi_frozen_3b_logit/wandb/offline-run-20260624_183654-h7tylfot/run-h7tylfot.wandb filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
103
  checkpoints/codi_frozen_3b_logit/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
104
  checkpoints/codi_frozen_3b_logit/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
105
  checkpoints/codi_frozen_3b_logit/wandb/offline-run-20260624_183654-h7tylfot/run-h7tylfot.wandb filter=lfs diff=lfs merge=lfs -text
106
+ checkpoints/codi-single-1.5b/checkpoint-1000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
107
+ checkpoints/codi-single-1.5b/checkpoint-1500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
108
+ checkpoints/codi-single-1.5b/checkpoint-2000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
109
+ checkpoints/codi-single-1.5b/checkpoint-2500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
110
+ checkpoints/codi-single-1.5b/checkpoint-3000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
111
+ checkpoints/codi-single-1.5b/checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
112
+ checkpoints/codi-single-1.5b/wandb/offline-run-20260624_140203-4mlum96y/run-4mlum96y.wandb filter=lfs diff=lfs merge=lfs -text
113
+ checkpoints/codi-single-1.5b/wandb/offline-run-20260624_191514-shojcvof/run-shojcvof.wandb filter=lfs diff=lfs merge=lfs -text
checkpoints/codi-single-1.5b/checkpoint-1000/added_tokens.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|action_sep|>": 151670,
5
+ "<|arg_sep|>": 151671,
6
+ "<|box_end|>": 151649,
7
+ "<|box_start|>": 151648,
8
+ "<|call_sep|>": 151666,
9
+ "<|end_of_text|>": 151673,
10
+ "<|endoftext|>": 151643,
11
+ "<|exception_sep|>": 151669,
12
+ "<|file_sep|>": 151664,
13
+ "<|fim_middle|>": 151660,
14
+ "<|fim_pad|>": 151662,
15
+ "<|fim_prefix|>": 151659,
16
+ "<|fim_suffix|>": 151661,
17
+ "<|frame_sep|>": 151672,
18
+ "<|im_end|>": 151645,
19
+ "<|im_start|>": 151644,
20
+ "<|image_pad|>": 151655,
21
+ "<|latent_end|>": 151675,
22
+ "<|latent_start|>": 151674,
23
+ "<|line_sep|>": 151667,
24
+ "<|object_ref_end|>": 151647,
25
+ "<|object_ref_start|>": 151646,
26
+ "<|quad_end|>": 151651,
27
+ "<|quad_start|>": 151650,
28
+ "<|repo_name|>": 151663,
29
+ "<|return_sep|>": 151668,
30
+ "<|trace_context_start|>": 151665,
31
+ "<|video_pad|>": 151656,
32
+ "<|vision_end|>": 151653,
33
+ "<|vision_pad|>": 151654,
34
+ "<|vision_start|>": 151652
35
+ }
checkpoints/codi-single-1.5b/checkpoint-1000/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/codi-single-1.5b/checkpoint-1000/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151643,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention"
41
+ ],
42
+ "max_position_embeddings": 32768,
43
+ "max_window_layers": 28,
44
+ "model_type": "qwen2",
45
+ "num_attention_heads": 12,
46
+ "num_hidden_layers": 28,
47
+ "num_key_value_heads": 2,
48
+ "pad_token_id": 151643,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": true,
54
+ "transformers_version": "4.57.6",
55
+ "use_cache": true,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 151676
58
+ }
checkpoints/codi-single-1.5b/checkpoint-1000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-1000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eacefa70e930dd171ef222d2a2871adf910bfeb7fa7e4b4ac125d13c74f9630a
3
+ size 3096212347
checkpoints/codi-single-1.5b/checkpoint-1000/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/codi-single-1.5b/checkpoint-1000/thought_projector.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:12072ec26682175d5541df39d90feac96dc8715d856cc7b196f9f0c831538319
3
+ size 9445953
checkpoints/codi-single-1.5b/checkpoint-1000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83a790d654474f5dfe225f889afd0210313eb1083f942671f2c4b8e95a1c922b
3
+ size 11424004
checkpoints/codi-single-1.5b/checkpoint-1000/tokenizer_config.json ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|trace_context_start|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|call_sep|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|line_sep|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<|return_sep|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<|exception_sep|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|action_sep|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<|arg_sep|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<|frame_sep|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<|end_of_text|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<|latent_start|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<|latent_end|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ }
269
+ },
270
+ "additional_special_tokens": [
271
+ "<|im_start|>",
272
+ "<|im_end|>",
273
+ "<|object_ref_start|>",
274
+ "<|object_ref_end|>",
275
+ "<|box_start|>",
276
+ "<|box_end|>",
277
+ "<|quad_start|>",
278
+ "<|quad_end|>",
279
+ "<|vision_start|>",
280
+ "<|vision_end|>",
281
+ "<|vision_pad|>",
282
+ "<|image_pad|>",
283
+ "<|video_pad|>"
284
+ ],
285
+ "bos_token": null,
286
+ "clean_up_tokenization_spaces": false,
287
+ "eos_token": "<|endoftext|>",
288
+ "errors": "replace",
289
+ "extra_special_tokens": {},
290
+ "model_max_length": 32768,
291
+ "pad_token": "<|endoftext|>",
292
+ "split_special_tokens": false,
293
+ "tokenizer_class": "Qwen2Tokenizer",
294
+ "unk_token": null
295
+ }
checkpoints/codi-single-1.5b/checkpoint-1000/trainer_state.json ADDED
@@ -0,0 +1,2046 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.28835063437139563,
6
+ "eval_steps": 500,
7
+ "global_step": 1000,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0014417531718569781,
14
+ "grad_norm": 91136.0,
15
+ "kd_loss": 0.53515625,
16
+ "learning_rate": 1.3333333333333334e-06,
17
+ "loss": 1.4874,
18
+ "step": 5,
19
+ "student_loss": 0.6224480271339417,
20
+ "teacher_loss": 0.00038480176590383053
21
+ },
22
+ {
23
+ "epoch": 0.0028835063437139563,
24
+ "grad_norm": 29312.0,
25
+ "kd_loss": 0.54296875,
26
+ "learning_rate": 3e-06,
27
+ "loss": 1.3555,
28
+ "step": 10,
29
+ "student_loss": 0.7942929863929749,
30
+ "teacher_loss": 0.0007905907114036381
31
+ },
32
+ {
33
+ "epoch": 0.004325259515570935,
34
+ "grad_norm": 25088.0,
35
+ "kd_loss": 0.515625,
36
+ "learning_rate": 4.666666666666667e-06,
37
+ "loss": 1.3876,
38
+ "step": 15,
39
+ "student_loss": 0.2712247967720032,
40
+ "teacher_loss": 0.0006367590394802392
41
+ },
42
+ {
43
+ "epoch": 0.0057670126874279125,
44
+ "grad_norm": 11328.0,
45
+ "kd_loss": 0.5234375,
46
+ "learning_rate": 6.333333333333333e-06,
47
+ "loss": 1.3195,
48
+ "step": 20,
49
+ "student_loss": 0.991253137588501,
50
+ "teacher_loss": 0.00029781984630972147
51
+ },
52
+ {
53
+ "epoch": 0.00720876585928489,
54
+ "grad_norm": 4896.0,
55
+ "kd_loss": 0.46875,
56
+ "learning_rate": 8.000000000000001e-06,
57
+ "loss": 1.2722,
58
+ "step": 25,
59
+ "student_loss": 0.24368858337402344,
60
+ "teacher_loss": 0.0009928465588018298
61
+ },
62
+ {
63
+ "epoch": 0.00865051903114187,
64
+ "grad_norm": 35584.0,
65
+ "kd_loss": 0.48828125,
66
+ "learning_rate": 9.666666666666667e-06,
67
+ "loss": 1.2823,
68
+ "step": 30,
69
+ "student_loss": 0.7547083497047424,
70
+ "teacher_loss": 0.0003548521490301937
71
+ },
72
+ {
73
+ "epoch": 0.010092272202998846,
74
+ "grad_norm": 11584.0,
75
+ "kd_loss": 0.458984375,
76
+ "learning_rate": 9.99958042442916e-06,
77
+ "loss": 1.1068,
78
+ "step": 35,
79
+ "student_loss": 0.48039907217025757,
80
+ "teacher_loss": 0.00026277394499629736
81
+ },
82
+ {
83
+ "epoch": 0.011534025374855825,
84
+ "grad_norm": 33536.0,
85
+ "kd_loss": 0.5,
86
+ "learning_rate": 9.997876019358083e-06,
87
+ "loss": 1.174,
88
+ "step": 40,
89
+ "student_loss": 0.7645292282104492,
90
+ "teacher_loss": 0.07346436381340027
91
+ },
92
+ {
93
+ "epoch": 0.012975778546712802,
94
+ "grad_norm": 3664.0,
95
+ "kd_loss": 0.478515625,
96
+ "learning_rate": 9.99486100792044e-06,
97
+ "loss": 0.9907,
98
+ "step": 45,
99
+ "student_loss": 1.340754508972168,
100
+ "teacher_loss": 0.0009389458573423326
101
+ },
102
+ {
103
+ "epoch": 0.01441753171856978,
104
+ "grad_norm": 1584.0,
105
+ "kd_loss": 0.486328125,
106
+ "learning_rate": 9.990536180750724e-06,
107
+ "loss": 0.9709,
108
+ "step": 50,
109
+ "student_loss": 0.31134045124053955,
110
+ "teacher_loss": 0.00372113729827106
111
+ },
112
+ {
113
+ "epoch": 0.015859284890426758,
114
+ "grad_norm": 672.0,
115
+ "kd_loss": 0.44921875,
116
+ "learning_rate": 9.984902671959911e-06,
117
+ "loss": 0.7719,
118
+ "step": 55,
119
+ "student_loss": 0.11045902222394943,
120
+ "teacher_loss": 0.0004030822601635009
121
+ },
122
+ {
123
+ "epoch": 0.01730103806228374,
124
+ "grad_norm": 648.0,
125
+ "kd_loss": 0.447265625,
126
+ "learning_rate": 9.97796195883804e-06,
127
+ "loss": 0.725,
128
+ "step": 60,
129
+ "student_loss": 0.15876038372516632,
130
+ "teacher_loss": 0.0018200079211965203
131
+ },
132
+ {
133
+ "epoch": 0.018742791234140715,
134
+ "grad_norm": 490.0,
135
+ "kd_loss": 0.447265625,
136
+ "learning_rate": 9.969715861466839e-06,
137
+ "loss": 0.6844,
138
+ "step": 65,
139
+ "student_loss": 0.2451532483100891,
140
+ "teacher_loss": 0.056328896433115005
141
+ },
142
+ {
143
+ "epoch": 0.020184544405997693,
144
+ "grad_norm": 210.0,
145
+ "kd_loss": 0.4453125,
146
+ "learning_rate": 9.96016654224243e-06,
147
+ "loss": 0.6719,
148
+ "step": 70,
149
+ "student_loss": 0.036203403025865555,
150
+ "teacher_loss": 0.0021477588452398777
151
+ },
152
+ {
153
+ "epoch": 0.02162629757785467,
154
+ "grad_norm": 366.0,
155
+ "kd_loss": 0.44921875,
156
+ "learning_rate": 9.94931650530827e-06,
157
+ "loss": 0.6026,
158
+ "step": 75,
159
+ "student_loss": 0.009898507036268711,
160
+ "teacher_loss": 0.0007659209077246487
161
+ },
162
+ {
163
+ "epoch": 0.02306805074971165,
164
+ "grad_norm": 1816.0,
165
+ "kd_loss": 0.462890625,
166
+ "learning_rate": 9.93716859589851e-06,
167
+ "loss": 0.6433,
168
+ "step": 80,
169
+ "student_loss": 0.030777160078287125,
170
+ "teacher_loss": 0.0008218760485760868
171
+ },
172
+ {
173
+ "epoch": 0.024509803921568627,
174
+ "grad_norm": 360.0,
175
+ "kd_loss": 0.470703125,
176
+ "learning_rate": 9.923725999591846e-06,
177
+ "loss": 0.5588,
178
+ "step": 85,
179
+ "student_loss": 0.07332021743059158,
180
+ "teacher_loss": 0.00044353402336128056
181
+ },
182
+ {
183
+ "epoch": 0.025951557093425604,
184
+ "grad_norm": 732.0,
185
+ "kd_loss": 0.39453125,
186
+ "learning_rate": 9.908992241476189e-06,
187
+ "loss": 0.5723,
188
+ "step": 90,
189
+ "student_loss": 0.07177069783210754,
190
+ "teacher_loss": 0.00044491401058621705
191
+ },
192
+ {
193
+ "epoch": 0.027393310265282585,
194
+ "grad_norm": 250.0,
195
+ "kd_loss": 0.4296875,
196
+ "learning_rate": 9.892971185224244e-06,
197
+ "loss": 0.5615,
198
+ "step": 95,
199
+ "student_loss": 0.25651854276657104,
200
+ "teacher_loss": 0.0204803254455328
201
+ },
202
+ {
203
+ "epoch": 0.02883506343713956,
204
+ "grad_norm": 195.0,
205
+ "kd_loss": 0.42578125,
206
+ "learning_rate": 9.875667032080354e-06,
207
+ "loss": 0.5376,
208
+ "step": 100,
209
+ "student_loss": 0.0018209030386060476,
210
+ "teacher_loss": 0.0005951338680461049
211
+ },
212
+ {
213
+ "epoch": 0.03027681660899654,
214
+ "grad_norm": 74.5,
215
+ "kd_loss": 0.416015625,
216
+ "learning_rate": 9.857084319758772e-06,
217
+ "loss": 0.5554,
218
+ "step": 105,
219
+ "student_loss": 0.009238015860319138,
220
+ "teacher_loss": 0.000378329394152388
221
+ },
222
+ {
223
+ "epoch": 0.031718569780853516,
224
+ "grad_norm": 134.0,
225
+ "kd_loss": 0.3984375,
226
+ "learning_rate": 9.837227921253747e-06,
227
+ "loss": 0.5376,
228
+ "step": 110,
229
+ "student_loss": 0.008933513425290585,
230
+ "teacher_loss": 0.054748453199863434
231
+ },
232
+ {
233
+ "epoch": 0.03316032295271049,
234
+ "grad_norm": 53.5,
235
+ "kd_loss": 0.380859375,
236
+ "learning_rate": 9.816103043561648e-06,
237
+ "loss": 0.5101,
238
+ "step": 115,
239
+ "student_loss": 0.0015179987531155348,
240
+ "teacher_loss": 0.0003994991711806506
241
+ },
242
+ {
243
+ "epoch": 0.03460207612456748,
244
+ "grad_norm": 90.5,
245
+ "kd_loss": 0.34765625,
246
+ "learning_rate": 9.79371522631553e-06,
247
+ "loss": 0.4575,
248
+ "step": 120,
249
+ "student_loss": 0.005917699541896582,
250
+ "teacher_loss": 0.0011113588698208332
251
+ },
252
+ {
253
+ "epoch": 0.036043829296424454,
254
+ "grad_norm": 105.0,
255
+ "kd_loss": 0.333984375,
256
+ "learning_rate": 9.770070340332457e-06,
257
+ "loss": 0.4262,
258
+ "step": 125,
259
+ "student_loss": 0.15691499412059784,
260
+ "teacher_loss": 0.0019031435949727893
261
+ },
262
+ {
263
+ "epoch": 0.03748558246828143,
264
+ "grad_norm": 16.375,
265
+ "kd_loss": 0.3125,
266
+ "learning_rate": 9.745174586073982e-06,
267
+ "loss": 0.4434,
268
+ "step": 130,
269
+ "student_loss": 0.0024075880646705627,
270
+ "teacher_loss": 0.007021783851087093
271
+ },
272
+ {
273
+ "epoch": 0.03892733564013841,
274
+ "grad_norm": 21.625,
275
+ "kd_loss": 0.294921875,
276
+ "learning_rate": 9.719034492020183e-06,
277
+ "loss": 0.3819,
278
+ "step": 135,
279
+ "student_loss": 0.002887926297262311,
280
+ "teacher_loss": 0.00043111268314532936
281
+ },
282
+ {
283
+ "epoch": 0.040369088811995385,
284
+ "grad_norm": 11.875,
285
+ "kd_loss": 0.26953125,
286
+ "learning_rate": 9.691656912957686e-06,
287
+ "loss": 0.3717,
288
+ "step": 140,
289
+ "student_loss": 0.0013271740172058344,
290
+ "teacher_loss": 0.0008426732383668423
291
+ },
292
+ {
293
+ "epoch": 0.04181084198385236,
294
+ "grad_norm": 11.6875,
295
+ "kd_loss": 0.259765625,
296
+ "learning_rate": 9.663049028182112e-06,
297
+ "loss": 0.3423,
298
+ "step": 145,
299
+ "student_loss": 0.11146840453147888,
300
+ "teacher_loss": 0.0059180548414587975
301
+ },
302
+ {
303
+ "epoch": 0.04325259515570934,
304
+ "grad_norm": 7.09375,
305
+ "kd_loss": 0.2333984375,
306
+ "learning_rate": 9.633218339615433e-06,
307
+ "loss": 0.3012,
308
+ "step": 150,
309
+ "student_loss": 0.001806147862225771,
310
+ "teacher_loss": 0.001238134689629078
311
+ },
312
+ {
313
+ "epoch": 0.04469434832756632,
314
+ "grad_norm": 6.84375,
315
+ "kd_loss": 0.201171875,
316
+ "learning_rate": 9.602172669838721e-06,
317
+ "loss": 0.3157,
318
+ "step": 155,
319
+ "student_loss": 0.0033397674560546875,
320
+ "teacher_loss": 0.0009877807460725307
321
+ },
322
+ {
323
+ "epoch": 0.0461361014994233,
324
+ "grad_norm": 7.0625,
325
+ "kd_loss": 0.22265625,
326
+ "learning_rate": 9.569920160040815e-06,
327
+ "loss": 0.2821,
328
+ "step": 160,
329
+ "student_loss": 0.22769513726234436,
330
+ "teacher_loss": 0.041321102529764175
331
+ },
332
+ {
333
+ "epoch": 0.04757785467128028,
334
+ "grad_norm": 5.96875,
335
+ "kd_loss": 0.1943359375,
336
+ "learning_rate": 9.536469267883432e-06,
337
+ "loss": 0.2727,
338
+ "step": 165,
339
+ "student_loss": 0.00636801915243268,
340
+ "teacher_loss": 0.00474645895883441
341
+ },
342
+ {
343
+ "epoch": 0.049019607843137254,
344
+ "grad_norm": 6.375,
345
+ "kd_loss": 0.166015625,
346
+ "learning_rate": 9.501828765283295e-06,
347
+ "loss": 0.2549,
348
+ "step": 170,
349
+ "student_loss": 0.00595078757032752,
350
+ "teacher_loss": 0.002222016453742981
351
+ },
352
+ {
353
+ "epoch": 0.05046136101499423,
354
+ "grad_norm": 5.875,
355
+ "kd_loss": 0.154296875,
356
+ "learning_rate": 9.466007736111846e-06,
357
+ "loss": 0.249,
358
+ "step": 175,
359
+ "student_loss": 0.0170090701431036,
360
+ "teacher_loss": 0.001197761739604175
361
+ },
362
+ {
363
+ "epoch": 0.05190311418685121,
364
+ "grad_norm": 6.5,
365
+ "kd_loss": 0.15234375,
366
+ "learning_rate": 9.429015573813163e-06,
367
+ "loss": 0.2458,
368
+ "step": 180,
369
+ "student_loss": 0.0050661382265388966,
370
+ "teacher_loss": 0.002651061164215207
371
+ },
372
+ {
373
+ "epoch": 0.05334486735870819,
374
+ "grad_norm": 4.25,
375
+ "kd_loss": 0.169921875,
376
+ "learning_rate": 9.390861978940687e-06,
377
+ "loss": 0.2313,
378
+ "step": 185,
379
+ "student_loss": 0.3254599869251251,
380
+ "teacher_loss": 0.0007031414425000548
381
+ },
382
+ {
383
+ "epoch": 0.05478662053056517,
384
+ "grad_norm": 7.71875,
385
+ "kd_loss": 0.177734375,
386
+ "learning_rate": 9.351556956613423e-06,
387
+ "loss": 0.2414,
388
+ "step": 190,
389
+ "student_loss": 0.003914886154234409,
390
+ "teacher_loss": 0.001010580570437014
391
+ },
392
+ {
393
+ "epoch": 0.056228373702422146,
394
+ "grad_norm": 6.84375,
395
+ "kd_loss": 0.1435546875,
396
+ "learning_rate": 9.31111081389227e-06,
397
+ "loss": 0.2156,
398
+ "step": 195,
399
+ "student_loss": 0.06280706077814102,
400
+ "teacher_loss": 0.0005038110539317131
401
+ },
402
+ {
403
+ "epoch": 0.05767012687427912,
404
+ "grad_norm": 3.5625,
405
+ "kd_loss": 0.140625,
406
+ "learning_rate": 9.269534157077177e-06,
407
+ "loss": 0.1981,
408
+ "step": 200,
409
+ "student_loss": 0.002929725218564272,
410
+ "teacher_loss": 0.0005093662184663117
411
+ },
412
+ {
413
+ "epoch": 0.0591118800461361,
414
+ "grad_norm": 4.96875,
415
+ "kd_loss": 0.150390625,
416
+ "learning_rate": 9.226837888925813e-06,
417
+ "loss": 0.2157,
418
+ "step": 205,
419
+ "student_loss": 0.00641452893614769,
420
+ "teacher_loss": 0.03549756482243538
421
+ },
422
+ {
423
+ "epoch": 0.06055363321799308,
424
+ "grad_norm": 6.375,
425
+ "kd_loss": 0.1318359375,
426
+ "learning_rate": 9.183033205794525e-06,
427
+ "loss": 0.2094,
428
+ "step": 210,
429
+ "student_loss": 0.13332298398017883,
430
+ "teacher_loss": 0.0007853205897845328
431
+ },
432
+ {
433
+ "epoch": 0.061995386389850055,
434
+ "grad_norm": 6.46875,
435
+ "kd_loss": 0.1259765625,
436
+ "learning_rate": 9.13813159470227e-06,
437
+ "loss": 0.2075,
438
+ "step": 215,
439
+ "student_loss": 0.020972760394215584,
440
+ "teacher_loss": 0.0003814305819105357
441
+ },
442
+ {
443
+ "epoch": 0.06343713956170703,
444
+ "grad_norm": 7.8125,
445
+ "kd_loss": 0.1357421875,
446
+ "learning_rate": 9.092144830318357e-06,
447
+ "loss": 0.2394,
448
+ "step": 220,
449
+ "student_loss": 0.3116561472415924,
450
+ "teacher_loss": 0.008577825501561165
451
+ },
452
+ {
453
+ "epoch": 0.06487889273356401,
454
+ "grad_norm": 6.125,
455
+ "kd_loss": 0.11572265625,
456
+ "learning_rate": 9.045084971874738e-06,
457
+ "loss": 0.2193,
458
+ "step": 225,
459
+ "student_loss": 0.17015616595745087,
460
+ "teacher_loss": 0.0012981987092643976
461
+ },
462
+ {
463
+ "epoch": 0.06632064590542099,
464
+ "grad_norm": 5.0625,
465
+ "kd_loss": 0.11962890625,
466
+ "learning_rate": 8.99696436000368e-06,
467
+ "loss": 0.1794,
468
+ "step": 230,
469
+ "student_loss": 0.00222015380859375,
470
+ "teacher_loss": 0.003056387882679701
471
+ },
472
+ {
473
+ "epoch": 0.06776239907727798,
474
+ "grad_norm": 4.9375,
475
+ "kd_loss": 0.15234375,
476
+ "learning_rate": 8.947795613501658e-06,
477
+ "loss": 0.2096,
478
+ "step": 235,
479
+ "student_loss": 0.005798778962343931,
480
+ "teacher_loss": 0.00040503445779904723
481
+ },
482
+ {
483
+ "epoch": 0.06920415224913495,
484
+ "grad_norm": 4.34375,
485
+ "kd_loss": 0.1103515625,
486
+ "learning_rate": 8.897591626020284e-06,
487
+ "loss": 0.2009,
488
+ "step": 240,
489
+ "student_loss": 0.0019766136538237333,
490
+ "teacher_loss": 0.0007733534439466894
491
+ },
492
+ {
493
+ "epoch": 0.07064590542099193,
494
+ "grad_norm": 4.875,
495
+ "kd_loss": 0.1376953125,
496
+ "learning_rate": 8.846365562685178e-06,
497
+ "loss": 0.1982,
498
+ "step": 245,
499
+ "student_loss": 0.002786125522106886,
500
+ "teacher_loss": 0.0018817130476236343
501
+ },
502
+ {
503
+ "epoch": 0.07208765859284891,
504
+ "grad_norm": 5.0625,
505
+ "kd_loss": 0.1162109375,
506
+ "learning_rate": 8.794130856643635e-06,
507
+ "loss": 0.1736,
508
+ "step": 250,
509
+ "student_loss": 0.003304118989035487,
510
+ "teacher_loss": 0.0007230375776998699
511
+ },
512
+ {
513
+ "epoch": 0.07352941176470588,
514
+ "grad_norm": 6.875,
515
+ "kd_loss": 0.12890625,
516
+ "learning_rate": 8.74090120554202e-06,
517
+ "loss": 0.2065,
518
+ "step": 255,
519
+ "student_loss": 0.0011023435508832335,
520
+ "teacher_loss": 0.0011509901378303766
521
+ },
522
+ {
523
+ "epoch": 0.07497116493656286,
524
+ "grad_norm": 4.9375,
525
+ "kd_loss": 0.1259765625,
526
+ "learning_rate": 8.686690567933803e-06,
527
+ "loss": 0.2005,
528
+ "step": 260,
529
+ "student_loss": 0.001012590597383678,
530
+ "teacher_loss": 0.035696372389793396
531
+ },
532
+ {
533
+ "epoch": 0.07641291810841984,
534
+ "grad_norm": 4.8125,
535
+ "kd_loss": 0.11669921875,
536
+ "learning_rate": 8.63151315961915e-06,
537
+ "loss": 0.1962,
538
+ "step": 265,
539
+ "student_loss": 0.001487129949964583,
540
+ "teacher_loss": 0.0009277480421587825
541
+ },
542
+ {
543
+ "epoch": 0.07785467128027682,
544
+ "grad_norm": 4.84375,
545
+ "kd_loss": 0.1298828125,
546
+ "learning_rate": 8.575383449917103e-06,
547
+ "loss": 0.196,
548
+ "step": 270,
549
+ "student_loss": 0.016224455088377,
550
+ "teacher_loss": 0.0017208521021530032
551
+ },
552
+ {
553
+ "epoch": 0.07929642445213379,
554
+ "grad_norm": 6.5625,
555
+ "kd_loss": 0.12451171875,
556
+ "learning_rate": 8.518316157871232e-06,
557
+ "loss": 0.2031,
558
+ "step": 275,
559
+ "student_loss": 0.0010892748832702637,
560
+ "teacher_loss": 0.035929761826992035
561
+ },
562
+ {
563
+ "epoch": 0.08073817762399077,
564
+ "grad_norm": 4.90625,
565
+ "kd_loss": 0.1279296875,
566
+ "learning_rate": 8.460326248389825e-06,
567
+ "loss": 0.217,
568
+ "step": 280,
569
+ "student_loss": 0.0011528143659234047,
570
+ "teacher_loss": 0.0003817000542767346
571
+ },
572
+ {
573
+ "epoch": 0.08217993079584775,
574
+ "grad_norm": 6.625,
575
+ "kd_loss": 0.11572265625,
576
+ "learning_rate": 8.401428928321607e-06,
577
+ "loss": 0.206,
578
+ "step": 285,
579
+ "student_loss": 0.0025788608472794294,
580
+ "teacher_loss": 0.001419232808984816
581
+ },
582
+ {
583
+ "epoch": 0.08362168396770472,
584
+ "grad_norm": 4.59375,
585
+ "kd_loss": 0.13671875,
586
+ "learning_rate": 8.341639642468002e-06,
587
+ "loss": 0.2497,
588
+ "step": 290,
589
+ "student_loss": 0.012355645187199116,
590
+ "teacher_loss": 0.004034335725009441
591
+ },
592
+ {
593
+ "epoch": 0.0850634371395617,
594
+ "grad_norm": 6.875,
595
+ "kd_loss": 0.1259765625,
596
+ "learning_rate": 8.280974069532999e-06,
597
+ "loss": 0.1999,
598
+ "step": 295,
599
+ "student_loss": 0.0029606728348881006,
600
+ "teacher_loss": 0.0018796900985762477
601
+ },
602
+ {
603
+ "epoch": 0.08650519031141868,
604
+ "grad_norm": 7.40625,
605
+ "kd_loss": 0.12451171875,
606
+ "learning_rate": 8.219448118011687e-06,
607
+ "loss": 0.1898,
608
+ "step": 300,
609
+ "student_loss": 0.05260760709643364,
610
+ "teacher_loss": 0.0028262475971132517
611
+ },
612
+ {
613
+ "epoch": 0.08794694348327567,
614
+ "grad_norm": 5.75,
615
+ "kd_loss": 0.11474609375,
616
+ "learning_rate": 8.157077922018537e-06,
617
+ "loss": 0.1993,
618
+ "step": 305,
619
+ "student_loss": 0.07766762375831604,
620
+ "teacher_loss": 0.0020029095467180014
621
+ },
622
+ {
623
+ "epoch": 0.08938869665513265,
624
+ "grad_norm": 3.796875,
625
+ "kd_loss": 0.10595703125,
626
+ "learning_rate": 8.093879837056486e-06,
627
+ "loss": 0.1971,
628
+ "step": 310,
629
+ "student_loss": 0.0014958757674321532,
630
+ "teacher_loss": 0.0014671633252874017
631
+ },
632
+ {
633
+ "epoch": 0.09083044982698962,
634
+ "grad_norm": 7.15625,
635
+ "kd_loss": 0.1103515625,
636
+ "learning_rate": 8.029870435728018e-06,
637
+ "loss": 0.204,
638
+ "step": 315,
639
+ "student_loss": 0.2129756659269333,
640
+ "teacher_loss": 0.0017302327323704958
641
+ },
642
+ {
643
+ "epoch": 0.0922722029988466,
644
+ "grad_norm": 5.0,
645
+ "kd_loss": 0.10400390625,
646
+ "learning_rate": 7.965066503389264e-06,
647
+ "loss": 0.2075,
648
+ "step": 320,
649
+ "student_loss": 0.0016070405254140496,
650
+ "teacher_loss": 0.0009597347816452384
651
+ },
652
+ {
653
+ "epoch": 0.09371395617070358,
654
+ "grad_norm": 6.59375,
655
+ "kd_loss": 0.171875,
656
+ "learning_rate": 7.89948503374835e-06,
657
+ "loss": 0.1917,
658
+ "step": 325,
659
+ "student_loss": 0.003808736801147461,
660
+ "teacher_loss": 0.01614256761968136
661
+ },
662
+ {
663
+ "epoch": 0.09515570934256055,
664
+ "grad_norm": 6.03125,
665
+ "kd_loss": 0.115234375,
666
+ "learning_rate": 7.833143224409076e-06,
667
+ "loss": 0.2017,
668
+ "step": 330,
669
+ "student_loss": 0.021043118089437485,
670
+ "teacher_loss": 0.0004570994933601469
671
+ },
672
+ {
673
+ "epoch": 0.09659746251441753,
674
+ "grad_norm": 4.21875,
675
+ "kd_loss": 0.12109375,
676
+ "learning_rate": 7.766058472361154e-06,
677
+ "loss": 0.176,
678
+ "step": 335,
679
+ "student_loss": 0.0018800155958160758,
680
+ "teacher_loss": 0.0022506280802190304
681
+ },
682
+ {
683
+ "epoch": 0.09803921568627451,
684
+ "grad_norm": 5.03125,
685
+ "kd_loss": 0.1083984375,
686
+ "learning_rate": 7.698248369418146e-06,
687
+ "loss": 0.1834,
688
+ "step": 340,
689
+ "student_loss": 0.0805514007806778,
690
+ "teacher_loss": 0.006284959614276886
691
+ },
692
+ {
693
+ "epoch": 0.09948096885813149,
694
+ "grad_norm": 5.6875,
695
+ "kd_loss": 0.10888671875,
696
+ "learning_rate": 7.629730697604314e-06,
697
+ "loss": 0.2074,
698
+ "step": 345,
699
+ "student_loss": 0.13202711939811707,
700
+ "teacher_loss": 0.0005228903028182685
701
+ },
702
+ {
703
+ "epoch": 0.10092272202998846,
704
+ "grad_norm": 5.25,
705
+ "kd_loss": 0.12890625,
706
+ "learning_rate": 7.560523424491595e-06,
707
+ "loss": 0.1831,
708
+ "step": 350,
709
+ "student_loss": 0.081477090716362,
710
+ "teacher_loss": 0.0003300213429611176
711
+ },
712
+ {
713
+ "epoch": 0.10236447520184544,
714
+ "grad_norm": 3.796875,
715
+ "kd_loss": 0.10693359375,
716
+ "learning_rate": 7.490644698487909e-06,
717
+ "loss": 0.1853,
718
+ "step": 355,
719
+ "student_loss": 0.002148553030565381,
720
+ "teacher_loss": 0.0014131986536085606
721
+ },
722
+ {
723
+ "epoch": 0.10380622837370242,
724
+ "grad_norm": 5.65625,
725
+ "kd_loss": 0.111328125,
726
+ "learning_rate": 7.420112844078066e-06,
727
+ "loss": 0.1865,
728
+ "step": 360,
729
+ "student_loss": 0.18958300352096558,
730
+ "teacher_loss": 0.001118413987569511
731
+ },
732
+ {
733
+ "epoch": 0.1052479815455594,
734
+ "grad_norm": 7.625,
735
+ "kd_loss": 0.1123046875,
736
+ "learning_rate": 7.348946357018479e-06,
737
+ "loss": 0.1824,
738
+ "step": 365,
739
+ "student_loss": 0.0013260500272735953,
740
+ "teacher_loss": 0.029322339221835136
741
+ },
742
+ {
743
+ "epoch": 0.10668973471741638,
744
+ "grad_norm": 4.875,
745
+ "kd_loss": 0.1318359375,
746
+ "learning_rate": 7.277163899486975e-06,
747
+ "loss": 0.189,
748
+ "step": 370,
749
+ "student_loss": 0.16300825774669647,
750
+ "teacher_loss": 0.000458209979115054
751
+ },
752
+ {
753
+ "epoch": 0.10813148788927336,
754
+ "grad_norm": 5.875,
755
+ "kd_loss": 0.15625,
756
+ "learning_rate": 7.204784295188959e-06,
757
+ "loss": 0.1865,
758
+ "step": 375,
759
+ "student_loss": 0.3203120529651642,
760
+ "teacher_loss": 0.02350226417183876
761
+ },
762
+ {
763
+ "epoch": 0.10957324106113034,
764
+ "grad_norm": 4.34375,
765
+ "kd_loss": 0.10546875,
766
+ "learning_rate": 7.1318265244212305e-06,
767
+ "loss": 0.1864,
768
+ "step": 380,
769
+ "student_loss": 0.0027754041366279125,
770
+ "teacher_loss": 0.001105214236304164
771
+ },
772
+ {
773
+ "epoch": 0.11101499423298732,
774
+ "grad_norm": 4.8125,
775
+ "kd_loss": 0.1171875,
776
+ "learning_rate": 7.05830971909472e-06,
777
+ "loss": 0.1872,
778
+ "step": 385,
779
+ "student_loss": 0.001283544348552823,
780
+ "teacher_loss": 0.0009954526321962476
781
+ },
782
+ {
783
+ "epoch": 0.11245674740484429,
784
+ "grad_norm": 5.25,
785
+ "kd_loss": 0.125,
786
+ "learning_rate": 6.9842531577174865e-06,
787
+ "loss": 0.1764,
788
+ "step": 390,
789
+ "student_loss": 0.0008525378652848303,
790
+ "teacher_loss": 0.001041764859110117
791
+ },
792
+ {
793
+ "epoch": 0.11389850057670127,
794
+ "grad_norm": 6.59375,
795
+ "kd_loss": 0.1171875,
796
+ "learning_rate": 6.9096762603392595e-06,
797
+ "loss": 0.195,
798
+ "step": 395,
799
+ "student_loss": 0.0015218615299090743,
800
+ "teacher_loss": 0.0010145456762984395
801
+ },
802
+ {
803
+ "epoch": 0.11534025374855825,
804
+ "grad_norm": 4.6875,
805
+ "kd_loss": 0.12451171875,
806
+ "learning_rate": 6.834598583458862e-06,
807
+ "loss": 0.1822,
808
+ "step": 400,
809
+ "student_loss": 0.0013333633542060852,
810
+ "teacher_loss": 0.00030270780553109944
811
+ },
812
+ {
813
+ "epoch": 0.11678200692041522,
814
+ "grad_norm": 2.703125,
815
+ "kd_loss": 0.103515625,
816
+ "learning_rate": 6.7590398148958625e-06,
817
+ "loss": 0.196,
818
+ "step": 405,
819
+ "student_loss": 0.026790756732225418,
820
+ "teacher_loss": 0.0005501789273694158
821
+ },
822
+ {
823
+ "epoch": 0.1182237600922722,
824
+ "grad_norm": 8.125,
825
+ "kd_loss": 0.1787109375,
826
+ "learning_rate": 6.6830197686277945e-06,
827
+ "loss": 0.2152,
828
+ "step": 410,
829
+ "student_loss": 0.5714533925056458,
830
+ "teacher_loss": 0.008862318471074104
831
+ },
832
+ {
833
+ "epoch": 0.11966551326412918,
834
+ "grad_norm": 4.1875,
835
+ "kd_loss": 0.1279296875,
836
+ "learning_rate": 6.6065583795942625e-06,
837
+ "loss": 0.2006,
838
+ "step": 415,
839
+ "student_loss": 0.019465278834104538,
840
+ "teacher_loss": 0.0042519038543105125
841
+ },
842
+ {
843
+ "epoch": 0.12110726643598616,
844
+ "grad_norm": 5.21875,
845
+ "kd_loss": 0.1181640625,
846
+ "learning_rate": 6.52967569846937e-06,
847
+ "loss": 0.1764,
848
+ "step": 420,
849
+ "student_loss": 0.008550797589123249,
850
+ "teacher_loss": 0.023795029148459435
851
+ },
852
+ {
853
+ "epoch": 0.12254901960784313,
854
+ "grad_norm": 8.5625,
855
+ "kd_loss": 0.126953125,
856
+ "learning_rate": 6.452391886403767e-06,
857
+ "loss": 0.1854,
858
+ "step": 425,
859
+ "student_loss": 0.04949700087308884,
860
+ "teacher_loss": 0.004153509624302387
861
+ },
862
+ {
863
+ "epoch": 0.12399077277970011,
864
+ "grad_norm": 7.25,
865
+ "kd_loss": 0.18359375,
866
+ "learning_rate": 6.374727209737743e-06,
867
+ "loss": 0.2107,
868
+ "step": 430,
869
+ "student_loss": 0.002016805112361908,
870
+ "teacher_loss": 0.06358348578214645
871
+ },
872
+ {
873
+ "epoch": 0.1254325259515571,
874
+ "grad_norm": 6.21875,
875
+ "kd_loss": 0.11083984375,
876
+ "learning_rate": 6.296702034686726e-06,
877
+ "loss": 0.1934,
878
+ "step": 435,
879
+ "student_loss": 0.0017790297279134393,
880
+ "teacher_loss": 0.0017155191162601113
881
+ },
882
+ {
883
+ "epoch": 0.12687427912341406,
884
+ "grad_norm": 4.125,
885
+ "kd_loss": 0.18359375,
886
+ "learning_rate": 6.218336822000598e-06,
887
+ "loss": 0.2068,
888
+ "step": 440,
889
+ "student_loss": 0.565355658531189,
890
+ "teacher_loss": 0.008367877453565598
891
+ },
892
+ {
893
+ "epoch": 0.12831603229527105,
894
+ "grad_norm": 5.09375,
895
+ "kd_loss": 0.1357421875,
896
+ "learning_rate": 6.139652121598219e-06,
897
+ "loss": 0.2072,
898
+ "step": 445,
899
+ "student_loss": 0.00040582873043604195,
900
+ "teacher_loss": 0.017554111778736115
901
+ },
902
+ {
903
+ "epoch": 0.12975778546712802,
904
+ "grad_norm": 4.03125,
905
+ "kd_loss": 0.1396484375,
906
+ "learning_rate": 6.060668567178561e-06,
907
+ "loss": 0.194,
908
+ "step": 450,
909
+ "student_loss": 0.0020873546600341797,
910
+ "teacher_loss": 0.0007538718055002391
911
+ },
912
+ {
913
+ "epoch": 0.131199538638985,
914
+ "grad_norm": 4.09375,
915
+ "kd_loss": 0.10595703125,
916
+ "learning_rate": 5.981406870809889e-06,
917
+ "loss": 0.1896,
918
+ "step": 455,
919
+ "student_loss": 0.010908172465860844,
920
+ "teacher_loss": 0.0012792785419151187
921
+ },
922
+ {
923
+ "epoch": 0.13264129181084197,
924
+ "grad_norm": 6.6875,
925
+ "kd_loss": 0.12255859375,
926
+ "learning_rate": 5.9018878174983674e-06,
927
+ "loss": 0.1893,
928
+ "step": 460,
929
+ "student_loss": 0.01578596420586109,
930
+ "teacher_loss": 0.0009114979766309261
931
+ },
932
+ {
933
+ "epoch": 0.13408304498269896,
934
+ "grad_norm": 4.4375,
935
+ "kd_loss": 0.14453125,
936
+ "learning_rate": 5.822132259737565e-06,
937
+ "loss": 0.2189,
938
+ "step": 465,
939
+ "student_loss": 0.0021727595012634993,
940
+ "teacher_loss": 0.0004909814451821148
941
+ },
942
+ {
943
+ "epoch": 0.13552479815455595,
944
+ "grad_norm": 5.09375,
945
+ "kd_loss": 0.1005859375,
946
+ "learning_rate": 5.742161112040237e-06,
947
+ "loss": 0.2169,
948
+ "step": 470,
949
+ "student_loss": 0.0009243786334991455,
950
+ "teacher_loss": 0.000744891760405153
951
+ },
952
+ {
953
+ "epoch": 0.13696655132641292,
954
+ "grad_norm": 4.9375,
955
+ "kd_loss": 0.1396484375,
956
+ "learning_rate": 5.661995345453867e-06,
957
+ "loss": 0.1752,
958
+ "step": 475,
959
+ "student_loss": 0.003345559583976865,
960
+ "teacher_loss": 0.0005117281689308584
961
+ },
962
+ {
963
+ "epoch": 0.1384083044982699,
964
+ "grad_norm": 4.46875,
965
+ "kd_loss": 0.1376953125,
966
+ "learning_rate": 5.581655982061367e-06,
967
+ "loss": 0.211,
968
+ "step": 480,
969
+ "student_loss": 0.0018078088760375977,
970
+ "teacher_loss": 0.029633358120918274
971
+ },
972
+ {
973
+ "epoch": 0.13985005767012687,
974
+ "grad_norm": 6.59375,
975
+ "kd_loss": 0.11767578125,
976
+ "learning_rate": 5.501164089468406e-06,
977
+ "loss": 0.1795,
978
+ "step": 485,
979
+ "student_loss": 0.3106631636619568,
980
+ "teacher_loss": 0.0037072307895869017
981
+ },
982
+ {
983
+ "epoch": 0.14129181084198386,
984
+ "grad_norm": 6.0,
985
+ "kd_loss": 0.1328125,
986
+ "learning_rate": 5.4205407752787884e-06,
987
+ "loss": 0.1896,
988
+ "step": 490,
989
+ "student_loss": 0.0010533903259783983,
990
+ "teacher_loss": 0.0012765543069690466
991
+ },
992
+ {
993
+ "epoch": 0.14273356401384082,
994
+ "grad_norm": 5.25,
995
+ "kd_loss": 0.1376953125,
996
+ "learning_rate": 5.339807181559359e-06,
997
+ "loss": 0.194,
998
+ "step": 495,
999
+ "student_loss": 0.08354002982378006,
1000
+ "teacher_loss": 0.0006759578245691955
1001
+ },
1002
+ {
1003
+ "epoch": 0.14417531718569782,
1004
+ "grad_norm": 6.125,
1005
+ "kd_loss": 0.12060546875,
1006
+ "learning_rate": 5.258984479295853e-06,
1007
+ "loss": 0.1865,
1008
+ "step": 500,
1009
+ "student_loss": 0.003352736122906208,
1010
+ "teacher_loss": 0.0017620434518903494
1011
+ },
1012
+ {
1013
+ "epoch": 0.14561707035755478,
1014
+ "grad_norm": 5.71875,
1015
+ "kd_loss": 0.11572265625,
1016
+ "learning_rate": 5.1780938628411795e-06,
1017
+ "loss": 0.2201,
1018
+ "step": 505,
1019
+ "student_loss": 0.002489902079105377,
1020
+ "teacher_loss": 0.0007855825824663043
1021
+ },
1022
+ {
1023
+ "epoch": 0.14705882352941177,
1024
+ "grad_norm": 5.59375,
1025
+ "kd_loss": 0.10498046875,
1026
+ "learning_rate": 5.097156544357567e-06,
1027
+ "loss": 0.2023,
1028
+ "step": 510,
1029
+ "student_loss": 0.0014551215572282672,
1030
+ "teacher_loss": 0.0008335533202625811
1031
+ },
1032
+ {
1033
+ "epoch": 0.14850057670126873,
1034
+ "grad_norm": 4.75,
1035
+ "kd_loss": 0.12353515625,
1036
+ "learning_rate": 5.016193748254045e-06,
1037
+ "loss": 0.1779,
1038
+ "step": 515,
1039
+ "student_loss": 0.01823529414832592,
1040
+ "teacher_loss": 0.0005908762104809284
1041
+ },
1042
+ {
1043
+ "epoch": 0.14994232987312572,
1044
+ "grad_norm": 3.875,
1045
+ "kd_loss": 0.1181640625,
1046
+ "learning_rate": 4.935226705620699e-06,
1047
+ "loss": 0.1875,
1048
+ "step": 520,
1049
+ "student_loss": 0.48260822892189026,
1050
+ "teacher_loss": 0.011817601509392262
1051
+ },
1052
+ {
1053
+ "epoch": 0.1513840830449827,
1054
+ "grad_norm": 3.828125,
1055
+ "kd_loss": 0.1142578125,
1056
+ "learning_rate": 4.8542766486612035e-06,
1057
+ "loss": 0.179,
1058
+ "step": 525,
1059
+ "student_loss": 0.0011587169719859958,
1060
+ "teacher_loss": 0.0004872040299233049
1061
+ },
1062
+ {
1063
+ "epoch": 0.15282583621683968,
1064
+ "grad_norm": 5.78125,
1065
+ "kd_loss": 0.11767578125,
1066
+ "learning_rate": 4.773364805125025e-06,
1067
+ "loss": 0.1752,
1068
+ "step": 530,
1069
+ "student_loss": 0.0030523419845849276,
1070
+ "teacher_loss": 0.0013172540348023176
1071
+ },
1072
+ {
1073
+ "epoch": 0.15426758938869667,
1074
+ "grad_norm": 3.1875,
1075
+ "kd_loss": 0.10302734375,
1076
+ "learning_rate": 4.6925123927408265e-06,
1077
+ "loss": 0.1654,
1078
+ "step": 535,
1079
+ "student_loss": 0.0017982972785830498,
1080
+ "teacher_loss": 0.00047424182412214577
1081
+ },
1082
+ {
1083
+ "epoch": 0.15570934256055363,
1084
+ "grad_norm": 5.59375,
1085
+ "kd_loss": 0.11962890625,
1086
+ "learning_rate": 4.611740613652485e-06,
1087
+ "loss": 0.1655,
1088
+ "step": 540,
1089
+ "student_loss": 0.013529052957892418,
1090
+ "teacher_loss": 0.0009085286292247474
1091
+ },
1092
+ {
1093
+ "epoch": 0.15715109573241062,
1094
+ "grad_norm": 9.0,
1095
+ "kd_loss": 0.115234375,
1096
+ "learning_rate": 4.531070648859186e-06,
1097
+ "loss": 0.1973,
1098
+ "step": 545,
1099
+ "student_loss": 0.004623454995453358,
1100
+ "teacher_loss": 0.007325619924813509
1101
+ },
1102
+ {
1103
+ "epoch": 0.15859284890426759,
1104
+ "grad_norm": 5.8125,
1105
+ "kd_loss": 0.126953125,
1106
+ "learning_rate": 4.450523652661086e-06,
1107
+ "loss": 0.1622,
1108
+ "step": 550,
1109
+ "student_loss": 0.0009506479254923761,
1110
+ "teacher_loss": 0.008248833939433098
1111
+ },
1112
+ {
1113
+ "epoch": 0.16003460207612458,
1114
+ "grad_norm": 4.34375,
1115
+ "kd_loss": 0.11376953125,
1116
+ "learning_rate": 4.370120747111956e-06,
1117
+ "loss": 0.1848,
1118
+ "step": 555,
1119
+ "student_loss": 0.005332108587026596,
1120
+ "teacher_loss": 0.0016086830291897058
1121
+ },
1122
+ {
1123
+ "epoch": 0.16147635524798154,
1124
+ "grad_norm": 10.375,
1125
+ "kd_loss": 0.1201171875,
1126
+ "learning_rate": 4.289883016480291e-06,
1127
+ "loss": 0.2032,
1128
+ "step": 560,
1129
+ "student_loss": 0.12518270313739777,
1130
+ "teacher_loss": 0.0005838426877744496
1131
+ },
1132
+ {
1133
+ "epoch": 0.16291810841983853,
1134
+ "grad_norm": 6.0625,
1135
+ "kd_loss": 0.12353515625,
1136
+ "learning_rate": 4.209831501720328e-06,
1137
+ "loss": 0.1825,
1138
+ "step": 565,
1139
+ "student_loss": 0.029691526666283607,
1140
+ "teacher_loss": 0.021172240376472473
1141
+ },
1142
+ {
1143
+ "epoch": 0.1643598615916955,
1144
+ "grad_norm": 8.9375,
1145
+ "kd_loss": 0.1708984375,
1146
+ "learning_rate": 4.129987194954421e-06,
1147
+ "loss": 0.189,
1148
+ "step": 570,
1149
+ "student_loss": 0.13193272054195404,
1150
+ "teacher_loss": 0.00722926901653409
1151
+ },
1152
+ {
1153
+ "epoch": 0.16580161476355249,
1154
+ "grad_norm": 7.84375,
1155
+ "kd_loss": 0.10302734375,
1156
+ "learning_rate": 4.050371033968216e-06,
1157
+ "loss": 0.1851,
1158
+ "step": 575,
1159
+ "student_loss": 0.0010820090537890792,
1160
+ "teacher_loss": 0.0006335995858535171
1161
+ },
1162
+ {
1163
+ "epoch": 0.16724336793540945,
1164
+ "grad_norm": 4.96875,
1165
+ "kd_loss": 0.330078125,
1166
+ "learning_rate": 3.9710038967200825e-06,
1167
+ "loss": 0.1666,
1168
+ "step": 580,
1169
+ "student_loss": 0.0031647607684135437,
1170
+ "teacher_loss": 0.0028759294655174017
1171
+ },
1172
+ {
1173
+ "epoch": 0.16868512110726644,
1174
+ "grad_norm": 3.1875,
1175
+ "kd_loss": 0.1142578125,
1176
+ "learning_rate": 3.89190659586623e-06,
1177
+ "loss": 0.1868,
1178
+ "step": 585,
1179
+ "student_loss": 0.040488943457603455,
1180
+ "teacher_loss": 0.0005194384139031172
1181
+ },
1182
+ {
1183
+ "epoch": 0.1701268742791234,
1184
+ "grad_norm": 7.5625,
1185
+ "kd_loss": 0.11181640625,
1186
+ "learning_rate": 3.8130998733029517e-06,
1187
+ "loss": 0.1949,
1188
+ "step": 590,
1189
+ "student_loss": 0.00226792530156672,
1190
+ "teacher_loss": 0.0028023580089211464
1191
+ },
1192
+ {
1193
+ "epoch": 0.1715686274509804,
1194
+ "grad_norm": 4.0625,
1195
+ "kd_loss": 0.11181640625,
1196
+ "learning_rate": 3.734604394727419e-06,
1197
+ "loss": 0.2049,
1198
+ "step": 595,
1199
+ "student_loss": 0.0009982635965570807,
1200
+ "teacher_loss": 0.001097838394343853
1201
+ },
1202
+ {
1203
+ "epoch": 0.17301038062283736,
1204
+ "grad_norm": 6.5625,
1205
+ "kd_loss": 0.12353515625,
1206
+ "learning_rate": 3.656440744218464e-06,
1207
+ "loss": 0.1982,
1208
+ "step": 600,
1209
+ "student_loss": 0.34091848134994507,
1210
+ "teacher_loss": 0.009622580371797085
1211
+ },
1212
+ {
1213
+ "epoch": 0.17445213379469435,
1214
+ "grad_norm": 6.625,
1215
+ "kd_loss": 0.1181640625,
1216
+ "learning_rate": 3.578629418838757e-06,
1217
+ "loss": 0.1972,
1218
+ "step": 605,
1219
+ "student_loss": 0.2827480435371399,
1220
+ "teacher_loss": 0.039488162845373154
1221
+ },
1222
+ {
1223
+ "epoch": 0.17589388696655134,
1224
+ "grad_norm": 4.96875,
1225
+ "kd_loss": 0.1201171875,
1226
+ "learning_rate": 3.5011908232598124e-06,
1227
+ "loss": 0.1603,
1228
+ "step": 610,
1229
+ "student_loss": 0.106364406645298,
1230
+ "teacher_loss": 0.0008834014879539609
1231
+ },
1232
+ {
1233
+ "epoch": 0.1773356401384083,
1234
+ "grad_norm": 4.75,
1235
+ "kd_loss": 0.1240234375,
1236
+ "learning_rate": 3.4241452644112085e-06,
1237
+ "loss": 0.1596,
1238
+ "step": 615,
1239
+ "student_loss": 0.0008959124679677188,
1240
+ "teacher_loss": 0.0007150223245844245
1241
+ },
1242
+ {
1243
+ "epoch": 0.1787773933102653,
1244
+ "grad_norm": 3.34375,
1245
+ "kd_loss": 0.1533203125,
1246
+ "learning_rate": 3.3475129461554567e-06,
1247
+ "loss": 0.1941,
1248
+ "step": 620,
1249
+ "student_loss": 0.00717555359005928,
1250
+ "teacher_loss": 0.008744844235479832
1251
+ },
1252
+ {
1253
+ "epoch": 0.18021914648212226,
1254
+ "grad_norm": 4.46875,
1255
+ "kd_loss": 0.109375,
1256
+ "learning_rate": 3.271313963989886e-06,
1257
+ "loss": 0.1711,
1258
+ "step": 625,
1259
+ "student_loss": 0.007324306294322014,
1260
+ "teacher_loss": 0.005477549973875284
1261
+ },
1262
+ {
1263
+ "epoch": 0.18166089965397925,
1264
+ "grad_norm": 4.875,
1265
+ "kd_loss": 0.11962890625,
1266
+ "learning_rate": 3.195568299776945e-06,
1267
+ "loss": 0.1813,
1268
+ "step": 630,
1269
+ "student_loss": 0.1220104992389679,
1270
+ "teacher_loss": 0.005055113695561886
1271
+ },
1272
+ {
1273
+ "epoch": 0.1831026528258362,
1274
+ "grad_norm": 4.1875,
1275
+ "kd_loss": 0.1005859375,
1276
+ "learning_rate": 3.1202958165043053e-06,
1277
+ "loss": 0.2012,
1278
+ "step": 635,
1279
+ "student_loss": 0.0011498430976644158,
1280
+ "teacher_loss": 0.0006288467557169497
1281
+ },
1282
+ {
1283
+ "epoch": 0.1845444059976932,
1284
+ "grad_norm": 4.84375,
1285
+ "kd_loss": 0.11767578125,
1286
+ "learning_rate": 3.045516253076137e-06,
1287
+ "loss": 0.1779,
1288
+ "step": 640,
1289
+ "student_loss": 0.0011653146939352155,
1290
+ "teacher_loss": 0.0009915747214108706
1291
+ },
1292
+ {
1293
+ "epoch": 0.18598615916955016,
1294
+ "grad_norm": 9.125,
1295
+ "kd_loss": 0.119140625,
1296
+ "learning_rate": 2.9712492191369245e-06,
1297
+ "loss": 0.1795,
1298
+ "step": 645,
1299
+ "student_loss": 0.004314988851547241,
1300
+ "teacher_loss": 0.0008632438839413226
1301
+ },
1302
+ {
1303
+ "epoch": 0.18742791234140715,
1304
+ "grad_norm": 5.71875,
1305
+ "kd_loss": 0.138671875,
1306
+ "learning_rate": 2.8975141899291777e-06,
1307
+ "loss": 0.1767,
1308
+ "step": 650,
1309
+ "student_loss": 0.004328088369220495,
1310
+ "teacher_loss": 0.0019480936462059617
1311
+ },
1312
+ {
1313
+ "epoch": 0.18886966551326412,
1314
+ "grad_norm": 5.3125,
1315
+ "kd_loss": 0.1064453125,
1316
+ "learning_rate": 2.8243305011863843e-06,
1317
+ "loss": 0.1858,
1318
+ "step": 655,
1319
+ "student_loss": 0.07007281482219696,
1320
+ "teacher_loss": 0.002063432242721319
1321
+ },
1322
+ {
1323
+ "epoch": 0.1903114186851211,
1324
+ "grad_norm": 5.34375,
1325
+ "kd_loss": 0.12060546875,
1326
+ "learning_rate": 2.751717344062552e-06,
1327
+ "loss": 0.1979,
1328
+ "step": 660,
1329
+ "student_loss": 0.0020055994391441345,
1330
+ "teacher_loss": 0.0012256632326170802
1331
+ },
1332
+ {
1333
+ "epoch": 0.19175317185697807,
1334
+ "grad_norm": 4.96875,
1335
+ "kd_loss": 0.1259765625,
1336
+ "learning_rate": 2.6796937600996587e-06,
1337
+ "loss": 0.1824,
1338
+ "step": 665,
1339
+ "student_loss": 0.0013414303539320827,
1340
+ "teacher_loss": 0.0005829873844049871
1341
+ },
1342
+ {
1343
+ "epoch": 0.19319492502883506,
1344
+ "grad_norm": 6.21875,
1345
+ "kd_loss": 0.11181640625,
1346
+ "learning_rate": 2.6082786362343377e-06,
1347
+ "loss": 0.2091,
1348
+ "step": 670,
1349
+ "student_loss": 0.01750928722321987,
1350
+ "teacher_loss": 0.01849350705742836
1351
+ },
1352
+ {
1353
+ "epoch": 0.19463667820069205,
1354
+ "grad_norm": 5.9375,
1355
+ "kd_loss": 0.1083984375,
1356
+ "learning_rate": 2.5374906998451094e-06,
1357
+ "loss": 0.1855,
1358
+ "step": 675,
1359
+ "student_loss": 0.0015071257948875427,
1360
+ "teacher_loss": 0.0012101252796128392
1361
+ },
1362
+ {
1363
+ "epoch": 0.19607843137254902,
1364
+ "grad_norm": 3.984375,
1365
+ "kd_loss": 0.14453125,
1366
+ "learning_rate": 2.467348513841447e-06,
1367
+ "loss": 0.1808,
1368
+ "step": 680,
1369
+ "student_loss": 0.09338736534118652,
1370
+ "teacher_loss": 0.038049884140491486
1371
+ },
1372
+ {
1373
+ "epoch": 0.197520184544406,
1374
+ "grad_norm": 3.75,
1375
+ "kd_loss": 0.177734375,
1376
+ "learning_rate": 2.3978704717959777e-06,
1377
+ "loss": 0.1863,
1378
+ "step": 685,
1379
+ "student_loss": 0.0010167881846427917,
1380
+ "teacher_loss": 0.03185700252652168
1381
+ },
1382
+ {
1383
+ "epoch": 0.19896193771626297,
1384
+ "grad_norm": 3.984375,
1385
+ "kd_loss": 0.11572265625,
1386
+ "learning_rate": 2.329074793121085e-06,
1387
+ "loss": 0.1721,
1388
+ "step": 690,
1389
+ "student_loss": 0.017529672011733055,
1390
+ "teacher_loss": 0.008856060914695263
1391
+ },
1392
+ {
1393
+ "epoch": 0.20040369088811996,
1394
+ "grad_norm": 6.65625,
1395
+ "kd_loss": 0.1171875,
1396
+ "learning_rate": 2.260979518291186e-06,
1397
+ "loss": 0.2122,
1398
+ "step": 695,
1399
+ "student_loss": 0.028023820370435715,
1400
+ "teacher_loss": 0.005092882085591555
1401
+ },
1402
+ {
1403
+ "epoch": 0.20184544405997693,
1404
+ "grad_norm": 4.84375,
1405
+ "kd_loss": 0.1015625,
1406
+ "learning_rate": 2.1936025041119268e-06,
1407
+ "loss": 0.1889,
1408
+ "step": 700,
1409
+ "student_loss": 0.004128373693674803,
1410
+ "teacher_loss": 0.0009403788135387003
1411
+ },
1412
+ {
1413
+ "epoch": 0.20328719723183392,
1414
+ "grad_norm": 5.03125,
1415
+ "kd_loss": 0.12451171875,
1416
+ "learning_rate": 2.1269614190375477e-06,
1417
+ "loss": 0.1781,
1418
+ "step": 705,
1419
+ "student_loss": 0.0009666193509474397,
1420
+ "teacher_loss": 0.0012199536431580782
1421
+ },
1422
+ {
1423
+ "epoch": 0.20472895040369088,
1424
+ "grad_norm": 5.46875,
1425
+ "kd_loss": 0.12890625,
1426
+ "learning_rate": 2.061073738537635e-06,
1427
+ "loss": 0.214,
1428
+ "step": 710,
1429
+ "student_loss": 0.20427227020263672,
1430
+ "teacher_loss": 0.003926330246031284
1431
+ },
1432
+ {
1433
+ "epoch": 0.20617070357554787,
1434
+ "grad_norm": 4.875,
1435
+ "kd_loss": 0.10693359375,
1436
+ "learning_rate": 1.9959567405144825e-06,
1437
+ "loss": 0.2167,
1438
+ "step": 715,
1439
+ "student_loss": 0.034982144832611084,
1440
+ "teacher_loss": 0.012018893845379353
1441
+ },
1442
+ {
1443
+ "epoch": 0.20761245674740483,
1444
+ "grad_norm": 4.90625,
1445
+ "kd_loss": 0.140625,
1446
+ "learning_rate": 1.931627500772263e-06,
1447
+ "loss": 0.1911,
1448
+ "step": 720,
1449
+ "student_loss": 0.0010867691598832607,
1450
+ "teacher_loss": 0.005945001263171434
1451
+ },
1452
+ {
1453
+ "epoch": 0.20905420991926182,
1454
+ "grad_norm": 3.375,
1455
+ "kd_loss": 0.1171875,
1456
+ "learning_rate": 1.8681028885391905e-06,
1457
+ "loss": 0.1776,
1458
+ "step": 725,
1459
+ "student_loss": 0.0011550028575584292,
1460
+ "teacher_loss": 0.0008522791904397309
1461
+ },
1462
+ {
1463
+ "epoch": 0.2104959630911188,
1464
+ "grad_norm": 4.625,
1465
+ "kd_loss": 0.111328125,
1466
+ "learning_rate": 1.8053995620438625e-06,
1467
+ "loss": 0.1726,
1468
+ "step": 730,
1469
+ "student_loss": 0.03152952715754509,
1470
+ "teacher_loss": 0.00232282024808228
1471
+ },
1472
+ {
1473
+ "epoch": 0.21193771626297578,
1474
+ "grad_norm": 4.8125,
1475
+ "kd_loss": 0.11328125,
1476
+ "learning_rate": 1.743533964146924e-06,
1477
+ "loss": 0.1841,
1478
+ "step": 735,
1479
+ "student_loss": 0.0014305273070931435,
1480
+ "teacher_loss": 0.0003280507226008922
1481
+ },
1482
+ {
1483
+ "epoch": 0.21337946943483277,
1484
+ "grad_norm": 6.9375,
1485
+ "kd_loss": 0.138671875,
1486
+ "learning_rate": 1.6825223180292138e-06,
1487
+ "loss": 0.1674,
1488
+ "step": 740,
1489
+ "student_loss": 0.00881188828498125,
1490
+ "teacher_loss": 0.00042216398287564516
1491
+ },
1492
+ {
1493
+ "epoch": 0.21482122260668973,
1494
+ "grad_norm": 6.71875,
1495
+ "kd_loss": 0.12158203125,
1496
+ "learning_rate": 1.6223806229375182e-06,
1497
+ "loss": 0.1744,
1498
+ "step": 745,
1499
+ "student_loss": 0.16076479852199554,
1500
+ "teacher_loss": 0.021096019074320793
1501
+ },
1502
+ {
1503
+ "epoch": 0.21626297577854672,
1504
+ "grad_norm": 6.28125,
1505
+ "kd_loss": 0.12890625,
1506
+ "learning_rate": 1.563124649989043e-06,
1507
+ "loss": 0.1968,
1508
+ "step": 750,
1509
+ "student_loss": 0.11668777465820312,
1510
+ "teacher_loss": 0.006358537822961807
1511
+ },
1512
+ {
1513
+ "epoch": 0.2177047289504037,
1514
+ "grad_norm": 5.4375,
1515
+ "kd_loss": 0.0966796875,
1516
+ "learning_rate": 1.5047699380357134e-06,
1517
+ "loss": 0.1956,
1518
+ "step": 755,
1519
+ "student_loss": 0.11215528845787048,
1520
+ "teacher_loss": 0.0056000882759690285
1521
+ },
1522
+ {
1523
+ "epoch": 0.21914648212226068,
1524
+ "grad_norm": 5.125,
1525
+ "kd_loss": 0.10107421875,
1526
+ "learning_rate": 1.4473317895893773e-06,
1527
+ "loss": 0.1792,
1528
+ "step": 760,
1529
+ "student_loss": 0.26354143023490906,
1530
+ "teacher_loss": 0.0006397212855517864
1531
+ },
1532
+ {
1533
+ "epoch": 0.22058823529411764,
1534
+ "grad_norm": 4.53125,
1535
+ "kd_loss": 0.12060546875,
1536
+ "learning_rate": 1.39082526680899e-06,
1537
+ "loss": 0.1971,
1538
+ "step": 765,
1539
+ "student_loss": 0.08930032700300217,
1540
+ "teacher_loss": 0.0006192077999003232
1541
+ },
1542
+ {
1543
+ "epoch": 0.22202998846597463,
1544
+ "grad_norm": 3.6875,
1545
+ "kd_loss": 0.1142578125,
1546
+ "learning_rate": 1.3352651875508204e-06,
1547
+ "loss": 0.1708,
1548
+ "step": 770,
1549
+ "student_loss": 0.004422426223754883,
1550
+ "teacher_loss": 0.0008745273225940764
1551
+ },
1552
+ {
1553
+ "epoch": 0.2234717416378316,
1554
+ "grad_norm": 5.4375,
1555
+ "kd_loss": 0.11328125,
1556
+ "learning_rate": 1.2806661214827286e-06,
1557
+ "loss": 0.1885,
1558
+ "step": 775,
1559
+ "student_loss": 0.0018669217824935913,
1560
+ "teacher_loss": 0.0006652303854934871
1561
+ },
1562
+ {
1563
+ "epoch": 0.22491349480968859,
1564
+ "grad_norm": 2.9375,
1565
+ "kd_loss": 0.1083984375,
1566
+ "learning_rate": 1.2270423862635188e-06,
1567
+ "loss": 0.1836,
1568
+ "step": 780,
1569
+ "student_loss": 0.009803751483559608,
1570
+ "teacher_loss": 0.0004910013522021472
1571
+ },
1572
+ {
1573
+ "epoch": 0.22635524798154555,
1574
+ "grad_norm": 5.6875,
1575
+ "kd_loss": 0.09912109375,
1576
+ "learning_rate": 1.1744080437883859e-06,
1577
+ "loss": 0.1669,
1578
+ "step": 785,
1579
+ "student_loss": 0.0010796686401590705,
1580
+ "teacher_loss": 0.001094150822609663
1581
+ },
1582
+ {
1583
+ "epoch": 0.22779700115340254,
1584
+ "grad_norm": 6.0625,
1585
+ "kd_loss": 0.1279296875,
1586
+ "learning_rate": 1.1227768965014246e-06,
1587
+ "loss": 0.2026,
1588
+ "step": 790,
1589
+ "student_loss": 0.07495569437742233,
1590
+ "teacher_loss": 0.0015408035833388567
1591
+ },
1592
+ {
1593
+ "epoch": 0.2292387543252595,
1594
+ "grad_norm": 3.390625,
1595
+ "kd_loss": 0.1083984375,
1596
+ "learning_rate": 1.0721624837761768e-06,
1597
+ "loss": 0.1999,
1598
+ "step": 795,
1599
+ "student_loss": 0.0036292201839387417,
1600
+ "teacher_loss": 0.0005642004543915391
1601
+ },
1602
+ {
1603
+ "epoch": 0.2306805074971165,
1604
+ "grad_norm": 6.5625,
1605
+ "kd_loss": 0.18359375,
1606
+ "learning_rate": 1.0225780783651689e-06,
1607
+ "loss": 0.2151,
1608
+ "step": 800,
1609
+ "student_loss": 0.062444765120744705,
1610
+ "teacher_loss": 0.04929126426577568
1611
+ },
1612
+ {
1613
+ "epoch": 0.23212226066897348,
1614
+ "grad_norm": 4.875,
1615
+ "kd_loss": 0.10546875,
1616
+ "learning_rate": 9.740366829193587e-07,
1617
+ "loss": 0.2096,
1618
+ "step": 805,
1619
+ "student_loss": 0.0012999593745917082,
1620
+ "teacher_loss": 0.001006675767712295
1621
+ },
1622
+ {
1623
+ "epoch": 0.23356401384083045,
1624
+ "grad_norm": 6.09375,
1625
+ "kd_loss": 0.109375,
1626
+ "learning_rate": 9.265510265784189e-07,
1627
+ "loss": 0.2063,
1628
+ "step": 810,
1629
+ "student_loss": 0.0013730658683925867,
1630
+ "teacher_loss": 0.00053932867012918
1631
+ },
1632
+ {
1633
+ "epoch": 0.23500576701268744,
1634
+ "grad_norm": 10.0,
1635
+ "kd_loss": 0.103515625,
1636
+ "learning_rate": 8.801335616327378e-07,
1637
+ "loss": 0.1942,
1638
+ "step": 815,
1639
+ "student_loss": 0.05159832164645195,
1640
+ "teacher_loss": 0.010285490192472935
1641
+ },
1642
+ {
1643
+ "epoch": 0.2364475201845444,
1644
+ "grad_norm": 4.9375,
1645
+ "kd_loss": 0.125,
1646
+ "learning_rate": 8.347964602580245e-07,
1647
+ "loss": 0.1808,
1648
+ "step": 820,
1649
+ "student_loss": 0.037393856793642044,
1650
+ "teacher_loss": 0.0004633679345715791
1651
+ },
1652
+ {
1653
+ "epoch": 0.2378892733564014,
1654
+ "grad_norm": 6.5625,
1655
+ "kd_loss": 0.142578125,
1656
+ "learning_rate": 7.905516113233652e-07,
1657
+ "loss": 0.1747,
1658
+ "step": 825,
1659
+ "student_loss": 0.0011921566911041737,
1660
+ "teacher_loss": 0.021351948380470276
1661
+ },
1662
+ {
1663
+ "epoch": 0.23933102652825836,
1664
+ "grad_norm": 3.484375,
1665
+ "kd_loss": 0.111328125,
1666
+ "learning_rate": 7.474106172735746e-07,
1667
+ "loss": 0.1797,
1668
+ "step": 830,
1669
+ "student_loss": 0.03779162839055061,
1670
+ "teacher_loss": 0.003403074573725462
1671
+ },
1672
+ {
1673
+ "epoch": 0.24077277970011535,
1674
+ "grad_norm": 6.03125,
1675
+ "kd_loss": 0.10009765625,
1676
+ "learning_rate": 7.053847910866513e-07,
1677
+ "loss": 0.1667,
1678
+ "step": 835,
1679
+ "student_loss": 0.11626744270324707,
1680
+ "teacher_loss": 0.00203131721355021
1681
+ },
1682
+ {
1683
+ "epoch": 0.2422145328719723,
1684
+ "grad_norm": 4.28125,
1685
+ "kd_loss": 0.1376953125,
1686
+ "learning_rate": 6.644851533071556e-07,
1687
+ "loss": 0.1761,
1688
+ "step": 840,
1689
+ "student_loss": 0.0023884603288024664,
1690
+ "teacher_loss": 0.0004405094077810645
1691
+ },
1692
+ {
1693
+ "epoch": 0.2436562860438293,
1694
+ "grad_norm": 8.6875,
1695
+ "kd_loss": 0.140625,
1696
+ "learning_rate": 6.24722429156251e-07,
1697
+ "loss": 0.2435,
1698
+ "step": 845,
1699
+ "student_loss": 0.14598870277404785,
1700
+ "teacher_loss": 0.0010793671244755387
1701
+ },
1702
+ {
1703
+ "epoch": 0.24509803921568626,
1704
+ "grad_norm": 4.6875,
1705
+ "kd_loss": 0.103515625,
1706
+ "learning_rate": 5.861070457192081e-07,
1707
+ "loss": 0.186,
1708
+ "step": 850,
1709
+ "student_loss": 0.06827586144208908,
1710
+ "teacher_loss": 0.00046239409130066633
1711
+ },
1712
+ {
1713
+ "epoch": 0.24653979238754326,
1714
+ "grad_norm": 3.203125,
1715
+ "kd_loss": 0.125,
1716
+ "learning_rate": 5.486491292110796e-07,
1717
+ "loss": 0.1726,
1718
+ "step": 855,
1719
+ "student_loss": 0.0007081849034875631,
1720
+ "teacher_loss": 0.0005193519755266607
1721
+ },
1722
+ {
1723
+ "epoch": 0.24798154555940022,
1724
+ "grad_norm": 4.40625,
1725
+ "kd_loss": 0.11474609375,
1726
+ "learning_rate": 5.123585023212785e-07,
1727
+ "loss": 0.2129,
1728
+ "step": 860,
1729
+ "student_loss": 0.002149001695215702,
1730
+ "teacher_loss": 0.0017558797262609005
1731
+ },
1732
+ {
1733
+ "epoch": 0.2494232987312572,
1734
+ "grad_norm": 3.921875,
1735
+ "kd_loss": 0.111328125,
1736
+ "learning_rate": 4.772446816377408e-07,
1737
+ "loss": 0.1792,
1738
+ "step": 865,
1739
+ "student_loss": 0.0010927807306870818,
1740
+ "teacher_loss": 0.0006015551625750959
1741
+ },
1742
+ {
1743
+ "epoch": 0.2508650519031142,
1744
+ "grad_norm": 4.0,
1745
+ "kd_loss": 0.12890625,
1746
+ "learning_rate": 4.4331687515137614e-07,
1747
+ "loss": 0.1958,
1748
+ "step": 870,
1749
+ "student_loss": 0.04162781313061714,
1750
+ "teacher_loss": 0.0015390625922009349
1751
+ },
1752
+ {
1753
+ "epoch": 0.25230680507497116,
1754
+ "grad_norm": 7.9375,
1755
+ "kd_loss": 0.10888671875,
1756
+ "learning_rate": 4.1058397984142405e-07,
1757
+ "loss": 0.1771,
1758
+ "step": 875,
1759
+ "student_loss": 0.0009604351944290102,
1760
+ "teacher_loss": 0.0005376276094466448
1761
+ },
1762
+ {
1763
+ "epoch": 0.2537485582468281,
1764
+ "grad_norm": 3.953125,
1765
+ "kd_loss": 0.1162109375,
1766
+ "learning_rate": 3.790545793423761e-07,
1767
+ "loss": 0.1917,
1768
+ "step": 880,
1769
+ "student_loss": 0.0019418075680732727,
1770
+ "teacher_loss": 0.0007360613089986145
1771
+ },
1772
+ {
1773
+ "epoch": 0.25519031141868515,
1774
+ "grad_norm": 3.25,
1775
+ "kd_loss": 0.1435546875,
1776
+ "learning_rate": 3.4873694169306915e-07,
1777
+ "loss": 0.1832,
1778
+ "step": 885,
1779
+ "student_loss": 0.017352323979139328,
1780
+ "teacher_loss": 0.05972852557897568
1781
+ },
1782
+ {
1783
+ "epoch": 0.2566320645905421,
1784
+ "grad_norm": 4.75,
1785
+ "kd_loss": 0.111328125,
1786
+ "learning_rate": 3.196390171685343e-07,
1787
+ "loss": 0.1981,
1788
+ "step": 890,
1789
+ "student_loss": 0.0014260741882026196,
1790
+ "teacher_loss": 0.0012508125510066748
1791
+ },
1792
+ {
1793
+ "epoch": 0.25807381776239907,
1794
+ "grad_norm": 4.25,
1795
+ "kd_loss": 0.1435546875,
1796
+ "learning_rate": 2.917684361951728e-07,
1797
+ "loss": 0.1799,
1798
+ "step": 895,
1799
+ "student_loss": 0.014447882771492004,
1800
+ "teacher_loss": 0.0008786572143435478
1801
+ },
1802
+ {
1803
+ "epoch": 0.25951557093425603,
1804
+ "grad_norm": 4.3125,
1805
+ "kd_loss": 0.1240234375,
1806
+ "learning_rate": 2.65132507349814e-07,
1807
+ "loss": 0.2243,
1808
+ "step": 900,
1809
+ "student_loss": 0.001752070034854114,
1810
+ "teacher_loss": 0.028893902897834778
1811
+ },
1812
+ {
1813
+ "epoch": 0.26095732410611305,
1814
+ "grad_norm": 3.890625,
1815
+ "kd_loss": 0.10888671875,
1816
+ "learning_rate": 2.397382154431621e-07,
1817
+ "loss": 0.1707,
1818
+ "step": 905,
1819
+ "student_loss": 0.07611552625894547,
1820
+ "teacher_loss": 0.0018923009047284722
1821
+ },
1822
+ {
1823
+ "epoch": 0.26239907727797,
1824
+ "grad_norm": 3.84375,
1825
+ "kd_loss": 0.130859375,
1826
+ "learning_rate": 2.1559221968815547e-07,
1827
+ "loss": 0.1867,
1828
+ "step": 910,
1829
+ "student_loss": 0.0012913525570183992,
1830
+ "teacher_loss": 0.0014879581285640597
1831
+ },
1832
+ {
1833
+ "epoch": 0.263840830449827,
1834
+ "grad_norm": 5.1875,
1835
+ "kd_loss": 0.11669921875,
1836
+ "learning_rate": 1.9270085195370048e-07,
1837
+ "loss": 0.1647,
1838
+ "step": 915,
1839
+ "student_loss": 0.03545321896672249,
1840
+ "teacher_loss": 0.0013070907443761826
1841
+ },
1842
+ {
1843
+ "epoch": 0.26528258362168394,
1844
+ "grad_norm": 3.390625,
1845
+ "kd_loss": 0.1630859375,
1846
+ "learning_rate": 1.7107011510424766e-07,
1847
+ "loss": 0.1914,
1848
+ "step": 920,
1849
+ "student_loss": 0.013124704360961914,
1850
+ "teacher_loss": 0.015305249951779842
1851
+ },
1852
+ {
1853
+ "epoch": 0.26672433679354096,
1854
+ "grad_norm": 3.84375,
1855
+ "kd_loss": 0.1064453125,
1856
+ "learning_rate": 1.5070568142564912e-07,
1857
+ "loss": 0.1662,
1858
+ "step": 925,
1859
+ "student_loss": 0.0009416788816452026,
1860
+ "teacher_loss": 0.0009266760898754001
1861
+ },
1862
+ {
1863
+ "epoch": 0.2681660899653979,
1864
+ "grad_norm": 5.90625,
1865
+ "kd_loss": 0.11376953125,
1866
+ "learning_rate": 1.3161289113769405e-07,
1867
+ "loss": 0.1771,
1868
+ "step": 930,
1869
+ "student_loss": 0.04463067650794983,
1870
+ "teacher_loss": 0.0009097782894968987
1871
+ },
1872
+ {
1873
+ "epoch": 0.2696078431372549,
1874
+ "grad_norm": 5.59375,
1875
+ "kd_loss": 0.126953125,
1876
+ "learning_rate": 1.1379675099373489e-07,
1877
+ "loss": 0.1749,
1878
+ "step": 935,
1879
+ "student_loss": 0.02378019131720066,
1880
+ "teacher_loss": 0.0022270630579441786
1881
+ },
1882
+ {
1883
+ "epoch": 0.2710495963091119,
1884
+ "grad_norm": 5.78125,
1885
+ "kd_loss": 0.2138671875,
1886
+ "learning_rate": 9.726193296774767e-08,
1887
+ "loss": 0.1876,
1888
+ "step": 940,
1889
+ "student_loss": 0.002607885980978608,
1890
+ "teacher_loss": 0.0031158654019236565
1891
+ },
1892
+ {
1893
+ "epoch": 0.27249134948096887,
1894
+ "grad_norm": 7.5625,
1895
+ "kd_loss": 0.10595703125,
1896
+ "learning_rate": 8.201277302919086e-08,
1897
+ "loss": 0.1904,
1898
+ "step": 945,
1899
+ "student_loss": 0.07421658933162689,
1900
+ "teacher_loss": 0.002074115676805377
1901
+ },
1902
+ {
1903
+ "epoch": 0.27393310265282583,
1904
+ "grad_norm": 3.203125,
1905
+ "kd_loss": 0.13671875,
1906
+ "learning_rate": 6.805327000596995e-08,
1907
+ "loss": 0.17,
1908
+ "step": 950,
1909
+ "student_loss": 0.005115017760545015,
1910
+ "teacher_loss": 0.0004907959373667836
1911
+ },
1912
+ {
1913
+ "epoch": 0.2753748558246828,
1914
+ "grad_norm": 4.5625,
1915
+ "kd_loss": 0.095703125,
1916
+ "learning_rate": 5.538708453581787e-08,
1917
+ "loss": 0.1903,
1918
+ "step": 955,
1919
+ "student_loss": 0.011558901518583298,
1920
+ "teacher_loss": 0.0004379775491543114
1921
+ },
1922
+ {
1923
+ "epoch": 0.2768166089965398,
1924
+ "grad_norm": 7.9375,
1925
+ "kd_loss": 0.12451171875,
1926
+ "learning_rate": 4.40175381063529e-08,
1927
+ "loss": 0.1861,
1928
+ "step": 960,
1929
+ "student_loss": 0.0017829686403274536,
1930
+ "teacher_loss": 0.0036916364915668964
1931
+ },
1932
+ {
1933
+ "epoch": 0.2782583621683968,
1934
+ "grad_norm": 4.96875,
1935
+ "kd_loss": 0.1728515625,
1936
+ "learning_rate": 3.394761218407705e-08,
1937
+ "loss": 0.2026,
1938
+ "step": 965,
1939
+ "student_loss": 0.2359560877084732,
1940
+ "teacher_loss": 0.008938993327319622
1941
+ },
1942
+ {
1943
+ "epoch": 0.27970011534025374,
1944
+ "grad_norm": 4.375,
1945
+ "kd_loss": 0.1123046875,
1946
+ "learning_rate": 2.5179947432540376e-08,
1947
+ "loss": 0.1889,
1948
+ "step": 970,
1949
+ "student_loss": 0.0006114219431765378,
1950
+ "teacher_loss": 0.00041617779061198235
1951
+ },
1952
+ {
1953
+ "epoch": 0.2811418685121107,
1954
+ "grad_norm": 4.5,
1955
+ "kd_loss": 0.10888671875,
1956
+ "learning_rate": 1.7716843019867646e-08,
1957
+ "loss": 0.1982,
1958
+ "step": 975,
1959
+ "student_loss": 0.07514014840126038,
1960
+ "teacher_loss": 0.001045848592184484
1961
+ },
1962
+ {
1963
+ "epoch": 0.2825836216839677,
1964
+ "grad_norm": 5.5,
1965
+ "kd_loss": 0.107421875,
1966
+ "learning_rate": 1.156025601584676e-08,
1967
+ "loss": 0.1779,
1968
+ "step": 980,
1969
+ "student_loss": 0.0017143742879852653,
1970
+ "teacher_loss": 0.0004985960549674928
1971
+ },
1972
+ {
1973
+ "epoch": 0.2840253748558247,
1974
+ "grad_norm": 6.0625,
1975
+ "kd_loss": 0.107421875,
1976
+ "learning_rate": 6.711800878718144e-09,
1977
+ "loss": 0.1914,
1978
+ "step": 985,
1979
+ "student_loss": 0.0008509993785992265,
1980
+ "teacher_loss": 0.0007709055789746344
1981
+ },
1982
+ {
1983
+ "epoch": 0.28546712802768165,
1984
+ "grad_norm": 11.8125,
1985
+ "kd_loss": 0.0986328125,
1986
+ "learning_rate": 3.1727490318111953e-09,
1987
+ "loss": 0.1871,
1988
+ "step": 990,
1989
+ "student_loss": 0.007576147560030222,
1990
+ "teacher_loss": 0.015172326937317848
1991
+ },
1992
+ {
1993
+ "epoch": 0.2869088811995386,
1994
+ "grad_norm": 4.625,
1995
+ "kd_loss": 0.1318359375,
1996
+ "learning_rate": 9.440285301370865e-10,
1997
+ "loss": 0.2025,
1998
+ "step": 995,
1999
+ "student_loss": 0.0030154017731547356,
2000
+ "teacher_loss": 0.0012625143863260746
2001
+ },
2002
+ {
2003
+ "epoch": 0.28835063437139563,
2004
+ "grad_norm": 5.40625,
2005
+ "kd_loss": 0.10546875,
2006
+ "learning_rate": 2.622381702066523e-11,
2007
+ "loss": 0.1678,
2008
+ "step": 1000,
2009
+ "student_loss": 0.08579359203577042,
2010
+ "teacher_loss": 0.0004960879450663924
2011
+ },
2012
+ {
2013
+ "epoch": 0.28835063437139563,
2014
+ "kd_loss": 0.10546875,
2015
+ "step": 1000,
2016
+ "student_loss": 0.08579359203577042,
2017
+ "teacher_loss": 0.0004960879450663924,
2018
+ "total_flos": 0.0,
2019
+ "train_loss": 0.2803848307132721,
2020
+ "train_runtime": 4898.5131,
2021
+ "train_samples_per_second": 3.266,
2022
+ "train_steps_per_second": 0.204
2023
+ }
2024
+ ],
2025
+ "logging_steps": 5,
2026
+ "max_steps": 1000,
2027
+ "num_input_tokens_seen": 0,
2028
+ "num_train_epochs": 1,
2029
+ "save_steps": 500,
2030
+ "stateful_callbacks": {
2031
+ "TrainerControl": {
2032
+ "args": {
2033
+ "should_epoch_stop": false,
2034
+ "should_evaluate": false,
2035
+ "should_log": false,
2036
+ "should_save": false,
2037
+ "should_training_stop": true
2038
+ },
2039
+ "attributes": {}
2040
+ }
2041
+ },
2042
+ "total_flos": 0.0,
2043
+ "train_batch_size": 1,
2044
+ "trial_name": null,
2045
+ "trial_params": null
2046
+ }
checkpoints/codi-single-1.5b/checkpoint-1000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-1500/added_tokens.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|action_sep|>": 151670,
5
+ "<|arg_sep|>": 151671,
6
+ "<|box_end|>": 151649,
7
+ "<|box_start|>": 151648,
8
+ "<|call_sep|>": 151666,
9
+ "<|end_of_text|>": 151673,
10
+ "<|endoftext|>": 151643,
11
+ "<|exception_sep|>": 151669,
12
+ "<|file_sep|>": 151664,
13
+ "<|fim_middle|>": 151660,
14
+ "<|fim_pad|>": 151662,
15
+ "<|fim_prefix|>": 151659,
16
+ "<|fim_suffix|>": 151661,
17
+ "<|frame_sep|>": 151672,
18
+ "<|im_end|>": 151645,
19
+ "<|im_start|>": 151644,
20
+ "<|image_pad|>": 151655,
21
+ "<|latent_end|>": 151675,
22
+ "<|latent_start|>": 151674,
23
+ "<|line_sep|>": 151667,
24
+ "<|object_ref_end|>": 151647,
25
+ "<|object_ref_start|>": 151646,
26
+ "<|quad_end|>": 151651,
27
+ "<|quad_start|>": 151650,
28
+ "<|repo_name|>": 151663,
29
+ "<|return_sep|>": 151668,
30
+ "<|trace_context_start|>": 151665,
31
+ "<|video_pad|>": 151656,
32
+ "<|vision_end|>": 151653,
33
+ "<|vision_pad|>": 151654,
34
+ "<|vision_start|>": 151652
35
+ }
checkpoints/codi-single-1.5b/checkpoint-1500/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/codi-single-1.5b/checkpoint-1500/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151643,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention"
41
+ ],
42
+ "max_position_embeddings": 32768,
43
+ "max_window_layers": 28,
44
+ "model_type": "qwen2",
45
+ "num_attention_heads": 12,
46
+ "num_hidden_layers": 28,
47
+ "num_key_value_heads": 2,
48
+ "pad_token_id": 151643,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": true,
54
+ "transformers_version": "4.57.6",
55
+ "use_cache": true,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 151676
58
+ }
checkpoints/codi-single-1.5b/checkpoint-1500/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-1500/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be59aa6ffff8591705641c0c9b739d7e2740ee03b67745e2f4965e6b413a33aa
3
+ size 3096212347
checkpoints/codi-single-1.5b/checkpoint-1500/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/codi-single-1.5b/checkpoint-1500/thought_projector.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a534eb9136ea0396a22fe21fafc63f12b93a0455d1bf6fe461b9739ede5ca50e
3
+ size 9445953
checkpoints/codi-single-1.5b/checkpoint-1500/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83a790d654474f5dfe225f889afd0210313eb1083f942671f2c4b8e95a1c922b
3
+ size 11424004
checkpoints/codi-single-1.5b/checkpoint-1500/tokenizer_config.json ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|trace_context_start|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|call_sep|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|line_sep|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<|return_sep|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<|exception_sep|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|action_sep|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<|arg_sep|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<|frame_sep|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<|end_of_text|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<|latent_start|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<|latent_end|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ }
269
+ },
270
+ "additional_special_tokens": [
271
+ "<|im_start|>",
272
+ "<|im_end|>",
273
+ "<|object_ref_start|>",
274
+ "<|object_ref_end|>",
275
+ "<|box_start|>",
276
+ "<|box_end|>",
277
+ "<|quad_start|>",
278
+ "<|quad_end|>",
279
+ "<|vision_start|>",
280
+ "<|vision_end|>",
281
+ "<|vision_pad|>",
282
+ "<|image_pad|>",
283
+ "<|video_pad|>"
284
+ ],
285
+ "bos_token": null,
286
+ "clean_up_tokenization_spaces": false,
287
+ "eos_token": "<|endoftext|>",
288
+ "errors": "replace",
289
+ "extra_special_tokens": {},
290
+ "model_max_length": 32768,
291
+ "pad_token": "<|endoftext|>",
292
+ "split_special_tokens": false,
293
+ "tokenizer_class": "Qwen2Tokenizer",
294
+ "unk_token": null
295
+ }
checkpoints/codi-single-1.5b/checkpoint-1500/trainer_state.json ADDED
@@ -0,0 +1,3046 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.43252595155709345,
6
+ "eval_steps": 500,
7
+ "global_step": 1500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0014417531718569781,
14
+ "grad_norm": 91136.0,
15
+ "kd_loss": 0.53515625,
16
+ "learning_rate": 1.3333333333333334e-06,
17
+ "loss": 1.4874,
18
+ "step": 5,
19
+ "student_loss": 0.6224480271339417,
20
+ "teacher_loss": 0.00038480176590383053
21
+ },
22
+ {
23
+ "epoch": 0.0028835063437139563,
24
+ "grad_norm": 29312.0,
25
+ "kd_loss": 0.54296875,
26
+ "learning_rate": 3e-06,
27
+ "loss": 1.3555,
28
+ "step": 10,
29
+ "student_loss": 0.7942929863929749,
30
+ "teacher_loss": 0.0007905907114036381
31
+ },
32
+ {
33
+ "epoch": 0.004325259515570935,
34
+ "grad_norm": 25088.0,
35
+ "kd_loss": 0.515625,
36
+ "learning_rate": 4.666666666666667e-06,
37
+ "loss": 1.3876,
38
+ "step": 15,
39
+ "student_loss": 0.2712247967720032,
40
+ "teacher_loss": 0.0006367590394802392
41
+ },
42
+ {
43
+ "epoch": 0.0057670126874279125,
44
+ "grad_norm": 11328.0,
45
+ "kd_loss": 0.5234375,
46
+ "learning_rate": 6.333333333333333e-06,
47
+ "loss": 1.3195,
48
+ "step": 20,
49
+ "student_loss": 0.991253137588501,
50
+ "teacher_loss": 0.00029781984630972147
51
+ },
52
+ {
53
+ "epoch": 0.00720876585928489,
54
+ "grad_norm": 4896.0,
55
+ "kd_loss": 0.46875,
56
+ "learning_rate": 8.000000000000001e-06,
57
+ "loss": 1.2722,
58
+ "step": 25,
59
+ "student_loss": 0.24368858337402344,
60
+ "teacher_loss": 0.0009928465588018298
61
+ },
62
+ {
63
+ "epoch": 0.00865051903114187,
64
+ "grad_norm": 35584.0,
65
+ "kd_loss": 0.48828125,
66
+ "learning_rate": 9.666666666666667e-06,
67
+ "loss": 1.2823,
68
+ "step": 30,
69
+ "student_loss": 0.7547083497047424,
70
+ "teacher_loss": 0.0003548521490301937
71
+ },
72
+ {
73
+ "epoch": 0.010092272202998846,
74
+ "grad_norm": 11584.0,
75
+ "kd_loss": 0.458984375,
76
+ "learning_rate": 9.99958042442916e-06,
77
+ "loss": 1.1068,
78
+ "step": 35,
79
+ "student_loss": 0.48039907217025757,
80
+ "teacher_loss": 0.00026277394499629736
81
+ },
82
+ {
83
+ "epoch": 0.011534025374855825,
84
+ "grad_norm": 33536.0,
85
+ "kd_loss": 0.5,
86
+ "learning_rate": 9.997876019358083e-06,
87
+ "loss": 1.174,
88
+ "step": 40,
89
+ "student_loss": 0.7645292282104492,
90
+ "teacher_loss": 0.07346436381340027
91
+ },
92
+ {
93
+ "epoch": 0.012975778546712802,
94
+ "grad_norm": 3664.0,
95
+ "kd_loss": 0.478515625,
96
+ "learning_rate": 9.99486100792044e-06,
97
+ "loss": 0.9907,
98
+ "step": 45,
99
+ "student_loss": 1.340754508972168,
100
+ "teacher_loss": 0.0009389458573423326
101
+ },
102
+ {
103
+ "epoch": 0.01441753171856978,
104
+ "grad_norm": 1584.0,
105
+ "kd_loss": 0.486328125,
106
+ "learning_rate": 9.990536180750724e-06,
107
+ "loss": 0.9709,
108
+ "step": 50,
109
+ "student_loss": 0.31134045124053955,
110
+ "teacher_loss": 0.00372113729827106
111
+ },
112
+ {
113
+ "epoch": 0.015859284890426758,
114
+ "grad_norm": 672.0,
115
+ "kd_loss": 0.44921875,
116
+ "learning_rate": 9.984902671959911e-06,
117
+ "loss": 0.7719,
118
+ "step": 55,
119
+ "student_loss": 0.11045902222394943,
120
+ "teacher_loss": 0.0004030822601635009
121
+ },
122
+ {
123
+ "epoch": 0.01730103806228374,
124
+ "grad_norm": 648.0,
125
+ "kd_loss": 0.447265625,
126
+ "learning_rate": 9.97796195883804e-06,
127
+ "loss": 0.725,
128
+ "step": 60,
129
+ "student_loss": 0.15876038372516632,
130
+ "teacher_loss": 0.0018200079211965203
131
+ },
132
+ {
133
+ "epoch": 0.018742791234140715,
134
+ "grad_norm": 490.0,
135
+ "kd_loss": 0.447265625,
136
+ "learning_rate": 9.969715861466839e-06,
137
+ "loss": 0.6844,
138
+ "step": 65,
139
+ "student_loss": 0.2451532483100891,
140
+ "teacher_loss": 0.056328896433115005
141
+ },
142
+ {
143
+ "epoch": 0.020184544405997693,
144
+ "grad_norm": 210.0,
145
+ "kd_loss": 0.4453125,
146
+ "learning_rate": 9.96016654224243e-06,
147
+ "loss": 0.6719,
148
+ "step": 70,
149
+ "student_loss": 0.036203403025865555,
150
+ "teacher_loss": 0.0021477588452398777
151
+ },
152
+ {
153
+ "epoch": 0.02162629757785467,
154
+ "grad_norm": 366.0,
155
+ "kd_loss": 0.44921875,
156
+ "learning_rate": 9.94931650530827e-06,
157
+ "loss": 0.6026,
158
+ "step": 75,
159
+ "student_loss": 0.009898507036268711,
160
+ "teacher_loss": 0.0007659209077246487
161
+ },
162
+ {
163
+ "epoch": 0.02306805074971165,
164
+ "grad_norm": 1816.0,
165
+ "kd_loss": 0.462890625,
166
+ "learning_rate": 9.93716859589851e-06,
167
+ "loss": 0.6433,
168
+ "step": 80,
169
+ "student_loss": 0.030777160078287125,
170
+ "teacher_loss": 0.0008218760485760868
171
+ },
172
+ {
173
+ "epoch": 0.024509803921568627,
174
+ "grad_norm": 360.0,
175
+ "kd_loss": 0.470703125,
176
+ "learning_rate": 9.923725999591846e-06,
177
+ "loss": 0.5588,
178
+ "step": 85,
179
+ "student_loss": 0.07332021743059158,
180
+ "teacher_loss": 0.00044353402336128056
181
+ },
182
+ {
183
+ "epoch": 0.025951557093425604,
184
+ "grad_norm": 732.0,
185
+ "kd_loss": 0.39453125,
186
+ "learning_rate": 9.908992241476189e-06,
187
+ "loss": 0.5723,
188
+ "step": 90,
189
+ "student_loss": 0.07177069783210754,
190
+ "teacher_loss": 0.00044491401058621705
191
+ },
192
+ {
193
+ "epoch": 0.027393310265282585,
194
+ "grad_norm": 250.0,
195
+ "kd_loss": 0.4296875,
196
+ "learning_rate": 9.892971185224244e-06,
197
+ "loss": 0.5615,
198
+ "step": 95,
199
+ "student_loss": 0.25651854276657104,
200
+ "teacher_loss": 0.0204803254455328
201
+ },
202
+ {
203
+ "epoch": 0.02883506343713956,
204
+ "grad_norm": 195.0,
205
+ "kd_loss": 0.42578125,
206
+ "learning_rate": 9.875667032080354e-06,
207
+ "loss": 0.5376,
208
+ "step": 100,
209
+ "student_loss": 0.0018209030386060476,
210
+ "teacher_loss": 0.0005951338680461049
211
+ },
212
+ {
213
+ "epoch": 0.03027681660899654,
214
+ "grad_norm": 74.5,
215
+ "kd_loss": 0.416015625,
216
+ "learning_rate": 9.857084319758772e-06,
217
+ "loss": 0.5554,
218
+ "step": 105,
219
+ "student_loss": 0.009238015860319138,
220
+ "teacher_loss": 0.000378329394152388
221
+ },
222
+ {
223
+ "epoch": 0.031718569780853516,
224
+ "grad_norm": 134.0,
225
+ "kd_loss": 0.3984375,
226
+ "learning_rate": 9.837227921253747e-06,
227
+ "loss": 0.5376,
228
+ "step": 110,
229
+ "student_loss": 0.008933513425290585,
230
+ "teacher_loss": 0.054748453199863434
231
+ },
232
+ {
233
+ "epoch": 0.03316032295271049,
234
+ "grad_norm": 53.5,
235
+ "kd_loss": 0.380859375,
236
+ "learning_rate": 9.816103043561648e-06,
237
+ "loss": 0.5101,
238
+ "step": 115,
239
+ "student_loss": 0.0015179987531155348,
240
+ "teacher_loss": 0.0003994991711806506
241
+ },
242
+ {
243
+ "epoch": 0.03460207612456748,
244
+ "grad_norm": 90.5,
245
+ "kd_loss": 0.34765625,
246
+ "learning_rate": 9.79371522631553e-06,
247
+ "loss": 0.4575,
248
+ "step": 120,
249
+ "student_loss": 0.005917699541896582,
250
+ "teacher_loss": 0.0011113588698208332
251
+ },
252
+ {
253
+ "epoch": 0.036043829296424454,
254
+ "grad_norm": 105.0,
255
+ "kd_loss": 0.333984375,
256
+ "learning_rate": 9.770070340332457e-06,
257
+ "loss": 0.4262,
258
+ "step": 125,
259
+ "student_loss": 0.15691499412059784,
260
+ "teacher_loss": 0.0019031435949727893
261
+ },
262
+ {
263
+ "epoch": 0.03748558246828143,
264
+ "grad_norm": 16.375,
265
+ "kd_loss": 0.3125,
266
+ "learning_rate": 9.745174586073982e-06,
267
+ "loss": 0.4434,
268
+ "step": 130,
269
+ "student_loss": 0.0024075880646705627,
270
+ "teacher_loss": 0.007021783851087093
271
+ },
272
+ {
273
+ "epoch": 0.03892733564013841,
274
+ "grad_norm": 21.625,
275
+ "kd_loss": 0.294921875,
276
+ "learning_rate": 9.719034492020183e-06,
277
+ "loss": 0.3819,
278
+ "step": 135,
279
+ "student_loss": 0.002887926297262311,
280
+ "teacher_loss": 0.00043111268314532936
281
+ },
282
+ {
283
+ "epoch": 0.040369088811995385,
284
+ "grad_norm": 11.875,
285
+ "kd_loss": 0.26953125,
286
+ "learning_rate": 9.691656912957686e-06,
287
+ "loss": 0.3717,
288
+ "step": 140,
289
+ "student_loss": 0.0013271740172058344,
290
+ "teacher_loss": 0.0008426732383668423
291
+ },
292
+ {
293
+ "epoch": 0.04181084198385236,
294
+ "grad_norm": 11.6875,
295
+ "kd_loss": 0.259765625,
296
+ "learning_rate": 9.663049028182112e-06,
297
+ "loss": 0.3423,
298
+ "step": 145,
299
+ "student_loss": 0.11146840453147888,
300
+ "teacher_loss": 0.0059180548414587975
301
+ },
302
+ {
303
+ "epoch": 0.04325259515570934,
304
+ "grad_norm": 7.09375,
305
+ "kd_loss": 0.2333984375,
306
+ "learning_rate": 9.633218339615433e-06,
307
+ "loss": 0.3012,
308
+ "step": 150,
309
+ "student_loss": 0.001806147862225771,
310
+ "teacher_loss": 0.001238134689629078
311
+ },
312
+ {
313
+ "epoch": 0.04469434832756632,
314
+ "grad_norm": 6.84375,
315
+ "kd_loss": 0.201171875,
316
+ "learning_rate": 9.602172669838721e-06,
317
+ "loss": 0.3157,
318
+ "step": 155,
319
+ "student_loss": 0.0033397674560546875,
320
+ "teacher_loss": 0.0009877807460725307
321
+ },
322
+ {
323
+ "epoch": 0.0461361014994233,
324
+ "grad_norm": 7.0625,
325
+ "kd_loss": 0.22265625,
326
+ "learning_rate": 9.569920160040815e-06,
327
+ "loss": 0.2821,
328
+ "step": 160,
329
+ "student_loss": 0.22769513726234436,
330
+ "teacher_loss": 0.041321102529764175
331
+ },
332
+ {
333
+ "epoch": 0.04757785467128028,
334
+ "grad_norm": 5.96875,
335
+ "kd_loss": 0.1943359375,
336
+ "learning_rate": 9.536469267883432e-06,
337
+ "loss": 0.2727,
338
+ "step": 165,
339
+ "student_loss": 0.00636801915243268,
340
+ "teacher_loss": 0.00474645895883441
341
+ },
342
+ {
343
+ "epoch": 0.049019607843137254,
344
+ "grad_norm": 6.375,
345
+ "kd_loss": 0.166015625,
346
+ "learning_rate": 9.501828765283295e-06,
347
+ "loss": 0.2549,
348
+ "step": 170,
349
+ "student_loss": 0.00595078757032752,
350
+ "teacher_loss": 0.002222016453742981
351
+ },
352
+ {
353
+ "epoch": 0.05046136101499423,
354
+ "grad_norm": 5.875,
355
+ "kd_loss": 0.154296875,
356
+ "learning_rate": 9.466007736111846e-06,
357
+ "loss": 0.249,
358
+ "step": 175,
359
+ "student_loss": 0.0170090701431036,
360
+ "teacher_loss": 0.001197761739604175
361
+ },
362
+ {
363
+ "epoch": 0.05190311418685121,
364
+ "grad_norm": 6.5,
365
+ "kd_loss": 0.15234375,
366
+ "learning_rate": 9.429015573813163e-06,
367
+ "loss": 0.2458,
368
+ "step": 180,
369
+ "student_loss": 0.0050661382265388966,
370
+ "teacher_loss": 0.002651061164215207
371
+ },
372
+ {
373
+ "epoch": 0.05334486735870819,
374
+ "grad_norm": 4.25,
375
+ "kd_loss": 0.169921875,
376
+ "learning_rate": 9.390861978940687e-06,
377
+ "loss": 0.2313,
378
+ "step": 185,
379
+ "student_loss": 0.3254599869251251,
380
+ "teacher_loss": 0.0007031414425000548
381
+ },
382
+ {
383
+ "epoch": 0.05478662053056517,
384
+ "grad_norm": 7.71875,
385
+ "kd_loss": 0.177734375,
386
+ "learning_rate": 9.351556956613423e-06,
387
+ "loss": 0.2414,
388
+ "step": 190,
389
+ "student_loss": 0.003914886154234409,
390
+ "teacher_loss": 0.001010580570437014
391
+ },
392
+ {
393
+ "epoch": 0.056228373702422146,
394
+ "grad_norm": 6.84375,
395
+ "kd_loss": 0.1435546875,
396
+ "learning_rate": 9.31111081389227e-06,
397
+ "loss": 0.2156,
398
+ "step": 195,
399
+ "student_loss": 0.06280706077814102,
400
+ "teacher_loss": 0.0005038110539317131
401
+ },
402
+ {
403
+ "epoch": 0.05767012687427912,
404
+ "grad_norm": 3.5625,
405
+ "kd_loss": 0.140625,
406
+ "learning_rate": 9.269534157077177e-06,
407
+ "loss": 0.1981,
408
+ "step": 200,
409
+ "student_loss": 0.002929725218564272,
410
+ "teacher_loss": 0.0005093662184663117
411
+ },
412
+ {
413
+ "epoch": 0.0591118800461361,
414
+ "grad_norm": 4.96875,
415
+ "kd_loss": 0.150390625,
416
+ "learning_rate": 9.226837888925813e-06,
417
+ "loss": 0.2157,
418
+ "step": 205,
419
+ "student_loss": 0.00641452893614769,
420
+ "teacher_loss": 0.03549756482243538
421
+ },
422
+ {
423
+ "epoch": 0.06055363321799308,
424
+ "grad_norm": 6.375,
425
+ "kd_loss": 0.1318359375,
426
+ "learning_rate": 9.183033205794525e-06,
427
+ "loss": 0.2094,
428
+ "step": 210,
429
+ "student_loss": 0.13332298398017883,
430
+ "teacher_loss": 0.0007853205897845328
431
+ },
432
+ {
433
+ "epoch": 0.061995386389850055,
434
+ "grad_norm": 6.46875,
435
+ "kd_loss": 0.1259765625,
436
+ "learning_rate": 9.13813159470227e-06,
437
+ "loss": 0.2075,
438
+ "step": 215,
439
+ "student_loss": 0.020972760394215584,
440
+ "teacher_loss": 0.0003814305819105357
441
+ },
442
+ {
443
+ "epoch": 0.06343713956170703,
444
+ "grad_norm": 7.8125,
445
+ "kd_loss": 0.1357421875,
446
+ "learning_rate": 9.092144830318357e-06,
447
+ "loss": 0.2394,
448
+ "step": 220,
449
+ "student_loss": 0.3116561472415924,
450
+ "teacher_loss": 0.008577825501561165
451
+ },
452
+ {
453
+ "epoch": 0.06487889273356401,
454
+ "grad_norm": 6.125,
455
+ "kd_loss": 0.11572265625,
456
+ "learning_rate": 9.045084971874738e-06,
457
+ "loss": 0.2193,
458
+ "step": 225,
459
+ "student_loss": 0.17015616595745087,
460
+ "teacher_loss": 0.0012981987092643976
461
+ },
462
+ {
463
+ "epoch": 0.06632064590542099,
464
+ "grad_norm": 5.0625,
465
+ "kd_loss": 0.11962890625,
466
+ "learning_rate": 8.99696436000368e-06,
467
+ "loss": 0.1794,
468
+ "step": 230,
469
+ "student_loss": 0.00222015380859375,
470
+ "teacher_loss": 0.003056387882679701
471
+ },
472
+ {
473
+ "epoch": 0.06776239907727798,
474
+ "grad_norm": 4.9375,
475
+ "kd_loss": 0.15234375,
476
+ "learning_rate": 8.947795613501658e-06,
477
+ "loss": 0.2096,
478
+ "step": 235,
479
+ "student_loss": 0.005798778962343931,
480
+ "teacher_loss": 0.00040503445779904723
481
+ },
482
+ {
483
+ "epoch": 0.06920415224913495,
484
+ "grad_norm": 4.34375,
485
+ "kd_loss": 0.1103515625,
486
+ "learning_rate": 8.897591626020284e-06,
487
+ "loss": 0.2009,
488
+ "step": 240,
489
+ "student_loss": 0.0019766136538237333,
490
+ "teacher_loss": 0.0007733534439466894
491
+ },
492
+ {
493
+ "epoch": 0.07064590542099193,
494
+ "grad_norm": 4.875,
495
+ "kd_loss": 0.1376953125,
496
+ "learning_rate": 8.846365562685178e-06,
497
+ "loss": 0.1982,
498
+ "step": 245,
499
+ "student_loss": 0.002786125522106886,
500
+ "teacher_loss": 0.0018817130476236343
501
+ },
502
+ {
503
+ "epoch": 0.07208765859284891,
504
+ "grad_norm": 5.0625,
505
+ "kd_loss": 0.1162109375,
506
+ "learning_rate": 8.794130856643635e-06,
507
+ "loss": 0.1736,
508
+ "step": 250,
509
+ "student_loss": 0.003304118989035487,
510
+ "teacher_loss": 0.0007230375776998699
511
+ },
512
+ {
513
+ "epoch": 0.07352941176470588,
514
+ "grad_norm": 6.875,
515
+ "kd_loss": 0.12890625,
516
+ "learning_rate": 8.74090120554202e-06,
517
+ "loss": 0.2065,
518
+ "step": 255,
519
+ "student_loss": 0.0011023435508832335,
520
+ "teacher_loss": 0.0011509901378303766
521
+ },
522
+ {
523
+ "epoch": 0.07497116493656286,
524
+ "grad_norm": 4.9375,
525
+ "kd_loss": 0.1259765625,
526
+ "learning_rate": 8.686690567933803e-06,
527
+ "loss": 0.2005,
528
+ "step": 260,
529
+ "student_loss": 0.001012590597383678,
530
+ "teacher_loss": 0.035696372389793396
531
+ },
532
+ {
533
+ "epoch": 0.07641291810841984,
534
+ "grad_norm": 4.8125,
535
+ "kd_loss": 0.11669921875,
536
+ "learning_rate": 8.63151315961915e-06,
537
+ "loss": 0.1962,
538
+ "step": 265,
539
+ "student_loss": 0.001487129949964583,
540
+ "teacher_loss": 0.0009277480421587825
541
+ },
542
+ {
543
+ "epoch": 0.07785467128027682,
544
+ "grad_norm": 4.84375,
545
+ "kd_loss": 0.1298828125,
546
+ "learning_rate": 8.575383449917103e-06,
547
+ "loss": 0.196,
548
+ "step": 270,
549
+ "student_loss": 0.016224455088377,
550
+ "teacher_loss": 0.0017208521021530032
551
+ },
552
+ {
553
+ "epoch": 0.07929642445213379,
554
+ "grad_norm": 6.5625,
555
+ "kd_loss": 0.12451171875,
556
+ "learning_rate": 8.518316157871232e-06,
557
+ "loss": 0.2031,
558
+ "step": 275,
559
+ "student_loss": 0.0010892748832702637,
560
+ "teacher_loss": 0.035929761826992035
561
+ },
562
+ {
563
+ "epoch": 0.08073817762399077,
564
+ "grad_norm": 4.90625,
565
+ "kd_loss": 0.1279296875,
566
+ "learning_rate": 8.460326248389825e-06,
567
+ "loss": 0.217,
568
+ "step": 280,
569
+ "student_loss": 0.0011528143659234047,
570
+ "teacher_loss": 0.0003817000542767346
571
+ },
572
+ {
573
+ "epoch": 0.08217993079584775,
574
+ "grad_norm": 6.625,
575
+ "kd_loss": 0.11572265625,
576
+ "learning_rate": 8.401428928321607e-06,
577
+ "loss": 0.206,
578
+ "step": 285,
579
+ "student_loss": 0.0025788608472794294,
580
+ "teacher_loss": 0.001419232808984816
581
+ },
582
+ {
583
+ "epoch": 0.08362168396770472,
584
+ "grad_norm": 4.59375,
585
+ "kd_loss": 0.13671875,
586
+ "learning_rate": 8.341639642468002e-06,
587
+ "loss": 0.2497,
588
+ "step": 290,
589
+ "student_loss": 0.012355645187199116,
590
+ "teacher_loss": 0.004034335725009441
591
+ },
592
+ {
593
+ "epoch": 0.0850634371395617,
594
+ "grad_norm": 6.875,
595
+ "kd_loss": 0.1259765625,
596
+ "learning_rate": 8.280974069532999e-06,
597
+ "loss": 0.1999,
598
+ "step": 295,
599
+ "student_loss": 0.0029606728348881006,
600
+ "teacher_loss": 0.0018796900985762477
601
+ },
602
+ {
603
+ "epoch": 0.08650519031141868,
604
+ "grad_norm": 7.40625,
605
+ "kd_loss": 0.12451171875,
606
+ "learning_rate": 8.219448118011687e-06,
607
+ "loss": 0.1898,
608
+ "step": 300,
609
+ "student_loss": 0.05260760709643364,
610
+ "teacher_loss": 0.0028262475971132517
611
+ },
612
+ {
613
+ "epoch": 0.08794694348327567,
614
+ "grad_norm": 5.75,
615
+ "kd_loss": 0.11474609375,
616
+ "learning_rate": 8.157077922018537e-06,
617
+ "loss": 0.1993,
618
+ "step": 305,
619
+ "student_loss": 0.07766762375831604,
620
+ "teacher_loss": 0.0020029095467180014
621
+ },
622
+ {
623
+ "epoch": 0.08938869665513265,
624
+ "grad_norm": 3.796875,
625
+ "kd_loss": 0.10595703125,
626
+ "learning_rate": 8.093879837056486e-06,
627
+ "loss": 0.1971,
628
+ "step": 310,
629
+ "student_loss": 0.0014958757674321532,
630
+ "teacher_loss": 0.0014671633252874017
631
+ },
632
+ {
633
+ "epoch": 0.09083044982698962,
634
+ "grad_norm": 7.15625,
635
+ "kd_loss": 0.1103515625,
636
+ "learning_rate": 8.029870435728018e-06,
637
+ "loss": 0.204,
638
+ "step": 315,
639
+ "student_loss": 0.2129756659269333,
640
+ "teacher_loss": 0.0017302327323704958
641
+ },
642
+ {
643
+ "epoch": 0.0922722029988466,
644
+ "grad_norm": 5.0,
645
+ "kd_loss": 0.10400390625,
646
+ "learning_rate": 7.965066503389264e-06,
647
+ "loss": 0.2075,
648
+ "step": 320,
649
+ "student_loss": 0.0016070405254140496,
650
+ "teacher_loss": 0.0009597347816452384
651
+ },
652
+ {
653
+ "epoch": 0.09371395617070358,
654
+ "grad_norm": 6.59375,
655
+ "kd_loss": 0.171875,
656
+ "learning_rate": 7.89948503374835e-06,
657
+ "loss": 0.1917,
658
+ "step": 325,
659
+ "student_loss": 0.003808736801147461,
660
+ "teacher_loss": 0.01614256761968136
661
+ },
662
+ {
663
+ "epoch": 0.09515570934256055,
664
+ "grad_norm": 6.03125,
665
+ "kd_loss": 0.115234375,
666
+ "learning_rate": 7.833143224409076e-06,
667
+ "loss": 0.2017,
668
+ "step": 330,
669
+ "student_loss": 0.021043118089437485,
670
+ "teacher_loss": 0.0004570994933601469
671
+ },
672
+ {
673
+ "epoch": 0.09659746251441753,
674
+ "grad_norm": 4.21875,
675
+ "kd_loss": 0.12109375,
676
+ "learning_rate": 7.766058472361154e-06,
677
+ "loss": 0.176,
678
+ "step": 335,
679
+ "student_loss": 0.0018800155958160758,
680
+ "teacher_loss": 0.0022506280802190304
681
+ },
682
+ {
683
+ "epoch": 0.09803921568627451,
684
+ "grad_norm": 5.03125,
685
+ "kd_loss": 0.1083984375,
686
+ "learning_rate": 7.698248369418146e-06,
687
+ "loss": 0.1834,
688
+ "step": 340,
689
+ "student_loss": 0.0805514007806778,
690
+ "teacher_loss": 0.006284959614276886
691
+ },
692
+ {
693
+ "epoch": 0.09948096885813149,
694
+ "grad_norm": 5.6875,
695
+ "kd_loss": 0.10888671875,
696
+ "learning_rate": 7.629730697604314e-06,
697
+ "loss": 0.2074,
698
+ "step": 345,
699
+ "student_loss": 0.13202711939811707,
700
+ "teacher_loss": 0.0005228903028182685
701
+ },
702
+ {
703
+ "epoch": 0.10092272202998846,
704
+ "grad_norm": 5.25,
705
+ "kd_loss": 0.12890625,
706
+ "learning_rate": 7.560523424491595e-06,
707
+ "loss": 0.1831,
708
+ "step": 350,
709
+ "student_loss": 0.081477090716362,
710
+ "teacher_loss": 0.0003300213429611176
711
+ },
712
+ {
713
+ "epoch": 0.10236447520184544,
714
+ "grad_norm": 3.796875,
715
+ "kd_loss": 0.10693359375,
716
+ "learning_rate": 7.490644698487909e-06,
717
+ "loss": 0.1853,
718
+ "step": 355,
719
+ "student_loss": 0.002148553030565381,
720
+ "teacher_loss": 0.0014131986536085606
721
+ },
722
+ {
723
+ "epoch": 0.10380622837370242,
724
+ "grad_norm": 5.65625,
725
+ "kd_loss": 0.111328125,
726
+ "learning_rate": 7.420112844078066e-06,
727
+ "loss": 0.1865,
728
+ "step": 360,
729
+ "student_loss": 0.18958300352096558,
730
+ "teacher_loss": 0.001118413987569511
731
+ },
732
+ {
733
+ "epoch": 0.1052479815455594,
734
+ "grad_norm": 7.625,
735
+ "kd_loss": 0.1123046875,
736
+ "learning_rate": 7.348946357018479e-06,
737
+ "loss": 0.1824,
738
+ "step": 365,
739
+ "student_loss": 0.0013260500272735953,
740
+ "teacher_loss": 0.029322339221835136
741
+ },
742
+ {
743
+ "epoch": 0.10668973471741638,
744
+ "grad_norm": 4.875,
745
+ "kd_loss": 0.1318359375,
746
+ "learning_rate": 7.277163899486975e-06,
747
+ "loss": 0.189,
748
+ "step": 370,
749
+ "student_loss": 0.16300825774669647,
750
+ "teacher_loss": 0.000458209979115054
751
+ },
752
+ {
753
+ "epoch": 0.10813148788927336,
754
+ "grad_norm": 5.875,
755
+ "kd_loss": 0.15625,
756
+ "learning_rate": 7.204784295188959e-06,
757
+ "loss": 0.1865,
758
+ "step": 375,
759
+ "student_loss": 0.3203120529651642,
760
+ "teacher_loss": 0.02350226417183876
761
+ },
762
+ {
763
+ "epoch": 0.10957324106113034,
764
+ "grad_norm": 4.34375,
765
+ "kd_loss": 0.10546875,
766
+ "learning_rate": 7.1318265244212305e-06,
767
+ "loss": 0.1864,
768
+ "step": 380,
769
+ "student_loss": 0.0027754041366279125,
770
+ "teacher_loss": 0.001105214236304164
771
+ },
772
+ {
773
+ "epoch": 0.11101499423298732,
774
+ "grad_norm": 4.8125,
775
+ "kd_loss": 0.1171875,
776
+ "learning_rate": 7.05830971909472e-06,
777
+ "loss": 0.1872,
778
+ "step": 385,
779
+ "student_loss": 0.001283544348552823,
780
+ "teacher_loss": 0.0009954526321962476
781
+ },
782
+ {
783
+ "epoch": 0.11245674740484429,
784
+ "grad_norm": 5.25,
785
+ "kd_loss": 0.125,
786
+ "learning_rate": 6.9842531577174865e-06,
787
+ "loss": 0.1764,
788
+ "step": 390,
789
+ "student_loss": 0.0008525378652848303,
790
+ "teacher_loss": 0.001041764859110117
791
+ },
792
+ {
793
+ "epoch": 0.11389850057670127,
794
+ "grad_norm": 6.59375,
795
+ "kd_loss": 0.1171875,
796
+ "learning_rate": 6.9096762603392595e-06,
797
+ "loss": 0.195,
798
+ "step": 395,
799
+ "student_loss": 0.0015218615299090743,
800
+ "teacher_loss": 0.0010145456762984395
801
+ },
802
+ {
803
+ "epoch": 0.11534025374855825,
804
+ "grad_norm": 4.6875,
805
+ "kd_loss": 0.12451171875,
806
+ "learning_rate": 6.834598583458862e-06,
807
+ "loss": 0.1822,
808
+ "step": 400,
809
+ "student_loss": 0.0013333633542060852,
810
+ "teacher_loss": 0.00030270780553109944
811
+ },
812
+ {
813
+ "epoch": 0.11678200692041522,
814
+ "grad_norm": 2.703125,
815
+ "kd_loss": 0.103515625,
816
+ "learning_rate": 6.7590398148958625e-06,
817
+ "loss": 0.196,
818
+ "step": 405,
819
+ "student_loss": 0.026790756732225418,
820
+ "teacher_loss": 0.0005501789273694158
821
+ },
822
+ {
823
+ "epoch": 0.1182237600922722,
824
+ "grad_norm": 8.125,
825
+ "kd_loss": 0.1787109375,
826
+ "learning_rate": 6.6830197686277945e-06,
827
+ "loss": 0.2152,
828
+ "step": 410,
829
+ "student_loss": 0.5714533925056458,
830
+ "teacher_loss": 0.008862318471074104
831
+ },
832
+ {
833
+ "epoch": 0.11966551326412918,
834
+ "grad_norm": 4.1875,
835
+ "kd_loss": 0.1279296875,
836
+ "learning_rate": 6.6065583795942625e-06,
837
+ "loss": 0.2006,
838
+ "step": 415,
839
+ "student_loss": 0.019465278834104538,
840
+ "teacher_loss": 0.0042519038543105125
841
+ },
842
+ {
843
+ "epoch": 0.12110726643598616,
844
+ "grad_norm": 5.21875,
845
+ "kd_loss": 0.1181640625,
846
+ "learning_rate": 6.52967569846937e-06,
847
+ "loss": 0.1764,
848
+ "step": 420,
849
+ "student_loss": 0.008550797589123249,
850
+ "teacher_loss": 0.023795029148459435
851
+ },
852
+ {
853
+ "epoch": 0.12254901960784313,
854
+ "grad_norm": 8.5625,
855
+ "kd_loss": 0.126953125,
856
+ "learning_rate": 6.452391886403767e-06,
857
+ "loss": 0.1854,
858
+ "step": 425,
859
+ "student_loss": 0.04949700087308884,
860
+ "teacher_loss": 0.004153509624302387
861
+ },
862
+ {
863
+ "epoch": 0.12399077277970011,
864
+ "grad_norm": 7.25,
865
+ "kd_loss": 0.18359375,
866
+ "learning_rate": 6.374727209737743e-06,
867
+ "loss": 0.2107,
868
+ "step": 430,
869
+ "student_loss": 0.002016805112361908,
870
+ "teacher_loss": 0.06358348578214645
871
+ },
872
+ {
873
+ "epoch": 0.1254325259515571,
874
+ "grad_norm": 6.21875,
875
+ "kd_loss": 0.11083984375,
876
+ "learning_rate": 6.296702034686726e-06,
877
+ "loss": 0.1934,
878
+ "step": 435,
879
+ "student_loss": 0.0017790297279134393,
880
+ "teacher_loss": 0.0017155191162601113
881
+ },
882
+ {
883
+ "epoch": 0.12687427912341406,
884
+ "grad_norm": 4.125,
885
+ "kd_loss": 0.18359375,
886
+ "learning_rate": 6.218336822000598e-06,
887
+ "loss": 0.2068,
888
+ "step": 440,
889
+ "student_loss": 0.565355658531189,
890
+ "teacher_loss": 0.008367877453565598
891
+ },
892
+ {
893
+ "epoch": 0.12831603229527105,
894
+ "grad_norm": 5.09375,
895
+ "kd_loss": 0.1357421875,
896
+ "learning_rate": 6.139652121598219e-06,
897
+ "loss": 0.2072,
898
+ "step": 445,
899
+ "student_loss": 0.00040582873043604195,
900
+ "teacher_loss": 0.017554111778736115
901
+ },
902
+ {
903
+ "epoch": 0.12975778546712802,
904
+ "grad_norm": 4.03125,
905
+ "kd_loss": 0.1396484375,
906
+ "learning_rate": 6.060668567178561e-06,
907
+ "loss": 0.194,
908
+ "step": 450,
909
+ "student_loss": 0.0020873546600341797,
910
+ "teacher_loss": 0.0007538718055002391
911
+ },
912
+ {
913
+ "epoch": 0.131199538638985,
914
+ "grad_norm": 4.09375,
915
+ "kd_loss": 0.10595703125,
916
+ "learning_rate": 5.981406870809889e-06,
917
+ "loss": 0.1896,
918
+ "step": 455,
919
+ "student_loss": 0.010908172465860844,
920
+ "teacher_loss": 0.0012792785419151187
921
+ },
922
+ {
923
+ "epoch": 0.13264129181084197,
924
+ "grad_norm": 6.6875,
925
+ "kd_loss": 0.12255859375,
926
+ "learning_rate": 5.9018878174983674e-06,
927
+ "loss": 0.1893,
928
+ "step": 460,
929
+ "student_loss": 0.01578596420586109,
930
+ "teacher_loss": 0.0009114979766309261
931
+ },
932
+ {
933
+ "epoch": 0.13408304498269896,
934
+ "grad_norm": 4.4375,
935
+ "kd_loss": 0.14453125,
936
+ "learning_rate": 5.822132259737565e-06,
937
+ "loss": 0.2189,
938
+ "step": 465,
939
+ "student_loss": 0.0021727595012634993,
940
+ "teacher_loss": 0.0004909814451821148
941
+ },
942
+ {
943
+ "epoch": 0.13552479815455595,
944
+ "grad_norm": 5.09375,
945
+ "kd_loss": 0.1005859375,
946
+ "learning_rate": 5.742161112040237e-06,
947
+ "loss": 0.2169,
948
+ "step": 470,
949
+ "student_loss": 0.0009243786334991455,
950
+ "teacher_loss": 0.000744891760405153
951
+ },
952
+ {
953
+ "epoch": 0.13696655132641292,
954
+ "grad_norm": 4.9375,
955
+ "kd_loss": 0.1396484375,
956
+ "learning_rate": 5.661995345453867e-06,
957
+ "loss": 0.1752,
958
+ "step": 475,
959
+ "student_loss": 0.003345559583976865,
960
+ "teacher_loss": 0.0005117281689308584
961
+ },
962
+ {
963
+ "epoch": 0.1384083044982699,
964
+ "grad_norm": 4.46875,
965
+ "kd_loss": 0.1376953125,
966
+ "learning_rate": 5.581655982061367e-06,
967
+ "loss": 0.211,
968
+ "step": 480,
969
+ "student_loss": 0.0018078088760375977,
970
+ "teacher_loss": 0.029633358120918274
971
+ },
972
+ {
973
+ "epoch": 0.13985005767012687,
974
+ "grad_norm": 6.59375,
975
+ "kd_loss": 0.11767578125,
976
+ "learning_rate": 5.501164089468406e-06,
977
+ "loss": 0.1795,
978
+ "step": 485,
979
+ "student_loss": 0.3106631636619568,
980
+ "teacher_loss": 0.0037072307895869017
981
+ },
982
+ {
983
+ "epoch": 0.14129181084198386,
984
+ "grad_norm": 6.0,
985
+ "kd_loss": 0.1328125,
986
+ "learning_rate": 5.4205407752787884e-06,
987
+ "loss": 0.1896,
988
+ "step": 490,
989
+ "student_loss": 0.0010533903259783983,
990
+ "teacher_loss": 0.0012765543069690466
991
+ },
992
+ {
993
+ "epoch": 0.14273356401384082,
994
+ "grad_norm": 5.25,
995
+ "kd_loss": 0.1376953125,
996
+ "learning_rate": 5.339807181559359e-06,
997
+ "loss": 0.194,
998
+ "step": 495,
999
+ "student_loss": 0.08354002982378006,
1000
+ "teacher_loss": 0.0006759578245691955
1001
+ },
1002
+ {
1003
+ "epoch": 0.14417531718569782,
1004
+ "grad_norm": 6.125,
1005
+ "kd_loss": 0.12060546875,
1006
+ "learning_rate": 5.258984479295853e-06,
1007
+ "loss": 0.1865,
1008
+ "step": 500,
1009
+ "student_loss": 0.003352736122906208,
1010
+ "teacher_loss": 0.0017620434518903494
1011
+ },
1012
+ {
1013
+ "epoch": 0.14561707035755478,
1014
+ "grad_norm": 5.71875,
1015
+ "kd_loss": 0.11572265625,
1016
+ "learning_rate": 5.1780938628411795e-06,
1017
+ "loss": 0.2201,
1018
+ "step": 505,
1019
+ "student_loss": 0.002489902079105377,
1020
+ "teacher_loss": 0.0007855825824663043
1021
+ },
1022
+ {
1023
+ "epoch": 0.14705882352941177,
1024
+ "grad_norm": 5.59375,
1025
+ "kd_loss": 0.10498046875,
1026
+ "learning_rate": 5.097156544357567e-06,
1027
+ "loss": 0.2023,
1028
+ "step": 510,
1029
+ "student_loss": 0.0014551215572282672,
1030
+ "teacher_loss": 0.0008335533202625811
1031
+ },
1032
+ {
1033
+ "epoch": 0.14850057670126873,
1034
+ "grad_norm": 4.75,
1035
+ "kd_loss": 0.12353515625,
1036
+ "learning_rate": 5.016193748254045e-06,
1037
+ "loss": 0.1779,
1038
+ "step": 515,
1039
+ "student_loss": 0.01823529414832592,
1040
+ "teacher_loss": 0.0005908762104809284
1041
+ },
1042
+ {
1043
+ "epoch": 0.14994232987312572,
1044
+ "grad_norm": 3.875,
1045
+ "kd_loss": 0.1181640625,
1046
+ "learning_rate": 4.935226705620699e-06,
1047
+ "loss": 0.1875,
1048
+ "step": 520,
1049
+ "student_loss": 0.48260822892189026,
1050
+ "teacher_loss": 0.011817601509392262
1051
+ },
1052
+ {
1053
+ "epoch": 0.1513840830449827,
1054
+ "grad_norm": 3.828125,
1055
+ "kd_loss": 0.1142578125,
1056
+ "learning_rate": 4.8542766486612035e-06,
1057
+ "loss": 0.179,
1058
+ "step": 525,
1059
+ "student_loss": 0.0011587169719859958,
1060
+ "teacher_loss": 0.0004872040299233049
1061
+ },
1062
+ {
1063
+ "epoch": 0.15282583621683968,
1064
+ "grad_norm": 5.78125,
1065
+ "kd_loss": 0.11767578125,
1066
+ "learning_rate": 4.773364805125025e-06,
1067
+ "loss": 0.1752,
1068
+ "step": 530,
1069
+ "student_loss": 0.0030523419845849276,
1070
+ "teacher_loss": 0.0013172540348023176
1071
+ },
1072
+ {
1073
+ "epoch": 0.15426758938869667,
1074
+ "grad_norm": 3.1875,
1075
+ "kd_loss": 0.10302734375,
1076
+ "learning_rate": 4.6925123927408265e-06,
1077
+ "loss": 0.1654,
1078
+ "step": 535,
1079
+ "student_loss": 0.0017982972785830498,
1080
+ "teacher_loss": 0.00047424182412214577
1081
+ },
1082
+ {
1083
+ "epoch": 0.15570934256055363,
1084
+ "grad_norm": 5.59375,
1085
+ "kd_loss": 0.11962890625,
1086
+ "learning_rate": 4.611740613652485e-06,
1087
+ "loss": 0.1655,
1088
+ "step": 540,
1089
+ "student_loss": 0.013529052957892418,
1090
+ "teacher_loss": 0.0009085286292247474
1091
+ },
1092
+ {
1093
+ "epoch": 0.15715109573241062,
1094
+ "grad_norm": 9.0,
1095
+ "kd_loss": 0.115234375,
1096
+ "learning_rate": 4.531070648859186e-06,
1097
+ "loss": 0.1973,
1098
+ "step": 545,
1099
+ "student_loss": 0.004623454995453358,
1100
+ "teacher_loss": 0.007325619924813509
1101
+ },
1102
+ {
1103
+ "epoch": 0.15859284890426759,
1104
+ "grad_norm": 5.8125,
1105
+ "kd_loss": 0.126953125,
1106
+ "learning_rate": 4.450523652661086e-06,
1107
+ "loss": 0.1622,
1108
+ "step": 550,
1109
+ "student_loss": 0.0009506479254923761,
1110
+ "teacher_loss": 0.008248833939433098
1111
+ },
1112
+ {
1113
+ "epoch": 0.16003460207612458,
1114
+ "grad_norm": 4.34375,
1115
+ "kd_loss": 0.11376953125,
1116
+ "learning_rate": 4.370120747111956e-06,
1117
+ "loss": 0.1848,
1118
+ "step": 555,
1119
+ "student_loss": 0.005332108587026596,
1120
+ "teacher_loss": 0.0016086830291897058
1121
+ },
1122
+ {
1123
+ "epoch": 0.16147635524798154,
1124
+ "grad_norm": 10.375,
1125
+ "kd_loss": 0.1201171875,
1126
+ "learning_rate": 4.289883016480291e-06,
1127
+ "loss": 0.2032,
1128
+ "step": 560,
1129
+ "student_loss": 0.12518270313739777,
1130
+ "teacher_loss": 0.0005838426877744496
1131
+ },
1132
+ {
1133
+ "epoch": 0.16291810841983853,
1134
+ "grad_norm": 6.0625,
1135
+ "kd_loss": 0.12353515625,
1136
+ "learning_rate": 4.209831501720328e-06,
1137
+ "loss": 0.1825,
1138
+ "step": 565,
1139
+ "student_loss": 0.029691526666283607,
1140
+ "teacher_loss": 0.021172240376472473
1141
+ },
1142
+ {
1143
+ "epoch": 0.1643598615916955,
1144
+ "grad_norm": 8.9375,
1145
+ "kd_loss": 0.1708984375,
1146
+ "learning_rate": 4.129987194954421e-06,
1147
+ "loss": 0.189,
1148
+ "step": 570,
1149
+ "student_loss": 0.13193272054195404,
1150
+ "teacher_loss": 0.00722926901653409
1151
+ },
1152
+ {
1153
+ "epoch": 0.16580161476355249,
1154
+ "grad_norm": 7.84375,
1155
+ "kd_loss": 0.10302734375,
1156
+ "learning_rate": 4.050371033968216e-06,
1157
+ "loss": 0.1851,
1158
+ "step": 575,
1159
+ "student_loss": 0.0010820090537890792,
1160
+ "teacher_loss": 0.0006335995858535171
1161
+ },
1162
+ {
1163
+ "epoch": 0.16724336793540945,
1164
+ "grad_norm": 4.96875,
1165
+ "kd_loss": 0.330078125,
1166
+ "learning_rate": 3.9710038967200825e-06,
1167
+ "loss": 0.1666,
1168
+ "step": 580,
1169
+ "student_loss": 0.0031647607684135437,
1170
+ "teacher_loss": 0.0028759294655174017
1171
+ },
1172
+ {
1173
+ "epoch": 0.16868512110726644,
1174
+ "grad_norm": 3.1875,
1175
+ "kd_loss": 0.1142578125,
1176
+ "learning_rate": 3.89190659586623e-06,
1177
+ "loss": 0.1868,
1178
+ "step": 585,
1179
+ "student_loss": 0.040488943457603455,
1180
+ "teacher_loss": 0.0005194384139031172
1181
+ },
1182
+ {
1183
+ "epoch": 0.1701268742791234,
1184
+ "grad_norm": 7.5625,
1185
+ "kd_loss": 0.11181640625,
1186
+ "learning_rate": 3.8130998733029517e-06,
1187
+ "loss": 0.1949,
1188
+ "step": 590,
1189
+ "student_loss": 0.00226792530156672,
1190
+ "teacher_loss": 0.0028023580089211464
1191
+ },
1192
+ {
1193
+ "epoch": 0.1715686274509804,
1194
+ "grad_norm": 4.0625,
1195
+ "kd_loss": 0.11181640625,
1196
+ "learning_rate": 3.734604394727419e-06,
1197
+ "loss": 0.2049,
1198
+ "step": 595,
1199
+ "student_loss": 0.0009982635965570807,
1200
+ "teacher_loss": 0.001097838394343853
1201
+ },
1202
+ {
1203
+ "epoch": 0.17301038062283736,
1204
+ "grad_norm": 6.5625,
1205
+ "kd_loss": 0.12353515625,
1206
+ "learning_rate": 3.656440744218464e-06,
1207
+ "loss": 0.1982,
1208
+ "step": 600,
1209
+ "student_loss": 0.34091848134994507,
1210
+ "teacher_loss": 0.009622580371797085
1211
+ },
1212
+ {
1213
+ "epoch": 0.17445213379469435,
1214
+ "grad_norm": 6.625,
1215
+ "kd_loss": 0.1181640625,
1216
+ "learning_rate": 3.578629418838757e-06,
1217
+ "loss": 0.1972,
1218
+ "step": 605,
1219
+ "student_loss": 0.2827480435371399,
1220
+ "teacher_loss": 0.039488162845373154
1221
+ },
1222
+ {
1223
+ "epoch": 0.17589388696655134,
1224
+ "grad_norm": 4.96875,
1225
+ "kd_loss": 0.1201171875,
1226
+ "learning_rate": 3.5011908232598124e-06,
1227
+ "loss": 0.1603,
1228
+ "step": 610,
1229
+ "student_loss": 0.106364406645298,
1230
+ "teacher_loss": 0.0008834014879539609
1231
+ },
1232
+ {
1233
+ "epoch": 0.1773356401384083,
1234
+ "grad_norm": 4.75,
1235
+ "kd_loss": 0.1240234375,
1236
+ "learning_rate": 3.4241452644112085e-06,
1237
+ "loss": 0.1596,
1238
+ "step": 615,
1239
+ "student_loss": 0.0008959124679677188,
1240
+ "teacher_loss": 0.0007150223245844245
1241
+ },
1242
+ {
1243
+ "epoch": 0.1787773933102653,
1244
+ "grad_norm": 3.34375,
1245
+ "kd_loss": 0.1533203125,
1246
+ "learning_rate": 3.3475129461554567e-06,
1247
+ "loss": 0.1941,
1248
+ "step": 620,
1249
+ "student_loss": 0.00717555359005928,
1250
+ "teacher_loss": 0.008744844235479832
1251
+ },
1252
+ {
1253
+ "epoch": 0.18021914648212226,
1254
+ "grad_norm": 4.46875,
1255
+ "kd_loss": 0.109375,
1256
+ "learning_rate": 3.271313963989886e-06,
1257
+ "loss": 0.1711,
1258
+ "step": 625,
1259
+ "student_loss": 0.007324306294322014,
1260
+ "teacher_loss": 0.005477549973875284
1261
+ },
1262
+ {
1263
+ "epoch": 0.18166089965397925,
1264
+ "grad_norm": 4.875,
1265
+ "kd_loss": 0.11962890625,
1266
+ "learning_rate": 3.195568299776945e-06,
1267
+ "loss": 0.1813,
1268
+ "step": 630,
1269
+ "student_loss": 0.1220104992389679,
1270
+ "teacher_loss": 0.005055113695561886
1271
+ },
1272
+ {
1273
+ "epoch": 0.1831026528258362,
1274
+ "grad_norm": 4.1875,
1275
+ "kd_loss": 0.1005859375,
1276
+ "learning_rate": 3.1202958165043053e-06,
1277
+ "loss": 0.2012,
1278
+ "step": 635,
1279
+ "student_loss": 0.0011498430976644158,
1280
+ "teacher_loss": 0.0006288467557169497
1281
+ },
1282
+ {
1283
+ "epoch": 0.1845444059976932,
1284
+ "grad_norm": 4.84375,
1285
+ "kd_loss": 0.11767578125,
1286
+ "learning_rate": 3.045516253076137e-06,
1287
+ "loss": 0.1779,
1288
+ "step": 640,
1289
+ "student_loss": 0.0011653146939352155,
1290
+ "teacher_loss": 0.0009915747214108706
1291
+ },
1292
+ {
1293
+ "epoch": 0.18598615916955016,
1294
+ "grad_norm": 9.125,
1295
+ "kd_loss": 0.119140625,
1296
+ "learning_rate": 2.9712492191369245e-06,
1297
+ "loss": 0.1795,
1298
+ "step": 645,
1299
+ "student_loss": 0.004314988851547241,
1300
+ "teacher_loss": 0.0008632438839413226
1301
+ },
1302
+ {
1303
+ "epoch": 0.18742791234140715,
1304
+ "grad_norm": 5.71875,
1305
+ "kd_loss": 0.138671875,
1306
+ "learning_rate": 2.8975141899291777e-06,
1307
+ "loss": 0.1767,
1308
+ "step": 650,
1309
+ "student_loss": 0.004328088369220495,
1310
+ "teacher_loss": 0.0019480936462059617
1311
+ },
1312
+ {
1313
+ "epoch": 0.18886966551326412,
1314
+ "grad_norm": 5.3125,
1315
+ "kd_loss": 0.1064453125,
1316
+ "learning_rate": 2.8243305011863843e-06,
1317
+ "loss": 0.1858,
1318
+ "step": 655,
1319
+ "student_loss": 0.07007281482219696,
1320
+ "teacher_loss": 0.002063432242721319
1321
+ },
1322
+ {
1323
+ "epoch": 0.1903114186851211,
1324
+ "grad_norm": 5.34375,
1325
+ "kd_loss": 0.12060546875,
1326
+ "learning_rate": 2.751717344062552e-06,
1327
+ "loss": 0.1979,
1328
+ "step": 660,
1329
+ "student_loss": 0.0020055994391441345,
1330
+ "teacher_loss": 0.0012256632326170802
1331
+ },
1332
+ {
1333
+ "epoch": 0.19175317185697807,
1334
+ "grad_norm": 4.96875,
1335
+ "kd_loss": 0.1259765625,
1336
+ "learning_rate": 2.6796937600996587e-06,
1337
+ "loss": 0.1824,
1338
+ "step": 665,
1339
+ "student_loss": 0.0013414303539320827,
1340
+ "teacher_loss": 0.0005829873844049871
1341
+ },
1342
+ {
1343
+ "epoch": 0.19319492502883506,
1344
+ "grad_norm": 6.21875,
1345
+ "kd_loss": 0.11181640625,
1346
+ "learning_rate": 2.6082786362343377e-06,
1347
+ "loss": 0.2091,
1348
+ "step": 670,
1349
+ "student_loss": 0.01750928722321987,
1350
+ "teacher_loss": 0.01849350705742836
1351
+ },
1352
+ {
1353
+ "epoch": 0.19463667820069205,
1354
+ "grad_norm": 5.9375,
1355
+ "kd_loss": 0.1083984375,
1356
+ "learning_rate": 2.5374906998451094e-06,
1357
+ "loss": 0.1855,
1358
+ "step": 675,
1359
+ "student_loss": 0.0015071257948875427,
1360
+ "teacher_loss": 0.0012101252796128392
1361
+ },
1362
+ {
1363
+ "epoch": 0.19607843137254902,
1364
+ "grad_norm": 3.984375,
1365
+ "kd_loss": 0.14453125,
1366
+ "learning_rate": 2.467348513841447e-06,
1367
+ "loss": 0.1808,
1368
+ "step": 680,
1369
+ "student_loss": 0.09338736534118652,
1370
+ "teacher_loss": 0.038049884140491486
1371
+ },
1372
+ {
1373
+ "epoch": 0.197520184544406,
1374
+ "grad_norm": 3.75,
1375
+ "kd_loss": 0.177734375,
1376
+ "learning_rate": 2.3978704717959777e-06,
1377
+ "loss": 0.1863,
1378
+ "step": 685,
1379
+ "student_loss": 0.0010167881846427917,
1380
+ "teacher_loss": 0.03185700252652168
1381
+ },
1382
+ {
1383
+ "epoch": 0.19896193771626297,
1384
+ "grad_norm": 3.984375,
1385
+ "kd_loss": 0.11572265625,
1386
+ "learning_rate": 2.329074793121085e-06,
1387
+ "loss": 0.1721,
1388
+ "step": 690,
1389
+ "student_loss": 0.017529672011733055,
1390
+ "teacher_loss": 0.008856060914695263
1391
+ },
1392
+ {
1393
+ "epoch": 0.20040369088811996,
1394
+ "grad_norm": 6.65625,
1395
+ "kd_loss": 0.1171875,
1396
+ "learning_rate": 2.260979518291186e-06,
1397
+ "loss": 0.2122,
1398
+ "step": 695,
1399
+ "student_loss": 0.028023820370435715,
1400
+ "teacher_loss": 0.005092882085591555
1401
+ },
1402
+ {
1403
+ "epoch": 0.20184544405997693,
1404
+ "grad_norm": 4.84375,
1405
+ "kd_loss": 0.1015625,
1406
+ "learning_rate": 2.1936025041119268e-06,
1407
+ "loss": 0.1889,
1408
+ "step": 700,
1409
+ "student_loss": 0.004128373693674803,
1410
+ "teacher_loss": 0.0009403788135387003
1411
+ },
1412
+ {
1413
+ "epoch": 0.20328719723183392,
1414
+ "grad_norm": 5.03125,
1415
+ "kd_loss": 0.12451171875,
1416
+ "learning_rate": 2.1269614190375477e-06,
1417
+ "loss": 0.1781,
1418
+ "step": 705,
1419
+ "student_loss": 0.0009666193509474397,
1420
+ "teacher_loss": 0.0012199536431580782
1421
+ },
1422
+ {
1423
+ "epoch": 0.20472895040369088,
1424
+ "grad_norm": 5.46875,
1425
+ "kd_loss": 0.12890625,
1426
+ "learning_rate": 2.061073738537635e-06,
1427
+ "loss": 0.214,
1428
+ "step": 710,
1429
+ "student_loss": 0.20427227020263672,
1430
+ "teacher_loss": 0.003926330246031284
1431
+ },
1432
+ {
1433
+ "epoch": 0.20617070357554787,
1434
+ "grad_norm": 4.875,
1435
+ "kd_loss": 0.10693359375,
1436
+ "learning_rate": 1.9959567405144825e-06,
1437
+ "loss": 0.2167,
1438
+ "step": 715,
1439
+ "student_loss": 0.034982144832611084,
1440
+ "teacher_loss": 0.012018893845379353
1441
+ },
1442
+ {
1443
+ "epoch": 0.20761245674740483,
1444
+ "grad_norm": 4.90625,
1445
+ "kd_loss": 0.140625,
1446
+ "learning_rate": 1.931627500772263e-06,
1447
+ "loss": 0.1911,
1448
+ "step": 720,
1449
+ "student_loss": 0.0010867691598832607,
1450
+ "teacher_loss": 0.005945001263171434
1451
+ },
1452
+ {
1453
+ "epoch": 0.20905420991926182,
1454
+ "grad_norm": 3.375,
1455
+ "kd_loss": 0.1171875,
1456
+ "learning_rate": 1.8681028885391905e-06,
1457
+ "loss": 0.1776,
1458
+ "step": 725,
1459
+ "student_loss": 0.0011550028575584292,
1460
+ "teacher_loss": 0.0008522791904397309
1461
+ },
1462
+ {
1463
+ "epoch": 0.2104959630911188,
1464
+ "grad_norm": 4.625,
1465
+ "kd_loss": 0.111328125,
1466
+ "learning_rate": 1.8053995620438625e-06,
1467
+ "loss": 0.1726,
1468
+ "step": 730,
1469
+ "student_loss": 0.03152952715754509,
1470
+ "teacher_loss": 0.00232282024808228
1471
+ },
1472
+ {
1473
+ "epoch": 0.21193771626297578,
1474
+ "grad_norm": 4.8125,
1475
+ "kd_loss": 0.11328125,
1476
+ "learning_rate": 1.743533964146924e-06,
1477
+ "loss": 0.1841,
1478
+ "step": 735,
1479
+ "student_loss": 0.0014305273070931435,
1480
+ "teacher_loss": 0.0003280507226008922
1481
+ },
1482
+ {
1483
+ "epoch": 0.21337946943483277,
1484
+ "grad_norm": 6.9375,
1485
+ "kd_loss": 0.138671875,
1486
+ "learning_rate": 1.6825223180292138e-06,
1487
+ "loss": 0.1674,
1488
+ "step": 740,
1489
+ "student_loss": 0.00881188828498125,
1490
+ "teacher_loss": 0.00042216398287564516
1491
+ },
1492
+ {
1493
+ "epoch": 0.21482122260668973,
1494
+ "grad_norm": 6.71875,
1495
+ "kd_loss": 0.12158203125,
1496
+ "learning_rate": 1.6223806229375182e-06,
1497
+ "loss": 0.1744,
1498
+ "step": 745,
1499
+ "student_loss": 0.16076479852199554,
1500
+ "teacher_loss": 0.021096019074320793
1501
+ },
1502
+ {
1503
+ "epoch": 0.21626297577854672,
1504
+ "grad_norm": 6.28125,
1505
+ "kd_loss": 0.12890625,
1506
+ "learning_rate": 1.563124649989043e-06,
1507
+ "loss": 0.1968,
1508
+ "step": 750,
1509
+ "student_loss": 0.11668777465820312,
1510
+ "teacher_loss": 0.006358537822961807
1511
+ },
1512
+ {
1513
+ "epoch": 0.2177047289504037,
1514
+ "grad_norm": 5.4375,
1515
+ "kd_loss": 0.0966796875,
1516
+ "learning_rate": 1.5047699380357134e-06,
1517
+ "loss": 0.1956,
1518
+ "step": 755,
1519
+ "student_loss": 0.11215528845787048,
1520
+ "teacher_loss": 0.0056000882759690285
1521
+ },
1522
+ {
1523
+ "epoch": 0.21914648212226068,
1524
+ "grad_norm": 5.125,
1525
+ "kd_loss": 0.10107421875,
1526
+ "learning_rate": 1.4473317895893773e-06,
1527
+ "loss": 0.1792,
1528
+ "step": 760,
1529
+ "student_loss": 0.26354143023490906,
1530
+ "teacher_loss": 0.0006397212855517864
1531
+ },
1532
+ {
1533
+ "epoch": 0.22058823529411764,
1534
+ "grad_norm": 4.53125,
1535
+ "kd_loss": 0.12060546875,
1536
+ "learning_rate": 1.39082526680899e-06,
1537
+ "loss": 0.1971,
1538
+ "step": 765,
1539
+ "student_loss": 0.08930032700300217,
1540
+ "teacher_loss": 0.0006192077999003232
1541
+ },
1542
+ {
1543
+ "epoch": 0.22202998846597463,
1544
+ "grad_norm": 3.6875,
1545
+ "kd_loss": 0.1142578125,
1546
+ "learning_rate": 1.3352651875508204e-06,
1547
+ "loss": 0.1708,
1548
+ "step": 770,
1549
+ "student_loss": 0.004422426223754883,
1550
+ "teacher_loss": 0.0008745273225940764
1551
+ },
1552
+ {
1553
+ "epoch": 0.2234717416378316,
1554
+ "grad_norm": 5.4375,
1555
+ "kd_loss": 0.11328125,
1556
+ "learning_rate": 1.2806661214827286e-06,
1557
+ "loss": 0.1885,
1558
+ "step": 775,
1559
+ "student_loss": 0.0018669217824935913,
1560
+ "teacher_loss": 0.0006652303854934871
1561
+ },
1562
+ {
1563
+ "epoch": 0.22491349480968859,
1564
+ "grad_norm": 2.9375,
1565
+ "kd_loss": 0.1083984375,
1566
+ "learning_rate": 1.2270423862635188e-06,
1567
+ "loss": 0.1836,
1568
+ "step": 780,
1569
+ "student_loss": 0.009803751483559608,
1570
+ "teacher_loss": 0.0004910013522021472
1571
+ },
1572
+ {
1573
+ "epoch": 0.22635524798154555,
1574
+ "grad_norm": 5.6875,
1575
+ "kd_loss": 0.09912109375,
1576
+ "learning_rate": 1.1744080437883859e-06,
1577
+ "loss": 0.1669,
1578
+ "step": 785,
1579
+ "student_loss": 0.0010796686401590705,
1580
+ "teacher_loss": 0.001094150822609663
1581
+ },
1582
+ {
1583
+ "epoch": 0.22779700115340254,
1584
+ "grad_norm": 6.0625,
1585
+ "kd_loss": 0.1279296875,
1586
+ "learning_rate": 1.1227768965014246e-06,
1587
+ "loss": 0.2026,
1588
+ "step": 790,
1589
+ "student_loss": 0.07495569437742233,
1590
+ "teacher_loss": 0.0015408035833388567
1591
+ },
1592
+ {
1593
+ "epoch": 0.2292387543252595,
1594
+ "grad_norm": 3.390625,
1595
+ "kd_loss": 0.1083984375,
1596
+ "learning_rate": 1.0721624837761768e-06,
1597
+ "loss": 0.1999,
1598
+ "step": 795,
1599
+ "student_loss": 0.0036292201839387417,
1600
+ "teacher_loss": 0.0005642004543915391
1601
+ },
1602
+ {
1603
+ "epoch": 0.2306805074971165,
1604
+ "grad_norm": 6.5625,
1605
+ "kd_loss": 0.18359375,
1606
+ "learning_rate": 1.0225780783651689e-06,
1607
+ "loss": 0.2151,
1608
+ "step": 800,
1609
+ "student_loss": 0.062444765120744705,
1610
+ "teacher_loss": 0.04929126426577568
1611
+ },
1612
+ {
1613
+ "epoch": 0.23212226066897348,
1614
+ "grad_norm": 4.875,
1615
+ "kd_loss": 0.10546875,
1616
+ "learning_rate": 9.740366829193587e-07,
1617
+ "loss": 0.2096,
1618
+ "step": 805,
1619
+ "student_loss": 0.0012999593745917082,
1620
+ "teacher_loss": 0.001006675767712295
1621
+ },
1622
+ {
1623
+ "epoch": 0.23356401384083045,
1624
+ "grad_norm": 6.09375,
1625
+ "kd_loss": 0.109375,
1626
+ "learning_rate": 9.265510265784189e-07,
1627
+ "loss": 0.2063,
1628
+ "step": 810,
1629
+ "student_loss": 0.0013730658683925867,
1630
+ "teacher_loss": 0.00053932867012918
1631
+ },
1632
+ {
1633
+ "epoch": 0.23500576701268744,
1634
+ "grad_norm": 10.0,
1635
+ "kd_loss": 0.103515625,
1636
+ "learning_rate": 8.801335616327378e-07,
1637
+ "loss": 0.1942,
1638
+ "step": 815,
1639
+ "student_loss": 0.05159832164645195,
1640
+ "teacher_loss": 0.010285490192472935
1641
+ },
1642
+ {
1643
+ "epoch": 0.2364475201845444,
1644
+ "grad_norm": 4.9375,
1645
+ "kd_loss": 0.125,
1646
+ "learning_rate": 8.347964602580245e-07,
1647
+ "loss": 0.1808,
1648
+ "step": 820,
1649
+ "student_loss": 0.037393856793642044,
1650
+ "teacher_loss": 0.0004633679345715791
1651
+ },
1652
+ {
1653
+ "epoch": 0.2378892733564014,
1654
+ "grad_norm": 6.5625,
1655
+ "kd_loss": 0.142578125,
1656
+ "learning_rate": 7.905516113233652e-07,
1657
+ "loss": 0.1747,
1658
+ "step": 825,
1659
+ "student_loss": 0.0011921566911041737,
1660
+ "teacher_loss": 0.021351948380470276
1661
+ },
1662
+ {
1663
+ "epoch": 0.23933102652825836,
1664
+ "grad_norm": 3.484375,
1665
+ "kd_loss": 0.111328125,
1666
+ "learning_rate": 7.474106172735746e-07,
1667
+ "loss": 0.1797,
1668
+ "step": 830,
1669
+ "student_loss": 0.03779162839055061,
1670
+ "teacher_loss": 0.003403074573725462
1671
+ },
1672
+ {
1673
+ "epoch": 0.24077277970011535,
1674
+ "grad_norm": 6.03125,
1675
+ "kd_loss": 0.10009765625,
1676
+ "learning_rate": 7.053847910866513e-07,
1677
+ "loss": 0.1667,
1678
+ "step": 835,
1679
+ "student_loss": 0.11626744270324707,
1680
+ "teacher_loss": 0.00203131721355021
1681
+ },
1682
+ {
1683
+ "epoch": 0.2422145328719723,
1684
+ "grad_norm": 4.28125,
1685
+ "kd_loss": 0.1376953125,
1686
+ "learning_rate": 6.644851533071556e-07,
1687
+ "loss": 0.1761,
1688
+ "step": 840,
1689
+ "student_loss": 0.0023884603288024664,
1690
+ "teacher_loss": 0.0004405094077810645
1691
+ },
1692
+ {
1693
+ "epoch": 0.2436562860438293,
1694
+ "grad_norm": 8.6875,
1695
+ "kd_loss": 0.140625,
1696
+ "learning_rate": 6.24722429156251e-07,
1697
+ "loss": 0.2435,
1698
+ "step": 845,
1699
+ "student_loss": 0.14598870277404785,
1700
+ "teacher_loss": 0.0010793671244755387
1701
+ },
1702
+ {
1703
+ "epoch": 0.24509803921568626,
1704
+ "grad_norm": 4.6875,
1705
+ "kd_loss": 0.103515625,
1706
+ "learning_rate": 5.861070457192081e-07,
1707
+ "loss": 0.186,
1708
+ "step": 850,
1709
+ "student_loss": 0.06827586144208908,
1710
+ "teacher_loss": 0.00046239409130066633
1711
+ },
1712
+ {
1713
+ "epoch": 0.24653979238754326,
1714
+ "grad_norm": 3.203125,
1715
+ "kd_loss": 0.125,
1716
+ "learning_rate": 5.486491292110796e-07,
1717
+ "loss": 0.1726,
1718
+ "step": 855,
1719
+ "student_loss": 0.0007081849034875631,
1720
+ "teacher_loss": 0.0005193519755266607
1721
+ },
1722
+ {
1723
+ "epoch": 0.24798154555940022,
1724
+ "grad_norm": 4.40625,
1725
+ "kd_loss": 0.11474609375,
1726
+ "learning_rate": 5.123585023212785e-07,
1727
+ "loss": 0.2129,
1728
+ "step": 860,
1729
+ "student_loss": 0.002149001695215702,
1730
+ "teacher_loss": 0.0017558797262609005
1731
+ },
1732
+ {
1733
+ "epoch": 0.2494232987312572,
1734
+ "grad_norm": 3.921875,
1735
+ "kd_loss": 0.111328125,
1736
+ "learning_rate": 4.772446816377408e-07,
1737
+ "loss": 0.1792,
1738
+ "step": 865,
1739
+ "student_loss": 0.0010927807306870818,
1740
+ "teacher_loss": 0.0006015551625750959
1741
+ },
1742
+ {
1743
+ "epoch": 0.2508650519031142,
1744
+ "grad_norm": 4.0,
1745
+ "kd_loss": 0.12890625,
1746
+ "learning_rate": 4.4331687515137614e-07,
1747
+ "loss": 0.1958,
1748
+ "step": 870,
1749
+ "student_loss": 0.04162781313061714,
1750
+ "teacher_loss": 0.0015390625922009349
1751
+ },
1752
+ {
1753
+ "epoch": 0.25230680507497116,
1754
+ "grad_norm": 7.9375,
1755
+ "kd_loss": 0.10888671875,
1756
+ "learning_rate": 4.1058397984142405e-07,
1757
+ "loss": 0.1771,
1758
+ "step": 875,
1759
+ "student_loss": 0.0009604351944290102,
1760
+ "teacher_loss": 0.0005376276094466448
1761
+ },
1762
+ {
1763
+ "epoch": 0.2537485582468281,
1764
+ "grad_norm": 3.953125,
1765
+ "kd_loss": 0.1162109375,
1766
+ "learning_rate": 3.790545793423761e-07,
1767
+ "loss": 0.1917,
1768
+ "step": 880,
1769
+ "student_loss": 0.0019418075680732727,
1770
+ "teacher_loss": 0.0007360613089986145
1771
+ },
1772
+ {
1773
+ "epoch": 0.25519031141868515,
1774
+ "grad_norm": 3.25,
1775
+ "kd_loss": 0.1435546875,
1776
+ "learning_rate": 3.4873694169306915e-07,
1777
+ "loss": 0.1832,
1778
+ "step": 885,
1779
+ "student_loss": 0.017352323979139328,
1780
+ "teacher_loss": 0.05972852557897568
1781
+ },
1782
+ {
1783
+ "epoch": 0.2566320645905421,
1784
+ "grad_norm": 4.75,
1785
+ "kd_loss": 0.111328125,
1786
+ "learning_rate": 3.196390171685343e-07,
1787
+ "loss": 0.1981,
1788
+ "step": 890,
1789
+ "student_loss": 0.0014260741882026196,
1790
+ "teacher_loss": 0.0012508125510066748
1791
+ },
1792
+ {
1793
+ "epoch": 0.25807381776239907,
1794
+ "grad_norm": 4.25,
1795
+ "kd_loss": 0.1435546875,
1796
+ "learning_rate": 2.917684361951728e-07,
1797
+ "loss": 0.1799,
1798
+ "step": 895,
1799
+ "student_loss": 0.014447882771492004,
1800
+ "teacher_loss": 0.0008786572143435478
1801
+ },
1802
+ {
1803
+ "epoch": 0.25951557093425603,
1804
+ "grad_norm": 4.3125,
1805
+ "kd_loss": 0.1240234375,
1806
+ "learning_rate": 2.65132507349814e-07,
1807
+ "loss": 0.2243,
1808
+ "step": 900,
1809
+ "student_loss": 0.001752070034854114,
1810
+ "teacher_loss": 0.028893902897834778
1811
+ },
1812
+ {
1813
+ "epoch": 0.26095732410611305,
1814
+ "grad_norm": 3.890625,
1815
+ "kd_loss": 0.10888671875,
1816
+ "learning_rate": 2.397382154431621e-07,
1817
+ "loss": 0.1707,
1818
+ "step": 905,
1819
+ "student_loss": 0.07611552625894547,
1820
+ "teacher_loss": 0.0018923009047284722
1821
+ },
1822
+ {
1823
+ "epoch": 0.26239907727797,
1824
+ "grad_norm": 3.84375,
1825
+ "kd_loss": 0.130859375,
1826
+ "learning_rate": 2.1559221968815547e-07,
1827
+ "loss": 0.1867,
1828
+ "step": 910,
1829
+ "student_loss": 0.0012913525570183992,
1830
+ "teacher_loss": 0.0014879581285640597
1831
+ },
1832
+ {
1833
+ "epoch": 0.263840830449827,
1834
+ "grad_norm": 5.1875,
1835
+ "kd_loss": 0.11669921875,
1836
+ "learning_rate": 1.9270085195370048e-07,
1837
+ "loss": 0.1647,
1838
+ "step": 915,
1839
+ "student_loss": 0.03545321896672249,
1840
+ "teacher_loss": 0.0013070907443761826
1841
+ },
1842
+ {
1843
+ "epoch": 0.26528258362168394,
1844
+ "grad_norm": 3.390625,
1845
+ "kd_loss": 0.1630859375,
1846
+ "learning_rate": 1.7107011510424766e-07,
1847
+ "loss": 0.1914,
1848
+ "step": 920,
1849
+ "student_loss": 0.013124704360961914,
1850
+ "teacher_loss": 0.015305249951779842
1851
+ },
1852
+ {
1853
+ "epoch": 0.26672433679354096,
1854
+ "grad_norm": 3.84375,
1855
+ "kd_loss": 0.1064453125,
1856
+ "learning_rate": 1.5070568142564912e-07,
1857
+ "loss": 0.1662,
1858
+ "step": 925,
1859
+ "student_loss": 0.0009416788816452026,
1860
+ "teacher_loss": 0.0009266760898754001
1861
+ },
1862
+ {
1863
+ "epoch": 0.2681660899653979,
1864
+ "grad_norm": 5.90625,
1865
+ "kd_loss": 0.11376953125,
1866
+ "learning_rate": 1.3161289113769405e-07,
1867
+ "loss": 0.1771,
1868
+ "step": 930,
1869
+ "student_loss": 0.04463067650794983,
1870
+ "teacher_loss": 0.0009097782894968987
1871
+ },
1872
+ {
1873
+ "epoch": 0.2696078431372549,
1874
+ "grad_norm": 5.59375,
1875
+ "kd_loss": 0.126953125,
1876
+ "learning_rate": 1.1379675099373489e-07,
1877
+ "loss": 0.1749,
1878
+ "step": 935,
1879
+ "student_loss": 0.02378019131720066,
1880
+ "teacher_loss": 0.0022270630579441786
1881
+ },
1882
+ {
1883
+ "epoch": 0.2710495963091119,
1884
+ "grad_norm": 5.78125,
1885
+ "kd_loss": 0.2138671875,
1886
+ "learning_rate": 9.726193296774767e-08,
1887
+ "loss": 0.1876,
1888
+ "step": 940,
1889
+ "student_loss": 0.002607885980978608,
1890
+ "teacher_loss": 0.0031158654019236565
1891
+ },
1892
+ {
1893
+ "epoch": 0.27249134948096887,
1894
+ "grad_norm": 7.5625,
1895
+ "kd_loss": 0.10595703125,
1896
+ "learning_rate": 8.201277302919086e-08,
1897
+ "loss": 0.1904,
1898
+ "step": 945,
1899
+ "student_loss": 0.07421658933162689,
1900
+ "teacher_loss": 0.002074115676805377
1901
+ },
1902
+ {
1903
+ "epoch": 0.27393310265282583,
1904
+ "grad_norm": 3.203125,
1905
+ "kd_loss": 0.13671875,
1906
+ "learning_rate": 6.805327000596995e-08,
1907
+ "loss": 0.17,
1908
+ "step": 950,
1909
+ "student_loss": 0.005115017760545015,
1910
+ "teacher_loss": 0.0004907959373667836
1911
+ },
1912
+ {
1913
+ "epoch": 0.2753748558246828,
1914
+ "grad_norm": 4.5625,
1915
+ "kd_loss": 0.095703125,
1916
+ "learning_rate": 5.538708453581787e-08,
1917
+ "loss": 0.1903,
1918
+ "step": 955,
1919
+ "student_loss": 0.011558901518583298,
1920
+ "teacher_loss": 0.0004379775491543114
1921
+ },
1922
+ {
1923
+ "epoch": 0.2768166089965398,
1924
+ "grad_norm": 7.9375,
1925
+ "kd_loss": 0.12451171875,
1926
+ "learning_rate": 4.40175381063529e-08,
1927
+ "loss": 0.1861,
1928
+ "step": 960,
1929
+ "student_loss": 0.0017829686403274536,
1930
+ "teacher_loss": 0.0036916364915668964
1931
+ },
1932
+ {
1933
+ "epoch": 0.2782583621683968,
1934
+ "grad_norm": 4.96875,
1935
+ "kd_loss": 0.1728515625,
1936
+ "learning_rate": 3.394761218407705e-08,
1937
+ "loss": 0.2026,
1938
+ "step": 965,
1939
+ "student_loss": 0.2359560877084732,
1940
+ "teacher_loss": 0.008938993327319622
1941
+ },
1942
+ {
1943
+ "epoch": 0.27970011534025374,
1944
+ "grad_norm": 4.375,
1945
+ "kd_loss": 0.1123046875,
1946
+ "learning_rate": 2.5179947432540376e-08,
1947
+ "loss": 0.1889,
1948
+ "step": 970,
1949
+ "student_loss": 0.0006114219431765378,
1950
+ "teacher_loss": 0.00041617779061198235
1951
+ },
1952
+ {
1953
+ "epoch": 0.2811418685121107,
1954
+ "grad_norm": 4.5,
1955
+ "kd_loss": 0.10888671875,
1956
+ "learning_rate": 1.7716843019867646e-08,
1957
+ "loss": 0.1982,
1958
+ "step": 975,
1959
+ "student_loss": 0.07514014840126038,
1960
+ "teacher_loss": 0.001045848592184484
1961
+ },
1962
+ {
1963
+ "epoch": 0.2825836216839677,
1964
+ "grad_norm": 5.5,
1965
+ "kd_loss": 0.107421875,
1966
+ "learning_rate": 1.156025601584676e-08,
1967
+ "loss": 0.1779,
1968
+ "step": 980,
1969
+ "student_loss": 0.0017143742879852653,
1970
+ "teacher_loss": 0.0004985960549674928
1971
+ },
1972
+ {
1973
+ "epoch": 0.2840253748558247,
1974
+ "grad_norm": 6.0625,
1975
+ "kd_loss": 0.107421875,
1976
+ "learning_rate": 6.711800878718144e-09,
1977
+ "loss": 0.1914,
1978
+ "step": 985,
1979
+ "student_loss": 0.0008509993785992265,
1980
+ "teacher_loss": 0.0007709055789746344
1981
+ },
1982
+ {
1983
+ "epoch": 0.28546712802768165,
1984
+ "grad_norm": 11.8125,
1985
+ "kd_loss": 0.0986328125,
1986
+ "learning_rate": 3.1727490318111953e-09,
1987
+ "loss": 0.1871,
1988
+ "step": 990,
1989
+ "student_loss": 0.007576147560030222,
1990
+ "teacher_loss": 0.015172326937317848
1991
+ },
1992
+ {
1993
+ "epoch": 0.2869088811995386,
1994
+ "grad_norm": 4.625,
1995
+ "kd_loss": 0.1318359375,
1996
+ "learning_rate": 9.440285301370865e-10,
1997
+ "loss": 0.2025,
1998
+ "step": 995,
1999
+ "student_loss": 0.0030154017731547356,
2000
+ "teacher_loss": 0.0012625143863260746
2001
+ },
2002
+ {
2003
+ "epoch": 0.28835063437139563,
2004
+ "grad_norm": 5.40625,
2005
+ "kd_loss": 0.10546875,
2006
+ "learning_rate": 2.622381702066523e-11,
2007
+ "loss": 0.1678,
2008
+ "step": 1000,
2009
+ "student_loss": 0.08579359203577042,
2010
+ "teacher_loss": 0.0004960879450663924
2011
+ },
2012
+ {
2013
+ "epoch": 0.28835063437139563,
2014
+ "kd_loss": 0.10546875,
2015
+ "step": 1000,
2016
+ "student_loss": 0.08579359203577042,
2017
+ "teacher_loss": 0.0004960879450663924,
2018
+ "total_flos": 0.0,
2019
+ "train_loss": 0.2803848307132721,
2020
+ "train_runtime": 4898.5131,
2021
+ "train_samples_per_second": 3.266,
2022
+ "train_steps_per_second": 0.204
2023
+ },
2024
+ {
2025
+ "epoch": 0.2897923875432526,
2026
+ "grad_norm": 4.21875,
2027
+ "kd_loss": 0.107421875,
2028
+ "learning_rate": 7.75705864825114e-06,
2029
+ "loss": 0.1907,
2030
+ "step": 1005,
2031
+ "student_loss": 0.04834694042801857,
2032
+ "teacher_loss": 0.07516621053218842
2033
+ },
2034
+ {
2035
+ "epoch": 0.29123414071510956,
2036
+ "grad_norm": 10.375,
2037
+ "kd_loss": 0.11767578125,
2038
+ "learning_rate": 7.734502946076656e-06,
2039
+ "loss": 0.1897,
2040
+ "step": 1010,
2041
+ "student_loss": 0.001301786513067782,
2042
+ "teacher_loss": 0.0013126098783686757
2043
+ },
2044
+ {
2045
+ "epoch": 0.2926758938869666,
2046
+ "grad_norm": 4.3125,
2047
+ "kd_loss": 0.1748046875,
2048
+ "learning_rate": 7.711867567242769e-06,
2049
+ "loss": 0.1779,
2050
+ "step": 1015,
2051
+ "student_loss": 0.026407891884446144,
2052
+ "teacher_loss": 0.008747180923819542
2053
+ },
2054
+ {
2055
+ "epoch": 0.29411764705882354,
2056
+ "grad_norm": 5.5,
2057
+ "kd_loss": 0.2119140625,
2058
+ "learning_rate": 7.689153171288487e-06,
2059
+ "loss": 0.1807,
2060
+ "step": 1020,
2061
+ "student_loss": 0.029385080561041832,
2062
+ "teacher_loss": 0.03383686766028404
2063
+ },
2064
+ {
2065
+ "epoch": 0.2955594002306805,
2066
+ "grad_norm": 9.375,
2067
+ "kd_loss": 0.11474609375,
2068
+ "learning_rate": 7.666360420055188e-06,
2069
+ "loss": 0.1903,
2070
+ "step": 1025,
2071
+ "student_loss": 0.0012202368816360831,
2072
+ "teacher_loss": 0.0004073530144523829
2073
+ },
2074
+ {
2075
+ "epoch": 0.29700115340253747,
2076
+ "grad_norm": 5.28125,
2077
+ "kd_loss": 0.10400390625,
2078
+ "learning_rate": 7.643489977667327e-06,
2079
+ "loss": 0.1909,
2080
+ "step": 1030,
2081
+ "student_loss": 0.03574004024267197,
2082
+ "teacher_loss": 0.0005011596367694438
2083
+ },
2084
+ {
2085
+ "epoch": 0.2984429065743945,
2086
+ "grad_norm": 5.71875,
2087
+ "kd_loss": 0.11376953125,
2088
+ "learning_rate": 7.6205425105130855e-06,
2089
+ "loss": 0.1876,
2090
+ "step": 1035,
2091
+ "student_loss": 0.009626907296478748,
2092
+ "teacher_loss": 0.0027181527111679316
2093
+ },
2094
+ {
2095
+ "epoch": 0.29988465974625145,
2096
+ "grad_norm": 5.5,
2097
+ "kd_loss": 0.1044921875,
2098
+ "learning_rate": 7.597518687224959e-06,
2099
+ "loss": 0.2002,
2100
+ "step": 1040,
2101
+ "student_loss": 0.04977378249168396,
2102
+ "teacher_loss": 0.0008017396903596818
2103
+ },
2104
+ {
2105
+ "epoch": 0.3013264129181084,
2106
+ "grad_norm": 5.84375,
2107
+ "kd_loss": 0.0927734375,
2108
+ "learning_rate": 7.574419178660269e-06,
2109
+ "loss": 0.1864,
2110
+ "step": 1045,
2111
+ "student_loss": 0.00125790829770267,
2112
+ "teacher_loss": 0.0010273143416270614
2113
+ },
2114
+ {
2115
+ "epoch": 0.3027681660899654,
2116
+ "grad_norm": 7.5,
2117
+ "kd_loss": 0.10302734375,
2118
+ "learning_rate": 7.551244657881618e-06,
2119
+ "loss": 0.2226,
2120
+ "step": 1050,
2121
+ "student_loss": 0.396968275308609,
2122
+ "teacher_loss": 0.0008409225265495479
2123
+ },
2124
+ {
2125
+ "epoch": 0.3042099192618224,
2126
+ "grad_norm": 5.46875,
2127
+ "kd_loss": 0.11279296875,
2128
+ "learning_rate": 7.527995800137287e-06,
2129
+ "loss": 0.1736,
2130
+ "step": 1055,
2131
+ "student_loss": 0.001924316049553454,
2132
+ "teacher_loss": 0.0013026405358687043
2133
+ },
2134
+ {
2135
+ "epoch": 0.30565167243367936,
2136
+ "grad_norm": 7.28125,
2137
+ "kd_loss": 0.1083984375,
2138
+ "learning_rate": 7.504673282841544e-06,
2139
+ "loss": 0.179,
2140
+ "step": 1060,
2141
+ "student_loss": 0.07162957638502121,
2142
+ "teacher_loss": 0.0005437894142232835
2143
+ },
2144
+ {
2145
+ "epoch": 0.3070934256055363,
2146
+ "grad_norm": 5.5,
2147
+ "kd_loss": 0.1376953125,
2148
+ "learning_rate": 7.481277785554918e-06,
2149
+ "loss": 0.1812,
2150
+ "step": 1065,
2151
+ "student_loss": 0.051106277853250504,
2152
+ "teacher_loss": 0.0007818956510163844
2153
+ },
2154
+ {
2155
+ "epoch": 0.30853517877739334,
2156
+ "grad_norm": 7.65625,
2157
+ "kd_loss": 0.11865234375,
2158
+ "learning_rate": 7.457809989964393e-06,
2159
+ "loss": 0.2085,
2160
+ "step": 1070,
2161
+ "student_loss": 0.07073640078306198,
2162
+ "teacher_loss": 0.0005237783188931644
2163
+ },
2164
+ {
2165
+ "epoch": 0.3099769319492503,
2166
+ "grad_norm": 4.5625,
2167
+ "kd_loss": 0.10546875,
2168
+ "learning_rate": 7.434270579863549e-06,
2169
+ "loss": 0.1846,
2170
+ "step": 1075,
2171
+ "student_loss": 0.007236347068101168,
2172
+ "teacher_loss": 0.0034689689055085182
2173
+ },
2174
+ {
2175
+ "epoch": 0.31141868512110726,
2176
+ "grad_norm": 7.4375,
2177
+ "kd_loss": 0.11376953125,
2178
+ "learning_rate": 7.4106602411326345e-06,
2179
+ "loss": 0.1724,
2180
+ "step": 1080,
2181
+ "student_loss": 0.002105340361595154,
2182
+ "teacher_loss": 0.01056791003793478
2183
+ },
2184
+ {
2185
+ "epoch": 0.3128604382929642,
2186
+ "grad_norm": 4.46875,
2187
+ "kd_loss": 0.10302734375,
2188
+ "learning_rate": 7.386979661718585e-06,
2189
+ "loss": 0.1935,
2190
+ "step": 1085,
2191
+ "student_loss": 0.00163954496383667,
2192
+ "teacher_loss": 0.0008159097633324564
2193
+ },
2194
+ {
2195
+ "epoch": 0.31430219146482125,
2196
+ "grad_norm": 5.59375,
2197
+ "kd_loss": 0.1044921875,
2198
+ "learning_rate": 7.363229531614973e-06,
2199
+ "loss": 0.175,
2200
+ "step": 1090,
2201
+ "student_loss": 0.035642288625240326,
2202
+ "teacher_loss": 0.0027311204466968775
2203
+ },
2204
+ {
2205
+ "epoch": 0.3157439446366782,
2206
+ "grad_norm": 3.6875,
2207
+ "kd_loss": 0.12353515625,
2208
+ "learning_rate": 7.339410542841906e-06,
2209
+ "loss": 0.1985,
2210
+ "step": 1095,
2211
+ "student_loss": 0.14903123676776886,
2212
+ "teacher_loss": 0.007329413667321205
2213
+ },
2214
+ {
2215
+ "epoch": 0.31718569780853517,
2216
+ "grad_norm": 7.9375,
2217
+ "kd_loss": 0.11279296875,
2218
+ "learning_rate": 7.315523389425867e-06,
2219
+ "loss": 0.1747,
2220
+ "step": 1100,
2221
+ "student_loss": 0.0011135012609884143,
2222
+ "teacher_loss": 0.0004325899062678218
2223
+ },
2224
+ {
2225
+ "epoch": 0.31862745098039214,
2226
+ "grad_norm": 3.328125,
2227
+ "kd_loss": 0.099609375,
2228
+ "learning_rate": 7.291568767379484e-06,
2229
+ "loss": 0.1604,
2230
+ "step": 1105,
2231
+ "student_loss": 0.012246989645063877,
2232
+ "teacher_loss": 0.0004818146117031574
2233
+ },
2234
+ {
2235
+ "epoch": 0.32006920415224915,
2236
+ "grad_norm": 4.15625,
2237
+ "kd_loss": 0.107421875,
2238
+ "learning_rate": 7.267547374681259e-06,
2239
+ "loss": 0.1857,
2240
+ "step": 1110,
2241
+ "student_loss": 0.0022737043909728527,
2242
+ "teacher_loss": 0.0010464838705956936
2243
+ },
2244
+ {
2245
+ "epoch": 0.3215109573241061,
2246
+ "grad_norm": 3.625,
2247
+ "kd_loss": 0.1083984375,
2248
+ "learning_rate": 7.24345991125522e-06,
2249
+ "loss": 0.1756,
2250
+ "step": 1115,
2251
+ "student_loss": 0.013854903168976307,
2252
+ "teacher_loss": 0.0004721402656286955
2253
+ },
2254
+ {
2255
+ "epoch": 0.3229527104959631,
2256
+ "grad_norm": 4.78125,
2257
+ "kd_loss": 0.11474609375,
2258
+ "learning_rate": 7.219307078950536e-06,
2259
+ "loss": 0.175,
2260
+ "step": 1120,
2261
+ "student_loss": 0.08585968613624573,
2262
+ "teacher_loss": 0.0014684133930131793
2263
+ },
2264
+ {
2265
+ "epoch": 0.32439446366782004,
2266
+ "grad_norm": 4.4375,
2267
+ "kd_loss": 0.11572265625,
2268
+ "learning_rate": 7.195089581521064e-06,
2269
+ "loss": 0.1717,
2270
+ "step": 1125,
2271
+ "student_loss": 0.04334001988172531,
2272
+ "teacher_loss": 0.021173296496272087
2273
+ },
2274
+ {
2275
+ "epoch": 0.32583621683967706,
2276
+ "grad_norm": 6.65625,
2277
+ "kd_loss": 0.1220703125,
2278
+ "learning_rate": 7.170808124604842e-06,
2279
+ "loss": 0.1814,
2280
+ "step": 1130,
2281
+ "student_loss": 0.0865454375743866,
2282
+ "teacher_loss": 0.000596641271840781
2283
+ },
2284
+ {
2285
+ "epoch": 0.327277970011534,
2286
+ "grad_norm": 5.90625,
2287
+ "kd_loss": 0.099609375,
2288
+ "learning_rate": 7.14646341570353e-06,
2289
+ "loss": 0.1876,
2290
+ "step": 1135,
2291
+ "student_loss": 0.06569283455610275,
2292
+ "teacher_loss": 0.005785806570202112
2293
+ },
2294
+ {
2295
+ "epoch": 0.328719723183391,
2296
+ "grad_norm": 4.03125,
2297
+ "kd_loss": 0.1279296875,
2298
+ "learning_rate": 7.122056164161795e-06,
2299
+ "loss": 0.1885,
2300
+ "step": 1140,
2301
+ "student_loss": 0.03503354638814926,
2302
+ "teacher_loss": 0.003907355945557356
2303
+ },
2304
+ {
2305
+ "epoch": 0.330161476355248,
2306
+ "grad_norm": 9.0625,
2307
+ "kd_loss": 0.0908203125,
2308
+ "learning_rate": 7.097587081146636e-06,
2309
+ "loss": 0.1845,
2310
+ "step": 1145,
2311
+ "student_loss": 0.0037424068432301283,
2312
+ "teacher_loss": 0.0011356781469658017
2313
+ },
2314
+ {
2315
+ "epoch": 0.33160322952710497,
2316
+ "grad_norm": 5.1875,
2317
+ "kd_loss": 0.08984375,
2318
+ "learning_rate": 7.073056879626681e-06,
2319
+ "loss": 0.2248,
2320
+ "step": 1150,
2321
+ "student_loss": 0.002525654621422291,
2322
+ "teacher_loss": 0.0010445830412209034
2323
+ },
2324
+ {
2325
+ "epoch": 0.33304498269896193,
2326
+ "grad_norm": 5.1875,
2327
+ "kd_loss": 0.1044921875,
2328
+ "learning_rate": 7.048466274351389e-06,
2329
+ "loss": 0.1782,
2330
+ "step": 1155,
2331
+ "student_loss": 0.05463318154215813,
2332
+ "teacher_loss": 0.00036173113039694726
2333
+ },
2334
+ {
2335
+ "epoch": 0.3344867358708189,
2336
+ "grad_norm": 5.84375,
2337
+ "kd_loss": 0.11083984375,
2338
+ "learning_rate": 7.023815981830236e-06,
2339
+ "loss": 0.2102,
2340
+ "step": 1160,
2341
+ "student_loss": 0.002071135677397251,
2342
+ "teacher_loss": 0.030623162165284157
2343
+ },
2344
+ {
2345
+ "epoch": 0.3359284890426759,
2346
+ "grad_norm": 4.5,
2347
+ "kd_loss": 0.119140625,
2348
+ "learning_rate": 6.999106720311846e-06,
2349
+ "loss": 0.1978,
2350
+ "step": 1165,
2351
+ "student_loss": 0.06917373836040497,
2352
+ "teacher_loss": 0.0006929833325557411
2353
+ },
2354
+ {
2355
+ "epoch": 0.3373702422145329,
2356
+ "grad_norm": 4.9375,
2357
+ "kd_loss": 0.126953125,
2358
+ "learning_rate": 6.974339209763043e-06,
2359
+ "loss": 0.1661,
2360
+ "step": 1170,
2361
+ "student_loss": 0.047193318605422974,
2362
+ "teacher_loss": 0.01937233842909336
2363
+ },
2364
+ {
2365
+ "epoch": 0.33881199538638984,
2366
+ "grad_norm": 5.0625,
2367
+ "kd_loss": 0.10302734375,
2368
+ "learning_rate": 6.949514171847891e-06,
2369
+ "loss": 0.1856,
2370
+ "step": 1175,
2371
+ "student_loss": 0.0045834011398255825,
2372
+ "teacher_loss": 0.0021127781365066767
2373
+ },
2374
+ {
2375
+ "epoch": 0.3402537485582468,
2376
+ "grad_norm": 4.4375,
2377
+ "kd_loss": 0.10888671875,
2378
+ "learning_rate": 6.924632329906657e-06,
2379
+ "loss": 0.1824,
2380
+ "step": 1180,
2381
+ "student_loss": 0.03225143998861313,
2382
+ "teacher_loss": 0.0007702379371039569
2383
+ },
2384
+ {
2385
+ "epoch": 0.3416955017301038,
2386
+ "grad_norm": 4.53125,
2387
+ "kd_loss": 0.109375,
2388
+ "learning_rate": 6.899694408934734e-06,
2389
+ "loss": 0.1567,
2390
+ "step": 1185,
2391
+ "student_loss": 0.011755186133086681,
2392
+ "teacher_loss": 0.0007617790251970291
2393
+ },
2394
+ {
2395
+ "epoch": 0.3431372549019608,
2396
+ "grad_norm": 4.46875,
2397
+ "kd_loss": 0.1123046875,
2398
+ "learning_rate": 6.874701135561524e-06,
2399
+ "loss": 0.1666,
2400
+ "step": 1190,
2401
+ "student_loss": 0.0012172460556030273,
2402
+ "teacher_loss": 0.00039286207174882293
2403
+ },
2404
+ {
2405
+ "epoch": 0.34457900807381775,
2406
+ "grad_norm": 5.34375,
2407
+ "kd_loss": 0.09814453125,
2408
+ "learning_rate": 6.849653238029261e-06,
2409
+ "loss": 0.1611,
2410
+ "step": 1195,
2411
+ "student_loss": 0.005217494908720255,
2412
+ "teacher_loss": 0.0005248200614005327
2413
+ },
2414
+ {
2415
+ "epoch": 0.3460207612456747,
2416
+ "grad_norm": 4.9375,
2417
+ "kd_loss": 0.10546875,
2418
+ "learning_rate": 6.824551446171788e-06,
2419
+ "loss": 0.2491,
2420
+ "step": 1200,
2421
+ "student_loss": 0.00212606368586421,
2422
+ "teacher_loss": 0.0009664412937127054
2423
+ },
2424
+ {
2425
+ "epoch": 0.34746251441753173,
2426
+ "grad_norm": 5.25,
2427
+ "kd_loss": 0.10302734375,
2428
+ "learning_rate": 6.7993964913932975e-06,
2429
+ "loss": 0.206,
2430
+ "step": 1205,
2431
+ "student_loss": 0.0657915249466896,
2432
+ "teacher_loss": 0.0004779589653480798
2433
+ },
2434
+ {
2435
+ "epoch": 0.3489042675893887,
2436
+ "grad_norm": 6.71875,
2437
+ "kd_loss": 0.1025390625,
2438
+ "learning_rate": 6.774189106647021e-06,
2439
+ "loss": 0.1767,
2440
+ "step": 1210,
2441
+ "student_loss": 0.054130807518959045,
2442
+ "teacher_loss": 0.0005891909822821617
2443
+ },
2444
+ {
2445
+ "epoch": 0.35034602076124566,
2446
+ "grad_norm": 4.8125,
2447
+ "kd_loss": 0.10400390625,
2448
+ "learning_rate": 6.748930026413865e-06,
2449
+ "loss": 0.1911,
2450
+ "step": 1215,
2451
+ "student_loss": 0.10985054075717926,
2452
+ "teacher_loss": 0.0005408765282481909
2453
+ },
2454
+ {
2455
+ "epoch": 0.3517877739331027,
2456
+ "grad_norm": 4.9375,
2457
+ "kd_loss": 0.115234375,
2458
+ "learning_rate": 6.7236199866810185e-06,
2459
+ "loss": 0.1839,
2460
+ "step": 1220,
2461
+ "student_loss": 0.04293162375688553,
2462
+ "teacher_loss": 0.010158946737647057
2463
+ },
2464
+ {
2465
+ "epoch": 0.35322952710495964,
2466
+ "grad_norm": 5.46875,
2467
+ "kd_loss": 0.11328125,
2468
+ "learning_rate": 6.698259724920503e-06,
2469
+ "loss": 0.1899,
2470
+ "step": 1225,
2471
+ "student_loss": 0.025801874697208405,
2472
+ "teacher_loss": 0.0039003598503768444
2473
+ },
2474
+ {
2475
+ "epoch": 0.3546712802768166,
2476
+ "grad_norm": 7.03125,
2477
+ "kd_loss": 0.103515625,
2478
+ "learning_rate": 6.672849980067685e-06,
2479
+ "loss": 0.1782,
2480
+ "step": 1230,
2481
+ "student_loss": 0.0011705292854458094,
2482
+ "teacher_loss": 0.0013004811480641365
2483
+ },
2484
+ {
2485
+ "epoch": 0.35611303344867357,
2486
+ "grad_norm": 4.96875,
2487
+ "kd_loss": 0.134765625,
2488
+ "learning_rate": 6.647391492499746e-06,
2489
+ "loss": 0.1692,
2490
+ "step": 1235,
2491
+ "student_loss": 0.0030034382361918688,
2492
+ "teacher_loss": 0.0015800351975485682
2493
+ },
2494
+ {
2495
+ "epoch": 0.3575547866205306,
2496
+ "grad_norm": 5.6875,
2497
+ "kd_loss": 0.1015625,
2498
+ "learning_rate": 6.621885004014113e-06,
2499
+ "loss": 0.1994,
2500
+ "step": 1240,
2501
+ "student_loss": 0.0015772647457197309,
2502
+ "teacher_loss": 0.0011423062533140182
2503
+ },
2504
+ {
2505
+ "epoch": 0.35899653979238755,
2506
+ "grad_norm": 4.25,
2507
+ "kd_loss": 0.11669921875,
2508
+ "learning_rate": 6.596331257806837e-06,
2509
+ "loss": 0.1877,
2510
+ "step": 1245,
2511
+ "student_loss": 0.0013259351253509521,
2512
+ "teacher_loss": 0.0011870627058669925
2513
+ },
2514
+ {
2515
+ "epoch": 0.3604382929642445,
2516
+ "grad_norm": 3.96875,
2517
+ "kd_loss": 0.1162109375,
2518
+ "learning_rate": 6.570730998450945e-06,
2519
+ "loss": 0.1859,
2520
+ "step": 1250,
2521
+ "student_loss": 0.26618441939353943,
2522
+ "teacher_loss": 0.002830672077834606
2523
+ },
2524
+ {
2525
+ "epoch": 0.3618800461361015,
2526
+ "grad_norm": 4.75,
2527
+ "kd_loss": 0.11962890625,
2528
+ "learning_rate": 6.545084971874738e-06,
2529
+ "loss": 0.1923,
2530
+ "step": 1255,
2531
+ "student_loss": 0.04056844487786293,
2532
+ "teacher_loss": 0.006723723839968443
2533
+ },
2534
+ {
2535
+ "epoch": 0.3633217993079585,
2536
+ "grad_norm": 5.0,
2537
+ "kd_loss": 0.1142578125,
2538
+ "learning_rate": 6.519393925340067e-06,
2539
+ "loss": 0.1923,
2540
+ "step": 1260,
2541
+ "student_loss": 0.0006198959308676422,
2542
+ "teacher_loss": 0.0006204553064890206
2543
+ },
2544
+ {
2545
+ "epoch": 0.36476355247981546,
2546
+ "grad_norm": 6.21875,
2547
+ "kd_loss": 0.10888671875,
2548
+ "learning_rate": 6.49365860742055e-06,
2549
+ "loss": 0.1947,
2550
+ "step": 1265,
2551
+ "student_loss": 0.41350483894348145,
2552
+ "teacher_loss": 0.020098216831684113
2553
+ },
2554
+ {
2555
+ "epoch": 0.3662053056516724,
2556
+ "grad_norm": 4.65625,
2557
+ "kd_loss": 0.10107421875,
2558
+ "learning_rate": 6.467879767979764e-06,
2559
+ "loss": 0.1638,
2560
+ "step": 1270,
2561
+ "student_loss": 0.0016804474871605635,
2562
+ "teacher_loss": 0.0008275846485048532
2563
+ },
2564
+ {
2565
+ "epoch": 0.36764705882352944,
2566
+ "grad_norm": 4.8125,
2567
+ "kd_loss": 0.107421875,
2568
+ "learning_rate": 6.442058158149396e-06,
2569
+ "loss": 0.1518,
2570
+ "step": 1275,
2571
+ "student_loss": 0.07305289059877396,
2572
+ "teacher_loss": 0.0009938895236700773
2573
+ },
2574
+ {
2575
+ "epoch": 0.3690888119953864,
2576
+ "grad_norm": 4.53125,
2577
+ "kd_loss": 0.10693359375,
2578
+ "learning_rate": 6.4161945303073535e-06,
2579
+ "loss": 0.1549,
2580
+ "step": 1280,
2581
+ "student_loss": 0.003565654158592224,
2582
+ "teacher_loss": 0.0004424946673680097
2583
+ },
2584
+ {
2585
+ "epoch": 0.37053056516724336,
2586
+ "grad_norm": 4.15625,
2587
+ "kd_loss": 0.11865234375,
2588
+ "learning_rate": 6.390289638055851e-06,
2589
+ "loss": 0.1723,
2590
+ "step": 1285,
2591
+ "student_loss": 0.0022811174858361483,
2592
+ "teacher_loss": 0.000994974747300148
2593
+ },
2594
+ {
2595
+ "epoch": 0.3719723183391003,
2596
+ "grad_norm": 6.40625,
2597
+ "kd_loss": 0.08837890625,
2598
+ "learning_rate": 6.364344236199441e-06,
2599
+ "loss": 0.1828,
2600
+ "step": 1290,
2601
+ "student_loss": 0.4067663848400116,
2602
+ "teacher_loss": 0.015121642500162125
2603
+ },
2604
+ {
2605
+ "epoch": 0.37341407151095735,
2606
+ "grad_norm": 6.5,
2607
+ "kd_loss": 0.091796875,
2608
+ "learning_rate": 6.3383590807230264e-06,
2609
+ "loss": 0.2023,
2610
+ "step": 1295,
2611
+ "student_loss": 0.0015801489353179932,
2612
+ "teacher_loss": 0.007307669147849083
2613
+ },
2614
+ {
2615
+ "epoch": 0.3748558246828143,
2616
+ "grad_norm": 4.75,
2617
+ "kd_loss": 0.1259765625,
2618
+ "learning_rate": 6.3123349287698345e-06,
2619
+ "loss": 0.1514,
2620
+ "step": 1300,
2621
+ "student_loss": 0.002292349934577942,
2622
+ "teacher_loss": 0.0012089353986084461
2623
+ },
2624
+ {
2625
+ "epoch": 0.3762975778546713,
2626
+ "grad_norm": 4.875,
2627
+ "kd_loss": 0.12451171875,
2628
+ "learning_rate": 6.286272538619351e-06,
2629
+ "loss": 0.1734,
2630
+ "step": 1305,
2631
+ "student_loss": 0.004027123097330332,
2632
+ "teacher_loss": 0.30039486289024353
2633
+ },
2634
+ {
2635
+ "epoch": 0.37773933102652824,
2636
+ "grad_norm": 5.6875,
2637
+ "kd_loss": 0.111328125,
2638
+ "learning_rate": 6.260172669665233e-06,
2639
+ "loss": 0.1584,
2640
+ "step": 1310,
2641
+ "student_loss": 0.00176389014814049,
2642
+ "teacher_loss": 0.001472116680815816
2643
+ },
2644
+ {
2645
+ "epoch": 0.37918108419838525,
2646
+ "grad_norm": 6.40625,
2647
+ "kd_loss": 0.11669921875,
2648
+ "learning_rate": 6.234036082393171e-06,
2649
+ "loss": 0.1926,
2650
+ "step": 1315,
2651
+ "student_loss": 0.11009548604488373,
2652
+ "teacher_loss": 0.0014415290206670761
2653
+ },
2654
+ {
2655
+ "epoch": 0.3806228373702422,
2656
+ "grad_norm": 4.78125,
2657
+ "kd_loss": 0.11865234375,
2658
+ "learning_rate": 6.207863538358741e-06,
2659
+ "loss": 0.1882,
2660
+ "step": 1320,
2661
+ "student_loss": 0.012124598026275635,
2662
+ "teacher_loss": 0.0018767216242849827
2663
+ },
2664
+ {
2665
+ "epoch": 0.3820645905420992,
2666
+ "grad_norm": 5.1875,
2667
+ "kd_loss": 0.11328125,
2668
+ "learning_rate": 6.181655800165207e-06,
2669
+ "loss": 0.1981,
2670
+ "step": 1325,
2671
+ "student_loss": 0.009575133211910725,
2672
+ "teacher_loss": 0.0010605778079479933
2673
+ },
2674
+ {
2675
+ "epoch": 0.38350634371395614,
2676
+ "grad_norm": 6.09375,
2677
+ "kd_loss": 0.111328125,
2678
+ "learning_rate": 6.155413631441307e-06,
2679
+ "loss": 0.1758,
2680
+ "step": 1330,
2681
+ "student_loss": 0.06053456291556358,
2682
+ "teacher_loss": 0.0006676834309473634
2683
+ },
2684
+ {
2685
+ "epoch": 0.38494809688581316,
2686
+ "grad_norm": 5.53125,
2687
+ "kd_loss": 0.099609375,
2688
+ "learning_rate": 6.129137796818997e-06,
2689
+ "loss": 0.1761,
2690
+ "step": 1335,
2691
+ "student_loss": 0.0009542008046992123,
2692
+ "teacher_loss": 0.0006002294248901308
2693
+ },
2694
+ {
2695
+ "epoch": 0.3863898500576701,
2696
+ "grad_norm": 5.75,
2697
+ "kd_loss": 0.1650390625,
2698
+ "learning_rate": 6.102829061911176e-06,
2699
+ "loss": 0.1853,
2700
+ "step": 1340,
2701
+ "student_loss": 0.005108754616230726,
2702
+ "teacher_loss": 0.005764763802289963
2703
+ },
2704
+ {
2705
+ "epoch": 0.3878316032295271,
2706
+ "grad_norm": 6.65625,
2707
+ "kd_loss": 0.12353515625,
2708
+ "learning_rate": 6.076488193289375e-06,
2709
+ "loss": 0.1821,
2710
+ "step": 1345,
2711
+ "student_loss": 0.0015393177745863795,
2712
+ "teacher_loss": 0.0016605017008259892
2713
+ },
2714
+ {
2715
+ "epoch": 0.3892733564013841,
2716
+ "grad_norm": 8.4375,
2717
+ "kd_loss": 0.11865234375,
2718
+ "learning_rate": 6.050115958461423e-06,
2719
+ "loss": 0.1673,
2720
+ "step": 1350,
2721
+ "student_loss": 0.048770517110824585,
2722
+ "teacher_loss": 0.0007486655958928168
2723
+ },
2724
+ {
2725
+ "epoch": 0.39071510957324107,
2726
+ "grad_norm": 6.1875,
2727
+ "kd_loss": 0.103515625,
2728
+ "learning_rate": 6.02371312584908e-06,
2729
+ "loss": 0.1894,
2730
+ "step": 1355,
2731
+ "student_loss": 0.09447399526834488,
2732
+ "teacher_loss": 0.0003722972178366035
2733
+ },
2734
+ {
2735
+ "epoch": 0.39215686274509803,
2736
+ "grad_norm": 8.5,
2737
+ "kd_loss": 0.11328125,
2738
+ "learning_rate": 5.997280464765655e-06,
2739
+ "loss": 0.1885,
2740
+ "step": 1360,
2741
+ "student_loss": 0.14768695831298828,
2742
+ "teacher_loss": 0.0006404736777767539
2743
+ },
2744
+ {
2745
+ "epoch": 0.393598615916955,
2746
+ "grad_norm": 7.8125,
2747
+ "kd_loss": 0.109375,
2748
+ "learning_rate": 5.970818745393579e-06,
2749
+ "loss": 0.1734,
2750
+ "step": 1365,
2751
+ "student_loss": 0.01714850589632988,
2752
+ "teacher_loss": 0.029827572405338287
2753
+ },
2754
+ {
2755
+ "epoch": 0.395040369088812,
2756
+ "grad_norm": 7.6875,
2757
+ "kd_loss": 0.09619140625,
2758
+ "learning_rate": 5.9443287387619754e-06,
2759
+ "loss": 0.1706,
2760
+ "step": 1370,
2761
+ "student_loss": 0.32004305720329285,
2762
+ "teacher_loss": 0.018626194447278976
2763
+ },
2764
+ {
2765
+ "epoch": 0.396482122260669,
2766
+ "grad_norm": 4.5,
2767
+ "kd_loss": 0.10791015625,
2768
+ "learning_rate": 5.9178112167241805e-06,
2769
+ "loss": 0.1781,
2770
+ "step": 1375,
2771
+ "student_loss": 0.09515227377414703,
2772
+ "teacher_loss": 0.00036773309693671763
2773
+ },
2774
+ {
2775
+ "epoch": 0.39792387543252594,
2776
+ "grad_norm": 4.6875,
2777
+ "kd_loss": 0.10546875,
2778
+ "learning_rate": 5.8912669519352725e-06,
2779
+ "loss": 0.1764,
2780
+ "step": 1380,
2781
+ "student_loss": 0.011620831675827503,
2782
+ "teacher_loss": 0.0029276730492711067
2783
+ },
2784
+ {
2785
+ "epoch": 0.3993656286043829,
2786
+ "grad_norm": 4.46875,
2787
+ "kd_loss": 0.10791015625,
2788
+ "learning_rate": 5.864696717829539e-06,
2789
+ "loss": 0.1843,
2790
+ "step": 1385,
2791
+ "student_loss": 0.10878758132457733,
2792
+ "teacher_loss": 0.013216445222496986
2793
+ },
2794
+ {
2795
+ "epoch": 0.4008073817762399,
2796
+ "grad_norm": 6.53125,
2797
+ "kd_loss": 0.126953125,
2798
+ "learning_rate": 5.838101288597951e-06,
2799
+ "loss": 0.1739,
2800
+ "step": 1390,
2801
+ "student_loss": 0.0017183037707582116,
2802
+ "teacher_loss": 0.0011319200275465846
2803
+ },
2804
+ {
2805
+ "epoch": 0.4022491349480969,
2806
+ "grad_norm": 4.5,
2807
+ "kd_loss": 0.1171875,
2808
+ "learning_rate": 5.8114814391656046e-06,
2809
+ "loss": 0.1812,
2810
+ "step": 1395,
2811
+ "student_loss": 0.0009771361947059631,
2812
+ "teacher_loss": 0.0006259285728447139
2813
+ },
2814
+ {
2815
+ "epoch": 0.40369088811995385,
2816
+ "grad_norm": 4.59375,
2817
+ "kd_loss": 0.10888671875,
2818
+ "learning_rate": 5.78483794516914e-06,
2819
+ "loss": 0.1759,
2820
+ "step": 1400,
2821
+ "student_loss": 0.0028622837271541357,
2822
+ "teacher_loss": 0.00048269989201799035
2823
+ },
2824
+ {
2825
+ "epoch": 0.40513264129181087,
2826
+ "grad_norm": 5.84375,
2827
+ "kd_loss": 0.1064453125,
2828
+ "learning_rate": 5.75817158293414e-06,
2829
+ "loss": 0.1843,
2830
+ "step": 1405,
2831
+ "student_loss": 0.0025167667772620916,
2832
+ "teacher_loss": 0.0007314765243791044
2833
+ },
2834
+ {
2835
+ "epoch": 0.40657439446366783,
2836
+ "grad_norm": 4.625,
2837
+ "kd_loss": 0.10693359375,
2838
+ "learning_rate": 5.731483129452514e-06,
2839
+ "loss": 0.1836,
2840
+ "step": 1410,
2841
+ "student_loss": 0.0036077857948839664,
2842
+ "teacher_loss": 0.0006345498841255903
2843
+ },
2844
+ {
2845
+ "epoch": 0.4080161476355248,
2846
+ "grad_norm": 9.625,
2847
+ "kd_loss": 0.1083984375,
2848
+ "learning_rate": 5.704773362359854e-06,
2849
+ "loss": 0.1652,
2850
+ "step": 1415,
2851
+ "student_loss": 0.011436261236667633,
2852
+ "teacher_loss": 0.01508229598402977
2853
+ },
2854
+ {
2855
+ "epoch": 0.40945790080738176,
2856
+ "grad_norm": 5.90625,
2857
+ "kd_loss": 0.1201171875,
2858
+ "learning_rate": 5.678043059912776e-06,
2859
+ "loss": 0.202,
2860
+ "step": 1420,
2861
+ "student_loss": 0.001543233753181994,
2862
+ "teacher_loss": 0.0010519566712900996
2863
+ },
2864
+ {
2865
+ "epoch": 0.4108996539792388,
2866
+ "grad_norm": 4.1875,
2867
+ "kd_loss": 0.09765625,
2868
+ "learning_rate": 5.6512930009662524e-06,
2869
+ "loss": 0.1847,
2870
+ "step": 1425,
2871
+ "student_loss": 0.1836177557706833,
2872
+ "teacher_loss": 0.000503146555274725
2873
+ },
2874
+ {
2875
+ "epoch": 0.41234140715109574,
2876
+ "grad_norm": 5.09375,
2877
+ "kd_loss": 0.11279296875,
2878
+ "learning_rate": 5.624523964950903e-06,
2879
+ "loss": 0.1717,
2880
+ "step": 1430,
2881
+ "student_loss": 0.005288061685860157,
2882
+ "teacher_loss": 0.0005534209776669741
2883
+ },
2884
+ {
2885
+ "epoch": 0.4137831603229527,
2886
+ "grad_norm": 5.125,
2887
+ "kd_loss": 0.1044921875,
2888
+ "learning_rate": 5.597736731850295e-06,
2889
+ "loss": 0.1881,
2890
+ "step": 1435,
2891
+ "student_loss": 0.019315902143716812,
2892
+ "teacher_loss": 0.00202935584820807
2893
+ },
2894
+ {
2895
+ "epoch": 0.41522491349480967,
2896
+ "grad_norm": 8.5,
2897
+ "kd_loss": 0.10107421875,
2898
+ "learning_rate": 5.570932082178219e-06,
2899
+ "loss": 0.1903,
2900
+ "step": 1440,
2901
+ "student_loss": 0.08893950283527374,
2902
+ "teacher_loss": 0.000398140778997913
2903
+ },
2904
+ {
2905
+ "epoch": 0.4166666666666667,
2906
+ "grad_norm": 9.5625,
2907
+ "kd_loss": 0.12890625,
2908
+ "learning_rate": 5.5441107969559315e-06,
2909
+ "loss": 0.1919,
2910
+ "step": 1445,
2911
+ "student_loss": 0.2631194293498993,
2912
+ "teacher_loss": 0.0044821687042713165
2913
+ },
2914
+ {
2915
+ "epoch": 0.41810841983852365,
2916
+ "grad_norm": 5.1875,
2917
+ "kd_loss": 0.1826171875,
2918
+ "learning_rate": 5.517273657689419e-06,
2919
+ "loss": 0.1631,
2920
+ "step": 1450,
2921
+ "student_loss": 0.006265243049710989,
2922
+ "teacher_loss": 0.0023199233692139387
2923
+ },
2924
+ {
2925
+ "epoch": 0.4195501730103806,
2926
+ "grad_norm": 5.84375,
2927
+ "kd_loss": 0.12890625,
2928
+ "learning_rate": 5.490421446346608e-06,
2929
+ "loss": 0.1744,
2930
+ "step": 1455,
2931
+ "student_loss": 0.0026093143969774246,
2932
+ "teacher_loss": 0.0007649324252270162
2933
+ },
2934
+ {
2935
+ "epoch": 0.4209919261822376,
2936
+ "grad_norm": 5.8125,
2937
+ "kd_loss": 0.134765625,
2938
+ "learning_rate": 5.463554945334589e-06,
2939
+ "loss": 0.1769,
2940
+ "step": 1460,
2941
+ "student_loss": 0.0007114948239177465,
2942
+ "teacher_loss": 0.0003705586714204401
2943
+ },
2944
+ {
2945
+ "epoch": 0.4224336793540946,
2946
+ "grad_norm": 13.4375,
2947
+ "kd_loss": 0.1435546875,
2948
+ "learning_rate": 5.43667493747682e-06,
2949
+ "loss": 0.2009,
2950
+ "step": 1465,
2951
+ "student_loss": 0.0005334956222213805,
2952
+ "teacher_loss": 0.013963425531983376
2953
+ },
2954
+ {
2955
+ "epoch": 0.42387543252595156,
2956
+ "grad_norm": 8.375,
2957
+ "kd_loss": 0.1015625,
2958
+ "learning_rate": 5.409782205990317e-06,
2959
+ "loss": 0.1964,
2960
+ "step": 1470,
2961
+ "student_loss": 0.0011849907459691167,
2962
+ "teacher_loss": 0.004421910271048546
2963
+ },
2964
+ {
2965
+ "epoch": 0.4253171856978085,
2966
+ "grad_norm": 4.5625,
2967
+ "kd_loss": 0.11376953125,
2968
+ "learning_rate": 5.3828775344628245e-06,
2969
+ "loss": 0.1738,
2970
+ "step": 1475,
2971
+ "student_loss": 0.0008997453842312098,
2972
+ "teacher_loss": 0.0010891201673075557
2973
+ },
2974
+ {
2975
+ "epoch": 0.42675893886966554,
2976
+ "grad_norm": 4.78125,
2977
+ "kd_loss": 0.1123046875,
2978
+ "learning_rate": 5.355961706829997e-06,
2979
+ "loss": 0.1813,
2980
+ "step": 1480,
2981
+ "student_loss": 0.000982648809440434,
2982
+ "teacher_loss": 0.0016894500004127622
2983
+ },
2984
+ {
2985
+ "epoch": 0.4282006920415225,
2986
+ "grad_norm": 13.5625,
2987
+ "kd_loss": 0.11279296875,
2988
+ "learning_rate": 5.329035507352548e-06,
2989
+ "loss": 0.185,
2990
+ "step": 1485,
2991
+ "student_loss": 0.0005971972714178264,
2992
+ "teacher_loss": 0.0009129931568168104
2993
+ },
2994
+ {
2995
+ "epoch": 0.42964244521337946,
2996
+ "grad_norm": 7.09375,
2997
+ "kd_loss": 0.119140625,
2998
+ "learning_rate": 5.3020997205933985e-06,
2999
+ "loss": 0.1721,
3000
+ "step": 1490,
3001
+ "student_loss": 0.004597559571266174,
3002
+ "teacher_loss": 0.001885790377855301
3003
+ },
3004
+ {
3005
+ "epoch": 0.43108419838523643,
3006
+ "grad_norm": 4.6875,
3007
+ "kd_loss": 0.1015625,
3008
+ "learning_rate": 5.275155131394825e-06,
3009
+ "loss": 0.18,
3010
+ "step": 1495,
3011
+ "student_loss": 0.0022636561188846827,
3012
+ "teacher_loss": 0.0010887808166444302
3013
+ },
3014
+ {
3015
+ "epoch": 0.43252595155709345,
3016
+ "grad_norm": 6.21875,
3017
+ "kd_loss": 0.1103515625,
3018
+ "learning_rate": 5.248202524855578e-06,
3019
+ "loss": 0.1757,
3020
+ "step": 1500,
3021
+ "student_loss": 0.04894772171974182,
3022
+ "teacher_loss": 0.004637535195797682
3023
+ }
3024
+ ],
3025
+ "logging_steps": 5,
3026
+ "max_steps": 3000,
3027
+ "num_input_tokens_seen": 0,
3028
+ "num_train_epochs": 1,
3029
+ "save_steps": 500,
3030
+ "stateful_callbacks": {
3031
+ "TrainerControl": {
3032
+ "args": {
3033
+ "should_epoch_stop": false,
3034
+ "should_evaluate": false,
3035
+ "should_log": false,
3036
+ "should_save": true,
3037
+ "should_training_stop": false
3038
+ },
3039
+ "attributes": {}
3040
+ }
3041
+ },
3042
+ "total_flos": 0.0,
3043
+ "train_batch_size": 1,
3044
+ "trial_name": null,
3045
+ "trial_params": null
3046
+ }
checkpoints/codi-single-1.5b/checkpoint-1500/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-2000/added_tokens.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|action_sep|>": 151670,
5
+ "<|arg_sep|>": 151671,
6
+ "<|box_end|>": 151649,
7
+ "<|box_start|>": 151648,
8
+ "<|call_sep|>": 151666,
9
+ "<|end_of_text|>": 151673,
10
+ "<|endoftext|>": 151643,
11
+ "<|exception_sep|>": 151669,
12
+ "<|file_sep|>": 151664,
13
+ "<|fim_middle|>": 151660,
14
+ "<|fim_pad|>": 151662,
15
+ "<|fim_prefix|>": 151659,
16
+ "<|fim_suffix|>": 151661,
17
+ "<|frame_sep|>": 151672,
18
+ "<|im_end|>": 151645,
19
+ "<|im_start|>": 151644,
20
+ "<|image_pad|>": 151655,
21
+ "<|latent_end|>": 151675,
22
+ "<|latent_start|>": 151674,
23
+ "<|line_sep|>": 151667,
24
+ "<|object_ref_end|>": 151647,
25
+ "<|object_ref_start|>": 151646,
26
+ "<|quad_end|>": 151651,
27
+ "<|quad_start|>": 151650,
28
+ "<|repo_name|>": 151663,
29
+ "<|return_sep|>": 151668,
30
+ "<|trace_context_start|>": 151665,
31
+ "<|video_pad|>": 151656,
32
+ "<|vision_end|>": 151653,
33
+ "<|vision_pad|>": 151654,
34
+ "<|vision_start|>": 151652
35
+ }
checkpoints/codi-single-1.5b/checkpoint-2000/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/codi-single-1.5b/checkpoint-2000/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151643,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention"
41
+ ],
42
+ "max_position_embeddings": 32768,
43
+ "max_window_layers": 28,
44
+ "model_type": "qwen2",
45
+ "num_attention_heads": 12,
46
+ "num_hidden_layers": 28,
47
+ "num_key_value_heads": 2,
48
+ "pad_token_id": 151643,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": true,
54
+ "transformers_version": "4.57.6",
55
+ "use_cache": true,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 151676
58
+ }
checkpoints/codi-single-1.5b/checkpoint-2000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-2000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d14f7a986351cfea6fded3a7c0f17dc151227c0c793905389b9a498eaa870408
3
+ size 3096212347
checkpoints/codi-single-1.5b/checkpoint-2000/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/codi-single-1.5b/checkpoint-2000/thought_projector.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:523afd8eab7d0a92752c8aa50257d29101b9d3523d28f97b529f3f279756aa10
3
+ size 9445953
checkpoints/codi-single-1.5b/checkpoint-2000/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83a790d654474f5dfe225f889afd0210313eb1083f942671f2c4b8e95a1c922b
3
+ size 11424004
checkpoints/codi-single-1.5b/checkpoint-2000/tokenizer_config.json ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|trace_context_start|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|call_sep|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|line_sep|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<|return_sep|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<|exception_sep|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|action_sep|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<|arg_sep|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<|frame_sep|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<|end_of_text|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<|latent_start|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<|latent_end|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ }
269
+ },
270
+ "additional_special_tokens": [
271
+ "<|im_start|>",
272
+ "<|im_end|>",
273
+ "<|object_ref_start|>",
274
+ "<|object_ref_end|>",
275
+ "<|box_start|>",
276
+ "<|box_end|>",
277
+ "<|quad_start|>",
278
+ "<|quad_end|>",
279
+ "<|vision_start|>",
280
+ "<|vision_end|>",
281
+ "<|vision_pad|>",
282
+ "<|image_pad|>",
283
+ "<|video_pad|>"
284
+ ],
285
+ "bos_token": null,
286
+ "clean_up_tokenization_spaces": false,
287
+ "eos_token": "<|endoftext|>",
288
+ "errors": "replace",
289
+ "extra_special_tokens": {},
290
+ "model_max_length": 32768,
291
+ "pad_token": "<|endoftext|>",
292
+ "split_special_tokens": false,
293
+ "tokenizer_class": "Qwen2Tokenizer",
294
+ "unk_token": null
295
+ }
checkpoints/codi-single-1.5b/checkpoint-2000/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-2000/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-2500/added_tokens.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|action_sep|>": 151670,
5
+ "<|arg_sep|>": 151671,
6
+ "<|box_end|>": 151649,
7
+ "<|box_start|>": 151648,
8
+ "<|call_sep|>": 151666,
9
+ "<|end_of_text|>": 151673,
10
+ "<|endoftext|>": 151643,
11
+ "<|exception_sep|>": 151669,
12
+ "<|file_sep|>": 151664,
13
+ "<|fim_middle|>": 151660,
14
+ "<|fim_pad|>": 151662,
15
+ "<|fim_prefix|>": 151659,
16
+ "<|fim_suffix|>": 151661,
17
+ "<|frame_sep|>": 151672,
18
+ "<|im_end|>": 151645,
19
+ "<|im_start|>": 151644,
20
+ "<|image_pad|>": 151655,
21
+ "<|latent_end|>": 151675,
22
+ "<|latent_start|>": 151674,
23
+ "<|line_sep|>": 151667,
24
+ "<|object_ref_end|>": 151647,
25
+ "<|object_ref_start|>": 151646,
26
+ "<|quad_end|>": 151651,
27
+ "<|quad_start|>": 151650,
28
+ "<|repo_name|>": 151663,
29
+ "<|return_sep|>": 151668,
30
+ "<|trace_context_start|>": 151665,
31
+ "<|video_pad|>": 151656,
32
+ "<|vision_end|>": 151653,
33
+ "<|vision_pad|>": 151654,
34
+ "<|vision_start|>": 151652
35
+ }
checkpoints/codi-single-1.5b/checkpoint-2500/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/codi-single-1.5b/checkpoint-2500/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151643,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention"
41
+ ],
42
+ "max_position_embeddings": 32768,
43
+ "max_window_layers": 28,
44
+ "model_type": "qwen2",
45
+ "num_attention_heads": 12,
46
+ "num_hidden_layers": 28,
47
+ "num_key_value_heads": 2,
48
+ "pad_token_id": 151643,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": true,
54
+ "transformers_version": "4.57.6",
55
+ "use_cache": true,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 151676
58
+ }
checkpoints/codi-single-1.5b/checkpoint-2500/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-2500/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b04b58be8be7f629acef51a976fcf4293f37704f8a02a393edc02aca6c5bb032
3
+ size 3096212347
checkpoints/codi-single-1.5b/checkpoint-2500/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|endoftext|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
checkpoints/codi-single-1.5b/checkpoint-2500/thought_projector.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d878a33cf68c1ed7a51c914d09dfa09be143fdd3f53339198f4973c26f3280c0
3
+ size 9445953
checkpoints/codi-single-1.5b/checkpoint-2500/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83a790d654474f5dfe225f889afd0210313eb1083f942671f2c4b8e95a1c922b
3
+ size 11424004
checkpoints/codi-single-1.5b/checkpoint-2500/tokenizer_config.json ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ },
181
+ "151665": {
182
+ "content": "<|trace_context_start|>",
183
+ "lstrip": false,
184
+ "normalized": false,
185
+ "rstrip": false,
186
+ "single_word": false,
187
+ "special": true
188
+ },
189
+ "151666": {
190
+ "content": "<|call_sep|>",
191
+ "lstrip": false,
192
+ "normalized": false,
193
+ "rstrip": false,
194
+ "single_word": false,
195
+ "special": true
196
+ },
197
+ "151667": {
198
+ "content": "<|line_sep|>",
199
+ "lstrip": false,
200
+ "normalized": false,
201
+ "rstrip": false,
202
+ "single_word": false,
203
+ "special": true
204
+ },
205
+ "151668": {
206
+ "content": "<|return_sep|>",
207
+ "lstrip": false,
208
+ "normalized": false,
209
+ "rstrip": false,
210
+ "single_word": false,
211
+ "special": true
212
+ },
213
+ "151669": {
214
+ "content": "<|exception_sep|>",
215
+ "lstrip": false,
216
+ "normalized": false,
217
+ "rstrip": false,
218
+ "single_word": false,
219
+ "special": true
220
+ },
221
+ "151670": {
222
+ "content": "<|action_sep|>",
223
+ "lstrip": false,
224
+ "normalized": false,
225
+ "rstrip": false,
226
+ "single_word": false,
227
+ "special": true
228
+ },
229
+ "151671": {
230
+ "content": "<|arg_sep|>",
231
+ "lstrip": false,
232
+ "normalized": false,
233
+ "rstrip": false,
234
+ "single_word": false,
235
+ "special": true
236
+ },
237
+ "151672": {
238
+ "content": "<|frame_sep|>",
239
+ "lstrip": false,
240
+ "normalized": false,
241
+ "rstrip": false,
242
+ "single_word": false,
243
+ "special": true
244
+ },
245
+ "151673": {
246
+ "content": "<|end_of_text|>",
247
+ "lstrip": false,
248
+ "normalized": false,
249
+ "rstrip": false,
250
+ "single_word": false,
251
+ "special": true
252
+ },
253
+ "151674": {
254
+ "content": "<|latent_start|>",
255
+ "lstrip": false,
256
+ "normalized": false,
257
+ "rstrip": false,
258
+ "single_word": false,
259
+ "special": true
260
+ },
261
+ "151675": {
262
+ "content": "<|latent_end|>",
263
+ "lstrip": false,
264
+ "normalized": false,
265
+ "rstrip": false,
266
+ "single_word": false,
267
+ "special": true
268
+ }
269
+ },
270
+ "additional_special_tokens": [
271
+ "<|im_start|>",
272
+ "<|im_end|>",
273
+ "<|object_ref_start|>",
274
+ "<|object_ref_end|>",
275
+ "<|box_start|>",
276
+ "<|box_end|>",
277
+ "<|quad_start|>",
278
+ "<|quad_end|>",
279
+ "<|vision_start|>",
280
+ "<|vision_end|>",
281
+ "<|vision_pad|>",
282
+ "<|image_pad|>",
283
+ "<|video_pad|>"
284
+ ],
285
+ "bos_token": null,
286
+ "clean_up_tokenization_spaces": false,
287
+ "eos_token": "<|endoftext|>",
288
+ "errors": "replace",
289
+ "extra_special_tokens": {},
290
+ "model_max_length": 32768,
291
+ "pad_token": "<|endoftext|>",
292
+ "split_special_tokens": false,
293
+ "tokenizer_class": "Qwen2Tokenizer",
294
+ "unk_token": null
295
+ }
checkpoints/codi-single-1.5b/checkpoint-2500/trainer_state.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-2500/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-3000/added_tokens.json ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|action_sep|>": 151670,
5
+ "<|arg_sep|>": 151671,
6
+ "<|box_end|>": 151649,
7
+ "<|box_start|>": 151648,
8
+ "<|call_sep|>": 151666,
9
+ "<|end_of_text|>": 151673,
10
+ "<|endoftext|>": 151643,
11
+ "<|exception_sep|>": 151669,
12
+ "<|file_sep|>": 151664,
13
+ "<|fim_middle|>": 151660,
14
+ "<|fim_pad|>": 151662,
15
+ "<|fim_prefix|>": 151659,
16
+ "<|fim_suffix|>": 151661,
17
+ "<|frame_sep|>": 151672,
18
+ "<|im_end|>": 151645,
19
+ "<|im_start|>": 151644,
20
+ "<|image_pad|>": 151655,
21
+ "<|latent_end|>": 151675,
22
+ "<|latent_start|>": 151674,
23
+ "<|line_sep|>": 151667,
24
+ "<|object_ref_end|>": 151647,
25
+ "<|object_ref_start|>": 151646,
26
+ "<|quad_end|>": 151651,
27
+ "<|quad_start|>": 151650,
28
+ "<|repo_name|>": 151663,
29
+ "<|return_sep|>": 151668,
30
+ "<|trace_context_start|>": 151665,
31
+ "<|video_pad|>": 151656,
32
+ "<|vision_end|>": 151653,
33
+ "<|vision_pad|>": 151654,
34
+ "<|vision_start|>": 151652
35
+ }
checkpoints/codi-single-1.5b/checkpoint-3000/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
checkpoints/codi-single-1.5b/checkpoint-3000/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151643,
8
+ "hidden_act": "silu",
9
+ "hidden_size": 1536,
10
+ "initializer_range": 0.02,
11
+ "intermediate_size": 8960,
12
+ "layer_types": [
13
+ "full_attention",
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention"
41
+ ],
42
+ "max_position_embeddings": 32768,
43
+ "max_window_layers": 28,
44
+ "model_type": "qwen2",
45
+ "num_attention_heads": 12,
46
+ "num_hidden_layers": 28,
47
+ "num_key_value_heads": 2,
48
+ "pad_token_id": 151643,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": true,
54
+ "transformers_version": "4.57.6",
55
+ "use_cache": true,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 151676
58
+ }
checkpoints/codi-single-1.5b/checkpoint-3000/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
checkpoints/codi-single-1.5b/checkpoint-3000/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:956685564d84f545a68360c7ac9513deaf972690b53a7d604246ccdb77dd06c8
3
+ size 3096212347