Ys404 committed on
Commit
4bb6424
·
verified ·
1 Parent(s): f28773d

Add scripts and checkpoints (CosFly-Track release)

Browse files

Initial upload of evaluation scripts and SFT model checkpoints.
Uploaded via PR mode (uploader has no direct push to main).

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +8 -0
  2. .watchdog.log +317 -0
  3. checkpoints/GLM-4.6V-Flash-SFT/all_results.json +8 -0
  4. checkpoints/GLM-4.6V-Flash-SFT/chat_template.jinja +140 -0
  5. checkpoints/GLM-4.6V-Flash-SFT/config.json +72 -0
  6. checkpoints/GLM-4.6V-Flash-SFT/eval_results_job_glm_glm_46v_flash_20260430_010119.json +56 -0
  7. checkpoints/GLM-4.6V-Flash-SFT/generation_config.json +16 -0
  8. checkpoints/GLM-4.6V-Flash-SFT/model.safetensors +3 -0
  9. checkpoints/GLM-4.6V-Flash-SFT/processor_config.json +63 -0
  10. checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json +3 -0
  11. checkpoints/GLM-4.6V-Flash-SFT/tokenizer_config.json +19 -0
  12. checkpoints/GLM-4.6V-Flash-SFT/train_results.json +8 -0
  13. checkpoints/GLM-4.6V-Flash-SFT/trainer_state.json +2227 -0
  14. checkpoints/GLM-4.6V-Flash-SFT/training_loss.png +0 -0
  15. checkpoints/Gemma-4-E4B-it-SFT/all_results.json +8 -0
  16. checkpoints/Gemma-4-E4B-it-SFT/chat_template.jinja +263 -0
  17. checkpoints/Gemma-4-E4B-it-SFT/config.json +199 -0
  18. checkpoints/Gemma-4-E4B-it-SFT/eval_results_job_gemma_gemma_4_e4b_20260430_011024.json +56 -0
  19. checkpoints/Gemma-4-E4B-it-SFT/generation_config.json +15 -0
  20. checkpoints/Gemma-4-E4B-it-SFT/model.safetensors +3 -0
  21. checkpoints/Gemma-4-E4B-it-SFT/processor_config.json +75 -0
  22. checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json +3 -0
  23. checkpoints/Gemma-4-E4B-it-SFT/tokenizer_config.json +96 -0
  24. checkpoints/Gemma-4-E4B-it-SFT/train_results.json +8 -0
  25. checkpoints/Gemma-4-E4B-it-SFT/trainer_state.json +2227 -0
  26. checkpoints/Gemma-4-E4B-it-SFT/training_loss.png +0 -0
  27. checkpoints/InternVL3.5-8B-SFT/all_results.json +8 -0
  28. checkpoints/InternVL3.5-8B-SFT/chat_template.jinja +6 -0
  29. checkpoints/InternVL3.5-8B-SFT/config.json +121 -0
  30. checkpoints/InternVL3.5-8B-SFT/eval_results_job_internvl35_8b_internvl35_8b_20260430_002347.json +55 -0
  31. checkpoints/InternVL3.5-8B-SFT/generation_config.json +8 -0
  32. checkpoints/InternVL3.5-8B-SFT/model.safetensors +3 -0
  33. checkpoints/InternVL3.5-8B-SFT/processor_config.json +79 -0
  34. checkpoints/InternVL3.5-8B-SFT/tokenizer.json +3 -0
  35. checkpoints/InternVL3.5-8B-SFT/tokenizer_config.json +29 -0
  36. checkpoints/InternVL3.5-8B-SFT/train_results.json +8 -0
  37. checkpoints/InternVL3.5-8B-SFT/trainer_state.json +2227 -0
  38. checkpoints/InternVL3.5-8B-SFT/training_loss.png +0 -0
  39. checkpoints/Qwen3-VL-2B-SFT/all_results.json +8 -0
  40. checkpoints/Qwen3-VL-2B-SFT/chat_template.jinja +120 -0
  41. checkpoints/Qwen3-VL-2B-SFT/config.json +71 -0
  42. checkpoints/Qwen3-VL-2B-SFT/eval_results_job_qwen3vl_2b_qwen3_vl_2b_20260430_002232.json +56 -0
  43. checkpoints/Qwen3-VL-2B-SFT/generation_config.json +14 -0
  44. checkpoints/Qwen3-VL-2B-SFT/model.safetensors +3 -0
  45. checkpoints/Qwen3-VL-2B-SFT/processor_config.json +60 -0
  46. checkpoints/Qwen3-VL-2B-SFT/tokenizer.json +3 -0
  47. checkpoints/Qwen3-VL-2B-SFT/tokenizer_config.json +31 -0
  48. checkpoints/Qwen3-VL-2B-SFT/train_results.json +8 -0
  49. checkpoints/Qwen3-VL-2B-SFT/trainer_state.json +2227 -0
  50. checkpoints/Qwen3-VL-2B-SFT/training_loss.png +0 -0
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
+ checkpoints/InternVL3.5-8B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
39
+ checkpoints/Qwen3-VL-2B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
40
+ checkpoints/Qwen3-VL-8B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
41
+ checkpoints/Qwen3.5-0.8B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
42
+ checkpoints/Qwen3.5-2B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
+ checkpoints/Qwen3.5-9B-SFT/tokenizer.json filter=lfs diff=lfs merge=lfs -text
.watchdog.log ADDED
@@ -0,0 +1,317 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [2026-05-07 01:23:57] Watchdog started, stall threshold = 300 s
2
+ [2026-05-07 01:23:58] No upload python found. Restarting...
3
+ [2026-05-07 01:23:58] Restarting upload (LFS dedup will skip already uploaded chunks)...
4
+ [2026-05-07 01:23:58] Restart issued, WMI ReturnValue=0, launcher PID=20160
5
+ [2026-05-07 01:24:18] Tracking PID 22168, init Read=12.72 GB
6
+ [2026-05-07 03:23:40] No upload python found. Restarting...
7
+ [2026-05-07 03:23:40] Restarting upload (LFS dedup will skip already uploaded chunks)...
8
+ [2026-05-07 03:23:40] Restart issued, WMI ReturnValue=0, launcher PID=27392
9
+ [2026-05-07 03:24:01] Tracking PID 4136, init Read=9.47 GB
10
+ [2026-05-07 03:27:03] No upload python found. Restarting...
11
+ [2026-05-07 03:27:03] Restarting upload (LFS dedup will skip already uploaded chunks)...
12
+ [2026-05-07 03:27:03] Restart issued, WMI ReturnValue=0, launcher PID=20248
13
+ [2026-05-07 03:27:24] Tracking PID 13668, init Read=10.66 GB
14
+ [2026-05-07 03:30:26] No upload python found. Restarting...
15
+ [2026-05-07 03:30:26] Restarting upload (LFS dedup will skip already uploaded chunks)...
16
+ [2026-05-07 03:30:26] Restart issued, WMI ReturnValue=0, launcher PID=14656
17
+ [2026-05-07 03:30:46] Tracking PID 11616, init Read=9.98 GB
18
+ [2026-05-07 03:33:48] No upload python found. Restarting...
19
+ [2026-05-07 03:33:48] Restarting upload (LFS dedup will skip already uploaded chunks)...
20
+ [2026-05-07 03:33:49] Restart issued, WMI ReturnValue=0, launcher PID=26872
21
+ [2026-05-07 03:34:09] Tracking PID 1688, init Read=8.44 GB
22
+ [2026-05-07 03:37:11] No upload python found. Restarting...
23
+ [2026-05-07 03:37:11] Restarting upload (LFS dedup will skip already uploaded chunks)...
24
+ [2026-05-07 03:37:11] Restart issued, WMI ReturnValue=0, launcher PID=25172
25
+ [2026-05-07 03:37:31] Tracking PID 20240, init Read=10.62 GB
26
+ [2026-05-07 03:40:34] No upload python found. Restarting...
27
+ [2026-05-07 03:40:34] Restarting upload (LFS dedup will skip already uploaded chunks)...
28
+ [2026-05-07 03:40:34] Restart issued, WMI ReturnValue=0, launcher PID=15440
29
+ [2026-05-07 03:40:54] Tracking PID 7668, init Read=9.31 GB
30
+ [2026-05-07 03:43:56] No upload python found. Restarting...
31
+ [2026-05-07 03:43:56] Restarting upload (LFS dedup will skip already uploaded chunks)...
32
+ [2026-05-07 03:43:56] Restart issued, WMI ReturnValue=0, launcher PID=12332
33
+ [2026-05-07 03:44:17] Tracking PID 16364, init Read=9.01 GB
34
+ [2026-05-07 03:47:19] No upload python found. Restarting...
35
+ [2026-05-07 03:47:19] Restarting upload (LFS dedup will skip already uploaded chunks)...
36
+ [2026-05-07 03:47:19] Restart issued, WMI ReturnValue=0, launcher PID=25116
37
+ [2026-05-07 03:47:39] Tracking PID 21724, init Read=8.21 GB
38
+ [2026-05-07 03:50:41] No upload python found. Restarting...
39
+ [2026-05-07 03:50:41] Restarting upload (LFS dedup will skip already uploaded chunks)...
40
+ [2026-05-07 03:50:42] Restart issued, WMI ReturnValue=0, launcher PID=29036
41
+ [2026-05-07 03:51:02] Tracking PID 28372, init Read=8.72 GB
42
+ [2026-05-07 03:54:04] No upload python found. Restarting...
43
+ [2026-05-07 03:54:04] Restarting upload (LFS dedup will skip already uploaded chunks)...
44
+ [2026-05-07 03:54:04] Restart issued, WMI ReturnValue=0, launcher PID=29684
45
+ [2026-05-07 03:54:24] Tracking PID 20664, init Read=8.09 GB
46
+ [2026-05-07 03:57:26] No upload python found. Restarting...
47
+ [2026-05-07 03:57:26] Restarting upload (LFS dedup will skip already uploaded chunks)...
48
+ [2026-05-07 03:57:26] Restart issued, WMI ReturnValue=0, launcher PID=25116
49
+ [2026-05-07 03:57:47] Tracking PID 15052, init Read=8.79 GB
50
+ [2026-05-07 04:00:49] No upload python found. Restarting...
51
+ [2026-05-07 04:00:49] Restarting upload (LFS dedup will skip already uploaded chunks)...
52
+ [2026-05-07 04:00:49] Restart issued, WMI ReturnValue=0, launcher PID=2668
53
+ [2026-05-07 04:01:09] Tracking PID 8028, init Read=9.38 GB
54
+ [2026-05-07 04:04:11] No upload python found. Restarting...
55
+ [2026-05-07 04:04:11] Restarting upload (LFS dedup will skip already uploaded chunks)...
56
+ [2026-05-07 04:04:11] Restart issued, WMI ReturnValue=0, launcher PID=4128
57
+ [2026-05-07 04:04:31] Tracking PID 27280, init Read=9.46 GB
58
+ [2026-05-07 04:08:04] No upload python found. Restarting...
59
+ [2026-05-07 04:08:04] Restarting upload (LFS dedup will skip already uploaded chunks)...
60
+ [2026-05-07 04:08:04] Restart issued, WMI ReturnValue=0, launcher PID=27408
61
+ [2026-05-07 04:08:24] Tracking PID 29060, init Read=10.32 GB
62
+ [2026-05-07 04:11:27] No upload python found. Restarting...
63
+ [2026-05-07 04:11:27] Restarting upload (LFS dedup will skip already uploaded chunks)...
64
+ [2026-05-07 04:11:27] Restart issued, WMI ReturnValue=0, launcher PID=18400
65
+ [2026-05-07 04:11:47] Tracking PID 28568, init Read=9.09 GB
66
+ [2026-05-07 04:14:49] No upload python found. Restarting...
67
+ [2026-05-07 04:14:49] Restarting upload (LFS dedup will skip already uploaded chunks)...
68
+ [2026-05-07 04:14:49] Restart issued, WMI ReturnValue=0, launcher PID=25660
69
+ [2026-05-07 04:15:10] Tracking PID 7216, init Read=9.88 GB
70
+ [2026-05-07 04:18:12] No upload python found. Restarting...
71
+ [2026-05-07 04:18:12] Restarting upload (LFS dedup will skip already uploaded chunks)...
72
+ [2026-05-07 04:18:12] Restart issued, WMI ReturnValue=0, launcher PID=27632
73
+ [2026-05-07 04:18:32] Tracking PID 26584, init Read=8.21 GB
74
+ [2026-05-07 04:21:34] No upload python found. Restarting...
75
+ [2026-05-07 04:21:34] Restarting upload (LFS dedup will skip already uploaded chunks)...
76
+ [2026-05-07 04:21:34] Restart issued, WMI ReturnValue=0, launcher PID=29684
77
+ [2026-05-07 04:21:54] Tracking PID 1452, init Read=8.94 GB
78
+ [2026-05-07 04:24:57] No upload python found. Restarting...
79
+ [2026-05-07 04:24:57] Restarting upload (LFS dedup will skip already uploaded chunks)...
80
+ [2026-05-07 04:24:57] Restart issued, WMI ReturnValue=0, launcher PID=23396
81
+ [2026-05-07 04:25:17] Tracking PID 2080, init Read=9.61 GB
82
+ [2026-05-07 04:28:19] No upload python found. Restarting...
83
+ [2026-05-07 04:28:19] Restarting upload (LFS dedup will skip already uploaded chunks)...
84
+ [2026-05-07 04:28:19] Restart issued, WMI ReturnValue=0, launcher PID=29288
85
+ [2026-05-07 04:28:40] Tracking PID 12628, init Read=9.36 GB
86
+ [2026-05-07 04:31:42] No upload python found. Restarting...
87
+ [2026-05-07 04:31:42] Restarting upload (LFS dedup will skip already uploaded chunks)...
88
+ [2026-05-07 04:31:42] Restart issued, WMI ReturnValue=0, launcher PID=29080
89
+ [2026-05-07 04:32:02] Tracking PID 20776, init Read=8.87 GB
90
+ [2026-05-07 04:35:04] No upload python found. Restarting...
91
+ [2026-05-07 04:35:04] Restarting upload (LFS dedup will skip already uploaded chunks)...
92
+ [2026-05-07 04:35:04] Restart issued, WMI ReturnValue=0, launcher PID=25012
93
+ [2026-05-07 04:35:24] Tracking PID 23744, init Read=9.38 GB
94
+ [2026-05-07 04:38:27] No upload python found. Restarting...
95
+ [2026-05-07 04:38:27] Restarting upload (LFS dedup will skip already uploaded chunks)...
96
+ [2026-05-07 04:38:27] Restart issued, WMI ReturnValue=0, launcher PID=8960
97
+ [2026-05-07 04:38:47] Tracking PID 28516, init Read=8.45 GB
98
+ [2026-05-07 04:42:19] No upload python found. Restarting...
99
+ [2026-05-07 04:42:19] Restarting upload (LFS dedup will skip already uploaded chunks)...
100
+ [2026-05-07 04:42:20] Restart issued, WMI ReturnValue=0, launcher PID=24896
101
+ [2026-05-07 04:42:40] Tracking PID 20416, init Read=7.3 GB
102
+ [2026-05-07 04:45:42] No upload python found. Restarting...
103
+ [2026-05-07 04:45:42] Restarting upload (LFS dedup will skip already uploaded chunks)...
104
+ [2026-05-07 04:45:42] Restart issued, WMI ReturnValue=0, launcher PID=16408
105
+ [2026-05-07 04:46:02] Tracking PID 28992, init Read=9.8 GB
106
+ [2026-05-07 04:49:05] No upload python found. Restarting...
107
+ [2026-05-07 04:49:05] Restarting upload (LFS dedup will skip already uploaded chunks)...
108
+ [2026-05-07 04:49:05] Restart issued, WMI ReturnValue=0, launcher PID=27912
109
+ [2026-05-07 04:49:25] Tracking PID 960, init Read=9.27 GB
110
+ [2026-05-07 04:52:27] No upload python found. Restarting...
111
+ [2026-05-07 04:52:27] Restarting upload (LFS dedup will skip already uploaded chunks)...
112
+ [2026-05-07 04:52:27] Restart issued, WMI ReturnValue=0, launcher PID=15432
113
+ [2026-05-07 04:52:47] Tracking PID 24880, init Read=9.64 GB
114
+ [2026-05-07 04:56:20] No upload python found. Restarting...
115
+ [2026-05-07 04:56:20] Restarting upload (LFS dedup will skip already uploaded chunks)...
116
+ [2026-05-07 04:56:20] Restart issued, WMI ReturnValue=0, launcher PID=3744
117
+ [2026-05-07 04:56:41] Tracking PID 25356, init Read=8.27 GB
118
+ [2026-05-07 04:59:43] No upload python found. Restarting...
119
+ [2026-05-07 04:59:43] Restarting upload (LFS dedup will skip already uploaded chunks)...
120
+ [2026-05-07 04:59:43] Restart issued, WMI ReturnValue=0, launcher PID=27888
121
+ [2026-05-07 05:00:04] Tracking PID 27952, init Read=10.45 GB
122
+ [2026-05-07 05:03:06] No upload python found. Restarting...
123
+ [2026-05-07 05:03:06] Restarting upload (LFS dedup will skip already uploaded chunks)...
124
+ [2026-05-07 05:03:06] Restart issued, WMI ReturnValue=0, launcher PID=20772
125
+ [2026-05-07 05:03:26] Tracking PID 1456, init Read=10.27 GB
126
+ [2026-05-07 05:06:29] No upload python found. Restarting...
127
+ [2026-05-07 05:06:29] Restarting upload (LFS dedup will skip already uploaded chunks)...
128
+ [2026-05-07 05:06:29] Restart issued, WMI ReturnValue=0, launcher PID=13848
129
+ [2026-05-07 05:06:49] Tracking PID 28648, init Read=10.36 GB
130
+ [2026-05-07 05:09:51] No upload python found. Restarting...
131
+ [2026-05-07 05:09:51] Restarting upload (LFS dedup will skip already uploaded chunks)...
132
+ [2026-05-07 05:09:52] Restart issued, WMI ReturnValue=0, launcher PID=6508
133
+ [2026-05-07 05:10:12] Tracking PID 29120, init Read=9.57 GB
134
+ [2026-05-07 05:13:14] No upload python found. Restarting...
135
+ [2026-05-07 05:13:14] Restarting upload (LFS dedup will skip already uploaded chunks)...
136
+ [2026-05-07 05:13:14] Restart issued, WMI ReturnValue=0, launcher PID=29080
137
+ [2026-05-07 05:13:34] Tracking PID 29408, init Read=7.42 GB
138
+ [2026-05-07 05:17:07] No upload python found. Restarting...
139
+ [2026-05-07 05:17:07] Restarting upload (LFS dedup will skip already uploaded chunks)...
140
+ [2026-05-07 05:17:07] Restart issued, WMI ReturnValue=0, launcher PID=5536
141
+ [2026-05-07 05:17:27] Tracking PID 24176, init Read=9.01 GB
142
+ [2026-05-07 05:20:29] No upload python found. Restarting...
143
+ [2026-05-07 05:20:29] Restarting upload (LFS dedup will skip already uploaded chunks)...
144
+ [2026-05-07 05:20:29] Restart issued, WMI ReturnValue=0, launcher PID=27784
145
+ [2026-05-07 05:20:50] Tracking PID 27904, init Read=10.14 GB
146
+ [2026-05-07 05:23:52] No upload python found. Restarting...
147
+ [2026-05-07 05:23:52] Restarting upload (LFS dedup will skip already uploaded chunks)...
148
+ [2026-05-07 05:23:52] Restart issued, WMI ReturnValue=0, launcher PID=3892
149
+ [2026-05-07 05:24:12] Tracking PID 23124, init Read=8.03 GB
150
+ [2026-05-07 05:27:14] No upload python found. Restarting...
151
+ [2026-05-07 05:27:14] Restarting upload (LFS dedup will skip already uploaded chunks)...
152
+ [2026-05-07 05:27:14] Restart issued, WMI ReturnValue=0, launcher PID=924
153
+ [2026-05-07 05:27:35] Tracking PID 6124, init Read=8.05 GB
154
+ [2026-05-07 05:30:37] No upload python found. Restarting...
155
+ [2026-05-07 05:30:37] Restarting upload (LFS dedup will skip already uploaded chunks)...
156
+ [2026-05-07 05:30:37] Restart issued, WMI ReturnValue=0, launcher PID=28232
157
+ [2026-05-07 05:30:57] Tracking PID 1836, init Read=9.41 GB
158
+ [2026-05-07 05:33:59] No upload python found. Restarting...
159
+ [2026-05-07 05:33:59] Restarting upload (LFS dedup will skip already uploaded chunks)...
160
+ [2026-05-07 05:33:59] Restart issued, WMI ReturnValue=0, launcher PID=29568
161
+ [2026-05-07 05:34:20] Tracking PID 14728, init Read=8.76 GB
162
+ [2026-05-07 05:37:22] No upload python found. Restarting...
163
+ [2026-05-07 05:37:22] Restarting upload (LFS dedup will skip already uploaded chunks)...
164
+ [2026-05-07 05:37:22] Restart issued, WMI ReturnValue=0, launcher PID=29036
165
+ [2026-05-07 05:37:42] Tracking PID 21932, init Read=9.63 GB
166
+ [2026-05-07 05:40:44] No upload python found. Restarting...
167
+ [2026-05-07 05:40:44] Restarting upload (LFS dedup will skip already uploaded chunks)...
168
+ [2026-05-07 05:40:44] Restart issued, WMI ReturnValue=0, launcher PID=5956
169
+ [2026-05-07 05:41:05] Tracking PID 16784, init Read=10.13 GB
170
+ [2026-05-07 05:44:07] No upload python found. Restarting...
171
+ [2026-05-07 05:44:07] Restarting upload (LFS dedup will skip already uploaded chunks)...
172
+ [2026-05-07 05:44:07] Restart issued, WMI ReturnValue=0, launcher PID=29208
173
+ [2026-05-07 05:44:27] Tracking PID 26468, init Read=9.85 GB
174
+ [2026-05-07 05:47:29] No upload python found. Restarting...
175
+ [2026-05-07 05:47:29] Restarting upload (LFS dedup will skip already uploaded chunks)...
176
+ [2026-05-07 05:47:30] Restart issued, WMI ReturnValue=0, launcher PID=17204
177
+ [2026-05-07 05:47:50] Tracking PID 27924, init Read=9.21 GB
178
+ [2026-05-07 05:50:52] No upload python found. Restarting...
179
+ [2026-05-07 05:50:52] Restarting upload (LFS dedup will skip already uploaded chunks)...
180
+ [2026-05-07 05:50:52] Restart issued, WMI ReturnValue=0, launcher PID=7704
181
+ [2026-05-07 05:51:12] Tracking PID 25912, init Read=10.19 GB
182
+ [2026-05-07 05:54:14] No upload python found. Restarting...
183
+ [2026-05-07 05:54:14] Restarting upload (LFS dedup will skip already uploaded chunks)...
184
+ [2026-05-07 05:54:14] Restart issued, WMI ReturnValue=0, launcher PID=28952
185
+ [2026-05-07 05:54:35] Tracking PID 29272, init Read=8.83 GB
186
+ [2026-05-07 05:57:37] No upload python found. Restarting...
187
+ [2026-05-07 05:57:37] Restarting upload (LFS dedup will skip already uploaded chunks)...
188
+ [2026-05-07 05:57:37] Restart issued, WMI ReturnValue=0, launcher PID=25524
189
+ [2026-05-07 05:57:58] Tracking PID 8760, init Read=7.8 GB
190
+ [2026-05-07 06:01:00] No upload python found. Restarting...
191
+ [2026-05-07 06:01:00] Restarting upload (LFS dedup will skip already uploaded chunks)...
192
+ [2026-05-07 06:01:00] Restart issued, WMI ReturnValue=0, launcher PID=29016
193
+ [2026-05-07 06:01:20] Tracking PID 8040, init Read=9.98 GB
194
+ [2026-05-07 06:04:22] No upload python found. Restarting...
195
+ [2026-05-07 06:04:22] Restarting upload (LFS dedup will skip already uploaded chunks)...
196
+ [2026-05-07 06:04:22] Restart issued, WMI ReturnValue=0, launcher PID=28840
197
+ [2026-05-07 06:04:42] Tracking PID 25172, init Read=8.78 GB
198
+ [2026-05-07 06:07:44] No upload python found. Restarting...
199
+ [2026-05-07 06:07:44] Restarting upload (LFS dedup will skip already uploaded chunks)...
200
+ [2026-05-07 06:07:44] Restart issued, WMI ReturnValue=0, launcher PID=14524
201
+ [2026-05-07 06:08:05] Tracking PID 11872, init Read=8.56 GB
202
+ [2026-05-07 06:11:07] No upload python found. Restarting...
203
+ [2026-05-07 06:11:07] Restarting upload (LFS dedup will skip already uploaded chunks)...
204
+ [2026-05-07 06:11:07] Restart issued, WMI ReturnValue=0, launcher PID=13756
205
+ [2026-05-07 06:11:27] Tracking PID 15716, init Read=10.04 GB
206
+ [2026-05-07 06:15:00] No upload python found. Restarting...
207
+ [2026-05-07 06:15:00] Restarting upload (LFS dedup will skip already uploaded chunks)...
208
+ [2026-05-07 06:15:00] Restart issued, WMI ReturnValue=0, launcher PID=28472
209
+ [2026-05-07 06:15:20] Tracking PID 20180, init Read=7.92 GB
210
+ [2026-05-07 06:18:22] No upload python found. Restarting...
211
+ [2026-05-07 06:18:22] Restarting upload (LFS dedup will skip already uploaded chunks)...
212
+ [2026-05-07 06:18:22] Restart issued, WMI ReturnValue=0, launcher PID=15712
213
+ [2026-05-07 06:18:43] Tracking PID 12508, init Read=9.04 GB
214
+ [2026-05-07 06:21:45] No upload python found. Restarting...
215
+ [2026-05-07 06:21:45] Restarting upload (LFS dedup will skip already uploaded chunks)...
216
+ [2026-05-07 06:21:45] Restart issued, WMI ReturnValue=0, launcher PID=22588
217
+ [2026-05-07 06:22:06] Tracking PID 20564, init Read=8.24 GB
218
+ [2026-05-07 06:25:07] No upload python found. Restarting...
219
+ [2026-05-07 06:25:07] Restarting upload (LFS dedup will skip already uploaded chunks)...
220
+ [2026-05-07 06:25:08] Restart issued, WMI ReturnValue=0, launcher PID=21216
221
+ [2026-05-07 06:25:28] Tracking PID 27056, init Read=8.88 GB
222
+ [2026-05-07 06:28:30] No upload python found. Restarting...
223
+ [2026-05-07 06:28:30] Restarting upload (LFS dedup will skip already uploaded chunks)...
224
+ [2026-05-07 06:28:30] Restart issued, WMI ReturnValue=0, launcher PID=15504
225
+ [2026-05-07 06:28:50] Tracking PID 23240, init Read=8.62 GB
226
+ [2026-05-07 06:31:53] No upload python found. Restarting...
227
+ [2026-05-07 06:31:53] Restarting upload (LFS dedup will skip already uploaded chunks)...
228
+ [2026-05-07 06:31:53] Restart issued, WMI ReturnValue=0, launcher PID=12632
229
+ [2026-05-07 06:32:13] Tracking PID 29112, init Read=7.91 GB
230
+ [2026-05-07 06:35:46] No upload python found. Restarting...
231
+ [2026-05-07 06:35:46] Restarting upload (LFS dedup will skip already uploaded chunks)...
232
+ [2026-05-07 06:35:46] Restart issued, WMI ReturnValue=0, launcher PID=6688
233
+ [2026-05-07 06:36:06] Tracking PID 1880, init Read=9.38 GB
234
+ [2026-05-07 06:39:08] No upload python found. Restarting...
235
+ [2026-05-07 06:39:08] Restarting upload (LFS dedup will skip already uploaded chunks)...
236
+ [2026-05-07 06:39:08] Restart issued, WMI ReturnValue=0, launcher PID=28860
237
+ [2026-05-07 06:39:28] Tracking PID 20996, init Read=10.1 GB
238
+ [2026-05-07 06:42:31] No upload python found. Restarting...
239
+ [2026-05-07 06:42:31] Restarting upload (LFS dedup will skip already uploaded chunks)...
240
+ [2026-05-07 06:42:31] Restart issued, WMI ReturnValue=0, launcher PID=12436
241
+ [2026-05-07 06:42:51] Tracking PID 23428, init Read=8.12 GB
242
+ [2026-05-07 06:45:53] No upload python found. Restarting...
243
+ [2026-05-07 06:45:53] Restarting upload (LFS dedup will skip already uploaded chunks)...
244
+ [2026-05-07 06:45:53] Restart issued, WMI ReturnValue=0, launcher PID=15440
245
+ [2026-05-07 06:46:14] Tracking PID 26756, init Read=9.91 GB
246
+ [2026-05-07 06:49:16] No upload python found. Restarting...
247
+ [2026-05-07 06:49:16] Restarting upload (LFS dedup will skip already uploaded chunks)...
248
+ [2026-05-07 06:49:16] Restart issued, WMI ReturnValue=0, launcher PID=28312
249
+ [2026-05-07 06:49:36] Tracking PID 13260, init Read=8.84 GB
250
+ [2026-05-07 06:53:09] No upload python found. Restarting...
251
+ [2026-05-07 06:53:09] Restarting upload (LFS dedup will skip already uploaded chunks)...
252
+ [2026-05-07 06:53:09] Restart issued, WMI ReturnValue=0, launcher PID=13476
253
+ [2026-05-07 06:53:29] Tracking PID 18072, init Read=8.38 GB
254
+ [2026-05-07 06:56:32] No upload python found. Restarting...
255
+ [2026-05-07 06:56:32] Restarting upload (LFS dedup will skip already uploaded chunks)...
256
+ [2026-05-07 06:56:32] Restart issued, WMI ReturnValue=0, launcher PID=17460
257
+ [2026-05-07 06:56:52] Tracking PID 29056, init Read=8.07 GB
258
+ [2026-05-07 07:00:25] No upload python found. Restarting...
259
+ [2026-05-07 07:00:25] Restarting upload (LFS dedup will skip already uploaded chunks)...
260
+ [2026-05-07 07:00:25] Restart issued, WMI ReturnValue=0, launcher PID=29528
261
+ [2026-05-07 07:00:45] Tracking PID 21456, init Read=9.85 GB
262
+ [2026-05-07 07:03:47] No upload python found. Restarting...
263
+ [2026-05-07 07:03:47] Restarting upload (LFS dedup will skip already uploaded chunks)...
264
+ [2026-05-07 07:03:47] Restart issued, WMI ReturnValue=0, launcher PID=24264
265
+ [2026-05-07 07:04:08] Tracking PID 3440, init Read=9.81 GB
266
+ [2026-05-07 07:07:10] No upload python found. Restarting...
267
+ [2026-05-07 07:07:10] Restarting upload (LFS dedup will skip already uploaded chunks)...
268
+ [2026-05-07 07:07:10] Restart issued, WMI ReturnValue=0, launcher PID=19964
269
+ [2026-05-07 07:07:30] Tracking PID 29076, init Read=10.53 GB
270
+ [2026-05-07 07:11:03] No upload python found. Restarting...
271
+ [2026-05-07 07:11:03] Restarting upload (LFS dedup will skip already uploaded chunks)...
272
+ [2026-05-07 07:11:03] Restart issued, WMI ReturnValue=0, launcher PID=5848
273
+ [2026-05-07 07:11:23] Tracking PID 20508, init Read=9.59 GB
274
+ [2026-05-07 07:14:26] No upload python found. Restarting...
275
+ [2026-05-07 07:14:26] Restarting upload (LFS dedup will skip already uploaded chunks)...
276
+ [2026-05-07 07:14:26] Restart issued, WMI ReturnValue=0, launcher PID=23236
277
+ [2026-05-07 07:14:46] Tracking PID 2916, init Read=8.73 GB
278
+ [2026-05-07 07:17:48] No upload python found. Restarting...
279
+ [2026-05-07 07:17:48] Restarting upload (LFS dedup will skip already uploaded chunks)...
280
+ [2026-05-07 07:17:48] Restart issued, WMI ReturnValue=0, launcher PID=29588
281
+ [2026-05-07 07:18:09] Tracking PID 26428, init Read=9.4 GB
282
+ [2026-05-07 07:21:11] No upload python found. Restarting...
283
+ [2026-05-07 07:21:11] Restarting upload (LFS dedup will skip already uploaded chunks)...
284
+ [2026-05-07 07:21:11] Restart issued, WMI ReturnValue=0, launcher PID=8984
285
+ [2026-05-07 07:21:31] Tracking PID 6716, init Read=10.62 GB
286
+ [2026-05-07 07:24:33] No upload python found. Restarting...
287
+ [2026-05-07 07:24:33] Restarting upload (LFS dedup will skip already uploaded chunks)...
288
+ [2026-05-07 07:24:34] Restart issued, WMI ReturnValue=0, launcher PID=23504
289
+ [2026-05-07 07:24:54] Tracking PID 6104, init Read=9.53 GB
290
+ [2026-05-07 07:27:56] No upload python found. Restarting...
291
+ [2026-05-07 07:27:56] Restarting upload (LFS dedup will skip already uploaded chunks)...
292
+ [2026-05-07 07:27:56] Restart issued, WMI ReturnValue=0, launcher PID=3568
293
+ [2026-05-07 07:28:16] Tracking PID 5140, init Read=7.9 GB
294
+ [2026-05-07 07:31:49] No upload python found. Restarting...
295
+ [2026-05-07 07:31:49] Restarting upload (LFS dedup will skip already uploaded chunks)...
296
+ [2026-05-07 07:31:49] Restart issued, WMI ReturnValue=0, launcher PID=18812
297
+ [2026-05-07 07:32:09] Tracking PID 13604, init Read=8.94 GB
298
+ [2026-05-07 07:35:11] No upload python found. Restarting...
299
+ [2026-05-07 07:35:11] Restarting upload (LFS dedup will skip already uploaded chunks)...
300
+ [2026-05-07 07:35:12] Restart issued, WMI ReturnValue=0, launcher PID=27192
301
+ [2026-05-07 07:35:32] Tracking PID 17928, init Read=8.93 GB
302
+ [2026-05-07 07:38:34] No upload python found. Restarting...
303
+ [2026-05-07 07:38:34] Restarting upload (LFS dedup will skip already uploaded chunks)...
304
+ [2026-05-07 07:38:34] Restart issued, WMI ReturnValue=0, launcher PID=17280
305
+ [2026-05-07 07:38:54] Tracking PID 25484, init Read=9.93 GB
306
+ [2026-05-07 07:42:27] No upload python found. Restarting...
307
+ [2026-05-07 07:42:27] Restarting upload (LFS dedup will skip already uploaded chunks)...
308
+ [2026-05-07 07:42:27] Restart issued, WMI ReturnValue=0, launcher PID=25116
309
+ [2026-05-07 07:42:48] Tracking PID 29200, init Read=9.92 GB
310
+ [2026-05-07 07:45:50] No upload python found. Restarting...
311
+ [2026-05-07 07:45:50] Restarting upload (LFS dedup will skip already uploaded chunks)...
312
+ [2026-05-07 07:45:50] Restart issued, WMI ReturnValue=0, launcher PID=22852
313
+ [2026-05-07 07:46:10] Tracking PID 24524, init Read=9.5 GB
314
+ [2026-05-07 07:49:12] No upload python found. Restarting...
315
+ [2026-05-07 07:49:12] Restarting upload (LFS dedup will skip already uploaded chunks)...
316
+ [2026-05-07 07:49:13] Restart issued, WMI ReturnValue=0, launcher PID=21544
317
+ [2026-05-07 07:49:33] Tracking PID 22200, init Read=8.94 GB
checkpoints/GLM-4.6V-Flash-SFT/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 2477163648385024.0,
4
+ "train_loss": 0.20598802658081056,
5
+ "train_runtime": 35266.4791,
6
+ "train_samples_per_second": 5.671,
7
+ "train_steps_per_second": 0.089
8
+ }
checkpoints/GLM-4.6V-Flash-SFT/chat_template.jinja ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [gMASK]<sop>
2
+ {%- if tools -%}
3
+ <|system|>
4
+ # Tools
5
+
6
+ You may call one or more functions to assist with the user query.
7
+
8
+ You are provided with function signatures within <tools></tools> XML tags:
9
+ <tools>
10
+ {% for tool in tools %}
11
+ {{ tool | tojson(ensure_ascii=False) }}
12
+ {% endfor %}
13
+ </tools>
14
+
15
+ For each function call, output the function name and arguments within the following XML format:
16
+ <tool_call>{function-name}
17
+ <arg_key>{arg-key-1}</arg_key>
18
+ <arg_value>{arg-value-1}</arg_value>
19
+ <arg_key>{arg-key-2}</arg_key>
20
+ <arg_value>{arg-value-2}</arg_value>
21
+ ...
22
+ </tool_call>{%- endif -%}
23
+ {%- macro visible_text(content) -%}
24
+ {%- if content is string -%}
25
+ {{- content }}
26
+ {%- elif content is iterable and content is not mapping -%}
27
+ {%- for item in content -%}
28
+ {%- if item is mapping and item.type == 'text' -%}
29
+ {{- item.text }}
30
+ {%- elif item is mapping and (item.type == 'image' or 'image' in item) -%}
31
+ <|begin_of_image|><|image|><|end_of_image|>
32
+ {%- elif item is mapping and (item.type == 'video' or 'video' in item) -%}
33
+ <|begin_of_video|><|video|><|end_of_video|>
34
+ {%- elif item is string -%}
35
+ {{- item }}
36
+ {%- endif -%}
37
+ {%- endfor -%}
38
+ {%- else -%}
39
+ {{- content }}
40
+ {%- endif -%}
41
+ {%- endmacro -%}
42
+ {%- set ns = namespace(last_user_index=-1) %}
43
+ {%- for m in messages %}
44
+ {%- if m.role == 'user' %}
45
+ {% set ns.last_user_index = loop.index0 -%}
46
+ {%- endif %}
47
+ {%- endfor %}
48
+ {% for m in messages %}
49
+ {%- if m.role == 'user' -%}<|user|>
50
+ {% if m.content is string %}
51
+ {{ m.content }}
52
+ {%- else %}
53
+ {%- for item in m.content %}
54
+ {% if item.type == 'video' or 'video' in item %}
55
+ <|begin_of_video|><|video|><|end_of_video|>{% elif item.type == 'image' or 'image' in item %}
56
+ <|begin_of_image|><|image|><|end_of_image|>{% elif item.type == 'text' %}
57
+ {{ item.text }}
58
+ {%- endif %}
59
+ {%- endfor %}
60
+ {%- endif %}
61
+ {{- '/nothink' if (enable_thinking is defined and not enable_thinking and not visible_text(m.content).endswith("/nothink")) else '' -}}
62
+ {%- elif m.role == 'assistant' -%}
63
+ <|assistant|>
64
+ {%- set reasoning_content = '' %}
65
+ {%- set content = visible_text(m.content) %}
66
+ {%- if m.reasoning_content is string %}
67
+ {%- set reasoning_content = m.reasoning_content %}
68
+ {%- else %}
69
+ {%- if '</think>' in content %}
70
+ {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
71
+ {%- set content = content.split('</think>')[-1].lstrip('\n') %}
72
+ {%- endif %}
73
+ {%- endif %}
74
+ {%- if loop.index0 > ns.last_user_index and reasoning_content -%}
75
+ {{ '\n<think>' + reasoning_content.strip() + '</think>'}}
76
+ {%- else -%}
77
+ {{ '\n<think></think>' }}
78
+ {%- endif -%}
79
+ {%- if content.strip() -%}
80
+ {{ '\n' + content.strip() }}
81
+ {%- endif -%}
82
+ {% if m.tool_calls %}
83
+ {% for tc in m.tool_calls %}
84
+ {%- if tc.function %}
85
+ {%- set tc = tc.function %}
86
+ {%- endif %}
87
+ {{ '\n<tool_call>' + tc.name }}
88
+ {% set _args = tc.arguments %}
89
+ {% for k, v in _args.items() %}
90
+ <arg_key>{{ k }}</arg_key>
91
+ <arg_value>{{ v | tojson(ensure_ascii=False) if v is not string else v }}</arg_value>
92
+ {% endfor %}
93
+ </tool_call>{% endfor %}
94
+ {% endif %}
95
+ {%- elif m.role == 'tool' -%}
96
+ {%- if m.content is string -%}
97
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
98
+ {{- '<|observation|>' }}
99
+ {%- endif %}
100
+ {{- '\n<tool_response>\n' }}
101
+ {{- m.content }}
102
+ {{- '\n</tool_response>' }}
103
+ {% elif m.content is iterable and m.content is not mapping %}
104
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
105
+ {{- '<|observation|>' }}
106
+ {%- endif %}
107
+ {{- '\n<tool_response>\n' }}
108
+ {%- for tr in m.content -%}
109
+ {%- if tr is mapping and tr.type is defined -%}
110
+ {%- set t = tr.type | lower -%}
111
+ {%- if t == 'text' and tr.text is defined -%}
112
+ {{ tr.text }}
113
+ {%- elif t in ['image', 'image_url'] -%}
114
+ <|begin_of_image|><|image|><|end_of_image|>
115
+ {%- elif t in ['video', 'video_url'] -%}
116
+ <|begin_of_video|><|video|><|end_of_video|>
117
+ {%- else -%}
118
+ {{ tr | tojson(ensure_ascii=False) }}
119
+ {%- endif -%}
120
+ {%- else -%}
121
+ {{ tr.output if tr.output is defined else tr }}
122
+ {%- endif -%}
123
+ {%- endfor -%}
124
+ {{- '\n</tool_response>' }}
125
+ {%- else -%}
126
+ <|observation|>{% for tr in m.content %}
127
+
128
+ <tool_response>
129
+ {{ tr.output if tr.output is defined else tr }}
130
+ </tool_response>{% endfor -%}
131
+ {% endif -%}
132
+ {%- elif m.role == 'system' -%}
133
+ <|system|>
134
+ {{ visible_text(m.content) }}
135
+ {%- endif -%}
136
+ {%- endfor -%}
137
+ {%- if add_generation_prompt -%}
138
+ <|assistant|>
139
+ {{'<think></think>\n' if (enable_thinking is defined and not enable_thinking) else ''}}
140
+ {%- endif -%}
checkpoints/GLM-4.6V-Flash-SFT/config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Glm4vForConditionalGeneration"
4
+ ],
5
+ "dtype": "bfloat16",
6
+ "eos_token_id": 151329,
7
+ "hidden_size": 4096,
8
+ "image_end_token_id": 151340,
9
+ "image_start_token_id": 151339,
10
+ "image_token_id": 151363,
11
+ "model_type": "glm4v",
12
+ "pad_token_id": 151329,
13
+ "text_config": {
14
+ "attention_bias": true,
15
+ "attention_dropout": 0.0,
16
+ "dtype": "bfloat16",
17
+ "eos_token_id": [
18
+ 151329,
19
+ 151336,
20
+ 151338
21
+ ],
22
+ "hidden_act": "silu",
23
+ "hidden_size": 4096,
24
+ "initializer_range": 0.02,
25
+ "intermediate_size": 13696,
26
+ "max_position_embeddings": 131072,
27
+ "model_type": "glm4v_text",
28
+ "num_attention_heads": 32,
29
+ "num_hidden_layers": 40,
30
+ "num_key_value_heads": 2,
31
+ "pad_token_id": 151329,
32
+ "rms_norm_eps": 1e-05,
33
+ "rope_parameters": {
34
+ "mrope_section": [
35
+ 8,
36
+ 12,
37
+ 12
38
+ ],
39
+ "partial_rotary_factor": 0.5,
40
+ "rope_theta": 500000,
41
+ "rope_type": "default"
42
+ },
43
+ "use_cache": false,
44
+ "vocab_size": 151552
45
+ },
46
+ "tie_word_embeddings": false,
47
+ "transformers_version": "5.5.3",
48
+ "use_cache": false,
49
+ "video_end_token_id": 151342,
50
+ "video_start_token_id": 151341,
51
+ "video_token_id": 151364,
52
+ "vision_config": {
53
+ "attention_bias": false,
54
+ "attention_dropout": 0.0,
55
+ "depth": 24,
56
+ "dtype": "bfloat16",
57
+ "hidden_act": "silu",
58
+ "hidden_dropout_prob": 0.0,
59
+ "hidden_size": 1536,
60
+ "image_size": 336,
61
+ "in_channels": 3,
62
+ "initializer_range": 0.02,
63
+ "intermediate_size": 13696,
64
+ "model_type": "glm4v_vision",
65
+ "num_heads": 12,
66
+ "out_hidden_size": 4096,
67
+ "patch_size": 14,
68
+ "rms_norm_eps": 1e-05,
69
+ "spatial_merge_size": 2,
70
+ "temporal_patch_size": 2
71
+ }
72
+ }
checkpoints/GLM-4.6V-Flash-SFT/eval_results_job_glm_glm_46v_flash_20260430_010119.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mae_dx": 0.1517896551724138,
3
+ "rmse_dx": 0.5050280292665226,
4
+ "mae_dy": 0.13570689655172413,
5
+ "rmse_dy": 0.40379185488190017,
6
+ "mae_dz": 0.017967241379310345,
7
+ "rmse_dz": 0.15680698656144998,
8
+ "mae_dpitch": 0.24627758620689652,
9
+ "rmse_dpitch": 0.5965444891927231,
10
+ "mae_dyaw": 1.0261448275862068,
11
+ "rmse_dyaw": 2.459724339755617,
12
+ "mae_droll": 0.0,
13
+ "rmse_droll": 0.0,
14
+ "mae_overall": 0.26298103448275856,
15
+ "mae_position": 0.10182126436781609,
16
+ "mae_rotation": 0.42414080459770115,
17
+ "rmse_overall": 1.068394337204253,
18
+ "wp1_euc_mae": 0.0698010264307822,
19
+ "wp1_euc_median": 0.01999999999999999,
20
+ "wp2_euc_mae": 0.1401695004658457,
21
+ "wp2_euc_median": 0.04123105625617661,
22
+ "wp3_euc_mae": 0.22301934350856006,
23
+ "wp3_euc_median": 0.07211102550927984,
24
+ "wp4_euc_mae": 0.32865394783587415,
25
+ "wp4_euc_median": 0.1104536101718727,
26
+ "wp5_euc_mae": 0.44338792793915116,
27
+ "wp5_euc_median": 0.15905694150420963,
28
+ "euclidean_mae": 0.24100634923604267,
29
+ "ADE": 0.24100634923604267,
30
+ "FDE": 0.44338792793915116,
31
+ "ADE_median": 0.08327688731593763,
32
+ "FDE_median": 0.15905694150420963,
33
+ "SR@0.5m": 0.8951724137931034,
34
+ "SR@1.0m": 0.9513793103448276,
35
+ "SR@2.0m": 0.9808620689655172,
36
+ "SR@5.0m": 0.9968965517241379,
37
+ "TrajSR@1.0m": 0.8974137931034483,
38
+ "TrajSR@2.0m": 0.9577586206896552,
39
+ "TrajSR@5.0m": 0.9922413793103448,
40
+ "RotAcc@1.0deg": 0.7027586206896552,
41
+ "RotAcc@5.0deg": 0.9586206896551724,
42
+ "RotAcc@10.0deg": 0.9889655172413793,
43
+ "wp1_rot_mae": 0.5029051706109685,
44
+ "wp2_rot_mae": 0.7513635215329055,
45
+ "wp3_rot_mae": 1.0546360645612183,
46
+ "wp4_rot_mae": 1.4243170022546052,
47
+ "wp5_rot_mae": 1.784744600833039,
48
+ "rotation_euc_mae": 1.1035932719585473,
49
+ "parse_failure_rate": 0.0,
50
+ "parse_success_rate": 1.0,
51
+ "valid_samples": 1160,
52
+ "total_samples": 1160,
53
+ "parse_failures": 0,
54
+ "inference_engine": "vllm",
55
+ "vllm_version": "0.19.0"
56
+ }
checkpoints/GLM-4.6V-Flash-SFT/generation_config.json ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151329,
6
+ 151329,
7
+ 151336,
8
+ 151338,
9
+ 151348
10
+ ],
11
+ "pad_token_id": 151329,
12
+ "temperature": 0.8,
13
+ "top_k": 2,
14
+ "top_p": 0.6,
15
+ "transformers_version": "5.5.3"
16
+ }
checkpoints/GLM-4.6V-Flash-SFT/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c8a32229e6fe30d156e4259207d341d5b0022d08d8df59cd08760bf85cd5d215
3
+ size 20585645128
checkpoints/GLM-4.6V-Flash-SFT/processor_config.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor": {
3
+ "do_convert_rgb": true,
4
+ "do_normalize": true,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.48145466,
9
+ 0.4578275,
10
+ 0.40821073
11
+ ],
12
+ "image_processor_type": "Glm46VImageProcessor",
13
+ "image_std": [
14
+ 0.26862954,
15
+ 0.26130258,
16
+ 0.27577711
17
+ ],
18
+ "merge_size": 2,
19
+ "patch_size": 14,
20
+ "resample": 3,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "longest_edge": 9633792,
24
+ "shortest_edge": 12544
25
+ },
26
+ "temporal_patch_size": 2
27
+ },
28
+ "processor_class": "Glm46VProcessor",
29
+ "video_processor": {
30
+ "do_convert_rgb": true,
31
+ "do_normalize": true,
32
+ "do_rescale": true,
33
+ "do_resize": true,
34
+ "do_sample_frames": true,
35
+ "fps": 2,
36
+ "image_mean": [
37
+ 0.48145466,
38
+ 0.4578275,
39
+ 0.40821073
40
+ ],
41
+ "image_std": [
42
+ 0.26862954,
43
+ 0.26130258,
44
+ 0.27577711
45
+ ],
46
+ "max_duration": 300,
47
+ "max_image_size": {
48
+ "longest_edge": 47040000
49
+ },
50
+ "merge_size": 2,
51
+ "num_frames": 16,
52
+ "patch_size": 14,
53
+ "resample": 3,
54
+ "rescale_factor": 0.00392156862745098,
55
+ "return_metadata": false,
56
+ "size": {
57
+ "longest_edge": 100352000,
58
+ "shortest_edge": 12544
59
+ },
60
+ "temporal_patch_size": 2,
61
+ "video_processor_type": "Glm46VVideoProcessor"
62
+ }
63
+ }
checkpoints/GLM-4.6V-Flash-SFT/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eecde1f225a86abef606164ceeb446737e592c4e7a40afe5cbf3ce8328e3df99
3
+ size 19970886
checkpoints/GLM-4.6V-Flash-SFT/tokenizer_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "backend": "tokenizers",
3
+ "clean_up_tokenization_spaces": false,
4
+ "do_lower_case": false,
5
+ "eos_token": "<|endoftext|>",
6
+ "extra_special_tokens": [
7
+ "<|user|>",
8
+ "<|observation|>",
9
+ "</answer>"
10
+ ],
11
+ "is_local": true,
12
+ "model_max_length": 128000,
13
+ "pad_token": "<|endoftext|>",
14
+ "padding_side": "right",
15
+ "processor_class": "Glm46VProcessor",
16
+ "remove_space": false,
17
+ "split_special_tokens": false,
18
+ "tokenizer_class": "TokenizersBackend"
19
+ }
checkpoints/GLM-4.6V-Flash-SFT/train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 2477163648385024.0,
4
+ "train_loss": 0.20598802658081056,
5
+ "train_runtime": 35266.4791,
6
+ "train_samples_per_second": 5.671,
7
+ "train_steps_per_second": 0.089
8
+ }
checkpoints/GLM-4.6V-Flash-SFT/trainer_state.json ADDED
@@ -0,0 +1,2227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3125,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0032,
14
+ "grad_norm": 20.093808181688754,
15
+ "learning_rate": 1.437699680511182e-07,
16
+ "loss": 0.7523126602172852,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.0064,
21
+ "grad_norm": 16.520568445399164,
22
+ "learning_rate": 3.0351437699680514e-07,
23
+ "loss": 0.684361743927002,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.0096,
28
+ "grad_norm": 7.062991511064744,
29
+ "learning_rate": 4.6325878594249205e-07,
30
+ "loss": 0.46736898422241213,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.0128,
35
+ "grad_norm": 1.0572338350229438,
36
+ "learning_rate": 6.230031948881789e-07,
37
+ "loss": 0.3222517013549805,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.016,
42
+ "grad_norm": 0.768970780796944,
43
+ "learning_rate": 7.82747603833866e-07,
44
+ "loss": 0.29146518707275393,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.0192,
49
+ "grad_norm": 0.8158618748659492,
50
+ "learning_rate": 9.424920127795528e-07,
51
+ "loss": 0.28341834545135497,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.0224,
56
+ "grad_norm": 0.7218086220464439,
57
+ "learning_rate": 1.1022364217252397e-06,
58
+ "loss": 0.2903137683868408,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.0256,
63
+ "grad_norm": 0.7459109221323802,
64
+ "learning_rate": 1.2619808306709266e-06,
65
+ "loss": 0.2718811988830566,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.0288,
70
+ "grad_norm": 0.7186860317140319,
71
+ "learning_rate": 1.4217252396166134e-06,
72
+ "loss": 0.2660067558288574,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.032,
77
+ "grad_norm": 0.765918500231858,
78
+ "learning_rate": 1.5814696485623005e-06,
79
+ "loss": 0.26980152130126955,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.0352,
84
+ "grad_norm": 0.7344200083929374,
85
+ "learning_rate": 1.7412140575079875e-06,
86
+ "loss": 0.2695180416107178,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.0384,
91
+ "grad_norm": 0.7057487416602337,
92
+ "learning_rate": 1.9009584664536742e-06,
93
+ "loss": 0.2582674264907837,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.0416,
98
+ "grad_norm": 0.6996888798419932,
99
+ "learning_rate": 2.060702875399361e-06,
100
+ "loss": 0.2612154960632324,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.0448,
105
+ "grad_norm": 0.7150606291134206,
106
+ "learning_rate": 2.220447284345048e-06,
107
+ "loss": 0.2520437717437744,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.048,
112
+ "grad_norm": 0.7697242977250355,
113
+ "learning_rate": 2.380191693290735e-06,
114
+ "loss": 0.2501786470413208,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.0512,
119
+ "grad_norm": 0.6327215717833664,
120
+ "learning_rate": 2.539936102236422e-06,
121
+ "loss": 0.24434318542480468,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.0544,
126
+ "grad_norm": 0.7947096523807732,
127
+ "learning_rate": 2.699680511182109e-06,
128
+ "loss": 0.25281600952148436,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.0576,
133
+ "grad_norm": 0.6717890611061146,
134
+ "learning_rate": 2.8594249201277955e-06,
135
+ "loss": 0.2454531669616699,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.0608,
140
+ "grad_norm": 0.7151585341922304,
141
+ "learning_rate": 3.0191693290734825e-06,
142
+ "loss": 0.2505363464355469,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.064,
147
+ "grad_norm": 0.8601334705182279,
148
+ "learning_rate": 3.17891373801917e-06,
149
+ "loss": 0.2505714178085327,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.0672,
154
+ "grad_norm": 0.6106680426063227,
155
+ "learning_rate": 3.3386581469648564e-06,
156
+ "loss": 0.24775364398956298,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.0704,
161
+ "grad_norm": 0.6262984320818072,
162
+ "learning_rate": 3.4984025559105434e-06,
163
+ "loss": 0.24066565036773682,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.0736,
168
+ "grad_norm": 0.6078537303186395,
169
+ "learning_rate": 3.6581469648562303e-06,
170
+ "loss": 0.24378209114074706,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.0768,
175
+ "grad_norm": 0.5889510426869463,
176
+ "learning_rate": 3.817891373801918e-06,
177
+ "loss": 0.23820171356201172,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.08,
182
+ "grad_norm": 0.5658292689427505,
183
+ "learning_rate": 3.977635782747604e-06,
184
+ "loss": 0.23654117584228515,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.0832,
189
+ "grad_norm": 0.5757166706348428,
190
+ "learning_rate": 4.137380191693291e-06,
191
+ "loss": 0.23743386268615724,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.0864,
196
+ "grad_norm": 0.5807034355359694,
197
+ "learning_rate": 4.297124600638978e-06,
198
+ "loss": 0.23970918655395507,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.0896,
203
+ "grad_norm": 0.5634022487351626,
204
+ "learning_rate": 4.456869009584665e-06,
205
+ "loss": 0.23490209579467775,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.0928,
210
+ "grad_norm": 0.5520223075835592,
211
+ "learning_rate": 4.616613418530352e-06,
212
+ "loss": 0.2404552936553955,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.096,
217
+ "grad_norm": 0.5587222430473198,
218
+ "learning_rate": 4.776357827476039e-06,
219
+ "loss": 0.24298410415649413,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.0992,
224
+ "grad_norm": 0.542281258937415,
225
+ "learning_rate": 4.936102236421725e-06,
226
+ "loss": 0.22964231967926024,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.1024,
231
+ "grad_norm": 0.6339707011249724,
232
+ "learning_rate": 4.999943833158769e-06,
233
+ "loss": 0.22938170433044433,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.1056,
238
+ "grad_norm": 0.5290859105179109,
239
+ "learning_rate": 4.999600600490783e-06,
240
+ "loss": 0.23717782497406006,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.1088,
245
+ "grad_norm": 0.574404257271199,
246
+ "learning_rate": 4.9989453817439345e-06,
247
+ "loss": 0.23035426139831544,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.112,
252
+ "grad_norm": 0.5887719210155044,
253
+ "learning_rate": 4.997978258698942e-06,
254
+ "loss": 0.230421781539917,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.1152,
259
+ "grad_norm": 0.5618660264892863,
260
+ "learning_rate": 4.996699352066659e-06,
261
+ "loss": 0.23192777633666992,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.1184,
266
+ "grad_norm": 0.589113954603133,
267
+ "learning_rate": 4.995108821473014e-06,
268
+ "loss": 0.23194873332977295,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.1216,
273
+ "grad_norm": 0.552581223712263,
274
+ "learning_rate": 4.993206865439084e-06,
275
+ "loss": 0.22629022598266602,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.1248,
280
+ "grad_norm": 0.5506631212695152,
281
+ "learning_rate": 4.990993721356317e-06,
282
+ "loss": 0.22567858695983886,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.128,
287
+ "grad_norm": 0.5210832665844604,
288
+ "learning_rate": 4.988469665456901e-06,
289
+ "loss": 0.22596418857574463,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.1312,
294
+ "grad_norm": 0.5132503738005023,
295
+ "learning_rate": 4.985635012779288e-06,
296
+ "loss": 0.23435051441192628,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.1344,
301
+ "grad_norm": 0.5264119522984109,
302
+ "learning_rate": 4.98249011712887e-06,
303
+ "loss": 0.2258882999420166,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.1376,
308
+ "grad_norm": 0.5122311697688684,
309
+ "learning_rate": 4.979035371033824e-06,
310
+ "loss": 0.22527906894683838,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.1408,
315
+ "grad_norm": 0.5105227090020142,
316
+ "learning_rate": 4.975271205696115e-06,
317
+ "loss": 0.2246992588043213,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.144,
322
+ "grad_norm": 0.5307268054645026,
323
+ "learning_rate": 4.971198090937671e-06,
324
+ "loss": 0.2193459987640381,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.1472,
329
+ "grad_norm": 0.46923570087876276,
330
+ "learning_rate": 4.966816535141756e-06,
331
+ "loss": 0.21553544998168944,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.1504,
336
+ "grad_norm": 0.4881836025298746,
337
+ "learning_rate": 4.9621270851895035e-06,
338
+ "loss": 0.22505784034729004,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.1536,
343
+ "grad_norm": 0.50506411723612,
344
+ "learning_rate": 4.957130326391662e-06,
345
+ "loss": 0.22673957347869872,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.1568,
350
+ "grad_norm": 0.5086993434891525,
351
+ "learning_rate": 4.951826882415544e-06,
352
+ "loss": 0.22294471263885499,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.16,
357
+ "grad_norm": 0.5280465251135189,
358
+ "learning_rate": 4.946217415207177e-06,
359
+ "loss": 0.21789300441741943,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.1632,
364
+ "grad_norm": 0.5337843871964275,
365
+ "learning_rate": 4.940302624908689e-06,
366
+ "loss": 0.22192811965942383,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.1664,
371
+ "grad_norm": 0.4884343559217744,
372
+ "learning_rate": 4.934083249770912e-06,
373
+ "loss": 0.21614904403686525,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.1696,
378
+ "grad_norm": 0.5316592538281818,
379
+ "learning_rate": 4.927560066061251e-06,
380
+ "loss": 0.21973915100097657,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.1728,
385
+ "grad_norm": 0.518761429695226,
386
+ "learning_rate": 4.920733887966783e-06,
387
+ "loss": 0.23207192420959472,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.176,
392
+ "grad_norm": 0.511452747175852,
393
+ "learning_rate": 4.913605567492636e-06,
394
+ "loss": 0.21878607273101808,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.1792,
399
+ "grad_norm": 0.49924599926539726,
400
+ "learning_rate": 4.906175994355656e-06,
401
+ "loss": 0.22075920104980468,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.1824,
406
+ "grad_norm": 0.5259698850641532,
407
+ "learning_rate": 4.898446095873345e-06,
408
+ "loss": 0.22276382446289061,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.1856,
413
+ "grad_norm": 0.501751014152873,
414
+ "learning_rate": 4.890416836848128e-06,
415
+ "loss": 0.21954989433288574,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.1888,
420
+ "grad_norm": 0.5167201593356286,
421
+ "learning_rate": 4.882089219446925e-06,
422
+ "loss": 0.2145029067993164,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.192,
427
+ "grad_norm": 0.5006060240232905,
428
+ "learning_rate": 4.873464283076074e-06,
429
+ "loss": 0.22003324031829835,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.1952,
434
+ "grad_norm": 0.4477538874438277,
435
+ "learning_rate": 4.864543104251587e-06,
436
+ "loss": 0.21916275024414061,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.1984,
441
+ "grad_norm": 0.4832933241270485,
442
+ "learning_rate": 4.855326796464798e-06,
443
+ "loss": 0.2203526973724365,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.2016,
448
+ "grad_norm": 0.5359361967005408,
449
+ "learning_rate": 4.8458165100433725e-06,
450
+ "loss": 0.21596732139587402,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.2048,
455
+ "grad_norm": 0.5708003689943741,
456
+ "learning_rate": 4.836013432007738e-06,
457
+ "loss": 0.2171140193939209,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.208,
462
+ "grad_norm": 0.4831169531465719,
463
+ "learning_rate": 4.825918785922921e-06,
464
+ "loss": 0.22040581703186035,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.2112,
469
+ "grad_norm": 0.4982382400104379,
470
+ "learning_rate": 4.8155338317458315e-06,
471
+ "loss": 0.21841506958007811,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.2144,
476
+ "grad_norm": 0.4741071764041748,
477
+ "learning_rate": 4.804859865668002e-06,
478
+ "loss": 0.2143453598022461,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.2176,
483
+ "grad_norm": 0.47853550451884025,
484
+ "learning_rate": 4.793898219953804e-06,
485
+ "loss": 0.21545085906982422,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.2208,
490
+ "grad_norm": 0.4902247743421047,
491
+ "learning_rate": 4.782650262774164e-06,
492
+ "loss": 0.2166231393814087,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.224,
497
+ "grad_norm": 0.4611717059287351,
498
+ "learning_rate": 4.7711173980357886e-06,
499
+ "loss": 0.21284222602844238,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.2272,
504
+ "grad_norm": 0.4815654128340087,
505
+ "learning_rate": 4.759301065205947e-06,
506
+ "loss": 0.21358721256256102,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.2304,
511
+ "grad_norm": 0.5049245613626656,
512
+ "learning_rate": 4.7472027391328e-06,
513
+ "loss": 0.21447527408599854,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.2336,
518
+ "grad_norm": 0.4758997167389971,
519
+ "learning_rate": 4.734823929861317e-06,
520
+ "loss": 0.21809780597686768,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.2368,
525
+ "grad_norm": 0.5423173365143716,
526
+ "learning_rate": 4.722166182444801e-06,
527
+ "loss": 0.21390962600708008,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.24,
532
+ "grad_norm": 0.44572231492476455,
533
+ "learning_rate": 4.709231076752045e-06,
534
+ "loss": 0.21404554843902587,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.2432,
539
+ "grad_norm": 0.4848421373802031,
540
+ "learning_rate": 4.696020227270142e-06,
541
+ "loss": 0.21710457801818847,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.2464,
546
+ "grad_norm": 0.518532765750562,
547
+ "learning_rate": 4.6825352829029705e-06,
548
+ "loss": 0.21285481452941896,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.2496,
553
+ "grad_norm": 0.5008678397970389,
554
+ "learning_rate": 4.668777926765392e-06,
555
+ "loss": 0.21155524253845215,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.2528,
560
+ "grad_norm": 0.48720974823345864,
561
+ "learning_rate": 4.6547498759731725e-06,
562
+ "loss": 0.20655455589294433,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.256,
567
+ "grad_norm": 0.49528977499161353,
568
+ "learning_rate": 4.6404528814286575e-06,
569
+ "loss": 0.2101435422897339,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.2592,
574
+ "grad_norm": 0.4532686250809506,
575
+ "learning_rate": 4.6258887276022425e-06,
576
+ "loss": 0.21684365272521972,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.2624,
581
+ "grad_norm": 0.49803115837380546,
582
+ "learning_rate": 4.611059232309639e-06,
583
+ "loss": 0.21193151473999022,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.2656,
588
+ "grad_norm": 0.5153783225404047,
589
+ "learning_rate": 4.595966246484986e-06,
590
+ "loss": 0.21344296932220458,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.2688,
595
+ "grad_norm": 0.4765272009238815,
596
+ "learning_rate": 4.580611653949829e-06,
597
+ "loss": 0.21319386959075928,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.272,
602
+ "grad_norm": 0.5228745905777464,
603
+ "learning_rate": 4.564997371177992e-06,
604
+ "loss": 0.21112470626831054,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.2752,
609
+ "grad_norm": 0.4583805155148445,
610
+ "learning_rate": 4.54912534705637e-06,
611
+ "loss": 0.2108391284942627,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.2784,
616
+ "grad_norm": 0.4920259584441244,
617
+ "learning_rate": 4.532997562641683e-06,
618
+ "loss": 0.20768051147460936,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.2816,
623
+ "grad_norm": 0.5200095181799963,
624
+ "learning_rate": 4.516616030913214e-06,
625
+ "loss": 0.21211957931518555,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.2848,
630
+ "grad_norm": 0.4788503683270311,
631
+ "learning_rate": 4.499982796521556e-06,
632
+ "loss": 0.20693025588989258,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.288,
637
+ "grad_norm": 0.4666456137071941,
638
+ "learning_rate": 4.48309993553341e-06,
639
+ "loss": 0.20890872478485106,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.2912,
644
+ "grad_norm": 0.4794527139448749,
645
+ "learning_rate": 4.465969555172468e-06,
646
+ "loss": 0.20777955055236816,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.2944,
651
+ "grad_norm": 0.4616610840587355,
652
+ "learning_rate": 4.448593793556391e-06,
653
+ "loss": 0.21416122913360597,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.2976,
658
+ "grad_norm": 0.47725407011391663,
659
+ "learning_rate": 4.430974819429954e-06,
660
+ "loss": 0.20783448219299316,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.3008,
665
+ "grad_norm": 0.4596350013424985,
666
+ "learning_rate": 4.413114831894344e-06,
667
+ "loss": 0.20199823379516602,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.304,
672
+ "grad_norm": 0.4940149958405755,
673
+ "learning_rate": 4.3950160601326865e-06,
674
+ "loss": 0.20049993991851806,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.3072,
679
+ "grad_norm": 0.4891958940488766,
680
+ "learning_rate": 4.376680763131811e-06,
681
+ "loss": 0.20765538215637208,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.3104,
686
+ "grad_norm": 0.5373640149223949,
687
+ "learning_rate": 4.358111229400296e-06,
688
+ "loss": 0.2103745460510254,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.3136,
693
+ "grad_norm": 0.5035919946088194,
694
+ "learning_rate": 4.33930977668283e-06,
695
+ "loss": 0.2148181438446045,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.3168,
700
+ "grad_norm": 0.498832420199319,
701
+ "learning_rate": 4.320278751670922e-06,
702
+ "loss": 0.20667800903320313,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.32,
707
+ "grad_norm": 0.5016480811009209,
708
+ "learning_rate": 4.301020529710009e-06,
709
+ "loss": 0.20847175121307374,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.3232,
714
+ "grad_norm": 0.5355131410598809,
715
+ "learning_rate": 4.281537514502962e-06,
716
+ "loss": 0.21192097663879395,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.3264,
721
+ "grad_norm": 0.49710771531514497,
722
+ "learning_rate": 4.261832137810093e-06,
723
+ "loss": 0.20849306583404542,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.3296,
728
+ "grad_norm": 0.4702938633516668,
729
+ "learning_rate": 4.241906859145611e-06,
730
+ "loss": 0.20947628021240233,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.3328,
735
+ "grad_norm": 0.47328762785100176,
736
+ "learning_rate": 4.221764165470661e-06,
737
+ "loss": 0.20568199157714845,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.336,
742
+ "grad_norm": 0.48090607151875236,
743
+ "learning_rate": 4.201406570882898e-06,
744
+ "loss": 0.20522446632385255,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.3392,
749
+ "grad_norm": 0.46870182419574746,
750
+ "learning_rate": 4.180836616302704e-06,
751
+ "loss": 0.2044762134552002,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.3424,
756
+ "grad_norm": 0.49284234006242156,
757
+ "learning_rate": 4.160056869156041e-06,
758
+ "loss": 0.20835609436035157,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.3456,
763
+ "grad_norm": 0.425482663225026,
764
+ "learning_rate": 4.139069923053995e-06,
765
+ "loss": 0.20575876235961915,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.3488,
770
+ "grad_norm": 0.46647669293000804,
771
+ "learning_rate": 4.117878397469062e-06,
772
+ "loss": 0.20992250442504884,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.352,
777
+ "grad_norm": 0.4464343988416538,
778
+ "learning_rate": 4.096484937408195e-06,
779
+ "loss": 0.20092244148254396,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.3552,
784
+ "grad_norm": 0.5116088744854695,
785
+ "learning_rate": 4.074892213082676e-06,
786
+ "loss": 0.20036702156066893,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.3584,
791
+ "grad_norm": 4.940314739525779,
792
+ "learning_rate": 4.0531029195748265e-06,
793
+ "loss": 0.21338913440704346,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.3616,
798
+ "grad_norm": 0.4721397920115156,
799
+ "learning_rate": 4.03111977650163e-06,
800
+ "loss": 0.20792775154113768,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.3648,
805
+ "grad_norm": 0.5105519348301445,
806
+ "learning_rate": 4.008945527675281e-06,
807
+ "loss": 0.2061443328857422,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.368,
812
+ "grad_norm": 0.523180958068929,
813
+ "learning_rate": 3.986582940760717e-06,
814
+ "loss": 0.1962942123413086,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.3712,
819
+ "grad_norm": 0.5027335828799008,
820
+ "learning_rate": 3.9640348069301785e-06,
821
+ "loss": 0.2031947612762451,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.3744,
826
+ "grad_norm": 0.48735270934050073,
827
+ "learning_rate": 3.941303940514826e-06,
828
+ "loss": 0.20448057651519774,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.3776,
833
+ "grad_norm": 0.5075332440871839,
834
+ "learning_rate": 3.918393178653472e-06,
835
+ "loss": 0.20594587326049804,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.3808,
840
+ "grad_norm": 0.4485083644552742,
841
+ "learning_rate": 3.895305380938468e-06,
842
+ "loss": 0.20264167785644532,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.384,
847
+ "grad_norm": 0.4568492727427137,
848
+ "learning_rate": 3.872043429058783e-06,
849
+ "loss": 0.20010733604431152,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.3872,
854
+ "grad_norm": 0.46103501808297814,
855
+ "learning_rate": 3.84861022644033e-06,
856
+ "loss": 0.2026883602142334,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.3904,
861
+ "grad_norm": 0.46609834517793386,
862
+ "learning_rate": 3.825008697883574e-06,
863
+ "loss": 0.21079249382019044,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.3936,
868
+ "grad_norm": 0.49992288047242467,
869
+ "learning_rate": 3.8012417891984776e-06,
870
+ "loss": 0.2031094551086426,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.3968,
875
+ "grad_norm": 0.4746264528155682,
876
+ "learning_rate": 3.777312466836819e-06,
877
+ "loss": 0.20238199234008789,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.4,
882
+ "grad_norm": 0.45243385346205817,
883
+ "learning_rate": 3.7532237175219378e-06,
884
+ "loss": 0.20085253715515136,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.4032,
889
+ "grad_norm": 0.48931316379420287,
890
+ "learning_rate": 3.728978547875948e-06,
891
+ "loss": 0.20520598888397218,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.4064,
896
+ "grad_norm": 0.5229456414008956,
897
+ "learning_rate": 3.7045799840444712e-06,
898
+ "loss": 0.19984333515167235,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.4096,
903
+ "grad_norm": 0.4773055647919508,
904
+ "learning_rate": 3.6800310713189258e-06,
905
+ "loss": 0.20064287185668944,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.4128,
910
+ "grad_norm": 0.4824962267097886,
911
+ "learning_rate": 3.6553348737564328e-06,
912
+ "loss": 0.20138092041015626,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.416,
917
+ "grad_norm": 0.47245858486532044,
918
+ "learning_rate": 3.6304944737973794e-06,
919
+ "loss": 0.20704314708709717,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.4192,
924
+ "grad_norm": 0.47670774891547607,
925
+ "learning_rate": 3.6055129718806836e-06,
926
+ "loss": 0.20015296936035157,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.4224,
931
+ "grad_norm": 0.4553061754046557,
932
+ "learning_rate": 3.5803934860568134e-06,
933
+ "loss": 0.19692450761795044,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.4256,
938
+ "grad_norm": 0.5124220374815842,
939
+ "learning_rate": 3.5551391515986163e-06,
940
+ "loss": 0.2016448497772217,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.4288,
945
+ "grad_norm": 0.4809826187082155,
946
+ "learning_rate": 3.529753120609982e-06,
947
+ "loss": 0.19793987274169922,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.432,
952
+ "grad_norm": 0.48798480914379067,
953
+ "learning_rate": 3.5042385616324243e-06,
954
+ "loss": 0.20041651725769044,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.4352,
959
+ "grad_norm": 0.4589600174491072,
960
+ "learning_rate": 3.4785986592495934e-06,
961
+ "loss": 0.19874777793884277,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.4384,
966
+ "grad_norm": 0.44810416886840765,
967
+ "learning_rate": 3.452836613689803e-06,
968
+ "loss": 0.19696075916290284,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.4416,
973
+ "grad_norm": 0.4584133576368786,
974
+ "learning_rate": 3.426955640426584e-06,
975
+ "loss": 0.20014967918395996,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.4448,
980
+ "grad_norm": 0.46474214573205574,
981
+ "learning_rate": 3.4009589697773605e-06,
982
+ "loss": 0.19937365055084227,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.448,
987
+ "grad_norm": 0.4671452045462699,
988
+ "learning_rate": 3.3748498465002475e-06,
989
+ "loss": 0.19703936576843262,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.4512,
994
+ "grad_norm": 0.48450994567172556,
995
+ "learning_rate": 3.3486315293890693e-06,
996
+ "loss": 0.20506525039672852,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.4544,
1001
+ "grad_norm": 0.48940983460177095,
1002
+ "learning_rate": 3.3223072908666053e-06,
1003
+ "loss": 0.19508613348007203,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.4576,
1008
+ "grad_norm": 0.5510507698314822,
1009
+ "learning_rate": 3.295880416576153e-06,
1010
+ "loss": 0.20555310249328612,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.4608,
1015
+ "grad_norm": 0.45473195837081576,
1016
+ "learning_rate": 3.269354204971427e-06,
1017
+ "loss": 0.19813575744628906,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.464,
1022
+ "grad_norm": 0.4854091562037593,
1023
+ "learning_rate": 3.242731966904865e-06,
1024
+ "loss": 0.19694712162017822,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.4672,
1029
+ "grad_norm": 0.4637441174996577,
1030
+ "learning_rate": 3.2160170252143913e-06,
1031
+ "loss": 0.1959088087081909,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 0.4704,
1036
+ "grad_norm": 0.4460606032902631,
1037
+ "learning_rate": 3.1892127143086716e-06,
1038
+ "loss": 0.20340628623962403,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 0.4736,
1043
+ "grad_norm": 0.4768689558424143,
1044
+ "learning_rate": 3.1623223797509347e-06,
1045
+ "loss": 0.19146734476089478,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 0.4768,
1050
+ "grad_norm": 0.46631038217283505,
1051
+ "learning_rate": 3.135349377841396e-06,
1052
+ "loss": 0.19588179588317872,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 0.48,
1057
+ "grad_norm": 0.48197350793708515,
1058
+ "learning_rate": 3.1082970751983497e-06,
1059
+ "loss": 0.20245718955993652,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.4832,
1064
+ "grad_norm": 0.44408940491911375,
1065
+ "learning_rate": 3.0811688483379546e-06,
1066
+ "loss": 0.19959219694137573,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.4864,
1071
+ "grad_norm": 0.47255519902507054,
1072
+ "learning_rate": 3.0539680832528074e-06,
1073
+ "loss": 0.1994904398918152,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.4896,
1078
+ "grad_norm": 0.48800627171777977,
1079
+ "learning_rate": 3.026698174989316e-06,
1080
+ "loss": 0.19807126522064208,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.4928,
1085
+ "grad_norm": 0.4748737132528679,
1086
+ "learning_rate": 2.999362527223952e-06,
1087
+ "loss": 0.19806113243103027,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.496,
1092
+ "grad_norm": 0.47637730688550123,
1093
+ "learning_rate": 2.9719645518384194e-06,
1094
+ "loss": 0.19955278635025026,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.4992,
1099
+ "grad_norm": 0.5411554495039922,
1100
+ "learning_rate": 2.944507668493807e-06,
1101
+ "loss": 0.202299165725708,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.5024,
1106
+ "grad_norm": 0.48642193804707995,
1107
+ "learning_rate": 2.9169953042037623e-06,
1108
+ "loss": 0.19863581657409668,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.5056,
1113
+ "grad_norm": 0.5363553346933208,
1114
+ "learning_rate": 2.889430892906754e-06,
1115
+ "loss": 0.19409118890762328,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.5088,
1120
+ "grad_norm": 0.47187050499878397,
1121
+ "learning_rate": 2.861817875037462e-06,
1122
+ "loss": 0.1912764310836792,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.512,
1127
+ "grad_norm": 0.5163595948637988,
1128
+ "learning_rate": 2.8341596970973683e-06,
1129
+ "loss": 0.20115599632263184,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.5152,
1134
+ "grad_norm": 0.5033907485073755,
1135
+ "learning_rate": 2.80645981122458e-06,
1136
+ "loss": 0.19687057733535768,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.5184,
1141
+ "grad_norm": 0.4753722793172304,
1142
+ "learning_rate": 2.7787216747629508e-06,
1143
+ "loss": 0.20292258262634277,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.5216,
1148
+ "grad_norm": 0.46781165760957,
1149
+ "learning_rate": 2.7509487498305615e-06,
1150
+ "loss": 0.18959319591522217,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.5248,
1155
+ "grad_norm": 0.4803554793777817,
1156
+ "learning_rate": 2.7231445028875924e-06,
1157
+ "loss": 0.19619333744049072,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.528,
1162
+ "grad_norm": 0.43719126287209875,
1163
+ "learning_rate": 2.6953124043036604e-06,
1164
+ "loss": 0.19511375427246094,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.5312,
1169
+ "grad_norm": 0.4689037514921924,
1170
+ "learning_rate": 2.667455927924667e-06,
1171
+ "loss": 0.19399585723876953,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.5344,
1176
+ "grad_norm": 0.48479905355532704,
1177
+ "learning_rate": 2.6395785506392164e-06,
1178
+ "loss": 0.1896076202392578,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.5376,
1183
+ "grad_norm": 0.516453973005613,
1184
+ "learning_rate": 2.6116837519446407e-06,
1185
+ "loss": 0.1939442992210388,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.5408,
1190
+ "grad_norm": 0.47710575683228795,
1191
+ "learning_rate": 2.5837750135127192e-06,
1192
+ "loss": 0.19078316688537597,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.544,
1197
+ "grad_norm": 0.47654319681013313,
1198
+ "learning_rate": 2.555855818755108e-06,
1199
+ "loss": 0.19690483808517456,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.5472,
1204
+ "grad_norm": 0.5030326386548561,
1205
+ "learning_rate": 2.5279296523885636e-06,
1206
+ "loss": 0.19325432777404786,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.5504,
1211
+ "grad_norm": 0.49452423153374125,
1212
+ "learning_rate": 2.5e-06,
1213
+ "loss": 0.19436432123184205,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.5536,
1218
+ "grad_norm": 0.5135088244704792,
1219
+ "learning_rate": 2.472070347611437e-06,
1220
+ "loss": 0.1878933072090149,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.5568,
1225
+ "grad_norm": 0.5160118206798595,
1226
+ "learning_rate": 2.444144181244893e-06,
1227
+ "loss": 0.19355961084365844,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.56,
1232
+ "grad_norm": 0.5069308846787346,
1233
+ "learning_rate": 2.416224986487282e-06,
1234
+ "loss": 0.19122695922851562,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.5632,
1239
+ "grad_norm": 0.5385800538703149,
1240
+ "learning_rate": 2.3883162480553605e-06,
1241
+ "loss": 0.18820159435272216,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.5664,
1246
+ "grad_norm": 0.49129457413116234,
1247
+ "learning_rate": 2.3604214493607844e-06,
1248
+ "loss": 0.19197521209716797,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.5696,
1253
+ "grad_norm": 0.4908165776123557,
1254
+ "learning_rate": 2.332544072075333e-06,
1255
+ "loss": 0.19534649848937988,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.5728,
1260
+ "grad_norm": 0.49497656453552125,
1261
+ "learning_rate": 2.30468759569634e-06,
1262
+ "loss": 0.19484236240386962,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.576,
1267
+ "grad_norm": 0.466973816624908,
1268
+ "learning_rate": 2.276855497112408e-06,
1269
+ "loss": 0.191474986076355,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.5792,
1274
+ "grad_norm": 0.498294237386886,
1275
+ "learning_rate": 2.2490512501694394e-06,
1276
+ "loss": 0.18636202812194824,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.5824,
1281
+ "grad_norm": 0.5110432771457695,
1282
+ "learning_rate": 2.2212783252370496e-06,
1283
+ "loss": 0.19112749099731446,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.5856,
1288
+ "grad_norm": 0.4923044532988948,
1289
+ "learning_rate": 2.1935401887754213e-06,
1290
+ "loss": 0.19590845108032226,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.5888,
1295
+ "grad_norm": 0.49881036242858373,
1296
+ "learning_rate": 2.165840302902632e-06,
1297
+ "loss": 0.18917866945266723,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.592,
1302
+ "grad_norm": 0.5070848566140863,
1303
+ "learning_rate": 2.1381821249625383e-06,
1304
+ "loss": 0.1955878973007202,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.5952,
1309
+ "grad_norm": 0.5245919327161893,
1310
+ "learning_rate": 2.1105691070932465e-06,
1311
+ "loss": 0.18681724071502687,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.5984,
1316
+ "grad_norm": 0.5043139368489675,
1317
+ "learning_rate": 2.083004695796238e-06,
1318
+ "loss": 0.185194993019104,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.6016,
1323
+ "grad_norm": 0.5180452275250914,
1324
+ "learning_rate": 2.055492331506194e-06,
1325
+ "loss": 0.1928567886352539,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.6048,
1330
+ "grad_norm": 0.5320215436686966,
1331
+ "learning_rate": 2.0280354481615814e-06,
1332
+ "loss": 0.19074957370758056,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.608,
1337
+ "grad_norm": 0.4725862343819939,
1338
+ "learning_rate": 2.000637472776049e-06,
1339
+ "loss": 0.19257795810699463,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.6112,
1344
+ "grad_norm": 0.46908638481055026,
1345
+ "learning_rate": 1.973301825010685e-06,
1346
+ "loss": 0.18594731092453004,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.6144,
1351
+ "grad_norm": 0.5595713557618127,
1352
+ "learning_rate": 1.9460319167471934e-06,
1353
+ "loss": 0.19121139049530028,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.6176,
1358
+ "grad_norm": 0.507704360185881,
1359
+ "learning_rate": 1.9188311516620466e-06,
1360
+ "loss": 0.18624544143676758,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.6208,
1365
+ "grad_norm": 0.4860192603301521,
1366
+ "learning_rate": 1.891702924801651e-06,
1367
+ "loss": 0.19231630563735963,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.624,
1372
+ "grad_norm": 0.5275367662218493,
1373
+ "learning_rate": 1.864650622158604e-06,
1374
+ "loss": 0.19608126878738402,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.6272,
1379
+ "grad_norm": 0.49282562967431837,
1380
+ "learning_rate": 1.8376776202490666e-06,
1381
+ "loss": 0.19235665798187257,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.6304,
1386
+ "grad_norm": 0.5182260002744055,
1387
+ "learning_rate": 1.8107872856913293e-06,
1388
+ "loss": 0.18613014221191407,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.6336,
1393
+ "grad_norm": 0.5103313601861706,
1394
+ "learning_rate": 1.7839829747856096e-06,
1395
+ "loss": 0.1881113052368164,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.6368,
1400
+ "grad_norm": 0.5451499180289584,
1401
+ "learning_rate": 1.7572680330951359e-06,
1402
+ "loss": 0.18735458850860595,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.64,
1407
+ "grad_norm": 0.5090636315844644,
1408
+ "learning_rate": 1.7306457950285747e-06,
1409
+ "loss": 0.1885282278060913,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.6432,
1414
+ "grad_norm": 0.4758742975901025,
1415
+ "learning_rate": 1.704119583423848e-06,
1416
+ "loss": 0.18241598606109619,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.6464,
1421
+ "grad_norm": 0.49602490022248863,
1422
+ "learning_rate": 1.677692709133396e-06,
1423
+ "loss": 0.19074147939682007,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.6496,
1428
+ "grad_norm": 0.520455285125112,
1429
+ "learning_rate": 1.6513684706109311e-06,
1430
+ "loss": 0.19024887084960937,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.6528,
1435
+ "grad_norm": 0.5234524283247538,
1436
+ "learning_rate": 1.6251501534997529e-06,
1437
+ "loss": 0.18900917768478392,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.656,
1442
+ "grad_norm": 0.4762667999370438,
1443
+ "learning_rate": 1.5990410302226405e-06,
1444
+ "loss": 0.18147594928741456,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.6592,
1449
+ "grad_norm": 0.4931916769975977,
1450
+ "learning_rate": 1.5730443595734162e-06,
1451
+ "loss": 0.18815698623657226,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.6624,
1456
+ "grad_norm": 0.5595459804684163,
1457
+ "learning_rate": 1.5471633863101982e-06,
1458
+ "loss": 0.18958520889282227,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.6656,
1463
+ "grad_norm": 0.551381176131532,
1464
+ "learning_rate": 1.521401340750407e-06,
1465
+ "loss": 0.1908926248550415,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.6688,
1470
+ "grad_norm": 0.5155022860725758,
1471
+ "learning_rate": 1.495761438367577e-06,
1472
+ "loss": 0.18872777223587037,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.672,
1477
+ "grad_norm": 0.6037433446756716,
1478
+ "learning_rate": 1.4702468793900187e-06,
1479
+ "loss": 0.18800405263900757,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 0.6752,
1484
+ "grad_norm": 0.5613773833705744,
1485
+ "learning_rate": 1.444860848401384e-06,
1486
+ "loss": 0.18743778467178346,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 0.6784,
1491
+ "grad_norm": 0.5277286435676816,
1492
+ "learning_rate": 1.4196065139431866e-06,
1493
+ "loss": 0.18769149780273436,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 0.6816,
1498
+ "grad_norm": 0.5487755330646784,
1499
+ "learning_rate": 1.3944870281193178e-06,
1500
+ "loss": 0.1866753101348877,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 0.6848,
1505
+ "grad_norm": 0.5319334450957595,
1506
+ "learning_rate": 1.3695055262026208e-06,
1507
+ "loss": 0.19193503856658936,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 0.688,
1512
+ "grad_norm": 0.5061777243502238,
1513
+ "learning_rate": 1.3446651262435679e-06,
1514
+ "loss": 0.18499069213867186,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 0.6912,
1519
+ "grad_norm": 0.5063080834031065,
1520
+ "learning_rate": 1.3199689286810746e-06,
1521
+ "loss": 0.18700281381607056,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 0.6944,
1526
+ "grad_norm": 0.5014045449596041,
1527
+ "learning_rate": 1.2954200159555294e-06,
1528
+ "loss": 0.18185386657714844,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 0.6976,
1533
+ "grad_norm": 0.5417896517828541,
1534
+ "learning_rate": 1.2710214521240527e-06,
1535
+ "loss": 0.18632771968841552,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 0.7008,
1540
+ "grad_norm": 0.5710908799443121,
1541
+ "learning_rate": 1.246776282478063e-06,
1542
+ "loss": 0.18732945919036864,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 0.704,
1547
+ "grad_norm": 0.5180508096448415,
1548
+ "learning_rate": 1.222687533163181e-06,
1549
+ "loss": 0.18602204322814941,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 0.7072,
1554
+ "grad_norm": 0.5480758918229119,
1555
+ "learning_rate": 1.1987582108015228e-06,
1556
+ "loss": 0.18710973262786865,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 0.7104,
1561
+ "grad_norm": 0.5631818126474104,
1562
+ "learning_rate": 1.1749913021164255e-06,
1563
+ "loss": 0.18828771114349366,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 0.7136,
1568
+ "grad_norm": 0.4833634541431531,
1569
+ "learning_rate": 1.1513897735596702e-06,
1570
+ "loss": 0.18257718086242675,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 0.7168,
1575
+ "grad_norm": 0.5051522117897481,
1576
+ "learning_rate": 1.127956570941218e-06,
1577
+ "loss": 0.17966469526290893,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 0.72,
1582
+ "grad_norm": 0.5404271805851407,
1583
+ "learning_rate": 1.104694619061533e-06,
1584
+ "loss": 0.18814800977706908,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.7232,
1589
+ "grad_norm": 0.5147342090287059,
1590
+ "learning_rate": 1.0816068213465295e-06,
1591
+ "loss": 0.1908186197280884,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.7264,
1596
+ "grad_norm": 0.5558495401174878,
1597
+ "learning_rate": 1.0586960594851762e-06,
1598
+ "loss": 0.1859324097633362,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.7296,
1603
+ "grad_norm": 0.6185737554957568,
1604
+ "learning_rate": 1.0359651930698217e-06,
1605
+ "loss": 0.18477405309677125,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.7328,
1610
+ "grad_norm": 0.5398647348951853,
1611
+ "learning_rate": 1.0134170592392837e-06,
1612
+ "loss": 0.1857767939567566,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.736,
1617
+ "grad_norm": 0.5450678028060058,
1618
+ "learning_rate": 9.910544723247204e-07,
1619
+ "loss": 0.184822678565979,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.7392,
1624
+ "grad_norm": 0.5999082382312588,
1625
+ "learning_rate": 9.688802234983706e-07,
1626
+ "loss": 0.18381783962249756,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.7424,
1631
+ "grad_norm": 0.5175099712487172,
1632
+ "learning_rate": 9.468970804251742e-07,
1633
+ "loss": 0.18641353845596315,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.7456,
1638
+ "grad_norm": 0.5367638040398911,
1639
+ "learning_rate": 9.251077869173244e-07,
1640
+ "loss": 0.18090612888336183,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.7488,
1645
+ "grad_norm": 0.563594153188617,
1646
+ "learning_rate": 9.035150625918054e-07,
1647
+ "loss": 0.18149322271347046,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.752,
1652
+ "grad_norm": 0.5304713442318342,
1653
+ "learning_rate": 8.821216025309395e-07,
1654
+ "loss": 0.18464915752410888,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.7552,
1659
+ "grad_norm": 0.535119183480021,
1660
+ "learning_rate": 8.609300769460055e-07,
1661
+ "loss": 0.1792607307434082,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.7584,
1666
+ "grad_norm": 0.5724539486438234,
1667
+ "learning_rate": 8.399431308439592e-07,
1668
+ "loss": 0.183684778213501,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.7616,
1673
+ "grad_norm": 0.5589161632397335,
1674
+ "learning_rate": 8.191633836972962e-07,
1675
+ "loss": 0.18650429248809813,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.7648,
1680
+ "grad_norm": 0.5386156132762686,
1681
+ "learning_rate": 7.985934291171024e-07,
1682
+ "loss": 0.1821720838546753,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.768,
1687
+ "grad_norm": 0.5321288466713382,
1688
+ "learning_rate": 7.7823583452934e-07,
1689
+ "loss": 0.18489625453948974,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.7712,
1694
+ "grad_norm": 0.5670301824645666,
1695
+ "learning_rate": 7.58093140854389e-07,
1696
+ "loss": 0.18495336771011353,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.7744,
1701
+ "grad_norm": 0.6058756306995335,
1702
+ "learning_rate": 7.381678621899077e-07,
1703
+ "loss": 0.1848145008087158,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.7776,
1708
+ "grad_norm": 0.5477002870283818,
1709
+ "learning_rate": 7.184624854970379e-07,
1710
+ "loss": 0.1817490816116333,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.7808,
1715
+ "grad_norm": 0.5458027173632266,
1716
+ "learning_rate": 6.989794702899932e-07,
1717
+ "loss": 0.18078404664993286,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.784,
1722
+ "grad_norm": 0.5772130708628379,
1723
+ "learning_rate": 6.797212483290777e-07,
1724
+ "loss": 0.18299766778945922,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.7872,
1729
+ "grad_norm": 0.5674146932938366,
1730
+ "learning_rate": 6.60690223317171e-07,
1731
+ "loss": 0.1799448013305664,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.7904,
1736
+ "grad_norm": 0.5238538237059384,
1737
+ "learning_rate": 6.418887705997046e-07,
1738
+ "loss": 0.1826066255569458,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.7936,
1743
+ "grad_norm": 0.5857270779434125,
1744
+ "learning_rate": 6.23319236868189e-07,
1745
+ "loss": 0.18549437522888185,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.7968,
1750
+ "grad_norm": 0.5274424793724192,
1751
+ "learning_rate": 6.049839398673141e-07,
1752
+ "loss": 0.1865037798881531,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.8,
1757
+ "grad_norm": 0.5820741885019232,
1758
+ "learning_rate": 5.868851681056567e-07,
1759
+ "loss": 0.18739759922027588,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.8032,
1764
+ "grad_norm": 0.559971376703767,
1765
+ "learning_rate": 5.690251805700467e-07,
1766
+ "loss": 0.1853170394897461,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.8064,
1771
+ "grad_norm": 0.5456407872897143,
1772
+ "learning_rate": 5.514062064436096e-07,
1773
+ "loss": 0.18589026927948,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.8096,
1778
+ "grad_norm": 0.5866178273652722,
1779
+ "learning_rate": 5.34030444827533e-07,
1780
+ "loss": 0.1827709197998047,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.8128,
1785
+ "grad_norm": 0.588749656654477,
1786
+ "learning_rate": 5.169000644665895e-07,
1787
+ "loss": 0.1794450044631958,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.816,
1792
+ "grad_norm": 0.5778176841150756,
1793
+ "learning_rate": 5.000172034784442e-07,
1794
+ "loss": 0.18060548305511476,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.8192,
1799
+ "grad_norm": 0.566426267196354,
1800
+ "learning_rate": 4.833839690867853e-07,
1801
+ "loss": 0.18326361179351808,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.8224,
1806
+ "grad_norm": 0.5763812670051818,
1807
+ "learning_rate": 4.6700243735831705e-07,
1808
+ "loss": 0.17798151969909667,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.8256,
1813
+ "grad_norm": 0.5465254160649792,
1814
+ "learning_rate": 4.508746529436311e-07,
1815
+ "loss": 0.1761394739151001,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.8288,
1820
+ "grad_norm": 0.5717164779412172,
1821
+ "learning_rate": 4.350026288220083e-07,
1822
+ "loss": 0.18241602182388306,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.832,
1827
+ "grad_norm": 0.5532919690194787,
1828
+ "learning_rate": 4.1938834605017133e-07,
1829
+ "loss": 0.1799800157546997,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.8352,
1834
+ "grad_norm": 0.5485503614596886,
1835
+ "learning_rate": 4.0403375351501515e-07,
1836
+ "loss": 0.18037915229797363,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.8384,
1841
+ "grad_norm": 0.5921392059955939,
1842
+ "learning_rate": 3.88940767690362e-07,
1843
+ "loss": 0.17850807905197144,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.8416,
1848
+ "grad_norm": 0.6173777417506611,
1849
+ "learning_rate": 3.7411127239775774e-07,
1850
+ "loss": 0.17773046493530273,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.8448,
1855
+ "grad_norm": 0.5704461135916385,
1856
+ "learning_rate": 3.595471185713431e-07,
1857
+ "loss": 0.17534157037734985,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.848,
1862
+ "grad_norm": 0.6016600022490033,
1863
+ "learning_rate": 3.4525012402682826e-07,
1864
+ "loss": 0.17784465551376344,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.8512,
1869
+ "grad_norm": 0.5793357844007763,
1870
+ "learning_rate": 3.3122207323460804e-07,
1871
+ "loss": 0.17941689491271973,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.8544,
1876
+ "grad_norm": 0.5402101980665998,
1877
+ "learning_rate": 3.1746471709702963e-07,
1878
+ "loss": 0.17694177627563476,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.8576,
1883
+ "grad_norm": 0.5764717205309013,
1884
+ "learning_rate": 3.039797727298585e-07,
1885
+ "loss": 0.18307201862335204,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.8608,
1890
+ "grad_norm": 0.6021889152147203,
1891
+ "learning_rate": 2.9076892324795546e-07,
1892
+ "loss": 0.18175405263900757,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.864,
1897
+ "grad_norm": 0.5783244972157141,
1898
+ "learning_rate": 2.778338175551995e-07,
1899
+ "loss": 0.17646790742874147,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.8672,
1904
+ "grad_norm": 0.573282650162234,
1905
+ "learning_rate": 2.6517607013868326e-07,
1906
+ "loss": 0.18459818363189698,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.8704,
1911
+ "grad_norm": 0.6039696058732922,
1912
+ "learning_rate": 2.527972608672002e-07,
1913
+ "loss": 0.18084490299224854,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.8736,
1918
+ "grad_norm": 0.5916439702722857,
1919
+ "learning_rate": 2.40698934794053e-07,
1920
+ "loss": 0.18053301572799682,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.8768,
1925
+ "grad_norm": 0.5703451942226244,
1926
+ "learning_rate": 2.2888260196421237e-07,
1927
+ "loss": 0.1792958378791809,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.88,
1932
+ "grad_norm": 0.5672304805383847,
1933
+ "learning_rate": 2.1734973722583735e-07,
1934
+ "loss": 0.1819172501564026,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.8832,
1939
+ "grad_norm": 0.5784570642525821,
1940
+ "learning_rate": 2.0610178004619564e-07,
1941
+ "loss": 0.17332799434661866,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.8864,
1946
+ "grad_norm": 0.575451427907292,
1947
+ "learning_rate": 1.9514013433199834e-07,
1948
+ "loss": 0.18558990955352783,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.8896,
1953
+ "grad_norm": 0.5133461724908028,
1954
+ "learning_rate": 1.8446616825416958e-07,
1955
+ "loss": 0.18399085998535156,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.8928,
1960
+ "grad_norm": 0.6123280023323261,
1961
+ "learning_rate": 1.7408121407708007e-07,
1962
+ "loss": 0.1844745397567749,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.896,
1967
+ "grad_norm": 0.5761361465385083,
1968
+ "learning_rate": 1.6398656799226253e-07,
1969
+ "loss": 0.17304511070251466,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.8992,
1974
+ "grad_norm": 0.6034414454227958,
1975
+ "learning_rate": 1.5418348995662773e-07,
1976
+ "loss": 0.17871806621551514,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.9024,
1981
+ "grad_norm": 0.5923974971972374,
1982
+ "learning_rate": 1.4467320353520275e-07,
1983
+ "loss": 0.17667040824890137,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.9056,
1988
+ "grad_norm": 0.603734748014922,
1989
+ "learning_rate": 1.3545689574841341e-07,
1990
+ "loss": 0.1787508487701416,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.9088,
1995
+ "grad_norm": 0.5750783540393263,
1996
+ "learning_rate": 1.26535716923927e-07,
1997
+ "loss": 0.18438329696655273,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.912,
2002
+ "grad_norm": 0.5716942434142535,
2003
+ "learning_rate": 1.1791078055307493e-07,
2004
+ "loss": 0.1802410364151001,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.9152,
2009
+ "grad_norm": 0.6031535401501658,
2010
+ "learning_rate": 1.0958316315187289e-07,
2011
+ "loss": 0.17950894832611083,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.9184,
2016
+ "grad_norm": 0.5724651470732645,
2017
+ "learning_rate": 1.0155390412665528e-07,
2018
+ "loss": 0.17800890207290648,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.9216,
2023
+ "grad_norm": 0.5920847136083833,
2024
+ "learning_rate": 9.38240056443443e-08,
2025
+ "loss": 0.17559461593627929,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.9248,
2030
+ "grad_norm": 0.5600845233888927,
2031
+ "learning_rate": 8.639443250736402e-08,
2032
+ "loss": 0.17780338525772094,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.928,
2037
+ "grad_norm": 0.5760602589693042,
2038
+ "learning_rate": 7.926611203321777e-08,
2039
+ "loss": 0.1794909954071045,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.9312,
2044
+ "grad_norm": 0.59057677772977,
2045
+ "learning_rate": 7.243993393874882e-08,
2046
+ "loss": 0.1795297384262085,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.9344,
2051
+ "grad_norm": 0.5693422129621047,
2052
+ "learning_rate": 6.591675022908805e-08,
2053
+ "loss": 0.17676992416381837,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.9376,
2058
+ "grad_norm": 0.5656532345210596,
2059
+ "learning_rate": 5.969737509131241e-08,
2060
+ "loss": 0.17433459758758546,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.9408,
2065
+ "grad_norm": 0.5865348817236666,
2066
+ "learning_rate": 5.3782584792823334e-08,
2067
+ "loss": 0.1795581579208374,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.944,
2072
+ "grad_norm": 0.6034375830769324,
2073
+ "learning_rate": 4.817311758445686e-08,
2074
+ "loss": 0.18066773414611817,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.9472,
2079
+ "grad_norm": 0.598761782830776,
2080
+ "learning_rate": 4.286967360833866e-08,
2081
+ "loss": 0.1803189516067505,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.9504,
2086
+ "grad_norm": 0.5410244646488507,
2087
+ "learning_rate": 3.787291481049754e-08,
2088
+ "loss": 0.18075671195983886,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.9536,
2093
+ "grad_norm": 0.6102805369465131,
2094
+ "learning_rate": 3.3183464858244364e-08,
2095
+ "loss": 0.18705531358718872,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.9568,
2100
+ "grad_norm": 0.5798299084498433,
2101
+ "learning_rate": 2.8801909062328992e-08,
2102
+ "loss": 0.17331962585449218,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.96,
2107
+ "grad_norm": 0.5999449762716584,
2108
+ "learning_rate": 2.4728794303886248e-08,
2109
+ "loss": 0.17158935070037842,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.9632,
2114
+ "grad_norm": 0.6212882795186798,
2115
+ "learning_rate": 2.0964628966175794e-08,
2116
+ "loss": 0.17738908529281616,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.9664,
2121
+ "grad_norm": 0.564746561855876,
2122
+ "learning_rate": 1.750988287113009e-08,
2123
+ "loss": 0.17667733430862426,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.9696,
2128
+ "grad_norm": 0.5852806549215316,
2129
+ "learning_rate": 1.4364987220713278e-08,
2130
+ "loss": 0.18457986116409303,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.9728,
2135
+ "grad_norm": 0.5991233203919278,
2136
+ "learning_rate": 1.1530334543099763e-08,
2137
+ "loss": 0.18215363025665282,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.976,
2142
+ "grad_norm": 0.6041102228390866,
2143
+ "learning_rate": 9.006278643683697e-09,
2144
+ "loss": 0.18243587017059326,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.9792,
2149
+ "grad_norm": 0.5869697890802611,
2150
+ "learning_rate": 6.793134560916514e-09,
2151
+ "loss": 0.18486570119857787,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.9824,
2156
+ "grad_norm": 0.5595978682216465,
2157
+ "learning_rate": 4.891178526986451e-09,
2158
+ "loss": 0.18047856092453002,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.9856,
2163
+ "grad_norm": 0.5638404572903396,
2164
+ "learning_rate": 3.3006479333413943e-09,
2165
+ "loss": 0.18349089622497558,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.9888,
2170
+ "grad_norm": 0.5582534730189623,
2171
+ "learning_rate": 2.021741301058422e-09,
2172
+ "loss": 0.18032891750335694,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.992,
2177
+ "grad_norm": 0.5757824692806152,
2178
+ "learning_rate": 1.0546182560652872e-09,
2179
+ "loss": 0.1812995433807373,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.9952,
2184
+ "grad_norm": 0.5718406851297113,
2185
+ "learning_rate": 3.9939950921774607e-10,
2186
+ "loss": 0.17747504711151124,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.9984,
2191
+ "grad_norm": 0.549457935685087,
2192
+ "learning_rate": 5.616684123160854e-11,
2193
+ "loss": 0.17633507251739503,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 1.0,
2198
+ "step": 3125,
2199
+ "total_flos": 2477163648385024.0,
2200
+ "train_loss": 0.20598802658081056,
2201
+ "train_runtime": 35266.4791,
2202
+ "train_samples_per_second": 5.671,
2203
+ "train_steps_per_second": 0.089
2204
+ }
2205
+ ],
2206
+ "logging_steps": 10,
2207
+ "max_steps": 3125,
2208
+ "num_input_tokens_seen": 0,
2209
+ "num_train_epochs": 1,
2210
+ "save_steps": 500,
2211
+ "stateful_callbacks": {
2212
+ "TrainerControl": {
2213
+ "args": {
2214
+ "should_epoch_stop": false,
2215
+ "should_evaluate": false,
2216
+ "should_log": false,
2217
+ "should_save": true,
2218
+ "should_training_stop": true
2219
+ },
2220
+ "attributes": {}
2221
+ }
2222
+ },
2223
+ "total_flos": 2477163648385024.0,
2224
+ "train_batch_size": 4,
2225
+ "trial_name": null,
2226
+ "trial_params": null
2227
+ }
checkpoints/GLM-4.6V-Flash-SFT/training_loss.png ADDED
checkpoints/Gemma-4-E4B-it-SFT/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 1.0913057758773248e+16,
4
+ "train_loss": 0.7292402684783935,
5
+ "train_runtime": 30167.0559,
6
+ "train_samples_per_second": 6.63,
7
+ "train_steps_per_second": 0.104
8
+ }
checkpoints/Gemma-4-E4B-it-SFT/chat_template.jinja ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- macro format_parameters(properties, required) -%}
2
+ {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
3
+ {%- set ns = namespace(found_first=false) -%}
4
+ {%- for key, value in properties | dictsort -%}
5
+ {%- set add_comma = false -%}
6
+ {%- if key not in standard_keys -%}
7
+ {%- if ns.found_first %},{% endif -%}
8
+ {%- set ns.found_first = true -%}
9
+ {{ key }}:{
10
+ {%- if value['description'] -%}
11
+ description:<|"|>{{ value['description'] }}<|"|>
12
+ {%- set add_comma = true -%}
13
+ {%- endif -%}
14
+ {%- if value['nullable'] %}
15
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
16
+ nullable:true
17
+ {%- endif -%}
18
+ {%- if value['type'] | upper == 'STRING' -%}
19
+ {%- if value['enum'] -%}
20
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
21
+ enum:{{ format_argument(value['enum']) }}
22
+ {%- endif -%}
23
+ {%- elif value['type'] | upper == 'OBJECT' -%}
24
+ ,properties:{
25
+ {%- if value['properties'] is defined and value['properties'] is mapping -%}
26
+ {{- format_parameters(value['properties'], value['required'] | default([])) -}}
27
+ {%- elif value is mapping -%}
28
+ {{- format_parameters(value, value['required'] | default([])) -}}
29
+ {%- endif -%}
30
+ }
31
+ {%- if value['required'] -%}
32
+ ,required:[
33
+ {%- for item in value['required'] | default([]) -%}
34
+ <|"|>{{- item -}}<|"|>
35
+ {%- if not loop.last %},{% endif -%}
36
+ {%- endfor -%}
37
+ ]
38
+ {%- endif -%}
39
+ {%- elif value['type'] | upper == 'ARRAY' -%}
40
+ {%- if value['items'] is mapping and value['items'] -%}
41
+ ,items:{
42
+ {%- set ns_items = namespace(found_first=false) -%}
43
+ {%- for item_key, item_value in value['items'] | dictsort -%}
44
+ {%- if item_value is not none -%}
45
+ {%- if ns_items.found_first %},{% endif -%}
46
+ {%- set ns_items.found_first = true -%}
47
+ {%- if item_key == 'properties' -%}
48
+ properties:{
49
+ {%- if item_value is mapping -%}
50
+ {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
51
+ {%- endif -%}
52
+ }
53
+ {%- elif item_key == 'required' -%}
54
+ required:[
55
+ {%- for req_item in item_value -%}
56
+ <|"|>{{- req_item -}}<|"|>
57
+ {%- if not loop.last %},{% endif -%}
58
+ {%- endfor -%}
59
+ ]
60
+ {%- elif item_key == 'type' -%}
61
+ {%- if item_value is string -%}
62
+ type:{{ format_argument(item_value | upper) }}
63
+ {%- else -%}
64
+ type:{{ format_argument(item_value | map('upper') | list) }}
65
+ {%- endif -%}
66
+ {%- else -%}
67
+ {{ item_key }}:{{ format_argument(item_value) }}
68
+ {%- endif -%}
69
+ {%- endif -%}
70
+ {%- endfor -%}
71
+ }
72
+ {%- endif -%}
73
+ {%- endif -%}
74
+ {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
75
+ type:<|"|>{{ value['type'] | upper }}<|"|>}
76
+ {%- endif -%}
77
+ {%- endfor -%}
78
+ {%- endmacro -%}
79
+ {%- macro format_function_declaration(tool_data) -%}
80
+ declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
81
+ {%- set params = tool_data['function']['parameters'] -%}
82
+ {%- if params -%}
83
+ ,parameters:{
84
+ {%- if params['properties'] -%}
85
+ properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
86
+ {%- endif -%}
87
+ {%- if params['required'] -%}
88
+ required:[
89
+ {%- for item in params['required'] -%}
90
+ <|"|>{{- item -}}<|"|>
91
+ {{- ',' if not loop.last -}}
92
+ {%- endfor -%}
93
+ ],
94
+ {%- endif -%}
95
+ {%- if params['type'] -%}
96
+ type:<|"|>{{- params['type'] | upper -}}<|"|>}
97
+ {%- endif -%}
98
+ {%- endif -%}
99
+ {%- if 'response' in tool_data['function'] -%}
100
+ {%- set response_declaration = tool_data['function']['response'] -%}
101
+ ,response:{
102
+ {%- if response_declaration['description'] -%}
103
+ description:<|"|>{{- response_declaration['description'] -}}<|"|>,
104
+ {%- endif -%}
105
+ {%- if response_declaration['type'] | upper == 'OBJECT' -%}
106
+ type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
107
+ {%- endif -%}
108
+ {%- endif -%}
109
+ }
110
+ {%- endmacro -%}
111
+ {%- macro format_argument(argument, escape_keys=True) -%}
112
+ {%- if argument is string -%}
113
+ {{- '<|"|>' + argument + '<|"|>' -}}
114
+ {%- elif argument is boolean -%}
115
+ {{- 'true' if argument else 'false' -}}
116
+ {%- elif argument is mapping -%}
117
+ {{- '{' -}}
118
+ {%- set ns = namespace(found_first=false) -%}
119
+ {%- for key, value in argument | dictsort -%}
120
+ {%- if ns.found_first %},{% endif -%}
121
+ {%- set ns.found_first = true -%}
122
+ {%- if escape_keys -%}
123
+ {{- '<|"|>' + key + '<|"|>' -}}
124
+ {%- else -%}
125
+ {{- key -}}
126
+ {%- endif -%}
127
+ :{{- format_argument(value, escape_keys=escape_keys) -}}
128
+ {%- endfor -%}
129
+ {{- '}' -}}
130
+ {%- elif argument is sequence -%}
131
+ {{- '[' -}}
132
+ {%- for item in argument -%}
133
+ {{- format_argument(item, escape_keys=escape_keys) -}}
134
+ {%- if not loop.last %},{% endif -%}
135
+ {%- endfor -%}
136
+ {{- ']' -}}
137
+ {%- else -%}
138
+ {{- argument -}}
139
+ {%- endif -%}
140
+ {%- endmacro -%}
141
+ {%- macro strip_thinking(text) -%}
142
+ {%- set ns = namespace(result='') -%}
143
+ {%- for part in text.split('<channel|>') -%}
144
+ {%- if '<|channel>' in part -%}
145
+ {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
146
+ {%- else -%}
147
+ {%- set ns.result = ns.result + part -%}
148
+ {%- endif -%}
149
+ {%- endfor -%}
150
+ {{- ns.result | trim -}}
151
+ {%- endmacro -%}
152
+
153
+ {%- set ns = namespace(prev_message_type=None) -%}
154
+ {%- set loop_messages = messages -%}
155
+ {{ bos_token }}
156
+ {#- Handle System/Tool Definitions Block -#}
157
+ {%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
158
+ {{- '<|turn>system\n' -}}
159
+
160
+ {#- Inject Thinking token at the very top of the FIRST system turn -#}
161
+ {%- if enable_thinking is defined and enable_thinking -%}
162
+ {{- '<|think|>' -}}
163
+ {%- set ns.prev_message_type = 'think' -%}
164
+ {%- endif -%}
165
+
166
+ {%- if messages[0]['role'] in ['system', 'developer'] -%}
167
+ {{- messages[0]['content'] | trim -}}
168
+ {%- set loop_messages = messages[1:] -%}
169
+ {%- endif -%}
170
+
171
+ {%- if tools -%}
172
+ {%- for tool in tools %}
173
+ {{- '<|tool>' -}}
174
+ {{- format_function_declaration(tool) | trim -}}
175
+ {{- '<tool|>' -}}
176
+ {%- endfor %}
177
+ {%- set ns.prev_message_type = 'tool' -%}
178
+ {%- endif -%}
179
+
180
+ {{- '<turn|>\n' -}}
181
+ {%- endif %}
182
+
183
+ {#- Loop through messages -#}
184
+ {%- for message in loop_messages -%}
185
+ {%- set ns.prev_message_type = None -%}
186
+ {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
187
+ {{- '<|turn>' + role + '\n' }}
188
+
189
+ {%- if message['tool_calls'] -%}
190
+ {%- for tool_call in message['tool_calls'] -%}
191
+ {%- set function = tool_call['function'] -%}
192
+ {{- '<|tool_call>call:' + function['name'] + '{' -}}
193
+ {%- if function['arguments'] is mapping -%}
194
+ {%- set ns_args = namespace(found_first=false) -%}
195
+ {%- for key, value in function['arguments'] | dictsort -%}
196
+ {%- if ns_args.found_first %},{% endif -%}
197
+ {%- set ns_args.found_first = true -%}
198
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
199
+ {%- endfor -%}
200
+ {%- elif function['arguments'] is string -%}
201
+ {{- function['arguments'] -}}
202
+ {%- endif -%}
203
+ {{- '}<tool_call|>' -}}
204
+ {%- endfor -%}
205
+ {%- set ns.prev_message_type = 'tool_call' -%}
206
+ {%- endif -%}
207
+
208
+ {%- if message['tool_responses'] -%}
209
+ {#- Tool Response handling -#}
210
+ {%- for tool_response in message['tool_responses'] -%}
211
+ {{- '<|tool_response>' -}}
212
+ {%- if tool_response['response'] is mapping -%}
213
+ {{- 'response:' + tool_response['name'] | default('unknown') + '{' -}}
214
+ {%- for key, value in tool_response['response'] | dictsort -%}
215
+ {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
216
+ {%- if not loop.last %},{% endif -%}
217
+ {%- endfor -%}
218
+ {{- '}' -}}
219
+ {%- else -%}
220
+ {{- 'response:' + tool_response['name'] | default('unknown') + '{value:' + format_argument(tool_response['response'], escape_keys=False) + '}' -}}
221
+ {%- endif -%}
222
+ {{- '<tool_response|>' -}}
223
+ {%- endfor -%}
224
+ {%- set ns.prev_message_type = 'tool_response' -%}
225
+ {%- endif -%}
226
+
227
+ {%- if message['content'] is string -%}
228
+ {%- if role == 'model' -%}
229
+ {{- strip_thinking(message['content']) -}}
230
+ {%- else -%}
231
+ {{- message['content'] | trim -}}
232
+ {%- endif -%}
233
+ {%- elif message['content'] is sequence -%}
234
+ {%- for item in message['content'] -%}
235
+ {%- if item['type'] == 'text' -%}
236
+ {%- if role == 'model' -%}
237
+ {{- strip_thinking(item['text']) -}}
238
+ {%- else -%}
239
+ {{- item['text'] | trim -}}
240
+ {%- endif -%}
241
+ {%- elif item['type'] == 'image' -%}
242
+ {{- '\n\n<|image|>\n\n' -}}
243
+ {%- set ns.prev_message_type = 'image' -%}
244
+ {%- elif item['type'] == 'audio' -%}
245
+ {{- '<|audio|>' -}}
246
+ {%- set ns.prev_message_type = 'audio' -%}
247
+ {%- elif item['type'] == 'video' -%}
248
+ {{- '\n\n<|video|>\n\n' -}}
249
+ {%- set ns.prev_message_type = 'video' -%}
250
+ {%- endif -%}
251
+ {%- endfor -%}
252
+ {%- endif -%}
253
+
254
+ {%- if not (message['tool_responses'] and not message['content']) -%}
255
+ {{- '<turn|>\n' -}}
256
+ {%- endif -%}
257
+ {%- endfor -%}
258
+
259
+ {%- if add_generation_prompt -%}
260
+ {%- if ns.prev_message_type != 'tool_response' -%}
261
+ {{- '<|turn>model\n' -}}
262
+ {%- endif -%}
263
+ {%- endif -%}
checkpoints/Gemma-4-E4B-it-SFT/config.json ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Gemma4ForConditionalGeneration"
4
+ ],
5
+ "audio_config": {
6
+ "_name_or_path": "",
7
+ "architectures": null,
8
+ "attention_chunk_size": 12,
9
+ "attention_context_left": 13,
10
+ "attention_context_right": 0,
11
+ "attention_invalid_logits_value": -1000000000.0,
12
+ "attention_logit_cap": 50.0,
13
+ "chunk_size_feed_forward": 0,
14
+ "conv_kernel_size": 5,
15
+ "dtype": "bfloat16",
16
+ "gradient_clipping": 10000000000.0,
17
+ "hidden_act": "silu",
18
+ "hidden_size": 1024,
19
+ "id2label": {
20
+ "0": "LABEL_0",
21
+ "1": "LABEL_1"
22
+ },
23
+ "initializer_range": 0.02,
24
+ "is_encoder_decoder": false,
25
+ "label2id": {
26
+ "LABEL_0": 0,
27
+ "LABEL_1": 1
28
+ },
29
+ "model_type": "gemma4_audio",
30
+ "num_attention_heads": 8,
31
+ "num_hidden_layers": 12,
32
+ "output_attentions": false,
33
+ "output_hidden_states": false,
34
+ "output_proj_dims": 1536,
35
+ "problem_type": null,
36
+ "residual_weight": 0.5,
37
+ "return_dict": true,
38
+ "rms_norm_eps": 1e-06,
39
+ "subsampling_conv_channels": [
40
+ 128,
41
+ 32
42
+ ],
43
+ "use_clipped_linears": true
44
+ },
45
+ "audio_token_id": 258881,
46
+ "boa_token_id": 256000,
47
+ "boi_token_id": 255999,
48
+ "bos_token_id": 2,
49
+ "dtype": "bfloat16",
50
+ "eoa_token_id": 258883,
51
+ "eoa_token_index": 258883,
52
+ "eoi_token_id": 258882,
53
+ "eos_token_id": 106,
54
+ "hidden_size": 2560,
55
+ "image_token_id": 258880,
56
+ "initializer_range": 0.02,
57
+ "model_type": "gemma4",
58
+ "pad_token_id": 0,
59
+ "text_config": {
60
+ "attention_bias": false,
61
+ "attention_dropout": 0.0,
62
+ "attention_k_eq_v": false,
63
+ "bos_token_id": 2,
64
+ "dtype": "bfloat16",
65
+ "enable_moe_block": false,
66
+ "eos_token_id": 1,
67
+ "expert_intermediate_size": null,
68
+ "final_logit_softcapping": 30.0,
69
+ "global_head_dim": 512,
70
+ "head_dim": 256,
71
+ "hidden_activation": "gelu_pytorch_tanh",
72
+ "hidden_size": 2560,
73
+ "hidden_size_per_layer_input": 256,
74
+ "initializer_range": 0.02,
75
+ "intermediate_size": 10240,
76
+ "layer_types": [
77
+ "sliding_attention",
78
+ "sliding_attention",
79
+ "sliding_attention",
80
+ "sliding_attention",
81
+ "sliding_attention",
82
+ "full_attention",
83
+ "sliding_attention",
84
+ "sliding_attention",
85
+ "sliding_attention",
86
+ "sliding_attention",
87
+ "sliding_attention",
88
+ "full_attention",
89
+ "sliding_attention",
90
+ "sliding_attention",
91
+ "sliding_attention",
92
+ "sliding_attention",
93
+ "sliding_attention",
94
+ "full_attention",
95
+ "sliding_attention",
96
+ "sliding_attention",
97
+ "sliding_attention",
98
+ "sliding_attention",
99
+ "sliding_attention",
100
+ "full_attention",
101
+ "sliding_attention",
102
+ "sliding_attention",
103
+ "sliding_attention",
104
+ "sliding_attention",
105
+ "sliding_attention",
106
+ "full_attention",
107
+ "sliding_attention",
108
+ "sliding_attention",
109
+ "sliding_attention",
110
+ "sliding_attention",
111
+ "sliding_attention",
112
+ "full_attention",
113
+ "sliding_attention",
114
+ "sliding_attention",
115
+ "sliding_attention",
116
+ "sliding_attention",
117
+ "sliding_attention",
118
+ "full_attention"
119
+ ],
120
+ "max_position_embeddings": 131072,
121
+ "model_type": "gemma4_text",
122
+ "moe_intermediate_size": null,
123
+ "num_attention_heads": 8,
124
+ "num_experts": null,
125
+ "num_global_key_value_heads": null,
126
+ "num_hidden_layers": 42,
127
+ "num_key_value_heads": 2,
128
+ "num_kv_shared_layers": 18,
129
+ "pad_token_id": 0,
130
+ "rms_norm_eps": 1e-06,
131
+ "rope_parameters": {
132
+ "full_attention": {
133
+ "partial_rotary_factor": 0.25,
134
+ "rope_theta": 1000000.0,
135
+ "rope_type": "proportional"
136
+ },
137
+ "sliding_attention": {
138
+ "rope_theta": 10000.0,
139
+ "rope_type": "default"
140
+ }
141
+ },
142
+ "sliding_window": 512,
143
+ "tie_word_embeddings": true,
144
+ "top_k_experts": null,
145
+ "use_bidirectional_attention": null,
146
+ "use_cache": false,
147
+ "use_double_wide_mlp": false,
148
+ "vocab_size": 262144,
149
+ "vocab_size_per_layer_input": 262144
150
+ },
151
+ "tie_word_embeddings": true,
152
+ "transformers_version": "5.5.3",
153
+ "use_cache": false,
154
+ "video_token_id": 258884,
155
+ "vision_config": {
156
+ "_name_or_path": "",
157
+ "architectures": null,
158
+ "attention_bias": false,
159
+ "attention_dropout": 0.0,
160
+ "chunk_size_feed_forward": 0,
161
+ "default_output_length": 280,
162
+ "dtype": "bfloat16",
163
+ "global_head_dim": 64,
164
+ "head_dim": 64,
165
+ "hidden_activation": "gelu_pytorch_tanh",
166
+ "hidden_size": 768,
167
+ "id2label": {
168
+ "0": "LABEL_0",
169
+ "1": "LABEL_1"
170
+ },
171
+ "initializer_range": 0.02,
172
+ "intermediate_size": 3072,
173
+ "is_encoder_decoder": false,
174
+ "label2id": {
175
+ "LABEL_0": 0,
176
+ "LABEL_1": 1
177
+ },
178
+ "max_position_embeddings": 131072,
179
+ "model_type": "gemma4_vision",
180
+ "num_attention_heads": 12,
181
+ "num_hidden_layers": 16,
182
+ "num_key_value_heads": 12,
183
+ "output_attentions": false,
184
+ "output_hidden_states": false,
185
+ "patch_size": 16,
186
+ "pooling_kernel_size": 3,
187
+ "position_embedding_size": 10240,
188
+ "problem_type": null,
189
+ "return_dict": true,
190
+ "rms_norm_eps": 1e-06,
191
+ "rope_parameters": {
192
+ "rope_theta": 100.0,
193
+ "rope_type": "default"
194
+ },
195
+ "standardize": false,
196
+ "use_clipped_linears": true
197
+ },
198
+ "vision_soft_tokens_per_image": 280
199
+ }
checkpoints/Gemma-4-E4B-it-SFT/eval_results_job_gemma_gemma_4_e4b_20260430_011024.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mae_dx": 0.48666724137931033,
3
+ "rmse_dx": 1.0707492462177417,
4
+ "mae_dy": 0.3855034482758621,
5
+ "rmse_dy": 0.7843001492655289,
6
+ "mae_dz": 0.04997413793103449,
7
+ "rmse_dz": 0.156602120477122,
8
+ "mae_dpitch": 0.9934068965517242,
9
+ "rmse_dpitch": 1.7330746049166195,
10
+ "mae_dyaw": 2.2219862068965517,
11
+ "rmse_dyaw": 3.906024586323736,
12
+ "mae_droll": 0.0,
13
+ "rmse_droll": 0.0,
14
+ "mae_overall": 0.6895896551724138,
15
+ "mae_position": 0.30738160919540225,
16
+ "mae_rotation": 1.0717977011494253,
17
+ "rmse_overall": 1.8278735619896387,
18
+ "wp1_euc_mae": 0.2665493636964831,
19
+ "wp1_euc_median": 0.18,
20
+ "wp2_euc_mae": 0.5012943438070621,
21
+ "wp2_euc_median": 0.31144823004794875,
22
+ "wp3_euc_mae": 0.7271333853911885,
23
+ "wp3_euc_median": 0.48,
24
+ "wp4_euc_mae": 0.958032444080531,
25
+ "wp4_euc_median": 0.6351377754492935,
26
+ "wp5_euc_mae": 1.1876023185914943,
27
+ "wp5_euc_median": 0.7778817364281356,
28
+ "euclidean_mae": 0.7281223711133517,
29
+ "ADE": 0.7281223711133519,
30
+ "FDE": 1.1876023185914943,
31
+ "ADE_median": 0.49122803576716423,
32
+ "FDE_median": 0.7778817364281356,
33
+ "SR@0.5m": 0.5736206896551724,
34
+ "SR@1.0m": 0.783448275862069,
35
+ "SR@2.0m": 0.9222413793103448,
36
+ "SR@5.0m": 0.9898275862068966,
37
+ "TrajSR@1.0m": 0.5887931034482758,
38
+ "TrajSR@2.0m": 0.8353448275862069,
39
+ "TrajSR@5.0m": 0.9724137931034482,
40
+ "RotAcc@1.0deg": 0.39948275862068966,
41
+ "RotAcc@5.0deg": 0.83,
42
+ "RotAcc@10.0deg": 0.9762068965517241,
43
+ "wp1_rot_mae": 1.8561397413473146,
44
+ "wp2_rot_mae": 2.249132034716281,
45
+ "wp3_rot_mae": 2.6355453352548355,
46
+ "wp4_rot_mae": 3.048629056478642,
47
+ "wp5_rot_mae": 3.45811827126434,
48
+ "rotation_euc_mae": 2.6495128878122824,
49
+ "parse_failure_rate": 0.0,
50
+ "parse_success_rate": 1.0,
51
+ "valid_samples": 1160,
52
+ "total_samples": 1160,
53
+ "parse_failures": 0,
54
+ "inference_engine": "vllm",
55
+ "vllm_version": "0.19.0"
56
+ }
checkpoints/Gemma-4-E4B-it-SFT/generation_config.json ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 106,
6
+ 1,
7
+ 106,
8
+ 50
9
+ ],
10
+ "pad_token_id": 0,
11
+ "temperature": 1.0,
12
+ "top_k": 64,
13
+ "top_p": 0.95,
14
+ "transformers_version": "5.5.3"
15
+ }
checkpoints/Gemma-4-E4B-it-SFT/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa115532595f57272ed0b16337a23de6762ffa60ab858147f5f51f1cff34105b
3
+ size 15992595884
checkpoints/Gemma-4-E4B-it-SFT/processor_config.json ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_ms_per_token": 40,
3
+ "audio_seq_length": 750,
4
+ "feature_extractor": {
5
+ "dither": 0.0,
6
+ "feature_extractor_type": "Gemma4AudioFeatureExtractor",
7
+ "feature_size": 128,
8
+ "fft_length": 512,
9
+ "fft_overdrive": false,
10
+ "frame_length": 320,
11
+ "hop_length": 160,
12
+ "input_scale_factor": 1.0,
13
+ "max_frequency": 8000.0,
14
+ "mel_floor": 0.001,
15
+ "min_frequency": 0.0,
16
+ "padding_side": "right",
17
+ "padding_value": 0.0,
18
+ "per_bin_mean": null,
19
+ "per_bin_stddev": null,
20
+ "preemphasis": 0.0,
21
+ "preemphasis_htk_flavor": true,
22
+ "return_attention_mask": true,
23
+ "sampling_rate": 16000
24
+ },
25
+ "image_processor": {
26
+ "do_convert_rgb": true,
27
+ "do_normalize": false,
28
+ "do_rescale": true,
29
+ "do_resize": true,
30
+ "image_mean": [
31
+ 0.0,
32
+ 0.0,
33
+ 0.0
34
+ ],
35
+ "image_processor_type": "Gemma4ImageProcessor",
36
+ "image_seq_length": 280,
37
+ "image_std": [
38
+ 1.0,
39
+ 1.0,
40
+ 1.0
41
+ ],
42
+ "max_soft_tokens": 280,
43
+ "patch_size": 16,
44
+ "pooling_kernel_size": 3,
45
+ "resample": 3,
46
+ "rescale_factor": 0.00392156862745098
47
+ },
48
+ "image_seq_length": 280,
49
+ "processor_class": "Gemma4Processor",
50
+ "video_processor": {
51
+ "do_convert_rgb": true,
52
+ "do_normalize": true,
53
+ "do_rescale": true,
54
+ "do_resize": true,
55
+ "do_sample_frames": true,
56
+ "image_mean": [
57
+ 0.0,
58
+ 0.0,
59
+ 0.0
60
+ ],
61
+ "image_std": [
62
+ 1.0,
63
+ 1.0,
64
+ 1.0
65
+ ],
66
+ "max_soft_tokens": 70,
67
+ "num_frames": 32,
68
+ "patch_size": 16,
69
+ "pooling_kernel_size": 3,
70
+ "resample": 3,
71
+ "rescale_factor": 0.00392156862745098,
72
+ "return_metadata": false,
73
+ "video_processor_type": "Gemma4VideoProcessor"
74
+ }
75
+ }
checkpoints/Gemma-4-E4B-it-SFT/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc8d3a0ce36466ccc1278bf987df5f71db1719b9ca6b4118264f45cb627bfe0f
3
+ size 32169626
checkpoints/Gemma-4-E4B-it-SFT/tokenizer_config.json ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_token": "<|audio|>",
3
+ "backend": "tokenizers",
4
+ "boa_token": "<|audio>",
5
+ "boi_token": "<|image>",
6
+ "bos_token": "<bos>",
7
+ "eoa_token": "<audio|>",
8
+ "eoc_token": "<channel|>",
9
+ "eoi_token": "<image|>",
10
+ "eos_token": "<turn|>",
11
+ "eot_token": "<turn|>",
12
+ "escape_token": "<|\"|>",
13
+ "etc_token": "<tool_call|>",
14
+ "etd_token": "<tool|>",
15
+ "etr_token": "<tool_response|>",
16
+ "extra_special_tokens": [
17
+ "<|video|>"
18
+ ],
19
+ "image_token": "<|image|>",
20
+ "is_local": true,
21
+ "mask_token": "<mask>",
22
+ "model_max_length": 1000000000000000019884624838656,
23
+ "model_specific_special_tokens": {
24
+ "audio_token": "<|audio|>",
25
+ "boa_token": "<|audio>",
26
+ "boi_token": "<|image>",
27
+ "eoa_token": "<audio|>",
28
+ "eoc_token": "<channel|>",
29
+ "eoi_token": "<image|>",
30
+ "eot_token": "<turn|>",
31
+ "escape_token": "<|\"|>",
32
+ "etc_token": "<tool_call|>",
33
+ "etd_token": "<tool|>",
34
+ "etr_token": "<tool_response|>",
35
+ "image_token": "<|image|>",
36
+ "soc_token": "<|channel>",
37
+ "sot_token": "<|turn>",
38
+ "stc_token": "<|tool_call>",
39
+ "std_token": "<|tool>",
40
+ "str_token": "<|tool_response>",
41
+ "think_token": "<|think|>"
42
+ },
43
+ "pad_token": "<pad>",
44
+ "padding_side": "right",
45
+ "processor_class": "Gemma4Processor",
46
+ "response_schema": {
47
+ "properties": {
48
+ "content": {
49
+ "type": "string"
50
+ },
51
+ "role": {
52
+ "const": "assistant"
53
+ },
54
+ "thinking": {
55
+ "type": "string"
56
+ },
57
+ "tool_calls": {
58
+ "items": {
59
+ "properties": {
60
+ "function": {
61
+ "properties": {
62
+ "arguments": {
63
+ "additionalProperties": {},
64
+ "type": "object",
65
+ "x-parser": "gemma4-tool-call"
66
+ },
67
+ "name": {
68
+ "type": "string"
69
+ }
70
+ },
71
+ "type": "object",
72
+ "x-regex": "call\\:(?P<name>\\w+)(?P<arguments>\\{.*\\})"
73
+ },
74
+ "type": {
75
+ "const": "function"
76
+ }
77
+ },
78
+ "type": "object"
79
+ },
80
+ "type": "array",
81
+ "x-regex-iterator": "<\\|tool_call>(.*?)<tool_call\\|>"
82
+ }
83
+ },
84
+ "type": "object",
85
+ "x-regex": "(\\<\\|channel\\>thought\\n(?P<thinking>.*?)\\<channel\\|\\>)?(?P<content>(?:(?!\\<\\|tool_call\\>)(?!\\<turn\\|\\>).)+)?(?P<tool_calls>\\<\\|tool_call\\>.*\\<tool_call\\|\\>)?(?:\\<turn\\|\\>)?"
86
+ },
87
+ "soc_token": "<|channel>",
88
+ "sot_token": "<|turn>",
89
+ "split_special_tokens": false,
90
+ "stc_token": "<|tool_call>",
91
+ "std_token": "<|tool>",
92
+ "str_token": "<|tool_response>",
93
+ "think_token": "<|think|>",
94
+ "tokenizer_class": "GemmaTokenizer",
95
+ "unk_token": "<unk>"
96
+ }
checkpoints/Gemma-4-E4B-it-SFT/train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 1.0913057758773248e+16,
4
+ "train_loss": 0.7292402684783935,
5
+ "train_runtime": 30167.0559,
6
+ "train_samples_per_second": 6.63,
7
+ "train_steps_per_second": 0.104
8
+ }
checkpoints/Gemma-4-E4B-it-SFT/trainer_state.json ADDED
@@ -0,0 +1,2227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3125,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0032,
14
+ "grad_norm": 366.0841096744857,
15
+ "learning_rate": 1.437699680511182e-07,
16
+ "loss": 23.85431823730469,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.0064,
21
+ "grad_norm": 367.47333882445946,
22
+ "learning_rate": 3.0351437699680514e-07,
23
+ "loss": 23.65589599609375,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.0096,
28
+ "grad_norm": 367.96579270464326,
29
+ "learning_rate": 4.6325878594249205e-07,
30
+ "loss": 22.780029296875,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.0128,
35
+ "grad_norm": 332.5732884154056,
36
+ "learning_rate": 6.230031948881789e-07,
37
+ "loss": 20.279689025878906,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.016,
42
+ "grad_norm": 219.53674756423746,
43
+ "learning_rate": 7.82747603833866e-07,
44
+ "loss": 15.498806762695313,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.0192,
49
+ "grad_norm": 156.1487544830451,
50
+ "learning_rate": 9.424920127795528e-07,
51
+ "loss": 10.388201904296874,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.0224,
56
+ "grad_norm": 37.96869040917498,
57
+ "learning_rate": 1.1022364217252397e-06,
58
+ "loss": 3.7560958862304688,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.0256,
63
+ "grad_norm": 16.783464772614202,
64
+ "learning_rate": 1.2619808306709266e-06,
65
+ "loss": 2.033830261230469,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.0288,
70
+ "grad_norm": 5.438256169634593,
71
+ "learning_rate": 1.4217252396166134e-06,
72
+ "loss": 1.0431390762329102,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.032,
77
+ "grad_norm": 3.6935246150045775,
78
+ "learning_rate": 1.5814696485623005e-06,
79
+ "loss": 0.8069572448730469,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.0352,
84
+ "grad_norm": 9.218312544625562,
85
+ "learning_rate": 1.7412140575079875e-06,
86
+ "loss": 0.7057615280151367,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.0384,
91
+ "grad_norm": 5.394484238866305,
92
+ "learning_rate": 1.9009584664536742e-06,
93
+ "loss": 0.6301750183105469,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.0416,
98
+ "grad_norm": 6.577481237217732,
99
+ "learning_rate": 2.060702875399361e-06,
100
+ "loss": 0.5898516654968262,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.0448,
105
+ "grad_norm": 3.4158074483641068,
106
+ "learning_rate": 2.220447284345048e-06,
107
+ "loss": 0.5524418830871582,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.048,
112
+ "grad_norm": 4.032046521040006,
113
+ "learning_rate": 2.380191693290735e-06,
114
+ "loss": 0.5317594051361084,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.0512,
119
+ "grad_norm": 5.468634675306576,
120
+ "learning_rate": 2.539936102236422e-06,
121
+ "loss": 0.5184277534484864,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.0544,
126
+ "grad_norm": 3.4313124951156424,
127
+ "learning_rate": 2.699680511182109e-06,
128
+ "loss": 0.5204483985900878,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.0576,
133
+ "grad_norm": 5.13400179254009,
134
+ "learning_rate": 2.8594249201277955e-06,
135
+ "loss": 0.5058025360107422,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.0608,
140
+ "grad_norm": 5.9183424216837786,
141
+ "learning_rate": 3.0191693290734825e-06,
142
+ "loss": 0.5073411941528321,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.064,
147
+ "grad_norm": 5.625073986187664,
148
+ "learning_rate": 3.17891373801917e-06,
149
+ "loss": 0.5000103950500489,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.0672,
154
+ "grad_norm": 5.050603467051007,
155
+ "learning_rate": 3.3386581469648564e-06,
156
+ "loss": 0.488192081451416,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.0704,
161
+ "grad_norm": 11.776866822937645,
162
+ "learning_rate": 3.4984025559105434e-06,
163
+ "loss": 0.48699202537536623,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.0736,
168
+ "grad_norm": 7.438900018795585,
169
+ "learning_rate": 3.6581469648562303e-06,
170
+ "loss": 0.4820102691650391,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.0768,
175
+ "grad_norm": 4.3491840646532065,
176
+ "learning_rate": 3.817891373801918e-06,
177
+ "loss": 0.47640199661254884,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.08,
182
+ "grad_norm": 3.472565426233091,
183
+ "learning_rate": 3.977635782747604e-06,
184
+ "loss": 0.4729574203491211,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.0832,
189
+ "grad_norm": 3.1912744148161942,
190
+ "learning_rate": 4.137380191693291e-06,
191
+ "loss": 0.4786433219909668,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.0864,
196
+ "grad_norm": 3.9698013424470777,
197
+ "learning_rate": 4.297124600638978e-06,
198
+ "loss": 0.4748369216918945,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.0896,
203
+ "grad_norm": 8.11949393489321,
204
+ "learning_rate": 4.456869009584665e-06,
205
+ "loss": 0.4681865692138672,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.0928,
210
+ "grad_norm": 4.7349566199381234,
211
+ "learning_rate": 4.616613418530352e-06,
212
+ "loss": 0.46743001937866213,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.096,
217
+ "grad_norm": 4.756427284033883,
218
+ "learning_rate": 4.776357827476039e-06,
219
+ "loss": 0.46964178085327146,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.0992,
224
+ "grad_norm": 4.86570605379029,
225
+ "learning_rate": 4.936102236421725e-06,
226
+ "loss": 0.45612516403198244,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.1024,
231
+ "grad_norm": 5.762654788054032,
232
+ "learning_rate": 4.999943833158769e-06,
233
+ "loss": 0.45009474754333495,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.1056,
238
+ "grad_norm": 3.501477346053355,
239
+ "learning_rate": 4.999600600490783e-06,
240
+ "loss": 0.4523477554321289,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.1088,
245
+ "grad_norm": 7.957279740713588,
246
+ "learning_rate": 4.9989453817439345e-06,
247
+ "loss": 0.44190473556518556,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.112,
252
+ "grad_norm": 7.660308885793361,
253
+ "learning_rate": 4.997978258698942e-06,
254
+ "loss": 0.43758931159973147,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.1152,
259
+ "grad_norm": 5.8839479464224205,
260
+ "learning_rate": 4.996699352066659e-06,
261
+ "loss": 0.4371060371398926,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.1184,
266
+ "grad_norm": 3.452842882877267,
267
+ "learning_rate": 4.995108821473014e-06,
268
+ "loss": 0.42999753952026365,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.1216,
273
+ "grad_norm": 4.825810317520427,
274
+ "learning_rate": 4.993206865439084e-06,
275
+ "loss": 0.4285894393920898,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.1248,
280
+ "grad_norm": 5.379766821254966,
281
+ "learning_rate": 4.990993721356317e-06,
282
+ "loss": 0.42139811515808107,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.128,
287
+ "grad_norm": 4.854730410799869,
288
+ "learning_rate": 4.988469665456901e-06,
289
+ "loss": 0.42040281295776366,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.1312,
294
+ "grad_norm": 4.6616615938661745,
295
+ "learning_rate": 4.985635012779288e-06,
296
+ "loss": 0.4207456588745117,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.1344,
301
+ "grad_norm": 4.5341296475975605,
302
+ "learning_rate": 4.98249011712887e-06,
303
+ "loss": 0.414472770690918,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.1376,
308
+ "grad_norm": 5.217437981869656,
309
+ "learning_rate": 4.979035371033824e-06,
310
+ "loss": 0.41441006660461427,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.1408,
315
+ "grad_norm": 3.561516924716779,
316
+ "learning_rate": 4.975271205696115e-06,
317
+ "loss": 0.40767755508422854,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.144,
322
+ "grad_norm": 3.815692337476438,
323
+ "learning_rate": 4.971198090937671e-06,
324
+ "loss": 0.3997596263885498,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.1472,
329
+ "grad_norm": 4.559242371997167,
330
+ "learning_rate": 4.966816535141756e-06,
331
+ "loss": 0.39360842704772947,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.1504,
336
+ "grad_norm": 3.432229350472061,
337
+ "learning_rate": 4.9621270851895035e-06,
338
+ "loss": 0.40289998054504395,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.1536,
343
+ "grad_norm": 5.375227134041046,
344
+ "learning_rate": 4.957130326391662e-06,
345
+ "loss": 0.3982266664505005,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.1568,
350
+ "grad_norm": 5.539585521677851,
351
+ "learning_rate": 4.951826882415544e-06,
352
+ "loss": 0.39270691871643065,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.16,
357
+ "grad_norm": 3.4147092253345743,
358
+ "learning_rate": 4.946217415207177e-06,
359
+ "loss": 0.3853750705718994,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.1632,
364
+ "grad_norm": 4.444175842440995,
365
+ "learning_rate": 4.940302624908689e-06,
366
+ "loss": 0.38694162368774415,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.1664,
371
+ "grad_norm": 3.3493207902303475,
372
+ "learning_rate": 4.934083249770912e-06,
373
+ "loss": 0.3797153949737549,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.1696,
378
+ "grad_norm": 3.0499194254019097,
379
+ "learning_rate": 4.927560066061251e-06,
380
+ "loss": 0.38063654899597166,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.1728,
385
+ "grad_norm": 3.141871281336489,
386
+ "learning_rate": 4.920733887966783e-06,
387
+ "loss": 0.39005699157714846,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.176,
392
+ "grad_norm": 3.979297184951908,
393
+ "learning_rate": 4.913605567492636e-06,
394
+ "loss": 0.38013472557067873,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.1792,
399
+ "grad_norm": 3.7669251986704113,
400
+ "learning_rate": 4.906175994355656e-06,
401
+ "loss": 0.37832577228546144,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.1824,
406
+ "grad_norm": 2.983798431857085,
407
+ "learning_rate": 4.898446095873345e-06,
408
+ "loss": 0.38150479793548586,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.1856,
413
+ "grad_norm": 3.657787030439589,
414
+ "learning_rate": 4.890416836848128e-06,
415
+ "loss": 0.3775670528411865,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.1888,
420
+ "grad_norm": 3.551048022748126,
421
+ "learning_rate": 4.882089219446925e-06,
422
+ "loss": 0.37199065685272215,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.192,
427
+ "grad_norm": 4.750977601329729,
428
+ "learning_rate": 4.873464283076074e-06,
429
+ "loss": 0.3790221452713013,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.1952,
434
+ "grad_norm": 7.684545118387627,
435
+ "learning_rate": 4.864543104251587e-06,
436
+ "loss": 0.37508673667907716,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.1984,
441
+ "grad_norm": 5.872575231845199,
442
+ "learning_rate": 4.855326796464798e-06,
443
+ "loss": 0.3811868906021118,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.2016,
448
+ "grad_norm": 3.9960144706794316,
449
+ "learning_rate": 4.8458165100433725e-06,
450
+ "loss": 0.37326750755310056,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.2048,
455
+ "grad_norm": 3.9998452581157657,
456
+ "learning_rate": 4.836013432007738e-06,
457
+ "loss": 0.3709099769592285,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.208,
462
+ "grad_norm": 2.6973135018594343,
463
+ "learning_rate": 4.825918785922921e-06,
464
+ "loss": 0.3728507995605469,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.2112,
469
+ "grad_norm": 4.478756132604264,
470
+ "learning_rate": 4.8155338317458315e-06,
471
+ "loss": 0.36782591342926024,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.2144,
476
+ "grad_norm": 2.5620662799375378,
477
+ "learning_rate": 4.804859865668002e-06,
478
+ "loss": 0.36416780948638916,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.2176,
483
+ "grad_norm": 2.9398359151969884,
484
+ "learning_rate": 4.793898219953804e-06,
485
+ "loss": 0.36772732734680175,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.2208,
490
+ "grad_norm": 3.404020172068192,
491
+ "learning_rate": 4.782650262774164e-06,
492
+ "loss": 0.3651688575744629,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.224,
497
+ "grad_norm": 2.588678061474319,
498
+ "learning_rate": 4.7711173980357886e-06,
499
+ "loss": 0.3649880409240723,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.2272,
504
+ "grad_norm": 3.5390276900279773,
505
+ "learning_rate": 4.759301065205947e-06,
506
+ "loss": 0.3612825870513916,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.2304,
511
+ "grad_norm": 3.8670986814196473,
512
+ "learning_rate": 4.7472027391328e-06,
513
+ "loss": 0.3657612085342407,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.2336,
518
+ "grad_norm": 3.0276354554801217,
519
+ "learning_rate": 4.734823929861317e-06,
520
+ "loss": 0.36682844161987305,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.2368,
525
+ "grad_norm": 5.205227283770371,
526
+ "learning_rate": 4.722166182444801e-06,
527
+ "loss": 0.3605961322784424,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.24,
532
+ "grad_norm": 3.1037248816470737,
533
+ "learning_rate": 4.709231076752045e-06,
534
+ "loss": 0.3625338554382324,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.2432,
539
+ "grad_norm": 3.827009314178272,
540
+ "learning_rate": 4.696020227270142e-06,
541
+ "loss": 0.36273531913757323,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.2464,
546
+ "grad_norm": 2.553717481812464,
547
+ "learning_rate": 4.6825352829029705e-06,
548
+ "loss": 0.35740270614624026,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.2496,
553
+ "grad_norm": 2.8273485176739563,
554
+ "learning_rate": 4.668777926765392e-06,
555
+ "loss": 0.3613132953643799,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.2528,
560
+ "grad_norm": 3.242165291552063,
561
+ "learning_rate": 4.6547498759731725e-06,
562
+ "loss": 0.3525214672088623,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.256,
567
+ "grad_norm": 2.607635187753211,
568
+ "learning_rate": 4.6404528814286575e-06,
569
+ "loss": 0.3569283723831177,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.2592,
574
+ "grad_norm": 3.2439792578606204,
575
+ "learning_rate": 4.6258887276022425e-06,
576
+ "loss": 0.357681941986084,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.2624,
581
+ "grad_norm": 2.9728036180938284,
582
+ "learning_rate": 4.611059232309639e-06,
583
+ "loss": 0.3537192106246948,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.2656,
588
+ "grad_norm": 2.556165398739607,
589
+ "learning_rate": 4.595966246484986e-06,
590
+ "loss": 0.3528641700744629,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.2688,
595
+ "grad_norm": 2.593548528246384,
596
+ "learning_rate": 4.580611653949829e-06,
597
+ "loss": 0.3564203500747681,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.272,
602
+ "grad_norm": 3.428440109671292,
603
+ "learning_rate": 4.564997371177992e-06,
604
+ "loss": 0.3518026828765869,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.2752,
609
+ "grad_norm": 4.993564850548027,
610
+ "learning_rate": 4.54912534705637e-06,
611
+ "loss": 0.35079920291900635,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.2784,
616
+ "grad_norm": 3.340510283095063,
617
+ "learning_rate": 4.532997562641683e-06,
618
+ "loss": 0.3466078042984009,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.2816,
623
+ "grad_norm": 2.6894615056191644,
624
+ "learning_rate": 4.516616030913214e-06,
625
+ "loss": 0.3472653865814209,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.2848,
630
+ "grad_norm": 1.891440124594712,
631
+ "learning_rate": 4.499982796521556e-06,
632
+ "loss": 0.34483723640441893,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.288,
637
+ "grad_norm": 3.223309297530686,
638
+ "learning_rate": 4.48309993553341e-06,
639
+ "loss": 0.3444544553756714,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.2912,
644
+ "grad_norm": 3.1032077209020468,
645
+ "learning_rate": 4.465969555172468e-06,
646
+ "loss": 0.34571564197540283,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.2944,
651
+ "grad_norm": 2.5407458837926638,
652
+ "learning_rate": 4.448593793556391e-06,
653
+ "loss": 0.3534140110015869,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.2976,
658
+ "grad_norm": 3.1253686498979123,
659
+ "learning_rate": 4.430974819429954e-06,
660
+ "loss": 0.3445676326751709,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.3008,
665
+ "grad_norm": 3.740083740472538,
666
+ "learning_rate": 4.413114831894344e-06,
667
+ "loss": 0.33962287902832033,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.304,
672
+ "grad_norm": 4.724023923665093,
673
+ "learning_rate": 4.3950160601326865e-06,
674
+ "loss": 0.3363780498504639,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.3072,
679
+ "grad_norm": 3.597276867142834,
680
+ "learning_rate": 4.376680763131811e-06,
681
+ "loss": 0.3429840087890625,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.3104,
686
+ "grad_norm": 2.97998267012516,
687
+ "learning_rate": 4.358111229400296e-06,
688
+ "loss": 0.3470882177352905,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.3136,
693
+ "grad_norm": 3.1405275857331856,
694
+ "learning_rate": 4.33930977668283e-06,
695
+ "loss": 0.35235731601715087,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.3168,
700
+ "grad_norm": 3.774584318253359,
701
+ "learning_rate": 4.320278751670922e-06,
702
+ "loss": 0.3418004512786865,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.32,
707
+ "grad_norm": 3.4325438208492605,
708
+ "learning_rate": 4.301020529710009e-06,
709
+ "loss": 0.3456583499908447,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.3232,
714
+ "grad_norm": 3.1407187711443916,
715
+ "learning_rate": 4.281537514502962e-06,
716
+ "loss": 0.3446167469024658,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.3264,
721
+ "grad_norm": 2.6154317834679226,
722
+ "learning_rate": 4.261832137810093e-06,
723
+ "loss": 0.34354138374328613,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.3296,
728
+ "grad_norm": 2.8993376261822648,
729
+ "learning_rate": 4.241906859145611e-06,
730
+ "loss": 0.3451784372329712,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.3328,
735
+ "grad_norm": 2.3351853591260574,
736
+ "learning_rate": 4.221764165470661e-06,
737
+ "loss": 0.33875834941864014,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.336,
742
+ "grad_norm": 3.4295735539049605,
743
+ "learning_rate": 4.201406570882898e-06,
744
+ "loss": 0.33980226516723633,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.3392,
749
+ "grad_norm": 2.6388634367096735,
750
+ "learning_rate": 4.180836616302704e-06,
751
+ "loss": 0.3395829200744629,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.3424,
756
+ "grad_norm": 3.211009486395674,
757
+ "learning_rate": 4.160056869156041e-06,
758
+ "loss": 0.3433471441268921,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.3456,
763
+ "grad_norm": 3.4377414857289317,
764
+ "learning_rate": 4.139069923053995e-06,
765
+ "loss": 0.34047765731811525,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.3488,
770
+ "grad_norm": 3.131466112366247,
771
+ "learning_rate": 4.117878397469062e-06,
772
+ "loss": 0.3420018434524536,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.352,
777
+ "grad_norm": 2.388207923072635,
778
+ "learning_rate": 4.096484937408195e-06,
779
+ "loss": 0.3351470470428467,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.3552,
784
+ "grad_norm": 2.2910707329028117,
785
+ "learning_rate": 4.074892213082676e-06,
786
+ "loss": 0.33539299964904784,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.3584,
791
+ "grad_norm": 2.156244058261874,
792
+ "learning_rate": 4.0531029195748265e-06,
793
+ "loss": 0.33862009048461916,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.3616,
798
+ "grad_norm": 2.6382644444406296,
799
+ "learning_rate": 4.03111977650163e-06,
800
+ "loss": 0.34041495323181153,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.3648,
805
+ "grad_norm": 2.5960896388831545,
806
+ "learning_rate": 4.008945527675281e-06,
807
+ "loss": 0.3390871524810791,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.368,
812
+ "grad_norm": 3.657074741484568,
813
+ "learning_rate": 3.986582940760717e-06,
814
+ "loss": 0.3278806209564209,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.3712,
819
+ "grad_norm": 2.9587401358526075,
820
+ "learning_rate": 3.9640348069301785e-06,
821
+ "loss": 0.3368961334228516,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.3744,
826
+ "grad_norm": 1.965300565427372,
827
+ "learning_rate": 3.941303940514826e-06,
828
+ "loss": 0.3339808464050293,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.3776,
833
+ "grad_norm": 2.90985435283837,
834
+ "learning_rate": 3.918393178653472e-06,
835
+ "loss": 0.3376065969467163,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.3808,
840
+ "grad_norm": 3.27190473511409,
841
+ "learning_rate": 3.895305380938468e-06,
842
+ "loss": 0.3342454433441162,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.384,
847
+ "grad_norm": 2.0468253424433165,
848
+ "learning_rate": 3.872043429058783e-06,
849
+ "loss": 0.32965447902679446,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.3872,
854
+ "grad_norm": 2.5123150680001576,
855
+ "learning_rate": 3.84861022644033e-06,
856
+ "loss": 0.3357837677001953,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.3904,
861
+ "grad_norm": 3.148104290988529,
862
+ "learning_rate": 3.825008697883574e-06,
863
+ "loss": 0.34343953132629396,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.3936,
868
+ "grad_norm": 2.488823913942074,
869
+ "learning_rate": 3.8012417891984776e-06,
870
+ "loss": 0.333116340637207,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.3968,
875
+ "grad_norm": 3.0225259799028645,
876
+ "learning_rate": 3.777312466836819e-06,
877
+ "loss": 0.3318933486938477,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.4,
882
+ "grad_norm": 3.3439153363899115,
883
+ "learning_rate": 3.7532237175219378e-06,
884
+ "loss": 0.32833037376403806,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.4032,
889
+ "grad_norm": 2.72884090647899,
890
+ "learning_rate": 3.728978547875948e-06,
891
+ "loss": 0.3360243082046509,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.4064,
896
+ "grad_norm": 2.5999080124511966,
897
+ "learning_rate": 3.7045799840444712e-06,
898
+ "loss": 0.33025145530700684,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.4096,
903
+ "grad_norm": 3.0518346526448488,
904
+ "learning_rate": 3.6800310713189258e-06,
905
+ "loss": 0.3306798219680786,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.4128,
910
+ "grad_norm": 2.0509087709244507,
911
+ "learning_rate": 3.6553348737564328e-06,
912
+ "loss": 0.33091559410095217,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.416,
917
+ "grad_norm": 2.908137390744499,
918
+ "learning_rate": 3.6304944737973794e-06,
919
+ "loss": 0.33455810546875,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.4192,
924
+ "grad_norm": 3.0396312942670796,
925
+ "learning_rate": 3.6055129718806836e-06,
926
+ "loss": 0.331624960899353,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.4224,
931
+ "grad_norm": 3.282462978283218,
932
+ "learning_rate": 3.5803934860568134e-06,
933
+ "loss": 0.32364490032196047,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.4256,
938
+ "grad_norm": 2.2269456751164727,
939
+ "learning_rate": 3.5551391515986163e-06,
940
+ "loss": 0.3319955348968506,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.4288,
945
+ "grad_norm": 2.8364899461485527,
946
+ "learning_rate": 3.529753120609982e-06,
947
+ "loss": 0.3252741813659668,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.432,
952
+ "grad_norm": 2.89515974439621,
953
+ "learning_rate": 3.5042385616324243e-06,
954
+ "loss": 0.3287111520767212,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.4352,
959
+ "grad_norm": 2.311001238312573,
960
+ "learning_rate": 3.4785986592495934e-06,
961
+ "loss": 0.32939796447753905,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.4384,
966
+ "grad_norm": 2.4126049139350734,
967
+ "learning_rate": 3.452836613689803e-06,
968
+ "loss": 0.32168779373168943,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.4416,
973
+ "grad_norm": 3.1765584413022254,
974
+ "learning_rate": 3.426955640426584e-06,
975
+ "loss": 0.32864985466003416,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.4448,
980
+ "grad_norm": 3.154206643410634,
981
+ "learning_rate": 3.4009589697773605e-06,
982
+ "loss": 0.3260640621185303,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.448,
987
+ "grad_norm": 3.4230687653412564,
988
+ "learning_rate": 3.3748498465002475e-06,
989
+ "loss": 0.32304584980010986,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.4512,
994
+ "grad_norm": 2.6276396964869684,
995
+ "learning_rate": 3.3486315293890693e-06,
996
+ "loss": 0.33318138122558594,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.4544,
1001
+ "grad_norm": 2.754821177049362,
1002
+ "learning_rate": 3.3223072908666053e-06,
1003
+ "loss": 0.32256054878234863,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.4576,
1008
+ "grad_norm": 2.881952130772473,
1009
+ "learning_rate": 3.295880416576153e-06,
1010
+ "loss": 0.33387539386749265,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.4608,
1015
+ "grad_norm": 2.5217047707442966,
1016
+ "learning_rate": 3.269354204971427e-06,
1017
+ "loss": 0.32321481704711913,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.464,
1022
+ "grad_norm": 2.976679985492794,
1023
+ "learning_rate": 3.242731966904865e-06,
1024
+ "loss": 0.32245721817016604,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.4672,
1029
+ "grad_norm": 2.527563459090948,
1030
+ "learning_rate": 3.2160170252143913e-06,
1031
+ "loss": 0.32239205837249757,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 0.4704,
1036
+ "grad_norm": 1.997832889519553,
1037
+ "learning_rate": 3.1892127143086716e-06,
1038
+ "loss": 0.32758924961090086,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 0.4736,
1043
+ "grad_norm": 2.299101703675196,
1044
+ "learning_rate": 3.1623223797509347e-06,
1045
+ "loss": 0.31891183853149413,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 0.4768,
1050
+ "grad_norm": 2.9210746413068907,
1051
+ "learning_rate": 3.135349377841396e-06,
1052
+ "loss": 0.32430353164672854,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 0.48,
1057
+ "grad_norm": 2.6265609696149146,
1058
+ "learning_rate": 3.1082970751983497e-06,
1059
+ "loss": 0.3312281608581543,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.4832,
1064
+ "grad_norm": 2.5956160397204786,
1065
+ "learning_rate": 3.0811688483379546e-06,
1066
+ "loss": 0.3238035202026367,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.4864,
1071
+ "grad_norm": 2.231793404952503,
1072
+ "learning_rate": 3.0539680832528074e-06,
1073
+ "loss": 0.32330875396728515,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.4896,
1078
+ "grad_norm": 2.5723097920479763,
1079
+ "learning_rate": 3.026698174989316e-06,
1080
+ "loss": 0.32520170211791993,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.4928,
1085
+ "grad_norm": 2.691498291676849,
1086
+ "learning_rate": 2.999362527223952e-06,
1087
+ "loss": 0.3273704290390015,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.496,
1092
+ "grad_norm": 2.0511124933056375,
1093
+ "learning_rate": 2.9719645518384194e-06,
1094
+ "loss": 0.3250606536865234,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.4992,
1099
+ "grad_norm": 2.872290392112785,
1100
+ "learning_rate": 2.944507668493807e-06,
1101
+ "loss": 0.3281686782836914,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.5024,
1106
+ "grad_norm": 2.330246614888919,
1107
+ "learning_rate": 2.9169953042037623e-06,
1108
+ "loss": 0.32374157905578616,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.5056,
1113
+ "grad_norm": 2.0520711406500394,
1114
+ "learning_rate": 2.889430892906754e-06,
1115
+ "loss": 0.3169667720794678,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.5088,
1120
+ "grad_norm": 2.048670737699487,
1121
+ "learning_rate": 2.861817875037462e-06,
1122
+ "loss": 0.3160442590713501,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.512,
1127
+ "grad_norm": 2.8695840695234303,
1128
+ "learning_rate": 2.8341596970973683e-06,
1129
+ "loss": 0.32544608116149903,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.5152,
1134
+ "grad_norm": 1.976397223627746,
1135
+ "learning_rate": 2.80645981122458e-06,
1136
+ "loss": 0.3229134798049927,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.5184,
1141
+ "grad_norm": 2.7070609575351807,
1142
+ "learning_rate": 2.7787216747629508e-06,
1143
+ "loss": 0.32655487060546873,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.5216,
1148
+ "grad_norm": 2.6027463070090993,
1149
+ "learning_rate": 2.7509487498305615e-06,
1150
+ "loss": 0.31430754661560056,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.5248,
1155
+ "grad_norm": 2.4274539931656585,
1156
+ "learning_rate": 2.7231445028875924e-06,
1157
+ "loss": 0.3237884759902954,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.528,
1162
+ "grad_norm": 1.9308598632845329,
1163
+ "learning_rate": 2.6953124043036604e-06,
1164
+ "loss": 0.32111692428588867,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.5312,
1169
+ "grad_norm": 2.1321964485217784,
1170
+ "learning_rate": 2.667455927924667e-06,
1171
+ "loss": 0.3178241729736328,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.5344,
1176
+ "grad_norm": 3.1390388403682534,
1177
+ "learning_rate": 2.6395785506392164e-06,
1178
+ "loss": 0.31754770278930666,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.5376,
1183
+ "grad_norm": 2.137535651695072,
1184
+ "learning_rate": 2.6116837519446407e-06,
1185
+ "loss": 0.3183767795562744,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.5408,
1190
+ "grad_norm": 2.353751591087722,
1191
+ "learning_rate": 2.5837750135127192e-06,
1192
+ "loss": 0.31382954120635986,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.544,
1197
+ "grad_norm": 2.58704039056448,
1198
+ "learning_rate": 2.555855818755108e-06,
1199
+ "loss": 0.3226866483688354,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.5472,
1204
+ "grad_norm": 2.709677414439902,
1205
+ "learning_rate": 2.5279296523885636e-06,
1206
+ "loss": 0.3166576623916626,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.5504,
1211
+ "grad_norm": 2.0859245317104107,
1212
+ "learning_rate": 2.5e-06,
1213
+ "loss": 0.3218212127685547,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.5536,
1218
+ "grad_norm": 2.3347357869338436,
1219
+ "learning_rate": 2.472070347611437e-06,
1220
+ "loss": 0.31246294975280764,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.5568,
1225
+ "grad_norm": 2.5799420800617106,
1226
+ "learning_rate": 2.444144181244893e-06,
1227
+ "loss": 0.31868853569030764,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.56,
1232
+ "grad_norm": 2.8867509619529406,
1233
+ "learning_rate": 2.416224986487282e-06,
1234
+ "loss": 0.31381807327270506,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.5632,
1239
+ "grad_norm": 2.625660671305278,
1240
+ "learning_rate": 2.3883162480553605e-06,
1241
+ "loss": 0.31146280765533446,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.5664,
1246
+ "grad_norm": 2.8862495653341544,
1247
+ "learning_rate": 2.3604214493607844e-06,
1248
+ "loss": 0.3111546993255615,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.5696,
1253
+ "grad_norm": 2.267020272744141,
1254
+ "learning_rate": 2.332544072075333e-06,
1255
+ "loss": 0.32178173065185545,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.5728,
1260
+ "grad_norm": 2.073205643473978,
1261
+ "learning_rate": 2.30468759569634e-06,
1262
+ "loss": 0.31751441955566406,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.576,
1267
+ "grad_norm": 2.232045258362397,
1268
+ "learning_rate": 2.276855497112408e-06,
1269
+ "loss": 0.3135702610015869,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.5792,
1274
+ "grad_norm": 3.4632505976937744,
1275
+ "learning_rate": 2.2490512501694394e-06,
1276
+ "loss": 0.3126095771789551,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.5824,
1281
+ "grad_norm": 2.7008114205550022,
1282
+ "learning_rate": 2.2212783252370496e-06,
1283
+ "loss": 0.31725611686706545,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.5856,
1288
+ "grad_norm": 2.640110404643157,
1289
+ "learning_rate": 2.1935401887754213e-06,
1290
+ "loss": 0.3210929870605469,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.5888,
1295
+ "grad_norm": 2.9154181525967924,
1296
+ "learning_rate": 2.165840302902632e-06,
1297
+ "loss": 0.31817543506622314,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.592,
1302
+ "grad_norm": 2.3435756622683916,
1303
+ "learning_rate": 2.1381821249625383e-06,
1304
+ "loss": 0.3186073303222656,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.5952,
1309
+ "grad_norm": 2.391868801860604,
1310
+ "learning_rate": 2.1105691070932465e-06,
1311
+ "loss": 0.3081700563430786,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.5984,
1316
+ "grad_norm": 2.27033295147997,
1317
+ "learning_rate": 2.083004695796238e-06,
1318
+ "loss": 0.30403599739074705,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.6016,
1323
+ "grad_norm": 2.1095837820360157,
1324
+ "learning_rate": 2.055492331506194e-06,
1325
+ "loss": 0.31353535652160647,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.6048,
1330
+ "grad_norm": 2.284519052184323,
1331
+ "learning_rate": 2.0280354481615814e-06,
1332
+ "loss": 0.31677517890930174,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.608,
1337
+ "grad_norm": 2.237766836173548,
1338
+ "learning_rate": 2.000637472776049e-06,
1339
+ "loss": 0.3152945041656494,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.6112,
1344
+ "grad_norm": 2.7842715157490434,
1345
+ "learning_rate": 1.973301825010685e-06,
1346
+ "loss": 0.30818216800689696,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.6144,
1351
+ "grad_norm": 2.4813744091778784,
1352
+ "learning_rate": 1.9460319167471934e-06,
1353
+ "loss": 0.31820502281188967,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.6176,
1358
+ "grad_norm": 2.0821248606030887,
1359
+ "learning_rate": 1.9188311516620466e-06,
1360
+ "loss": 0.31040709018707274,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.6208,
1365
+ "grad_norm": 2.9336859566866975,
1366
+ "learning_rate": 1.891702924801651e-06,
1367
+ "loss": 0.31292426586151123,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.624,
1372
+ "grad_norm": 2.511253012965921,
1373
+ "learning_rate": 1.864650622158604e-06,
1374
+ "loss": 0.32196660041809083,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.6272,
1379
+ "grad_norm": 2.4545922236833455,
1380
+ "learning_rate": 1.8376776202490666e-06,
1381
+ "loss": 0.31464810371398927,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.6304,
1386
+ "grad_norm": 2.277913414668649,
1387
+ "learning_rate": 1.8107872856913293e-06,
1388
+ "loss": 0.30748977661132815,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.6336,
1393
+ "grad_norm": 3.6960663974743273,
1394
+ "learning_rate": 1.7839829747856096e-06,
1395
+ "loss": 0.31303911209106444,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.6368,
1400
+ "grad_norm": 2.5169048193896844,
1401
+ "learning_rate": 1.7572680330951359e-06,
1402
+ "loss": 0.309541130065918,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.64,
1407
+ "grad_norm": 2.625801312197355,
1408
+ "learning_rate": 1.7306457950285747e-06,
1409
+ "loss": 0.31228773593902587,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.6432,
1414
+ "grad_norm": 3.166705714244592,
1415
+ "learning_rate": 1.704119583423848e-06,
1416
+ "loss": 0.30709683895111084,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.6464,
1421
+ "grad_norm": 2.7529448920288755,
1422
+ "learning_rate": 1.677692709133396e-06,
1423
+ "loss": 0.3121641159057617,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.6496,
1428
+ "grad_norm": 2.4164266641009386,
1429
+ "learning_rate": 1.6513684706109311e-06,
1430
+ "loss": 0.31612191200256345,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.6528,
1435
+ "grad_norm": 2.1475852674178486,
1436
+ "learning_rate": 1.6251501534997529e-06,
1437
+ "loss": 0.30926761627197263,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.656,
1442
+ "grad_norm": 3.027937409819003,
1443
+ "learning_rate": 1.5990410302226405e-06,
1444
+ "loss": 0.3059820652008057,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.6592,
1449
+ "grad_norm": 2.3663528005893575,
1450
+ "learning_rate": 1.5730443595734162e-06,
1451
+ "loss": 0.30960190296173096,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.6624,
1456
+ "grad_norm": 2.5495655090650806,
1457
+ "learning_rate": 1.5471633863101982e-06,
1458
+ "loss": 0.3146512508392334,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.6656,
1463
+ "grad_norm": 2.563871195645732,
1464
+ "learning_rate": 1.521401340750407e-06,
1465
+ "loss": 0.3116560935974121,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.6688,
1470
+ "grad_norm": 2.4316488926893314,
1471
+ "learning_rate": 1.495761438367577e-06,
1472
+ "loss": 0.31447796821594237,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.672,
1477
+ "grad_norm": 2.446980089200077,
1478
+ "learning_rate": 1.4702468793900187e-06,
1479
+ "loss": 0.3153538703918457,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 0.6752,
1484
+ "grad_norm": 2.2511595317617283,
1485
+ "learning_rate": 1.444860848401384e-06,
1486
+ "loss": 0.31273808479309084,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 0.6784,
1491
+ "grad_norm": 2.459748219135552,
1492
+ "learning_rate": 1.4196065139431866e-06,
1493
+ "loss": 0.31059865951538085,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 0.6816,
1498
+ "grad_norm": 2.4570005490031805,
1499
+ "learning_rate": 1.3944870281193178e-06,
1500
+ "loss": 0.31122384071350095,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 0.6848,
1505
+ "grad_norm": 2.5940034157380447,
1506
+ "learning_rate": 1.3695055262026208e-06,
1507
+ "loss": 0.3145638704299927,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 0.688,
1512
+ "grad_norm": 2.8940635665298644,
1513
+ "learning_rate": 1.3446651262435679e-06,
1514
+ "loss": 0.31133465766906737,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 0.6912,
1519
+ "grad_norm": 2.2603444512196216,
1520
+ "learning_rate": 1.3199689286810746e-06,
1521
+ "loss": 0.31110968589782717,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 0.6944,
1526
+ "grad_norm": 2.3697248986342223,
1527
+ "learning_rate": 1.2954200159555294e-06,
1528
+ "loss": 0.3046250820159912,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 0.6976,
1533
+ "grad_norm": 2.9149559965372083,
1534
+ "learning_rate": 1.2710214521240527e-06,
1535
+ "loss": 0.3056375503540039,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 0.7008,
1540
+ "grad_norm": 2.785583016537511,
1541
+ "learning_rate": 1.246776282478063e-06,
1542
+ "loss": 0.3074607849121094,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 0.704,
1547
+ "grad_norm": 2.238483419316128,
1548
+ "learning_rate": 1.222687533163181e-06,
1549
+ "loss": 0.30821986198425294,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 0.7072,
1554
+ "grad_norm": 2.0963873111402225,
1555
+ "learning_rate": 1.1987582108015228e-06,
1556
+ "loss": 0.31098227500915526,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 0.7104,
1561
+ "grad_norm": 2.3511311934322725,
1562
+ "learning_rate": 1.1749913021164255e-06,
1563
+ "loss": 0.3125911712646484,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 0.7136,
1568
+ "grad_norm": 2.0182013166602735,
1569
+ "learning_rate": 1.1513897735596702e-06,
1570
+ "loss": 0.30506420135498047,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 0.7168,
1575
+ "grad_norm": 2.0904990978865654,
1576
+ "learning_rate": 1.127956570941218e-06,
1577
+ "loss": 0.30170474052429197,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 0.72,
1582
+ "grad_norm": 2.3591898483151525,
1583
+ "learning_rate": 1.104694619061533e-06,
1584
+ "loss": 0.3140627145767212,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.7232,
1589
+ "grad_norm": 2.3874798738589553,
1590
+ "learning_rate": 1.0816068213465295e-06,
1591
+ "loss": 0.3148207187652588,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.7264,
1596
+ "grad_norm": 2.462173136321867,
1597
+ "learning_rate": 1.0586960594851762e-06,
1598
+ "loss": 0.30828402042388914,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.7296,
1603
+ "grad_norm": 2.2877287929832946,
1604
+ "learning_rate": 1.0359651930698217e-06,
1605
+ "loss": 0.30725433826446535,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.7328,
1610
+ "grad_norm": 2.5585705908550413,
1611
+ "learning_rate": 1.0134170592392837e-06,
1612
+ "loss": 0.30991530418395996,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.736,
1617
+ "grad_norm": 2.415441399008779,
1618
+ "learning_rate": 9.910544723247204e-07,
1619
+ "loss": 0.31087689399719237,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.7392,
1624
+ "grad_norm": 2.6450690086623285,
1625
+ "learning_rate": 9.688802234983706e-07,
1626
+ "loss": 0.3067446231842041,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.7424,
1631
+ "grad_norm": 2.363123649822279,
1632
+ "learning_rate": 9.468970804251742e-07,
1633
+ "loss": 0.30767192840576174,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.7456,
1638
+ "grad_norm": 2.245412676348008,
1639
+ "learning_rate": 9.251077869173244e-07,
1640
+ "loss": 0.30107917785644533,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.7488,
1645
+ "grad_norm": 2.5736642361970503,
1646
+ "learning_rate": 9.035150625918054e-07,
1647
+ "loss": 0.303986120223999,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.752,
1652
+ "grad_norm": 2.6844109007429138,
1653
+ "learning_rate": 8.821216025309395e-07,
1654
+ "loss": 0.3074802875518799,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.7552,
1659
+ "grad_norm": 2.412670568786912,
1660
+ "learning_rate": 8.609300769460055e-07,
1661
+ "loss": 0.30130510330200194,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.7584,
1666
+ "grad_norm": 3.176069141824472,
1667
+ "learning_rate": 8.399431308439592e-07,
1668
+ "loss": 0.3105806827545166,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.7616,
1673
+ "grad_norm": 2.23339472526297,
1674
+ "learning_rate": 8.191633836972962e-07,
1675
+ "loss": 0.3084972620010376,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.7648,
1680
+ "grad_norm": 2.6912839020724175,
1681
+ "learning_rate": 7.985934291171024e-07,
1682
+ "loss": 0.3067460536956787,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.768,
1687
+ "grad_norm": 2.5426618104677976,
1688
+ "learning_rate": 7.7823583452934e-07,
1689
+ "loss": 0.30809898376464845,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.7712,
1694
+ "grad_norm": 2.55531536817282,
1695
+ "learning_rate": 7.58093140854389e-07,
1696
+ "loss": 0.3071744441986084,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.7744,
1701
+ "grad_norm": 2.285863017236424,
1702
+ "learning_rate": 7.381678621899077e-07,
1703
+ "loss": 0.3093477725982666,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.7776,
1708
+ "grad_norm": 2.3600405361881767,
1709
+ "learning_rate": 7.184624854970379e-07,
1710
+ "loss": 0.30798888206481934,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.7808,
1715
+ "grad_norm": 2.0247328579355726,
1716
+ "learning_rate": 6.989794702899932e-07,
1717
+ "loss": 0.3048464298248291,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.784,
1722
+ "grad_norm": 2.7079172300622334,
1723
+ "learning_rate": 6.797212483290777e-07,
1724
+ "loss": 0.3093360424041748,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.7872,
1729
+ "grad_norm": 2.8011999237207967,
1730
+ "learning_rate": 6.60690223317171e-07,
1731
+ "loss": 0.30233092308044435,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.7904,
1736
+ "grad_norm": 2.202966089641912,
1737
+ "learning_rate": 6.418887705997046e-07,
1738
+ "loss": 0.3048731327056885,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.7936,
1743
+ "grad_norm": 2.6510546903467755,
1744
+ "learning_rate": 6.23319236868189e-07,
1745
+ "loss": 0.3104764461517334,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.7968,
1750
+ "grad_norm": 2.510992490322273,
1751
+ "learning_rate": 6.049839398673141e-07,
1752
+ "loss": 0.31223044395446775,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.8,
1757
+ "grad_norm": 2.7988283248607604,
1758
+ "learning_rate": 5.868851681056567e-07,
1759
+ "loss": 0.3109541893005371,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.8032,
1764
+ "grad_norm": 2.370572243788772,
1765
+ "learning_rate": 5.690251805700467e-07,
1766
+ "loss": 0.3075347900390625,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.8064,
1771
+ "grad_norm": 2.057318428676814,
1772
+ "learning_rate": 5.514062064436096e-07,
1773
+ "loss": 0.30944228172302246,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.8096,
1778
+ "grad_norm": 2.9526395601791937,
1779
+ "learning_rate": 5.34030444827533e-07,
1780
+ "loss": 0.30773684978485105,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.8128,
1785
+ "grad_norm": 2.1808951881567165,
1786
+ "learning_rate": 5.169000644665895e-07,
1787
+ "loss": 0.30281686782836914,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.816,
1792
+ "grad_norm": 2.501184820191482,
1793
+ "learning_rate": 5.000172034784442e-07,
1794
+ "loss": 0.30731327533721925,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.8192,
1799
+ "grad_norm": 2.4433836822113304,
1800
+ "learning_rate": 4.833839690867853e-07,
1801
+ "loss": 0.30861892700195315,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.8224,
1806
+ "grad_norm": 2.482955525732734,
1807
+ "learning_rate": 4.6700243735831705e-07,
1808
+ "loss": 0.3014340400695801,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.8256,
1813
+ "grad_norm": 2.516375989369738,
1814
+ "learning_rate": 4.508746529436311e-07,
1815
+ "loss": 0.302032995223999,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.8288,
1820
+ "grad_norm": 2.2676227598264926,
1821
+ "learning_rate": 4.350026288220083e-07,
1822
+ "loss": 0.30550131797790525,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.832,
1827
+ "grad_norm": 2.3829531066293126,
1828
+ "learning_rate": 4.1938834605017133e-07,
1829
+ "loss": 0.3046237945556641,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.8352,
1834
+ "grad_norm": 2.0018887466739548,
1835
+ "learning_rate": 4.0403375351501515e-07,
1836
+ "loss": 0.3024258852005005,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.8384,
1841
+ "grad_norm": 2.5182571334882597,
1842
+ "learning_rate": 3.88940767690362e-07,
1843
+ "loss": 0.3063870906829834,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.8416,
1848
+ "grad_norm": 2.7441991027074355,
1849
+ "learning_rate": 3.7411127239775774e-07,
1850
+ "loss": 0.30306272506713866,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.8448,
1855
+ "grad_norm": 2.161963722714269,
1856
+ "learning_rate": 3.595471185713431e-07,
1857
+ "loss": 0.3009947299957275,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.848,
1862
+ "grad_norm": 2.7694143698141285,
1863
+ "learning_rate": 3.4525012402682826e-07,
1864
+ "loss": 0.30188300609588625,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.8512,
1869
+ "grad_norm": 2.6814413975784217,
1870
+ "learning_rate": 3.3122207323460804e-07,
1871
+ "loss": 0.3024703025817871,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.8544,
1876
+ "grad_norm": 2.4444711671869306,
1877
+ "learning_rate": 3.1746471709702963e-07,
1878
+ "loss": 0.3008608102798462,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.8576,
1883
+ "grad_norm": 2.6886622183433015,
1884
+ "learning_rate": 3.039797727298585e-07,
1885
+ "loss": 0.30821614265441893,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.8608,
1890
+ "grad_norm": 2.641784614909192,
1891
+ "learning_rate": 2.9076892324795546e-07,
1892
+ "loss": 0.30515303611755373,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.864,
1897
+ "grad_norm": 2.5595370943122444,
1898
+ "learning_rate": 2.778338175551995e-07,
1899
+ "loss": 0.3007267236709595,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.8672,
1904
+ "grad_norm": 2.283872628964803,
1905
+ "learning_rate": 2.6517607013868326e-07,
1906
+ "loss": 0.30617167949676516,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.8704,
1911
+ "grad_norm": 2.558413840419693,
1912
+ "learning_rate": 2.527972608672002e-07,
1913
+ "loss": 0.3038905143737793,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.8736,
1918
+ "grad_norm": 2.4952676522317567,
1919
+ "learning_rate": 2.40698934794053e-07,
1920
+ "loss": 0.3054081201553345,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.8768,
1925
+ "grad_norm": 2.247637838190116,
1926
+ "learning_rate": 2.2888260196421237e-07,
1927
+ "loss": 0.3028261661529541,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.88,
1932
+ "grad_norm": 2.5035963414447804,
1933
+ "learning_rate": 2.1734973722583735e-07,
1934
+ "loss": 0.3062435626983643,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.8832,
1939
+ "grad_norm": 1.918923632238423,
1940
+ "learning_rate": 2.0610178004619564e-07,
1941
+ "loss": 0.2972743034362793,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.8864,
1946
+ "grad_norm": 2.4603002546330845,
1947
+ "learning_rate": 1.9514013433199834e-07,
1948
+ "loss": 0.3119321346282959,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.8896,
1953
+ "grad_norm": 2.1315709346733667,
1954
+ "learning_rate": 1.8446616825416958e-07,
1955
+ "loss": 0.30900893211364744,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.8928,
1960
+ "grad_norm": 2.3753122188061218,
1961
+ "learning_rate": 1.7408121407708007e-07,
1962
+ "loss": 0.3069151401519775,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.896,
1967
+ "grad_norm": 2.207415755325001,
1968
+ "learning_rate": 1.6398656799226253e-07,
1969
+ "loss": 0.2986165523529053,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.8992,
1974
+ "grad_norm": 2.178561452169741,
1975
+ "learning_rate": 1.5418348995662773e-07,
1976
+ "loss": 0.3010268688201904,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.9024,
1981
+ "grad_norm": 2.5082064593439393,
1982
+ "learning_rate": 1.4467320353520275e-07,
1983
+ "loss": 0.2984073877334595,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.9056,
1988
+ "grad_norm": 2.366814729694057,
1989
+ "learning_rate": 1.3545689574841341e-07,
1990
+ "loss": 0.3026757001876831,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.9088,
1995
+ "grad_norm": 2.380709246306716,
1996
+ "learning_rate": 1.26535716923927e-07,
1997
+ "loss": 0.310437536239624,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.912,
2002
+ "grad_norm": 2.484246324702375,
2003
+ "learning_rate": 1.1791078055307493e-07,
2004
+ "loss": 0.30369887351989744,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.9152,
2009
+ "grad_norm": 2.6412244000001786,
2010
+ "learning_rate": 1.0958316315187289e-07,
2011
+ "loss": 0.3044759750366211,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.9184,
2016
+ "grad_norm": 2.4542916560781967,
2017
+ "learning_rate": 1.0155390412665528e-07,
2018
+ "loss": 0.30136928558349607,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.9216,
2023
+ "grad_norm": 2.631911471911446,
2024
+ "learning_rate": 9.38240056443443e-08,
2025
+ "loss": 0.30144243240356444,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.9248,
2030
+ "grad_norm": 2.2530200189747243,
2031
+ "learning_rate": 8.639443250736402e-08,
2032
+ "loss": 0.3027902603149414,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.928,
2037
+ "grad_norm": 3.1331936934174123,
2038
+ "learning_rate": 7.926611203321777e-08,
2039
+ "loss": 0.30441856384277344,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.9312,
2044
+ "grad_norm": 2.5134219010551067,
2045
+ "learning_rate": 7.243993393874882e-08,
2046
+ "loss": 0.306389307975769,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.9344,
2051
+ "grad_norm": 2.372785201514508,
2052
+ "learning_rate": 6.591675022908805e-08,
2053
+ "loss": 0.30292179584503176,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.9376,
2058
+ "grad_norm": 2.407913531878434,
2059
+ "learning_rate": 5.969737509131241e-08,
2060
+ "loss": 0.29895825386047364,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.9408,
2065
+ "grad_norm": 2.2376435379528865,
2066
+ "learning_rate": 5.3782584792823334e-08,
2067
+ "loss": 0.30271134376525877,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.944,
2072
+ "grad_norm": 2.653290725438786,
2073
+ "learning_rate": 4.817311758445686e-08,
2074
+ "loss": 0.3062829732894897,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.9472,
2079
+ "grad_norm": 2.42511171945876,
2080
+ "learning_rate": 4.286967360833866e-08,
2081
+ "loss": 0.3066932439804077,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.9504,
2086
+ "grad_norm": 2.1534299736877895,
2087
+ "learning_rate": 3.787291481049754e-08,
2088
+ "loss": 0.3068870544433594,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.9536,
2093
+ "grad_norm": 2.209956884835794,
2094
+ "learning_rate": 3.3183464858244364e-08,
2095
+ "loss": 0.31453580856323243,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.9568,
2100
+ "grad_norm": 2.5928568899987017,
2101
+ "learning_rate": 2.8801909062328992e-08,
2102
+ "loss": 0.2991969108581543,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.96,
2107
+ "grad_norm": 2.385980918167846,
2108
+ "learning_rate": 2.4728794303886248e-08,
2109
+ "loss": 0.2963397026062012,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.9632,
2114
+ "grad_norm": 2.374100986684654,
2115
+ "learning_rate": 2.0964628966175794e-08,
2116
+ "loss": 0.30301966667175295,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.9664,
2121
+ "grad_norm": 2.094256605734986,
2122
+ "learning_rate": 1.750988287113009e-08,
2123
+ "loss": 0.2994666576385498,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.9696,
2128
+ "grad_norm": 1.916185239441286,
2129
+ "learning_rate": 1.4364987220713278e-08,
2130
+ "loss": 0.3080729007720947,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.9728,
2135
+ "grad_norm": 2.3446521041543207,
2136
+ "learning_rate": 1.1530334543099763e-08,
2137
+ "loss": 0.3026130199432373,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.976,
2142
+ "grad_norm": 2.5854178252734323,
2143
+ "learning_rate": 9.006278643683697e-09,
2144
+ "loss": 0.309655499458313,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.9792,
2149
+ "grad_norm": 1.9908162772434517,
2150
+ "learning_rate": 6.793134560916514e-09,
2151
+ "loss": 0.31186389923095703,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.9824,
2156
+ "grad_norm": 2.1977962094508534,
2157
+ "learning_rate": 4.891178526986451e-09,
2158
+ "loss": 0.30645883083343506,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.9856,
2163
+ "grad_norm": 2.2397406638818147,
2164
+ "learning_rate": 3.3006479333413943e-09,
2165
+ "loss": 0.3090504169464111,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.9888,
2170
+ "grad_norm": 2.0435901319475036,
2171
+ "learning_rate": 2.021741301058422e-09,
2172
+ "loss": 0.3049570322036743,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.992,
2177
+ "grad_norm": 2.371036869409615,
2178
+ "learning_rate": 1.0546182560652872e-09,
2179
+ "loss": 0.3073274612426758,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.9952,
2184
+ "grad_norm": 2.2551729202130457,
2185
+ "learning_rate": 3.9939950921774607e-10,
2186
+ "loss": 0.30047030448913575,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.9984,
2191
+ "grad_norm": 2.2067081414460827,
2192
+ "learning_rate": 5.616684123160854e-11,
2193
+ "loss": 0.3023503065109253,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 1.0,
2198
+ "step": 3125,
2199
+ "total_flos": 1.0913057758773248e+16,
2200
+ "train_loss": 0.7292402684783935,
2201
+ "train_runtime": 30167.0559,
2202
+ "train_samples_per_second": 6.63,
2203
+ "train_steps_per_second": 0.104
2204
+ }
2205
+ ],
2206
+ "logging_steps": 10,
2207
+ "max_steps": 3125,
2208
+ "num_input_tokens_seen": 0,
2209
+ "num_train_epochs": 1,
2210
+ "save_steps": 500,
2211
+ "stateful_callbacks": {
2212
+ "TrainerControl": {
2213
+ "args": {
2214
+ "should_epoch_stop": false,
2215
+ "should_evaluate": false,
2216
+ "should_log": false,
2217
+ "should_save": true,
2218
+ "should_training_stop": true
2219
+ },
2220
+ "attributes": {}
2221
+ }
2222
+ },
2223
+ "total_flos": 1.0913057758773248e+16,
2224
+ "train_batch_size": 4,
2225
+ "trial_name": null,
2226
+ "trial_params": null
2227
+ }
checkpoints/Gemma-4-E4B-it-SFT/training_loss.png ADDED
checkpoints/InternVL3.5-8B-SFT/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 1955525886476288.0,
4
+ "train_loss": 0.1948647116279602,
5
+ "train_runtime": 28413.61,
6
+ "train_samples_per_second": 7.039,
7
+ "train_steps_per_second": 0.11
8
+ }
checkpoints/InternVL3.5-8B-SFT/chat_template.jinja ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {% for message in messages %}{{'<|im_start|>' + message['role'] + '
2
+ '}}{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}{{ '<image>
3
+ ' }}{% elif content['type'] == 'video' %}{{ '<video>
4
+ ' }}{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}{{'<|im_end|>
5
+ '}}{% endfor %}{% if add_generation_prompt %}{{'<|im_start|>assistant
6
+ ' }}{% endif %}
checkpoints/InternVL3.5-8B-SFT/config.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "InternVLForConditionalGeneration"
4
+ ],
5
+ "downsample_ratio": 0.5,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151645,
8
+ "hidden_size": 4096,
9
+ "image_seq_length": 256,
10
+ "image_token_id": 151671,
11
+ "model_type": "internvl",
12
+ "pad_token_id": 151643,
13
+ "projector_hidden_act": "gelu",
14
+ "text_config": {
15
+ "attention_bias": false,
16
+ "attention_dropout": 0.0,
17
+ "bos_token_id": 151643,
18
+ "debug": false,
19
+ "dtype": "bfloat16",
20
+ "eos_token_id": 151645,
21
+ "ep_size": 1,
22
+ "head_dim": 128,
23
+ "hidden_act": "silu",
24
+ "hidden_size": 4096,
25
+ "initializer_range": 0.02,
26
+ "intermediate_size": 12288,
27
+ "layer_types": [
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention",
42
+ "full_attention",
43
+ "full_attention",
44
+ "full_attention",
45
+ "full_attention",
46
+ "full_attention",
47
+ "full_attention",
48
+ "full_attention",
49
+ "full_attention",
50
+ "full_attention",
51
+ "full_attention",
52
+ "full_attention",
53
+ "full_attention",
54
+ "full_attention",
55
+ "full_attention",
56
+ "full_attention",
57
+ "full_attention",
58
+ "full_attention",
59
+ "full_attention",
60
+ "full_attention",
61
+ "full_attention",
62
+ "full_attention",
63
+ "full_attention"
64
+ ],
65
+ "max_position_embeddings": 40960,
66
+ "max_window_layers": 36,
67
+ "micro_forward": false,
68
+ "model_type": "qwen3",
69
+ "num_attention_heads": 32,
70
+ "num_hidden_layers": 36,
71
+ "num_key_value_heads": 8,
72
+ "pad_token_id": null,
73
+ "rms_norm_eps": 1e-06,
74
+ "rope_parameters": {
75
+ "rope_theta": 1000000,
76
+ "rope_type": "default"
77
+ },
78
+ "skip_checkpoint": false,
79
+ "sliding_window": null,
80
+ "tie_word_embeddings": false,
81
+ "use_cache": false,
82
+ "use_deepep": false,
83
+ "use_sliding_window": false,
84
+ "vocab_size": 151936
85
+ },
86
+ "tie_word_embeddings": false,
87
+ "transformers_version": "5.5.3",
88
+ "use_cache": false,
89
+ "vision_config": {
90
+ "attention_bias": true,
91
+ "attention_dropout": 0.0,
92
+ "dtype": "bfloat16",
93
+ "hidden_act": "gelu",
94
+ "hidden_dropout_prob": 0.0,
95
+ "hidden_size": 1024,
96
+ "image_size": [
97
+ 448,
98
+ 448
99
+ ],
100
+ "initializer_range": 0.02,
101
+ "intermediate_size": 4096,
102
+ "layer_norm_eps": 1e-06,
103
+ "layer_scale_init_value": 0.1,
104
+ "model_type": "internvl_vision",
105
+ "norm_type": "layer_norm",
106
+ "num_attention_heads": 16,
107
+ "num_channels": 3,
108
+ "num_hidden_layers": 24,
109
+ "patch_size": [
110
+ 14,
111
+ 14
112
+ ],
113
+ "projection_dropout": 0.0,
114
+ "use_absolute_position_embeddings": true,
115
+ "use_mask_token": false,
116
+ "use_mean_pooling": true,
117
+ "use_qk_norm": false
118
+ },
119
+ "vision_feature_layer": -1,
120
+ "vision_feature_select_strategy": "default"
121
+ }
checkpoints/InternVL3.5-8B-SFT/eval_results_job_internvl35_8b_internvl35_8b_20260430_002347.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mae_dx": 0.1673913793103448,
3
+ "rmse_dx": 0.558917781258475,
4
+ "mae_dy": 0.14736034482758623,
5
+ "rmse_dy": 0.43480112378118263,
6
+ "mae_dz": 0.014543103448275864,
7
+ "rmse_dz": 0.08856353651475307,
8
+ "mae_dpitch": 0.3550155172413793,
9
+ "rmse_dpitch": 0.7902604437560551,
10
+ "mae_dyaw": 1.1672620689655173,
11
+ "rmse_dyaw": 2.7476475518096417,
12
+ "mae_droll": 0.0,
13
+ "rmse_droll": 0.0,
14
+ "mae_overall": 0.3085954022988506,
15
+ "mae_position": 0.10976494252873564,
16
+ "mae_rotation": 0.5074258620689654,
17
+ "rmse_overall": 1.2030075672648746,
18
+ "wp1_euc_mae": 0.07507887870117307,
19
+ "wp1_euc_median": 0.020000000000000018,
20
+ "wp2_euc_mae": 0.15300439055300805,
21
+ "wp2_euc_median": 0.04472135954999579,
22
+ "wp3_euc_mae": 0.24257533874781437,
23
+ "wp3_euc_median": 0.0806225774829854,
24
+ "wp4_euc_mae": 0.35212693283711727,
25
+ "wp4_euc_median": 0.12369316876852973,
26
+ "wp5_euc_mae": 0.4665492393220971,
27
+ "wp5_euc_median": 0.17131835484052965,
28
+ "euclidean_mae": 0.25786695603224197,
29
+ "ADE": 0.25786695603224197,
30
+ "FDE": 0.4665492393220971,
31
+ "ADE_median": 0.09423994273900672,
32
+ "FDE_median": 0.17131835484052965,
33
+ "SR@0.5m": 0.8844827586206897,
34
+ "SR@1.0m": 0.9520689655172414,
35
+ "SR@2.0m": 0.9801724137931035,
36
+ "SR@5.0m": 0.9956896551724138,
37
+ "TrajSR@1.0m": 0.8931034482758621,
38
+ "TrajSR@2.0m": 0.9586206896551724,
39
+ "TrajSR@5.0m": 0.9887931034482759,
40
+ "RotAcc@1.0deg": 0.6555172413793103,
41
+ "RotAcc@5.0deg": 0.9496551724137932,
42
+ "RotAcc@10.0deg": 0.9872413793103448,
43
+ "wp1_rot_mae": 0.6533116266968418,
44
+ "wp2_rot_mae": 0.9349310214465391,
45
+ "wp3_rot_mae": 1.2746919556832232,
46
+ "wp4_rot_mae": 1.6432791843561125,
47
+ "wp5_rot_mae": 2.0312724349773714,
48
+ "rotation_euc_mae": 1.3074972446320177,
49
+ "parse_failure_rate": 0.0,
50
+ "parse_success_rate": 1.0,
51
+ "valid_samples": 1160,
52
+ "total_samples": 1160,
53
+ "parse_failures": 0,
54
+ "inference_engine": "transformers"
55
+ }
checkpoints/InternVL3.5-8B-SFT/generation_config.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "eos_token_id": [
4
+ 151645
5
+ ],
6
+ "pad_token_id": 151643,
7
+ "transformers_version": "5.5.3"
8
+ }
checkpoints/InternVL3.5-8B-SFT/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ca84729e7bbaee8063f7e92a2435d5d69d0d38983a451a5085d11c886ab5e34
3
+ size 17056747968
checkpoints/InternVL3.5-8B-SFT/processor_config.json ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor": {
3
+ "crop_to_patches": false,
4
+ "data_format": "channels_first",
5
+ "default_to_square": true,
6
+ "do_convert_rgb": true,
7
+ "do_normalize": true,
8
+ "do_rescale": true,
9
+ "do_resize": true,
10
+ "image_mean": [
11
+ 0.485,
12
+ 0.456,
13
+ 0.406
14
+ ],
15
+ "image_processor_type": "GotOcr2ImageProcessor",
16
+ "image_std": [
17
+ 0.229,
18
+ 0.224,
19
+ 0.225
20
+ ],
21
+ "max_patches": 12,
22
+ "min_patches": 1,
23
+ "resample": 3,
24
+ "rescale_factor": 0.00392156862745098,
25
+ "size": {
26
+ "height": 448,
27
+ "width": 448
28
+ }
29
+ },
30
+ "image_seq_length": 256,
31
+ "processor_class": "InternVLProcessor",
32
+ "video_processor": {
33
+ "data_format": "channels_first",
34
+ "default_to_square": true,
35
+ "do_convert_rgb": true,
36
+ "do_normalize": true,
37
+ "do_rescale": true,
38
+ "do_resize": true,
39
+ "do_sample_frames": false,
40
+ "image_mean": [
41
+ 0.48145466,
42
+ 0.4578275,
43
+ 0.40821073
44
+ ],
45
+ "image_std": [
46
+ 0.26862954,
47
+ 0.26130258,
48
+ 0.27577711
49
+ ],
50
+ "initial_shift": true,
51
+ "model_valid_processing_keys": [
52
+ "do_convert_rgb",
53
+ "do_resize",
54
+ "size",
55
+ "size_divisor",
56
+ "default_to_square",
57
+ "resample",
58
+ "do_rescale",
59
+ "rescale_factor",
60
+ "do_normalize",
61
+ "image_mean",
62
+ "image_std",
63
+ "do_pad",
64
+ "do_center_crop",
65
+ "crop_size",
66
+ "data_format",
67
+ "input_data_format",
68
+ "device"
69
+ ],
70
+ "resample": 3,
71
+ "rescale_factor": 0.00392156862745098,
72
+ "return_metadata": false,
73
+ "size": {
74
+ "height": 384,
75
+ "width": 384
76
+ },
77
+ "video_processor_type": "InternVLVideoProcessor"
78
+ }
79
+ }
checkpoints/InternVL3.5-8B-SFT/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6581c44164d273d4222df982905a7e0450dcf3a4a7ebe98f9ec53e4de05beffe
3
+ size 11424300
checkpoints/InternVL3.5-8B-SFT/tokenizer_config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "context_image_token": "<IMG_CONTEXT>",
7
+ "end_image_token": "</img>",
8
+ "eos_token": "<|im_end|>",
9
+ "errors": "replace",
10
+ "extra_special_tokens": [
11
+ "<|im_end|>"
12
+ ],
13
+ "is_local": true,
14
+ "model_max_length": 14588,
15
+ "model_specific_special_tokens": {
16
+ "context_image_token": "<IMG_CONTEXT>",
17
+ "end_image_token": "</img>",
18
+ "start_image_token": "<img>",
19
+ "video_token": "<|video_pad|>"
20
+ },
21
+ "pad_token": "<|endoftext|>",
22
+ "padding_side": "right",
23
+ "processor_class": "InternVLProcessor",
24
+ "split_special_tokens": false,
25
+ "start_image_token": "<img>",
26
+ "tokenizer_class": "Qwen2Tokenizer",
27
+ "unk_token": null,
28
+ "video_token": "<|video_pad|>"
29
+ }
checkpoints/InternVL3.5-8B-SFT/train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 1955525886476288.0,
4
+ "train_loss": 0.1948647116279602,
5
+ "train_runtime": 28413.61,
6
+ "train_samples_per_second": 7.039,
7
+ "train_steps_per_second": 0.11
8
+ }
checkpoints/InternVL3.5-8B-SFT/trainer_state.json ADDED
@@ -0,0 +1,2227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3125,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0032,
14
+ "grad_norm": 7.051180674009678,
15
+ "learning_rate": 1.437699680511182e-07,
16
+ "loss": 0.45998425483703614,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.0064,
21
+ "grad_norm": 6.386709802443142,
22
+ "learning_rate": 3.0351437699680514e-07,
23
+ "loss": 0.44952831268310545,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.0096,
28
+ "grad_norm": 5.3631010908380015,
29
+ "learning_rate": 4.6325878594249205e-07,
30
+ "loss": 0.3993690013885498,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.0128,
35
+ "grad_norm": 0.852155255839625,
36
+ "learning_rate": 6.230031948881789e-07,
37
+ "loss": 0.3118258237838745,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.016,
42
+ "grad_norm": 0.4705571475990448,
43
+ "learning_rate": 7.82747603833866e-07,
44
+ "loss": 0.2786674976348877,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.0192,
49
+ "grad_norm": 0.39720799855122535,
50
+ "learning_rate": 9.424920127795528e-07,
51
+ "loss": 0.2685645580291748,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.0224,
56
+ "grad_norm": 0.34144681090493506,
57
+ "learning_rate": 1.1022364217252397e-06,
58
+ "loss": 0.27388153076171873,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.0256,
63
+ "grad_norm": 0.29670665469044527,
64
+ "learning_rate": 1.2619808306709266e-06,
65
+ "loss": 0.2562382221221924,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.0288,
70
+ "grad_norm": 0.2721949763566226,
71
+ "learning_rate": 1.4217252396166134e-06,
72
+ "loss": 0.2521932125091553,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.032,
77
+ "grad_norm": 0.30509418891876505,
78
+ "learning_rate": 1.5814696485623005e-06,
79
+ "loss": 0.2553669214248657,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.0352,
84
+ "grad_norm": 0.2710599378904947,
85
+ "learning_rate": 1.7412140575079875e-06,
86
+ "loss": 0.2548961162567139,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.0384,
91
+ "grad_norm": 0.3180117403374185,
92
+ "learning_rate": 1.9009584664536742e-06,
93
+ "loss": 0.2442842960357666,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.0416,
98
+ "grad_norm": 0.2695352733592907,
99
+ "learning_rate": 2.060702875399361e-06,
100
+ "loss": 0.24766459465026855,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.0448,
105
+ "grad_norm": 0.3064535363854503,
106
+ "learning_rate": 2.220447284345048e-06,
107
+ "loss": 0.23845260143280028,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.048,
112
+ "grad_norm": 0.29068646435586043,
113
+ "learning_rate": 2.380191693290735e-06,
114
+ "loss": 0.23559024333953857,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.0512,
119
+ "grad_norm": 0.3186203915842237,
120
+ "learning_rate": 2.539936102236422e-06,
121
+ "loss": 0.23029537200927735,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.0544,
126
+ "grad_norm": 0.32754075011707046,
127
+ "learning_rate": 2.699680511182109e-06,
128
+ "loss": 0.2385089635848999,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.0576,
133
+ "grad_norm": 0.38484368515577855,
134
+ "learning_rate": 2.8594249201277955e-06,
135
+ "loss": 0.23111426830291748,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.0608,
140
+ "grad_norm": 0.299502856060473,
141
+ "learning_rate": 3.0191693290734825e-06,
142
+ "loss": 0.23530282974243164,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.064,
147
+ "grad_norm": 0.3078123289936782,
148
+ "learning_rate": 3.17891373801917e-06,
149
+ "loss": 0.23611860275268554,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.0672,
154
+ "grad_norm": 0.30717572422426626,
155
+ "learning_rate": 3.3386581469648564e-06,
156
+ "loss": 0.23241891860961914,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.0704,
161
+ "grad_norm": 0.30949630760689323,
162
+ "learning_rate": 3.4984025559105434e-06,
163
+ "loss": 0.2257370948791504,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.0736,
168
+ "grad_norm": 0.2734080768093611,
169
+ "learning_rate": 3.6581469648562303e-06,
170
+ "loss": 0.22820501327514647,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.0768,
175
+ "grad_norm": 0.31986420438753294,
176
+ "learning_rate": 3.817891373801918e-06,
177
+ "loss": 0.22324295043945314,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.08,
182
+ "grad_norm": 0.3271935835910018,
183
+ "learning_rate": 3.977635782747604e-06,
184
+ "loss": 0.22092509269714355,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.0832,
189
+ "grad_norm": 0.28164810489138675,
190
+ "learning_rate": 4.137380191693291e-06,
191
+ "loss": 0.22088565826416015,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.0864,
196
+ "grad_norm": 0.2806581165168549,
197
+ "learning_rate": 4.297124600638978e-06,
198
+ "loss": 0.2235860824584961,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.0896,
203
+ "grad_norm": 0.2818314341404028,
204
+ "learning_rate": 4.456869009584665e-06,
205
+ "loss": 0.21951718330383302,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.0928,
210
+ "grad_norm": 0.2755068214230404,
211
+ "learning_rate": 4.616613418530352e-06,
212
+ "loss": 0.22480430603027343,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.096,
217
+ "grad_norm": 0.2991295097090295,
218
+ "learning_rate": 4.776357827476039e-06,
219
+ "loss": 0.22600164413452148,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.0992,
224
+ "grad_norm": 0.3239664056294863,
225
+ "learning_rate": 4.936102236421725e-06,
226
+ "loss": 0.21372499465942382,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.1024,
231
+ "grad_norm": 0.2881723034602484,
232
+ "learning_rate": 4.999943833158769e-06,
233
+ "loss": 0.21513206958770753,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.1056,
238
+ "grad_norm": 0.27877645475403023,
239
+ "learning_rate": 4.999600600490783e-06,
240
+ "loss": 0.22072982788085938,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.1088,
245
+ "grad_norm": 0.28224550070191395,
246
+ "learning_rate": 4.9989453817439345e-06,
247
+ "loss": 0.2146312713623047,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.112,
252
+ "grad_norm": 0.26853026150431764,
253
+ "learning_rate": 4.997978258698942e-06,
254
+ "loss": 0.21449072360992433,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.1152,
259
+ "grad_norm": 0.2706003676564934,
260
+ "learning_rate": 4.996699352066659e-06,
261
+ "loss": 0.2151791572570801,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.1184,
266
+ "grad_norm": 0.28539700359373177,
267
+ "learning_rate": 4.995108821473014e-06,
268
+ "loss": 0.21470160484313966,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.1216,
273
+ "grad_norm": 0.29207494833659137,
274
+ "learning_rate": 4.993206865439084e-06,
275
+ "loss": 0.21086468696594238,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.1248,
280
+ "grad_norm": 0.263064572322246,
281
+ "learning_rate": 4.990993721356317e-06,
282
+ "loss": 0.20984139442443847,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.128,
287
+ "grad_norm": 0.2865097413347111,
288
+ "learning_rate": 4.988469665456901e-06,
289
+ "loss": 0.21040558815002441,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.1312,
294
+ "grad_norm": 0.2738592744136949,
295
+ "learning_rate": 4.985635012779288e-06,
296
+ "loss": 0.21828360557556153,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.1344,
301
+ "grad_norm": 0.24970618963972283,
302
+ "learning_rate": 4.98249011712887e-06,
303
+ "loss": 0.2106489658355713,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.1376,
308
+ "grad_norm": 0.2629431248486553,
309
+ "learning_rate": 4.979035371033824e-06,
310
+ "loss": 0.20979018211364747,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.1408,
315
+ "grad_norm": 0.2725505982701801,
316
+ "learning_rate": 4.975271205696115e-06,
317
+ "loss": 0.20948367118835448,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.144,
322
+ "grad_norm": 0.2704053444924022,
323
+ "learning_rate": 4.971198090937671e-06,
324
+ "loss": 0.2033768653869629,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.1472,
329
+ "grad_norm": 0.31765035973786815,
330
+ "learning_rate": 4.966816535141756e-06,
331
+ "loss": 0.20044360160827637,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.1504,
336
+ "grad_norm": 0.2502347867419884,
337
+ "learning_rate": 4.9621270851895035e-06,
338
+ "loss": 0.2100567102432251,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.1536,
343
+ "grad_norm": 0.2934932151321077,
344
+ "learning_rate": 4.957130326391662e-06,
345
+ "loss": 0.21090621948242189,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.1568,
350
+ "grad_norm": 0.26660410583968774,
351
+ "learning_rate": 4.951826882415544e-06,
352
+ "loss": 0.20775444507598878,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.16,
357
+ "grad_norm": 0.28519626596006936,
358
+ "learning_rate": 4.946217415207177e-06,
359
+ "loss": 0.20256528854370118,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.1632,
364
+ "grad_norm": 0.2798675045050625,
365
+ "learning_rate": 4.940302624908689e-06,
366
+ "loss": 0.20623595714569093,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.1664,
371
+ "grad_norm": 0.28222884808809434,
372
+ "learning_rate": 4.934083249770912e-06,
373
+ "loss": 0.20097856521606444,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.1696,
378
+ "grad_norm": 0.2788085638053828,
379
+ "learning_rate": 4.927560066061251e-06,
380
+ "loss": 0.20387496948242187,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.1728,
385
+ "grad_norm": 0.27554368272722524,
386
+ "learning_rate": 4.920733887966783e-06,
387
+ "loss": 0.21524934768676757,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.176,
392
+ "grad_norm": 0.26559833530971816,
393
+ "learning_rate": 4.913605567492636e-06,
394
+ "loss": 0.20402135848999023,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.1792,
399
+ "grad_norm": 0.26554772115650926,
400
+ "learning_rate": 4.906175994355656e-06,
401
+ "loss": 0.20598478317260743,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.1824,
406
+ "grad_norm": 0.3223429392292309,
407
+ "learning_rate": 4.898446095873345e-06,
408
+ "loss": 0.20747475624084472,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.1856,
413
+ "grad_norm": 0.24355730693567182,
414
+ "learning_rate": 4.890416836848128e-06,
415
+ "loss": 0.20512137413024903,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.1888,
420
+ "grad_norm": 0.3001767473938059,
421
+ "learning_rate": 4.882089219446925e-06,
422
+ "loss": 0.19992779493331908,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.192,
427
+ "grad_norm": 0.2835389086432711,
428
+ "learning_rate": 4.873464283076074e-06,
429
+ "loss": 0.20495295524597168,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.1952,
434
+ "grad_norm": 0.26019712508927473,
435
+ "learning_rate": 4.864543104251587e-06,
436
+ "loss": 0.2035728931427002,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.1984,
441
+ "grad_norm": 0.2657949563176517,
442
+ "learning_rate": 4.855326796464798e-06,
443
+ "loss": 0.20619282722473145,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.2016,
448
+ "grad_norm": 0.28295912792439204,
449
+ "learning_rate": 4.8458165100433725e-06,
450
+ "loss": 0.2016925811767578,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.2048,
455
+ "grad_norm": 0.2902924299127114,
456
+ "learning_rate": 4.836013432007738e-06,
457
+ "loss": 0.20164456367492675,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.208,
462
+ "grad_norm": 0.24256417788990398,
463
+ "learning_rate": 4.825918785922921e-06,
464
+ "loss": 0.20648303031921386,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.2112,
469
+ "grad_norm": 0.27122351891055063,
470
+ "learning_rate": 4.8155338317458315e-06,
471
+ "loss": 0.20356349945068358,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.2144,
476
+ "grad_norm": 0.2600569122055766,
477
+ "learning_rate": 4.804859865668002e-06,
478
+ "loss": 0.19959055185317992,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.2176,
483
+ "grad_norm": 0.25345624369635567,
484
+ "learning_rate": 4.793898219953804e-06,
485
+ "loss": 0.2007960557937622,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.2208,
490
+ "grad_norm": 0.2544929334444299,
491
+ "learning_rate": 4.782650262774164e-06,
492
+ "loss": 0.20300769805908203,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.224,
497
+ "grad_norm": 0.2897145189307127,
498
+ "learning_rate": 4.7711173980357886e-06,
499
+ "loss": 0.19880002737045288,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.2272,
504
+ "grad_norm": 0.2560542526589546,
505
+ "learning_rate": 4.759301065205947e-06,
506
+ "loss": 0.19960763454437255,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.2304,
511
+ "grad_norm": 0.3097914904022575,
512
+ "learning_rate": 4.7472027391328e-06,
513
+ "loss": 0.2003918170928955,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.2336,
518
+ "grad_norm": 0.2887809607696432,
519
+ "learning_rate": 4.734823929861317e-06,
520
+ "loss": 0.20292911529541016,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.2368,
525
+ "grad_norm": 0.28917619670340877,
526
+ "learning_rate": 4.722166182444801e-06,
527
+ "loss": 0.20004558563232422,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.24,
532
+ "grad_norm": 0.27043264841658887,
533
+ "learning_rate": 4.709231076752045e-06,
534
+ "loss": 0.19843683242797852,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.2432,
539
+ "grad_norm": 0.2645651727770741,
540
+ "learning_rate": 4.696020227270142e-06,
541
+ "loss": 0.20258240699768065,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.2464,
546
+ "grad_norm": 0.2777282429222742,
547
+ "learning_rate": 4.6825352829029705e-06,
548
+ "loss": 0.1994302749633789,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.2496,
553
+ "grad_norm": 0.28182340391383837,
554
+ "learning_rate": 4.668777926765392e-06,
555
+ "loss": 0.197939932346344,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.2528,
560
+ "grad_norm": 0.2390403179508666,
561
+ "learning_rate": 4.6547498759731725e-06,
562
+ "loss": 0.19328031539916993,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.256,
567
+ "grad_norm": 0.30761446053746666,
568
+ "learning_rate": 4.6404528814286575e-06,
569
+ "loss": 0.1962287425994873,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.2592,
574
+ "grad_norm": 0.26058296777263723,
575
+ "learning_rate": 4.6258887276022425e-06,
576
+ "loss": 0.20304152965545655,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.2624,
581
+ "grad_norm": 0.3023946784650888,
582
+ "learning_rate": 4.611059232309639e-06,
583
+ "loss": 0.19789116382598876,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.2656,
588
+ "grad_norm": 0.28736962727648746,
589
+ "learning_rate": 4.595966246484986e-06,
590
+ "loss": 0.19968997240066527,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.2688,
595
+ "grad_norm": 0.28571881200537336,
596
+ "learning_rate": 4.580611653949829e-06,
597
+ "loss": 0.20007586479187012,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.272,
602
+ "grad_norm": 0.295019179491335,
603
+ "learning_rate": 4.564997371177992e-06,
604
+ "loss": 0.19763822555541993,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.2752,
609
+ "grad_norm": 0.29653404936460237,
610
+ "learning_rate": 4.54912534705637e-06,
611
+ "loss": 0.19755616188049316,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.2784,
616
+ "grad_norm": 0.2642449071502374,
617
+ "learning_rate": 4.532997562641683e-06,
618
+ "loss": 0.19439829587936402,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.2816,
623
+ "grad_norm": 0.25657475126133233,
624
+ "learning_rate": 4.516616030913214e-06,
625
+ "loss": 0.1987127423286438,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.2848,
630
+ "grad_norm": 0.28458590654874555,
631
+ "learning_rate": 4.499982796521556e-06,
632
+ "loss": 0.19352295398712158,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.288,
637
+ "grad_norm": 0.2793448530701338,
638
+ "learning_rate": 4.48309993553341e-06,
639
+ "loss": 0.1959349274635315,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.2912,
644
+ "grad_norm": 0.3163250873932861,
645
+ "learning_rate": 4.465969555172468e-06,
646
+ "loss": 0.1957021713256836,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.2944,
651
+ "grad_norm": 0.2933329400631374,
652
+ "learning_rate": 4.448593793556391e-06,
653
+ "loss": 0.20156097412109375,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.2976,
658
+ "grad_norm": 0.2688085579058971,
659
+ "learning_rate": 4.430974819429954e-06,
660
+ "loss": 0.1948945164680481,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.3008,
665
+ "grad_norm": 0.28553708068341715,
666
+ "learning_rate": 4.413114831894344e-06,
667
+ "loss": 0.18995710611343383,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.304,
672
+ "grad_norm": 0.26518275753825254,
673
+ "learning_rate": 4.3950160601326865e-06,
674
+ "loss": 0.18871839046478273,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.3072,
679
+ "grad_norm": 0.28692003913342795,
680
+ "learning_rate": 4.376680763131811e-06,
681
+ "loss": 0.19533849954605104,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.3104,
686
+ "grad_norm": 0.27227233815166896,
687
+ "learning_rate": 4.358111229400296e-06,
688
+ "loss": 0.19751427173614503,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.3136,
693
+ "grad_norm": 0.27245831220598377,
694
+ "learning_rate": 4.33930977668283e-06,
695
+ "loss": 0.20111453533172607,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.3168,
700
+ "grad_norm": 0.2482632152661181,
701
+ "learning_rate": 4.320278751670922e-06,
702
+ "loss": 0.19406617879867555,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.32,
707
+ "grad_norm": 0.2892442073812178,
708
+ "learning_rate": 4.301020529710009e-06,
709
+ "loss": 0.19525597095489503,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.3232,
714
+ "grad_norm": 0.26392559431034407,
715
+ "learning_rate": 4.281537514502962e-06,
716
+ "loss": 0.19918107986450195,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.3264,
721
+ "grad_norm": 0.27003912401002855,
722
+ "learning_rate": 4.261832137810093e-06,
723
+ "loss": 0.1964997172355652,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.3296,
728
+ "grad_norm": 0.2664017566726753,
729
+ "learning_rate": 4.241906859145611e-06,
730
+ "loss": 0.19660145044326782,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.3328,
735
+ "grad_norm": 0.2744161118643581,
736
+ "learning_rate": 4.221764165470661e-06,
737
+ "loss": 0.1935626745223999,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.336,
742
+ "grad_norm": 0.2717693030089869,
743
+ "learning_rate": 4.201406570882898e-06,
744
+ "loss": 0.19286205768585205,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.3392,
749
+ "grad_norm": 0.259292524653773,
750
+ "learning_rate": 4.180836616302704e-06,
751
+ "loss": 0.1922353148460388,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.3424,
756
+ "grad_norm": 0.2739674960468982,
757
+ "learning_rate": 4.160056869156041e-06,
758
+ "loss": 0.19553282260894775,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.3456,
763
+ "grad_norm": 0.272965837223612,
764
+ "learning_rate": 4.139069923053995e-06,
765
+ "loss": 0.19367674589157105,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.3488,
770
+ "grad_norm": 0.2463436566122966,
771
+ "learning_rate": 4.117878397469062e-06,
772
+ "loss": 0.19772920608520508,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.352,
777
+ "grad_norm": 0.24672019869428047,
778
+ "learning_rate": 4.096484937408195e-06,
779
+ "loss": 0.1892393112182617,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.3552,
784
+ "grad_norm": 0.2673060417093708,
785
+ "learning_rate": 4.074892213082676e-06,
786
+ "loss": 0.1892371416091919,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.3584,
791
+ "grad_norm": 0.26767314750680543,
792
+ "learning_rate": 4.0531029195748265e-06,
793
+ "loss": 0.19717614650726317,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.3616,
798
+ "grad_norm": 0.2796524343786416,
799
+ "learning_rate": 4.03111977650163e-06,
800
+ "loss": 0.19503848552703856,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.3648,
805
+ "grad_norm": 0.2816284710404393,
806
+ "learning_rate": 4.008945527675281e-06,
807
+ "loss": 0.19529366493225098,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.368,
812
+ "grad_norm": 0.31949481569871324,
813
+ "learning_rate": 3.986582940760717e-06,
814
+ "loss": 0.18451136350631714,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.3712,
819
+ "grad_norm": 0.2723449306170863,
820
+ "learning_rate": 3.9640348069301785e-06,
821
+ "loss": 0.191510009765625,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.3744,
826
+ "grad_norm": 0.27747112521567696,
827
+ "learning_rate": 3.941303940514826e-06,
828
+ "loss": 0.19263410568237305,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.3776,
833
+ "grad_norm": 0.2719099807762723,
834
+ "learning_rate": 3.918393178653472e-06,
835
+ "loss": 0.19341590404510497,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.3808,
840
+ "grad_norm": 0.29074805846664115,
841
+ "learning_rate": 3.895305380938468e-06,
842
+ "loss": 0.19099385738372804,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.384,
847
+ "grad_norm": 0.2517462589595264,
848
+ "learning_rate": 3.872043429058783e-06,
849
+ "loss": 0.18874506950378417,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.3872,
854
+ "grad_norm": 0.2591827841853763,
855
+ "learning_rate": 3.84861022644033e-06,
856
+ "loss": 0.19069148302078248,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.3904,
861
+ "grad_norm": 0.2702770742629986,
862
+ "learning_rate": 3.825008697883574e-06,
863
+ "loss": 0.19928838014602662,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.3936,
868
+ "grad_norm": 0.27788866885326635,
869
+ "learning_rate": 3.8012417891984776e-06,
870
+ "loss": 0.19237933158874512,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.3968,
875
+ "grad_norm": 0.2656255469668472,
876
+ "learning_rate": 3.777312466836819e-06,
877
+ "loss": 0.19055767059326173,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.4,
882
+ "grad_norm": 0.28446496354107703,
883
+ "learning_rate": 3.7532237175219378e-06,
884
+ "loss": 0.18940582275390624,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.4032,
889
+ "grad_norm": 0.4152862546777316,
890
+ "learning_rate": 3.728978547875948e-06,
891
+ "loss": 0.19362914562225342,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.4064,
896
+ "grad_norm": 0.28537432061728957,
897
+ "learning_rate": 3.7045799840444712e-06,
898
+ "loss": 0.1886904716491699,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.4096,
903
+ "grad_norm": 0.29038310854731697,
904
+ "learning_rate": 3.6800310713189258e-06,
905
+ "loss": 0.18923617601394654,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.4128,
910
+ "grad_norm": 0.32132086585692904,
911
+ "learning_rate": 3.6553348737564328e-06,
912
+ "loss": 0.19005811214447021,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.416,
917
+ "grad_norm": 0.2669423345384319,
918
+ "learning_rate": 3.6304944737973794e-06,
919
+ "loss": 0.19575085639953613,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.4192,
924
+ "grad_norm": 0.28931030301965927,
925
+ "learning_rate": 3.6055129718806836e-06,
926
+ "loss": 0.18975239992141724,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.4224,
931
+ "grad_norm": 0.28948269391746034,
932
+ "learning_rate": 3.5803934860568134e-06,
933
+ "loss": 0.18510067462921143,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.4256,
938
+ "grad_norm": 0.2821484963772758,
939
+ "learning_rate": 3.5551391515986163e-06,
940
+ "loss": 0.1907583475112915,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.4288,
945
+ "grad_norm": 0.27423888046510925,
946
+ "learning_rate": 3.529753120609982e-06,
947
+ "loss": 0.18690071105957032,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.432,
952
+ "grad_norm": 0.30811658453814883,
953
+ "learning_rate": 3.5042385616324243e-06,
954
+ "loss": 0.19000139236450195,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.4352,
959
+ "grad_norm": 0.24402420223179272,
960
+ "learning_rate": 3.4785986592495934e-06,
961
+ "loss": 0.18803791999816893,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.4384,
966
+ "grad_norm": 0.24576039119812526,
967
+ "learning_rate": 3.452836613689803e-06,
968
+ "loss": 0.1866163969039917,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.4416,
973
+ "grad_norm": 0.2949022587874532,
974
+ "learning_rate": 3.426955640426584e-06,
975
+ "loss": 0.1890486001968384,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.4448,
980
+ "grad_norm": 0.2582182081996982,
981
+ "learning_rate": 3.4009589697773605e-06,
982
+ "loss": 0.18851635456085206,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.448,
987
+ "grad_norm": 0.2722482128131903,
988
+ "learning_rate": 3.3748498465002475e-06,
989
+ "loss": 0.18554195165634155,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.4512,
994
+ "grad_norm": 0.27484686642107964,
995
+ "learning_rate": 3.3486315293890693e-06,
996
+ "loss": 0.19425587654113768,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.4544,
1001
+ "grad_norm": 0.28258316073925427,
1002
+ "learning_rate": 3.3223072908666053e-06,
1003
+ "loss": 0.1843653440475464,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.4576,
1008
+ "grad_norm": 0.28555979247115143,
1009
+ "learning_rate": 3.295880416576153e-06,
1010
+ "loss": 0.1941524863243103,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.4608,
1015
+ "grad_norm": 0.2969010932820601,
1016
+ "learning_rate": 3.269354204971427e-06,
1017
+ "loss": 0.18759560585021973,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.464,
1022
+ "grad_norm": 0.30795851200957197,
1023
+ "learning_rate": 3.242731966904865e-06,
1024
+ "loss": 0.18544803857803344,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.4672,
1029
+ "grad_norm": 0.28527072571260903,
1030
+ "learning_rate": 3.2160170252143913e-06,
1031
+ "loss": 0.18547136783599855,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 0.4704,
1036
+ "grad_norm": 0.2533866816866613,
1037
+ "learning_rate": 3.1892127143086716e-06,
1038
+ "loss": 0.19228132963180541,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 0.4736,
1043
+ "grad_norm": 0.2776942873045479,
1044
+ "learning_rate": 3.1623223797509347e-06,
1045
+ "loss": 0.1812342882156372,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 0.4768,
1050
+ "grad_norm": 0.2744584915099732,
1051
+ "learning_rate": 3.135349377841396e-06,
1052
+ "loss": 0.1853887915611267,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 0.48,
1057
+ "grad_norm": 0.2866639604297882,
1058
+ "learning_rate": 3.1082970751983497e-06,
1059
+ "loss": 0.1918737769126892,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.4832,
1064
+ "grad_norm": 0.26310322890713356,
1065
+ "learning_rate": 3.0811688483379546e-06,
1066
+ "loss": 0.18995790481567382,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.4864,
1071
+ "grad_norm": 0.28320054398109096,
1072
+ "learning_rate": 3.0539680832528074e-06,
1073
+ "loss": 0.18962399959564208,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.4896,
1078
+ "grad_norm": 0.2654570443815982,
1079
+ "learning_rate": 3.026698174989316e-06,
1080
+ "loss": 0.18734774589538575,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.4928,
1085
+ "grad_norm": 0.2658181920127404,
1086
+ "learning_rate": 2.999362527223952e-06,
1087
+ "loss": 0.1873406410217285,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.496,
1092
+ "grad_norm": 0.29250213703445505,
1093
+ "learning_rate": 2.9719645518384194e-06,
1094
+ "loss": 0.1892526626586914,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.4992,
1099
+ "grad_norm": 0.3090995402302473,
1100
+ "learning_rate": 2.944507668493807e-06,
1101
+ "loss": 0.19257349967956544,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.5024,
1106
+ "grad_norm": 0.28272052629438726,
1107
+ "learning_rate": 2.9169953042037623e-06,
1108
+ "loss": 0.18868753910064698,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.5056,
1113
+ "grad_norm": 0.3954198531333443,
1114
+ "learning_rate": 2.889430892906754e-06,
1115
+ "loss": 0.18459179401397705,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.5088,
1120
+ "grad_norm": 0.2563261821009193,
1121
+ "learning_rate": 2.861817875037462e-06,
1122
+ "loss": 0.18163517713546753,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.512,
1127
+ "grad_norm": 0.28115388072993086,
1128
+ "learning_rate": 2.8341596970973683e-06,
1129
+ "loss": 0.19087796211242675,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.5152,
1134
+ "grad_norm": 0.27079102831839946,
1135
+ "learning_rate": 2.80645981122458e-06,
1136
+ "loss": 0.1863863706588745,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.5184,
1141
+ "grad_norm": 0.27596423249252744,
1142
+ "learning_rate": 2.7787216747629508e-06,
1143
+ "loss": 0.19303735494613647,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.5216,
1148
+ "grad_norm": 0.2682301223547138,
1149
+ "learning_rate": 2.7509487498305615e-06,
1150
+ "loss": 0.18045294284820557,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.5248,
1155
+ "grad_norm": 0.27817197846381203,
1156
+ "learning_rate": 2.7231445028875924e-06,
1157
+ "loss": 0.18653267621994019,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.528,
1162
+ "grad_norm": 0.25176165708531945,
1163
+ "learning_rate": 2.6953124043036604e-06,
1164
+ "loss": 0.18530716896057128,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.5312,
1169
+ "grad_norm": 0.272299195118528,
1170
+ "learning_rate": 2.667455927924667e-06,
1171
+ "loss": 0.18495219945907593,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.5344,
1176
+ "grad_norm": 0.26513870922757315,
1177
+ "learning_rate": 2.6395785506392164e-06,
1178
+ "loss": 0.18016864061355592,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.5376,
1183
+ "grad_norm": 0.26899577641448663,
1184
+ "learning_rate": 2.6116837519446407e-06,
1185
+ "loss": 0.18437364101409912,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.5408,
1190
+ "grad_norm": 0.29589553270345376,
1191
+ "learning_rate": 2.5837750135127192e-06,
1192
+ "loss": 0.18141529560089112,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.544,
1197
+ "grad_norm": 0.28180995392351926,
1198
+ "learning_rate": 2.555855818755108e-06,
1199
+ "loss": 0.18680166006088256,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.5472,
1204
+ "grad_norm": 0.29608650413456306,
1205
+ "learning_rate": 2.5279296523885636e-06,
1206
+ "loss": 0.18486298322677613,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.5504,
1211
+ "grad_norm": 0.28475957723655715,
1212
+ "learning_rate": 2.5e-06,
1213
+ "loss": 0.1850725531578064,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.5536,
1218
+ "grad_norm": 0.27856833997611247,
1219
+ "learning_rate": 2.472070347611437e-06,
1220
+ "loss": 0.1791991949081421,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.5568,
1225
+ "grad_norm": 0.30516489860119894,
1226
+ "learning_rate": 2.444144181244893e-06,
1227
+ "loss": 0.18483606576919556,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.56,
1232
+ "grad_norm": 0.29804656625996045,
1233
+ "learning_rate": 2.416224986487282e-06,
1234
+ "loss": 0.18195321559906005,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.5632,
1239
+ "grad_norm": 0.30740179095263215,
1240
+ "learning_rate": 2.3883162480553605e-06,
1241
+ "loss": 0.17964634895324708,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.5664,
1246
+ "grad_norm": 0.29672245353605753,
1247
+ "learning_rate": 2.3604214493607844e-06,
1248
+ "loss": 0.18308933973312377,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.5696,
1253
+ "grad_norm": 0.2837212145176832,
1254
+ "learning_rate": 2.332544072075333e-06,
1255
+ "loss": 0.18688681125640869,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.5728,
1260
+ "grad_norm": 0.28451872958084823,
1261
+ "learning_rate": 2.30468759569634e-06,
1262
+ "loss": 0.18532857894897461,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.576,
1267
+ "grad_norm": 0.29734825652467917,
1268
+ "learning_rate": 2.276855497112408e-06,
1269
+ "loss": 0.18262310028076173,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.5792,
1274
+ "grad_norm": 0.3012944650683003,
1275
+ "learning_rate": 2.2490512501694394e-06,
1276
+ "loss": 0.17781586647033693,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.5824,
1281
+ "grad_norm": 0.2692920477116042,
1282
+ "learning_rate": 2.2212783252370496e-06,
1283
+ "loss": 0.18318163156509398,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.5856,
1288
+ "grad_norm": 0.2700619255739624,
1289
+ "learning_rate": 2.1935401887754213e-06,
1290
+ "loss": 0.18857367038726808,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.5888,
1295
+ "grad_norm": 0.2868516489290536,
1296
+ "learning_rate": 2.165840302902632e-06,
1297
+ "loss": 0.18190672397613525,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.592,
1302
+ "grad_norm": 0.28726300225812107,
1303
+ "learning_rate": 2.1381821249625383e-06,
1304
+ "loss": 0.1867521286010742,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.5952,
1309
+ "grad_norm": 0.2995145996099388,
1310
+ "learning_rate": 2.1105691070932465e-06,
1311
+ "loss": 0.17851842641830445,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.5984,
1316
+ "grad_norm": 0.28575212768410063,
1317
+ "learning_rate": 2.083004695796238e-06,
1318
+ "loss": 0.17741835117340088,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.6016,
1323
+ "grad_norm": 0.31284763297048707,
1324
+ "learning_rate": 2.055492331506194e-06,
1325
+ "loss": 0.1843113422393799,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.6048,
1330
+ "grad_norm": 0.3170666816206652,
1331
+ "learning_rate": 2.0280354481615814e-06,
1332
+ "loss": 0.18248820304870605,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.608,
1337
+ "grad_norm": 0.30950907311465886,
1338
+ "learning_rate": 2.000637472776049e-06,
1339
+ "loss": 0.1839754819869995,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.6112,
1344
+ "grad_norm": 0.2536972696685391,
1345
+ "learning_rate": 1.973301825010685e-06,
1346
+ "loss": 0.17841637134552002,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.6144,
1351
+ "grad_norm": 0.291862692607901,
1352
+ "learning_rate": 1.9460319167471934e-06,
1353
+ "loss": 0.18339977264404297,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.6176,
1358
+ "grad_norm": 0.2848109477155621,
1359
+ "learning_rate": 1.9188311516620466e-06,
1360
+ "loss": 0.17915148735046388,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.6208,
1365
+ "grad_norm": 0.3060077712638729,
1366
+ "learning_rate": 1.891702924801651e-06,
1367
+ "loss": 0.1848907709121704,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.624,
1372
+ "grad_norm": 0.27297816434517674,
1373
+ "learning_rate": 1.864650622158604e-06,
1374
+ "loss": 0.18888840675354004,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.6272,
1379
+ "grad_norm": 0.2781302448691454,
1380
+ "learning_rate": 1.8376776202490666e-06,
1381
+ "loss": 0.1847243309020996,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.6304,
1386
+ "grad_norm": 0.31527749144779466,
1387
+ "learning_rate": 1.8107872856913293e-06,
1388
+ "loss": 0.17888798713684081,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.6336,
1393
+ "grad_norm": 0.2981389294211551,
1394
+ "learning_rate": 1.7839829747856096e-06,
1395
+ "loss": 0.18081605434417725,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.6368,
1400
+ "grad_norm": 0.29438595992497246,
1401
+ "learning_rate": 1.7572680330951359e-06,
1402
+ "loss": 0.17975808382034303,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.64,
1407
+ "grad_norm": 0.2777422843592099,
1408
+ "learning_rate": 1.7306457950285747e-06,
1409
+ "loss": 0.1812159538269043,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.6432,
1414
+ "grad_norm": 0.3068388373590525,
1415
+ "learning_rate": 1.704119583423848e-06,
1416
+ "loss": 0.17536230087280275,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.6464,
1421
+ "grad_norm": 0.272885194568128,
1422
+ "learning_rate": 1.677692709133396e-06,
1423
+ "loss": 0.18365554809570311,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.6496,
1428
+ "grad_norm": 0.3023336412584975,
1429
+ "learning_rate": 1.6513684706109311e-06,
1430
+ "loss": 0.18368566036224365,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.6528,
1435
+ "grad_norm": 0.28988866387653284,
1436
+ "learning_rate": 1.6251501534997529e-06,
1437
+ "loss": 0.18175660371780394,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.656,
1442
+ "grad_norm": 0.28123365590903454,
1443
+ "learning_rate": 1.5990410302226405e-06,
1444
+ "loss": 0.17483808994293212,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.6592,
1449
+ "grad_norm": 0.28187049939921544,
1450
+ "learning_rate": 1.5730443595734162e-06,
1451
+ "loss": 0.18124582767486572,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.6624,
1456
+ "grad_norm": 0.31643189708694724,
1457
+ "learning_rate": 1.5471633863101982e-06,
1458
+ "loss": 0.18188211917877198,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.6656,
1463
+ "grad_norm": 0.3071146379480691,
1464
+ "learning_rate": 1.521401340750407e-06,
1465
+ "loss": 0.18458983898162842,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.6688,
1470
+ "grad_norm": 0.30923765962914507,
1471
+ "learning_rate": 1.495761438367577e-06,
1472
+ "loss": 0.18291953802108765,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.672,
1477
+ "grad_norm": 0.31506268222239586,
1478
+ "learning_rate": 1.4702468793900187e-06,
1479
+ "loss": 0.18112607002258302,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 0.6752,
1484
+ "grad_norm": 0.2991031913192095,
1485
+ "learning_rate": 1.444860848401384e-06,
1486
+ "loss": 0.18132129907608033,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 0.6784,
1491
+ "grad_norm": 0.306957825954438,
1492
+ "learning_rate": 1.4196065139431866e-06,
1493
+ "loss": 0.18091821670532227,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 0.6816,
1498
+ "grad_norm": 0.30984784981623864,
1499
+ "learning_rate": 1.3944870281193178e-06,
1500
+ "loss": 0.17975277900695802,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 0.6848,
1505
+ "grad_norm": 0.33685631116321924,
1506
+ "learning_rate": 1.3695055262026208e-06,
1507
+ "loss": 0.18606040477752686,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 0.688,
1512
+ "grad_norm": 0.28362188085343176,
1513
+ "learning_rate": 1.3446651262435679e-06,
1514
+ "loss": 0.17845985889434815,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 0.6912,
1519
+ "grad_norm": 0.28046286761312267,
1520
+ "learning_rate": 1.3199689286810746e-06,
1521
+ "loss": 0.18048195838928222,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 0.6944,
1526
+ "grad_norm": 0.29900090645940436,
1527
+ "learning_rate": 1.2954200159555294e-06,
1528
+ "loss": 0.17538446187973022,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 0.6976,
1533
+ "grad_norm": 0.32576508972663926,
1534
+ "learning_rate": 1.2710214521240527e-06,
1535
+ "loss": 0.18001599311828614,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 0.7008,
1540
+ "grad_norm": 0.30869890145158635,
1541
+ "learning_rate": 1.246776282478063e-06,
1542
+ "loss": 0.18135268688201905,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 0.704,
1547
+ "grad_norm": 0.28612747319198,
1548
+ "learning_rate": 1.222687533163181e-06,
1549
+ "loss": 0.18038851022720337,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 0.7072,
1554
+ "grad_norm": 0.32303440375726766,
1555
+ "learning_rate": 1.1987582108015228e-06,
1556
+ "loss": 0.18109045028686524,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 0.7104,
1561
+ "grad_norm": 0.3093047688685527,
1562
+ "learning_rate": 1.1749913021164255e-06,
1563
+ "loss": 0.18254566192626953,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 0.7136,
1568
+ "grad_norm": 0.2882548432858515,
1569
+ "learning_rate": 1.1513897735596702e-06,
1570
+ "loss": 0.17732615470886232,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 0.7168,
1575
+ "grad_norm": 0.29445166285798274,
1576
+ "learning_rate": 1.127956570941218e-06,
1577
+ "loss": 0.17425966262817383,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 0.72,
1582
+ "grad_norm": 0.3514589237334647,
1583
+ "learning_rate": 1.104694619061533e-06,
1584
+ "loss": 0.18296418190002442,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.7232,
1589
+ "grad_norm": 0.32323021290499837,
1590
+ "learning_rate": 1.0816068213465295e-06,
1591
+ "loss": 0.1851881265640259,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.7264,
1596
+ "grad_norm": 0.30421571681673176,
1597
+ "learning_rate": 1.0586960594851762e-06,
1598
+ "loss": 0.180436372756958,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.7296,
1603
+ "grad_norm": 0.31911631321578676,
1604
+ "learning_rate": 1.0359651930698217e-06,
1605
+ "loss": 0.17929892539978026,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.7328,
1610
+ "grad_norm": 0.30015899620754233,
1611
+ "learning_rate": 1.0134170592392837e-06,
1612
+ "loss": 0.18022915124893188,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.736,
1617
+ "grad_norm": 0.31786084969492157,
1618
+ "learning_rate": 9.910544723247204e-07,
1619
+ "loss": 0.17959039211273192,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.7392,
1624
+ "grad_norm": 0.31599364626026827,
1625
+ "learning_rate": 9.688802234983706e-07,
1626
+ "loss": 0.17806137800216676,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.7424,
1631
+ "grad_norm": 0.3303243768736776,
1632
+ "learning_rate": 9.468970804251742e-07,
1633
+ "loss": 0.1811964988708496,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.7456,
1638
+ "grad_norm": 0.3312986961423255,
1639
+ "learning_rate": 9.251077869173244e-07,
1640
+ "loss": 0.17583439350128174,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.7488,
1645
+ "grad_norm": 0.30030412592967864,
1646
+ "learning_rate": 9.035150625918054e-07,
1647
+ "loss": 0.17623555660247803,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.752,
1652
+ "grad_norm": 0.3177646626866783,
1653
+ "learning_rate": 8.821216025309395e-07,
1654
+ "loss": 0.18003884553909302,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.7552,
1659
+ "grad_norm": 0.3012142976429357,
1660
+ "learning_rate": 8.609300769460055e-07,
1661
+ "loss": 0.17543296813964843,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.7584,
1666
+ "grad_norm": 0.3177168816443014,
1667
+ "learning_rate": 8.399431308439592e-07,
1668
+ "loss": 0.18021781444549562,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.7616,
1673
+ "grad_norm": 0.34248252589513506,
1674
+ "learning_rate": 8.191633836972962e-07,
1675
+ "loss": 0.18125417232513427,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.7648,
1680
+ "grad_norm": 0.29292480325152365,
1681
+ "learning_rate": 7.985934291171024e-07,
1682
+ "loss": 0.17757056951522826,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.768,
1687
+ "grad_norm": 0.3257764859746147,
1688
+ "learning_rate": 7.7823583452934e-07,
1689
+ "loss": 0.18096057176589966,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.7712,
1694
+ "grad_norm": 0.28892062916284306,
1695
+ "learning_rate": 7.58093140854389e-07,
1696
+ "loss": 0.18015010356903077,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.7744,
1701
+ "grad_norm": 0.32360358107292697,
1702
+ "learning_rate": 7.381678621899077e-07,
1703
+ "loss": 0.18067935705184937,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.7776,
1708
+ "grad_norm": 0.3139428787829718,
1709
+ "learning_rate": 7.184624854970379e-07,
1710
+ "loss": 0.1768512487411499,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.7808,
1715
+ "grad_norm": 0.3182311104789415,
1716
+ "learning_rate": 6.989794702899932e-07,
1717
+ "loss": 0.17589566707611085,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.784,
1722
+ "grad_norm": 0.3112954733861784,
1723
+ "learning_rate": 6.797212483290777e-07,
1724
+ "loss": 0.177903413772583,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.7872,
1729
+ "grad_norm": 0.31026727362843554,
1730
+ "learning_rate": 6.60690223317171e-07,
1731
+ "loss": 0.17535500526428222,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.7904,
1736
+ "grad_norm": 0.2855504901999764,
1737
+ "learning_rate": 6.418887705997046e-07,
1738
+ "loss": 0.1787285327911377,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.7936,
1743
+ "grad_norm": 0.33581031525319194,
1744
+ "learning_rate": 6.23319236868189e-07,
1745
+ "loss": 0.181508469581604,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.7968,
1750
+ "grad_norm": 0.30084134655605693,
1751
+ "learning_rate": 6.049839398673141e-07,
1752
+ "loss": 0.18244649171829225,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.8,
1757
+ "grad_norm": 0.3207759323449182,
1758
+ "learning_rate": 5.868851681056567e-07,
1759
+ "loss": 0.18296375274658203,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.8032,
1764
+ "grad_norm": 0.3103299858846911,
1765
+ "learning_rate": 5.690251805700467e-07,
1766
+ "loss": 0.18089601993560792,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.8064,
1771
+ "grad_norm": 0.3310470653200237,
1772
+ "learning_rate": 5.514062064436096e-07,
1773
+ "loss": 0.1829407334327698,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.8096,
1778
+ "grad_norm": 0.31783823046596615,
1779
+ "learning_rate": 5.34030444827533e-07,
1780
+ "loss": 0.17886234521865846,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.8128,
1785
+ "grad_norm": 0.3279151171862584,
1786
+ "learning_rate": 5.169000644665895e-07,
1787
+ "loss": 0.17618993520736695,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.816,
1792
+ "grad_norm": 0.3006249030100123,
1793
+ "learning_rate": 5.000172034784442e-07,
1794
+ "loss": 0.17779455184936524,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.8192,
1799
+ "grad_norm": 0.3164261324675526,
1800
+ "learning_rate": 4.833839690867853e-07,
1801
+ "loss": 0.18002912998199463,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.8224,
1806
+ "grad_norm": 0.31374931318878396,
1807
+ "learning_rate": 4.6700243735831705e-07,
1808
+ "loss": 0.173567795753479,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.8256,
1813
+ "grad_norm": 0.31170459979916293,
1814
+ "learning_rate": 4.508746529436311e-07,
1815
+ "loss": 0.1724323034286499,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.8288,
1820
+ "grad_norm": 0.3080863565290302,
1821
+ "learning_rate": 4.350026288220083e-07,
1822
+ "loss": 0.1794981598854065,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.832,
1827
+ "grad_norm": 0.30618951989415283,
1828
+ "learning_rate": 4.1938834605017133e-07,
1829
+ "loss": 0.1761255979537964,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.8352,
1834
+ "grad_norm": 0.3029510706797137,
1835
+ "learning_rate": 4.0403375351501515e-07,
1836
+ "loss": 0.17623082399368287,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.8384,
1841
+ "grad_norm": 0.336336912959277,
1842
+ "learning_rate": 3.88940767690362e-07,
1843
+ "loss": 0.1757615327835083,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.8416,
1848
+ "grad_norm": 0.32859024308656015,
1849
+ "learning_rate": 3.7411127239775774e-07,
1850
+ "loss": 0.17455869913101196,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.8448,
1855
+ "grad_norm": 0.3174124959768476,
1856
+ "learning_rate": 3.595471185713431e-07,
1857
+ "loss": 0.17312180995941162,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.848,
1862
+ "grad_norm": 0.3247217043719523,
1863
+ "learning_rate": 3.4525012402682826e-07,
1864
+ "loss": 0.17421470880508422,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.8512,
1869
+ "grad_norm": 0.3290462164412991,
1870
+ "learning_rate": 3.3122207323460804e-07,
1871
+ "loss": 0.17708632946014405,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.8544,
1876
+ "grad_norm": 0.3024938333869805,
1877
+ "learning_rate": 3.1746471709702963e-07,
1878
+ "loss": 0.17333836555480958,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.8576,
1883
+ "grad_norm": 0.32678703604131465,
1884
+ "learning_rate": 3.039797727298585e-07,
1885
+ "loss": 0.1801586151123047,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.8608,
1890
+ "grad_norm": 0.32985764106850785,
1891
+ "learning_rate": 2.9076892324795546e-07,
1892
+ "loss": 0.17783432006835936,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.864,
1897
+ "grad_norm": 0.31242585953952057,
1898
+ "learning_rate": 2.778338175551995e-07,
1899
+ "loss": 0.17357670068740844,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.8672,
1904
+ "grad_norm": 0.3220012856306909,
1905
+ "learning_rate": 2.6517607013868326e-07,
1906
+ "loss": 0.18131563663482667,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.8704,
1911
+ "grad_norm": 0.33350326064348024,
1912
+ "learning_rate": 2.527972608672002e-07,
1913
+ "loss": 0.17757024765014648,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.8736,
1918
+ "grad_norm": 0.335919926946263,
1919
+ "learning_rate": 2.40698934794053e-07,
1920
+ "loss": 0.17683808803558348,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.8768,
1925
+ "grad_norm": 0.3209912976041497,
1926
+ "learning_rate": 2.2888260196421237e-07,
1927
+ "loss": 0.17635661363601685,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.88,
1932
+ "grad_norm": 0.3165955269677658,
1933
+ "learning_rate": 2.1734973722583735e-07,
1934
+ "loss": 0.17913974523544313,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.8832,
1939
+ "grad_norm": 0.31474674596852353,
1940
+ "learning_rate": 2.0610178004619564e-07,
1941
+ "loss": 0.17095563411712647,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.8864,
1946
+ "grad_norm": 0.305115903859637,
1947
+ "learning_rate": 1.9514013433199834e-07,
1948
+ "loss": 0.18293533325195313,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.8896,
1953
+ "grad_norm": 0.3164297745100823,
1954
+ "learning_rate": 1.8446616825416958e-07,
1955
+ "loss": 0.18138229846954346,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.8928,
1960
+ "grad_norm": 0.3526140625065779,
1961
+ "learning_rate": 1.7408121407708007e-07,
1962
+ "loss": 0.18163397312164306,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.896,
1967
+ "grad_norm": 0.3224933819196559,
1968
+ "learning_rate": 1.6398656799226253e-07,
1969
+ "loss": 0.1705089807510376,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.8992,
1974
+ "grad_norm": 0.31764589677400257,
1975
+ "learning_rate": 1.5418348995662773e-07,
1976
+ "loss": 0.17652597427368164,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.9024,
1981
+ "grad_norm": 0.3414067132784035,
1982
+ "learning_rate": 1.4467320353520275e-07,
1983
+ "loss": 0.17487871646881104,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.9056,
1988
+ "grad_norm": 0.3138098972679996,
1989
+ "learning_rate": 1.3545689574841341e-07,
1990
+ "loss": 0.17592911720275878,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.9088,
1995
+ "grad_norm": 0.31560573280288073,
1996
+ "learning_rate": 1.26535716923927e-07,
1997
+ "loss": 0.18197228908538818,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.912,
2002
+ "grad_norm": 0.3188962184685744,
2003
+ "learning_rate": 1.1791078055307493e-07,
2004
+ "loss": 0.1777464509010315,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.9152,
2009
+ "grad_norm": 0.31575220367713525,
2010
+ "learning_rate": 1.0958316315187289e-07,
2011
+ "loss": 0.17706483602523804,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.9184,
2016
+ "grad_norm": 0.3131837624055497,
2017
+ "learning_rate": 1.0155390412665528e-07,
2018
+ "loss": 0.17496002912521363,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.9216,
2023
+ "grad_norm": 0.32248583567737266,
2024
+ "learning_rate": 9.38240056443443e-08,
2025
+ "loss": 0.17229046821594238,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.9248,
2030
+ "grad_norm": 0.3101253584845484,
2031
+ "learning_rate": 8.639443250736402e-08,
2032
+ "loss": 0.17552309036254882,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.928,
2037
+ "grad_norm": 0.33217431742972764,
2038
+ "learning_rate": 7.926611203321777e-08,
2039
+ "loss": 0.17659810781478882,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.9312,
2044
+ "grad_norm": 0.33918124282098266,
2045
+ "learning_rate": 7.243993393874882e-08,
2046
+ "loss": 0.17737939357757568,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.9344,
2051
+ "grad_norm": 0.31351790893613213,
2052
+ "learning_rate": 6.591675022908805e-08,
2053
+ "loss": 0.1745692253112793,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.9376,
2058
+ "grad_norm": 0.33783778867129854,
2059
+ "learning_rate": 5.969737509131241e-08,
2060
+ "loss": 0.1722058415412903,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.9408,
2065
+ "grad_norm": 0.308776655874055,
2066
+ "learning_rate": 5.3782584792823334e-08,
2067
+ "loss": 0.17710112333297728,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.944,
2072
+ "grad_norm": 0.3142338038371378,
2073
+ "learning_rate": 4.817311758445686e-08,
2074
+ "loss": 0.178252911567688,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.9472,
2079
+ "grad_norm": 0.33048986218580767,
2080
+ "learning_rate": 4.286967360833866e-08,
2081
+ "loss": 0.1782402753829956,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.9504,
2086
+ "grad_norm": 0.3110909627270251,
2087
+ "learning_rate": 3.787291481049754e-08,
2088
+ "loss": 0.17829475402832032,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.9536,
2093
+ "grad_norm": 0.33726065147122686,
2094
+ "learning_rate": 3.3183464858244364e-08,
2095
+ "loss": 0.18406097888946532,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.9568,
2100
+ "grad_norm": 0.3326393750086487,
2101
+ "learning_rate": 2.8801909062328992e-08,
2102
+ "loss": 0.17060396671295167,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.96,
2107
+ "grad_norm": 0.32948960265922206,
2108
+ "learning_rate": 2.4728794303886248e-08,
2109
+ "loss": 0.16899311542510986,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.9632,
2114
+ "grad_norm": 0.33211982053439487,
2115
+ "learning_rate": 2.0964628966175794e-08,
2116
+ "loss": 0.17517964839935302,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.9664,
2121
+ "grad_norm": 0.30613498697830943,
2122
+ "learning_rate": 1.750988287113009e-08,
2123
+ "loss": 0.17458994388580323,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.9696,
2128
+ "grad_norm": 0.3027770955918648,
2129
+ "learning_rate": 1.4364987220713278e-08,
2130
+ "loss": 0.18178436756134034,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.9728,
2135
+ "grad_norm": 0.3292318037983906,
2136
+ "learning_rate": 1.1530334543099763e-08,
2137
+ "loss": 0.1790144681930542,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.976,
2142
+ "grad_norm": 0.33300755292787143,
2143
+ "learning_rate": 9.006278643683697e-09,
2144
+ "loss": 0.1808505654335022,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.9792,
2149
+ "grad_norm": 0.32631723332989787,
2150
+ "learning_rate": 6.793134560916514e-09,
2151
+ "loss": 0.18275127410888672,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.9824,
2156
+ "grad_norm": 0.3082787331662993,
2157
+ "learning_rate": 4.891178526986451e-09,
2158
+ "loss": 0.1783647656440735,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.9856,
2163
+ "grad_norm": 0.32341550392390483,
2164
+ "learning_rate": 3.3006479333413943e-09,
2165
+ "loss": 0.18126009702682494,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.9888,
2170
+ "grad_norm": 0.30931371888762194,
2171
+ "learning_rate": 2.021741301058422e-09,
2172
+ "loss": 0.17681236267089845,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.992,
2177
+ "grad_norm": 0.3419672311636941,
2178
+ "learning_rate": 1.0546182560652872e-09,
2179
+ "loss": 0.17989683151245117,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.9952,
2184
+ "grad_norm": 0.3111951639834393,
2185
+ "learning_rate": 3.9939950921774607e-10,
2186
+ "loss": 0.17482796907424927,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.9984,
2191
+ "grad_norm": 0.3230413672933209,
2192
+ "learning_rate": 5.616684123160854e-11,
2193
+ "loss": 0.17436976432800294,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 1.0,
2198
+ "step": 3125,
2199
+ "total_flos": 1955525886476288.0,
2200
+ "train_loss": 0.1948647116279602,
2201
+ "train_runtime": 28413.61,
2202
+ "train_samples_per_second": 7.039,
2203
+ "train_steps_per_second": 0.11
2204
+ }
2205
+ ],
2206
+ "logging_steps": 10,
2207
+ "max_steps": 3125,
2208
+ "num_input_tokens_seen": 0,
2209
+ "num_train_epochs": 1,
2210
+ "save_steps": 500,
2211
+ "stateful_callbacks": {
2212
+ "TrainerControl": {
2213
+ "args": {
2214
+ "should_epoch_stop": false,
2215
+ "should_evaluate": false,
2216
+ "should_log": false,
2217
+ "should_save": true,
2218
+ "should_training_stop": true
2219
+ },
2220
+ "attributes": {}
2221
+ }
2222
+ },
2223
+ "total_flos": 1955525886476288.0,
2224
+ "train_batch_size": 4,
2225
+ "trial_name": null,
2226
+ "trial_params": null
2227
+ }
checkpoints/InternVL3.5-8B-SFT/training_loss.png ADDED
checkpoints/Qwen3-VL-2B-SFT/all_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 1201860236279808.0,
4
+ "train_loss": 0.2128027264213562,
5
+ "train_runtime": 15463.9635,
6
+ "train_samples_per_second": 12.933,
7
+ "train_steps_per_second": 0.202
8
+ }
checkpoints/Qwen3-VL-2B-SFT/chat_template.jinja ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].role == 'system' %}
4
+ {%- if messages[0].content is string %}
5
+ {{- messages[0].content }}
6
+ {%- else %}
7
+ {%- for content in messages[0].content %}
8
+ {%- if 'text' in content %}
9
+ {{- content.text }}
10
+ {%- endif %}
11
+ {%- endfor %}
12
+ {%- endif %}
13
+ {{- '\n\n' }}
14
+ {%- endif %}
15
+ {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
16
+ {%- for tool in tools %}
17
+ {{- "\n" }}
18
+ {{- tool | tojson }}
19
+ {%- endfor %}
20
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
21
+ {%- else %}
22
+ {%- if messages[0].role == 'system' %}
23
+ {{- '<|im_start|>system\n' }}
24
+ {%- if messages[0].content is string %}
25
+ {{- messages[0].content }}
26
+ {%- else %}
27
+ {%- for content in messages[0].content %}
28
+ {%- if 'text' in content %}
29
+ {{- content.text }}
30
+ {%- endif %}
31
+ {%- endfor %}
32
+ {%- endif %}
33
+ {{- '<|im_end|>\n' }}
34
+ {%- endif %}
35
+ {%- endif %}
36
+ {%- set image_count = namespace(value=0) %}
37
+ {%- set video_count = namespace(value=0) %}
38
+ {%- for message in messages %}
39
+ {%- if message.role == "user" %}
40
+ {{- '<|im_start|>' + message.role + '\n' }}
41
+ {%- if message.content is string %}
42
+ {{- message.content }}
43
+ {%- else %}
44
+ {%- for content in message.content %}
45
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
46
+ {%- set image_count.value = image_count.value + 1 %}
47
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
48
+ <|vision_start|><|image_pad|><|vision_end|>
49
+ {%- elif content.type == 'video' or 'video' in content %}
50
+ {%- set video_count.value = video_count.value + 1 %}
51
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
52
+ <|vision_start|><|video_pad|><|vision_end|>
53
+ {%- elif 'text' in content %}
54
+ {{- content.text }}
55
+ {%- endif %}
56
+ {%- endfor %}
57
+ {%- endif %}
58
+ {{- '<|im_end|>\n' }}
59
+ {%- elif message.role == "assistant" %}
60
+ {{- '<|im_start|>' + message.role + '\n' }}
61
+ {%- if message.content is string %}
62
+ {{- message.content }}
63
+ {%- else %}
64
+ {%- for content_item in message.content %}
65
+ {%- if 'text' in content_item %}
66
+ {{- content_item.text }}
67
+ {%- endif %}
68
+ {%- endfor %}
69
+ {%- endif %}
70
+ {%- if message.tool_calls %}
71
+ {%- for tool_call in message.tool_calls %}
72
+ {%- if (loop.first and message.content) or (not loop.first) %}
73
+ {{- '\n' }}
74
+ {%- endif %}
75
+ {%- if tool_call.function %}
76
+ {%- set tool_call = tool_call.function %}
77
+ {%- endif %}
78
+ {{- '<tool_call>\n{"name": "' }}
79
+ {{- tool_call.name }}
80
+ {{- '", "arguments": ' }}
81
+ {%- if tool_call.arguments is string %}
82
+ {{- tool_call.arguments }}
83
+ {%- else %}
84
+ {{- tool_call.arguments | tojson }}
85
+ {%- endif %}
86
+ {{- '}\n</tool_call>' }}
87
+ {%- endfor %}
88
+ {%- endif %}
89
+ {{- '<|im_end|>\n' }}
90
+ {%- elif message.role == "tool" %}
91
+ {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
92
+ {{- '<|im_start|>user' }}
93
+ {%- endif %}
94
+ {{- '\n<tool_response>\n' }}
95
+ {%- if message.content is string %}
96
+ {{- message.content }}
97
+ {%- else %}
98
+ {%- for content in message.content %}
99
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content %}
100
+ {%- set image_count.value = image_count.value + 1 %}
101
+ {%- if add_vision_id %}Picture {{ image_count.value }}: {% endif -%}
102
+ <|vision_start|><|image_pad|><|vision_end|>
103
+ {%- elif content.type == 'video' or 'video' in content %}
104
+ {%- set video_count.value = video_count.value + 1 %}
105
+ {%- if add_vision_id %}Video {{ video_count.value }}: {% endif -%}
106
+ <|vision_start|><|video_pad|><|vision_end|>
107
+ {%- elif 'text' in content %}
108
+ {{- content.text }}
109
+ {%- endif %}
110
+ {%- endfor %}
111
+ {%- endif %}
112
+ {{- '\n</tool_response>' }}
113
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
114
+ {{- '<|im_end|>\n' }}
115
+ {%- endif %}
116
+ {%- endif %}
117
+ {%- endfor %}
118
+ {%- if add_generation_prompt %}
119
+ {{- '<|im_start|>assistant\n' }}
120
+ {%- endif %}
checkpoints/Qwen3-VL-2B-SFT/config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen3VLForConditionalGeneration"
4
+ ],
5
+ "bos_token_id": null,
6
+ "dtype": "bfloat16",
7
+ "eos_token_id": 151645,
8
+ "hidden_size": 2048,
9
+ "image_token_id": 151655,
10
+ "model_type": "qwen3_vl",
11
+ "pad_token_id": 151643,
12
+ "text_config": {
13
+ "attention_bias": false,
14
+ "attention_dropout": 0.0,
15
+ "bos_token_id": 151643,
16
+ "dtype": "bfloat16",
17
+ "eos_token_id": 151645,
18
+ "head_dim": 128,
19
+ "hidden_act": "silu",
20
+ "hidden_size": 2048,
21
+ "initializer_range": 0.02,
22
+ "intermediate_size": 6144,
23
+ "max_position_embeddings": 262144,
24
+ "model_type": "qwen3_vl_text",
25
+ "num_attention_heads": 16,
26
+ "num_hidden_layers": 28,
27
+ "num_key_value_heads": 8,
28
+ "pad_token_id": null,
29
+ "rms_norm_eps": 1e-06,
30
+ "rope_parameters": {
31
+ "mrope_interleaved": true,
32
+ "mrope_section": [
33
+ 24,
34
+ 20,
35
+ 20
36
+ ],
37
+ "rope_theta": 5000000,
38
+ "rope_type": "default"
39
+ },
40
+ "tie_word_embeddings": true,
41
+ "use_cache": false,
42
+ "vocab_size": 151936
43
+ },
44
+ "tie_word_embeddings": true,
45
+ "transformers_version": "5.5.3",
46
+ "use_cache": false,
47
+ "video_token_id": 151656,
48
+ "vision_config": {
49
+ "deepstack_visual_indexes": [
50
+ 5,
51
+ 11,
52
+ 17
53
+ ],
54
+ "depth": 24,
55
+ "dtype": "bfloat16",
56
+ "hidden_act": "gelu_pytorch_tanh",
57
+ "hidden_size": 1024,
58
+ "in_channels": 3,
59
+ "initializer_range": 0.02,
60
+ "intermediate_size": 4096,
61
+ "model_type": "qwen3_vl",
62
+ "num_heads": 16,
63
+ "num_position_embeddings": 2304,
64
+ "out_hidden_size": 2048,
65
+ "patch_size": 16,
66
+ "spatial_merge_size": 2,
67
+ "temporal_patch_size": 2
68
+ },
69
+ "vision_end_token_id": 151653,
70
+ "vision_start_token_id": 151652
71
+ }
checkpoints/Qwen3-VL-2B-SFT/eval_results_job_qwen3vl_2b_qwen3_vl_2b_20260430_002232.json ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mae_dx": 0.1790637931034483,
3
+ "rmse_dx": 0.5420201834898262,
4
+ "mae_dy": 0.16369482758620688,
5
+ "rmse_dy": 0.4466289248937244,
6
+ "mae_dz": 0.017312068965517242,
7
+ "rmse_dz": 0.11626166978875267,
8
+ "mae_dpitch": 0.35010344827586204,
9
+ "rmse_dpitch": 0.7528209935143029,
10
+ "mae_dyaw": 1.3350413793103448,
11
+ "rmse_dyaw": 2.917615133797725,
12
+ "mae_droll": 0.0,
13
+ "rmse_droll": 0.0,
14
+ "mae_overall": 0.3408692528735632,
15
+ "mae_position": 0.12002356321839079,
16
+ "mae_rotation": 0.5617149425287357,
17
+ "rmse_overall": 1.263988236295834,
18
+ "wp1_euc_mae": 0.08209225193702147,
19
+ "wp1_euc_median": 0.022360679774997918,
20
+ "wp2_euc_mae": 0.1592417265186995,
21
+ "wp2_euc_median": 0.058309518948453015,
22
+ "wp3_euc_mae": 0.2573809864066125,
23
+ "wp3_euc_median": 0.10049875621120885,
24
+ "wp4_euc_mae": 0.3827385749455985,
25
+ "wp4_euc_median": 0.1529705854077837,
26
+ "wp5_euc_mae": 0.5205297307973544,
27
+ "wp5_euc_median": 0.2197724920005007,
28
+ "euclidean_mae": 0.2803966541210573,
29
+ "ADE": 0.2803966541210573,
30
+ "FDE": 0.5205297307973544,
31
+ "ADE_median": 0.11621310610654717,
32
+ "FDE_median": 0.2197724920005007,
33
+ "SR@0.5m": 0.8663793103448276,
34
+ "SR@1.0m": 0.9436206896551724,
35
+ "SR@2.0m": 0.98,
36
+ "SR@5.0m": 0.9955172413793103,
37
+ "TrajSR@1.0m": 0.8732758620689656,
38
+ "TrajSR@2.0m": 0.9517241379310345,
39
+ "TrajSR@5.0m": 0.9887931034482759,
40
+ "RotAcc@1.0deg": 0.608103448275862,
41
+ "RotAcc@5.0deg": 0.9405172413793104,
42
+ "RotAcc@10.0deg": 0.9853448275862069,
43
+ "wp1_rot_mae": 0.7521227362111129,
44
+ "wp2_rot_mae": 1.0375870328696497,
45
+ "wp3_rot_mae": 1.394596726377232,
46
+ "wp4_rot_mae": 1.8037210071006413,
47
+ "wp5_rot_mae": 2.2606874444077585,
48
+ "rotation_euc_mae": 1.4497429893932787,
49
+ "parse_failure_rate": 0.0,
50
+ "parse_success_rate": 1.0,
51
+ "valid_samples": 1160,
52
+ "total_samples": 1160,
53
+ "parse_failures": 0,
54
+ "inference_engine": "vllm",
55
+ "vllm_version": "0.19.0"
56
+ }
checkpoints/Qwen3-VL-2B-SFT/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "do_sample": true,
3
+ "eos_token_id": [
4
+ 151645,
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.0,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "5.5.3"
14
+ }
checkpoints/Qwen3-VL-2B-SFT/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f00f124b4c3943edcd2411fc8bac2af6a9fd0b4b769e7a55996571189440f85
3
+ size 4255140312
checkpoints/Qwen3-VL-2B-SFT/processor_config.json ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "image_processor": {
3
+ "do_convert_rgb": true,
4
+ "do_normalize": true,
5
+ "do_rescale": true,
6
+ "do_resize": true,
7
+ "image_mean": [
8
+ 0.5,
9
+ 0.5,
10
+ 0.5
11
+ ],
12
+ "image_processor_type": "Qwen2VLImageProcessor",
13
+ "image_std": [
14
+ 0.5,
15
+ 0.5,
16
+ 0.5
17
+ ],
18
+ "merge_size": 2,
19
+ "patch_size": 16,
20
+ "resample": 3,
21
+ "rescale_factor": 0.00392156862745098,
22
+ "size": {
23
+ "longest_edge": 16777216,
24
+ "shortest_edge": 65536
25
+ },
26
+ "temporal_patch_size": 2
27
+ },
28
+ "processor_class": "Qwen3VLProcessor",
29
+ "video_processor": {
30
+ "do_convert_rgb": true,
31
+ "do_normalize": true,
32
+ "do_rescale": true,
33
+ "do_resize": true,
34
+ "do_sample_frames": true,
35
+ "fps": 2,
36
+ "image_mean": [
37
+ 0.5,
38
+ 0.5,
39
+ 0.5
40
+ ],
41
+ "image_std": [
42
+ 0.5,
43
+ 0.5,
44
+ 0.5
45
+ ],
46
+ "max_frames": 768,
47
+ "merge_size": 2,
48
+ "min_frames": 4,
49
+ "patch_size": 16,
50
+ "resample": 3,
51
+ "rescale_factor": 0.00392156862745098,
52
+ "return_metadata": false,
53
+ "size": {
54
+ "longest_edge": 25165824,
55
+ "shortest_edge": 4096
56
+ },
57
+ "temporal_patch_size": 2,
58
+ "video_processor_type": "Qwen3VLVideoProcessor"
59
+ }
60
+ }
checkpoints/Qwen3-VL-2B-SFT/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
3
+ size 11422650
checkpoints/Qwen3-VL-2B-SFT/tokenizer_config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": null,
5
+ "clean_up_tokenization_spaces": false,
6
+ "eos_token": "<|im_end|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|im_start|>",
10
+ "<|im_end|>",
11
+ "<|object_ref_start|>",
12
+ "<|object_ref_end|>",
13
+ "<|box_start|>",
14
+ "<|box_end|>",
15
+ "<|quad_start|>",
16
+ "<|quad_end|>",
17
+ "<|vision_start|>",
18
+ "<|vision_end|>",
19
+ "<|vision_pad|>",
20
+ "<|image_pad|>",
21
+ "<|video_pad|>"
22
+ ],
23
+ "is_local": true,
24
+ "model_max_length": 262144,
25
+ "pad_token": "<|endoftext|>",
26
+ "padding_side": "right",
27
+ "processor_class": "Qwen3VLProcessor",
28
+ "split_special_tokens": false,
29
+ "tokenizer_class": "Qwen2Tokenizer",
30
+ "unk_token": null
31
+ }
checkpoints/Qwen3-VL-2B-SFT/train_results.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "epoch": 1.0,
3
+ "total_flos": 1201860236279808.0,
4
+ "train_loss": 0.2128027264213562,
5
+ "train_runtime": 15463.9635,
6
+ "train_samples_per_second": 12.933,
7
+ "train_steps_per_second": 0.202
8
+ }
checkpoints/Qwen3-VL-2B-SFT/trainer_state.json ADDED
@@ -0,0 +1,2227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 1.0,
6
+ "eval_steps": 500,
7
+ "global_step": 3125,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0032,
14
+ "grad_norm": 16.167704755253098,
15
+ "learning_rate": 1.437699680511182e-07,
16
+ "loss": 0.6528051853179931,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.0064,
21
+ "grad_norm": 15.890120546753822,
22
+ "learning_rate": 3.0351437699680514e-07,
23
+ "loss": 0.6462714195251464,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.0096,
28
+ "grad_norm": 14.94996510180698,
29
+ "learning_rate": 4.6325878594249205e-07,
30
+ "loss": 0.6038930416107178,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.0128,
35
+ "grad_norm": 7.595956825837255,
36
+ "learning_rate": 6.230031948881789e-07,
37
+ "loss": 0.49077792167663575,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.016,
42
+ "grad_norm": 3.026643067758099,
43
+ "learning_rate": 7.82747603833866e-07,
44
+ "loss": 0.3725566864013672,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.0192,
49
+ "grad_norm": 1.45050871801394,
50
+ "learning_rate": 9.424920127795528e-07,
51
+ "loss": 0.3130798816680908,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.0224,
56
+ "grad_norm": 0.7098603642718405,
57
+ "learning_rate": 1.1022364217252397e-06,
58
+ "loss": 0.29621334075927735,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.0256,
63
+ "grad_norm": 0.6027577608327673,
64
+ "learning_rate": 1.2619808306709266e-06,
65
+ "loss": 0.27455599308013917,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.0288,
70
+ "grad_norm": 0.6521596145147045,
71
+ "learning_rate": 1.4217252396166134e-06,
72
+ "loss": 0.2667043447494507,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.032,
77
+ "grad_norm": 0.5069890685833461,
78
+ "learning_rate": 1.5814696485623005e-06,
79
+ "loss": 0.26807360649108886,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 0.0352,
84
+ "grad_norm": 0.5470393023746721,
85
+ "learning_rate": 1.7412140575079875e-06,
86
+ "loss": 0.26680865287780764,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 0.0384,
91
+ "grad_norm": 0.5543553869620175,
92
+ "learning_rate": 1.9009584664536742e-06,
93
+ "loss": 0.25434055328369143,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 0.0416,
98
+ "grad_norm": 0.5420531484574165,
99
+ "learning_rate": 2.060702875399361e-06,
100
+ "loss": 0.25767529010772705,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 0.0448,
105
+ "grad_norm": 0.645702037816744,
106
+ "learning_rate": 2.220447284345048e-06,
107
+ "loss": 0.24863953590393068,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 0.048,
112
+ "grad_norm": 0.6143136416629473,
113
+ "learning_rate": 2.380191693290735e-06,
114
+ "loss": 0.24553947448730468,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 0.0512,
119
+ "grad_norm": 0.5094817219127052,
120
+ "learning_rate": 2.539936102236422e-06,
121
+ "loss": 0.2415369987487793,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 0.0544,
126
+ "grad_norm": 0.6291606522275387,
127
+ "learning_rate": 2.699680511182109e-06,
128
+ "loss": 0.24887418746948242,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 0.0576,
133
+ "grad_norm": 0.6248895072998087,
134
+ "learning_rate": 2.8594249201277955e-06,
135
+ "loss": 0.2414403438568115,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 0.0608,
140
+ "grad_norm": 0.6640745861299296,
141
+ "learning_rate": 3.0191693290734825e-06,
142
+ "loss": 0.24553894996643066,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 0.064,
147
+ "grad_norm": 0.6136916428260776,
148
+ "learning_rate": 3.17891373801917e-06,
149
+ "loss": 0.24655485153198242,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 0.0672,
154
+ "grad_norm": 0.6572881584027297,
155
+ "learning_rate": 3.3386581469648564e-06,
156
+ "loss": 0.2433255910873413,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 0.0704,
161
+ "grad_norm": 0.6365580690264084,
162
+ "learning_rate": 3.4984025559105434e-06,
163
+ "loss": 0.23687341213226318,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 0.0736,
168
+ "grad_norm": 0.6771736107097397,
169
+ "learning_rate": 3.6581469648562303e-06,
170
+ "loss": 0.23829469680786133,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 0.0768,
175
+ "grad_norm": 0.6990706788858505,
176
+ "learning_rate": 3.817891373801918e-06,
177
+ "loss": 0.23471264839172362,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 0.08,
182
+ "grad_norm": 0.6029376877872676,
183
+ "learning_rate": 3.977635782747604e-06,
184
+ "loss": 0.23215394020080565,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 0.0832,
189
+ "grad_norm": 0.6082124769869354,
190
+ "learning_rate": 4.137380191693291e-06,
191
+ "loss": 0.2326298713684082,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 0.0864,
196
+ "grad_norm": 0.7069824323872274,
197
+ "learning_rate": 4.297124600638978e-06,
198
+ "loss": 0.23525137901306153,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 0.0896,
203
+ "grad_norm": 0.6697633994539672,
204
+ "learning_rate": 4.456869009584665e-06,
205
+ "loss": 0.23122966289520264,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 0.0928,
210
+ "grad_norm": 0.5896144959913211,
211
+ "learning_rate": 4.616613418530352e-06,
212
+ "loss": 0.2369994878768921,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 0.096,
217
+ "grad_norm": 0.6202443536122002,
218
+ "learning_rate": 4.776357827476039e-06,
219
+ "loss": 0.23878774642944336,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 0.0992,
224
+ "grad_norm": 0.654740818437731,
225
+ "learning_rate": 4.936102236421725e-06,
226
+ "loss": 0.22523627281188965,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 0.1024,
231
+ "grad_norm": 0.5332231058888761,
232
+ "learning_rate": 4.999943833158769e-06,
233
+ "loss": 0.22634780406951904,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 0.1056,
238
+ "grad_norm": 0.5353007164619794,
239
+ "learning_rate": 4.999600600490783e-06,
240
+ "loss": 0.23276047706604003,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 0.1088,
245
+ "grad_norm": 0.53617134295571,
246
+ "learning_rate": 4.9989453817439345e-06,
247
+ "loss": 0.22672569751739502,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 0.112,
252
+ "grad_norm": 0.5149149938648103,
253
+ "learning_rate": 4.997978258698942e-06,
254
+ "loss": 0.22631363868713378,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 0.1152,
259
+ "grad_norm": 0.5959881018141326,
260
+ "learning_rate": 4.996699352066659e-06,
261
+ "loss": 0.22707018852233887,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 0.1184,
266
+ "grad_norm": 0.6648028246958526,
267
+ "learning_rate": 4.995108821473014e-06,
268
+ "loss": 0.22777373790740968,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 0.1216,
273
+ "grad_norm": 0.6395047869916185,
274
+ "learning_rate": 4.993206865439084e-06,
275
+ "loss": 0.22382116317749023,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 0.1248,
280
+ "grad_norm": 0.6449783716947614,
281
+ "learning_rate": 4.990993721356317e-06,
282
+ "loss": 0.22268824577331542,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 0.128,
287
+ "grad_norm": 0.6709421623745665,
288
+ "learning_rate": 4.988469665456901e-06,
289
+ "loss": 0.22317943572998047,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 0.1312,
294
+ "grad_norm": 0.5466948727484514,
295
+ "learning_rate": 4.985635012779288e-06,
296
+ "loss": 0.23101482391357422,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 0.1344,
301
+ "grad_norm": 0.48989327197226856,
302
+ "learning_rate": 4.98249011712887e-06,
303
+ "loss": 0.2234072208404541,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 0.1376,
308
+ "grad_norm": 0.5417400145938138,
309
+ "learning_rate": 4.979035371033824e-06,
310
+ "loss": 0.22212049961090088,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 0.1408,
315
+ "grad_norm": 0.5576422767413268,
316
+ "learning_rate": 4.975271205696115e-06,
317
+ "loss": 0.22228083610534669,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 0.144,
322
+ "grad_norm": 0.6175584799790863,
323
+ "learning_rate": 4.971198090937671e-06,
324
+ "loss": 0.21532373428344725,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 0.1472,
329
+ "grad_norm": 0.6360712146764758,
330
+ "learning_rate": 4.966816535141756e-06,
331
+ "loss": 0.21311187744140625,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 0.1504,
336
+ "grad_norm": 0.5401953881204377,
337
+ "learning_rate": 4.9621270851895035e-06,
338
+ "loss": 0.22237277030944824,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 0.1536,
343
+ "grad_norm": 0.5988873649948656,
344
+ "learning_rate": 4.957130326391662e-06,
345
+ "loss": 0.22391064167022706,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 0.1568,
350
+ "grad_norm": 0.5132670412160366,
351
+ "learning_rate": 4.951826882415544e-06,
352
+ "loss": 0.2206397533416748,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 0.16,
357
+ "grad_norm": 0.5935020011592513,
358
+ "learning_rate": 4.946217415207177e-06,
359
+ "loss": 0.2148068904876709,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 0.1632,
364
+ "grad_norm": 0.5324390349507315,
365
+ "learning_rate": 4.940302624908689e-06,
366
+ "loss": 0.21909193992614745,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 0.1664,
371
+ "grad_norm": 0.6082929578051663,
372
+ "learning_rate": 4.934083249770912e-06,
373
+ "loss": 0.2133782386779785,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 0.1696,
378
+ "grad_norm": 0.6272295187969801,
379
+ "learning_rate": 4.927560066061251e-06,
380
+ "loss": 0.2180723190307617,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 0.1728,
385
+ "grad_norm": 0.5538741111929965,
386
+ "learning_rate": 4.920733887966783e-06,
387
+ "loss": 0.22759020328521729,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 0.176,
392
+ "grad_norm": 0.5703593568416581,
393
+ "learning_rate": 4.913605567492636e-06,
394
+ "loss": 0.21657073497772217,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 0.1792,
399
+ "grad_norm": 0.5873043850881617,
400
+ "learning_rate": 4.906175994355656e-06,
401
+ "loss": 0.21824207305908203,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 0.1824,
406
+ "grad_norm": 0.7955355117519857,
407
+ "learning_rate": 4.898446095873345e-06,
408
+ "loss": 0.2209712028503418,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 0.1856,
413
+ "grad_norm": 0.5347403539894492,
414
+ "learning_rate": 4.890416836848128e-06,
415
+ "loss": 0.2184591293334961,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 0.1888,
420
+ "grad_norm": 0.5464598874722423,
421
+ "learning_rate": 4.882089219446925e-06,
422
+ "loss": 0.2130581855773926,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 0.192,
427
+ "grad_norm": 0.5871382794412585,
428
+ "learning_rate": 4.873464283076074e-06,
429
+ "loss": 0.21770844459533692,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 0.1952,
434
+ "grad_norm": 0.5516595084585112,
435
+ "learning_rate": 4.864543104251587e-06,
436
+ "loss": 0.21629047393798828,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 0.1984,
441
+ "grad_norm": 0.5949100146178041,
442
+ "learning_rate": 4.855326796464798e-06,
443
+ "loss": 0.22033746242523194,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 0.2016,
448
+ "grad_norm": 0.5798876425998256,
449
+ "learning_rate": 4.8458165100433725e-06,
450
+ "loss": 0.21477458477020264,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 0.2048,
455
+ "grad_norm": 0.563545251458103,
456
+ "learning_rate": 4.836013432007738e-06,
457
+ "loss": 0.21490144729614258,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 0.208,
462
+ "grad_norm": 0.5256728978801903,
463
+ "learning_rate": 4.825918785922921e-06,
464
+ "loss": 0.21858677864074708,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 0.2112,
469
+ "grad_norm": 0.5062609806869888,
470
+ "learning_rate": 4.8155338317458315e-06,
471
+ "loss": 0.21592459678649903,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 0.2144,
476
+ "grad_norm": 0.555318042395406,
477
+ "learning_rate": 4.804859865668002e-06,
478
+ "loss": 0.21323423385620116,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 0.2176,
483
+ "grad_norm": 0.6382467151310525,
484
+ "learning_rate": 4.793898219953804e-06,
485
+ "loss": 0.2151188373565674,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 0.2208,
490
+ "grad_norm": 0.5426280956852546,
491
+ "learning_rate": 4.782650262774164e-06,
492
+ "loss": 0.2155141830444336,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 0.224,
497
+ "grad_norm": 0.5602841392771764,
498
+ "learning_rate": 4.7711173980357886e-06,
499
+ "loss": 0.21242978572845458,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 0.2272,
504
+ "grad_norm": 0.5837827171492797,
505
+ "learning_rate": 4.759301065205947e-06,
506
+ "loss": 0.2129213333129883,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 0.2304,
511
+ "grad_norm": 0.5678516858648391,
512
+ "learning_rate": 4.7472027391328e-06,
513
+ "loss": 0.21422340869903564,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 0.2336,
518
+ "grad_norm": 0.6213695156464779,
519
+ "learning_rate": 4.734823929861317e-06,
520
+ "loss": 0.2172607660293579,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 0.2368,
525
+ "grad_norm": 0.6084105321286742,
526
+ "learning_rate": 4.722166182444801e-06,
527
+ "loss": 0.21331138610839845,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 0.24,
532
+ "grad_norm": 0.5848312022835148,
533
+ "learning_rate": 4.709231076752045e-06,
534
+ "loss": 0.21255254745483398,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 0.2432,
539
+ "grad_norm": 0.5855428740644943,
540
+ "learning_rate": 4.696020227270142e-06,
541
+ "loss": 0.21734881401062012,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 0.2464,
546
+ "grad_norm": 0.5135013968609298,
547
+ "learning_rate": 4.6825352829029705e-06,
548
+ "loss": 0.21321442127227783,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 0.2496,
553
+ "grad_norm": 0.5938685951597557,
554
+ "learning_rate": 4.668777926765392e-06,
555
+ "loss": 0.21113758087158202,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 0.2528,
560
+ "grad_norm": 0.6490004462160337,
561
+ "learning_rate": 4.6547498759731725e-06,
562
+ "loss": 0.20692987442016603,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 0.256,
567
+ "grad_norm": 0.5694207965471786,
568
+ "learning_rate": 4.6404528814286575e-06,
569
+ "loss": 0.20959222316741943,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 0.2592,
574
+ "grad_norm": 0.5648942925010132,
575
+ "learning_rate": 4.6258887276022425e-06,
576
+ "loss": 0.21758944988250734,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 0.2624,
581
+ "grad_norm": 0.6544068998265237,
582
+ "learning_rate": 4.611059232309639e-06,
583
+ "loss": 0.21146907806396484,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 0.2656,
588
+ "grad_norm": 0.6680185905090128,
589
+ "learning_rate": 4.595966246484986e-06,
590
+ "loss": 0.21348462104797364,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 0.2688,
595
+ "grad_norm": 0.4956164506371995,
596
+ "learning_rate": 4.580611653949829e-06,
597
+ "loss": 0.21317172050476074,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 0.272,
602
+ "grad_norm": 0.6491508776235345,
603
+ "learning_rate": 4.564997371177992e-06,
604
+ "loss": 0.2108323574066162,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 0.2752,
609
+ "grad_norm": 0.6859739128419746,
610
+ "learning_rate": 4.54912534705637e-06,
611
+ "loss": 0.21068863868713378,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 0.2784,
616
+ "grad_norm": 0.5876140035889241,
617
+ "learning_rate": 4.532997562641683e-06,
618
+ "loss": 0.20738301277160645,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 0.2816,
623
+ "grad_norm": 0.5388630641864397,
624
+ "learning_rate": 4.516616030913214e-06,
625
+ "loss": 0.2113194465637207,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 0.2848,
630
+ "grad_norm": 0.527263546069221,
631
+ "learning_rate": 4.499982796521556e-06,
632
+ "loss": 0.20718231201171874,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 0.288,
637
+ "grad_norm": 0.6778383199902553,
638
+ "learning_rate": 4.48309993553341e-06,
639
+ "loss": 0.20899975299835205,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 0.2912,
644
+ "grad_norm": 0.6041502046582736,
645
+ "learning_rate": 4.465969555172468e-06,
646
+ "loss": 0.20922982692718506,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 0.2944,
651
+ "grad_norm": 0.5872507915529911,
652
+ "learning_rate": 4.448593793556391e-06,
653
+ "loss": 0.21518073081970215,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 0.2976,
658
+ "grad_norm": 0.5414243473578003,
659
+ "learning_rate": 4.430974819429954e-06,
660
+ "loss": 0.20869126319885253,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 0.3008,
665
+ "grad_norm": 0.4624854855413159,
666
+ "learning_rate": 4.413114831894344e-06,
667
+ "loss": 0.20277881622314453,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 0.304,
672
+ "grad_norm": 0.5247854876993729,
673
+ "learning_rate": 4.3950160601326865e-06,
674
+ "loss": 0.20181698799133302,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 0.3072,
679
+ "grad_norm": 0.5808078368512252,
680
+ "learning_rate": 4.376680763131811e-06,
681
+ "loss": 0.20898809432983398,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 0.3104,
686
+ "grad_norm": 0.5805212694083882,
687
+ "learning_rate": 4.358111229400296e-06,
688
+ "loss": 0.21212198734283447,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 0.3136,
693
+ "grad_norm": 0.5721764020420262,
694
+ "learning_rate": 4.33930977668283e-06,
695
+ "loss": 0.21448736190795897,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 0.3168,
700
+ "grad_norm": 0.5598008397585128,
701
+ "learning_rate": 4.320278751670922e-06,
702
+ "loss": 0.20758256912231446,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 0.32,
707
+ "grad_norm": 0.5522723710696453,
708
+ "learning_rate": 4.301020529710009e-06,
709
+ "loss": 0.20947573184967042,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 0.3232,
714
+ "grad_norm": 0.5556932215476815,
715
+ "learning_rate": 4.281537514502962e-06,
716
+ "loss": 0.2131945848464966,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 0.3264,
721
+ "grad_norm": 0.5256326530235461,
722
+ "learning_rate": 4.261832137810093e-06,
723
+ "loss": 0.20962438583374024,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 0.3296,
728
+ "grad_norm": 0.5141067804644184,
729
+ "learning_rate": 4.241906859145611e-06,
730
+ "loss": 0.21035046577453614,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 0.3328,
735
+ "grad_norm": 0.509376911595103,
736
+ "learning_rate": 4.221764165470661e-06,
737
+ "loss": 0.20757730007171632,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 0.336,
742
+ "grad_norm": 0.5632632187185198,
743
+ "learning_rate": 4.201406570882898e-06,
744
+ "loss": 0.20691304206848143,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 0.3392,
749
+ "grad_norm": 0.5786515758035645,
750
+ "learning_rate": 4.180836616302704e-06,
751
+ "loss": 0.20582923889160157,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 0.3424,
756
+ "grad_norm": 0.591108109764431,
757
+ "learning_rate": 4.160056869156041e-06,
758
+ "loss": 0.2102893590927124,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 0.3456,
763
+ "grad_norm": 0.5367428274966828,
764
+ "learning_rate": 4.139069923053995e-06,
765
+ "loss": 0.20834057331085204,
766
+ "step": 1080
767
+ },
768
+ {
769
+ "epoch": 0.3488,
770
+ "grad_norm": 0.49962583382458753,
771
+ "learning_rate": 4.117878397469062e-06,
772
+ "loss": 0.2114588975906372,
773
+ "step": 1090
774
+ },
775
+ {
776
+ "epoch": 0.352,
777
+ "grad_norm": 0.5580828852277292,
778
+ "learning_rate": 4.096484937408195e-06,
779
+ "loss": 0.2029412269592285,
780
+ "step": 1100
781
+ },
782
+ {
783
+ "epoch": 0.3552,
784
+ "grad_norm": 0.5671943339841842,
785
+ "learning_rate": 4.074892213082676e-06,
786
+ "loss": 0.20308828353881836,
787
+ "step": 1110
788
+ },
789
+ {
790
+ "epoch": 0.3584,
791
+ "grad_norm": 0.5583868175031171,
792
+ "learning_rate": 4.0531029195748265e-06,
793
+ "loss": 0.2104210376739502,
794
+ "step": 1120
795
+ },
796
+ {
797
+ "epoch": 0.3616,
798
+ "grad_norm": 0.5452939479895703,
799
+ "learning_rate": 4.03111977650163e-06,
800
+ "loss": 0.20968456268310548,
801
+ "step": 1130
802
+ },
803
+ {
804
+ "epoch": 0.3648,
805
+ "grad_norm": 0.6195183591357212,
806
+ "learning_rate": 4.008945527675281e-06,
807
+ "loss": 0.20957679748535157,
808
+ "step": 1140
809
+ },
810
+ {
811
+ "epoch": 0.368,
812
+ "grad_norm": 0.6171258889408775,
813
+ "learning_rate": 3.986582940760717e-06,
814
+ "loss": 0.1984492540359497,
815
+ "step": 1150
816
+ },
817
+ {
818
+ "epoch": 0.3712,
819
+ "grad_norm": 0.6164010362674036,
820
+ "learning_rate": 3.9640348069301785e-06,
821
+ "loss": 0.20632429122924806,
822
+ "step": 1160
823
+ },
824
+ {
825
+ "epoch": 0.3744,
826
+ "grad_norm": 0.5558070727772452,
827
+ "learning_rate": 3.941303940514826e-06,
828
+ "loss": 0.20776019096374512,
829
+ "step": 1170
830
+ },
831
+ {
832
+ "epoch": 0.3776,
833
+ "grad_norm": 0.5943916453083408,
834
+ "learning_rate": 3.918393178653472e-06,
835
+ "loss": 0.20839078426361085,
836
+ "step": 1180
837
+ },
838
+ {
839
+ "epoch": 0.3808,
840
+ "grad_norm": 0.5018385923371635,
841
+ "learning_rate": 3.895305380938468e-06,
842
+ "loss": 0.2044908285140991,
843
+ "step": 1190
844
+ },
845
+ {
846
+ "epoch": 0.384,
847
+ "grad_norm": 0.48660847876218716,
848
+ "learning_rate": 3.872043429058783e-06,
849
+ "loss": 0.20328717231750487,
850
+ "step": 1200
851
+ },
852
+ {
853
+ "epoch": 0.3872,
854
+ "grad_norm": 0.5586353975354608,
855
+ "learning_rate": 3.84861022644033e-06,
856
+ "loss": 0.20572426319122314,
857
+ "step": 1210
858
+ },
859
+ {
860
+ "epoch": 0.3904,
861
+ "grad_norm": 0.5709168788921625,
862
+ "learning_rate": 3.825008697883574e-06,
863
+ "loss": 0.21369614601135253,
864
+ "step": 1220
865
+ },
866
+ {
867
+ "epoch": 0.3936,
868
+ "grad_norm": 0.5589246090839964,
869
+ "learning_rate": 3.8012417891984776e-06,
870
+ "loss": 0.2072831630706787,
871
+ "step": 1230
872
+ },
873
+ {
874
+ "epoch": 0.3968,
875
+ "grad_norm": 0.5711782327133378,
876
+ "learning_rate": 3.777312466836819e-06,
877
+ "loss": 0.20526669025421143,
878
+ "step": 1240
879
+ },
880
+ {
881
+ "epoch": 0.4,
882
+ "grad_norm": 0.5656399244912672,
883
+ "learning_rate": 3.7532237175219378e-06,
884
+ "loss": 0.20442888736724854,
885
+ "step": 1250
886
+ },
887
+ {
888
+ "epoch": 0.4032,
889
+ "grad_norm": 0.5520901024337347,
890
+ "learning_rate": 3.728978547875948e-06,
891
+ "loss": 0.2092284679412842,
892
+ "step": 1260
893
+ },
894
+ {
895
+ "epoch": 0.4064,
896
+ "grad_norm": 0.553756025103199,
897
+ "learning_rate": 3.7045799840444712e-06,
898
+ "loss": 0.20277605056762696,
899
+ "step": 1270
900
+ },
901
+ {
902
+ "epoch": 0.4096,
903
+ "grad_norm": 0.5430187148138641,
904
+ "learning_rate": 3.6800310713189258e-06,
905
+ "loss": 0.20491743087768555,
906
+ "step": 1280
907
+ },
908
+ {
909
+ "epoch": 0.4128,
910
+ "grad_norm": 0.7620941398223869,
911
+ "learning_rate": 3.6553348737564328e-06,
912
+ "loss": 0.2055516481399536,
913
+ "step": 1290
914
+ },
915
+ {
916
+ "epoch": 0.416,
917
+ "grad_norm": 0.5265612798122297,
918
+ "learning_rate": 3.6304944737973794e-06,
919
+ "loss": 0.21130599975585937,
920
+ "step": 1300
921
+ },
922
+ {
923
+ "epoch": 0.4192,
924
+ "grad_norm": 0.5353794185025008,
925
+ "learning_rate": 3.6055129718806836e-06,
926
+ "loss": 0.20504627227783204,
927
+ "step": 1310
928
+ },
929
+ {
930
+ "epoch": 0.4224,
931
+ "grad_norm": 0.5979654766960453,
932
+ "learning_rate": 3.5803934860568134e-06,
933
+ "loss": 0.2000981330871582,
934
+ "step": 1320
935
+ },
936
+ {
937
+ "epoch": 0.4256,
938
+ "grad_norm": 0.5915664314356317,
939
+ "learning_rate": 3.5551391515986163e-06,
940
+ "loss": 0.20581989288330077,
941
+ "step": 1330
942
+ },
943
+ {
944
+ "epoch": 0.4288,
945
+ "grad_norm": 0.562992516341074,
946
+ "learning_rate": 3.529753120609982e-06,
947
+ "loss": 0.20160207748413086,
948
+ "step": 1340
949
+ },
950
+ {
951
+ "epoch": 0.432,
952
+ "grad_norm": 0.7046032478558245,
953
+ "learning_rate": 3.5042385616324243e-06,
954
+ "loss": 0.2043483018875122,
955
+ "step": 1350
956
+ },
957
+ {
958
+ "epoch": 0.4352,
959
+ "grad_norm": 0.5184492477363449,
960
+ "learning_rate": 3.4785986592495934e-06,
961
+ "loss": 0.20285494327545167,
962
+ "step": 1360
963
+ },
964
+ {
965
+ "epoch": 0.4384,
966
+ "grad_norm": 0.5806380074338086,
967
+ "learning_rate": 3.452836613689803e-06,
968
+ "loss": 0.2009434223175049,
969
+ "step": 1370
970
+ },
971
+ {
972
+ "epoch": 0.4416,
973
+ "grad_norm": 0.5204618736945451,
974
+ "learning_rate": 3.426955640426584e-06,
975
+ "loss": 0.20416510105133057,
976
+ "step": 1380
977
+ },
978
+ {
979
+ "epoch": 0.4448,
980
+ "grad_norm": 0.5765864502605341,
981
+ "learning_rate": 3.4009589697773605e-06,
982
+ "loss": 0.20326631069183348,
983
+ "step": 1390
984
+ },
985
+ {
986
+ "epoch": 0.448,
987
+ "grad_norm": 0.5779970501460372,
988
+ "learning_rate": 3.3748498465002475e-06,
989
+ "loss": 0.20073289871215821,
990
+ "step": 1400
991
+ },
992
+ {
993
+ "epoch": 0.4512,
994
+ "grad_norm": 0.6393995362823897,
995
+ "learning_rate": 3.3486315293890693e-06,
996
+ "loss": 0.20874643325805664,
997
+ "step": 1410
998
+ },
999
+ {
1000
+ "epoch": 0.4544,
1001
+ "grad_norm": 0.5108762095593324,
1002
+ "learning_rate": 3.3223072908666053e-06,
1003
+ "loss": 0.19835340976715088,
1004
+ "step": 1420
1005
+ },
1006
+ {
1007
+ "epoch": 0.4576,
1008
+ "grad_norm": 0.6435280387445825,
1009
+ "learning_rate": 3.295880416576153e-06,
1010
+ "loss": 0.20992684364318848,
1011
+ "step": 1430
1012
+ },
1013
+ {
1014
+ "epoch": 0.4608,
1015
+ "grad_norm": 0.5838753206875198,
1016
+ "learning_rate": 3.269354204971427e-06,
1017
+ "loss": 0.20265870094299315,
1018
+ "step": 1440
1019
+ },
1020
+ {
1021
+ "epoch": 0.464,
1022
+ "grad_norm": 0.6745984788898958,
1023
+ "learning_rate": 3.242731966904865e-06,
1024
+ "loss": 0.20037527084350587,
1025
+ "step": 1450
1026
+ },
1027
+ {
1028
+ "epoch": 0.4672,
1029
+ "grad_norm": 0.5358161645108944,
1030
+ "learning_rate": 3.2160170252143913e-06,
1031
+ "loss": 0.20123369693756105,
1032
+ "step": 1460
1033
+ },
1034
+ {
1035
+ "epoch": 0.4704,
1036
+ "grad_norm": 0.5112361606823973,
1037
+ "learning_rate": 3.1892127143086716e-06,
1038
+ "loss": 0.20752406120300293,
1039
+ "step": 1470
1040
+ },
1041
+ {
1042
+ "epoch": 0.4736,
1043
+ "grad_norm": 0.6333759965752455,
1044
+ "learning_rate": 3.1623223797509347e-06,
1045
+ "loss": 0.19706425666809083,
1046
+ "step": 1480
1047
+ },
1048
+ {
1049
+ "epoch": 0.4768,
1050
+ "grad_norm": 0.6206117536462172,
1051
+ "learning_rate": 3.135349377841396e-06,
1052
+ "loss": 0.20125732421875,
1053
+ "step": 1490
1054
+ },
1055
+ {
1056
+ "epoch": 0.48,
1057
+ "grad_norm": 0.5541712474486513,
1058
+ "learning_rate": 3.1082970751983497e-06,
1059
+ "loss": 0.20749812126159667,
1060
+ "step": 1500
1061
+ },
1062
+ {
1063
+ "epoch": 0.4832,
1064
+ "grad_norm": 0.5835934183180771,
1065
+ "learning_rate": 3.0811688483379546e-06,
1066
+ "loss": 0.20475554466247559,
1067
+ "step": 1510
1068
+ },
1069
+ {
1070
+ "epoch": 0.4864,
1071
+ "grad_norm": 0.5792514427898341,
1072
+ "learning_rate": 3.0539680832528074e-06,
1073
+ "loss": 0.20504088401794435,
1074
+ "step": 1520
1075
+ },
1076
+ {
1077
+ "epoch": 0.4896,
1078
+ "grad_norm": 0.6358843481166787,
1079
+ "learning_rate": 3.026698174989316e-06,
1080
+ "loss": 0.20325000286102296,
1081
+ "step": 1530
1082
+ },
1083
+ {
1084
+ "epoch": 0.4928,
1085
+ "grad_norm": 0.5059500889981753,
1086
+ "learning_rate": 2.999362527223952e-06,
1087
+ "loss": 0.2031909465789795,
1088
+ "step": 1540
1089
+ },
1090
+ {
1091
+ "epoch": 0.496,
1092
+ "grad_norm": 0.5388306821924389,
1093
+ "learning_rate": 2.9719645518384194e-06,
1094
+ "loss": 0.20504312515258788,
1095
+ "step": 1550
1096
+ },
1097
+ {
1098
+ "epoch": 0.4992,
1099
+ "grad_norm": 0.5939936480408617,
1100
+ "learning_rate": 2.944507668493807e-06,
1101
+ "loss": 0.2084404706954956,
1102
+ "step": 1560
1103
+ },
1104
+ {
1105
+ "epoch": 0.5024,
1106
+ "grad_norm": 0.5687025114161597,
1107
+ "learning_rate": 2.9169953042037623e-06,
1108
+ "loss": 0.20367155075073243,
1109
+ "step": 1570
1110
+ },
1111
+ {
1112
+ "epoch": 0.5056,
1113
+ "grad_norm": 0.5703613797457775,
1114
+ "learning_rate": 2.889430892906754e-06,
1115
+ "loss": 0.19950419664382935,
1116
+ "step": 1580
1117
+ },
1118
+ {
1119
+ "epoch": 0.5088,
1120
+ "grad_norm": 0.50147360976836,
1121
+ "learning_rate": 2.861817875037462e-06,
1122
+ "loss": 0.19737675189971923,
1123
+ "step": 1590
1124
+ },
1125
+ {
1126
+ "epoch": 0.512,
1127
+ "grad_norm": 0.5962810686359508,
1128
+ "learning_rate": 2.8341596970973683e-06,
1129
+ "loss": 0.206866455078125,
1130
+ "step": 1600
1131
+ },
1132
+ {
1133
+ "epoch": 0.5152,
1134
+ "grad_norm": 0.564566320219468,
1135
+ "learning_rate": 2.80645981122458e-06,
1136
+ "loss": 0.2020205020904541,
1137
+ "step": 1610
1138
+ },
1139
+ {
1140
+ "epoch": 0.5184,
1141
+ "grad_norm": 0.5246372929237232,
1142
+ "learning_rate": 2.7787216747629508e-06,
1143
+ "loss": 0.20939722061157226,
1144
+ "step": 1620
1145
+ },
1146
+ {
1147
+ "epoch": 0.5216,
1148
+ "grad_norm": 0.5415181940486332,
1149
+ "learning_rate": 2.7509487498305615e-06,
1150
+ "loss": 0.19629446268081666,
1151
+ "step": 1630
1152
+ },
1153
+ {
1154
+ "epoch": 0.5248,
1155
+ "grad_norm": 0.5627430222118958,
1156
+ "learning_rate": 2.7231445028875924e-06,
1157
+ "loss": 0.20240178108215331,
1158
+ "step": 1640
1159
+ },
1160
+ {
1161
+ "epoch": 0.528,
1162
+ "grad_norm": 0.5578941065241574,
1163
+ "learning_rate": 2.6953124043036604e-06,
1164
+ "loss": 0.2012562036514282,
1165
+ "step": 1650
1166
+ },
1167
+ {
1168
+ "epoch": 0.5312,
1169
+ "grad_norm": 0.5487117054063715,
1170
+ "learning_rate": 2.667455927924667e-06,
1171
+ "loss": 0.20127537250518798,
1172
+ "step": 1660
1173
+ },
1174
+ {
1175
+ "epoch": 0.5344,
1176
+ "grad_norm": 0.571360126804376,
1177
+ "learning_rate": 2.6395785506392164e-06,
1178
+ "loss": 0.1964709758758545,
1179
+ "step": 1670
1180
+ },
1181
+ {
1182
+ "epoch": 0.5376,
1183
+ "grad_norm": 0.6088527341362128,
1184
+ "learning_rate": 2.6116837519446407e-06,
1185
+ "loss": 0.1997244954109192,
1186
+ "step": 1680
1187
+ },
1188
+ {
1189
+ "epoch": 0.5408,
1190
+ "grad_norm": 0.5974545138027041,
1191
+ "learning_rate": 2.5837750135127192e-06,
1192
+ "loss": 0.19768773317337035,
1193
+ "step": 1690
1194
+ },
1195
+ {
1196
+ "epoch": 0.544,
1197
+ "grad_norm": 0.5496714163583045,
1198
+ "learning_rate": 2.555855818755108e-06,
1199
+ "loss": 0.20294923782348634,
1200
+ "step": 1700
1201
+ },
1202
+ {
1203
+ "epoch": 0.5472,
1204
+ "grad_norm": 0.7083231030411815,
1205
+ "learning_rate": 2.5279296523885636e-06,
1206
+ "loss": 0.20083847045898437,
1207
+ "step": 1710
1208
+ },
1209
+ {
1210
+ "epoch": 0.5504,
1211
+ "grad_norm": 0.5938882026412365,
1212
+ "learning_rate": 2.5e-06,
1213
+ "loss": 0.20156488418579102,
1214
+ "step": 1720
1215
+ },
1216
+ {
1217
+ "epoch": 0.5536,
1218
+ "grad_norm": 0.5963429209905415,
1219
+ "learning_rate": 2.472070347611437e-06,
1220
+ "loss": 0.19514652490615844,
1221
+ "step": 1730
1222
+ },
1223
+ {
1224
+ "epoch": 0.5568,
1225
+ "grad_norm": 0.6395947365412442,
1226
+ "learning_rate": 2.444144181244893e-06,
1227
+ "loss": 0.20121583938598633,
1228
+ "step": 1740
1229
+ },
1230
+ {
1231
+ "epoch": 0.56,
1232
+ "grad_norm": 0.5998001248295249,
1233
+ "learning_rate": 2.416224986487282e-06,
1234
+ "loss": 0.19726226329803467,
1235
+ "step": 1750
1236
+ },
1237
+ {
1238
+ "epoch": 0.5632,
1239
+ "grad_norm": 0.5593754591530539,
1240
+ "learning_rate": 2.3883162480553605e-06,
1241
+ "loss": 0.19497768878936766,
1242
+ "step": 1760
1243
+ },
1244
+ {
1245
+ "epoch": 0.5664,
1246
+ "grad_norm": 0.5860785466160793,
1247
+ "learning_rate": 2.3604214493607844e-06,
1248
+ "loss": 0.1996150493621826,
1249
+ "step": 1770
1250
+ },
1251
+ {
1252
+ "epoch": 0.5696,
1253
+ "grad_norm": 0.5963601131944923,
1254
+ "learning_rate": 2.332544072075333e-06,
1255
+ "loss": 0.20348951816558838,
1256
+ "step": 1780
1257
+ },
1258
+ {
1259
+ "epoch": 0.5728,
1260
+ "grad_norm": 0.5745583695919886,
1261
+ "learning_rate": 2.30468759569634e-06,
1262
+ "loss": 0.2016512393951416,
1263
+ "step": 1790
1264
+ },
1265
+ {
1266
+ "epoch": 0.576,
1267
+ "grad_norm": 0.5720738010975994,
1268
+ "learning_rate": 2.276855497112408e-06,
1269
+ "loss": 0.1983588457107544,
1270
+ "step": 1800
1271
+ },
1272
+ {
1273
+ "epoch": 0.5792,
1274
+ "grad_norm": 0.594436652050367,
1275
+ "learning_rate": 2.2490512501694394e-06,
1276
+ "loss": 0.19393882751464844,
1277
+ "step": 1810
1278
+ },
1279
+ {
1280
+ "epoch": 0.5824,
1281
+ "grad_norm": 0.5547702774883363,
1282
+ "learning_rate": 2.2212783252370496e-06,
1283
+ "loss": 0.19950855970382692,
1284
+ "step": 1820
1285
+ },
1286
+ {
1287
+ "epoch": 0.5856,
1288
+ "grad_norm": 0.49741997333090354,
1289
+ "learning_rate": 2.1935401887754213e-06,
1290
+ "loss": 0.20486598014831542,
1291
+ "step": 1830
1292
+ },
1293
+ {
1294
+ "epoch": 0.5888,
1295
+ "grad_norm": 0.6191188389453962,
1296
+ "learning_rate": 2.165840302902632e-06,
1297
+ "loss": 0.1979525566101074,
1298
+ "step": 1840
1299
+ },
1300
+ {
1301
+ "epoch": 0.592,
1302
+ "grad_norm": 0.613998551941137,
1303
+ "learning_rate": 2.1381821249625383e-06,
1304
+ "loss": 0.2030627727508545,
1305
+ "step": 1850
1306
+ },
1307
+ {
1308
+ "epoch": 0.5952,
1309
+ "grad_norm": 0.6115410126221079,
1310
+ "learning_rate": 2.1105691070932465e-06,
1311
+ "loss": 0.1951197624206543,
1312
+ "step": 1860
1313
+ },
1314
+ {
1315
+ "epoch": 0.5984,
1316
+ "grad_norm": 0.5666967026000811,
1317
+ "learning_rate": 2.083004695796238e-06,
1318
+ "loss": 0.1926891803741455,
1319
+ "step": 1870
1320
+ },
1321
+ {
1322
+ "epoch": 0.6016,
1323
+ "grad_norm": 0.5564168831256036,
1324
+ "learning_rate": 2.055492331506194e-06,
1325
+ "loss": 0.20087857246398927,
1326
+ "step": 1880
1327
+ },
1328
+ {
1329
+ "epoch": 0.6048,
1330
+ "grad_norm": 0.647003695530594,
1331
+ "learning_rate": 2.0280354481615814e-06,
1332
+ "loss": 0.1991624116897583,
1333
+ "step": 1890
1334
+ },
1335
+ {
1336
+ "epoch": 0.608,
1337
+ "grad_norm": 0.6020348842840653,
1338
+ "learning_rate": 2.000637472776049e-06,
1339
+ "loss": 0.20029563903808595,
1340
+ "step": 1900
1341
+ },
1342
+ {
1343
+ "epoch": 0.6112,
1344
+ "grad_norm": 0.593460784828495,
1345
+ "learning_rate": 1.973301825010685e-06,
1346
+ "loss": 0.19462828636169432,
1347
+ "step": 1910
1348
+ },
1349
+ {
1350
+ "epoch": 0.6144,
1351
+ "grad_norm": 0.6796900420369784,
1352
+ "learning_rate": 1.9460319167471934e-06,
1353
+ "loss": 0.20009157657623292,
1354
+ "step": 1920
1355
+ },
1356
+ {
1357
+ "epoch": 0.6176,
1358
+ "grad_norm": 0.5803908647953272,
1359
+ "learning_rate": 1.9188311516620466e-06,
1360
+ "loss": 0.19473812580108643,
1361
+ "step": 1930
1362
+ },
1363
+ {
1364
+ "epoch": 0.6208,
1365
+ "grad_norm": 0.5919196787967083,
1366
+ "learning_rate": 1.891702924801651e-06,
1367
+ "loss": 0.20190510749816895,
1368
+ "step": 1940
1369
+ },
1370
+ {
1371
+ "epoch": 0.624,
1372
+ "grad_norm": 0.6056764566097385,
1373
+ "learning_rate": 1.864650622158604e-06,
1374
+ "loss": 0.2063821792602539,
1375
+ "step": 1950
1376
+ },
1377
+ {
1378
+ "epoch": 0.6272,
1379
+ "grad_norm": 0.5106064574990916,
1380
+ "learning_rate": 1.8376776202490666e-06,
1381
+ "loss": 0.20139360427856445,
1382
+ "step": 1960
1383
+ },
1384
+ {
1385
+ "epoch": 0.6304,
1386
+ "grad_norm": 0.5816570517079882,
1387
+ "learning_rate": 1.8107872856913293e-06,
1388
+ "loss": 0.19568054676055907,
1389
+ "step": 1970
1390
+ },
1391
+ {
1392
+ "epoch": 0.6336,
1393
+ "grad_norm": 0.6100308085295513,
1394
+ "learning_rate": 1.7839829747856096e-06,
1395
+ "loss": 0.19661173820495606,
1396
+ "step": 1980
1397
+ },
1398
+ {
1399
+ "epoch": 0.6368,
1400
+ "grad_norm": 0.6256775545767371,
1401
+ "learning_rate": 1.7572680330951359e-06,
1402
+ "loss": 0.19576869010925294,
1403
+ "step": 1990
1404
+ },
1405
+ {
1406
+ "epoch": 0.64,
1407
+ "grad_norm": 0.5979254874380191,
1408
+ "learning_rate": 1.7306457950285747e-06,
1409
+ "loss": 0.19802470207214357,
1410
+ "step": 2000
1411
+ },
1412
+ {
1413
+ "epoch": 0.6432,
1414
+ "grad_norm": 0.6445065470953916,
1415
+ "learning_rate": 1.704119583423848e-06,
1416
+ "loss": 0.19182772636413575,
1417
+ "step": 2010
1418
+ },
1419
+ {
1420
+ "epoch": 0.6464,
1421
+ "grad_norm": 0.5238518416749739,
1422
+ "learning_rate": 1.677692709133396e-06,
1423
+ "loss": 0.19971816539764403,
1424
+ "step": 2020
1425
+ },
1426
+ {
1427
+ "epoch": 0.6496,
1428
+ "grad_norm": 0.5902086462380663,
1429
+ "learning_rate": 1.6513684706109311e-06,
1430
+ "loss": 0.20058016777038573,
1431
+ "step": 2030
1432
+ },
1433
+ {
1434
+ "epoch": 0.6528,
1435
+ "grad_norm": 0.5301315426540266,
1436
+ "learning_rate": 1.6251501534997529e-06,
1437
+ "loss": 0.19816763401031495,
1438
+ "step": 2040
1439
+ },
1440
+ {
1441
+ "epoch": 0.656,
1442
+ "grad_norm": 0.5702221922649561,
1443
+ "learning_rate": 1.5990410302226405e-06,
1444
+ "loss": 0.19167234897613525,
1445
+ "step": 2050
1446
+ },
1447
+ {
1448
+ "epoch": 0.6592,
1449
+ "grad_norm": 0.5682142108318351,
1450
+ "learning_rate": 1.5730443595734162e-06,
1451
+ "loss": 0.19806729555130004,
1452
+ "step": 2060
1453
+ },
1454
+ {
1455
+ "epoch": 0.6624,
1456
+ "grad_norm": 0.6268750721579749,
1457
+ "learning_rate": 1.5471633863101982e-06,
1458
+ "loss": 0.1990320086479187,
1459
+ "step": 2070
1460
+ },
1461
+ {
1462
+ "epoch": 0.6656,
1463
+ "grad_norm": 0.6501758398050216,
1464
+ "learning_rate": 1.521401340750407e-06,
1465
+ "loss": 0.20063567161560059,
1466
+ "step": 2080
1467
+ },
1468
+ {
1469
+ "epoch": 0.6688,
1470
+ "grad_norm": 0.5367071332530153,
1471
+ "learning_rate": 1.495761438367577e-06,
1472
+ "loss": 0.2000502109527588,
1473
+ "step": 2090
1474
+ },
1475
+ {
1476
+ "epoch": 0.672,
1477
+ "grad_norm": 0.6644202151690211,
1478
+ "learning_rate": 1.4702468793900187e-06,
1479
+ "loss": 0.19811663627624512,
1480
+ "step": 2100
1481
+ },
1482
+ {
1483
+ "epoch": 0.6752,
1484
+ "grad_norm": 0.6020454013039992,
1485
+ "learning_rate": 1.444860848401384e-06,
1486
+ "loss": 0.19873985052108764,
1487
+ "step": 2110
1488
+ },
1489
+ {
1490
+ "epoch": 0.6784,
1491
+ "grad_norm": 0.5672766014696592,
1492
+ "learning_rate": 1.4196065139431866e-06,
1493
+ "loss": 0.19663108587265016,
1494
+ "step": 2120
1495
+ },
1496
+ {
1497
+ "epoch": 0.6816,
1498
+ "grad_norm": 0.6668756559032718,
1499
+ "learning_rate": 1.3944870281193178e-06,
1500
+ "loss": 0.19677751064300536,
1501
+ "step": 2130
1502
+ },
1503
+ {
1504
+ "epoch": 0.6848,
1505
+ "grad_norm": 0.6146850263092741,
1506
+ "learning_rate": 1.3695055262026208e-06,
1507
+ "loss": 0.20252432823181152,
1508
+ "step": 2140
1509
+ },
1510
+ {
1511
+ "epoch": 0.688,
1512
+ "grad_norm": 0.6023134400750195,
1513
+ "learning_rate": 1.3446651262435679e-06,
1514
+ "loss": 0.19564807415008545,
1515
+ "step": 2150
1516
+ },
1517
+ {
1518
+ "epoch": 0.6912,
1519
+ "grad_norm": 0.5973758444267007,
1520
+ "learning_rate": 1.3199689286810746e-06,
1521
+ "loss": 0.19767165184020996,
1522
+ "step": 2160
1523
+ },
1524
+ {
1525
+ "epoch": 0.6944,
1526
+ "grad_norm": 0.604085220565822,
1527
+ "learning_rate": 1.2954200159555294e-06,
1528
+ "loss": 0.19245314598083496,
1529
+ "step": 2170
1530
+ },
1531
+ {
1532
+ "epoch": 0.6976,
1533
+ "grad_norm": 0.5971658440027723,
1534
+ "learning_rate": 1.2710214521240527e-06,
1535
+ "loss": 0.19593756198883056,
1536
+ "step": 2180
1537
+ },
1538
+ {
1539
+ "epoch": 0.7008,
1540
+ "grad_norm": 0.6712656742168871,
1541
+ "learning_rate": 1.246776282478063e-06,
1542
+ "loss": 0.19848381280899047,
1543
+ "step": 2190
1544
+ },
1545
+ {
1546
+ "epoch": 0.704,
1547
+ "grad_norm": 0.5303502593262494,
1548
+ "learning_rate": 1.222687533163181e-06,
1549
+ "loss": 0.19739968776702882,
1550
+ "step": 2200
1551
+ },
1552
+ {
1553
+ "epoch": 0.7072,
1554
+ "grad_norm": 0.6329890536946617,
1555
+ "learning_rate": 1.1987582108015228e-06,
1556
+ "loss": 0.19885218143463135,
1557
+ "step": 2210
1558
+ },
1559
+ {
1560
+ "epoch": 0.7104,
1561
+ "grad_norm": 0.6175733280769058,
1562
+ "learning_rate": 1.1749913021164255e-06,
1563
+ "loss": 0.20003676414489746,
1564
+ "step": 2220
1565
+ },
1566
+ {
1567
+ "epoch": 0.7136,
1568
+ "grad_norm": 0.6297338992517326,
1569
+ "learning_rate": 1.1513897735596702e-06,
1570
+ "loss": 0.19420522451400757,
1571
+ "step": 2230
1572
+ },
1573
+ {
1574
+ "epoch": 0.7168,
1575
+ "grad_norm": 0.5570261846558745,
1576
+ "learning_rate": 1.127956570941218e-06,
1577
+ "loss": 0.19144604206085206,
1578
+ "step": 2240
1579
+ },
1580
+ {
1581
+ "epoch": 0.72,
1582
+ "grad_norm": 0.7464999016757174,
1583
+ "learning_rate": 1.104694619061533e-06,
1584
+ "loss": 0.20028018951416016,
1585
+ "step": 2250
1586
+ },
1587
+ {
1588
+ "epoch": 0.7232,
1589
+ "grad_norm": 0.5813509472785208,
1590
+ "learning_rate": 1.0816068213465295e-06,
1591
+ "loss": 0.2022254228591919,
1592
+ "step": 2260
1593
+ },
1594
+ {
1595
+ "epoch": 0.7264,
1596
+ "grad_norm": 0.5788680063085246,
1597
+ "learning_rate": 1.0586960594851762e-06,
1598
+ "loss": 0.19734264612197877,
1599
+ "step": 2270
1600
+ },
1601
+ {
1602
+ "epoch": 0.7296,
1603
+ "grad_norm": 0.6879904092074834,
1604
+ "learning_rate": 1.0359651930698217e-06,
1605
+ "loss": 0.19566457271575927,
1606
+ "step": 2280
1607
+ },
1608
+ {
1609
+ "epoch": 0.7328,
1610
+ "grad_norm": 0.545714278159425,
1611
+ "learning_rate": 1.0134170592392837e-06,
1612
+ "loss": 0.19808268547058105,
1613
+ "step": 2290
1614
+ },
1615
+ {
1616
+ "epoch": 0.736,
1617
+ "grad_norm": 0.6957466724150051,
1618
+ "learning_rate": 9.910544723247204e-07,
1619
+ "loss": 0.19703471660614014,
1620
+ "step": 2300
1621
+ },
1622
+ {
1623
+ "epoch": 0.7392,
1624
+ "grad_norm": 0.5722555379171206,
1625
+ "learning_rate": 9.688802234983706e-07,
1626
+ "loss": 0.19638856649398803,
1627
+ "step": 2310
1628
+ },
1629
+ {
1630
+ "epoch": 0.7424,
1631
+ "grad_norm": 0.6657445816108672,
1632
+ "learning_rate": 9.468970804251742e-07,
1633
+ "loss": 0.1994560480117798,
1634
+ "step": 2320
1635
+ },
1636
+ {
1637
+ "epoch": 0.7456,
1638
+ "grad_norm": 0.6118638240003964,
1639
+ "learning_rate": 9.251077869173244e-07,
1640
+ "loss": 0.19247424602508545,
1641
+ "step": 2330
1642
+ },
1643
+ {
1644
+ "epoch": 0.7488,
1645
+ "grad_norm": 0.618262759129052,
1646
+ "learning_rate": 9.035150625918054e-07,
1647
+ "loss": 0.19384448528289794,
1648
+ "step": 2340
1649
+ },
1650
+ {
1651
+ "epoch": 0.752,
1652
+ "grad_norm": 0.5841167908088344,
1653
+ "learning_rate": 8.821216025309395e-07,
1654
+ "loss": 0.19670048952102662,
1655
+ "step": 2350
1656
+ },
1657
+ {
1658
+ "epoch": 0.7552,
1659
+ "grad_norm": 0.6330443090953268,
1660
+ "learning_rate": 8.609300769460055e-07,
1661
+ "loss": 0.191538667678833,
1662
+ "step": 2360
1663
+ },
1664
+ {
1665
+ "epoch": 0.7584,
1666
+ "grad_norm": 0.6922248169402944,
1667
+ "learning_rate": 8.399431308439592e-07,
1668
+ "loss": 0.19869886636734008,
1669
+ "step": 2370
1670
+ },
1671
+ {
1672
+ "epoch": 0.7616,
1673
+ "grad_norm": 0.5821907331028691,
1674
+ "learning_rate": 8.191633836972962e-07,
1675
+ "loss": 0.19837281703948975,
1676
+ "step": 2380
1677
+ },
1678
+ {
1679
+ "epoch": 0.7648,
1680
+ "grad_norm": 0.5484553164447705,
1681
+ "learning_rate": 7.985934291171024e-07,
1682
+ "loss": 0.19366707801818847,
1683
+ "step": 2390
1684
+ },
1685
+ {
1686
+ "epoch": 0.768,
1687
+ "grad_norm": 0.6131324978552078,
1688
+ "learning_rate": 7.7823583452934e-07,
1689
+ "loss": 0.19763607978820802,
1690
+ "step": 2400
1691
+ },
1692
+ {
1693
+ "epoch": 0.7712,
1694
+ "grad_norm": 0.5665386766642198,
1695
+ "learning_rate": 7.58093140854389e-07,
1696
+ "loss": 0.19747262001037597,
1697
+ "step": 2410
1698
+ },
1699
+ {
1700
+ "epoch": 0.7744,
1701
+ "grad_norm": 0.6702088035794936,
1702
+ "learning_rate": 7.381678621899077e-07,
1703
+ "loss": 0.19848825931549072,
1704
+ "step": 2420
1705
+ },
1706
+ {
1707
+ "epoch": 0.7776,
1708
+ "grad_norm": 0.6808200224599221,
1709
+ "learning_rate": 7.184624854970379e-07,
1710
+ "loss": 0.19454023838043213,
1711
+ "step": 2430
1712
+ },
1713
+ {
1714
+ "epoch": 0.7808,
1715
+ "grad_norm": 0.5446840545845119,
1716
+ "learning_rate": 6.989794702899932e-07,
1717
+ "loss": 0.1943270444869995,
1718
+ "step": 2440
1719
+ },
1720
+ {
1721
+ "epoch": 0.784,
1722
+ "grad_norm": 0.6415010178339859,
1723
+ "learning_rate": 6.797212483290777e-07,
1724
+ "loss": 0.19584910869598388,
1725
+ "step": 2450
1726
+ },
1727
+ {
1728
+ "epoch": 0.7872,
1729
+ "grad_norm": 0.603526871568268,
1730
+ "learning_rate": 6.60690223317171e-07,
1731
+ "loss": 0.19342836141586303,
1732
+ "step": 2460
1733
+ },
1734
+ {
1735
+ "epoch": 0.7904,
1736
+ "grad_norm": 0.5817111419419255,
1737
+ "learning_rate": 6.418887705997046e-07,
1738
+ "loss": 0.19574793577194213,
1739
+ "step": 2470
1740
+ },
1741
+ {
1742
+ "epoch": 0.7936,
1743
+ "grad_norm": 0.7792382444355755,
1744
+ "learning_rate": 6.23319236868189e-07,
1745
+ "loss": 0.1987607717514038,
1746
+ "step": 2480
1747
+ },
1748
+ {
1749
+ "epoch": 0.7968,
1750
+ "grad_norm": 0.6291788716222239,
1751
+ "learning_rate": 6.049839398673141e-07,
1752
+ "loss": 0.20009655952453614,
1753
+ "step": 2490
1754
+ },
1755
+ {
1756
+ "epoch": 0.8,
1757
+ "grad_norm": 0.674170182636883,
1758
+ "learning_rate": 5.868851681056567e-07,
1759
+ "loss": 0.2016763210296631,
1760
+ "step": 2500
1761
+ },
1762
+ {
1763
+ "epoch": 0.8032,
1764
+ "grad_norm": 0.5738700746068163,
1765
+ "learning_rate": 5.690251805700467e-07,
1766
+ "loss": 0.19858623743057252,
1767
+ "step": 2510
1768
+ },
1769
+ {
1770
+ "epoch": 0.8064,
1771
+ "grad_norm": 0.5748267344102337,
1772
+ "learning_rate": 5.514062064436096e-07,
1773
+ "loss": 0.19959205389022827,
1774
+ "step": 2520
1775
+ },
1776
+ {
1777
+ "epoch": 0.8096,
1778
+ "grad_norm": 0.6464282974919533,
1779
+ "learning_rate": 5.34030444827533e-07,
1780
+ "loss": 0.19621236324310304,
1781
+ "step": 2530
1782
+ },
1783
+ {
1784
+ "epoch": 0.8128,
1785
+ "grad_norm": 0.6390320405050175,
1786
+ "learning_rate": 5.169000644665895e-07,
1787
+ "loss": 0.19293551445007323,
1788
+ "step": 2540
1789
+ },
1790
+ {
1791
+ "epoch": 0.816,
1792
+ "grad_norm": 0.5856228193289068,
1793
+ "learning_rate": 5.000172034784442e-07,
1794
+ "loss": 0.1952167272567749,
1795
+ "step": 2550
1796
+ },
1797
+ {
1798
+ "epoch": 0.8192,
1799
+ "grad_norm": 0.6152721851543074,
1800
+ "learning_rate": 4.833839690867853e-07,
1801
+ "loss": 0.19755464792251587,
1802
+ "step": 2560
1803
+ },
1804
+ {
1805
+ "epoch": 0.8224,
1806
+ "grad_norm": 0.6792777707129383,
1807
+ "learning_rate": 4.6700243735831705e-07,
1808
+ "loss": 0.1906466007232666,
1809
+ "step": 2570
1810
+ },
1811
+ {
1812
+ "epoch": 0.8256,
1813
+ "grad_norm": 0.5650779115466599,
1814
+ "learning_rate": 4.508746529436311e-07,
1815
+ "loss": 0.1896218776702881,
1816
+ "step": 2580
1817
+ },
1818
+ {
1819
+ "epoch": 0.8288,
1820
+ "grad_norm": 0.6068556104605155,
1821
+ "learning_rate": 4.350026288220083e-07,
1822
+ "loss": 0.1972370147705078,
1823
+ "step": 2590
1824
+ },
1825
+ {
1826
+ "epoch": 0.832,
1827
+ "grad_norm": 0.6087844635927864,
1828
+ "learning_rate": 4.1938834605017133e-07,
1829
+ "loss": 0.19401493072509765,
1830
+ "step": 2600
1831
+ },
1832
+ {
1833
+ "epoch": 0.8352,
1834
+ "grad_norm": 0.594443863161453,
1835
+ "learning_rate": 4.0403375351501515e-07,
1836
+ "loss": 0.19397275447845458,
1837
+ "step": 2610
1838
+ },
1839
+ {
1840
+ "epoch": 0.8384,
1841
+ "grad_norm": 0.5777613928889838,
1842
+ "learning_rate": 3.88940767690362e-07,
1843
+ "loss": 0.19363962411880492,
1844
+ "step": 2620
1845
+ },
1846
+ {
1847
+ "epoch": 0.8416,
1848
+ "grad_norm": 0.6122408540819826,
1849
+ "learning_rate": 3.7411127239775774e-07,
1850
+ "loss": 0.19224631786346436,
1851
+ "step": 2630
1852
+ },
1853
+ {
1854
+ "epoch": 0.8448,
1855
+ "grad_norm": 0.5922115547592817,
1856
+ "learning_rate": 3.595471185713431e-07,
1857
+ "loss": 0.19027912616729736,
1858
+ "step": 2640
1859
+ },
1860
+ {
1861
+ "epoch": 0.848,
1862
+ "grad_norm": 0.6012010067551694,
1863
+ "learning_rate": 3.4525012402682826e-07,
1864
+ "loss": 0.1921192765235901,
1865
+ "step": 2650
1866
+ },
1867
+ {
1868
+ "epoch": 0.8512,
1869
+ "grad_norm": 0.6089446682050474,
1870
+ "learning_rate": 3.3122207323460804e-07,
1871
+ "loss": 0.19460537433624267,
1872
+ "step": 2660
1873
+ },
1874
+ {
1875
+ "epoch": 0.8544,
1876
+ "grad_norm": 0.6314431181993275,
1877
+ "learning_rate": 3.1746471709702963e-07,
1878
+ "loss": 0.19075865745544435,
1879
+ "step": 2670
1880
+ },
1881
+ {
1882
+ "epoch": 0.8576,
1883
+ "grad_norm": 0.6136529603147252,
1884
+ "learning_rate": 3.039797727298585e-07,
1885
+ "loss": 0.1973212718963623,
1886
+ "step": 2680
1887
+ },
1888
+ {
1889
+ "epoch": 0.8608,
1890
+ "grad_norm": 0.6278068265217286,
1891
+ "learning_rate": 2.9076892324795546e-07,
1892
+ "loss": 0.19564627408981322,
1893
+ "step": 2690
1894
+ },
1895
+ {
1896
+ "epoch": 0.864,
1897
+ "grad_norm": 0.6308491327804164,
1898
+ "learning_rate": 2.778338175551995e-07,
1899
+ "loss": 0.19089040756225586,
1900
+ "step": 2700
1901
+ },
1902
+ {
1903
+ "epoch": 0.8672,
1904
+ "grad_norm": 0.6806226474068601,
1905
+ "learning_rate": 2.6517607013868326e-07,
1906
+ "loss": 0.19906394481658934,
1907
+ "step": 2710
1908
+ },
1909
+ {
1910
+ "epoch": 0.8704,
1911
+ "grad_norm": 0.6497216896329614,
1912
+ "learning_rate": 2.527972608672002e-07,
1913
+ "loss": 0.19420729875564574,
1914
+ "step": 2720
1915
+ },
1916
+ {
1917
+ "epoch": 0.8736,
1918
+ "grad_norm": 0.5988037888796804,
1919
+ "learning_rate": 2.40698934794053e-07,
1920
+ "loss": 0.1949334740638733,
1921
+ "step": 2730
1922
+ },
1923
+ {
1924
+ "epoch": 0.8768,
1925
+ "grad_norm": 0.5825410688543936,
1926
+ "learning_rate": 2.2888260196421237e-07,
1927
+ "loss": 0.19373006820678712,
1928
+ "step": 2740
1929
+ },
1930
+ {
1931
+ "epoch": 0.88,
1932
+ "grad_norm": 0.5659393573725252,
1933
+ "learning_rate": 2.1734973722583735e-07,
1934
+ "loss": 0.19743962287902833,
1935
+ "step": 2750
1936
+ },
1937
+ {
1938
+ "epoch": 0.8832,
1939
+ "grad_norm": 0.6810045862821603,
1940
+ "learning_rate": 2.0610178004619564e-07,
1941
+ "loss": 0.18792747259140014,
1942
+ "step": 2760
1943
+ },
1944
+ {
1945
+ "epoch": 0.8864,
1946
+ "grad_norm": 0.5624807528399969,
1947
+ "learning_rate": 1.9514013433199834e-07,
1948
+ "loss": 0.20065484046936036,
1949
+ "step": 2770
1950
+ },
1951
+ {
1952
+ "epoch": 0.8896,
1953
+ "grad_norm": 0.5300049949985157,
1954
+ "learning_rate": 1.8446616825416958e-07,
1955
+ "loss": 0.19963890314102173,
1956
+ "step": 2780
1957
+ },
1958
+ {
1959
+ "epoch": 0.8928,
1960
+ "grad_norm": 0.6417643354263414,
1961
+ "learning_rate": 1.7408121407708007e-07,
1962
+ "loss": 0.19946534633636476,
1963
+ "step": 2790
1964
+ },
1965
+ {
1966
+ "epoch": 0.896,
1967
+ "grad_norm": 0.6263783317633913,
1968
+ "learning_rate": 1.6398656799226253e-07,
1969
+ "loss": 0.1873138427734375,
1970
+ "step": 2800
1971
+ },
1972
+ {
1973
+ "epoch": 0.8992,
1974
+ "grad_norm": 0.6642472444356609,
1975
+ "learning_rate": 1.5418348995662773e-07,
1976
+ "loss": 0.1936098575592041,
1977
+ "step": 2810
1978
+ },
1979
+ {
1980
+ "epoch": 0.9024,
1981
+ "grad_norm": 0.6361104958877116,
1982
+ "learning_rate": 1.4467320353520275e-07,
1983
+ "loss": 0.192909574508667,
1984
+ "step": 2820
1985
+ },
1986
+ {
1987
+ "epoch": 0.9056,
1988
+ "grad_norm": 0.606401356191172,
1989
+ "learning_rate": 1.3545689574841341e-07,
1990
+ "loss": 0.1932598114013672,
1991
+ "step": 2830
1992
+ },
1993
+ {
1994
+ "epoch": 0.9088,
1995
+ "grad_norm": 0.6138805257535019,
1996
+ "learning_rate": 1.26535716923927e-07,
1997
+ "loss": 0.19897468090057374,
1998
+ "step": 2840
1999
+ },
2000
+ {
2001
+ "epoch": 0.912,
2002
+ "grad_norm": 0.6113791423952993,
2003
+ "learning_rate": 1.1791078055307493e-07,
2004
+ "loss": 0.19516528844833375,
2005
+ "step": 2850
2006
+ },
2007
+ {
2008
+ "epoch": 0.9152,
2009
+ "grad_norm": 0.5897316619244026,
2010
+ "learning_rate": 1.0958316315187289e-07,
2011
+ "loss": 0.1947079300880432,
2012
+ "step": 2860
2013
+ },
2014
+ {
2015
+ "epoch": 0.9184,
2016
+ "grad_norm": 0.6570448249633108,
2017
+ "learning_rate": 1.0155390412665528e-07,
2018
+ "loss": 0.19286593198776245,
2019
+ "step": 2870
2020
+ },
2021
+ {
2022
+ "epoch": 0.9216,
2023
+ "grad_norm": 0.6143543897264965,
2024
+ "learning_rate": 9.38240056443443e-08,
2025
+ "loss": 0.18985612392425538,
2026
+ "step": 2880
2027
+ },
2028
+ {
2029
+ "epoch": 0.9248,
2030
+ "grad_norm": 0.6208574768565508,
2031
+ "learning_rate": 8.639443250736402e-08,
2032
+ "loss": 0.1930636167526245,
2033
+ "step": 2890
2034
+ },
2035
+ {
2036
+ "epoch": 0.928,
2037
+ "grad_norm": 0.6380337536968056,
2038
+ "learning_rate": 7.926611203321777e-08,
2039
+ "loss": 0.1940324306488037,
2040
+ "step": 2900
2041
+ },
2042
+ {
2043
+ "epoch": 0.9312,
2044
+ "grad_norm": 0.6333119199427104,
2045
+ "learning_rate": 7.243993393874882e-08,
2046
+ "loss": 0.195207679271698,
2047
+ "step": 2910
2048
+ },
2049
+ {
2050
+ "epoch": 0.9344,
2051
+ "grad_norm": 0.5601684784399228,
2052
+ "learning_rate": 6.591675022908805e-08,
2053
+ "loss": 0.1926344394683838,
2054
+ "step": 2920
2055
+ },
2056
+ {
2057
+ "epoch": 0.9376,
2058
+ "grad_norm": 0.7001254632467586,
2059
+ "learning_rate": 5.969737509131241e-08,
2060
+ "loss": 0.189910888671875,
2061
+ "step": 2930
2062
+ },
2063
+ {
2064
+ "epoch": 0.9408,
2065
+ "grad_norm": 0.5707165379372983,
2066
+ "learning_rate": 5.3782584792823334e-08,
2067
+ "loss": 0.1941395878791809,
2068
+ "step": 2940
2069
+ },
2070
+ {
2071
+ "epoch": 0.944,
2072
+ "grad_norm": 0.637882100753534,
2073
+ "learning_rate": 4.817311758445686e-08,
2074
+ "loss": 0.19586544036865233,
2075
+ "step": 2950
2076
+ },
2077
+ {
2078
+ "epoch": 0.9472,
2079
+ "grad_norm": 0.58305847153215,
2080
+ "learning_rate": 4.286967360833866e-08,
2081
+ "loss": 0.19621498584747316,
2082
+ "step": 2960
2083
+ },
2084
+ {
2085
+ "epoch": 0.9504,
2086
+ "grad_norm": 0.6444124781946634,
2087
+ "learning_rate": 3.787291481049754e-08,
2088
+ "loss": 0.19597216844558715,
2089
+ "step": 2970
2090
+ },
2091
+ {
2092
+ "epoch": 0.9536,
2093
+ "grad_norm": 0.68778482150424,
2094
+ "learning_rate": 3.3183464858244364e-08,
2095
+ "loss": 0.20229551792144776,
2096
+ "step": 2980
2097
+ },
2098
+ {
2099
+ "epoch": 0.9568,
2100
+ "grad_norm": 0.589065287919965,
2101
+ "learning_rate": 2.8801909062328992e-08,
2102
+ "loss": 0.1879359722137451,
2103
+ "step": 2990
2104
+ },
2105
+ {
2106
+ "epoch": 0.96,
2107
+ "grad_norm": 0.7200708770444023,
2108
+ "learning_rate": 2.4728794303886248e-08,
2109
+ "loss": 0.18806444406509398,
2110
+ "step": 3000
2111
+ },
2112
+ {
2113
+ "epoch": 0.9632,
2114
+ "grad_norm": 0.6369212243333968,
2115
+ "learning_rate": 2.0964628966175794e-08,
2116
+ "loss": 0.19293060302734374,
2117
+ "step": 3010
2118
+ },
2119
+ {
2120
+ "epoch": 0.9664,
2121
+ "grad_norm": 0.6150129328436796,
2122
+ "learning_rate": 1.750988287113009e-08,
2123
+ "loss": 0.19189660549163817,
2124
+ "step": 3020
2125
+ },
2126
+ {
2127
+ "epoch": 0.9696,
2128
+ "grad_norm": 0.5966036549992078,
2129
+ "learning_rate": 1.4364987220713278e-08,
2130
+ "loss": 0.1992994427680969,
2131
+ "step": 3030
2132
+ },
2133
+ {
2134
+ "epoch": 0.9728,
2135
+ "grad_norm": 0.6785615385564472,
2136
+ "learning_rate": 1.1530334543099763e-08,
2137
+ "loss": 0.19624128341674804,
2138
+ "step": 3040
2139
+ },
2140
+ {
2141
+ "epoch": 0.976,
2142
+ "grad_norm": 0.626236262460755,
2143
+ "learning_rate": 9.006278643683697e-09,
2144
+ "loss": 0.19942662715911866,
2145
+ "step": 3050
2146
+ },
2147
+ {
2148
+ "epoch": 0.9792,
2149
+ "grad_norm": 0.71228117768398,
2150
+ "learning_rate": 6.793134560916514e-09,
2151
+ "loss": 0.2007957935333252,
2152
+ "step": 3060
2153
+ },
2154
+ {
2155
+ "epoch": 0.9824,
2156
+ "grad_norm": 0.5740813965788273,
2157
+ "learning_rate": 4.891178526986451e-09,
2158
+ "loss": 0.19730459451675414,
2159
+ "step": 3070
2160
+ },
2161
+ {
2162
+ "epoch": 0.9856,
2163
+ "grad_norm": 0.6522249776214731,
2164
+ "learning_rate": 3.3006479333413943e-09,
2165
+ "loss": 0.1995969295501709,
2166
+ "step": 3080
2167
+ },
2168
+ {
2169
+ "epoch": 0.9888,
2170
+ "grad_norm": 0.6484316026206892,
2171
+ "learning_rate": 2.021741301058422e-09,
2172
+ "loss": 0.19556543827056885,
2173
+ "step": 3090
2174
+ },
2175
+ {
2176
+ "epoch": 0.992,
2177
+ "grad_norm": 0.6355663767406068,
2178
+ "learning_rate": 1.0546182560652872e-09,
2179
+ "loss": 0.19732578992843627,
2180
+ "step": 3100
2181
+ },
2182
+ {
2183
+ "epoch": 0.9952,
2184
+ "grad_norm": 0.6169267731488666,
2185
+ "learning_rate": 3.9939950921774607e-10,
2186
+ "loss": 0.1917206883430481,
2187
+ "step": 3110
2188
+ },
2189
+ {
2190
+ "epoch": 0.9984,
2191
+ "grad_norm": 0.5994063111681457,
2192
+ "learning_rate": 5.616684123160854e-11,
2193
+ "loss": 0.1916499137878418,
2194
+ "step": 3120
2195
+ },
2196
+ {
2197
+ "epoch": 1.0,
2198
+ "step": 3125,
2199
+ "total_flos": 1201860236279808.0,
2200
+ "train_loss": 0.2128027264213562,
2201
+ "train_runtime": 15463.9635,
2202
+ "train_samples_per_second": 12.933,
2203
+ "train_steps_per_second": 0.202
2204
+ }
2205
+ ],
2206
+ "logging_steps": 10,
2207
+ "max_steps": 3125,
2208
+ "num_input_tokens_seen": 0,
2209
+ "num_train_epochs": 1,
2210
+ "save_steps": 500,
2211
+ "stateful_callbacks": {
2212
+ "TrainerControl": {
2213
+ "args": {
2214
+ "should_epoch_stop": false,
2215
+ "should_evaluate": false,
2216
+ "should_log": false,
2217
+ "should_save": true,
2218
+ "should_training_stop": true
2219
+ },
2220
+ "attributes": {}
2221
+ }
2222
+ },
2223
+ "total_flos": 1201860236279808.0,
2224
+ "train_batch_size": 8,
2225
+ "trial_name": null,
2226
+ "trial_params": null
2227
+ }
checkpoints/Qwen3-VL-2B-SFT/training_loss.png ADDED