martinkorelic commited on
Commit
cb17bda
·
verified ·
1 Parent(s): 66e58a5

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r2-a2/adapter_config.json +40 -0
  2. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r2-a2/eval_results.json +4 -0
  3. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r2-a2/training_configuration.json +38 -0
  4. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r2-a2/training_logs.json +625 -0
  5. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r32-a2/adapter_config.json +40 -0
  6. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r32-a2/eval_results.json +4 -0
  7. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r32-a2/training_configuration.json +38 -0
  8. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r32-a2/training_logs.json +625 -0
  9. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r8-a2/adapter_config.json +40 -0
  10. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r8-a2/eval_results.json +4 -0
  11. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r8-a2/training_configuration.json +38 -0
  12. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r8-a2/training_logs.json +625 -0
  13. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r2-a2/adapter_config.json +40 -0
  14. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r2-a2/eval_results.json +4 -0
  15. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r2-a2/training_configuration.json +38 -0
  16. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r2-a2/training_logs.json +1273 -0
  17. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r32-a2/adapter_config.json +40 -0
  18. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r32-a2/eval_results.json +4 -0
  19. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r32-a2/training_configuration.json +38 -0
  20. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r32-a2/training_logs.json +1273 -0
  21. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r8-a2/adapter_config.json +40 -0
  22. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r8-a2/eval_results.json +4 -0
  23. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r8-a2/training_configuration.json +38 -0
  24. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r8-a2/training_logs.json +1273 -0
  25. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r2-a2/adapter_config.json +40 -0
  26. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r2-a2/eval_results.json +4 -0
  27. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r2-a2/training_configuration.json +38 -0
  28. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r2-a2/training_logs.json +2659 -0
  29. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r2-a2/adapter_config.json +40 -0
  30. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r2-a2/eval_results.json +4 -0
  31. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r2-a2/training_configuration.json +38 -0
  32. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r2-a2/training_logs.json +0 -0
  33. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r32-a2/adapter_config.json +40 -0
  34. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r32-a2/eval_results.json +4 -0
  35. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r32-a2/training_configuration.json +38 -0
  36. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r32-a2/training_logs.json +0 -0
  37. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r8-a2/adapter_config.json +40 -0
  38. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r8-a2/eval_results.json +4 -0
  39. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r8-a2/training_configuration.json +38 -0
  40. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r8-a2/training_logs.json +0 -0
  41. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r2-a2/adapter_config.json +40 -0
  42. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r2-a2/eval_results.json +4 -0
  43. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r2-a2/training_configuration.json +38 -0
  44. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r2-a2/training_logs.json +0 -0
  45. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r8-a2/adapter_config.json +40 -0
  46. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r8-a2/eval_results.json +4 -0
  47. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r8-a2/training_configuration.json +38 -0
  48. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r8-a2/training_logs.json +0 -0
  49. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-winogrande-r2-a2/adapter_config.json +40 -0
  50. TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-winogrande-r2-a2/eval_results.json +4 -0
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 3,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 8,
25
+ "r": 2,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 2,
29
+ "target_modules": [
30
+ "down_proj",
31
+ "v_proj",
32
+ "k_proj",
33
+ "up_proj",
34
+ "gate_proj",
35
+ "o_proj",
36
+ "q_proj"
37
+ ],
38
+ "task_type": null,
39
+ "use_bnb": false
40
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.5742320819112628
4
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "mars",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1307064
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-mars-arc_c-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-arc_c-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-09-02T01:26:53.248869"
38
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r2-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 2.26033664,
6
+ "gpu_mem": 1.570594304,
7
+ "loss": 4.523,
8
+ "grad_norm": 386.9033203125,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 2.262106112,
15
+ "gpu_mem": 1.581082624,
16
+ "loss": 4.7678,
17
+ "grad_norm": 397.74127197265625,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 2.26230272,
24
+ "gpu_mem": 1.581113344,
25
+ "loss": 2.1543,
26
+ "grad_norm": 580.8819580078125,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 2.26230272,
33
+ "gpu_mem": 1.581079552,
34
+ "loss": 1.549,
35
+ "grad_norm": 28.669719696044922,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 2.262499328,
42
+ "gpu_mem": 1.581067264,
43
+ "loss": 1.5365,
44
+ "grad_norm": 46.89268493652344,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 2.262499328,
51
+ "gpu_mem": 1.58113024,
52
+ "loss": 1.4569,
53
+ "grad_norm": 26.04387092590332,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 2.262499328,
60
+ "gpu_mem": 1.581136384,
61
+ "loss": 1.4555,
62
+ "grad_norm": 25.978248596191406,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 2.262695936,
69
+ "gpu_mem": 1.581094912,
70
+ "loss": 1.5001,
71
+ "grad_norm": 33.86579132080078,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 2.262695936,
78
+ "gpu_mem": 1.581090304,
79
+ "loss": 1.4249,
80
+ "grad_norm": 32.164085388183594,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 2.262695936,
87
+ "gpu_mem": 1.581079552,
88
+ "loss": 1.5991,
89
+ "grad_norm": 28.974334716796875,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 2.262695936,
96
+ "gpu_mem": 1.581090304,
97
+ "loss": 1.3896,
98
+ "grad_norm": 12.336336135864258,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 2.262695936,
105
+ "gpu_mem": 1.58111488,
106
+ "loss": 1.5206,
107
+ "grad_norm": 24.24568748474121,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 2.262695936,
114
+ "gpu_mem": 1.58111488,
115
+ "loss": 1.3429,
116
+ "grad_norm": 20.367385864257812,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 2.262695936,
123
+ "gpu_mem": 1.581062656,
124
+ "loss": 1.5003,
125
+ "grad_norm": 19.901113510131836,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 2.262695936,
132
+ "gpu_mem": 1.58113792,
133
+ "loss": 1.4286,
134
+ "grad_norm": 9.188563346862793,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 2.262695936,
141
+ "gpu_mem": 1.581131776,
142
+ "loss": 1.4626,
143
+ "grad_norm": 22.184289932250977,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 2.262695936,
150
+ "gpu_mem": 1.581136384,
151
+ "loss": 1.3688,
152
+ "grad_norm": 10.638352394104004,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 2.262695936,
159
+ "gpu_mem": 1.5863424,
160
+ "loss": 2.0961,
161
+ "grad_norm": 25.868715286254883,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 2.262695936,
168
+ "gpu_mem": 1.586340864,
169
+ "loss": 1.4221,
170
+ "grad_norm": 15.752737045288086,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 2.262695936,
177
+ "gpu_mem": 1.586316288,
178
+ "loss": 1.2975,
179
+ "grad_norm": 13.059765815734863,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 2.262695936,
186
+ "gpu_mem": 1.586323968,
187
+ "loss": 1.4209,
188
+ "grad_norm": 15.304547309875488,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 2.262695936,
195
+ "gpu_mem": 1.586353152,
196
+ "loss": 1.3644,
197
+ "grad_norm": 14.864243507385254,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 2.262695936,
204
+ "gpu_mem": 1.586382336,
205
+ "loss": 1.3023,
206
+ "grad_norm": 11.402833938598633,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 2.262695936,
213
+ "gpu_mem": 1.586325504,
214
+ "loss": 1.3767,
215
+ "grad_norm": 12.885428428649902,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 2.262695936,
222
+ "gpu_mem": 1.586394624,
223
+ "loss": 1.3454,
224
+ "grad_norm": 10.699958801269531,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 2.262695936,
231
+ "gpu_mem": 1.586351616,
232
+ "loss": 1.3789,
233
+ "grad_norm": 19.259225845336914,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 2.262695936,
240
+ "gpu_mem": 1.586310144,
241
+ "loss": 1.4133,
242
+ "grad_norm": 10.785472869873047,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 2.262695936,
249
+ "gpu_mem": 1.586356224,
250
+ "loss": 1.5834,
251
+ "grad_norm": 21.35184669494629,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 2.262695936,
258
+ "gpu_mem": 1.586351616,
259
+ "loss": 1.3562,
260
+ "grad_norm": 7.896946907043457,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 2.262695936,
267
+ "gpu_mem": 1.586340864,
268
+ "loss": 1.3457,
269
+ "grad_norm": 12.608896255493164,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 2.262695936,
276
+ "gpu_mem": 1.586371584,
277
+ "loss": 1.4318,
278
+ "grad_norm": 19.986997604370117,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 2.262695936,
285
+ "gpu_mem": 1.5863808,
286
+ "loss": 1.3923,
287
+ "grad_norm": 15.040932655334473,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 2.262695936,
294
+ "gpu_mem": 1.586360832,
295
+ "loss": 1.3957,
296
+ "grad_norm": 10.201010704040527,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 2.262695936,
303
+ "gpu_mem": 1.586339328,
304
+ "loss": 1.4377,
305
+ "grad_norm": 10.625317573547363,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 2.262695936,
312
+ "gpu_mem": 1.5862272,
313
+ "loss": 2.1463,
314
+ "grad_norm": 13.790069580078125,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 2.262695936,
321
+ "gpu_mem": 1.581108736,
322
+ "loss": 1.3124,
323
+ "grad_norm": 6.783844947814941,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 2.262695936,
330
+ "gpu_mem": 1.581117952,
331
+ "loss": 1.4666,
332
+ "grad_norm": 18.66895866394043,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 2.262695936,
339
+ "gpu_mem": 1.581088768,
340
+ "loss": 1.4124,
341
+ "grad_norm": 16.531171798706055,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 2.262695936,
348
+ "gpu_mem": 1.5811072,
349
+ "loss": 1.4034,
350
+ "grad_norm": 15.827587127685547,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 2.262695936,
357
+ "gpu_mem": 1.58108416,
358
+ "loss": 1.3581,
359
+ "grad_norm": 6.456789970397949,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 2.262695936,
366
+ "gpu_mem": 1.581085696,
367
+ "loss": 1.3423,
368
+ "grad_norm": 5.921348571777344,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 2.262695936,
375
+ "gpu_mem": 1.58111488,
376
+ "loss": 1.2722,
377
+ "grad_norm": 7.7053327560424805,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 2.262695936,
384
+ "gpu_mem": 1.58113024,
385
+ "loss": 1.3529,
386
+ "grad_norm": 9.941762924194336,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 2.262695936,
393
+ "gpu_mem": 1.581148672,
394
+ "loss": 1.2702,
395
+ "grad_norm": 7.22553014755249,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 2.262695936,
402
+ "gpu_mem": 1.581102592,
403
+ "loss": 1.2056,
404
+ "grad_norm": 6.9285888671875,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 2.262695936,
411
+ "gpu_mem": 1.581096448,
412
+ "loss": 1.2361,
413
+ "grad_norm": 8.24813461303711,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 2.262695936,
420
+ "gpu_mem": 1.581090304,
421
+ "loss": 1.2144,
422
+ "grad_norm": 13.11104679107666,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 2.262695936,
429
+ "gpu_mem": 1.581094912,
430
+ "loss": 1.1387,
431
+ "grad_norm": 10.768682479858398,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 2.262695936,
438
+ "gpu_mem": 1.581085696,
439
+ "loss": 1.1511,
440
+ "grad_norm": 11.449169158935547,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 2.262695936,
447
+ "gpu_mem": 1.581067264,
448
+ "loss": 1.2684,
449
+ "grad_norm": 17.303821563720703,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 2.262695936,
456
+ "gpu_mem": 1.58109184,
457
+ "loss": 1.132,
458
+ "grad_norm": 17.978370666503906,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 2.262695936,
465
+ "gpu_mem": 1.581119488,
466
+ "loss": 1.3197,
467
+ "grad_norm": 21.446002960205078,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 2.262695936,
474
+ "gpu_mem": 1.586337792,
475
+ "loss": 1.7427,
476
+ "grad_norm": 33.512630462646484,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 2.262695936,
483
+ "gpu_mem": 1.586307072,
484
+ "loss": 1.0626,
485
+ "grad_norm": 17.818815231323242,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 2.262695936,
492
+ "gpu_mem": 1.586340864,
493
+ "loss": 1.1322,
494
+ "grad_norm": 19.490102767944336,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 2.262695936,
501
+ "gpu_mem": 1.586414592,
502
+ "loss": 1.0963,
503
+ "grad_norm": 24.299396514892578,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 2.262695936,
510
+ "gpu_mem": 1.58635776,
511
+ "loss": 1.0444,
512
+ "grad_norm": 23.322965621948242,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 2.262695936,
519
+ "gpu_mem": 1.586351616,
520
+ "loss": 0.9757,
521
+ "grad_norm": 27.589811325073242,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 2.262695936,
528
+ "gpu_mem": 1.586402304,
529
+ "loss": 0.9909,
530
+ "grad_norm": 18.960704803466797,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 2.262695936,
537
+ "gpu_mem": 1.586328576,
538
+ "loss": 1.0639,
539
+ "grad_norm": 30.403343200683594,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 2.262695936,
546
+ "gpu_mem": 1.5863424,
547
+ "loss": 1.0703,
548
+ "grad_norm": 24.316631317138672,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 2.262695936,
555
+ "gpu_mem": 1.586343936,
556
+ "loss": 1.0999,
557
+ "grad_norm": 23.189725875854492,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 2.262695936,
564
+ "gpu_mem": 1.586333184,
565
+ "loss": 1.0584,
566
+ "grad_norm": 22.57982063293457,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 2.262695936,
573
+ "gpu_mem": 1.58635008,
574
+ "loss": 0.9975,
575
+ "grad_norm": 21.49742317199707,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 2.262695936,
582
+ "gpu_mem": 1.586371584,
583
+ "loss": 0.8987,
584
+ "grad_norm": 23.114856719970703,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 2.262695936,
591
+ "gpu_mem": 1.586362368,
592
+ "loss": 0.9014,
593
+ "grad_norm": 20.236783981323242,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 2.262695936,
600
+ "gpu_mem": 1.58638848,
601
+ "loss": 1.0798,
602
+ "grad_norm": 19.5511417388916,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 2.262695936,
609
+ "gpu_mem": 1.586339328,
610
+ "loss": 0.9931,
611
+ "grad_norm": 20.80122947692871,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 2.262695936,
618
+ "gpu_mem": 1.586339328,
619
+ "train_runtime": 387.8643,
620
+ "train_samples_per_second": 11.54,
621
+ "train_steps_per_second": 0.175,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.434577746426358
624
+ }
625
+ ]
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 3,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 8,
25
+ "r": 32,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 32,
29
+ "target_modules": [
30
+ "down_proj",
31
+ "v_proj",
32
+ "k_proj",
33
+ "up_proj",
34
+ "gate_proj",
35
+ "o_proj",
36
+ "q_proj"
37
+ ],
38
+ "task_type": null,
39
+ "use_bnb": false
40
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.4786689419795222
4
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "mars",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 21018624
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-mars-arc_c-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-arc_c-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-09-02T16:06:46.886810"
38
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r32-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 3.286253568,
6
+ "gpu_mem": 1.650468352,
7
+ "loss": 4.523,
8
+ "grad_norm": 82.09971618652344,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 3.288219648,
15
+ "gpu_mem": 1.818607616,
16
+ "loss": 4.7678,
17
+ "grad_norm": 84.00991821289062,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 3.288416256,
24
+ "gpu_mem": 1.818638336,
25
+ "loss": 2.4154,
26
+ "grad_norm": 163.50953674316406,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 3.288416256,
33
+ "gpu_mem": 1.818604544,
34
+ "loss": 1.5756,
35
+ "grad_norm": 5.579870700836182,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 3.288416256,
42
+ "gpu_mem": 1.818592256,
43
+ "loss": 1.4128,
44
+ "grad_norm": 3.7148215770721436,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 3.288416256,
51
+ "gpu_mem": 1.818655232,
52
+ "loss": 1.5169,
53
+ "grad_norm": 8.302370071411133,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 3.288416256,
60
+ "gpu_mem": 1.818661376,
61
+ "loss": 1.5872,
62
+ "grad_norm": 7.169394493103027,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 3.288612864,
69
+ "gpu_mem": 1.818619904,
70
+ "loss": 1.3811,
71
+ "grad_norm": 2.602477550506592,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 3.288612864,
78
+ "gpu_mem": 1.818615296,
79
+ "loss": 1.3664,
80
+ "grad_norm": 4.919002056121826,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 3.288612864,
87
+ "gpu_mem": 1.818604544,
88
+ "loss": 1.4621,
89
+ "grad_norm": 3.9696922302246094,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 3.288612864,
96
+ "gpu_mem": 1.818615296,
97
+ "loss": 1.371,
98
+ "grad_norm": 2.049038887023926,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 3.288612864,
105
+ "gpu_mem": 1.818639872,
106
+ "loss": 1.5241,
107
+ "grad_norm": 4.8838067054748535,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 3.288612864,
114
+ "gpu_mem": 1.818639872,
115
+ "loss": 1.4337,
116
+ "grad_norm": 4.761289119720459,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 3.288612864,
123
+ "gpu_mem": 1.818587648,
124
+ "loss": 1.5096,
125
+ "grad_norm": 3.6689658164978027,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 3.288612864,
132
+ "gpu_mem": 1.818662912,
133
+ "loss": 1.424,
134
+ "grad_norm": 2.6308181285858154,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 3.288612864,
141
+ "gpu_mem": 1.818656768,
142
+ "loss": 1.5535,
143
+ "grad_norm": 4.361621379852295,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 3.288612864,
150
+ "gpu_mem": 1.818661376,
151
+ "loss": 1.6281,
152
+ "grad_norm": 5.062143325805664,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 3.288612864,
159
+ "gpu_mem": 1.902692864,
160
+ "loss": 2.1642,
161
+ "grad_norm": 3.5901105403900146,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 3.288612864,
168
+ "gpu_mem": 1.902691328,
169
+ "loss": 1.3664,
170
+ "grad_norm": 0.9835745096206665,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 3.288612864,
177
+ "gpu_mem": 1.902666752,
178
+ "loss": 1.3149,
179
+ "grad_norm": 1.6624815464019775,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 3.288612864,
186
+ "gpu_mem": 1.902674432,
187
+ "loss": 1.4204,
188
+ "grad_norm": 2.086113691329956,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 3.288612864,
195
+ "gpu_mem": 1.902703616,
196
+ "loss": 1.4248,
197
+ "grad_norm": 2.951725959777832,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 3.288612864,
204
+ "gpu_mem": 1.9027328,
205
+ "loss": 1.3371,
206
+ "grad_norm": 1.7192511558532715,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 3.288612864,
213
+ "gpu_mem": 1.902675968,
214
+ "loss": 1.3549,
215
+ "grad_norm": 1.3918964862823486,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 3.288612864,
222
+ "gpu_mem": 1.902745088,
223
+ "loss": 1.4172,
224
+ "grad_norm": 3.677994728088379,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 3.288612864,
231
+ "gpu_mem": 1.90270208,
232
+ "loss": 1.4108,
233
+ "grad_norm": 2.309683084487915,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 3.288612864,
240
+ "gpu_mem": 1.902660608,
241
+ "loss": 1.3956,
242
+ "grad_norm": 2.2960798740386963,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 3.288612864,
249
+ "gpu_mem": 1.902706688,
250
+ "loss": 1.5291,
251
+ "grad_norm": 4.181861400604248,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 3.288612864,
258
+ "gpu_mem": 1.90270208,
259
+ "loss": 1.3542,
260
+ "grad_norm": 1.3037711381912231,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 3.288612864,
267
+ "gpu_mem": 1.902691328,
268
+ "loss": 1.341,
269
+ "grad_norm": 1.7516839504241943,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 3.288612864,
276
+ "gpu_mem": 1.902722048,
277
+ "loss": 1.3945,
278
+ "grad_norm": 2.477121591567993,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 3.288612864,
285
+ "gpu_mem": 1.902731264,
286
+ "loss": 1.3733,
287
+ "grad_norm": 1.7665789127349854,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 3.288612864,
294
+ "gpu_mem": 1.902711296,
295
+ "loss": 1.3785,
296
+ "grad_norm": 1.5914952754974365,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 3.288612864,
303
+ "gpu_mem": 1.902689792,
304
+ "loss": 1.4217,
305
+ "grad_norm": 2.3489677906036377,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 3.288612864,
312
+ "gpu_mem": 1.902577664,
313
+ "loss": 2.1136,
314
+ "grad_norm": 2.8285887241363525,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 3.288612864,
321
+ "gpu_mem": 1.818633728,
322
+ "loss": 1.3195,
323
+ "grad_norm": 0.946521520614624,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 3.288612864,
330
+ "gpu_mem": 1.818642944,
331
+ "loss": 1.4247,
332
+ "grad_norm": 2.4699201583862305,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 3.288612864,
339
+ "gpu_mem": 1.81861376,
340
+ "loss": 1.3203,
341
+ "grad_norm": 1.1205692291259766,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 3.288612864,
348
+ "gpu_mem": 1.818632192,
349
+ "loss": 1.344,
350
+ "grad_norm": 0.953203022480011,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 3.288612864,
357
+ "gpu_mem": 1.818609152,
358
+ "loss": 1.4074,
359
+ "grad_norm": 2.248765707015991,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 3.288612864,
366
+ "gpu_mem": 1.818610688,
367
+ "loss": 1.4226,
368
+ "grad_norm": 2.287403106689453,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 3.288612864,
375
+ "gpu_mem": 1.818639872,
376
+ "loss": 1.3198,
377
+ "grad_norm": 2.023106813430786,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 3.288612864,
384
+ "gpu_mem": 1.818655232,
385
+ "loss": 1.3779,
386
+ "grad_norm": 2.326486587524414,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 3.288612864,
393
+ "gpu_mem": 1.818673664,
394
+ "loss": 1.3048,
395
+ "grad_norm": 1.1482384204864502,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 3.288612864,
402
+ "gpu_mem": 1.818627584,
403
+ "loss": 1.2985,
404
+ "grad_norm": 1.2250388860702515,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 3.288612864,
411
+ "gpu_mem": 1.81862144,
412
+ "loss": 1.239,
413
+ "grad_norm": 0.9595347046852112,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 3.288612864,
420
+ "gpu_mem": 1.818615296,
421
+ "loss": 1.2764,
422
+ "grad_norm": 1.3047759532928467,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 3.288612864,
429
+ "gpu_mem": 1.818619904,
430
+ "loss": 1.2382,
431
+ "grad_norm": 1.3883724212646484,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 3.288612864,
438
+ "gpu_mem": 1.818610688,
439
+ "loss": 1.2729,
440
+ "grad_norm": 1.0796666145324707,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 3.288612864,
447
+ "gpu_mem": 1.818592256,
448
+ "loss": 1.3162,
449
+ "grad_norm": 1.7300052642822266,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 3.288612864,
456
+ "gpu_mem": 1.818616832,
457
+ "loss": 1.2545,
458
+ "grad_norm": 1.2138413190841675,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 3.288612864,
465
+ "gpu_mem": 1.81864448,
466
+ "loss": 1.3454,
467
+ "grad_norm": 1.7738908529281616,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 3.288612864,
474
+ "gpu_mem": 1.902688256,
475
+ "loss": 1.878,
476
+ "grad_norm": 2.353388547897339,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 3.288612864,
483
+ "gpu_mem": 1.902657536,
484
+ "loss": 1.2712,
485
+ "grad_norm": 1.5192903280258179,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 3.288612864,
492
+ "gpu_mem": 1.902691328,
493
+ "loss": 1.2764,
494
+ "grad_norm": 1.8985798358917236,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 3.288612864,
501
+ "gpu_mem": 1.902765056,
502
+ "loss": 1.2176,
503
+ "grad_norm": 1.4844719171524048,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 3.288612864,
510
+ "gpu_mem": 1.902708224,
511
+ "loss": 1.1988,
512
+ "grad_norm": 1.1800713539123535,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 3.288612864,
519
+ "gpu_mem": 1.90270208,
520
+ "loss": 1.1535,
521
+ "grad_norm": 1.6904629468917847,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 3.288612864,
528
+ "gpu_mem": 1.902752768,
529
+ "loss": 1.1694,
530
+ "grad_norm": 1.3597590923309326,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 3.288612864,
537
+ "gpu_mem": 1.90267904,
538
+ "loss": 1.3025,
539
+ "grad_norm": 2.6322078704833984,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 3.288612864,
546
+ "gpu_mem": 1.902692864,
547
+ "loss": 1.2436,
548
+ "grad_norm": 2.03961181640625,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 3.288612864,
555
+ "gpu_mem": 1.9026944,
556
+ "loss": 1.2301,
557
+ "grad_norm": 2.2234535217285156,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 3.288612864,
564
+ "gpu_mem": 1.902683648,
565
+ "loss": 1.2626,
566
+ "grad_norm": 1.904895305633545,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 3.288612864,
573
+ "gpu_mem": 1.902700544,
574
+ "loss": 1.231,
575
+ "grad_norm": 2.2669050693511963,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 3.288612864,
582
+ "gpu_mem": 1.902722048,
583
+ "loss": 1.2184,
584
+ "grad_norm": 2.1905152797698975,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 3.288612864,
591
+ "gpu_mem": 1.902712832,
592
+ "loss": 1.1744,
593
+ "grad_norm": 1.8609352111816406,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 3.288612864,
600
+ "gpu_mem": 1.902738944,
601
+ "loss": 1.1777,
602
+ "grad_norm": 1.5420715808868408,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 3.288612864,
609
+ "gpu_mem": 1.902689792,
610
+ "loss": 1.227,
611
+ "grad_norm": 2.120975971221924,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 3.288612864,
618
+ "gpu_mem": 1.902689792,
619
+ "train_runtime": 391.4503,
620
+ "train_samples_per_second": 11.434,
621
+ "train_steps_per_second": 0.174,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.495284583638696
624
+ }
625
+ ]
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 3,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 8,
25
+ "r": 8,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 8,
29
+ "target_modules": [
30
+ "down_proj",
31
+ "v_proj",
32
+ "k_proj",
33
+ "up_proj",
34
+ "gate_proj",
35
+ "o_proj",
36
+ "q_proj"
37
+ ],
38
+ "task_type": null,
39
+ "use_bnb": false
40
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.4931740614334471
4
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "mars",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 5233536
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-mars-arc_c-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-arc_c-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-09-02T08:46:25.215113"
38
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_c-r8-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 2.267058176,
6
+ "gpu_mem": 1.589430784,
7
+ "loss": 4.523,
8
+ "grad_norm": 183.94940185546875,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 2.269024256,
15
+ "gpu_mem": 1.631300608,
16
+ "loss": 4.7678,
17
+ "grad_norm": 189.22462463378906,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 2.269220864,
24
+ "gpu_mem": 1.631331328,
25
+ "loss": 2.1307,
26
+ "grad_norm": 215.5003204345703,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 2.269220864,
33
+ "gpu_mem": 1.631297536,
34
+ "loss": 1.5658,
35
+ "grad_norm": 12.267834663391113,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 2.269220864,
42
+ "gpu_mem": 1.631285248,
43
+ "loss": 1.4716,
44
+ "grad_norm": 15.389113426208496,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 2.269220864,
51
+ "gpu_mem": 1.631348224,
52
+ "loss": 1.4827,
53
+ "grad_norm": 13.980608940124512,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 2.269220864,
60
+ "gpu_mem": 1.631354368,
61
+ "loss": 1.476,
62
+ "grad_norm": 11.878750801086426,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 2.269220864,
69
+ "gpu_mem": 1.631312896,
70
+ "loss": 1.4347,
71
+ "grad_norm": 7.746488094329834,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 2.269220864,
78
+ "gpu_mem": 1.631308288,
79
+ "loss": 1.3948,
80
+ "grad_norm": 10.116765975952148,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 2.269220864,
87
+ "gpu_mem": 1.631297536,
88
+ "loss": 1.502,
89
+ "grad_norm": 9.356364250183105,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 2.269220864,
96
+ "gpu_mem": 1.631308288,
97
+ "loss": 1.4267,
98
+ "grad_norm": 6.226222038269043,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 2.269220864,
105
+ "gpu_mem": 1.631332864,
106
+ "loss": 1.4326,
107
+ "grad_norm": 6.319589138031006,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 2.269220864,
114
+ "gpu_mem": 1.631332864,
115
+ "loss": 1.349,
116
+ "grad_norm": 7.3017497062683105,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 2.269220864,
123
+ "gpu_mem": 1.63128064,
124
+ "loss": 1.5659,
125
+ "grad_norm": 7.98867130279541,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 2.269220864,
132
+ "gpu_mem": 1.631355904,
133
+ "loss": 1.6127,
134
+ "grad_norm": 8.264293670654297,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 2.269220864,
141
+ "gpu_mem": 1.63134976,
142
+ "loss": 1.4507,
143
+ "grad_norm": 4.270768642425537,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 2.269220864,
150
+ "gpu_mem": 1.631354368,
151
+ "loss": 1.3901,
152
+ "grad_norm": 3.606339693069458,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 2.269220864,
159
+ "gpu_mem": 1.652251136,
160
+ "loss": 2.027,
161
+ "grad_norm": 4.613204479217529,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 2.269220864,
168
+ "gpu_mem": 1.6522496,
169
+ "loss": 1.4172,
170
+ "grad_norm": 4.635749340057373,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 2.269220864,
177
+ "gpu_mem": 1.652225024,
178
+ "loss": 1.3737,
179
+ "grad_norm": 5.134657382965088,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 2.269220864,
186
+ "gpu_mem": 1.652232704,
187
+ "loss": 1.3672,
188
+ "grad_norm": 2.7773523330688477,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 2.269220864,
195
+ "gpu_mem": 1.652261888,
196
+ "loss": 1.3778,
197
+ "grad_norm": 8.92182731628418,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 2.269220864,
204
+ "gpu_mem": 1.652291072,
205
+ "loss": 1.3223,
206
+ "grad_norm": 4.256777286529541,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 2.269220864,
213
+ "gpu_mem": 1.65223424,
214
+ "loss": 1.432,
215
+ "grad_norm": 5.225455284118652,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 2.269220864,
222
+ "gpu_mem": 1.65230336,
223
+ "loss": 1.3362,
224
+ "grad_norm": 4.080379486083984,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 2.269220864,
231
+ "gpu_mem": 1.652260352,
232
+ "loss": 1.3412,
233
+ "grad_norm": 2.463536024093628,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 2.269220864,
240
+ "gpu_mem": 1.65221888,
241
+ "loss": 1.3948,
242
+ "grad_norm": 3.877969980239868,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 2.269220864,
249
+ "gpu_mem": 1.65226496,
250
+ "loss": 1.6029,
251
+ "grad_norm": 8.654217720031738,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 2.269220864,
258
+ "gpu_mem": 1.652260352,
259
+ "loss": 1.3744,
260
+ "grad_norm": 3.113161563873291,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 2.269220864,
267
+ "gpu_mem": 1.6522496,
268
+ "loss": 1.3267,
269
+ "grad_norm": 2.45149827003479,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 2.269220864,
276
+ "gpu_mem": 1.65228032,
277
+ "loss": 1.34,
278
+ "grad_norm": 1.9663938283920288,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 2.269220864,
285
+ "gpu_mem": 1.652289536,
286
+ "loss": 1.387,
287
+ "grad_norm": 3.6047306060791016,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 2.269220864,
294
+ "gpu_mem": 1.652269568,
295
+ "loss": 1.4011,
296
+ "grad_norm": 3.453739643096924,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 2.269220864,
303
+ "gpu_mem": 1.652248064,
304
+ "loss": 1.3625,
305
+ "grad_norm": 2.226464033126831,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 2.269220864,
312
+ "gpu_mem": 1.652135936,
313
+ "loss": 2.0998,
314
+ "grad_norm": 3.2824151515960693,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 2.269220864,
321
+ "gpu_mem": 1.63132672,
322
+ "loss": 1.3592,
323
+ "grad_norm": 2.3378264904022217,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 2.269220864,
330
+ "gpu_mem": 1.631335936,
331
+ "loss": 1.3725,
332
+ "grad_norm": 2.820693254470825,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 2.269220864,
339
+ "gpu_mem": 1.631306752,
340
+ "loss": 1.3403,
341
+ "grad_norm": 1.8874971866607666,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 2.269220864,
348
+ "gpu_mem": 1.631325184,
349
+ "loss": 1.3479,
350
+ "grad_norm": 2.4137139320373535,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 2.269220864,
357
+ "gpu_mem": 1.631302144,
358
+ "loss": 1.3676,
359
+ "grad_norm": 1.963843584060669,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 2.269220864,
366
+ "gpu_mem": 1.63130368,
367
+ "loss": 1.3756,
368
+ "grad_norm": 2.127540111541748,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 2.269220864,
375
+ "gpu_mem": 1.631332864,
376
+ "loss": 1.3251,
377
+ "grad_norm": 2.7455198764801025,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 2.269220864,
384
+ "gpu_mem": 1.631348224,
385
+ "loss": 1.3266,
386
+ "grad_norm": 3.228006601333618,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 2.269220864,
393
+ "gpu_mem": 1.631366656,
394
+ "loss": 1.2648,
395
+ "grad_norm": 2.178924322128296,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 2.269220864,
402
+ "gpu_mem": 1.631320576,
403
+ "loss": 1.283,
404
+ "grad_norm": 2.0270533561706543,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 2.269220864,
411
+ "gpu_mem": 1.631314432,
412
+ "loss": 1.272,
413
+ "grad_norm": 2.6357274055480957,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 2.269220864,
420
+ "gpu_mem": 1.631308288,
421
+ "loss": 1.2825,
422
+ "grad_norm": 2.7534518241882324,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 2.269220864,
429
+ "gpu_mem": 1.631312896,
430
+ "loss": 1.2297,
431
+ "grad_norm": 2.427360773086548,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 2.269220864,
438
+ "gpu_mem": 1.63130368,
439
+ "loss": 1.2132,
440
+ "grad_norm": 2.579684019088745,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 2.269220864,
447
+ "gpu_mem": 1.631285248,
448
+ "loss": 1.297,
449
+ "grad_norm": 3.4447336196899414,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 2.269220864,
456
+ "gpu_mem": 1.631309824,
457
+ "loss": 1.2385,
458
+ "grad_norm": 3.1413519382476807,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 2.269220864,
465
+ "gpu_mem": 1.631337472,
466
+ "loss": 1.3316,
467
+ "grad_norm": 3.923443078994751,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 2.269220864,
474
+ "gpu_mem": 1.652246528,
475
+ "loss": 1.8283,
476
+ "grad_norm": 3.775007724761963,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 2.269220864,
483
+ "gpu_mem": 1.652215808,
484
+ "loss": 1.2183,
485
+ "grad_norm": 3.3977837562561035,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 2.269220864,
492
+ "gpu_mem": 1.6522496,
493
+ "loss": 1.2348,
494
+ "grad_norm": 3.5957634449005127,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 2.269220864,
501
+ "gpu_mem": 1.652323328,
502
+ "loss": 1.2134,
503
+ "grad_norm": 4.578391075134277,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 2.269220864,
510
+ "gpu_mem": 1.652266496,
511
+ "loss": 1.1648,
512
+ "grad_norm": 3.1363987922668457,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 2.269220864,
519
+ "gpu_mem": 1.652260352,
520
+ "loss": 1.0782,
521
+ "grad_norm": 3.9204323291778564,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 2.269220864,
528
+ "gpu_mem": 1.65231104,
529
+ "loss": 1.1375,
530
+ "grad_norm": 3.8483402729034424,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 2.269220864,
537
+ "gpu_mem": 1.652237312,
538
+ "loss": 1.2112,
539
+ "grad_norm": 3.2146248817443848,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 2.269220864,
546
+ "gpu_mem": 1.652251136,
547
+ "loss": 1.1779,
548
+ "grad_norm": 3.3050568103790283,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 2.269220864,
555
+ "gpu_mem": 1.652252672,
556
+ "loss": 1.2063,
557
+ "grad_norm": 3.480621576309204,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 2.269220864,
564
+ "gpu_mem": 1.65224192,
565
+ "loss": 1.2262,
566
+ "grad_norm": 3.0599448680877686,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 2.269220864,
573
+ "gpu_mem": 1.652258816,
574
+ "loss": 1.1751,
575
+ "grad_norm": 3.4559524059295654,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 2.269220864,
582
+ "gpu_mem": 1.65228032,
583
+ "loss": 1.2058,
584
+ "grad_norm": 3.909868001937866,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 2.269220864,
591
+ "gpu_mem": 1.652271104,
592
+ "loss": 1.1769,
593
+ "grad_norm": 4.007200717926025,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 2.269220864,
600
+ "gpu_mem": 1.652297216,
601
+ "loss": 1.1841,
602
+ "grad_norm": 3.987147808074951,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 2.269220864,
609
+ "gpu_mem": 1.652248064,
610
+ "loss": 1.1852,
611
+ "grad_norm": 3.8617331981658936,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 2.269220864,
618
+ "gpu_mem": 1.652248064,
619
+ "train_runtime": 387.9534,
620
+ "train_samples_per_second": 11.537,
621
+ "train_steps_per_second": 0.175,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.472226900212905
624
+ }
625
+ ]
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "alpha": 4,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 3,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 8,
+ "r": 2,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 2,
+ "target_modules": [
+ "down_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "q_proj"
+ ],
+ "task_type": null,
+ "use_bnb": false
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "arc_e",
+ "results": 0.4537037037037037
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_id": "TinyLlama/TinyLlama_v1.1",
+ "dataset": {
+ "name": "ARC_E",
+ "dataset_id": "allenai/ai2_arc",
+ "preprocess_id": "arc_train_deepeval"
+ },
+ "peft_config": {
+ "method": "mars",
+ "rank": 2,
+ "alpha": 4,
+ "dropout": 0.0,
+ "bias": "none",
+ "target_modules": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "gate_proj",
+ "down_proj",
+ "up_proj"
+ ],
+ "trainable_parameter_count": 1307064
+ },
+ "training_config": {
+ "max_dataset_length": null,
+ "batch_size": 64,
+ "per_device_batch_size": 32,
+ "gradient_accumulation_steps": 2,
+ "learning_rate": 0.0003,
+ "num_epochs": 4,
+ "warmup_ratio": 0.1
+ },
+ "model_name": "TinyLlama_v1.1-mars-arc_e-r2-a2",
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-arc_e-r2-a2",
+ "seed": 42,
+ "timestamp": "2025-09-02T00:43:50.399653"
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r2-a2/training_logs.json ADDED
@@ -0,0 +1,1273 @@
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.028169014084507043,
5
+ "cpu_mem": 2.265014272,
6
+ "gpu_mem": 1.56739328,
7
+ "loss": 4.6935,
8
+ "grad_norm": 387.9242248535156,
9
+ "learning_rate": 2.1428571428571425e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.056338028169014086,
14
+ "cpu_mem": 2.266980352,
15
+ "gpu_mem": 1.577952256,
16
+ "loss": 4.5357,
17
+ "grad_norm": 402.400146484375,
18
+ "learning_rate": 4.285714285714285e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.08450704225352113,
23
+ "cpu_mem": 2.267373568,
24
+ "gpu_mem": 1.577930752,
25
+ "loss": 2.8425,
26
+ "grad_norm": 678.4750366210938,
27
+ "learning_rate": 6.428571428571427e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.11267605633802817,
32
+ "cpu_mem": 2.267373568,
33
+ "gpu_mem": 1.577909248,
34
+ "loss": 1.9806,
35
+ "grad_norm": 68.57826232910156,
36
+ "learning_rate": 8.57142857142857e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.14084507042253522,
41
+ "cpu_mem": 2.267570176,
42
+ "gpu_mem": 1.57795072,
43
+ "loss": 1.4963,
44
+ "grad_norm": 21.31400489807129,
45
+ "learning_rate": 0.00010714285714285714
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.16901408450704225,
50
+ "cpu_mem": 2.267570176,
51
+ "gpu_mem": 1.577926144,
52
+ "loss": 1.4158,
53
+ "grad_norm": 39.33024597167969,
54
+ "learning_rate": 0.00012857142857142855
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.19718309859154928,
59
+ "cpu_mem": 2.267570176,
60
+ "gpu_mem": 1.577949184,
61
+ "loss": 1.5827,
62
+ "grad_norm": 57.0329704284668,
63
+ "learning_rate": 0.00015
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.22535211267605634,
68
+ "cpu_mem": 2.267570176,
69
+ "gpu_mem": 1.577907712,
70
+ "loss": 1.3503,
71
+ "grad_norm": 12.728636741638184,
72
+ "learning_rate": 0.0001714285714285714
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.2535211267605634,
77
+ "cpu_mem": 2.267570176,
78
+ "gpu_mem": 1.577909248,
79
+ "loss": 1.4173,
80
+ "grad_norm": 31.24067497253418,
81
+ "learning_rate": 0.00019285714285714286
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.28169014084507044,
86
+ "cpu_mem": 2.267570176,
87
+ "gpu_mem": 1.57790464,
88
+ "loss": 1.4484,
89
+ "grad_norm": 32.26777267456055,
90
+ "learning_rate": 0.00021428571428571427
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.30985915492957744,
95
+ "cpu_mem": 2.267766784,
96
+ "gpu_mem": 1.577982976,
97
+ "loss": 1.5226,
98
+ "grad_norm": 34.87485885620117,
99
+ "learning_rate": 0.00023571428571428569
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.3380281690140845,
104
+ "cpu_mem": 2.267766784,
105
+ "gpu_mem": 1.577956864,
106
+ "loss": 1.3442,
107
+ "grad_norm": 10.862824440002441,
108
+ "learning_rate": 0.0002571428571428571
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.36619718309859156,
113
+ "cpu_mem": 2.267766784,
114
+ "gpu_mem": 1.577907712,
115
+ "loss": 1.5159,
116
+ "grad_norm": 37.704254150390625,
117
+ "learning_rate": 0.00027857142857142854
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.39436619718309857,
122
+ "cpu_mem": 2.267766784,
123
+ "gpu_mem": 1.577929216,
124
+ "loss": 1.3849,
125
+ "grad_norm": 13.6458101272583,
126
+ "learning_rate": 0.0003
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.4225352112676056,
131
+ "cpu_mem": 2.267766784,
132
+ "gpu_mem": 1.577906176,
133
+ "loss": 1.5217,
134
+ "grad_norm": 27.110424041748047,
135
+ "learning_rate": 0.0002999533773001224
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.4507042253521127,
140
+ "cpu_mem": 2.267766784,
141
+ "gpu_mem": 1.577910784,
142
+ "loss": 1.3714,
143
+ "grad_norm": 8.012628555297852,
144
+ "learning_rate": 0.0002998135381828383
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.4788732394366197,
149
+ "cpu_mem": 2.267766784,
150
+ "gpu_mem": 1.577947648,
151
+ "loss": 1.532,
152
+ "grad_norm": 47.00474548339844,
153
+ "learning_rate": 0.00029958056957717696
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.5070422535211268,
158
+ "cpu_mem": 2.267766784,
159
+ "gpu_mem": 1.5779584,
160
+ "loss": 1.6678,
161
+ "grad_norm": 57.446075439453125,
162
+ "learning_rate": 0.0002992546163048102
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.5352112676056338,
167
+ "cpu_mem": 2.267766784,
168
+ "gpu_mem": 1.577901568,
169
+ "loss": 1.3468,
170
+ "grad_norm": 17.228172302246094,
171
+ "learning_rate": 0.0002988358809900258
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.5633802816901409,
176
+ "cpu_mem": 2.267766784,
177
+ "gpu_mem": 1.577972224,
178
+ "loss": 1.3959,
179
+ "grad_norm": 10.988210678100586,
180
+ "learning_rate": 0.0002983246239337692
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.5915492957746479,
185
+ "cpu_mem": 2.267766784,
186
+ "gpu_mem": 1.577970688,
187
+ "loss": 1.3904,
188
+ "grad_norm": 12.519684791564941,
189
+ "learning_rate": 0.0002977211629518312
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.6197183098591549,
194
+ "cpu_mem": 2.267766784,
195
+ "gpu_mem": 1.57792768,
196
+ "loss": 1.3938,
197
+ "grad_norm": 12.789338111877441,
198
+ "learning_rate": 0.00029702587317728153
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.647887323943662,
203
+ "cpu_mem": 2.267766784,
204
+ "gpu_mem": 1.577944576,
205
+ "loss": 1.344,
206
+ "grad_norm": 7.412827491760254,
207
+ "learning_rate": 0.0002962391868272735
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.676056338028169,
212
+ "cpu_mem": 2.267766784,
213
+ "gpu_mem": 1.577901568,
214
+ "loss": 1.3602,
215
+ "grad_norm": 6.341000080108643,
216
+ "learning_rate": 0.00029536159293436166
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.704225352112676,
221
+ "cpu_mem": 2.267766784,
222
+ "gpu_mem": 1.577930752,
223
+ "loss": 1.4191,
224
+ "grad_norm": 12.618824005126953,
225
+ "learning_rate": 0.00029439363704250176
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.7323943661971831,
230
+ "cpu_mem": 2.267766784,
231
+ "gpu_mem": 1.577910784,
232
+ "loss": 1.4534,
233
+ "grad_norm": 8.628890991210938,
234
+ "learning_rate": 0.00029333592086792107
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.7605633802816901,
239
+ "cpu_mem": 2.267766784,
240
+ "gpu_mem": 1.577936896,
241
+ "loss": 1.3662,
242
+ "grad_norm": 11.88647747039795,
243
+ "learning_rate": 0.0002921891019250697
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.7887323943661971,
248
+ "cpu_mem": 2.267766784,
249
+ "gpu_mem": 1.577936896,
250
+ "loss": 1.5111,
251
+ "grad_norm": 27.525169372558594,
252
+ "learning_rate": 0.0002909538931178862
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.8169014084507042,
257
+ "cpu_mem": 2.267766784,
258
+ "gpu_mem": 1.577915392,
259
+ "loss": 1.3126,
260
+ "grad_norm": 12.335830688476562,
261
+ "learning_rate": 0.00028963106229663063
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.8450704225352113,
266
+ "cpu_mem": 2.267766784,
267
+ "gpu_mem": 1.577906176,
268
+ "loss": 1.3298,
269
+ "grad_norm": 6.091091632843018,
270
+ "learning_rate": 0.00028822143178056114
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.8732394366197183,
275
+ "cpu_mem": 2.267766784,
276
+ "gpu_mem": 1.577924608,
277
+ "loss": 1.3667,
278
+ "grad_norm": 5.047645568847656,
279
+ "learning_rate": 0.00028672587784675096
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.9014084507042254,
284
+ "cpu_mem": 2.267766784,
285
+ "gpu_mem": 1.577947648,
286
+ "loss": 1.3291,
287
+ "grad_norm": 8.144316673278809,
288
+ "learning_rate": 0.0002851453301853628
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.9295774647887324,
293
+ "cpu_mem": 2.267766784,
294
+ "gpu_mem": 1.577944576,
295
+ "loss": 1.3767,
296
+ "grad_norm": 7.056203365325928,
297
+ "learning_rate": 0.00028348077132172027
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.9577464788732394,
302
+ "cpu_mem": 2.267766784,
303
+ "gpu_mem": 1.577947648,
304
+ "loss": 1.3605,
305
+ "grad_norm": 6.500875473022461,
306
+ "learning_rate": 0.0002817332360055343
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.9859154929577465,
311
+ "cpu_mem": 2.267766784,
312
+ "gpu_mem": 1.577929216,
313
+ "loss": 1.3749,
314
+ "grad_norm": 20.8975830078125,
315
+ "learning_rate": 0.0002799038105676658
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 1.0140845070422535,
320
+ "cpu_mem": 2.267766784,
321
+ "gpu_mem": 1.583169024,
322
+ "loss": 2.1167,
323
+ "grad_norm": 38.09403991699219,
324
+ "learning_rate": 0.0002779936322448233
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 1.0422535211267605,
329
+ "cpu_mem": 2.267766784,
330
+ "gpu_mem": 1.583173632,
331
+ "loss": 1.4101,
332
+ "grad_norm": 17.080562591552734,
333
+ "learning_rate": 0.0002760038884726157
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 1.0704225352112675,
338
+ "cpu_mem": 2.267766784,
339
+ "gpu_mem": 1.583152128,
340
+ "loss": 1.237,
341
+ "grad_norm": 6.0900115966796875,
342
+ "learning_rate": 0.00027393581614739923
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 1.0985915492957747,
347
+ "cpu_mem": 2.267766784,
348
+ "gpu_mem": 1.583141376,
349
+ "loss": 1.3867,
350
+ "grad_norm": 11.881949424743652,
351
+ "learning_rate": 0.0002717907008573785
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 1.1267605633802817,
356
+ "cpu_mem": 2.267766784,
357
+ "gpu_mem": 1.583204352,
358
+ "loss": 1.3781,
359
+ "grad_norm": 9.173269271850586,
360
+ "learning_rate": 0.0002695698760834384
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 1.1549295774647887,
365
+ "cpu_mem": 2.267766784,
366
+ "gpu_mem": 1.583164416,
367
+ "loss": 1.3495,
368
+ "grad_norm": 12.694510459899902,
369
+ "learning_rate": 0.00026727472237020447
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 1.1830985915492958,
374
+ "cpu_mem": 2.267766784,
375
+ "gpu_mem": 1.583207424,
376
+ "loss": 1.3983,
377
+ "grad_norm": 8.700448036193848,
378
+ "learning_rate": 0.00026490666646784665
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 1.2112676056338028,
383
+ "cpu_mem": 2.267766784,
384
+ "gpu_mem": 1.583156736,
385
+ "loss": 1.3842,
386
+ "grad_norm": 5.271851539611816,
387
+ "learning_rate": 0.0002624671804451601
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 1.2394366197183098,
392
+ "cpu_mem": 2.267766784,
393
+ "gpu_mem": 1.583221248,
394
+ "loss": 1.3936,
395
+ "grad_norm": 8.918002128601074,
396
+ "learning_rate": 0.0002599577807744739
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 1.267605633802817,
401
+ "cpu_mem": 2.267766784,
402
+ "gpu_mem": 1.583188992,
403
+ "loss": 1.3787,
404
+ "grad_norm": 5.303096294403076,
405
+ "learning_rate": 0.0002573800273889577
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 1.295774647887324,
410
+ "cpu_mem": 2.267766784,
411
+ "gpu_mem": 1.5831936,
412
+ "loss": 1.3903,
413
+ "grad_norm": 7.693558216094971,
414
+ "learning_rate": 0.0002547355227129109
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 1.323943661971831,
419
+ "cpu_mem": 2.267766784,
420
+ "gpu_mem": 1.58313984,
421
+ "loss": 1.3498,
422
+ "grad_norm": 9.270402908325195,
423
+ "learning_rate": 0.00025202591066563786
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 1.352112676056338,
428
+ "cpu_mem": 2.267766784,
429
+ "gpu_mem": 1.583153664,
430
+ "loss": 1.3259,
431
+ "grad_norm": 5.86254358291626,
432
+ "learning_rate": 0.0002492528756395289
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 1.380281690140845,
437
+ "cpu_mem": 2.267766784,
438
+ "gpu_mem": 1.583142912,
439
+ "loss": 1.3812,
440
+ "grad_norm": 15.352643966674805,
441
+ "learning_rate": 0.0002464181414529809
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 1.408450704225352,
446
+ "cpu_mem": 2.267766784,
447
+ "gpu_mem": 1.583156736,
448
+ "loss": 1.4118,
449
+ "grad_norm": 17.63778305053711,
450
+ "learning_rate": 0.00024352347027881003
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 1.436619718309859,
455
+ "cpu_mem": 2.267766784,
456
+ "gpu_mem": 1.58320896,
457
+ "loss": 1.3764,
458
+ "grad_norm": 11.841841697692871,
459
+ "learning_rate": 0.0002405706615488216
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 1.4647887323943662,
464
+ "cpu_mem": 2.267766784,
465
+ "gpu_mem": 1.583156736,
466
+ "loss": 1.4696,
467
+ "grad_norm": 18.075777053833008,
468
+ "learning_rate": 0.00023756155083521846
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 1.4929577464788732,
473
+ "cpu_mem": 2.267766784,
474
+ "gpu_mem": 1.583225856,
475
+ "loss": 1.3317,
476
+ "grad_norm": 6.4796929359436035,
477
+ "learning_rate": 0.00023449800870954326
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 1.5211267605633803,
482
+ "cpu_mem": 2.267766784,
483
+ "gpu_mem": 1.5831936,
484
+ "loss": 1.3314,
485
+ "grad_norm": 17.895187377929688,
486
+ "learning_rate": 0.0002313819395798639
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 1.5492957746478875,
491
+ "cpu_mem": 2.267766784,
492
+ "gpu_mem": 1.583202816,
493
+ "loss": 1.4675,
494
+ "grad_norm": 33.634151458740234,
495
+ "learning_rate": 0.0002282152805069247
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 1.5774647887323945,
500
+ "cpu_mem": 2.267766784,
501
+ "gpu_mem": 1.58317824,
502
+ "loss": 1.3211,
503
+ "grad_norm": 7.974305152893066,
504
+ "learning_rate": 0.000225
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 1.6056338028169015,
509
+ "cpu_mem": 2.267766784,
510
+ "gpu_mem": 1.583212032,
511
+ "loss": 1.4806,
512
+ "grad_norm": 30.471376419067383,
513
+ "learning_rate": 0.00022173809679319772
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 1.6338028169014085,
518
+ "cpu_mem": 2.267766784,
519
+ "gpu_mem": 1.5831936,
520
+ "loss": 1.4122,
521
+ "grad_norm": 21.166011810302734,
522
+ "learning_rate": 0.00021843159860297442
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 1.6619718309859155,
527
+ "cpu_mem": 2.267766784,
528
+ "gpu_mem": 1.583179776,
529
+ "loss": 1.3415,
530
+ "grad_norm": 6.54095458984375,
531
+ "learning_rate": 0.00021508256086763368
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 1.6901408450704225,
536
+ "cpu_mem": 2.267766784,
537
+ "gpu_mem": 1.583218176,
538
+ "loss": 1.3858,
539
+ "grad_norm": 14.59696102142334,
540
+ "learning_rate": 0.00021169306546959174
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 1.7183098591549295,
545
+ "cpu_mem": 2.267766784,
546
+ "gpu_mem": 1.583150592,
547
+ "loss": 1.2966,
548
+ "grad_norm": 10.116411209106445,
549
+ "learning_rate": 0.0002082652194412042
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 1.7464788732394365,
554
+ "cpu_mem": 2.267766784,
555
+ "gpu_mem": 1.583198208,
556
+ "loss": 1.4485,
557
+ "grad_norm": 14.308155059814453,
558
+ "learning_rate": 0.00020480115365495926
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 1.7746478873239435,
563
+ "cpu_mem": 2.267766784,
564
+ "gpu_mem": 1.58314752,
565
+ "loss": 1.3406,
566
+ "grad_norm": 6.881035804748535,
567
+ "learning_rate": 0.00020130302149885031
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 1.8028169014084507,
572
+ "cpu_mem": 2.267766784,
573
+ "gpu_mem": 1.583196672,
574
+ "loss": 1.3451,
575
+ "grad_norm": 11.539894104003906,
576
+ "learning_rate": 0.00019777299753775265
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 1.8309859154929577,
581
+ "cpu_mem": 2.267766784,
582
+ "gpu_mem": 1.583195136,
583
+ "loss": 1.4033,
584
+ "grad_norm": 13.667032241821289,
585
+ "learning_rate": 0.00019421327616163563
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 1.8591549295774648,
590
+ "cpu_mem": 2.267766784,
591
+ "gpu_mem": 1.583213568,
592
+ "loss": 1.404,
593
+ "grad_norm": 14.032964706420898,
594
+ "learning_rate": 0.00019062607022145078
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 1.887323943661972,
599
+ "cpu_mem": 2.267766784,
600
+ "gpu_mem": 1.5831552,
601
+ "loss": 1.3399,
602
+ "grad_norm": 8.294322967529297,
603
+ "learning_rate": 0.00018701360965354402
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 1.915492957746479,
608
+ "cpu_mem": 2.267766784,
609
+ "gpu_mem": 1.583167488,
610
+ "loss": 1.3537,
611
+ "grad_norm": 4.33001708984375,
612
+ "learning_rate": 0.00018337814009344714
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 1.943661971830986,
617
+ "cpu_mem": 2.267766784,
618
+ "gpu_mem": 1.583192064,
619
+ "loss": 1.3197,
620
+ "grad_norm": 9.072599411010742,
621
+ "learning_rate": 0.0001797219214799096
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 1.971830985915493,
626
+ "cpu_mem": 2.267766784,
627
+ "gpu_mem": 1.583169024,
628
+ "loss": 1.2934,
629
+ "grad_norm": 5.792518138885498,
630
+ "learning_rate": 0.00017604722665003956
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 2.0,
635
+ "cpu_mem": 2.267766784,
636
+ "gpu_mem": 1.583004672,
637
+ "loss": 1.95,
638
+ "grad_norm": 12.88838005065918,
639
+ "learning_rate": 0.00017235633992642615
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 2.028169014084507,
644
+ "cpu_mem": 2.267766784,
645
+ "gpu_mem": 1.577939968,
646
+ "loss": 1.4181,
647
+ "grad_norm": 11.97128963470459,
648
+ "learning_rate": 0.00016865155569712278
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 2.056338028169014,
653
+ "cpu_mem": 2.267766784,
654
+ "gpu_mem": 1.577903104,
655
+ "loss": 1.2886,
656
+ "grad_norm": 5.973176002502441,
657
+ "learning_rate": 0.0001649351769893725
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 2.084507042253521,
662
+ "cpu_mem": 2.267766784,
663
+ "gpu_mem": 1.577963008,
664
+ "loss": 1.3368,
665
+ "grad_norm": 13.524489402770996,
666
+ "learning_rate": 0.00016120951403796364
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 2.112676056338028,
671
+ "cpu_mem": 2.267766784,
672
+ "gpu_mem": 1.577930752,
673
+ "loss": 1.2792,
674
+ "grad_norm": 5.348632335662842,
675
+ "learning_rate": 0.00015747688284910457
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 2.140845070422535,
680
+ "cpu_mem": 2.267766784,
681
+ "gpu_mem": 1.577941504,
682
+ "loss": 1.2896,
683
+ "grad_norm": 4.869000434875488,
684
+ "learning_rate": 0.00015373960376071093
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 2.169014084507042,
689
+ "cpu_mem": 2.267766784,
690
+ "gpu_mem": 1.577978368,
691
+ "loss": 1.3782,
692
+ "grad_norm": 10.77273178100586,
693
+ "learning_rate": 0.00015
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 2.1971830985915495,
698
+ "cpu_mem": 2.267766784,
699
+ "gpu_mem": 1.577963008,
700
+ "loss": 1.4087,
701
+ "grad_norm": 14.859511375427246,
702
+ "learning_rate": 0.00014626039623928907
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 2.2253521126760565,
707
+ "cpu_mem": 2.267766784,
708
+ "gpu_mem": 1.577913856,
709
+ "loss": 1.258,
710
+ "grad_norm": 9.860651969909668,
711
+ "learning_rate": 0.0001425231171508954
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 2.2535211267605635,
716
+ "cpu_mem": 2.267766784,
717
+ "gpu_mem": 1.5779584,
718
+ "loss": 1.3132,
719
+ "grad_norm": 7.07096004486084,
720
+ "learning_rate": 0.00013879048596203636
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 2.2816901408450705,
725
+ "cpu_mem": 2.267766784,
726
+ "gpu_mem": 1.577944576,
727
+ "loss": 1.5142,
728
+ "grad_norm": 35.71551513671875,
729
+ "learning_rate": 0.0001350648230106275
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 2.3098591549295775,
734
+ "cpu_mem": 2.267766784,
735
+ "gpu_mem": 1.57791232,
736
+ "loss": 1.5237,
737
+ "grad_norm": 34.79934310913086,
738
+ "learning_rate": 0.00013134844430287725
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 2.3380281690140845,
743
+ "cpu_mem": 2.267766784,
744
+ "gpu_mem": 1.577963008,
745
+ "loss": 1.3283,
746
+ "grad_norm": 15.516402244567871,
747
+ "learning_rate": 0.0001276436600735738
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 2.3661971830985915,
752
+ "cpu_mem": 2.267766784,
753
+ "gpu_mem": 1.577901568,
754
+ "loss": 1.3513,
755
+ "grad_norm": 9.372213363647461,
756
+ "learning_rate": 0.00012395277334996044
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 2.3943661971830985,
761
+ "cpu_mem": 2.267766784,
762
+ "gpu_mem": 1.577947648,
763
+ "loss": 1.3558,
764
+ "grad_norm": 11.078695297241211,
765
+ "learning_rate": 0.00012027807852009038
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 2.4225352112676055,
770
+ "cpu_mem": 2.267766784,
771
+ "gpu_mem": 1.577901568,
772
+ "loss": 1.3629,
773
+ "grad_norm": 13.846244812011719,
774
+ "learning_rate": 0.00011662185990655284
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 2.4507042253521125,
779
+ "cpu_mem": 2.267766784,
780
+ "gpu_mem": 1.577932288,
781
+ "loss": 1.3761,
782
+ "grad_norm": 8.658803939819336,
783
+ "learning_rate": 0.00011298639034645593
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 2.4788732394366195,
788
+ "cpu_mem": 2.267766784,
789
+ "gpu_mem": 1.577907712,
790
+ "loss": 1.2837,
791
+ "grad_norm": 10.853123664855957,
792
+ "learning_rate": 0.00010937392977854923
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 2.507042253521127,
797
+ "cpu_mem": 2.267766784,
798
+ "gpu_mem": 1.577961472,
799
+ "loss": 1.2869,
800
+ "grad_norm": 6.296206951141357,
801
+ "learning_rate": 0.00010578672383836435
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 2.535211267605634,
806
+ "cpu_mem": 2.267766784,
807
+ "gpu_mem": 1.57794304,
808
+ "loss": 1.3812,
809
+ "grad_norm": 11.038230895996094,
810
+ "learning_rate": 0.00010222700246224735
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 2.563380281690141,
815
+ "cpu_mem": 2.267766784,
816
+ "gpu_mem": 1.577892352,
817
+ "loss": 1.3665,
818
+ "grad_norm": 11.633645057678223,
819
+ "learning_rate": 9.869697850114969e-05
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 2.591549295774648,
824
+ "cpu_mem": 2.267766784,
825
+ "gpu_mem": 1.577916928,
826
+ "loss": 1.3593,
827
+ "grad_norm": 8.726666450500488,
828
+ "learning_rate": 9.519884634504074e-05
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 2.619718309859155,
833
+ "cpu_mem": 2.267766784,
834
+ "gpu_mem": 1.57792,
835
+ "loss": 1.2704,
836
+ "grad_norm": 8.788180351257324,
837
+ "learning_rate": 9.17347805587958e-05
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 2.647887323943662,
842
+ "cpu_mem": 2.267766784,
843
+ "gpu_mem": 1.57791232,
844
+ "loss": 1.3275,
845
+ "grad_norm": 6.910650253295898,
846
+ "learning_rate": 8.830693453040829e-05
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 2.676056338028169,
851
+ "cpu_mem": 2.267766784,
852
+ "gpu_mem": 1.57795072,
853
+ "loss": 1.3308,
854
+ "grad_norm": 6.377485752105713,
855
+ "learning_rate": 8.491743913236628e-05
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 2.704225352112676,
860
+ "cpu_mem": 2.267766784,
861
+ "gpu_mem": 1.577959936,
862
+ "loss": 1.2641,
863
+ "grad_norm": 8.137248992919922,
864
+ "learning_rate": 8.156840139702554e-05
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 2.732394366197183,
869
+ "cpu_mem": 2.267766784,
870
+ "gpu_mem": 1.577903104,
871
+ "loss": 1.3382,
872
+ "grad_norm": 7.780050277709961,
873
+ "learning_rate": 7.82619032068023e-05
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 2.76056338028169,
878
+ "cpu_mem": 2.267766784,
879
+ "gpu_mem": 1.577903104,
880
+ "loss": 1.2993,
881
+ "grad_norm": 6.235042572021484,
882
+ "learning_rate": 7.500000000000002e-05
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 2.788732394366197,
887
+ "cpu_mem": 2.267766784,
888
+ "gpu_mem": 1.577900032,
889
+ "loss": 1.2706,
890
+ "grad_norm": 5.6923041343688965,
891
+ "learning_rate": 7.17847194930753e-05
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 2.816901408450704,
896
+ "cpu_mem": 2.267766784,
897
+ "gpu_mem": 1.577898496,
898
+ "loss": 1.2759,
899
+ "grad_norm": 7.749869346618652,
900
+ "learning_rate": 6.86180604201361e-05
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 2.845070422535211,
905
+ "cpu_mem": 2.267766784,
906
+ "gpu_mem": 1.577941504,
907
+ "loss": 1.2566,
908
+ "grad_norm": 9.361809730529785,
909
+ "learning_rate": 6.550199129045668e-05
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 2.873239436619718,
914
+ "cpu_mem": 2.267766784,
915
+ "gpu_mem": 1.577880064,
916
+ "loss": 1.3431,
917
+ "grad_norm": 13.51416301727295,
918
+ "learning_rate": 6.243844916478155e-05
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 2.9014084507042255,
923
+ "cpu_mem": 2.267766784,
924
+ "gpu_mem": 1.577929216,
925
+ "loss": 1.3039,
926
+ "grad_norm": 6.510090351104736,
927
+ "learning_rate": 5.9429338451178355e-05
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 2.9295774647887325,
932
+ "cpu_mem": 2.267766784,
933
+ "gpu_mem": 1.577992192,
934
+ "loss": 1.3544,
935
+ "grad_norm": 10.343463897705078,
936
+ "learning_rate": 5.6476529721189974e-05
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 2.9577464788732395,
941
+ "cpu_mem": 2.267766784,
942
+ "gpu_mem": 1.577944576,
943
+ "loss": 1.2719,
944
+ "grad_norm": 5.498920917510986,
945
+ "learning_rate": 5.358185854701909e-05
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 2.9859154929577465,
950
+ "cpu_mem": 2.267766784,
951
+ "gpu_mem": 1.577926144,
952
+ "loss": 1.3132,
953
+ "grad_norm": 5.256633281707764,
954
+ "learning_rate": 5.074712436047112e-05
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 3.0140845070422535,
959
+ "cpu_mem": 2.267766784,
960
+ "gpu_mem": 1.583192064,
961
+ "loss": 1.8605,
962
+ "grad_norm": 11.751715660095215,
963
+ "learning_rate": 4.7974089334362057e-05
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 3.0422535211267605,
968
+ "cpu_mem": 2.267766784,
969
+ "gpu_mem": 1.583173632,
970
+ "loss": 1.2496,
971
+ "grad_norm": 5.319509029388428,
972
+ "learning_rate": 4.526447728708908e-05
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 3.0704225352112675,
977
+ "cpu_mem": 2.267766784,
978
+ "gpu_mem": 1.58316288,
979
+ "loss": 1.2938,
980
+ "grad_norm": 7.404820442199707,
981
+ "learning_rate": 4.261997261104223e-05
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 3.0985915492957745,
986
+ "cpu_mem": 2.267766784,
987
+ "gpu_mem": 1.58321664,
988
+ "loss": 1.2956,
989
+ "grad_norm": 9.645465850830078,
990
+ "learning_rate": 4.004221922552608e-05
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 3.1267605633802815,
995
+ "cpu_mem": 2.267766784,
996
+ "gpu_mem": 1.583176704,
997
+ "loss": 1.2614,
998
+ "grad_norm": 5.74735164642334,
999
+ "learning_rate": 3.753281955483985e-05
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 3.1549295774647885,
1004
+ "cpu_mem": 2.267766784,
1005
+ "gpu_mem": 1.583195136,
1006
+ "loss": 1.2525,
1007
+ "grad_norm": 4.955123424530029,
1008
+ "learning_rate": 3.509333353215331e-05
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 3.183098591549296,
1013
+ "cpu_mem": 2.267766784,
1014
+ "gpu_mem": 1.583258112,
1015
+ "loss": 1.2641,
1016
+ "grad_norm": 6.185914039611816,
1017
+ "learning_rate": 3.2725277629795526e-05
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 3.211267605633803,
1022
+ "cpu_mem": 2.267766784,
1023
+ "gpu_mem": 1.58318592,
1024
+ "loss": 1.264,
1025
+ "grad_norm": 5.543385982513428,
1026
+ "learning_rate": 3.0430123916561672e-05
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 3.23943661971831,
1031
+ "cpu_mem": 2.267766784,
1032
+ "gpu_mem": 1.583179776,
1033
+ "loss": 1.3229,
1034
+ "grad_norm": 8.056672096252441,
1035
+ "learning_rate": 2.8209299142621522e-05
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 3.267605633802817,
1040
+ "cpu_mem": 2.267766784,
1041
+ "gpu_mem": 1.583195136,
1042
+ "loss": 1.2307,
1043
+ "grad_norm": 6.282785415649414,
1044
+ "learning_rate": 2.6064183852600797e-05
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 3.295774647887324,
1049
+ "cpu_mem": 2.267766784,
1050
+ "gpu_mem": 1.583210496,
1051
+ "loss": 1.1953,
1052
+ "grad_norm": 5.729328632354736,
1053
+ "learning_rate": 2.3996111527384288e-05
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 3.323943661971831,
1058
+ "cpu_mem": 2.267766784,
1059
+ "gpu_mem": 1.58320128,
1060
+ "loss": 1.2218,
1061
+ "grad_norm": 6.79281759262085,
1062
+ "learning_rate": 2.2006367755176655e-05
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 3.352112676056338,
1067
+ "cpu_mem": 2.267766784,
1068
+ "gpu_mem": 1.583192064,
1069
+ "loss": 1.2716,
1070
+ "grad_norm": 7.204923152923584,
1071
+ "learning_rate": 2.009618943233419e-05
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 3.380281690140845,
1076
+ "cpu_mem": 2.267766784,
1077
+ "gpu_mem": 1.583210496,
1078
+ "loss": 1.2197,
1079
+ "grad_norm": 8.614418029785156,
1080
+ "learning_rate": 1.82667639944657e-05
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 3.408450704225352,
1085
+ "cpu_mem": 2.267766784,
1086
+ "gpu_mem": 1.58320896,
1087
+ "loss": 1.2098,
1088
+ "grad_norm": 9.425457954406738,
1089
+ "learning_rate": 1.6519228678279718e-05
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 3.436619718309859,
1094
+ "cpu_mem": 2.267766784,
1095
+ "gpu_mem": 1.583165952,
1096
+ "loss": 1.1986,
1097
+ "grad_norm": 8.279610633850098,
1098
+ "learning_rate": 1.4854669814637143e-05
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 3.464788732394366,
1103
+ "cpu_mem": 2.267766784,
1104
+ "gpu_mem": 1.583198208,
1105
+ "loss": 1.1924,
1106
+ "grad_norm": 9.477372169494629,
1107
+ "learning_rate": 1.3274122153249028e-05
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 3.492957746478873,
1112
+ "cpu_mem": 2.267766784,
1113
+ "gpu_mem": 1.583152128,
1114
+ "loss": 1.2124,
1115
+ "grad_norm": 7.941216945648193,
1116
+ "learning_rate": 1.1778568219438839e-05
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 3.52112676056338,
1121
+ "cpu_mem": 2.267766784,
1122
+ "gpu_mem": 1.583196672,
1123
+ "loss": 1.1826,
1124
+ "grad_norm": 10.422591209411621,
1125
+ "learning_rate": 1.036893770336938e-05
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 3.5492957746478875,
1130
+ "cpu_mem": 2.267766784,
1131
+ "gpu_mem": 1.58314752,
1132
+ "loss": 1.1745,
1133
+ "grad_norm": 8.78809928894043,
1134
+ "learning_rate": 9.046106882113751e-06
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 3.5774647887323945,
1139
+ "cpu_mem": 2.267766784,
1140
+ "gpu_mem": 1.583159808,
1141
+ "loss": 1.2327,
1142
+ "grad_norm": 9.099221229553223,
1143
+ "learning_rate": 7.810898074930243e-06
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 3.6056338028169015,
1148
+ "cpu_mem": 2.267766784,
1149
+ "gpu_mem": 1.583184384,
1150
+ "loss": 1.2086,
1151
+ "grad_norm": 9.764792442321777,
1152
+ "learning_rate": 6.664079132078881e-06
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 3.6338028169014085,
1157
+ "cpu_mem": 2.267766784,
1158
+ "gpu_mem": 1.583145984,
1159
+ "loss": 1.1995,
1160
+ "grad_norm": 9.325013160705566,
1161
+ "learning_rate": 5.606362957498195e-06
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 3.6619718309859155,
1166
+ "cpu_mem": 2.267766784,
1167
+ "gpu_mem": 1.583149056,
1168
+ "loss": 1.2573,
1169
+ "grad_norm": 10.700399398803711,
1170
+ "learning_rate": 4.638407065638322e-06
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 3.6901408450704225,
1175
+ "cpu_mem": 2.267766784,
1176
+ "gpu_mem": 1.583161344,
1177
+ "loss": 1.1239,
1178
+ "grad_norm": 10.698701858520508,
1179
+ "learning_rate": 3.760813172726457e-06
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 3.7183098591549295,
1184
+ "cpu_mem": 2.267766784,
1185
+ "gpu_mem": 1.583126016,
1186
+ "loss": 1.1922,
1187
+ "grad_norm": 8.860681533813477,
1188
+ "learning_rate": 2.9741268227184255e-06
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 3.7464788732394365,
1193
+ "cpu_mem": 2.267766784,
1194
+ "gpu_mem": 1.583167488,
1195
+ "loss": 1.112,
1196
+ "grad_norm": 10.341314315795898,
1197
+ "learning_rate": 2.2788370481687965e-06
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 3.7746478873239435,
1202
+ "cpu_mem": 2.267766784,
1203
+ "gpu_mem": 1.583182848,
1204
+ "loss": 1.1351,
1205
+ "grad_norm": 9.759340286254883,
1206
+ "learning_rate": 1.6753760662307215e-06
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 3.802816901408451,
1211
+ "cpu_mem": 2.267766784,
1212
+ "gpu_mem": 1.58314752,
1213
+ "loss": 1.2383,
1214
+ "grad_norm": 13.044609069824219,
1215
+ "learning_rate": 1.1641190099741904e-06
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 3.830985915492958,
1220
+ "cpu_mem": 2.267766784,
1221
+ "gpu_mem": 1.5831552,
1222
+ "loss": 1.2511,
1223
+ "grad_norm": 12.605598449707031,
1224
+ "learning_rate": 7.453836951897885e-07
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 3.859154929577465,
1229
+ "cpu_mem": 2.267766784,
1230
+ "gpu_mem": 1.583176704,
1231
+ "loss": 1.1776,
1232
+ "grad_norm": 12.512730598449707,
1233
+ "learning_rate": 4.194304228229806e-07
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 3.887323943661972,
1238
+ "cpu_mem": 2.267766784,
1239
+ "gpu_mem": 1.583187456,
1240
+ "loss": 1.2326,
1241
+ "grad_norm": 10.543280601501465,
1242
+ "learning_rate": 1.8646181716164831e-07
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 3.915492957746479,
1247
+ "cpu_mem": 2.267766784,
1248
+ "gpu_mem": 1.583179776,
1249
+ "loss": 1.2616,
1250
+ "grad_norm": 9.948535919189453,
1251
+ "learning_rate": 4.662269987756317e-08
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 3.943661971830986,
1256
+ "cpu_mem": 2.267766784,
1257
+ "gpu_mem": 1.583213568,
1258
+ "loss": 1.2206,
1259
+ "grad_norm": 11.966527938842773,
1260
+ "learning_rate": 0.0
1261
+ },
1262
+ {
1263
+ "step": 140,
1264
+ "epoch": 3.943661971830986,
1265
+ "cpu_mem": 2.267766784,
1266
+ "gpu_mem": 1.583213568,
1267
+ "train_runtime": 698.7578,
1268
+ "train_samples_per_second": 12.886,
1269
+ "train_steps_per_second": 0.2,
1270
+ "total_flos": 0.0,
1271
+ "train_loss": 1.4118309472288404
1272
+ }
1273
+ ]
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "alpha": 64,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 3,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 8,
+ "r": 32,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 32,
+ "target_modules": [
+ "down_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "q_proj"
+ ],
+ "task_type": null,
+ "use_bnb": false
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "arc_e",
+ "results": 0.6447811447811448
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_id": "TinyLlama/TinyLlama_v1.1",
+ "dataset": {
+ "name": "ARC_E",
+ "dataset_id": "allenai/ai2_arc",
+ "preprocess_id": "arc_train_deepeval"
+ },
+ "peft_config": {
+ "method": "mars",
+ "rank": 32,
+ "alpha": 64,
+ "dropout": 0.0,
+ "bias": "none",
+ "target_modules": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "gate_proj",
+ "down_proj",
+ "up_proj"
+ ],
+ "trainable_parameter_count": 21018624
+ },
+ "training_config": {
+ "max_dataset_length": null,
+ "batch_size": 64,
+ "per_device_batch_size": 32,
+ "gradient_accumulation_steps": 2,
+ "learning_rate": 0.0003,
+ "num_epochs": 4,
+ "warmup_ratio": 0.1
+ },
+ "model_name": "TinyLlama_v1.1-mars-arc_e-r32-a2",
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-arc_e-r32-a2",
+ "seed": 42,
+ "timestamp": "2025-09-02T15:24:32.855044"
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r32-a2/training_logs.json ADDED
@@ -0,0 +1,1273 @@
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.028169014084507043,
5
+ "cpu_mem": 3.35321088,
6
+ "gpu_mem": 1.652510208,
7
+ "loss": 4.6935,
8
+ "grad_norm": 81.9864730834961,
9
+ "learning_rate": 2.1428571428571425e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.056338028169014086,
14
+ "cpu_mem": 3.355373568,
15
+ "gpu_mem": 1.820720128,
16
+ "loss": 4.5357,
17
+ "grad_norm": 84.7640609741211,
18
+ "learning_rate": 4.285714285714285e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.08450704225352113,
23
+ "cpu_mem": 3.355570176,
24
+ "gpu_mem": 1.820698624,
25
+ "loss": 3.1725,
26
+ "grad_norm": 151.07528686523438,
27
+ "learning_rate": 6.428571428571427e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.11267605633802817,
32
+ "cpu_mem": 3.355766784,
33
+ "gpu_mem": 1.82067712,
34
+ "loss": 2.212,
35
+ "grad_norm": 19.06497573852539,
36
+ "learning_rate": 8.57142857142857e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.14084507042253522,
41
+ "cpu_mem": 3.355766784,
42
+ "gpu_mem": 1.820718592,
43
+ "loss": 1.5609,
44
+ "grad_norm": 5.8123087882995605,
45
+ "learning_rate": 0.00010714285714285714
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.16901408450704225,
50
+ "cpu_mem": 3.355963392,
51
+ "gpu_mem": 1.820694016,
52
+ "loss": 1.4784,
53
+ "grad_norm": 9.214070320129395,
54
+ "learning_rate": 0.00012857142857142855
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.19718309859154928,
59
+ "cpu_mem": 3.355963392,
60
+ "gpu_mem": 1.820717056,
61
+ "loss": 1.4832,
62
+ "grad_norm": 8.009530067443848,
63
+ "learning_rate": 0.00015
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.22535211267605634,
68
+ "cpu_mem": 3.355963392,
69
+ "gpu_mem": 1.820675584,
70
+ "loss": 1.3657,
71
+ "grad_norm": 4.512720584869385,
72
+ "learning_rate": 0.0001714285714285714
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.2535211267605634,
77
+ "cpu_mem": 3.355963392,
78
+ "gpu_mem": 1.82067712,
79
+ "loss": 1.4025,
80
+ "grad_norm": 5.974582672119141,
81
+ "learning_rate": 0.00019285714285714286
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.28169014084507044,
86
+ "cpu_mem": 3.355963392,
87
+ "gpu_mem": 1.820672512,
88
+ "loss": 1.5038,
89
+ "grad_norm": 9.22023868560791,
90
+ "learning_rate": 0.00021428571428571427
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.30985915492957744,
95
+ "cpu_mem": 3.35616,
96
+ "gpu_mem": 1.820750848,
97
+ "loss": 1.3828,
98
+ "grad_norm": 4.30027437210083,
99
+ "learning_rate": 0.00023571428571428569
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.3380281690140845,
104
+ "cpu_mem": 3.35616,
105
+ "gpu_mem": 1.820724736,
106
+ "loss": 1.3444,
107
+ "grad_norm": 3.374626874923706,
108
+ "learning_rate": 0.0002571428571428571
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.36619718309859156,
113
+ "cpu_mem": 3.35616,
114
+ "gpu_mem": 1.820675584,
115
+ "loss": 1.4374,
116
+ "grad_norm": 4.711981773376465,
117
+ "learning_rate": 0.00027857142857142854
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.39436619718309857,
122
+ "cpu_mem": 3.35616,
123
+ "gpu_mem": 1.820697088,
124
+ "loss": 1.391,
125
+ "grad_norm": 3.111585855484009,
126
+ "learning_rate": 0.0003
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.4225352112676056,
131
+ "cpu_mem": 3.35616,
132
+ "gpu_mem": 1.820674048,
133
+ "loss": 1.3646,
134
+ "grad_norm": 2.3693125247955322,
135
+ "learning_rate": 0.0002999533773001224
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.4507042253521127,
140
+ "cpu_mem": 3.35616,
141
+ "gpu_mem": 1.820678656,
142
+ "loss": 1.4111,
143
+ "grad_norm": 2.2814691066741943,
144
+ "learning_rate": 0.0002998135381828383
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.4788732394366197,
149
+ "cpu_mem": 3.35616,
150
+ "gpu_mem": 1.82071552,
151
+ "loss": 1.3598,
152
+ "grad_norm": 1.9385331869125366,
153
+ "learning_rate": 0.00029958056957717696
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.5070422535211268,
158
+ "cpu_mem": 3.35616,
159
+ "gpu_mem": 1.820726272,
160
+ "loss": 1.3688,
161
+ "grad_norm": 2.220923900604248,
162
+ "learning_rate": 0.0002992546163048102
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.5352112676056338,
167
+ "cpu_mem": 3.35616,
168
+ "gpu_mem": 1.82066944,
169
+ "loss": 1.3524,
170
+ "grad_norm": 2.5402183532714844,
171
+ "learning_rate": 0.0002988358809900258
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.5633802816901409,
176
+ "cpu_mem": 3.35616,
177
+ "gpu_mem": 1.820740096,
178
+ "loss": 1.3742,
179
+ "grad_norm": 1.5485947132110596,
180
+ "learning_rate": 0.0002983246239337692
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.5915492957746479,
185
+ "cpu_mem": 3.35616,
186
+ "gpu_mem": 1.82073856,
187
+ "loss": 1.3328,
188
+ "grad_norm": 1.8351880311965942,
189
+ "learning_rate": 0.0002977211629518312
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.6197183098591549,
194
+ "cpu_mem": 3.35616,
195
+ "gpu_mem": 1.820695552,
196
+ "loss": 1.3738,
197
+ "grad_norm": 1.9658966064453125,
198
+ "learning_rate": 0.00029702587317728153
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.647887323943662,
203
+ "cpu_mem": 3.35616,
204
+ "gpu_mem": 1.820712448,
205
+ "loss": 1.3493,
206
+ "grad_norm": 1.7647842168807983,
207
+ "learning_rate": 0.0002962391868272735
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.676056338028169,
212
+ "cpu_mem": 3.35616,
213
+ "gpu_mem": 1.82066944,
214
+ "loss": 1.3223,
215
+ "grad_norm": 1.180082082748413,
216
+ "learning_rate": 0.00029536159293436166
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.704225352112676,
221
+ "cpu_mem": 3.35616,
222
+ "gpu_mem": 1.820698624,
223
+ "loss": 1.4242,
224
+ "grad_norm": 2.7960832118988037,
225
+ "learning_rate": 0.00029439363704250176
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.7323943661971831,
230
+ "cpu_mem": 3.35616,
231
+ "gpu_mem": 1.820678656,
232
+ "loss": 1.4995,
233
+ "grad_norm": 2.7815604209899902,
234
+ "learning_rate": 0.00029333592086792107
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.7605633802816901,
239
+ "cpu_mem": 3.35616,
240
+ "gpu_mem": 1.820704768,
241
+ "loss": 1.3165,
242
+ "grad_norm": 0.7974480986595154,
243
+ "learning_rate": 0.0002921891019250697
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.7887323943661971,
248
+ "cpu_mem": 3.35616,
249
+ "gpu_mem": 1.820704768,
250
+ "loss": 1.3992,
251
+ "grad_norm": 1.6349438428878784,
252
+ "learning_rate": 0.0002909538931178862
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.8169014084507042,
257
+ "cpu_mem": 3.35616,
258
+ "gpu_mem": 1.820683264,
259
+ "loss": 1.3072,
260
+ "grad_norm": 1.8837945461273193,
261
+ "learning_rate": 0.00028963106229663063
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.8450704225352113,
266
+ "cpu_mem": 3.35616,
267
+ "gpu_mem": 1.820674048,
268
+ "loss": 1.351,
269
+ "grad_norm": 1.2649332284927368,
270
+ "learning_rate": 0.00028822143178056114
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.8732394366197183,
275
+ "cpu_mem": 3.35616,
276
+ "gpu_mem": 1.82069248,
277
+ "loss": 1.3523,
278
+ "grad_norm": 0.6487420797348022,
279
+ "learning_rate": 0.00028672587784675096
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.9014084507042254,
284
+ "cpu_mem": 3.35616,
285
+ "gpu_mem": 1.82071552,
286
+ "loss": 1.3443,
287
+ "grad_norm": 1.5899792909622192,
288
+ "learning_rate": 0.0002851453301853628
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.9295774647887324,
293
+ "cpu_mem": 3.35616,
294
+ "gpu_mem": 1.820712448,
295
+ "loss": 1.3724,
296
+ "grad_norm": 1.497815728187561,
297
+ "learning_rate": 0.00028348077132172027
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.9577464788732394,
302
+ "cpu_mem": 3.35616,
303
+ "gpu_mem": 1.82071552,
304
+ "loss": 1.393,
305
+ "grad_norm": 1.631191372871399,
306
+ "learning_rate": 0.0002817332360055343
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.9859154929577465,
311
+ "cpu_mem": 3.35616,
312
+ "gpu_mem": 1.820697088,
313
+ "loss": 1.2828,
314
+ "grad_norm": 0.6670944690704346,
315
+ "learning_rate": 0.0002799038105676658
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 1.0140845070422535,
320
+ "cpu_mem": 3.35616,
321
+ "gpu_mem": 1.904762368,
322
+ "loss": 2.0004,
323
+ "grad_norm": 2.8792665004730225,
324
+ "learning_rate": 0.0002779936322448233
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 1.0422535211267605,
329
+ "cpu_mem": 3.35616,
330
+ "gpu_mem": 1.904766976,
331
+ "loss": 1.3754,
332
+ "grad_norm": 1.6826180219650269,
333
+ "learning_rate": 0.0002760038884726157
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 1.0704225352112675,
338
+ "cpu_mem": 3.35616,
339
+ "gpu_mem": 1.904745472,
340
+ "loss": 1.2394,
341
+ "grad_norm": 1.0976940393447876,
342
+ "learning_rate": 0.00027393581614739923
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 1.0985915492957747,
347
+ "cpu_mem": 3.35616,
348
+ "gpu_mem": 1.90473472,
349
+ "loss": 1.3578,
350
+ "grad_norm": 1.8338680267333984,
351
+ "learning_rate": 0.0002717907008573785
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 1.1267605633802817,
356
+ "cpu_mem": 3.35616,
357
+ "gpu_mem": 1.904797696,
358
+ "loss": 1.3568,
359
+ "grad_norm": 1.278975248336792,
360
+ "learning_rate": 0.0002695698760834384
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 1.1549295774647887,
365
+ "cpu_mem": 3.35616,
366
+ "gpu_mem": 1.90475776,
367
+ "loss": 1.3333,
368
+ "grad_norm": 1.2090142965316772,
369
+ "learning_rate": 0.00026727472237020447
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 1.1830985915492958,
374
+ "cpu_mem": 3.35616,
375
+ "gpu_mem": 1.904800768,
376
+ "loss": 1.3686,
377
+ "grad_norm": 1.252097487449646,
378
+ "learning_rate": 0.00026490666646784665
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 1.2112676056338028,
383
+ "cpu_mem": 3.35616,
384
+ "gpu_mem": 1.90475008,
385
+ "loss": 1.3719,
386
+ "grad_norm": 0.7772877216339111,
387
+ "learning_rate": 0.0002624671804451601
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 1.2394366197183098,
392
+ "cpu_mem": 3.35616,
393
+ "gpu_mem": 1.904814592,
394
+ "loss": 1.3569,
395
+ "grad_norm": 1.0584994554519653,
396
+ "learning_rate": 0.0002599577807744739
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 1.267605633802817,
401
+ "cpu_mem": 3.35616,
402
+ "gpu_mem": 1.904782336,
403
+ "loss": 1.3571,
404
+ "grad_norm": 0.783757746219635,
405
+ "learning_rate": 0.0002573800273889577
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 1.295774647887324,
410
+ "cpu_mem": 3.35616,
411
+ "gpu_mem": 1.904786944,
412
+ "loss": 1.3759,
413
+ "grad_norm": 1.1785410642623901,
414
+ "learning_rate": 0.0002547355227129109
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 1.323943661971831,
419
+ "cpu_mem": 3.35616,
420
+ "gpu_mem": 1.904733184,
421
+ "loss": 1.3357,
422
+ "grad_norm": 1.9965938329696655,
423
+ "learning_rate": 0.00025202591066563786
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 1.352112676056338,
428
+ "cpu_mem": 3.35616,
429
+ "gpu_mem": 1.904747008,
430
+ "loss": 1.303,
431
+ "grad_norm": 0.7584894299507141,
432
+ "learning_rate": 0.0002492528756395289
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 1.380281690140845,
437
+ "cpu_mem": 3.35616,
438
+ "gpu_mem": 1.904736256,
439
+ "loss": 1.3147,
440
+ "grad_norm": 1.1462693214416504,
441
+ "learning_rate": 0.0002464181414529809
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 1.408450704225352,
446
+ "cpu_mem": 3.35616,
447
+ "gpu_mem": 1.90475008,
448
+ "loss": 1.401,
449
+ "grad_norm": 2.1255180835723877,
450
+ "learning_rate": 0.00024352347027881003
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 1.436619718309859,
455
+ "cpu_mem": 3.35616,
456
+ "gpu_mem": 1.904802304,
457
+ "loss": 1.3794,
458
+ "grad_norm": 1.7536078691482544,
459
+ "learning_rate": 0.0002405706615488216
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 1.4647887323943662,
464
+ "cpu_mem": 3.35616,
465
+ "gpu_mem": 1.90475008,
466
+ "loss": 1.3257,
467
+ "grad_norm": 1.2524150609970093,
468
+ "learning_rate": 0.00023756155083521846
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 1.4929577464788732,
473
+ "cpu_mem": 3.35616,
474
+ "gpu_mem": 1.9048192,
475
+ "loss": 1.2916,
476
+ "grad_norm": 0.7183902263641357,
477
+ "learning_rate": 0.00023449800870954326
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 1.5211267605633803,
482
+ "cpu_mem": 3.35616,
483
+ "gpu_mem": 1.904786944,
484
+ "loss": 1.3173,
485
+ "grad_norm": 1.6396170854568481,
486
+ "learning_rate": 0.0002313819395798639
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 1.5492957746478875,
491
+ "cpu_mem": 3.35616,
492
+ "gpu_mem": 1.90479616,
493
+ "loss": 1.4298,
494
+ "grad_norm": 2.2417478561401367,
495
+ "learning_rate": 0.0002282152805069247
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 1.5774647887323945,
500
+ "cpu_mem": 3.35616,
501
+ "gpu_mem": 1.904771584,
502
+ "loss": 1.3538,
503
+ "grad_norm": 1.401225209236145,
504
+ "learning_rate": 0.000225
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 1.6056338028169015,
509
+ "cpu_mem": 3.35616,
510
+ "gpu_mem": 1.904805376,
511
+ "loss": 1.2931,
512
+ "grad_norm": 0.6661651730537415,
513
+ "learning_rate": 0.00022173809679319772
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 1.6338028169014085,
518
+ "cpu_mem": 3.35616,
519
+ "gpu_mem": 1.904786944,
520
+ "loss": 1.3105,
521
+ "grad_norm": 0.7987121343612671,
522
+ "learning_rate": 0.00021843159860297442
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 1.6619718309859155,
527
+ "cpu_mem": 3.35616,
528
+ "gpu_mem": 1.90477312,
529
+ "loss": 1.311,
530
+ "grad_norm": 1.006524682044983,
531
+ "learning_rate": 0.00021508256086763368
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 1.6901408450704225,
536
+ "cpu_mem": 3.35616,
537
+ "gpu_mem": 1.90481152,
538
+ "loss": 1.2965,
539
+ "grad_norm": 1.2677685022354126,
540
+ "learning_rate": 0.00021169306546959174
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 1.7183098591549295,
545
+ "cpu_mem": 3.35616,
546
+ "gpu_mem": 1.904743936,
547
+ "loss": 1.3036,
548
+ "grad_norm": 1.6080312728881836,
549
+ "learning_rate": 0.0002082652194412042
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 1.7464788732394365,
554
+ "cpu_mem": 3.35616,
555
+ "gpu_mem": 1.904791552,
556
+ "loss": 1.3916,
557
+ "grad_norm": 1.582788109779358,
558
+ "learning_rate": 0.00020480115365495926
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 1.7746478873239435,
563
+ "cpu_mem": 3.35616,
564
+ "gpu_mem": 1.904740864,
565
+ "loss": 1.3774,
566
+ "grad_norm": 2.0091729164123535,
567
+ "learning_rate": 0.00020130302149885031
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 1.8028169014084507,
572
+ "cpu_mem": 3.35616,
573
+ "gpu_mem": 1.904790016,
574
+ "loss": 1.401,
575
+ "grad_norm": 2.015021562576294,
576
+ "learning_rate": 0.00019777299753775265
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 1.8309859154929577,
581
+ "cpu_mem": 3.35616,
582
+ "gpu_mem": 1.90478848,
583
+ "loss": 1.3467,
584
+ "grad_norm": 1.1769288778305054,
585
+ "learning_rate": 0.00019421327616163563
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 1.8591549295774648,
590
+ "cpu_mem": 3.35616,
591
+ "gpu_mem": 1.904806912,
592
+ "loss": 1.3007,
593
+ "grad_norm": 1.305434226989746,
594
+ "learning_rate": 0.00019062607022145078
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 1.887323943661972,
599
+ "cpu_mem": 3.35616,
600
+ "gpu_mem": 1.904748544,
601
+ "loss": 1.3223,
602
+ "grad_norm": 1.276455283164978,
603
+ "learning_rate": 0.00018701360965354402
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 1.915492957746479,
608
+ "cpu_mem": 3.35616,
609
+ "gpu_mem": 1.904760832,
610
+ "loss": 1.3533,
611
+ "grad_norm": 0.9940769076347351,
612
+ "learning_rate": 0.00018337814009344714
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 1.943661971830986,
617
+ "cpu_mem": 3.35616,
618
+ "gpu_mem": 1.904785408,
619
+ "loss": 1.3161,
620
+ "grad_norm": 1.365990161895752,
621
+ "learning_rate": 0.0001797219214799096
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 1.971830985915493,
626
+ "cpu_mem": 3.35616,
627
+ "gpu_mem": 1.904762368,
628
+ "loss": 1.2294,
629
+ "grad_norm": 0.7751104831695557,
630
+ "learning_rate": 0.00017604722665003956
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 2.0,
635
+ "cpu_mem": 3.35616,
636
+ "gpu_mem": 1.904598016,
637
+ "loss": 2.0285,
638
+ "grad_norm": 2.5267796516418457,
639
+ "learning_rate": 0.00017235633992642615
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 2.028169014084507,
644
+ "cpu_mem": 3.35616,
645
+ "gpu_mem": 1.82070784,
646
+ "loss": 1.3271,
647
+ "grad_norm": 1.5583995580673218,
648
+ "learning_rate": 0.00016865155569712278
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 2.056338028169014,
653
+ "cpu_mem": 3.35616,
654
+ "gpu_mem": 1.820670976,
655
+ "loss": 1.2622,
656
+ "grad_norm": 1.4085772037506104,
657
+ "learning_rate": 0.0001649351769893725
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 2.084507042253521,
662
+ "cpu_mem": 3.35616,
663
+ "gpu_mem": 1.82073088,
664
+ "loss": 1.2426,
665
+ "grad_norm": 1.2474768161773682,
666
+ "learning_rate": 0.00016120951403796364
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 2.112676056338028,
671
+ "cpu_mem": 3.35616,
672
+ "gpu_mem": 1.820698624,
673
+ "loss": 1.2905,
674
+ "grad_norm": 1.4204754829406738,
675
+ "learning_rate": 0.00015747688284910457
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 2.140845070422535,
680
+ "cpu_mem": 3.35616,
681
+ "gpu_mem": 1.820709376,
682
+ "loss": 1.2587,
683
+ "grad_norm": 1.3559608459472656,
684
+ "learning_rate": 0.00015373960376071093
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 2.169014084507042,
689
+ "cpu_mem": 3.35616,
690
+ "gpu_mem": 1.82074624,
691
+ "loss": 1.3177,
692
+ "grad_norm": 1.8520830869674683,
693
+ "learning_rate": 0.00015
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 2.1971830985915495,
698
+ "cpu_mem": 3.35616,
699
+ "gpu_mem": 1.82073088,
700
+ "loss": 1.2753,
701
+ "grad_norm": 1.4194159507751465,
702
+ "learning_rate": 0.00014626039623928907
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 2.2253521126760565,
707
+ "cpu_mem": 3.35616,
708
+ "gpu_mem": 1.820681728,
709
+ "loss": 1.1687,
710
+ "grad_norm": 1.2779954671859741,
711
+ "learning_rate": 0.0001425231171508954
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 2.2535211267605635,
716
+ "cpu_mem": 3.35616,
717
+ "gpu_mem": 1.820726272,
718
+ "loss": 1.2946,
719
+ "grad_norm": 1.634787678718567,
720
+ "learning_rate": 0.00013879048596203636
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 2.2816901408450705,
725
+ "cpu_mem": 3.35616,
726
+ "gpu_mem": 1.820712448,
727
+ "loss": 1.3459,
728
+ "grad_norm": 2.697556972503662,
729
+ "learning_rate": 0.0001350648230106275
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 2.3098591549295775,
734
+ "cpu_mem": 3.35616,
735
+ "gpu_mem": 1.820680192,
736
+ "loss": 1.2763,
737
+ "grad_norm": 2.162923574447632,
738
+ "learning_rate": 0.00013134844430287725
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 2.3380281690140845,
743
+ "cpu_mem": 3.35616,
744
+ "gpu_mem": 1.82073088,
745
+ "loss": 1.2306,
746
+ "grad_norm": 2.2317888736724854,
747
+ "learning_rate": 0.0001276436600735738
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 2.3661971830985915,
752
+ "cpu_mem": 3.35616,
753
+ "gpu_mem": 1.82066944,
754
+ "loss": 1.3417,
755
+ "grad_norm": 1.5752201080322266,
756
+ "learning_rate": 0.00012395277334996044
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 2.3943661971830985,
761
+ "cpu_mem": 3.35616,
762
+ "gpu_mem": 1.82071552,
763
+ "loss": 1.3301,
764
+ "grad_norm": 2.0932695865631104,
765
+ "learning_rate": 0.00012027807852009038
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 2.4225352112676055,
770
+ "cpu_mem": 3.35616,
771
+ "gpu_mem": 1.82066944,
772
+ "loss": 1.2446,
773
+ "grad_norm": 1.411714792251587,
774
+ "learning_rate": 0.00011662185990655284
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 2.4507042253521125,
779
+ "cpu_mem": 3.35616,
780
+ "gpu_mem": 1.82070016,
781
+ "loss": 1.2545,
782
+ "grad_norm": 1.1710765361785889,
783
+ "learning_rate": 0.00011298639034645593
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 2.4788732394366195,
788
+ "cpu_mem": 3.35616,
789
+ "gpu_mem": 1.820675584,
790
+ "loss": 1.176,
791
+ "grad_norm": 1.4399785995483398,
792
+ "learning_rate": 0.00010937392977854923
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 2.507042253521127,
797
+ "cpu_mem": 3.35616,
798
+ "gpu_mem": 1.820729344,
799
+ "loss": 1.2088,
800
+ "grad_norm": 1.4779694080352783,
801
+ "learning_rate": 0.00010578672383836435
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 2.535211267605634,
806
+ "cpu_mem": 3.35616,
807
+ "gpu_mem": 1.820710912,
808
+ "loss": 1.2646,
809
+ "grad_norm": 1.814059853553772,
810
+ "learning_rate": 0.00010222700246224735
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 2.563380281690141,
815
+ "cpu_mem": 3.35616,
816
+ "gpu_mem": 1.820660224,
817
+ "loss": 1.3137,
818
+ "grad_norm": 2.912797451019287,
819
+ "learning_rate": 9.869697850114969e-05
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 2.591549295774648,
824
+ "cpu_mem": 3.35616,
825
+ "gpu_mem": 1.8206848,
826
+ "loss": 1.2847,
827
+ "grad_norm": 1.7235602140426636,
828
+ "learning_rate": 9.519884634504074e-05
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 2.619718309859155,
833
+ "cpu_mem": 3.35616,
834
+ "gpu_mem": 1.820687872,
835
+ "loss": 1.1973,
836
+ "grad_norm": 1.7765477895736694,
837
+ "learning_rate": 9.17347805587958e-05
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 2.647887323943662,
842
+ "cpu_mem": 3.35616,
843
+ "gpu_mem": 1.820680192,
844
+ "loss": 1.1483,
845
+ "grad_norm": 1.4611889123916626,
846
+ "learning_rate": 8.830693453040829e-05
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 2.676056338028169,
851
+ "cpu_mem": 3.35616,
852
+ "gpu_mem": 1.820718592,
853
+ "loss": 1.1896,
854
+ "grad_norm": 1.6439387798309326,
855
+ "learning_rate": 8.491743913236628e-05
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 2.704225352112676,
860
+ "cpu_mem": 3.35616,
861
+ "gpu_mem": 1.820727808,
862
+ "loss": 1.0802,
863
+ "grad_norm": 1.8210686445236206,
864
+ "learning_rate": 8.156840139702554e-05
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 2.732394366197183,
869
+ "cpu_mem": 3.35616,
870
+ "gpu_mem": 1.820670976,
871
+ "loss": 1.2072,
872
+ "grad_norm": 2.035700559616089,
873
+ "learning_rate": 7.82619032068023e-05
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 2.76056338028169,
878
+ "cpu_mem": 3.35616,
879
+ "gpu_mem": 1.820670976,
880
+ "loss": 1.185,
881
+ "grad_norm": 1.6696794033050537,
882
+ "learning_rate": 7.500000000000002e-05
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 2.788732394366197,
887
+ "cpu_mem": 3.35616,
888
+ "gpu_mem": 1.820667904,
889
+ "loss": 1.1511,
890
+ "grad_norm": 1.774680495262146,
891
+ "learning_rate": 7.17847194930753e-05
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 2.816901408450704,
896
+ "cpu_mem": 3.35616,
897
+ "gpu_mem": 1.820666368,
898
+ "loss": 1.091,
899
+ "grad_norm": 2.38977313041687,
900
+ "learning_rate": 6.86180604201361e-05
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 2.845070422535211,
905
+ "cpu_mem": 3.35616,
906
+ "gpu_mem": 1.820709376,
907
+ "loss": 1.0598,
908
+ "grad_norm": 2.5313806533813477,
909
+ "learning_rate": 6.550199129045668e-05
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 2.873239436619718,
914
+ "cpu_mem": 3.35616,
915
+ "gpu_mem": 1.820647936,
916
+ "loss": 1.2758,
917
+ "grad_norm": 3.1765358448028564,
918
+ "learning_rate": 6.243844916478155e-05
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 2.9014084507042255,
923
+ "cpu_mem": 3.35616,
924
+ "gpu_mem": 1.820697088,
925
+ "loss": 1.1792,
926
+ "grad_norm": 2.3216633796691895,
927
+ "learning_rate": 5.9429338451178355e-05
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 2.9295774647887325,
932
+ "cpu_mem": 3.35616,
933
+ "gpu_mem": 1.820760064,
934
+ "loss": 1.3839,
935
+ "grad_norm": 4.054759502410889,
936
+ "learning_rate": 5.6476529721189974e-05
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 2.9577464788732395,
941
+ "cpu_mem": 3.35616,
942
+ "gpu_mem": 1.820712448,
943
+ "loss": 1.1875,
944
+ "grad_norm": 2.587730884552002,
945
+ "learning_rate": 5.358185854701909e-05
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 2.9859154929577465,
950
+ "cpu_mem": 3.35616,
951
+ "gpu_mem": 1.820694016,
952
+ "loss": 1.2425,
953
+ "grad_norm": 2.8117191791534424,
954
+ "learning_rate": 5.074712436047112e-05
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 3.0140845070422535,
959
+ "cpu_mem": 3.35616,
960
+ "gpu_mem": 1.904785408,
961
+ "loss": 1.5796,
962
+ "grad_norm": 5.7084808349609375,
963
+ "learning_rate": 4.7974089334362057e-05
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 3.0422535211267605,
968
+ "cpu_mem": 3.35616,
969
+ "gpu_mem": 1.904766976,
970
+ "loss": 0.9891,
971
+ "grad_norm": 2.1359453201293945,
972
+ "learning_rate": 4.526447728708908e-05
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 3.0704225352112675,
977
+ "cpu_mem": 3.35616,
978
+ "gpu_mem": 1.904756224,
979
+ "loss": 0.9248,
980
+ "grad_norm": 2.321399450302124,
981
+ "learning_rate": 4.261997261104223e-05
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 3.0985915492957745,
986
+ "cpu_mem": 3.35616,
987
+ "gpu_mem": 1.904809984,
988
+ "loss": 0.9902,
989
+ "grad_norm": 2.2922966480255127,
990
+ "learning_rate": 4.004221922552608e-05
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 3.1267605633802815,
995
+ "cpu_mem": 3.35616,
996
+ "gpu_mem": 1.904770048,
997
+ "loss": 1.03,
998
+ "grad_norm": 3.497184991836548,
999
+ "learning_rate": 3.753281955483985e-05
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 3.1549295774647885,
1004
+ "cpu_mem": 3.35616,
1005
+ "gpu_mem": 1.90478848,
1006
+ "loss": 1.0531,
1007
+ "grad_norm": 3.832878589630127,
1008
+ "learning_rate": 3.509333353215331e-05
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 3.183098591549296,
1013
+ "cpu_mem": 3.35616,
1014
+ "gpu_mem": 1.904851456,
1015
+ "loss": 0.8908,
1016
+ "grad_norm": 2.621711254119873,
1017
+ "learning_rate": 3.2725277629795526e-05
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 3.211267605633803,
1022
+ "cpu_mem": 3.35616,
1023
+ "gpu_mem": 1.904779264,
1024
+ "loss": 0.9852,
1025
+ "grad_norm": 3.2300689220428467,
1026
+ "learning_rate": 3.0430123916561672e-05
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 3.23943661971831,
1031
+ "cpu_mem": 3.35616,
1032
+ "gpu_mem": 1.90477312,
1033
+ "loss": 1.1226,
1034
+ "grad_norm": 2.926384687423706,
1035
+ "learning_rate": 2.8209299142621522e-05
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 3.267605633802817,
1040
+ "cpu_mem": 3.35616,
1041
+ "gpu_mem": 1.90478848,
1042
+ "loss": 0.9271,
1043
+ "grad_norm": 2.808828115463257,
1044
+ "learning_rate": 2.6064183852600797e-05
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 3.295774647887324,
1049
+ "cpu_mem": 3.35616,
1050
+ "gpu_mem": 1.90480384,
1051
+ "loss": 0.9244,
1052
+ "grad_norm": 2.575187921524048,
1053
+ "learning_rate": 2.3996111527384288e-05
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 3.323943661971831,
1058
+ "cpu_mem": 3.35616,
1059
+ "gpu_mem": 1.904794624,
1060
+ "loss": 0.907,
1061
+ "grad_norm": 2.466205358505249,
1062
+ "learning_rate": 2.2006367755176655e-05
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 3.352112676056338,
1067
+ "cpu_mem": 3.35616,
1068
+ "gpu_mem": 1.904785408,
1069
+ "loss": 0.9185,
1070
+ "grad_norm": 2.5773041248321533,
1071
+ "learning_rate": 2.009618943233419e-05
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 3.380281690140845,
1076
+ "cpu_mem": 3.35616,
1077
+ "gpu_mem": 1.90480384,
1078
+ "loss": 0.9643,
1079
+ "grad_norm": 2.6320464611053467,
1080
+ "learning_rate": 1.82667639944657e-05
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 3.408450704225352,
1085
+ "cpu_mem": 3.35616,
1086
+ "gpu_mem": 1.904802304,
1087
+ "loss": 0.8452,
1088
+ "grad_norm": 2.8066372871398926,
1089
+ "learning_rate": 1.6519228678279718e-05
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 3.436619718309859,
1094
+ "cpu_mem": 3.35616,
1095
+ "gpu_mem": 1.904759296,
1096
+ "loss": 0.7788,
1097
+ "grad_norm": 2.6877243518829346,
1098
+ "learning_rate": 1.4854669814637143e-05
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 3.464788732394366,
1103
+ "cpu_mem": 3.35616,
1104
+ "gpu_mem": 1.904791552,
1105
+ "loss": 0.8496,
1106
+ "grad_norm": 2.811607837677002,
1107
+ "learning_rate": 1.3274122153249028e-05
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 3.492957746478873,
1112
+ "cpu_mem": 3.35616,
1113
+ "gpu_mem": 1.904745472,
1114
+ "loss": 0.9168,
1115
+ "grad_norm": 3.270836114883423,
1116
+ "learning_rate": 1.1778568219438839e-05
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 3.52112676056338,
1121
+ "cpu_mem": 3.35616,
1122
+ "gpu_mem": 1.904790016,
1123
+ "loss": 0.8047,
1124
+ "grad_norm": 3.4934275150299072,
1125
+ "learning_rate": 1.036893770336938e-05
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 3.5492957746478875,
1130
+ "cpu_mem": 3.35616,
1131
+ "gpu_mem": 1.904740864,
1132
+ "loss": 0.8915,
1133
+ "grad_norm": 4.3499064445495605,
1134
+ "learning_rate": 9.046106882113751e-06
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 3.5774647887323945,
1139
+ "cpu_mem": 3.35616,
1140
+ "gpu_mem": 1.904753152,
1141
+ "loss": 0.8663,
1142
+ "grad_norm": 3.6024134159088135,
1143
+ "learning_rate": 7.810898074930243e-06
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 3.6056338028169015,
1148
+ "cpu_mem": 3.35616,
1149
+ "gpu_mem": 1.904777728,
1150
+ "loss": 0.8417,
1151
+ "grad_norm": 3.112325429916382,
1152
+ "learning_rate": 6.664079132078881e-06
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 3.6338028169014085,
1157
+ "cpu_mem": 3.35616,
1158
+ "gpu_mem": 1.904739328,
1159
+ "loss": 0.7377,
1160
+ "grad_norm": 3.261399507522583,
1161
+ "learning_rate": 5.606362957498195e-06
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 3.6619718309859155,
1166
+ "cpu_mem": 3.35616,
1167
+ "gpu_mem": 1.9047424,
1168
+ "loss": 0.8205,
1169
+ "grad_norm": 3.735766649246216,
1170
+ "learning_rate": 4.638407065638322e-06
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 3.6901408450704225,
1175
+ "cpu_mem": 3.35616,
1176
+ "gpu_mem": 1.904754688,
1177
+ "loss": 0.8529,
1178
+ "grad_norm": 3.8941218852996826,
1179
+ "learning_rate": 3.760813172726457e-06
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 3.7183098591549295,
1184
+ "cpu_mem": 3.35616,
1185
+ "gpu_mem": 1.90471936,
1186
+ "loss": 0.9044,
1187
+ "grad_norm": 3.9655284881591797,
1188
+ "learning_rate": 2.9741268227184255e-06
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 3.7464788732394365,
1193
+ "cpu_mem": 3.35616,
1194
+ "gpu_mem": 1.904760832,
1195
+ "loss": 0.9472,
1196
+ "grad_norm": 4.0462965965271,
1197
+ "learning_rate": 2.2788370481687965e-06
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 3.7746478873239435,
1202
+ "cpu_mem": 3.35616,
1203
+ "gpu_mem": 1.904776192,
1204
+ "loss": 0.8624,
1205
+ "grad_norm": 3.840670108795166,
1206
+ "learning_rate": 1.6753760662307215e-06
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 3.802816901408451,
1211
+ "cpu_mem": 3.35616,
1212
+ "gpu_mem": 1.904740864,
1213
+ "loss": 1.0031,
1214
+ "grad_norm": 4.6597771644592285,
1215
+ "learning_rate": 1.1641190099741904e-06
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 3.830985915492958,
1220
+ "cpu_mem": 3.35616,
1221
+ "gpu_mem": 1.904748544,
1222
+ "loss": 0.8918,
1223
+ "grad_norm": 3.974522590637207,
1224
+ "learning_rate": 7.453836951897885e-07
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 3.859154929577465,
1229
+ "cpu_mem": 3.35616,
1230
+ "gpu_mem": 1.904770048,
1231
+ "loss": 0.7939,
1232
+ "grad_norm": 3.5985963344573975,
1233
+ "learning_rate": 4.194304228229806e-07
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 3.887323943661972,
1238
+ "cpu_mem": 3.35616,
1239
+ "gpu_mem": 1.9047808,
1240
+ "loss": 0.8105,
1241
+ "grad_norm": 3.1945037841796875,
1242
+ "learning_rate": 1.8646181716164831e-07
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 3.915492957746479,
1247
+ "cpu_mem": 3.35616,
1248
+ "gpu_mem": 1.90477312,
1249
+ "loss": 0.9011,
1250
+ "grad_norm": 3.9432594776153564,
1251
+ "learning_rate": 4.662269987756317e-08
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 3.943661971830986,
1256
+ "cpu_mem": 3.35616,
1257
+ "gpu_mem": 1.904806912,
1258
+ "loss": 0.8043,
1259
+ "grad_norm": 3.507493019104004,
1260
+ "learning_rate": 0.0
1261
+ },
1262
+ {
1263
+ "step": 140,
1264
+ "epoch": 3.943661971830986,
1265
+ "cpu_mem": 3.35616,
1266
+ "gpu_mem": 1.904806912,
1267
+ "train_runtime": 703.9932,
1268
+ "train_samples_per_second": 12.79,
1269
+ "train_steps_per_second": 0.199,
1270
+ "total_flos": 0.0,
1271
+ "train_loss": 1.2982905549662453
1272
+ }
1273
+ ]
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 3,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 8,
25
+ "r": 8,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 8,
29
+ "target_modules": [
30
+ "down_proj",
31
+ "v_proj",
32
+ "k_proj",
33
+ "up_proj",
34
+ "gate_proj",
35
+ "o_proj",
36
+ "q_proj"
37
+ ],
38
+ "task_type": null,
39
+ "use_bnb": false
40
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "task": "arc_e",
3
+ "results": 0.6839225589225589
4
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_E",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "mars",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 5233536
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-mars-arc_e-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-arc_e-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-09-02T08:03:35.730715"
38
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-arc_e-r8-a2/training_logs.json ADDED
@@ -0,0 +1,1273 @@
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.028169014084507043,
5
+ "cpu_mem": 2.529038336,
6
+ "gpu_mem": 1.58622976,
7
+ "loss": 4.6935,
8
+ "grad_norm": 183.89422607421875,
9
+ "learning_rate": 2.1428571428571425e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.056338028169014086,
14
+ "cpu_mem": 2.531201024,
15
+ "gpu_mem": 1.62817024,
16
+ "loss": 4.5357,
17
+ "grad_norm": 189.22933959960938,
18
+ "learning_rate": 4.285714285714285e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.08450704225352113,
23
+ "cpu_mem": 2.531397632,
24
+ "gpu_mem": 1.628148736,
25
+ "loss": 2.9034,
26
+ "grad_norm": 198.05426025390625,
27
+ "learning_rate": 6.428571428571427e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.11267605633802817,
32
+ "cpu_mem": 2.53159424,
33
+ "gpu_mem": 1.628127232,
34
+ "loss": 2.0558,
35
+ "grad_norm": 34.57008743286133,
36
+ "learning_rate": 8.57142857142857e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.14084507042253522,
41
+ "cpu_mem": 2.53159424,
42
+ "gpu_mem": 1.628168704,
43
+ "loss": 1.5141,
44
+ "grad_norm": 9.954888343811035,
45
+ "learning_rate": 0.00010714285714285714
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.16901408450704225,
50
+ "cpu_mem": 2.53159424,
51
+ "gpu_mem": 1.628144128,
52
+ "loss": 1.4406,
53
+ "grad_norm": 18.402481079101562,
54
+ "learning_rate": 0.00012857142857142855
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.19718309859154928,
59
+ "cpu_mem": 2.531790848,
60
+ "gpu_mem": 1.628167168,
61
+ "loss": 1.5382,
62
+ "grad_norm": 21.70599937438965,
63
+ "learning_rate": 0.00015
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.22535211267605634,
68
+ "cpu_mem": 2.531790848,
69
+ "gpu_mem": 1.628125696,
70
+ "loss": 1.3666,
71
+ "grad_norm": 8.612508773803711,
72
+ "learning_rate": 0.0001714285714285714
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.2535211267605634,
77
+ "cpu_mem": 2.531790848,
78
+ "gpu_mem": 1.628127232,
79
+ "loss": 1.4335,
80
+ "grad_norm": 14.70494270324707,
81
+ "learning_rate": 0.00019285714285714286
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.28169014084507044,
86
+ "cpu_mem": 2.531790848,
87
+ "gpu_mem": 1.628122624,
88
+ "loss": 1.538,
89
+ "grad_norm": 17.99971580505371,
90
+ "learning_rate": 0.00021428571428571427
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.30985915492957744,
95
+ "cpu_mem": 2.531790848,
96
+ "gpu_mem": 1.62820096,
97
+ "loss": 1.3729,
98
+ "grad_norm": 6.285915374755859,
99
+ "learning_rate": 0.00023571428571428569
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.3380281690140845,
104
+ "cpu_mem": 2.531987456,
105
+ "gpu_mem": 1.628174848,
106
+ "loss": 1.5256,
107
+ "grad_norm": 18.817567825317383,
108
+ "learning_rate": 0.0002571428571428571
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.36619718309859156,
113
+ "cpu_mem": 2.531987456,
114
+ "gpu_mem": 1.628125696,
115
+ "loss": 1.3357,
116
+ "grad_norm": 4.089114189147949,
117
+ "learning_rate": 0.00027857142857142854
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.39436619718309857,
122
+ "cpu_mem": 2.531987456,
123
+ "gpu_mem": 1.6281472,
124
+ "loss": 1.3939,
125
+ "grad_norm": 8.272595405578613,
126
+ "learning_rate": 0.0003
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.4225352112676056,
131
+ "cpu_mem": 2.531987456,
132
+ "gpu_mem": 1.62812416,
133
+ "loss": 1.3336,
134
+ "grad_norm": 4.772479057312012,
135
+ "learning_rate": 0.0002999533773001224
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.4507042253521127,
140
+ "cpu_mem": 2.531987456,
141
+ "gpu_mem": 1.628128768,
142
+ "loss": 1.3705,
143
+ "grad_norm": 4.441692352294922,
144
+ "learning_rate": 0.0002998135381828383
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.4788732394366197,
149
+ "cpu_mem": 2.531987456,
150
+ "gpu_mem": 1.628165632,
151
+ "loss": 1.3695,
152
+ "grad_norm": 5.738226413726807,
153
+ "learning_rate": 0.00029958056957717696
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.5070422535211268,
158
+ "cpu_mem": 2.531987456,
159
+ "gpu_mem": 1.628176384,
160
+ "loss": 1.4602,
161
+ "grad_norm": 8.853696823120117,
162
+ "learning_rate": 0.0002992546163048102
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.5352112676056338,
167
+ "cpu_mem": 2.531987456,
168
+ "gpu_mem": 1.628119552,
169
+ "loss": 1.301,
170
+ "grad_norm": 5.047065258026123,
171
+ "learning_rate": 0.0002988358809900258
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.5633802816901409,
176
+ "cpu_mem": 2.531987456,
177
+ "gpu_mem": 1.628190208,
178
+ "loss": 1.3813,
179
+ "grad_norm": 5.146203517913818,
180
+ "learning_rate": 0.0002983246239337692
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.5915492957746479,
185
+ "cpu_mem": 2.531987456,
186
+ "gpu_mem": 1.628188672,
187
+ "loss": 1.3252,
188
+ "grad_norm": 4.078402996063232,
189
+ "learning_rate": 0.0002977211629518312
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.6197183098591549,
194
+ "cpu_mem": 2.531987456,
195
+ "gpu_mem": 1.628145664,
196
+ "loss": 1.3251,
197
+ "grad_norm": 3.823366641998291,
198
+ "learning_rate": 0.00029702587317728153
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.647887323943662,
203
+ "cpu_mem": 2.531987456,
204
+ "gpu_mem": 1.62816256,
205
+ "loss": 1.3462,
206
+ "grad_norm": 3.1414875984191895,
207
+ "learning_rate": 0.0002962391868272735
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.676056338028169,
212
+ "cpu_mem": 2.531987456,
213
+ "gpu_mem": 1.628119552,
214
+ "loss": 1.3586,
215
+ "grad_norm": 4.213393211364746,
216
+ "learning_rate": 0.00029536159293436166
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.704225352112676,
221
+ "cpu_mem": 2.531987456,
222
+ "gpu_mem": 1.628148736,
223
+ "loss": 1.374,
224
+ "grad_norm": 2.9499149322509766,
225
+ "learning_rate": 0.00029439363704250176
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.7323943661971831,
230
+ "cpu_mem": 2.531987456,
231
+ "gpu_mem": 1.628128768,
232
+ "loss": 1.4602,
233
+ "grad_norm": 3.467322826385498,
234
+ "learning_rate": 0.00029333592086792107
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.7605633802816901,
239
+ "cpu_mem": 2.531987456,
240
+ "gpu_mem": 1.62815488,
241
+ "loss": 1.3166,
242
+ "grad_norm": 1.3668574094772339,
243
+ "learning_rate": 0.0002921891019250697
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.7887323943661971,
248
+ "cpu_mem": 2.531987456,
249
+ "gpu_mem": 1.62815488,
250
+ "loss": 1.4116,
251
+ "grad_norm": 3.6738901138305664,
252
+ "learning_rate": 0.0002909538931178862
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.8169014084507042,
257
+ "cpu_mem": 2.531987456,
258
+ "gpu_mem": 1.628133376,
259
+ "loss": 1.26,
260
+ "grad_norm": 2.2090518474578857,
261
+ "learning_rate": 0.00028963106229663063
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.8450704225352113,
266
+ "cpu_mem": 2.531987456,
267
+ "gpu_mem": 1.62812416,
268
+ "loss": 1.3387,
269
+ "grad_norm": 2.738809823989868,
270
+ "learning_rate": 0.00028822143178056114
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.8732394366197183,
275
+ "cpu_mem": 2.531987456,
276
+ "gpu_mem": 1.628142592,
277
+ "loss": 1.3622,
278
+ "grad_norm": 1.6162611246109009,
279
+ "learning_rate": 0.00028672587784675096
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.9014084507042254,
284
+ "cpu_mem": 2.531987456,
285
+ "gpu_mem": 1.628165632,
286
+ "loss": 1.3308,
287
+ "grad_norm": 3.204833507537842,
288
+ "learning_rate": 0.0002851453301853628
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.9295774647887324,
293
+ "cpu_mem": 2.531987456,
294
+ "gpu_mem": 1.62816256,
295
+ "loss": 1.3762,
296
+ "grad_norm": 2.8587379455566406,
297
+ "learning_rate": 0.00028348077132172027
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.9577464788732394,
302
+ "cpu_mem": 2.531987456,
303
+ "gpu_mem": 1.628165632,
304
+ "loss": 1.4294,
305
+ "grad_norm": 3.531189441680908,
306
+ "learning_rate": 0.0002817332360055343
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.9859154929577465,
311
+ "cpu_mem": 2.531987456,
312
+ "gpu_mem": 1.6281472,
313
+ "loss": 1.3384,
314
+ "grad_norm": 2.4379618167877197,
315
+ "learning_rate": 0.0002799038105676658
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 1.0140845070422535,
320
+ "cpu_mem": 2.531987456,
321
+ "gpu_mem": 1.64907776,
322
+ "loss": 1.961,
323
+ "grad_norm": 3.7823243141174316,
324
+ "learning_rate": 0.0002779936322448233
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 1.0422535211267605,
329
+ "cpu_mem": 2.531987456,
330
+ "gpu_mem": 1.649082368,
331
+ "loss": 1.3818,
332
+ "grad_norm": 3.630120038986206,
333
+ "learning_rate": 0.0002760038884726157
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 1.0704225352112675,
338
+ "cpu_mem": 2.531987456,
339
+ "gpu_mem": 1.649060864,
340
+ "loss": 1.2186,
341
+ "grad_norm": 2.131730794906616,
342
+ "learning_rate": 0.00027393581614739923
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 1.0985915492957747,
347
+ "cpu_mem": 2.531987456,
348
+ "gpu_mem": 1.649050112,
349
+ "loss": 1.4261,
350
+ "grad_norm": 5.274710655212402,
351
+ "learning_rate": 0.0002717907008573785
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 1.1267605633802817,
356
+ "cpu_mem": 2.531987456,
357
+ "gpu_mem": 1.649113088,
358
+ "loss": 1.3845,
359
+ "grad_norm": 6.914450168609619,
360
+ "learning_rate": 0.0002695698760834384
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 1.1549295774647887,
365
+ "cpu_mem": 2.531987456,
366
+ "gpu_mem": 1.649073152,
367
+ "loss": 1.2903,
368
+ "grad_norm": 1.467061996459961,
369
+ "learning_rate": 0.00026727472237020447
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 1.1830985915492958,
374
+ "cpu_mem": 2.531987456,
375
+ "gpu_mem": 1.64911616,
376
+ "loss": 1.3445,
377
+ "grad_norm": 2.253526449203491,
378
+ "learning_rate": 0.00026490666646784665
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 1.2112676056338028,
383
+ "cpu_mem": 2.531987456,
384
+ "gpu_mem": 1.649065472,
385
+ "loss": 1.354,
386
+ "grad_norm": 2.19681453704834,
387
+ "learning_rate": 0.0002624671804451601
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 1.2394366197183098,
392
+ "cpu_mem": 2.531987456,
393
+ "gpu_mem": 1.649129984,
394
+ "loss": 1.3465,
395
+ "grad_norm": 2.786311388015747,
396
+ "learning_rate": 0.0002599577807744739
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 1.267605633802817,
401
+ "cpu_mem": 2.531987456,
402
+ "gpu_mem": 1.649097728,
403
+ "loss": 1.3308,
404
+ "grad_norm": 1.906112790107727,
405
+ "learning_rate": 0.0002573800273889577
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 1.295774647887324,
410
+ "cpu_mem": 2.531987456,
411
+ "gpu_mem": 1.649102336,
412
+ "loss": 1.3854,
413
+ "grad_norm": 2.904219627380371,
414
+ "learning_rate": 0.0002547355227129109
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 1.323943661971831,
419
+ "cpu_mem": 2.531987456,
420
+ "gpu_mem": 1.649048576,
421
+ "loss": 1.3704,
422
+ "grad_norm": 4.026738166809082,
423
+ "learning_rate": 0.00025202591066563786
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 1.352112676056338,
428
+ "cpu_mem": 2.531987456,
429
+ "gpu_mem": 1.6490624,
430
+ "loss": 1.3392,
431
+ "grad_norm": 2.988410711288452,
432
+ "learning_rate": 0.0002492528756395289
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 1.380281690140845,
437
+ "cpu_mem": 2.531987456,
438
+ "gpu_mem": 1.649051648,
439
+ "loss": 1.2991,
440
+ "grad_norm": 3.5249204635620117,
441
+ "learning_rate": 0.0002464181414529809
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 1.408450704225352,
446
+ "cpu_mem": 2.531987456,
447
+ "gpu_mem": 1.649065472,
448
+ "loss": 1.387,
449
+ "grad_norm": 10.594127655029297,
450
+ "learning_rate": 0.00024352347027881003
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 1.436619718309859,
455
+ "cpu_mem": 2.531987456,
456
+ "gpu_mem": 1.649117696,
457
+ "loss": 1.3701,
458
+ "grad_norm": 4.035572052001953,
459
+ "learning_rate": 0.0002405706615488216
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 1.4647887323943662,
464
+ "cpu_mem": 2.531987456,
465
+ "gpu_mem": 1.649065472,
466
+ "loss": 1.3821,
467
+ "grad_norm": 4.223878860473633,
468
+ "learning_rate": 0.00023756155083521846
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 1.4929577464788732,
473
+ "cpu_mem": 2.531987456,
474
+ "gpu_mem": 1.649134592,
475
+ "loss": 1.2939,
476
+ "grad_norm": 2.68819522857666,
477
+ "learning_rate": 0.00023449800870954326
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 1.5211267605633803,
482
+ "cpu_mem": 2.531987456,
483
+ "gpu_mem": 1.649102336,
484
+ "loss": 1.2812,
485
+ "grad_norm": 2.8337361812591553,
486
+ "learning_rate": 0.0002313819395798639
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 1.5492957746478875,
491
+ "cpu_mem": 2.531987456,
492
+ "gpu_mem": 1.649111552,
493
+ "loss": 1.4149,
494
+ "grad_norm": 3.814574956893921,
495
+ "learning_rate": 0.0002282152805069247
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 1.5774647887323945,
500
+ "cpu_mem": 2.531987456,
501
+ "gpu_mem": 1.649086976,
502
+ "loss": 1.3374,
503
+ "grad_norm": 2.5673627853393555,
504
+ "learning_rate": 0.000225
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 1.6056338028169015,
509
+ "cpu_mem": 2.531987456,
510
+ "gpu_mem": 1.649120768,
511
+ "loss": 1.3018,
512
+ "grad_norm": 2.016061782836914,
513
+ "learning_rate": 0.00022173809679319772
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 1.6338028169014085,
518
+ "cpu_mem": 2.531987456,
519
+ "gpu_mem": 1.649102336,
520
+ "loss": 1.3003,
521
+ "grad_norm": 2.109996795654297,
522
+ "learning_rate": 0.00021843159860297442
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 1.6619718309859155,
527
+ "cpu_mem": 2.531987456,
528
+ "gpu_mem": 1.649088512,
529
+ "loss": 1.3424,
530
+ "grad_norm": 1.9124606847763062,
531
+ "learning_rate": 0.00021508256086763368
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 1.6901408450704225,
536
+ "cpu_mem": 2.531987456,
537
+ "gpu_mem": 1.649126912,
538
+ "loss": 1.3049,
539
+ "grad_norm": 2.3165783882141113,
540
+ "learning_rate": 0.00021169306546959174
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 1.7183098591549295,
545
+ "cpu_mem": 2.531987456,
546
+ "gpu_mem": 1.649059328,
547
+ "loss": 1.3213,
548
+ "grad_norm": 2.8745310306549072,
549
+ "learning_rate": 0.0002082652194412042
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 1.7464788732394365,
554
+ "cpu_mem": 2.531987456,
555
+ "gpu_mem": 1.649106944,
556
+ "loss": 1.386,
557
+ "grad_norm": 2.8461756706237793,
558
+ "learning_rate": 0.00020480115365495926
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 1.7746478873239435,
563
+ "cpu_mem": 2.531987456,
564
+ "gpu_mem": 1.649056256,
565
+ "loss": 1.36,
566
+ "grad_norm": 2.8900582790374756,
567
+ "learning_rate": 0.00020130302149885031
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 1.8028169014084507,
572
+ "cpu_mem": 2.531987456,
573
+ "gpu_mem": 1.649105408,
574
+ "loss": 1.3483,
575
+ "grad_norm": 2.471662998199463,
576
+ "learning_rate": 0.00019777299753775265
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 1.8309859154929577,
581
+ "cpu_mem": 2.531987456,
582
+ "gpu_mem": 1.649103872,
583
+ "loss": 1.3554,
584
+ "grad_norm": 2.223353385925293,
585
+ "learning_rate": 0.00019421327616163563
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 1.8591549295774648,
590
+ "cpu_mem": 2.531987456,
591
+ "gpu_mem": 1.649122304,
592
+ "loss": 1.3305,
593
+ "grad_norm": 2.4578514099121094,
594
+ "learning_rate": 0.00019062607022145078
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 1.887323943661972,
599
+ "cpu_mem": 2.531987456,
600
+ "gpu_mem": 1.649063936,
601
+ "loss": 1.3244,
602
+ "grad_norm": 2.145490884780884,
603
+ "learning_rate": 0.00018701360965354402
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 1.915492957746479,
608
+ "cpu_mem": 2.531987456,
609
+ "gpu_mem": 1.649076224,
610
+ "loss": 1.3476,
611
+ "grad_norm": 1.486546516418457,
612
+ "learning_rate": 0.00018337814009344714
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 1.943661971830986,
617
+ "cpu_mem": 2.531987456,
618
+ "gpu_mem": 1.6491008,
619
+ "loss": 1.3053,
620
+ "grad_norm": 2.1030406951904297,
621
+ "learning_rate": 0.0001797219214799096
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 1.971830985915493,
626
+ "cpu_mem": 2.531987456,
627
+ "gpu_mem": 1.64907776,
628
+ "loss": 1.2559,
629
+ "grad_norm": 1.7029767036437988,
630
+ "learning_rate": 0.00017604722665003956
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 2.0,
635
+ "cpu_mem": 2.531987456,
636
+ "gpu_mem": 1.648913408,
637
+ "loss": 1.9143,
638
+ "grad_norm": 3.819026231765747,
639
+ "learning_rate": 0.00017235633992642615
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 2.028169014084507,
644
+ "cpu_mem": 2.531987456,
645
+ "gpu_mem": 1.628157952,
646
+ "loss": 1.3873,
647
+ "grad_norm": 4.2176618576049805,
648
+ "learning_rate": 0.00016865155569712278
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 2.056338028169014,
653
+ "cpu_mem": 2.531987456,
654
+ "gpu_mem": 1.628121088,
655
+ "loss": 1.27,
656
+ "grad_norm": 2.87872052192688,
657
+ "learning_rate": 0.0001649351769893725
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 2.084507042253521,
662
+ "cpu_mem": 2.531987456,
663
+ "gpu_mem": 1.628180992,
664
+ "loss": 1.2572,
665
+ "grad_norm": 2.571578025817871,
666
+ "learning_rate": 0.00016120951403796364
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 2.112676056338028,
671
+ "cpu_mem": 2.531987456,
672
+ "gpu_mem": 1.628148736,
673
+ "loss": 1.2877,
674
+ "grad_norm": 3.2488789558410645,
675
+ "learning_rate": 0.00015747688284910457
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 2.140845070422535,
680
+ "cpu_mem": 2.531987456,
681
+ "gpu_mem": 1.628159488,
682
+ "loss": 1.2229,
683
+ "grad_norm": 2.675572633743286,
684
+ "learning_rate": 0.00015373960376071093
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 2.169014084507042,
689
+ "cpu_mem": 2.531987456,
690
+ "gpu_mem": 1.628196352,
691
+ "loss": 1.2628,
692
+ "grad_norm": 2.3351566791534424,
693
+ "learning_rate": 0.00015
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 2.1971830985915495,
698
+ "cpu_mem": 2.531987456,
699
+ "gpu_mem": 1.628180992,
700
+ "loss": 1.3087,
701
+ "grad_norm": 2.939434289932251,
702
+ "learning_rate": 0.00014626039623928907
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 2.2253521126760565,
707
+ "cpu_mem": 2.531987456,
708
+ "gpu_mem": 1.62813184,
709
+ "loss": 1.1645,
710
+ "grad_norm": 2.723719358444214,
711
+ "learning_rate": 0.0001425231171508954
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 2.2535211267605635,
716
+ "cpu_mem": 2.531987456,
717
+ "gpu_mem": 1.628176384,
718
+ "loss": 1.2542,
719
+ "grad_norm": 2.5774831771850586,
720
+ "learning_rate": 0.00013879048596203636
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 2.2816901408450705,
725
+ "cpu_mem": 2.531987456,
726
+ "gpu_mem": 1.62816256,
727
+ "loss": 1.2592,
728
+ "grad_norm": 2.8569812774658203,
729
+ "learning_rate": 0.0001350648230106275
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 2.3098591549295775,
734
+ "cpu_mem": 2.531987456,
735
+ "gpu_mem": 1.628130304,
736
+ "loss": 1.1921,
737
+ "grad_norm": 3.697158098220825,
738
+ "learning_rate": 0.00013134844430287725
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 2.3380281690140845,
743
+ "cpu_mem": 2.531987456,
744
+ "gpu_mem": 1.628180992,
745
+ "loss": 1.1601,
746
+ "grad_norm": 3.921438694000244,
747
+ "learning_rate": 0.0001276436600735738
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 2.3661971830985915,
752
+ "cpu_mem": 2.531987456,
753
+ "gpu_mem": 1.628119552,
754
+ "loss": 1.2786,
755
+ "grad_norm": 4.190140724182129,
756
+ "learning_rate": 0.00012395277334996044
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 2.3943661971830985,
761
+ "cpu_mem": 2.531987456,
762
+ "gpu_mem": 1.628165632,
763
+ "loss": 1.2469,
764
+ "grad_norm": 4.45603609085083,
765
+ "learning_rate": 0.00012027807852009038
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 2.4225352112676055,
770
+ "cpu_mem": 2.531987456,
771
+ "gpu_mem": 1.628119552,
772
+ "loss": 1.1951,
773
+ "grad_norm": 3.795806407928467,
774
+ "learning_rate": 0.00011662185990655284
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 2.4507042253521125,
779
+ "cpu_mem": 2.531987456,
780
+ "gpu_mem": 1.628150272,
781
+ "loss": 1.2247,
782
+ "grad_norm": 6.081866264343262,
783
+ "learning_rate": 0.00011298639034645593
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 2.4788732394366195,
788
+ "cpu_mem": 2.531987456,
789
+ "gpu_mem": 1.628125696,
790
+ "loss": 1.1483,
791
+ "grad_norm": 3.995368242263794,
792
+ "learning_rate": 0.00010937392977854923
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 2.507042253521127,
797
+ "cpu_mem": 2.531987456,
798
+ "gpu_mem": 1.628179456,
799
+ "loss": 1.1926,
800
+ "grad_norm": 5.806249618530273,
801
+ "learning_rate": 0.00010578672383836435
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 2.535211267605634,
806
+ "cpu_mem": 2.531987456,
807
+ "gpu_mem": 1.628161024,
808
+ "loss": 1.1925,
809
+ "grad_norm": 6.463298320770264,
810
+ "learning_rate": 0.00010222700246224735
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 2.563380281690141,
815
+ "cpu_mem": 2.531987456,
816
+ "gpu_mem": 1.628110336,
817
+ "loss": 1.1719,
818
+ "grad_norm": 6.830526351928711,
819
+ "learning_rate": 9.869697850114969e-05
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 2.591549295774648,
824
+ "cpu_mem": 2.531987456,
825
+ "gpu_mem": 1.628134912,
826
+ "loss": 1.4142,
827
+ "grad_norm": 11.937300682067871,
828
+ "learning_rate": 9.519884634504074e-05
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 2.619718309859155,
833
+ "cpu_mem": 2.531987456,
834
+ "gpu_mem": 1.628137984,
835
+ "loss": 1.2969,
836
+ "grad_norm": 9.072361946105957,
837
+ "learning_rate": 9.17347805587958e-05
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 2.647887323943662,
842
+ "cpu_mem": 2.531987456,
843
+ "gpu_mem": 1.628130304,
844
+ "loss": 1.2869,
845
+ "grad_norm": 10.54022216796875,
846
+ "learning_rate": 8.830693453040829e-05
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 2.676056338028169,
851
+ "cpu_mem": 2.531987456,
852
+ "gpu_mem": 1.628168704,
853
+ "loss": 1.2361,
854
+ "grad_norm": 8.78108024597168,
855
+ "learning_rate": 8.491743913236628e-05
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 2.704225352112676,
860
+ "cpu_mem": 2.531987456,
861
+ "gpu_mem": 1.62817792,
862
+ "loss": 1.182,
863
+ "grad_norm": 7.460996150970459,
864
+ "learning_rate": 8.156840139702554e-05
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 2.732394366197183,
869
+ "cpu_mem": 2.531987456,
870
+ "gpu_mem": 1.628121088,
871
+ "loss": 1.1831,
872
+ "grad_norm": 6.490010738372803,
873
+ "learning_rate": 7.82619032068023e-05
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 2.76056338028169,
878
+ "cpu_mem": 2.531987456,
879
+ "gpu_mem": 1.628121088,
880
+ "loss": 1.1482,
881
+ "grad_norm": 4.607930660247803,
882
+ "learning_rate": 7.500000000000002e-05
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 2.788732394366197,
887
+ "cpu_mem": 2.531987456,
888
+ "gpu_mem": 1.628118016,
889
+ "loss": 1.1483,
890
+ "grad_norm": 4.311230182647705,
891
+ "learning_rate": 7.17847194930753e-05
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 2.816901408450704,
896
+ "cpu_mem": 2.531987456,
897
+ "gpu_mem": 1.62811648,
898
+ "loss": 1.1551,
899
+ "grad_norm": 5.20615816116333,
900
+ "learning_rate": 6.86180604201361e-05
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 2.845070422535211,
905
+ "cpu_mem": 2.531987456,
906
+ "gpu_mem": 1.628159488,
907
+ "loss": 1.0691,
908
+ "grad_norm": 5.477205753326416,
909
+ "learning_rate": 6.550199129045668e-05
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 2.873239436619718,
914
+ "cpu_mem": 2.531987456,
915
+ "gpu_mem": 1.628098048,
916
+ "loss": 1.2742,
917
+ "grad_norm": 6.022722244262695,
918
+ "learning_rate": 6.243844916478155e-05
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 2.9014084507042255,
923
+ "cpu_mem": 2.531987456,
924
+ "gpu_mem": 1.6281472,
925
+ "loss": 1.1785,
926
+ "grad_norm": 4.99077844619751,
927
+ "learning_rate": 5.9429338451178355e-05
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 2.9295774647887325,
932
+ "cpu_mem": 2.531987456,
933
+ "gpu_mem": 1.628210176,
934
+ "loss": 1.3201,
935
+ "grad_norm": 6.570130825042725,
936
+ "learning_rate": 5.6476529721189974e-05
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 2.9577464788732395,
941
+ "cpu_mem": 2.531987456,
942
+ "gpu_mem": 1.62816256,
943
+ "loss": 1.1121,
944
+ "grad_norm": 4.336287498474121,
945
+ "learning_rate": 5.358185854701909e-05
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 2.9859154929577465,
950
+ "cpu_mem": 2.531987456,
951
+ "gpu_mem": 1.628144128,
952
+ "loss": 1.1449,
953
+ "grad_norm": 4.363764762878418,
954
+ "learning_rate": 5.074712436047112e-05
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 3.0140845070422535,
959
+ "cpu_mem": 2.531987456,
960
+ "gpu_mem": 1.6491008,
961
+ "loss": 1.4543,
962
+ "grad_norm": 9.88137149810791,
963
+ "learning_rate": 4.7974089334362057e-05
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 3.0422535211267605,
968
+ "cpu_mem": 2.531987456,
969
+ "gpu_mem": 1.649082368,
970
+ "loss": 0.9681,
971
+ "grad_norm": 4.040791988372803,
972
+ "learning_rate": 4.526447728708908e-05
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 3.0704225352112675,
977
+ "cpu_mem": 2.531987456,
978
+ "gpu_mem": 1.649071616,
979
+ "loss": 0.9507,
980
+ "grad_norm": 4.64716911315918,
981
+ "learning_rate": 4.261997261104223e-05
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 3.0985915492957745,
986
+ "cpu_mem": 2.531987456,
987
+ "gpu_mem": 1.649125376,
988
+ "loss": 1.0047,
989
+ "grad_norm": 5.491010665893555,
990
+ "learning_rate": 4.004221922552608e-05
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 3.1267605633802815,
995
+ "cpu_mem": 2.531987456,
996
+ "gpu_mem": 1.64908544,
997
+ "loss": 1.0033,
998
+ "grad_norm": 4.694374084472656,
999
+ "learning_rate": 3.753281955483985e-05
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 3.1549295774647885,
1004
+ "cpu_mem": 2.531987456,
1005
+ "gpu_mem": 1.649103872,
1006
+ "loss": 0.9728,
1007
+ "grad_norm": 4.7792439460754395,
1008
+ "learning_rate": 3.509333353215331e-05
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 3.183098591549296,
1013
+ "cpu_mem": 2.531987456,
1014
+ "gpu_mem": 1.649166848,
1015
+ "loss": 0.9014,
1016
+ "grad_norm": 5.062333106994629,
1017
+ "learning_rate": 3.2725277629795526e-05
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 3.211267605633803,
1022
+ "cpu_mem": 2.531987456,
1023
+ "gpu_mem": 1.649094656,
1024
+ "loss": 0.9253,
1025
+ "grad_norm": 5.648083686828613,
1026
+ "learning_rate": 3.0430123916561672e-05
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 3.23943661971831,
1031
+ "cpu_mem": 2.531987456,
1032
+ "gpu_mem": 1.649088512,
1033
+ "loss": 1.0593,
1034
+ "grad_norm": 6.263454914093018,
1035
+ "learning_rate": 2.8209299142621522e-05
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 3.267605633802817,
1040
+ "cpu_mem": 2.531987456,
1041
+ "gpu_mem": 1.649103872,
1042
+ "loss": 0.8779,
1043
+ "grad_norm": 6.009137153625488,
1044
+ "learning_rate": 2.6064183852600797e-05
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 3.295774647887324,
1049
+ "cpu_mem": 2.531987456,
1050
+ "gpu_mem": 1.649119232,
1051
+ "loss": 0.9084,
1052
+ "grad_norm": 5.85214376449585,
1053
+ "learning_rate": 2.3996111527384288e-05
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 3.323943661971831,
1058
+ "cpu_mem": 2.531987456,
1059
+ "gpu_mem": 1.649110016,
1060
+ "loss": 0.8025,
1061
+ "grad_norm": 5.127455234527588,
1062
+ "learning_rate": 2.2006367755176655e-05
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 3.352112676056338,
1067
+ "cpu_mem": 2.531987456,
1068
+ "gpu_mem": 1.6491008,
1069
+ "loss": 0.8571,
1070
+ "grad_norm": 5.880275726318359,
1071
+ "learning_rate": 2.009618943233419e-05
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 3.380281690140845,
1076
+ "cpu_mem": 2.531987456,
1077
+ "gpu_mem": 1.649119232,
1078
+ "loss": 0.9102,
1079
+ "grad_norm": 5.5184149742126465,
1080
+ "learning_rate": 1.82667639944657e-05
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 3.408450704225352,
1085
+ "cpu_mem": 2.531987456,
1086
+ "gpu_mem": 1.649117696,
1087
+ "loss": 0.8212,
1088
+ "grad_norm": 6.405752658843994,
1089
+ "learning_rate": 1.6519228678279718e-05
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 3.436619718309859,
1094
+ "cpu_mem": 2.531987456,
1095
+ "gpu_mem": 1.649074688,
1096
+ "loss": 0.6898,
1097
+ "grad_norm": 5.653371810913086,
1098
+ "learning_rate": 1.4854669814637143e-05
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 3.464788732394366,
1103
+ "cpu_mem": 2.531987456,
1104
+ "gpu_mem": 1.649106944,
1105
+ "loss": 0.752,
1106
+ "grad_norm": 6.088992118835449,
1107
+ "learning_rate": 1.3274122153249028e-05
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 3.492957746478873,
1112
+ "cpu_mem": 2.531987456,
1113
+ "gpu_mem": 1.649060864,
1114
+ "loss": 0.8299,
1115
+ "grad_norm": 6.718883514404297,
1116
+ "learning_rate": 1.1778568219438839e-05
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 3.52112676056338,
1121
+ "cpu_mem": 2.531987456,
1122
+ "gpu_mem": 1.649105408,
1123
+ "loss": 0.7087,
1124
+ "grad_norm": 7.2722272872924805,
1125
+ "learning_rate": 1.036893770336938e-05
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 3.5492957746478875,
1130
+ "cpu_mem": 2.531987456,
1131
+ "gpu_mem": 1.649056256,
1132
+ "loss": 0.8637,
1133
+ "grad_norm": 7.862452030181885,
1134
+ "learning_rate": 9.046106882113751e-06
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 3.5774647887323945,
1139
+ "cpu_mem": 2.531987456,
1140
+ "gpu_mem": 1.649068544,
1141
+ "loss": 0.7982,
1142
+ "grad_norm": 6.94256591796875,
1143
+ "learning_rate": 7.810898074930243e-06
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 3.6056338028169015,
1148
+ "cpu_mem": 2.531987456,
1149
+ "gpu_mem": 1.64909312,
1150
+ "loss": 0.7816,
1151
+ "grad_norm": 7.385449409484863,
1152
+ "learning_rate": 6.664079132078881e-06
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 3.6338028169014085,
1157
+ "cpu_mem": 2.531987456,
1158
+ "gpu_mem": 1.64905472,
1159
+ "loss": 0.7424,
1160
+ "grad_norm": 8.720227241516113,
1161
+ "learning_rate": 5.606362957498195e-06
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 3.6619718309859155,
1166
+ "cpu_mem": 2.531987456,
1167
+ "gpu_mem": 1.649057792,
1168
+ "loss": 0.7202,
1169
+ "grad_norm": 7.994754314422607,
1170
+ "learning_rate": 4.638407065638322e-06
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 3.6901408450704225,
1175
+ "cpu_mem": 2.531987456,
1176
+ "gpu_mem": 1.64907008,
1177
+ "loss": 0.7348,
1178
+ "grad_norm": 7.094006538391113,
1179
+ "learning_rate": 3.760813172726457e-06
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 3.7183098591549295,
1184
+ "cpu_mem": 2.531987456,
1185
+ "gpu_mem": 1.649034752,
1186
+ "loss": 0.8036,
1187
+ "grad_norm": 7.807676792144775,
1188
+ "learning_rate": 2.9741268227184255e-06
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 3.7464788732394365,
1193
+ "cpu_mem": 2.531987456,
1194
+ "gpu_mem": 1.649076224,
1195
+ "loss": 0.7837,
1196
+ "grad_norm": 8.70398235321045,
1197
+ "learning_rate": 2.2788370481687965e-06
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 3.7746478873239435,
1202
+ "cpu_mem": 2.531987456,
1203
+ "gpu_mem": 1.649091584,
1204
+ "loss": 0.8218,
1205
+ "grad_norm": 8.904121398925781,
1206
+ "learning_rate": 1.6753760662307215e-06
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 3.802816901408451,
1211
+ "cpu_mem": 2.531987456,
1212
+ "gpu_mem": 1.649056256,
1213
+ "loss": 0.8381,
1214
+ "grad_norm": 7.767360210418701,
1215
+ "learning_rate": 1.1641190099741904e-06
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 3.830985915492958,
1220
+ "cpu_mem": 2.531987456,
1221
+ "gpu_mem": 1.649063936,
1222
+ "loss": 0.7854,
1223
+ "grad_norm": 9.733752250671387,
1224
+ "learning_rate": 7.453836951897885e-07
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 3.859154929577465,
1229
+ "cpu_mem": 2.531987456,
1230
+ "gpu_mem": 1.64908544,
1231
+ "loss": 0.6518,
1232
+ "grad_norm": 7.4530439376831055,
1233
+ "learning_rate": 4.194304228229806e-07
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 3.887323943661972,
1238
+ "cpu_mem": 2.531987456,
1239
+ "gpu_mem": 1.649096192,
1240
+ "loss": 0.8439,
1241
+ "grad_norm": 7.9819016456604,
1242
+ "learning_rate": 1.8646181716164831e-07
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 3.915492957746479,
1247
+ "cpu_mem": 2.531987456,
1248
+ "gpu_mem": 1.649088512,
1249
+ "loss": 0.8905,
1250
+ "grad_norm": 9.51168441772461,
1251
+ "learning_rate": 4.662269987756317e-08
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 3.943661971830986,
1256
+ "cpu_mem": 2.531987456,
1257
+ "gpu_mem": 1.649122304,
1258
+ "loss": 0.7906,
1259
+ "grad_norm": 7.949920177459717,
1260
+ "learning_rate": 0.0
1261
+ },
1262
+ {
1263
+ "step": 140,
1264
+ "epoch": 3.943661971830986,
1265
+ "cpu_mem": 2.531987456,
1266
+ "gpu_mem": 1.649122304,
1267
+ "train_runtime": 698.6517,
1268
+ "train_samples_per_second": 12.888,
1269
+ "train_steps_per_second": 0.2,
1270
+ "total_flos": 0.0,
1271
+ "train_loss": 1.2763484124626432
1272
+ }
1273
+ ]
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "enabled_mlp": true,
7
+ "enabled_qkv": [
8
+ "q",
9
+ "k",
10
+ "v"
11
+ ],
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": false,
14
+ "layers_pattern": null,
15
+ "layers_to_transform": null,
16
+ "mixture": false,
17
+ "modules_to_preserve_errors": null,
18
+ "modules_to_quantize": null,
19
+ "modules_to_save": null,
20
+ "onnx_export": false,
21
+ "optimization_level": 3,
22
+ "orthogonal_init": false,
23
+ "peft_type": "MARS",
24
+ "quant_n_bits": 8,
25
+ "r": 2,
26
+ "revision": null,
27
+ "seed": 42,
28
+ "shared_r": 2,
29
+ "target_modules": [
30
+ "down_proj",
31
+ "v_proj",
32
+ "k_proj",
33
+ "up_proj",
34
+ "gate_proj",
35
+ "o_proj",
36
+ "q_proj"
37
+ ],
38
+ "task_type": null,
39
+ "use_bnb": false
40
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
1
+ {
2
+ "task": "boolq",
3
+ "results": 0.6214067278287462
4
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "BOOLQ",
5
+ "dataset_id": "google/boolq",
6
+ "preprocess_id": "boolq_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "mars",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1307064
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 2,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-mars-boolq-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-boolq-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-09-01T20:49:28.340656"
38
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-boolq-r2-a2/training_logs.json ADDED
@@ -0,0 +1,2659 @@
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.006779661016949152,
5
+ "cpu_mem": 2.766127104,
6
+ "gpu_mem": 1.571689472,
7
+ "loss": 8.8586,
8
+ "grad_norm": 337.5168762207031,
9
+ "learning_rate": 9.999999999999999e-06
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.013559322033898305,
14
+ "cpu_mem": 2.820194304,
15
+ "gpu_mem": 1.599364608,
16
+ "loss": 8.9138,
17
+ "grad_norm": 342.534912109375,
18
+ "learning_rate": 1.9999999999999998e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.020338983050847456,
23
+ "cpu_mem": 2.821177344,
24
+ "gpu_mem": 1.5992832,
25
+ "loss": 7.8736,
26
+ "grad_norm": 336.9686584472656,
27
+ "learning_rate": 2.9999999999999997e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.02711864406779661,
32
+ "cpu_mem": 2.821767168,
33
+ "gpu_mem": 1.5992832,
34
+ "loss": 5.8241,
35
+ "grad_norm": 326.6863708496094,
36
+ "learning_rate": 3.9999999999999996e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.03389830508474576,
41
+ "cpu_mem": 2.822356992,
42
+ "gpu_mem": 1.599218688,
43
+ "loss": 3.4317,
44
+ "grad_norm": 263.6181945800781,
45
+ "learning_rate": 4.9999999999999996e-05
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.04067796610169491,
50
+ "cpu_mem": 2.822946816,
51
+ "gpu_mem": 1.599238656,
52
+ "loss": 1.8029,
53
+ "grad_norm": 119.77902221679688,
54
+ "learning_rate": 5.9999999999999995e-05
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.04745762711864407,
59
+ "cpu_mem": 2.82353664,
60
+ "gpu_mem": 1.59929088,
61
+ "loss": 1.015,
62
+ "grad_norm": 51.48716735839844,
63
+ "learning_rate": 7e-05
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.05423728813559322,
68
+ "cpu_mem": 2.824126464,
69
+ "gpu_mem": 1.599376896,
70
+ "loss": 0.6603,
71
+ "grad_norm": 18.964567184448242,
72
+ "learning_rate": 7.999999999999999e-05
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.061016949152542375,
77
+ "cpu_mem": 2.82451968,
78
+ "gpu_mem": 1.599284736,
79
+ "loss": 0.7436,
80
+ "grad_norm": 52.228031158447266,
81
+ "learning_rate": 8.999999999999999e-05
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.06779661016949153,
86
+ "cpu_mem": 2.824912896,
87
+ "gpu_mem": 1.599184896,
88
+ "loss": 1.7585,
89
+ "grad_norm": 246.6007537841797,
90
+ "learning_rate": 9.999999999999999e-05
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.07457627118644068,
95
+ "cpu_mem": 2.825306112,
96
+ "gpu_mem": 1.599289344,
97
+ "loss": 1.7154,
98
+ "grad_norm": 226.10888671875,
99
+ "learning_rate": 0.00010999999999999998
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.08135593220338982,
104
+ "cpu_mem": 2.825895936,
105
+ "gpu_mem": 1.599661056,
106
+ "loss": 0.7456,
107
+ "grad_norm": 54.193077087402344,
108
+ "learning_rate": 0.00011999999999999999
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.08813559322033898,
113
+ "cpu_mem": 2.826289152,
114
+ "gpu_mem": 1.599264768,
115
+ "loss": 1.1313,
116
+ "grad_norm": 95.89151000976562,
117
+ "learning_rate": 0.00013
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.09491525423728814,
122
+ "cpu_mem": 2.826682368,
123
+ "gpu_mem": 1.599241728,
124
+ "loss": 0.9543,
125
+ "grad_norm": 66.62142944335938,
126
+ "learning_rate": 0.00014
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.1016949152542373,
131
+ "cpu_mem": 2.827075584,
132
+ "gpu_mem": 1.599180288,
133
+ "loss": 0.723,
134
+ "grad_norm": 26.305124282836914,
135
+ "learning_rate": 0.00015
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.10847457627118644,
140
+ "cpu_mem": 2.8274688,
141
+ "gpu_mem": 1.599264768,
142
+ "loss": 1.1127,
143
+ "grad_norm": 71.71159362792969,
144
+ "learning_rate": 0.00015999999999999999
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.1152542372881356,
149
+ "cpu_mem": 2.827665408,
150
+ "gpu_mem": 1.599304704,
151
+ "loss": 0.9928,
152
+ "grad_norm": 52.495697021484375,
153
+ "learning_rate": 0.00016999999999999999
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.12203389830508475,
158
+ "cpu_mem": 2.827862016,
159
+ "gpu_mem": 1.59936768,
160
+ "loss": 0.7374,
161
+ "grad_norm": 19.668397903442383,
162
+ "learning_rate": 0.00017999999999999998
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.1288135593220339,
167
+ "cpu_mem": 2.828255232,
168
+ "gpu_mem": 1.599204864,
169
+ "loss": 1.3679,
170
+ "grad_norm": 119.7974853515625,
171
+ "learning_rate": 0.00018999999999999998
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.13559322033898305,
176
+ "cpu_mem": 2.828845056,
177
+ "gpu_mem": 1.599316992,
178
+ "loss": 0.695,
179
+ "grad_norm": 37.70135498046875,
180
+ "learning_rate": 0.00019999999999999998
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.1423728813559322,
185
+ "cpu_mem": 2.829041664,
186
+ "gpu_mem": 1.5994752,
187
+ "loss": 1.024,
188
+ "grad_norm": 114.8195571899414,
189
+ "learning_rate": 0.00020999999999999998
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.14915254237288136,
194
+ "cpu_mem": 2.829631488,
195
+ "gpu_mem": 1.59936768,
196
+ "loss": 1.1406,
197
+ "grad_norm": 117.71730041503906,
198
+ "learning_rate": 0.00021999999999999995
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.15593220338983052,
203
+ "cpu_mem": 2.830024704,
204
+ "gpu_mem": 1.599340032,
205
+ "loss": 0.8975,
206
+ "grad_norm": 107.03048706054688,
207
+ "learning_rate": 0.00023
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.16271186440677965,
212
+ "cpu_mem": 2.830614528,
213
+ "gpu_mem": 1.599396864,
214
+ "loss": 0.6425,
215
+ "grad_norm": 27.74810218811035,
216
+ "learning_rate": 0.00023999999999999998
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.1694915254237288,
221
+ "cpu_mem": 2.830454784,
222
+ "gpu_mem": 1.599181824,
223
+ "loss": 0.954,
224
+ "grad_norm": 82.29080963134766,
225
+ "learning_rate": 0.00025
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.17627118644067796,
230
+ "cpu_mem": 2.831044608,
231
+ "gpu_mem": 1.59923712,
232
+ "loss": 0.7189,
233
+ "grad_norm": 30.92197608947754,
234
+ "learning_rate": 0.00026
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.18305084745762712,
239
+ "cpu_mem": 2.831437824,
240
+ "gpu_mem": 1.59952896,
241
+ "loss": 1.9701,
242
+ "grad_norm": 201.28363037109375,
243
+ "learning_rate": 0.00027
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.18983050847457628,
248
+ "cpu_mem": 2.831425536,
249
+ "gpu_mem": 1.599207936,
250
+ "loss": 1.4237,
251
+ "grad_norm": 129.2787628173828,
252
+ "learning_rate": 0.00028
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.19661016949152543,
257
+ "cpu_mem": 2.831622144,
258
+ "gpu_mem": 1.599272448,
259
+ "loss": 0.653,
260
+ "grad_norm": 11.47395133972168,
261
+ "learning_rate": 0.00029
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.2033898305084746,
266
+ "cpu_mem": 2.83201536,
267
+ "gpu_mem": 1.599350784,
268
+ "loss": 1.6007,
269
+ "grad_norm": 79.31275939941406,
270
+ "learning_rate": 0.0003
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.21016949152542372,
275
+ "cpu_mem": 2.832048128,
276
+ "gpu_mem": 1.599154176,
277
+ "loss": 1.0999,
278
+ "grad_norm": 47.0681037902832,
279
+ "learning_rate": 0.0002999893794250036
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.21694915254237288,
284
+ "cpu_mem": 2.832441344,
285
+ "gpu_mem": 1.59926784,
286
+ "loss": 0.7922,
287
+ "grad_norm": 27.33715057373047,
288
+ "learning_rate": 0.00029995751920396937
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.22372881355932203,
293
+ "cpu_mem": 2.83283456,
294
+ "gpu_mem": 1.59950592,
295
+ "loss": 0.9434,
296
+ "grad_norm": 72.70061492919922,
297
+ "learning_rate": 0.00029990442384854874
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.2305084745762712,
302
+ "cpu_mem": 2.833031168,
303
+ "gpu_mem": 1.599207936,
304
+ "loss": 0.6054,
305
+ "grad_norm": 29.839754104614258,
306
+ "learning_rate": 0.0002998301008774512
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.23728813559322035,
311
+ "cpu_mem": 2.83318272,
312
+ "gpu_mem": 1.599418368,
313
+ "loss": 4.0075,
314
+ "grad_norm": 309.3017578125,
315
+ "learning_rate": 0.0002997345608153792
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 0.2440677966101695,
320
+ "cpu_mem": 2.833313792,
321
+ "gpu_mem": 1.599369216,
322
+ "loss": 2.2427,
323
+ "grad_norm": 199.4197998046875,
324
+ "learning_rate": 0.000299617817191538
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 0.25084745762711863,
329
+ "cpu_mem": 2.8335104,
330
+ "gpu_mem": 1.599180288,
331
+ "loss": 1.5529,
332
+ "grad_norm": 352.44171142578125,
333
+ "learning_rate": 0.0002994798865377198
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 0.2576271186440678,
338
+ "cpu_mem": 2.833903616,
339
+ "gpu_mem": 1.599427584,
340
+ "loss": 1.1383,
341
+ "grad_norm": 84.5745620727539,
342
+ "learning_rate": 0.0002993207883859627
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 0.26440677966101694,
347
+ "cpu_mem": 2.836262912,
348
+ "gpu_mem": 1.599806976,
349
+ "loss": 0.7886,
350
+ "grad_norm": 53.965965270996094,
351
+ "learning_rate": 0.0002991405452657846
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 0.2711864406779661,
356
+ "cpu_mem": 2.83645952,
357
+ "gpu_mem": 1.599376896,
358
+ "loss": 0.7583,
359
+ "grad_norm": 35.12152099609375,
360
+ "learning_rate": 0.00029893918270099324
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 0.27796610169491526,
365
+ "cpu_mem": 2.836656128,
366
+ "gpu_mem": 1.599604224,
367
+ "loss": 0.8062,
368
+ "grad_norm": 34.6225700378418,
369
+ "learning_rate": 0.00029871672920607153
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 0.2847457627118644,
374
+ "cpu_mem": 2.836656128,
375
+ "gpu_mem": 1.599501312,
376
+ "loss": 0.7247,
377
+ "grad_norm": 22.119951248168945,
378
+ "learning_rate": 0.0002984732162821399
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 0.29152542372881357,
383
+ "cpu_mem": 2.836852736,
384
+ "gpu_mem": 1.599323136,
385
+ "loss": 1.1172,
386
+ "grad_norm": 87.2462387084961,
387
+ "learning_rate": 0.0002982086784124952
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 0.2983050847457627,
392
+ "cpu_mem": 2.837049344,
393
+ "gpu_mem": 1.599465984,
394
+ "loss": 0.9574,
395
+ "grad_norm": 60.33451843261719,
396
+ "learning_rate": 0.00029792315305772796
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 0.3050847457627119,
401
+ "cpu_mem": 2.837245952,
402
+ "gpu_mem": 1.599246336,
403
+ "loss": 0.7015,
404
+ "grad_norm": 11.569185256958008,
405
+ "learning_rate": 0.0002976166806504174
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 0.31186440677966104,
410
+ "cpu_mem": 2.83744256,
411
+ "gpu_mem": 1.599489024,
412
+ "loss": 0.7459,
413
+ "grad_norm": 27.62919044494629,
414
+ "learning_rate": 0.00029728930458940595
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 0.31864406779661014,
419
+ "cpu_mem": 2.83744256,
420
+ "gpu_mem": 1.599212544,
421
+ "loss": 0.6521,
422
+ "grad_norm": 8.621322631835938,
423
+ "learning_rate": 0.00029694107123365385
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 0.3254237288135593,
428
+ "cpu_mem": 2.837639168,
429
+ "gpu_mem": 1.599289344,
430
+ "loss": 0.7513,
431
+ "grad_norm": 26.45704460144043,
432
+ "learning_rate": 0.00029657202989567393
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 0.33220338983050846,
437
+ "cpu_mem": 2.837639168,
438
+ "gpu_mem": 1.59930624,
439
+ "loss": 1.1375,
440
+ "grad_norm": 47.635643005371094,
441
+ "learning_rate": 0.00029618223283454893
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 0.3389830508474576,
446
+ "cpu_mem": 2.837835776,
447
+ "gpu_mem": 1.5992448,
448
+ "loss": 0.6763,
449
+ "grad_norm": 10.511567115783691,
450
+ "learning_rate": 0.00029577173524853123
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 0.34576271186440677,
455
+ "cpu_mem": 2.837835776,
456
+ "gpu_mem": 1.599249408,
457
+ "loss": 1.1386,
458
+ "grad_norm": 88.87720489501953,
459
+ "learning_rate": 0.0002953405952672261
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 0.3525423728813559,
464
+ "cpu_mem": 2.838032384,
465
+ "gpu_mem": 1.59932928,
466
+ "loss": 1.0596,
467
+ "grad_norm": 74.58036041259766,
468
+ "learning_rate": 0.0002948888739433602
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 0.3593220338983051,
473
+ "cpu_mem": 2.838032384,
474
+ "gpu_mem": 1.59935232,
475
+ "loss": 0.8116,
476
+ "grad_norm": 49.288570404052734,
477
+ "learning_rate": 0.0002944166352441363
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 0.36610169491525424,
482
+ "cpu_mem": 2.8384256,
483
+ "gpu_mem": 1.599280128,
484
+ "loss": 0.8314,
485
+ "grad_norm": 25.794435501098633,
486
+ "learning_rate": 0.0002939239460421746
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 0.3728813559322034,
491
+ "cpu_mem": 2.8384256,
492
+ "gpu_mem": 1.599550464,
493
+ "loss": 0.6872,
494
+ "grad_norm": 10.4611234664917,
495
+ "learning_rate": 0.00029341087610604337
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 0.37966101694915255,
500
+ "cpu_mem": 2.838622208,
501
+ "gpu_mem": 1.59933696,
502
+ "loss": 0.6765,
503
+ "grad_norm": 2.81551194190979,
504
+ "learning_rate": 0.00029287749809037904
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 0.3864406779661017,
509
+ "cpu_mem": 2.838622208,
510
+ "gpu_mem": 1.599330816,
511
+ "loss": 0.7878,
512
+ "grad_norm": 41.91350555419922,
513
+ "learning_rate": 0.0002923238875255979
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 0.39322033898305087,
518
+ "cpu_mem": 2.838818816,
519
+ "gpu_mem": 1.599226368,
520
+ "loss": 0.721,
521
+ "grad_norm": 37.02594757080078,
522
+ "learning_rate": 0.00029175012280720024
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 0.4,
527
+ "cpu_mem": 2.838818816,
528
+ "gpu_mem": 1.599243264,
529
+ "loss": 0.7227,
530
+ "grad_norm": 16.337053298950195,
531
+ "learning_rate": 0.000291156285184669
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 0.4067796610169492,
536
+ "cpu_mem": 2.838818816,
537
+ "gpu_mem": 1.59933696,
538
+ "loss": 0.612,
539
+ "grad_norm": 2.82547664642334,
540
+ "learning_rate": 0.00029054245874996426
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 0.4135593220338983,
545
+ "cpu_mem": 2.839015424,
546
+ "gpu_mem": 1.599347712,
547
+ "loss": 0.6366,
548
+ "grad_norm": 10.114922523498535,
549
+ "learning_rate": 0.0002899087304256151
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 0.42033898305084744,
554
+ "cpu_mem": 2.839015424,
555
+ "gpu_mem": 1.599335424,
556
+ "loss": 0.7537,
557
+ "grad_norm": 27.83133316040039,
558
+ "learning_rate": 0.0002892551899524109
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 0.4271186440677966,
563
+ "cpu_mem": 2.839015424,
564
+ "gpu_mem": 1.599327744,
565
+ "loss": 0.6195,
566
+ "grad_norm": 14.669090270996094,
567
+ "learning_rate": 0.000288581929876693
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 0.43389830508474575,
572
+ "cpu_mem": 2.839212032,
573
+ "gpu_mem": 1.599257088,
574
+ "loss": 0.6581,
575
+ "grad_norm": 8.827725410461426,
576
+ "learning_rate": 0.0002878890455372498
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 0.4406779661016949,
581
+ "cpu_mem": 2.83940864,
582
+ "gpu_mem": 1.599301632,
583
+ "loss": 0.6777,
584
+ "grad_norm": 3.837571620941162,
585
+ "learning_rate": 0.0002871766350518159
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 0.44745762711864406,
590
+ "cpu_mem": 2.83940864,
591
+ "gpu_mem": 1.599495168,
592
+ "loss": 0.6426,
593
+ "grad_norm": 7.71622896194458,
594
+ "learning_rate": 0.00028644479930317775
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 0.4542372881355932,
599
+ "cpu_mem": 2.83940864,
600
+ "gpu_mem": 1.599204864,
601
+ "loss": 0.6675,
602
+ "grad_norm": 18.91324806213379,
603
+ "learning_rate": 0.00028569364192488803
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 0.4610169491525424,
608
+ "cpu_mem": 2.839605248,
609
+ "gpu_mem": 1.599172608,
610
+ "loss": 0.7491,
611
+ "grad_norm": 29.065805435180664,
612
+ "learning_rate": 0.00028492326928659045
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 0.46779661016949153,
617
+ "cpu_mem": 2.839605248,
618
+ "gpu_mem": 1.599238656,
619
+ "loss": 0.923,
620
+ "grad_norm": 65.54679870605469,
621
+ "learning_rate": 0.00028413379047895665
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 0.4745762711864407,
626
+ "cpu_mem": 2.839605248,
627
+ "gpu_mem": 1.599232512,
628
+ "loss": 0.7946,
629
+ "grad_norm": 43.269840240478516,
630
+ "learning_rate": 0.0002833253172982385
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 0.48135593220338985,
635
+ "cpu_mem": 2.839605248,
636
+ "gpu_mem": 1.599461376,
637
+ "loss": 0.6342,
638
+ "grad_norm": 5.117644309997559,
639
+ "learning_rate": 0.0002824979642304366
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 0.488135593220339,
644
+ "cpu_mem": 2.839801856,
645
+ "gpu_mem": 1.599453696,
646
+ "loss": 0.8442,
647
+ "grad_norm": 36.260414123535156,
648
+ "learning_rate": 0.0002816518484350883
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 0.49491525423728816,
653
+ "cpu_mem": 2.839801856,
654
+ "gpu_mem": 1.599419904,
655
+ "loss": 1.0399,
656
+ "grad_norm": 59.90073776245117,
657
+ "learning_rate": 0.0002807870897286772
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 0.5016949152542373,
662
+ "cpu_mem": 2.839998464,
663
+ "gpu_mem": 1.599280128,
664
+ "loss": 0.6135,
665
+ "grad_norm": 9.273247718811035,
666
+ "learning_rate": 0.0002799038105676658
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 0.5084745762711864,
671
+ "cpu_mem": 2.839998464,
672
+ "gpu_mem": 1.599204864,
673
+ "loss": 0.6221,
674
+ "grad_norm": 26.085264205932617,
675
+ "learning_rate": 0.000279002136031155
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 0.5152542372881356,
680
+ "cpu_mem": 2.839998464,
681
+ "gpu_mem": 1.59914496,
682
+ "loss": 0.8617,
683
+ "grad_norm": 90.78295135498047,
684
+ "learning_rate": 0.00027808219380317216
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 0.5220338983050847,
689
+ "cpu_mem": 2.839998464,
690
+ "gpu_mem": 1.599218688,
691
+ "loss": 0.6092,
692
+ "grad_norm": 36.9515266418457,
693
+ "learning_rate": 0.0002771441141545895
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 0.5288135593220339,
698
+ "cpu_mem": 2.840195072,
699
+ "gpu_mem": 1.599270912,
700
+ "loss": 0.6966,
701
+ "grad_norm": 38.786502838134766,
702
+ "learning_rate": 0.0002761880299246772
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 0.535593220338983,
707
+ "cpu_mem": 2.840195072,
708
+ "gpu_mem": 1.599403008,
709
+ "loss": 0.6799,
710
+ "grad_norm": 16.99456024169922,
711
+ "learning_rate": 0.000275214076502292
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 0.5423728813559322,
716
+ "cpu_mem": 2.840195072,
717
+ "gpu_mem": 1.599293952,
718
+ "loss": 0.6654,
719
+ "grad_norm": 13.605630874633789,
720
+ "learning_rate": 0.0002742223918067056
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 0.5491525423728814,
725
+ "cpu_mem": 2.840195072,
726
+ "gpu_mem": 1.599174144,
727
+ "loss": 0.6601,
728
+ "grad_norm": 6.571524620056152,
729
+ "learning_rate": 0.00027321311626807374
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 0.5559322033898305,
734
+ "cpu_mem": 2.840195072,
735
+ "gpu_mem": 1.599243264,
736
+ "loss": 0.7477,
737
+ "grad_norm": 23.323976516723633,
738
+ "learning_rate": 0.0002721863928075503
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 0.5627118644067797,
743
+ "cpu_mem": 2.84039168,
744
+ "gpu_mem": 1.599343104,
745
+ "loss": 0.8076,
746
+ "grad_norm": 27.96898078918457,
747
+ "learning_rate": 0.000271142366817049
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 0.5694915254237288,
752
+ "cpu_mem": 2.84039168,
753
+ "gpu_mem": 1.59930624,
754
+ "loss": 0.7206,
755
+ "grad_norm": 195.5382843017578,
756
+ "learning_rate": 0.00027008118613865406
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 0.576271186440678,
761
+ "cpu_mem": 2.84039168,
762
+ "gpu_mem": 1.599338496,
763
+ "loss": 0.6898,
764
+ "grad_norm": 21.206653594970703,
765
+ "learning_rate": 0.00026900300104368524
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 0.5830508474576271,
770
+ "cpu_mem": 2.84039168,
771
+ "gpu_mem": 1.599289344,
772
+ "loss": 0.6582,
773
+ "grad_norm": 18.797883987426758,
774
+ "learning_rate": 0.00026790796421141813
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 0.5898305084745763,
779
+ "cpu_mem": 2.84039168,
780
+ "gpu_mem": 1.599297024,
781
+ "loss": 0.6653,
782
+ "grad_norm": 6.30393123626709,
783
+ "learning_rate": 0.00026679623070746325
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 0.5966101694915255,
788
+ "cpu_mem": 2.84039168,
789
+ "gpu_mem": 1.599441408,
790
+ "loss": 0.6226,
791
+ "grad_norm": 4.516998291015625,
792
+ "learning_rate": 0.0002656679579618081
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 0.6033898305084746,
797
+ "cpu_mem": 2.84039168,
798
+ "gpu_mem": 1.599223296,
799
+ "loss": 0.8634,
800
+ "grad_norm": 43.86545181274414,
801
+ "learning_rate": 0.0002645233057465235
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 0.6101694915254238,
806
+ "cpu_mem": 2.84039168,
807
+ "gpu_mem": 1.599277056,
808
+ "loss": 0.7882,
809
+ "grad_norm": 26.65186309814453,
810
+ "learning_rate": 0.00026336243615313873
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 0.6169491525423729,
815
+ "cpu_mem": 2.84039168,
816
+ "gpu_mem": 1.5992448,
817
+ "loss": 0.6243,
818
+ "grad_norm": 5.84904670715332,
819
+ "learning_rate": 0.00026218551356968814
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 0.6237288135593221,
824
+ "cpu_mem": 2.84039168,
825
+ "gpu_mem": 1.599326208,
826
+ "loss": 0.7827,
827
+ "grad_norm": 54.409568786621094,
828
+ "learning_rate": 0.00026099270465743254
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 0.6305084745762712,
833
+ "cpu_mem": 2.840588288,
834
+ "gpu_mem": 1.5991296,
835
+ "loss": 0.7024,
836
+ "grad_norm": 16.766782760620117,
837
+ "learning_rate": 0.0002597841783272588
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 0.6372881355932203,
842
+ "cpu_mem": 2.840588288,
843
+ "gpu_mem": 1.599243264,
844
+ "loss": 0.6775,
845
+ "grad_norm": 22.029199600219727,
846
+ "learning_rate": 0.0002585601057157605
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 0.6440677966101694,
851
+ "cpu_mem": 2.840588288,
852
+ "gpu_mem": 1.599263232,
853
+ "loss": 0.6953,
854
+ "grad_norm": 12.179546356201172,
855
+ "learning_rate": 0.00025732066016100394
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 0.6508474576271186,
860
+ "cpu_mem": 2.840588288,
861
+ "gpu_mem": 1.599301632,
862
+ "loss": 0.6619,
863
+ "grad_norm": 6.7413010597229,
864
+ "learning_rate": 0.00025606601717798207
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 0.6576271186440678,
869
+ "cpu_mem": 2.840588288,
870
+ "gpu_mem": 1.599286272,
871
+ "loss": 0.7353,
872
+ "grad_norm": 15.342774391174316,
873
+ "learning_rate": 0.0002547963544337602
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 0.6644067796610169,
878
+ "cpu_mem": 2.840588288,
879
+ "gpu_mem": 1.59919872,
880
+ "loss": 0.6881,
881
+ "grad_norm": 6.57566499710083,
882
+ "learning_rate": 0.0002535118517223168
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 0.6711864406779661,
887
+ "cpu_mem": 2.840588288,
888
+ "gpu_mem": 1.599148032,
889
+ "loss": 0.9004,
890
+ "grad_norm": 41.970733642578125,
891
+ "learning_rate": 0.00025221269093908365
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 0.6779661016949152,
896
+ "cpu_mem": 2.840530944,
897
+ "gpu_mem": 1.599264768,
898
+ "loss": 0.9706,
899
+ "grad_norm": 46.277584075927734,
900
+ "learning_rate": 0.0002508990560551879
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 0.6847457627118644,
905
+ "cpu_mem": 2.840727552,
906
+ "gpu_mem": 1.599297024,
907
+ "loss": 0.7243,
908
+ "grad_norm": 13.523004531860352,
909
+ "learning_rate": 0.0002495711330914001
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 0.6915254237288135,
914
+ "cpu_mem": 2.840727552,
915
+ "gpu_mem": 1.599330816,
916
+ "loss": 0.6663,
917
+ "grad_norm": 2.6940062046051025,
918
+ "learning_rate": 0.00024822911009179276
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 0.6983050847457627,
923
+ "cpu_mem": 2.84092416,
924
+ "gpu_mem": 1.599381504,
925
+ "loss": 0.6721,
926
+ "grad_norm": 2.2665679454803467,
927
+ "learning_rate": 0.0002468731770971113
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 0.7050847457627119,
932
+ "cpu_mem": 2.84092416,
933
+ "gpu_mem": 1.599286272,
934
+ "loss": 0.7547,
935
+ "grad_norm": 25.09229278564453,
936
+ "learning_rate": 0.0002455035261178632
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 0.711864406779661,
941
+ "cpu_mem": 2.84092416,
942
+ "gpu_mem": 1.599387648,
943
+ "loss": 0.7389,
944
+ "grad_norm": 21.83572006225586,
945
+ "learning_rate": 0.0002441203511071278
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 0.7186440677966102,
950
+ "cpu_mem": 2.84092416,
951
+ "gpu_mem": 1.599338496,
952
+ "loss": 0.6515,
953
+ "grad_norm": 9.572114944458008,
954
+ "learning_rate": 0.00024272384793309077
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 0.7254237288135593,
959
+ "cpu_mem": 2.84092416,
960
+ "gpu_mem": 1.599226368,
961
+ "loss": 0.6767,
962
+ "grad_norm": 10.984984397888184,
963
+ "learning_rate": 0.00024131421435130807
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 0.7322033898305085,
968
+ "cpu_mem": 2.84092416,
969
+ "gpu_mem": 1.599410688,
970
+ "loss": 0.702,
971
+ "grad_norm": 13.317893981933594,
972
+ "learning_rate": 0.00023989164997670202
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 0.7389830508474576,
977
+ "cpu_mem": 2.841120768,
978
+ "gpu_mem": 1.599264768,
979
+ "loss": 0.7188,
980
+ "grad_norm": 12.698904991149902,
981
+ "learning_rate": 0.0002384563562552943
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 0.7457627118644068,
986
+ "cpu_mem": 2.841120768,
987
+ "gpu_mem": 1.59926784,
988
+ "loss": 0.6739,
989
+ "grad_norm": 7.674340724945068,
990
+ "learning_rate": 0.0002370085364356797
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 0.752542372881356,
995
+ "cpu_mem": 2.84092416,
996
+ "gpu_mem": 1.59923712,
997
+ "loss": 0.707,
998
+ "grad_norm": 18.55880355834961,
999
+ "learning_rate": 0.0002355483955402446
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 0.7593220338983051,
1004
+ "cpu_mem": 2.841120768,
1005
+ "gpu_mem": 1.5992832,
1006
+ "loss": 0.6925,
1007
+ "grad_norm": 12.338556289672852,
1008
+ "learning_rate": 0.00023407614033613407
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 0.7661016949152543,
1013
+ "cpu_mem": 2.841120768,
1014
+ "gpu_mem": 1.599273984,
1015
+ "loss": 0.6803,
1016
+ "grad_norm": 3.0032811164855957,
1017
+ "learning_rate": 0.0002325919793059723
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 0.7728813559322034,
1022
+ "cpu_mem": 2.841120768,
1023
+ "gpu_mem": 1.599255552,
1024
+ "loss": 0.6488,
1025
+ "grad_norm": 11.390604972839355,
1026
+ "learning_rate": 0.00023109612261833963
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 0.7796610169491526,
1031
+ "cpu_mem": 2.841120768,
1032
+ "gpu_mem": 1.599330816,
1033
+ "loss": 0.6943,
1034
+ "grad_norm": 8.861191749572754,
1035
+ "learning_rate": 0.0002295887820980112
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 0.7864406779661017,
1040
+ "cpu_mem": 2.841120768,
1041
+ "gpu_mem": 1.599250944,
1042
+ "loss": 0.6321,
1043
+ "grad_norm": 3.3490185737609863,
1044
+ "learning_rate": 0.0002280701711959608
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 0.7932203389830509,
1049
+ "cpu_mem": 2.841120768,
1050
+ "gpu_mem": 1.599141888,
1051
+ "loss": 0.6836,
1052
+ "grad_norm": 10.221349716186523,
1053
+ "learning_rate": 0.00022654050495913495
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 0.8,
1058
+ "cpu_mem": 2.841055232,
1059
+ "gpu_mem": 1.599379968,
1060
+ "loss": 0.6782,
1061
+ "grad_norm": 2.3667240142822266,
1062
+ "learning_rate": 0.000225
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 0.8067796610169492,
1067
+ "cpu_mem": 2.84125184,
1068
+ "gpu_mem": 1.599550464,
1069
+ "loss": 0.761,
1070
+ "grad_norm": 23.628992080688477,
1071
+ "learning_rate": 0.00022344887446586865
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 0.8135593220338984,
1076
+ "cpu_mem": 2.84125184,
1077
+ "gpu_mem": 1.5992832,
1078
+ "loss": 0.7736,
1079
+ "grad_norm": 24.32083511352539,
1080
+ "learning_rate": 0.00022188734800800852
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 0.8203389830508474,
1085
+ "cpu_mem": 2.84125184,
1086
+ "gpu_mem": 1.599310848,
1087
+ "loss": 0.6755,
1088
+ "grad_norm": 14.352532386779785,
1089
+ "learning_rate": 0.00022031564175053754
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 0.8271186440677966,
1094
+ "cpu_mem": 2.84125184,
1095
+ "gpu_mem": 1.599361536,
1096
+ "loss": 0.6097,
1097
+ "grad_norm": 2.115525484085083,
1098
+ "learning_rate": 0.00021873397825911153
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 0.8338983050847457,
1103
+ "cpu_mem": 2.84125184,
1104
+ "gpu_mem": 1.599171072,
1105
+ "loss": 0.7155,
1106
+ "grad_norm": 14.015365600585938,
1107
+ "learning_rate": 0.00021714258150940685
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 0.8406779661016949,
1112
+ "cpu_mem": 2.84123136,
1113
+ "gpu_mem": 1.59961344,
1114
+ "loss": 0.6847,
1115
+ "grad_norm": 7.591170787811279,
1116
+ "learning_rate": 0.0002155416768554039
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 0.847457627118644,
1121
+ "cpu_mem": 2.84123136,
1122
+ "gpu_mem": 1.599340032,
1123
+ "loss": 0.6583,
1124
+ "grad_norm": 14.31331729888916,
1125
+ "learning_rate": 0.00021393149099747523
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 0.8542372881355932,
1130
+ "cpu_mem": 2.84100608,
1131
+ "gpu_mem": 1.599223296,
1132
+ "loss": 0.6587,
1133
+ "grad_norm": 14.070867538452148,
1134
+ "learning_rate": 0.00021231225195028297
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 0.8610169491525423,
1139
+ "cpu_mem": 2.841202688,
1140
+ "gpu_mem": 1.599662592,
1141
+ "loss": 0.7261,
1142
+ "grad_norm": 13.449365615844727,
1143
+ "learning_rate": 0.00021068418901049025
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 0.8677966101694915,
1148
+ "cpu_mem": 2.841391104,
1149
+ "gpu_mem": 1.599438336,
1150
+ "loss": 0.6008,
1151
+ "grad_norm": 7.718184471130371,
1152
+ "learning_rate": 0.0002090475327242912
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 0.8745762711864407,
1157
+ "cpu_mem": 2.841391104,
1158
+ "gpu_mem": 1.599478272,
1159
+ "loss": 0.7293,
1160
+ "grad_norm": 12.1471586227417,
1161
+ "learning_rate": 0.00020740251485476345
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 0.8813559322033898,
1166
+ "cpu_mem": 2.841391104,
1167
+ "gpu_mem": 1.59926016,
1168
+ "loss": 0.6809,
1169
+ "grad_norm": 5.910035133361816,
1170
+ "learning_rate": 0.0002057493683490491
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 0.888135593220339,
1175
+ "cpu_mem": 2.841100288,
1176
+ "gpu_mem": 1.599389184,
1177
+ "loss": 0.6826,
1178
+ "grad_norm": 1.297784447669983,
1179
+ "learning_rate": 0.00020408832730536746
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 0.8949152542372881,
1184
+ "cpu_mem": 2.841100288,
1185
+ "gpu_mem": 1.599470592,
1186
+ "loss": 0.6727,
1187
+ "grad_norm": 6.657660484313965,
1188
+ "learning_rate": 0.00020241962693986476
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 0.9016949152542373,
1193
+ "cpu_mem": 2.841296896,
1194
+ "gpu_mem": 1.599254016,
1195
+ "loss": 0.6447,
1196
+ "grad_norm": 5.083381652832031,
1197
+ "learning_rate": 0.0002007435035533061
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 0.9084745762711864,
1202
+ "cpu_mem": 2.84094464,
1203
+ "gpu_mem": 1.599387648,
1204
+ "loss": 0.9661,
1205
+ "grad_norm": 94.68696594238281,
1206
+ "learning_rate": 0.00019906019449761325
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 0.9152542372881356,
1211
+ "cpu_mem": 2.841141248,
1212
+ "gpu_mem": 1.599410688,
1213
+ "loss": 0.7214,
1214
+ "grad_norm": 21.030242919921875,
1215
+ "learning_rate": 0.00019736993814225374
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 0.9220338983050848,
1220
+ "cpu_mem": 2.841493504,
1221
+ "gpu_mem": 1.599247872,
1222
+ "loss": 0.6816,
1223
+ "grad_norm": 115.34664916992188,
1224
+ "learning_rate": 0.00019567297384048604
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 0.9288135593220339,
1229
+ "cpu_mem": 2.841493504,
1230
+ "gpu_mem": 1.599128064,
1231
+ "loss": 0.7982,
1232
+ "grad_norm": 30.468721389770508,
1233
+ "learning_rate": 0.0001939695418954653
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 0.9355932203389831,
1238
+ "cpu_mem": 2.841493504,
1239
+ "gpu_mem": 1.599309312,
1240
+ "loss": 0.716,
1241
+ "grad_norm": 26.950998306274414,
1242
+ "learning_rate": 0.00019225988352621445
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 0.9423728813559322,
1247
+ "cpu_mem": 2.841493504,
1248
+ "gpu_mem": 1.599207936,
1249
+ "loss": 0.8645,
1250
+ "grad_norm": 102.03352355957031,
1251
+ "learning_rate": 0.00019054424083346592
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 0.9491525423728814,
1256
+ "cpu_mem": 2.841493504,
1257
+ "gpu_mem": 1.59926016,
1258
+ "loss": 0.6994,
1259
+ "grad_norm": 38.76273727416992,
1260
+ "learning_rate": 0.0001888228567653781
1261
+ },
1262
+ {
1263
+ "step": 141,
1264
+ "epoch": 0.9559322033898305,
1265
+ "cpu_mem": 2.841493504,
1266
+ "gpu_mem": 1.599292416,
1267
+ "loss": 2.2689,
1268
+ "grad_norm": 434.62127685546875,
1269
+ "learning_rate": 0.0001870959750831323
1270
+ },
1271
+ {
1272
+ "step": 142,
1273
+ "epoch": 0.9627118644067797,
1274
+ "cpu_mem": 2.841493504,
1275
+ "gpu_mem": 1.599432192,
1276
+ "loss": 1.8453,
1277
+ "grad_norm": 546.8731079101562,
1278
+ "learning_rate": 0.0001853638403264141
1279
+ },
1280
+ {
1281
+ "step": 143,
1282
+ "epoch": 0.9694915254237289,
1283
+ "cpu_mem": 2.841493504,
1284
+ "gpu_mem": 1.599415296,
1285
+ "loss": 0.8864,
1286
+ "grad_norm": 159.02481079101562,
1287
+ "learning_rate": 0.00018362669777878453
1288
+ },
1289
+ {
1290
+ "step": 144,
1291
+ "epoch": 0.976271186440678,
1292
+ "cpu_mem": 2.841493504,
1293
+ "gpu_mem": 1.599607296,
1294
+ "loss": 1.1283,
1295
+ "grad_norm": 232.38580322265625,
1296
+ "learning_rate": 0.00018188479343294648
1297
+ },
1298
+ {
1299
+ "step": 145,
1300
+ "epoch": 0.9830508474576272,
1301
+ "cpu_mem": 2.841493504,
1302
+ "gpu_mem": 1.599318528,
1303
+ "loss": 0.6381,
1304
+ "grad_norm": 9.929734230041504,
1305
+ "learning_rate": 0.0001801383739559098
1306
+ },
1307
+ {
1308
+ "step": 146,
1309
+ "epoch": 0.9898305084745763,
1310
+ "cpu_mem": 2.841452544,
1311
+ "gpu_mem": 1.599353856,
1312
+ "loss": 0.7965,
1313
+ "grad_norm": 61.84294128417969,
1314
+ "learning_rate": 0.0001783876866540615
1315
+ },
1316
+ {
1317
+ "step": 147,
1318
+ "epoch": 0.9966101694915255,
1319
+ "cpu_mem": 2.8413952,
1320
+ "gpu_mem": 1.59925248,
1321
+ "loss": 0.6329,
1322
+ "grad_norm": 5.885165691375732,
1323
+ "learning_rate": 0.00017663297943814552
1324
+ },
1325
+ {
1326
+ "step": 148,
1327
+ "epoch": 1.0033898305084745,
1328
+ "cpu_mem": 2.841583616,
1329
+ "gpu_mem": 1.604665856,
1330
+ "loss": 1.3226,
1331
+ "grad_norm": 76.07792663574219,
1332
+ "learning_rate": 0.0001748745007881561
1333
+ },
1334
+ {
1335
+ "step": 149,
1336
+ "epoch": 1.0101694915254238,
1337
+ "cpu_mem": 2.841583616,
1338
+ "gpu_mem": 1.604601344,
1339
+ "loss": 0.7548,
1340
+ "grad_norm": 26.010759353637695,
1341
+ "learning_rate": 0.00017311249971815185
1342
+ },
1343
+ {
1344
+ "step": 150,
1345
+ "epoch": 1.0169491525423728,
1346
+ "cpu_mem": 2.841583616,
1347
+ "gpu_mem": 1.604438528,
1348
+ "loss": 0.6601,
1349
+ "grad_norm": 0.8983572125434875,
1350
+ "learning_rate": 0.00017134722574099276
1351
+ },
1352
+ {
1353
+ "step": 151,
1354
+ "epoch": 1.023728813559322,
1355
+ "cpu_mem": 2.841378816,
1356
+ "gpu_mem": 1.60451072,
1357
+ "loss": 0.7235,
1358
+ "grad_norm": 8.871695518493652,
1359
+ "learning_rate": 0.00016957892883300775
1360
+ },
1361
+ {
1362
+ "step": 152,
1363
+ "epoch": 1.0305084745762711,
1364
+ "cpu_mem": 2.841276416,
1365
+ "gpu_mem": 1.604546048,
1366
+ "loss": 0.6913,
1367
+ "grad_norm": 3.036161422729492,
1368
+ "learning_rate": 0.00016780785939859576
1369
+ },
1370
+ {
1371
+ "step": 153,
1372
+ "epoch": 1.0372881355932204,
1373
+ "cpu_mem": 2.841464832,
1374
+ "gpu_mem": 1.604570624,
1375
+ "loss": 0.6701,
1376
+ "grad_norm": 11.131294250488281,
1377
+ "learning_rate": 0.00016603426823476693
1378
+ },
1379
+ {
1380
+ "step": 154,
1381
+ "epoch": 1.0440677966101695,
1382
+ "cpu_mem": 2.84166144,
1383
+ "gpu_mem": 1.604532224,
1384
+ "loss": 0.6773,
1385
+ "grad_norm": 2.468815803527832,
1386
+ "learning_rate": 0.00016425840649562736
1387
+ },
1388
+ {
1389
+ "step": 155,
1390
+ "epoch": 1.0508474576271187,
1391
+ "cpu_mem": 2.84166144,
1392
+ "gpu_mem": 1.604753408,
1393
+ "loss": 0.6785,
1394
+ "grad_norm": 2.2420897483825684,
1395
+ "learning_rate": 0.00016248052565681436
1396
+ },
1397
+ {
1398
+ "step": 156,
1399
+ "epoch": 1.0576271186440678,
1400
+ "cpu_mem": 2.84166144,
1401
+ "gpu_mem": 1.604661248,
1402
+ "loss": 0.6721,
1403
+ "grad_norm": 12.167314529418945,
1404
+ "learning_rate": 0.00016070087747988482
1405
+ },
1406
+ {
1407
+ "step": 157,
1408
+ "epoch": 1.064406779661017,
1409
+ "cpu_mem": 2.841612288,
1410
+ "gpu_mem": 1.604567552,
1411
+ "loss": 0.6914,
1412
+ "grad_norm": 3.158642053604126,
1413
+ "learning_rate": 0.00015891971397666464
1414
+ },
1415
+ {
1416
+ "step": 158,
1417
+ "epoch": 1.071186440677966,
1418
+ "cpu_mem": 2.841546752,
1419
+ "gpu_mem": 1.604493824,
1420
+ "loss": 0.6931,
1421
+ "grad_norm": 4.770071506500244,
1422
+ "learning_rate": 0.00015713728737356137
1423
+ },
1424
+ {
1425
+ "step": 159,
1426
+ "epoch": 1.0779661016949154,
1427
+ "cpu_mem": 2.841505792,
1428
+ "gpu_mem": 1.604842496,
1429
+ "loss": 0.6715,
1430
+ "grad_norm": 11.148482322692871,
1431
+ "learning_rate": 0.00015535385007584706
1432
+ },
1433
+ {
1434
+ "step": 160,
1435
+ "epoch": 1.0847457627118644,
1436
+ "cpu_mem": 2.84166144,
1437
+ "gpu_mem": 1.604436992,
1438
+ "loss": 0.6815,
1439
+ "grad_norm": 6.524872303009033,
1440
+ "learning_rate": 0.0001535696546319161
1441
+ },
1442
+ {
1443
+ "step": 161,
1444
+ "epoch": 1.0915254237288137,
1445
+ "cpu_mem": 2.84166144,
1446
+ "gpu_mem": 1.604383232,
1447
+ "loss": 0.6473,
1448
+ "grad_norm": 3.372286081314087,
1449
+ "learning_rate": 0.00015178495369752213
1450
+ },
1451
+ {
1452
+ "step": 162,
1453
+ "epoch": 1.0983050847457627,
1454
+ "cpu_mem": 2.841427968,
1455
+ "gpu_mem": 1.605158912,
1456
+ "loss": 0.6478,
1457
+ "grad_norm": 7.118155002593994,
1458
+ "learning_rate": 0.00015
1459
+ },
1460
+ {
1461
+ "step": 163,
1462
+ "epoch": 1.1050847457627118,
1463
+ "cpu_mem": 2.841624576,
1464
+ "gpu_mem": 1.604635136,
1465
+ "loss": 0.746,
1466
+ "grad_norm": 16.46024513244629,
1467
+ "learning_rate": 0.00014821504630247785
1468
+ },
1469
+ {
1470
+ "step": 164,
1471
+ "epoch": 1.111864406779661,
1472
+ "cpu_mem": 2.84131328,
1473
+ "gpu_mem": 1.604547584,
1474
+ "loss": 0.6521,
1475
+ "grad_norm": 8.345376014709473,
1476
+ "learning_rate": 0.00014643034536808387
1477
+ },
1478
+ {
1479
+ "step": 165,
1480
+ "epoch": 1.11864406779661,
1481
+ "cpu_mem": 2.841509888,
1482
+ "gpu_mem": 1.604496896,
1483
+ "loss": 0.651,
1484
+ "grad_norm": 30.932464599609375,
1485
+ "learning_rate": 0.00014464614992415294
1486
+ },
1487
+ {
1488
+ "step": 166,
1489
+ "epoch": 1.1254237288135593,
1490
+ "cpu_mem": 2.841432064,
1491
+ "gpu_mem": 1.604592128,
1492
+ "loss": 7.5865,
1493
+ "grad_norm": 3689.810546875,
1494
+ "learning_rate": 0.00014286271262643866
1495
+ },
1496
+ {
1497
+ "step": 167,
1498
+ "epoch": 1.1322033898305084,
1499
+ "cpu_mem": 2.841432064,
1500
+ "gpu_mem": 1.604509184,
1501
+ "loss": 1.2947,
1502
+ "grad_norm": 244.8701171875,
1503
+ "learning_rate": 0.00014108028602333536
1504
+ },
1505
+ {
1506
+ "step": 168,
1507
+ "epoch": 1.1389830508474577,
1508
+ "cpu_mem": 2.841419776,
1509
+ "gpu_mem": 1.604527616,
1510
+ "loss": 0.6758,
1511
+ "grad_norm": 11.749220848083496,
1512
+ "learning_rate": 0.00013929912252011516
1513
+ },
1514
+ {
1515
+ "step": 169,
1516
+ "epoch": 1.1457627118644067,
1517
+ "cpu_mem": 2.841419776,
1518
+ "gpu_mem": 1.604615168,
1519
+ "loss": 0.6471,
1520
+ "grad_norm": 7.709426403045654,
1521
+ "learning_rate": 0.00013751947434318564
1522
+ },
1523
+ {
1524
+ "step": 170,
1525
+ "epoch": 1.152542372881356,
1526
+ "cpu_mem": 2.84151808,
1527
+ "gpu_mem": 1.604499968,
1528
+ "loss": 0.6793,
1529
+ "grad_norm": 8.016724586486816,
1530
+ "learning_rate": 0.00013574159350437261
1531
+ },
1532
+ {
1533
+ "step": 171,
1534
+ "epoch": 1.159322033898305,
1535
+ "cpu_mem": 2.841714688,
1536
+ "gpu_mem": 1.604562944,
1537
+ "loss": 0.6958,
1538
+ "grad_norm": 9.63123893737793,
1539
+ "learning_rate": 0.0001339657317652331
1540
+ },
1541
+ {
1542
+ "step": 172,
1543
+ "epoch": 1.1661016949152543,
1544
+ "cpu_mem": 2.841714688,
1545
+ "gpu_mem": 1.604470784,
1546
+ "loss": 0.6948,
1547
+ "grad_norm": 5.182499408721924,
1548
+ "learning_rate": 0.00013219214060140424
1549
+ },
1550
+ {
1551
+ "step": 173,
1552
+ "epoch": 1.1728813559322033,
1553
+ "cpu_mem": 2.841714688,
1554
+ "gpu_mem": 1.604770304,
1555
+ "loss": 0.763,
1556
+ "grad_norm": 24.390262603759766,
1557
+ "learning_rate": 0.00013042107116699228
1558
+ },
1559
+ {
1560
+ "step": 174,
1561
+ "epoch": 1.1796610169491526,
1562
+ "cpu_mem": 2.841714688,
1563
+ "gpu_mem": 1.604493824,
1564
+ "loss": 0.7458,
1565
+ "grad_norm": 16.760892868041992,
1566
+ "learning_rate": 0.00012865277425900724
1567
+ },
1568
+ {
1569
+ "step": 175,
1570
+ "epoch": 1.1864406779661016,
1571
+ "cpu_mem": 2.841714688,
1572
+ "gpu_mem": 1.604460032,
1573
+ "loss": 0.7409,
1574
+ "grad_norm": 18.550966262817383,
1575
+ "learning_rate": 0.00012688750028184818
1576
+ },
1577
+ {
1578
+ "step": 176,
1579
+ "epoch": 1.193220338983051,
1580
+ "cpu_mem": 2.841714688,
1581
+ "gpu_mem": 1.604598272,
1582
+ "loss": 0.6664,
1583
+ "grad_norm": 10.944726943969727,
1584
+ "learning_rate": 0.0001251254992118439
1585
+ },
1586
+ {
1587
+ "step": 177,
1588
+ "epoch": 1.2,
1589
+ "cpu_mem": 2.841714688,
1590
+ "gpu_mem": 1.604696576,
1591
+ "loss": 0.6843,
1592
+ "grad_norm": 6.6840314865112305,
1593
+ "learning_rate": 0.00012336702056185453
1594
+ },
1595
+ {
1596
+ "step": 178,
1597
+ "epoch": 1.2067796610169492,
1598
+ "cpu_mem": 2.841714688,
1599
+ "gpu_mem": 1.604443136,
1600
+ "loss": 0.6698,
1601
+ "grad_norm": 7.334102153778076,
1602
+ "learning_rate": 0.00012161231334593851
1603
+ },
1604
+ {
1605
+ "step": 179,
1606
+ "epoch": 1.2135593220338983,
1607
+ "cpu_mem": 2.841714688,
1608
+ "gpu_mem": 1.604542976,
1609
+ "loss": 0.5671,
1610
+ "grad_norm": 3.5981907844543457,
1611
+ "learning_rate": 0.00011986162604409015
1612
+ },
1613
+ {
1614
+ "step": 180,
1615
+ "epoch": 1.2203389830508475,
1616
+ "cpu_mem": 2.841714688,
1617
+ "gpu_mem": 1.604515328,
1618
+ "loss": 0.7085,
1619
+ "grad_norm": 10.897017478942871,
1620
+ "learning_rate": 0.00011811520656705348
1621
+ },
1622
+ {
1623
+ "step": 181,
1624
+ "epoch": 1.2271186440677966,
1625
+ "cpu_mem": 2.841714688,
1626
+ "gpu_mem": 1.604452352,
1627
+ "loss": 0.6523,
1628
+ "grad_norm": 4.810169219970703,
1629
+ "learning_rate": 0.00011637330222121543
1630
+ },
1631
+ {
1632
+ "step": 182,
1633
+ "epoch": 1.2338983050847459,
1634
+ "cpu_mem": 2.841714688,
1635
+ "gpu_mem": 1.604670464,
1636
+ "loss": 0.7493,
1637
+ "grad_norm": 13.707269668579102,
1638
+ "learning_rate": 0.00011463615967358588
1639
+ },
1640
+ {
1641
+ "step": 183,
1642
+ "epoch": 1.240677966101695,
1643
+ "cpu_mem": 2.841714688,
1644
+ "gpu_mem": 1.604567552,
1645
+ "loss": 0.6761,
1646
+ "grad_norm": 11.904723167419434,
1647
+ "learning_rate": 0.00011290402491686766
1648
+ },
1649
+ {
1650
+ "step": 184,
1651
+ "epoch": 1.2474576271186442,
1652
+ "cpu_mem": 2.841714688,
1653
+ "gpu_mem": 1.604515328,
1654
+ "loss": 0.6724,
1655
+ "grad_norm": 12.823482513427734,
1656
+ "learning_rate": 0.00011117714323462186
1657
+ },
1658
+ {
1659
+ "step": 185,
1660
+ "epoch": 1.2542372881355932,
1661
+ "cpu_mem": 2.841714688,
1662
+ "gpu_mem": 1.604493824,
1663
+ "loss": 0.6527,
1664
+ "grad_norm": 5.67280912399292,
1665
+ "learning_rate": 0.00010945575916653407
1666
+ },
1667
+ {
1668
+ "step": 186,
1669
+ "epoch": 1.2610169491525425,
1670
+ "cpu_mem": 2.841714688,
1671
+ "gpu_mem": 1.60450304,
1672
+ "loss": 0.6587,
1673
+ "grad_norm": 11.067151069641113,
1674
+ "learning_rate": 0.00010774011647378553
1675
+ },
1676
+ {
1677
+ "step": 187,
1678
+ "epoch": 1.2677966101694915,
1679
+ "cpu_mem": 2.841714688,
1680
+ "gpu_mem": 1.604435456,
1681
+ "loss": 0.7083,
1682
+ "grad_norm": 20.79690933227539,
1683
+ "learning_rate": 0.00010603045810453468
1684
+ },
1685
+ {
1686
+ "step": 188,
1687
+ "epoch": 1.2745762711864406,
1688
+ "cpu_mem": 2.841714688,
1689
+ "gpu_mem": 1.604598272,
1690
+ "loss": 0.7213,
1691
+ "grad_norm": 13.631938934326172,
1692
+ "learning_rate": 0.00010432702615951396
1693
+ },
1694
+ {
1695
+ "step": 189,
1696
+ "epoch": 1.2813559322033898,
1697
+ "cpu_mem": 2.841714688,
1698
+ "gpu_mem": 1.604467712,
1699
+ "loss": 0.6798,
1700
+ "grad_norm": 11.0151948928833,
1701
+ "learning_rate": 0.00010263006185774627
1702
+ },
1703
+ {
1704
+ "step": 190,
1705
+ "epoch": 1.288135593220339,
1706
+ "cpu_mem": 2.841714688,
1707
+ "gpu_mem": 1.60458752,
1708
+ "loss": 0.6727,
1709
+ "grad_norm": 23.102136611938477,
1710
+ "learning_rate": 0.00010093980550238675
1711
+ },
1712
+ {
1713
+ "step": 191,
1714
+ "epoch": 1.2949152542372881,
1715
+ "cpu_mem": 2.841714688,
1716
+ "gpu_mem": 1.604406272,
1717
+ "loss": 0.7513,
1718
+ "grad_norm": 35.718666076660156,
1719
+ "learning_rate": 9.925649644669391e-05
1720
+ },
1721
+ {
1722
+ "step": 192,
1723
+ "epoch": 1.3016949152542372,
1724
+ "cpu_mem": 2.841714688,
1725
+ "gpu_mem": 1.604538368,
1726
+ "loss": 0.5849,
1727
+ "grad_norm": 7.0041985511779785,
1728
+ "learning_rate": 9.758037306013526e-05
1729
+ },
1730
+ {
1731
+ "step": 193,
1732
+ "epoch": 1.3084745762711865,
1733
+ "cpu_mem": 2.841714688,
1734
+ "gpu_mem": 1.604512256,
1735
+ "loss": 0.6773,
1736
+ "grad_norm": 10.7228364944458,
1737
+ "learning_rate": 9.591167269463255e-05
1738
+ },
1739
+ {
1740
+ "step": 194,
1741
+ "epoch": 1.3152542372881357,
1742
+ "cpu_mem": 2.841714688,
1743
+ "gpu_mem": 1.604478464,
1744
+ "loss": 0.6433,
1745
+ "grad_norm": 0.7278481721878052,
1746
+ "learning_rate": 9.425063165095088e-05
1747
+ },
1748
+ {
1749
+ "step": 195,
1750
+ "epoch": 1.3220338983050848,
1751
+ "cpu_mem": 2.841714688,
1752
+ "gpu_mem": 1.604582912,
1753
+ "loss": 0.6686,
1754
+ "grad_norm": 0.7510453462600708,
1755
+ "learning_rate": 9.259748514523653e-05
1756
+ },
1757
+ {
1758
+ "step": 196,
1759
+ "epoch": 1.3288135593220338,
1760
+ "cpu_mem": 2.841714688,
1761
+ "gpu_mem": 1.604578304,
1762
+ "loss": 0.6746,
1763
+ "grad_norm": 1.4179329872131348,
1764
+ "learning_rate": 9.095246727570879e-05
1765
+ },
1766
+ {
1767
+ "step": 197,
1768
+ "epoch": 1.335593220338983,
1769
+ "cpu_mem": 2.841714688,
1770
+ "gpu_mem": 1.604436992,
1771
+ "loss": 0.6378,
1772
+ "grad_norm": 9.19565486907959,
1773
+ "learning_rate": 8.931581098950973e-05
1774
+ },
1775
+ {
1776
+ "step": 198,
1777
+ "epoch": 1.3423728813559321,
1778
+ "cpu_mem": 2.841714688,
1779
+ "gpu_mem": 1.604628992,
1780
+ "loss": 0.653,
1781
+ "grad_norm": 0.9654055237770081,
1782
+ "learning_rate": 8.768774804971705e-05
1783
+ },
1784
+ {
1785
+ "step": 199,
1786
+ "epoch": 1.3491525423728814,
1787
+ "cpu_mem": 2.841714688,
1788
+ "gpu_mem": 1.60448,
1789
+ "loss": 0.7071,
1790
+ "grad_norm": 10.333610534667969,
1791
+ "learning_rate": 8.606850900252478e-05
1792
+ },
1793
+ {
1794
+ "step": 200,
1795
+ "epoch": 1.3559322033898304,
1796
+ "cpu_mem": 2.841714688,
1797
+ "gpu_mem": 1.604582912,
1798
+ "loss": 0.6801,
1799
+ "grad_norm": 4.721147537231445,
1800
+ "learning_rate": 8.445832314459608e-05
1801
+ },
1802
+ {
1803
+ "step": 201,
1804
+ "epoch": 1.3627118644067797,
1805
+ "cpu_mem": 2.841714688,
1806
+ "gpu_mem": 1.604785664,
1807
+ "loss": 0.6483,
1808
+ "grad_norm": 9.661271095275879,
1809
+ "learning_rate": 8.285741849059311e-05
1810
+ },
1811
+ {
1812
+ "step": 202,
1813
+ "epoch": 1.3694915254237288,
1814
+ "cpu_mem": 2.841714688,
1815
+ "gpu_mem": 1.60458752,
1816
+ "loss": 0.6669,
1817
+ "grad_norm": 8.098054885864258,
1818
+ "learning_rate": 8.126602174088843e-05
1819
+ },
1820
+ {
1821
+ "step": 203,
1822
+ "epoch": 1.376271186440678,
1823
+ "cpu_mem": 2.841714688,
1824
+ "gpu_mem": 1.604473856,
1825
+ "loss": 0.6202,
1826
+ "grad_norm": 16.55931282043457,
1827
+ "learning_rate": 7.968435824946242e-05
1828
+ },
1829
+ {
1830
+ "step": 204,
1831
+ "epoch": 1.383050847457627,
1832
+ "cpu_mem": 2.841714688,
1833
+ "gpu_mem": 1.60448768,
1834
+ "loss": 0.6397,
1835
+ "grad_norm": 1.496640682220459,
1836
+ "learning_rate": 7.811265199199152e-05
1837
+ },
1838
+ {
1839
+ "step": 205,
1840
+ "epoch": 1.3898305084745763,
1841
+ "cpu_mem": 2.841714688,
1842
+ "gpu_mem": 1.604532224,
1843
+ "loss": 0.739,
1844
+ "grad_norm": 16.293212890625,
1845
+ "learning_rate": 7.655112553413135e-05
1846
+ },
1847
+ {
1848
+ "step": 206,
1849
+ "epoch": 1.3966101694915254,
1850
+ "cpu_mem": 2.841714688,
1851
+ "gpu_mem": 1.604473856,
1852
+ "loss": 0.7008,
1853
+ "grad_norm": 13.643010139465332,
1854
+ "learning_rate": 7.500000000000002e-05
1855
+ },
1856
+ {
1857
+ "step": 207,
1858
+ "epoch": 1.4033898305084747,
1859
+ "cpu_mem": 2.841714688,
1860
+ "gpu_mem": 1.604707328,
1861
+ "loss": 0.6319,
1862
+ "grad_norm": 7.230471611022949,
1863
+ "learning_rate": 7.345949504086507e-05
1864
+ },
1865
+ {
1866
+ "step": 208,
1867
+ "epoch": 1.4101694915254237,
1868
+ "cpu_mem": 2.841714688,
1869
+ "gpu_mem": 1.604738048,
1870
+ "loss": 0.5831,
1871
+ "grad_norm": 0.8496401309967041,
1872
+ "learning_rate": 7.192982880403917e-05
1873
+ },
1874
+ {
1875
+ "step": 209,
1876
+ "epoch": 1.4169491525423727,
1877
+ "cpu_mem": 2.841714688,
1878
+ "gpu_mem": 1.60466432,
1879
+ "loss": 0.7229,
1880
+ "grad_norm": 13.127245903015137,
1881
+ "learning_rate": 7.041121790198881e-05
1882
+ },
1883
+ {
1884
+ "step": 210,
1885
+ "epoch": 1.423728813559322,
1886
+ "cpu_mem": 2.841714688,
1887
+ "gpu_mem": 1.604552192,
1888
+ "loss": 0.6521,
1889
+ "grad_norm": 3.434570074081421,
1890
+ "learning_rate": 6.890387738166041e-05
1891
+ },
1892
+ {
1893
+ "step": 211,
1894
+ "epoch": 1.4305084745762713,
1895
+ "cpu_mem": 2.841714688,
1896
+ "gpu_mem": 1.604501504,
1897
+ "loss": 0.611,
1898
+ "grad_norm": 9.112945556640625,
1899
+ "learning_rate": 6.740802069402771e-05
1900
+ },
1901
+ {
1902
+ "step": 212,
1903
+ "epoch": 1.4372881355932203,
1904
+ "cpu_mem": 2.841714688,
1905
+ "gpu_mem": 1.604470784,
1906
+ "loss": 0.6443,
1907
+ "grad_norm": 6.945525169372559,
1908
+ "learning_rate": 6.592385966386588e-05
1909
+ },
1910
+ {
1911
+ "step": 213,
1912
+ "epoch": 1.4440677966101694,
1913
+ "cpu_mem": 2.841714688,
1914
+ "gpu_mem": 1.604493824,
1915
+ "loss": 0.6578,
1916
+ "grad_norm": 3.5807316303253174,
1917
+ "learning_rate": 6.445160445975536e-05
1918
+ },
1919
+ {
1920
+ "step": 214,
1921
+ "epoch": 1.4508474576271186,
1922
+ "cpu_mem": 2.841714688,
1923
+ "gpu_mem": 1.604576768,
1924
+ "loss": 0.6368,
1925
+ "grad_norm": 3.6909682750701904,
1926
+ "learning_rate": 6.299146356432029e-05
1927
+ },
1928
+ {
1929
+ "step": 215,
1930
+ "epoch": 1.457627118644068,
1931
+ "cpu_mem": 2.841714688,
1932
+ "gpu_mem": 1.604504576,
1933
+ "loss": 0.7277,
1934
+ "grad_norm": 11.636443138122559,
1935
+ "learning_rate": 6.154364374470568e-05
1936
+ },
1937
+ {
1938
+ "step": 216,
1939
+ "epoch": 1.464406779661017,
1940
+ "cpu_mem": 2.841714688,
1941
+ "gpu_mem": 1.604670464,
1942
+ "loss": 0.6483,
1943
+ "grad_norm": 3.597099781036377,
1944
+ "learning_rate": 6.010835002329795e-05
1945
+ },
1946
+ {
1947
+ "step": 217,
1948
+ "epoch": 1.471186440677966,
1949
+ "cpu_mem": 2.841714688,
1950
+ "gpu_mem": 1.604512256,
1951
+ "loss": 0.747,
1952
+ "grad_norm": 14.02973747253418,
1953
+ "learning_rate": 5.8685785648691894e-05
1954
+ },
1955
+ {
1956
+ "step": 218,
1957
+ "epoch": 1.4779661016949153,
1958
+ "cpu_mem": 2.841714688,
1959
+ "gpu_mem": 1.604489216,
1960
+ "loss": 0.6333,
1961
+ "grad_norm": 0.4543326795101166,
1962
+ "learning_rate": 5.72761520669092e-05
1963
+ },
1964
+ {
1965
+ "step": 219,
1966
+ "epoch": 1.4847457627118645,
1967
+ "cpu_mem": 2.841714688,
1968
+ "gpu_mem": 1.604615168,
1969
+ "loss": 0.6522,
1970
+ "grad_norm": 0.9680509567260742,
1971
+ "learning_rate": 5.587964889287218e-05
1972
+ },
1973
+ {
1974
+ "step": 220,
1975
+ "epoch": 1.4915254237288136,
1976
+ "cpu_mem": 2.841714688,
1977
+ "gpu_mem": 1.60464896,
1978
+ "loss": 0.6605,
1979
+ "grad_norm": 0.5167866945266724,
1980
+ "learning_rate": 5.449647388213678e-05
1981
+ },
1982
+ {
1983
+ "step": 221,
1984
+ "epoch": 1.4983050847457626,
1985
+ "cpu_mem": 2.841714688,
1986
+ "gpu_mem": 1.604516864,
1987
+ "loss": 0.6969,
1988
+ "grad_norm": 4.705380439758301,
1989
+ "learning_rate": 5.312682290288869e-05
1990
+ },
1991
+ {
1992
+ "step": 222,
1993
+ "epoch": 1.505084745762712,
1994
+ "cpu_mem": 2.841714688,
1995
+ "gpu_mem": 1.604653568,
1996
+ "loss": 0.6487,
1997
+ "grad_norm": 12.17457389831543,
1998
+ "learning_rate": 5.1770889908207245e-05
1999
+ },
2000
+ {
2001
+ "step": 223,
2002
+ "epoch": 1.5118644067796612,
2003
+ "cpu_mem": 2.841714688,
2004
+ "gpu_mem": 1.604567552,
2005
+ "loss": 0.6575,
2006
+ "grad_norm": 14.76824951171875,
2007
+ "learning_rate": 5.0428866908599864e-05
2008
+ },
2009
+ {
2010
+ "step": 224,
2011
+ "epoch": 1.5186440677966102,
2012
+ "cpu_mem": 2.841714688,
2013
+ "gpu_mem": 1.604532224,
2014
+ "loss": 0.6684,
2015
+ "grad_norm": 5.920445919036865,
2016
+ "learning_rate": 4.9100943944812114e-05
2017
+ },
2018
+ {
2019
+ "step": 225,
2020
+ "epoch": 1.5254237288135593,
2021
+ "cpu_mem": 2.841714688,
2022
+ "gpu_mem": 1.604496896,
2023
+ "loss": 0.6828,
2024
+ "grad_norm": 0.7103701829910278,
2025
+ "learning_rate": 4.778730906091632e-05
2026
+ },
2027
+ {
2028
+ "step": 226,
2029
+ "epoch": 1.5322033898305085,
2030
+ "cpu_mem": 2.841714688,
2031
+ "gpu_mem": 1.604645888,
2032
+ "loss": 0.6435,
2033
+ "grad_norm": 6.378837585449219,
2034
+ "learning_rate": 4.648814827768322e-05
2035
+ },
2036
+ {
2037
+ "step": 227,
2038
+ "epoch": 1.5389830508474578,
2039
+ "cpu_mem": 2.841714688,
2040
+ "gpu_mem": 1.604535296,
2041
+ "loss": 0.6684,
2042
+ "grad_norm": 1.5044411420822144,
2043
+ "learning_rate": 4.5203645566239816e-05
2044
+ },
2045
+ {
2046
+ "step": 228,
2047
+ "epoch": 1.5457627118644068,
2048
+ "cpu_mem": 2.841714688,
2049
+ "gpu_mem": 1.60448,
2050
+ "loss": 0.6922,
2051
+ "grad_norm": 6.228693008422852,
2052
+ "learning_rate": 4.3933982822017876e-05
2053
+ },
2054
+ {
2055
+ "step": 229,
2056
+ "epoch": 1.5525423728813559,
2057
+ "cpu_mem": 2.841714688,
2058
+ "gpu_mem": 1.604421632,
2059
+ "loss": 0.6719,
2060
+ "grad_norm": 1.1901812553405762,
2061
+ "learning_rate": 4.267933983899601e-05
2062
+ },
2063
+ {
2064
+ "step": 230,
2065
+ "epoch": 1.559322033898305,
2066
+ "cpu_mem": 2.841714688,
2067
+ "gpu_mem": 1.604478464,
2068
+ "loss": 0.6769,
2069
+ "grad_norm": 0.3899259865283966,
2070
+ "learning_rate": 4.143989428423947e-05
2071
+ },
2072
+ {
2073
+ "step": 231,
2074
+ "epoch": 1.5661016949152542,
2075
+ "cpu_mem": 2.841714688,
2076
+ "gpu_mem": 1.60475648,
2077
+ "loss": 0.6801,
2078
+ "grad_norm": 0.802622377872467,
2079
+ "learning_rate": 4.0215821672741213e-05
2080
+ },
2081
+ {
2082
+ "step": 232,
2083
+ "epoch": 1.5728813559322035,
2084
+ "cpu_mem": 2.841714688,
2085
+ "gpu_mem": 1.60448,
2086
+ "loss": 0.6751,
2087
+ "grad_norm": 4.701998233795166,
2088
+ "learning_rate": 3.900729534256745e-05
2089
+ },
2090
+ {
2091
+ "step": 233,
2092
+ "epoch": 1.5796610169491525,
2093
+ "cpu_mem": 2.841911296,
2094
+ "gpu_mem": 1.604793344,
2095
+ "loss": 0.6759,
2096
+ "grad_norm": 2.6930110454559326,
2097
+ "learning_rate": 3.781448643031187e-05
2098
+ },
2099
+ {
2100
+ "step": 234,
2101
+ "epoch": 1.5864406779661016,
2102
+ "cpu_mem": 2.841911296,
2103
+ "gpu_mem": 1.604668928,
2104
+ "loss": 0.6777,
2105
+ "grad_norm": 0.6205549836158752,
2106
+ "learning_rate": 3.663756384686127e-05
2107
+ },
2108
+ {
2109
+ "step": 235,
2110
+ "epoch": 1.5932203389830508,
2111
+ "cpu_mem": 2.841911296,
2112
+ "gpu_mem": 1.604424704,
2113
+ "loss": 0.6348,
2114
+ "grad_norm": 8.741998672485352,
2115
+ "learning_rate": 3.547669425347647e-05
2116
+ },
2117
+ {
2118
+ "step": 236,
2119
+ "epoch": 1.6,
2120
+ "cpu_mem": 2.841911296,
2121
+ "gpu_mem": 1.604484608,
2122
+ "loss": 0.6866,
2123
+ "grad_norm": 3.6021006107330322,
2124
+ "learning_rate": 3.433204203819185e-05
2125
+ },
2126
+ {
2127
+ "step": 237,
2128
+ "epoch": 1.6067796610169491,
2129
+ "cpu_mem": 2.841911296,
2130
+ "gpu_mem": 1.604546048,
2131
+ "loss": 0.6392,
2132
+ "grad_norm": 3.156719446182251,
2133
+ "learning_rate": 3.3203769292536764e-05
2134
+ },
2135
+ {
2136
+ "step": 238,
2137
+ "epoch": 1.6135593220338982,
2138
+ "cpu_mem": 2.841911296,
2139
+ "gpu_mem": 1.604547584,
2140
+ "loss": 0.6326,
2141
+ "grad_norm": 2.6189823150634766,
2142
+ "learning_rate": 3.209203578858191e-05
2143
+ },
2144
+ {
2145
+ "step": 239,
2146
+ "epoch": 1.6203389830508474,
2147
+ "cpu_mem": 2.841911296,
2148
+ "gpu_mem": 1.604801024,
2149
+ "loss": 0.7199,
2150
+ "grad_norm": 11.9671630859375,
2151
+ "learning_rate": 3.099699895631474e-05
2152
+ },
2153
+ {
2154
+ "step": 240,
2155
+ "epoch": 1.6271186440677967,
2156
+ "cpu_mem": 2.785878016,
2157
+ "gpu_mem": 1.604450816,
2158
+ "loss": 0.6811,
2159
+ "grad_norm": 7.266372203826904,
2160
+ "learning_rate": 2.9918813861345952e-05
2161
+ },
2162
+ {
2163
+ "step": 241,
2164
+ "epoch": 1.6338983050847458,
2165
+ "cpu_mem": 2.786664448,
2166
+ "gpu_mem": 1.604747264,
2167
+ "loss": 0.6351,
2168
+ "grad_norm": 1.2568522691726685,
2169
+ "learning_rate": 2.885763318295102e-05
2170
+ },
2171
+ {
2172
+ "step": 242,
2173
+ "epoch": 1.6406779661016948,
2174
+ "cpu_mem": 2.787254272,
2175
+ "gpu_mem": 1.604609024,
2176
+ "loss": 0.6544,
2177
+ "grad_norm": 3.119530200958252,
2178
+ "learning_rate": 2.781360719244964e-05
2179
+ },
2180
+ {
2181
+ "step": 243,
2182
+ "epoch": 1.647457627118644,
2183
+ "cpu_mem": 2.788040704,
2184
+ "gpu_mem": 1.604461568,
2185
+ "loss": 0.7334,
2186
+ "grad_norm": 13.527824401855469,
2187
+ "learning_rate": 2.6786883731926306e-05
2188
+ },
2189
+ {
2190
+ "step": 244,
2191
+ "epoch": 1.6542372881355933,
2192
+ "cpu_mem": 2.788630528,
2193
+ "gpu_mem": 1.604601344,
2194
+ "loss": 0.6429,
2195
+ "grad_norm": 2.8774635791778564,
2196
+ "learning_rate": 2.5777608193294396e-05
2197
+ },
2198
+ {
2199
+ "step": 245,
2200
+ "epoch": 1.6610169491525424,
2201
+ "cpu_mem": 2.789220352,
2202
+ "gpu_mem": 1.60448,
2203
+ "loss": 0.64,
2204
+ "grad_norm": 5.759853839874268,
2205
+ "learning_rate": 2.4785923497707956e-05
2206
+ },
2207
+ {
2208
+ "step": 246,
2209
+ "epoch": 1.6677966101694914,
2210
+ "cpu_mem": 2.789810176,
2211
+ "gpu_mem": 1.604573696,
2212
+ "loss": 0.6884,
2213
+ "grad_norm": 4.809885501861572,
2214
+ "learning_rate": 2.38119700753228e-05
2215
+ },
2216
+ {
2217
+ "step": 247,
2218
+ "epoch": 1.6745762711864407,
2219
+ "cpu_mem": 2.790203392,
2220
+ "gpu_mem": 1.604592128,
2221
+ "loss": 0.6753,
2222
+ "grad_norm": 1.8933202028274536,
2223
+ "learning_rate": 2.285588584541047e-05
2224
+ },
2225
+ {
2226
+ "step": 248,
2227
+ "epoch": 1.68135593220339,
2228
+ "cpu_mem": 2.790793216,
2229
+ "gpu_mem": 1.604544512,
2230
+ "loss": 0.6585,
2231
+ "grad_norm": 6.303478717803955,
2232
+ "learning_rate": 2.1917806196827792e-05
2233
+ },
2234
+ {
2235
+ "step": 249,
2236
+ "epoch": 1.688135593220339,
2237
+ "cpu_mem": 2.79138304,
2238
+ "gpu_mem": 1.604450816,
2239
+ "loss": 0.6544,
2240
+ "grad_norm": 8.29896354675293,
2241
+ "learning_rate": 2.0997863968844914e-05
2242
+ },
2243
+ {
2244
+ "step": 250,
2245
+ "epoch": 1.694915254237288,
2246
+ "cpu_mem": 2.791776256,
2247
+ "gpu_mem": 1.604542976,
2248
+ "loss": 0.6402,
2249
+ "grad_norm": 14.609487533569336,
2250
+ "learning_rate": 2.009618943233419e-05
2251
+ },
2252
+ {
2253
+ "step": 251,
2254
+ "epoch": 1.7016949152542373,
2255
+ "cpu_mem": 2.792169472,
2256
+ "gpu_mem": 1.604455424,
2257
+ "loss": 0.6642,
2258
+ "grad_norm": 2.4344687461853027,
2259
+ "learning_rate": 1.921291027132278e-05
2260
+ },
2261
+ {
2262
+ "step": 252,
2263
+ "epoch": 1.7084745762711866,
2264
+ "cpu_mem": 2.792562688,
2265
+ "gpu_mem": 1.604498432,
2266
+ "loss": 0.6778,
2267
+ "grad_norm": 0.48283547163009644,
2268
+ "learning_rate": 1.834815156491165e-05
2269
+ },
2270
+ {
2271
+ "step": 253,
2272
+ "epoch": 1.7152542372881356,
2273
+ "cpu_mem": 2.792955904,
2274
+ "gpu_mem": 1.604691968,
2275
+ "loss": 0.6528,
2276
+ "grad_norm": 2.789032459259033,
2277
+ "learning_rate": 1.750203576956341e-05
2278
+ },
2279
+ {
2280
+ "step": 254,
2281
+ "epoch": 1.7220338983050847,
2282
+ "cpu_mem": 2.793545728,
2283
+ "gpu_mem": 1.60448768,
2284
+ "loss": 0.653,
2285
+ "grad_norm": 1.3657398223876953,
2286
+ "learning_rate": 1.6674682701761493e-05
2287
+ },
2288
+ {
2289
+ "step": 255,
2290
+ "epoch": 1.7288135593220337,
2291
+ "cpu_mem": 2.794135552,
2292
+ "gpu_mem": 1.604644352,
2293
+ "loss": 0.7569,
2294
+ "grad_norm": 17.99268341064453,
2295
+ "learning_rate": 1.5866209521043304e-05
2296
+ },
2297
+ {
2298
+ "step": 256,
2299
+ "epoch": 1.735593220338983,
2300
+ "cpu_mem": 2.79433216,
2301
+ "gpu_mem": 1.604470784,
2302
+ "loss": 0.6473,
2303
+ "grad_norm": 2.1559677124023438,
2304
+ "learning_rate": 1.5076730713409523e-05
2305
+ },
2306
+ {
2307
+ "step": 257,
2308
+ "epoch": 1.7423728813559323,
2309
+ "cpu_mem": 2.794725376,
2310
+ "gpu_mem": 1.604883968,
2311
+ "loss": 0.6438,
2312
+ "grad_norm": 0.6308892965316772,
2313
+ "learning_rate": 1.4306358075111923e-05
2314
+ },
2315
+ {
2316
+ "step": 258,
2317
+ "epoch": 1.7491525423728813,
2318
+ "cpu_mem": 2.795118592,
2319
+ "gpu_mem": 1.604542976,
2320
+ "loss": 0.5939,
2321
+ "grad_norm": 5.338343620300293,
2322
+ "learning_rate": 1.3555200696822232e-05
2323
+ },
2324
+ {
2325
+ "step": 259,
2326
+ "epoch": 1.7559322033898304,
2327
+ "cpu_mem": 2.7953152,
2328
+ "gpu_mem": 1.604460032,
2329
+ "loss": 0.6661,
2330
+ "grad_norm": 4.470885276794434,
2331
+ "learning_rate": 1.2823364948184095e-05
2332
+ },
2333
+ {
2334
+ "step": 260,
2335
+ "epoch": 1.7627118644067796,
2336
+ "cpu_mem": 2.795708416,
2337
+ "gpu_mem": 1.604576768,
2338
+ "loss": 0.6508,
2339
+ "grad_norm": 2.901054859161377,
2340
+ "learning_rate": 1.2110954462750166e-05
2341
+ },
2342
+ {
2343
+ "step": 261,
2344
+ "epoch": 1.769491525423729,
2345
+ "cpu_mem": 2.796101632,
2346
+ "gpu_mem": 1.604532224,
2347
+ "loss": 0.5875,
2348
+ "grad_norm": 7.065019607543945,
2349
+ "learning_rate": 1.1418070123306989e-05
2350
+ },
2351
+ {
2352
+ "step": 262,
2353
+ "epoch": 1.776271186440678,
2354
+ "cpu_mem": 2.79629824,
2355
+ "gpu_mem": 1.604489216,
2356
+ "loss": 0.707,
2357
+ "grad_norm": 10.139660835266113,
2358
+ "learning_rate": 1.0744810047589115e-05
2359
+ },
2360
+ {
2361
+ "step": 263,
2362
+ "epoch": 1.783050847457627,
2363
+ "cpu_mem": 2.796494848,
2364
+ "gpu_mem": 1.60452608,
2365
+ "loss": 0.6425,
2366
+ "grad_norm": 0.6513071060180664,
2367
+ "learning_rate": 1.0091269574384874e-05
2368
+ },
2369
+ {
2370
+ "step": 264,
2371
+ "epoch": 1.7898305084745763,
2372
+ "cpu_mem": 2.796888064,
2373
+ "gpu_mem": 1.604613632,
2374
+ "loss": 0.6336,
2375
+ "grad_norm": 3.408752679824829,
2376
+ "learning_rate": 9.45754125003576e-06
2377
+ },
2378
+ {
2379
+ "step": 265,
2380
+ "epoch": 1.7966101694915255,
2381
+ "cpu_mem": 2.797084672,
2382
+ "gpu_mem": 1.604532224,
2383
+ "loss": 0.6928,
2384
+ "grad_norm": 8.95094108581543,
2385
+ "learning_rate": 8.843714815330987e-06
2386
+ },
2387
+ {
2388
+ "step": 266,
2389
+ "epoch": 1.8033898305084746,
2390
+ "cpu_mem": 2.797477888,
2391
+ "gpu_mem": 1.604747264,
2392
+ "loss": 0.6837,
2393
+ "grad_norm": 5.271203994750977,
2394
+ "learning_rate": 8.249877192799731e-06
2395
+ },
2396
+ {
2397
+ "step": 267,
2398
+ "epoch": 1.8101694915254236,
2399
+ "cpu_mem": 2.797871104,
2400
+ "gpu_mem": 1.604539904,
2401
+ "loss": 0.5781,
2402
+ "grad_norm": 14.758561134338379,
2403
+ "learning_rate": 7.676112474402068e-06
2404
+ },
2405
+ {
2406
+ "step": 268,
2407
+ "epoch": 1.8169491525423729,
2408
+ "cpu_mem": 2.798067712,
2409
+ "gpu_mem": 1.604544512,
2410
+ "loss": 0.7273,
2411
+ "grad_norm": 12.341434478759766,
2412
+ "learning_rate": 7.122501909620926e-06
2413
+ },
2414
+ {
2415
+ "step": 269,
2416
+ "epoch": 1.8237288135593221,
2417
+ "cpu_mem": 2.79826432,
2418
+ "gpu_mem": 1.604555264,
2419
+ "loss": 0.6853,
2420
+ "grad_norm": 5.760283946990967,
2421
+ "learning_rate": 6.5891238939566275e-06
2422
+ },
2423
+ {
2424
+ "step": 270,
2425
+ "epoch": 1.8305084745762712,
2426
+ "cpu_mem": 2.798460928,
2427
+ "gpu_mem": 1.604593664,
2428
+ "loss": 0.676,
2429
+ "grad_norm": 3.037813663482666,
2430
+ "learning_rate": 6.076053957825411e-06
2431
+ },
2432
+ {
2433
+ "step": 271,
2434
+ "epoch": 1.8372881355932202,
2435
+ "cpu_mem": 2.798657536,
2436
+ "gpu_mem": 1.604645888,
2437
+ "loss": 0.6711,
2438
+ "grad_norm": 2.3162472248077393,
2439
+ "learning_rate": 5.583364755863701e-06
2440
+ },
2441
+ {
2442
+ "step": 272,
2443
+ "epoch": 1.8440677966101695,
2444
+ "cpu_mem": 2.798854144,
2445
+ "gpu_mem": 1.604504576,
2446
+ "loss": 0.6924,
2447
+ "grad_norm": 3.9464468955993652,
2448
+ "learning_rate": 5.11112605663977e-06
2449
+ },
2450
+ {
2451
+ "step": 273,
2452
+ "epoch": 1.8508474576271188,
2453
+ "cpu_mem": 2.799050752,
2454
+ "gpu_mem": 1.604384768,
2455
+ "loss": 0.6798,
2456
+ "grad_norm": 1.3663150072097778,
2457
+ "learning_rate": 4.659404732773908e-06
2458
+ },
2459
+ {
2460
+ "step": 274,
2461
+ "epoch": 1.8576271186440678,
2462
+ "cpu_mem": 2.79924736,
2463
+ "gpu_mem": 1.604612096,
2464
+ "loss": 0.6787,
2465
+ "grad_norm": 2.0080456733703613,
2466
+ "learning_rate": 4.228264751468752e-06
2467
+ },
2468
+ {
2469
+ "step": 275,
2470
+ "epoch": 1.8644067796610169,
2471
+ "cpu_mem": 2.799443968,
2472
+ "gpu_mem": 1.60485632,
2473
+ "loss": 0.6584,
2474
+ "grad_norm": 7.9492573738098145,
2475
+ "learning_rate": 3.817767165451041e-06
2476
+ },
2477
+ {
2478
+ "step": 276,
2479
+ "epoch": 1.8711864406779661,
2480
+ "cpu_mem": 2.799837184,
2481
+ "gpu_mem": 1.604516864,
2482
+ "loss": 0.6679,
2483
+ "grad_norm": 9.01130485534668,
2484
+ "learning_rate": 3.4279701043260886e-06
2485
+ },
2486
+ {
2487
+ "step": 277,
2488
+ "epoch": 1.8779661016949154,
2489
+ "cpu_mem": 2.800033792,
2490
+ "gpu_mem": 1.604463104,
2491
+ "loss": 0.6711,
2492
+ "grad_norm": 5.367643356323242,
2493
+ "learning_rate": 3.0589287663461472e-06
2494
+ },
2495
+ {
2496
+ "step": 278,
2497
+ "epoch": 1.8847457627118644,
2498
+ "cpu_mem": 2.8002304,
2499
+ "gpu_mem": 1.60462592,
2500
+ "loss": 0.6726,
2501
+ "grad_norm": 5.745750904083252,
2502
+ "learning_rate": 2.710695410593994e-06
2503
+ },
2504
+ {
2505
+ "step": 279,
2506
+ "epoch": 1.8915254237288135,
2507
+ "cpu_mem": 2.800427008,
2508
+ "gpu_mem": 1.604566016,
2509
+ "loss": 0.6705,
2510
+ "grad_norm": 11.076176643371582,
2511
+ "learning_rate": 2.3833193495825853e-06
2512
+ },
2513
+ {
2514
+ "step": 280,
2515
+ "epoch": 1.8983050847457628,
2516
+ "cpu_mem": 2.800623616,
2517
+ "gpu_mem": 1.604546048,
2518
+ "loss": 0.6734,
2519
+ "grad_norm": 4.151473522186279,
2520
+ "learning_rate": 2.076846942272026e-06
2521
+ },
2522
+ {
2523
+ "step": 281,
2524
+ "epoch": 1.905084745762712,
2525
+ "cpu_mem": 2.800623616,
2526
+ "gpu_mem": 1.604481536,
2527
+ "loss": 0.6599,
2528
+ "grad_norm": 12.471253395080566,
2529
+ "learning_rate": 1.791321587504768e-06
2530
+ },
2531
+ {
2532
+ "step": 282,
2533
+ "epoch": 1.911864406779661,
2534
+ "cpu_mem": 2.800820224,
2535
+ "gpu_mem": 1.60491008,
2536
+ "loss": 0.6949,
2537
+ "grad_norm": 4.582005977630615,
2538
+ "learning_rate": 1.5267837178600972e-06
2539
+ },
2540
+ {
2541
+ "step": 283,
2542
+ "epoch": 1.9186440677966101,
2543
+ "cpu_mem": 2.801016832,
2544
+ "gpu_mem": 1.604616704,
2545
+ "loss": 0.6688,
2546
+ "grad_norm": 8.885772705078125,
2547
+ "learning_rate": 1.2832707939284427e-06
2548
+ },
2549
+ {
2550
+ "step": 284,
2551
+ "epoch": 1.9254237288135592,
2552
+ "cpu_mem": 2.80121344,
2553
+ "gpu_mem": 1.60447232,
2554
+ "loss": 0.6523,
2555
+ "grad_norm": 10.084007263183594,
2556
+ "learning_rate": 1.0608172990067553e-06
2557
+ },
2558
+ {
2559
+ "step": 285,
2560
+ "epoch": 1.9322033898305084,
2561
+ "cpu_mem": 2.801410048,
2562
+ "gpu_mem": 1.60452608,
2563
+ "loss": 0.6893,
2564
+ "grad_norm": 5.492788791656494,
2565
+ "learning_rate": 8.594547342153979e-07
2566
+ },
2567
+ {
2568
+ "step": 286,
2569
+ "epoch": 1.9389830508474577,
2570
+ "cpu_mem": 2.801606656,
2571
+ "gpu_mem": 1.604943872,
2572
+ "loss": 0.6488,
2573
+ "grad_norm": 16.984107971191406,
2574
+ "learning_rate": 6.792116140373116e-07
2575
+ },
2576
+ {
2577
+ "step": 287,
2578
+ "epoch": 1.9457627118644067,
2579
+ "cpu_mem": 2.801606656,
2580
+ "gpu_mem": 1.604713472,
2581
+ "loss": 0.6994,
2582
+ "grad_norm": 7.15690279006958,
2583
+ "learning_rate": 5.201134622801473e-07
2584
+ },
2585
+ {
2586
+ "step": 288,
2587
+ "epoch": 1.9525423728813558,
2588
+ "cpu_mem": 2.801803264,
2589
+ "gpu_mem": 1.604498432,
2590
+ "loss": 0.675,
2591
+ "grad_norm": 2.7328405380249023,
2592
+ "learning_rate": 3.821828084619727e-07
2593
+ },
2594
+ {
2595
+ "step": 289,
2596
+ "epoch": 1.959322033898305,
2597
+ "cpu_mem": 2.801803264,
2598
+ "gpu_mem": 1.604582912,
2599
+ "loss": 0.6456,
2600
+ "grad_norm": 16.189531326293945,
2601
+ "learning_rate": 2.654391846207915e-07
2602
+ },
2603
+ {
2604
+ "step": 290,
2605
+ "epoch": 1.9661016949152543,
2606
+ "cpu_mem": 2.801999872,
2607
+ "gpu_mem": 1.604507648,
2608
+ "loss": 0.6917,
2609
+ "grad_norm": 2.199305295944214,
2610
+ "learning_rate": 1.6989912254880556e-07
2611
+ },
2612
+ {
2613
+ "step": 291,
2614
+ "epoch": 1.9728813559322034,
2615
+ "cpu_mem": 2.801999872,
2616
+ "gpu_mem": 1.604542976,
2617
+ "loss": 0.672,
2618
+ "grad_norm": 2.269022226333618,
2619
+ "learning_rate": 9.557615145123765e-08
2620
+ },
2621
+ {
2622
+ "step": 292,
2623
+ "epoch": 1.9796610169491524,
2624
+ "cpu_mem": 2.80219648,
2625
+ "gpu_mem": 1.60462592,
2626
+ "loss": 0.6531,
2627
+ "grad_norm": 14.249391555786133,
2628
+ "learning_rate": 4.248079603064724e-08
2629
+ },
2630
+ {
2631
+ "step": 293,
2632
+ "epoch": 1.9864406779661017,
2633
+ "cpu_mem": 2.80219648,
2634
+ "gpu_mem": 1.604542976,
2635
+ "loss": 0.69,
2636
+ "grad_norm": 2.4922139644622803,
2637
+ "learning_rate": 1.0620574996372811e-08
2638
+ },
2639
+ {
2640
+ "step": 294,
2641
+ "epoch": 1.993220338983051,
2642
+ "cpu_mem": 2.802393088,
2643
+ "gpu_mem": 1.604569088,
2644
+ "loss": 0.6779,
2645
+ "grad_norm": 0.5828369855880737,
2646
+ "learning_rate": 0.0
2647
+ },
2648
+ {
2649
+ "step": 294,
2650
+ "epoch": 1.993220338983051,
2651
+ "cpu_mem": 2.802393088,
2652
+ "gpu_mem": 1.604569088,
2653
+ "train_runtime": 4555.6589,
2654
+ "train_samples_per_second": 4.139,
2655
+ "train_steps_per_second": 0.065,
2656
+ "total_flos": 0.0,
2657
+ "train_loss": 0.8999996844197617
2658
+ }
2659
+ ]
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "alpha": 4,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 3,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 8,
+ "r": 2,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 2,
+ "target_modules": [
+ "down_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "q_proj"
+ ],
+ "task_type": null,
+ "use_bnb": false
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "hellaswag",
+ "results": 0.24736108344951205
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_id": "TinyLlama/TinyLlama_v1.1",
+ "dataset": {
+ "name": "HELLASWAG",
+ "dataset_id": "Rowan/hellaswag",
+ "preprocess_id": "hellaswag_train_deepeval"
+ },
+ "peft_config": {
+ "method": "mars",
+ "rank": 2,
+ "alpha": 4,
+ "dropout": 0.0,
+ "bias": "none",
+ "target_modules": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "gate_proj",
+ "down_proj",
+ "up_proj"
+ ],
+ "trainable_parameter_count": 1307064
+ },
+ "training_config": {
+ "max_dataset_length": null,
+ "batch_size": 64,
+ "per_device_batch_size": 32,
+ "gradient_accumulation_steps": 2,
+ "learning_rate": 0.0003,
+ "num_epochs": 1,
+ "warmup_ratio": 0.1
+ },
+ "model_name": "TinyLlama_v1.1-mars-hellaswag-r2-a2",
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-hellaswag-r2-a2",
+ "seed": 42,
+ "timestamp": "2025-09-02T01:36:09.474463"
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r2-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "alpha": 64,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 3,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 8,
+ "r": 32,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 32,
+ "target_modules": [
+ "down_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "q_proj"
+ ],
+ "task_type": null,
+ "use_bnb": false
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "hellaswag",
+ "results": 0.8578968333001394
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_id": "TinyLlama/TinyLlama_v1.1",
+ "dataset": {
+ "name": "HELLASWAG",
+ "dataset_id": "Rowan/hellaswag",
+ "preprocess_id": "hellaswag_train_deepeval"
+ },
+ "peft_config": {
+ "method": "mars",
+ "rank": 32,
+ "alpha": 64,
+ "dropout": 0.0,
+ "bias": "none",
+ "target_modules": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "gate_proj",
+ "down_proj",
+ "up_proj"
+ ],
+ "trainable_parameter_count": 21018624
+ },
+ "training_config": {
+ "max_dataset_length": null,
+ "batch_size": 64,
+ "per_device_batch_size": 32,
+ "gradient_accumulation_steps": 2,
+ "learning_rate": 0.0003,
+ "num_epochs": 1,
+ "warmup_ratio": 0.1
+ },
+ "model_name": "TinyLlama_v1.1-mars-hellaswag-r32-a2",
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-hellaswag-r32-a2",
+ "seed": 42,
+ "timestamp": "2025-09-02T16:15:32.728244"
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r32-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "alpha": 16,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 3,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 8,
+ "r": 8,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 8,
+ "target_modules": [
+ "down_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "q_proj"
+ ],
+ "task_type": null,
+ "use_bnb": false
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "hellaswag",
+ "results": 0.8189603664608643
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_id": "TinyLlama/TinyLlama_v1.1",
+ "dataset": {
+ "name": "HELLASWAG",
+ "dataset_id": "Rowan/hellaswag",
+ "preprocess_id": "hellaswag_train_deepeval"
+ },
+ "peft_config": {
+ "method": "mars",
+ "rank": 8,
+ "alpha": 16,
+ "dropout": 0.0,
+ "bias": "none",
+ "target_modules": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "gate_proj",
+ "down_proj",
+ "up_proj"
+ ],
+ "trainable_parameter_count": 5233536
+ },
+ "training_config": {
+ "max_dataset_length": null,
+ "batch_size": 64,
+ "per_device_batch_size": 32,
+ "gradient_accumulation_steps": 2,
+ "learning_rate": 0.0003,
+ "num_epochs": 1,
+ "warmup_ratio": 0.1
+ },
+ "model_name": "TinyLlama_v1.1-mars-hellaswag-r8-a2",
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-hellaswag-r8-a2",
+ "seed": 42,
+ "timestamp": "2025-09-02T08:55:26.870790"
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-hellaswag-r8-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "alpha": 4,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 3,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 8,
+ "r": 2,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 2,
+ "target_modules": [
+ "down_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "q_proj"
+ ],
+ "task_type": null,
+ "use_bnb": false
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "logiqa",
+ "results": 0.27081181574054947
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_id": "TinyLlama/TinyLlama_v1.1",
+ "dataset": {
+ "name": "LOGIQA",
+ "dataset_id": "data/logiqa_train",
+ "preprocess_id": "logiqa_train_deepeval"
+ },
+ "peft_config": {
+ "method": "mars",
+ "rank": 2,
+ "alpha": 4,
+ "dropout": 0.0,
+ "bias": "none",
+ "target_modules": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "gate_proj",
+ "down_proj",
+ "up_proj"
+ ],
+ "trainable_parameter_count": 1307064
+ },
+ "training_config": {
+ "max_dataset_length": null,
+ "batch_size": 64,
+ "per_device_batch_size": 32,
+ "gradient_accumulation_steps": 2,
+ "learning_rate": 0.0003,
+ "num_epochs": 3,
+ "warmup_ratio": 0.1
+ },
+ "model_name": "TinyLlama_v1.1-mars-logiqa-r2-a2",
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-logiqa-r2-a2",
+ "seed": 42,
+ "timestamp": "2025-09-01T22:11:42.023189"
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r2-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "alpha": 16,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 3,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 8,
+ "r": 8,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 8,
+ "target_modules": [
+ "down_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "q_proj"
+ ],
+ "task_type": null,
+ "use_bnb": false
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "logiqa",
+ "results": 0.429043586035943
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
+ {
+ "model_id": "TinyLlama/TinyLlama_v1.1",
+ "dataset": {
+ "name": "LOGIQA",
+ "dataset_id": "data/logiqa_train",
+ "preprocess_id": "logiqa_train_deepeval"
+ },
+ "peft_config": {
+ "method": "mars",
+ "rank": 8,
+ "alpha": 16,
+ "dropout": 0.0,
+ "bias": "none",
+ "target_modules": [
+ "q_proj",
+ "k_proj",
+ "v_proj",
+ "o_proj",
+ "gate_proj",
+ "down_proj",
+ "up_proj"
+ ],
+ "trainable_parameter_count": 5233536
+ },
+ "training_config": {
+ "max_dataset_length": null,
+ "batch_size": 64,
+ "per_device_batch_size": 32,
+ "gradient_accumulation_steps": 2,
+ "learning_rate": 0.0003,
+ "num_epochs": 3,
+ "warmup_ratio": 0.1
+ },
+ "model_name": "TinyLlama_v1.1-mars-logiqa-r8-a2",
+ "output_dir": "./experiment_results/TinyLlama_v1.1-mars-opt3-q8/TinyLlama_v1.1-mars-logiqa-r8-a2",
+ "seed": 42,
+ "timestamp": "2025-09-02T05:31:47.306898"
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-logiqa-r8-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-winogrande-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,40 @@
+ {
+ "alpha": 4,
+ "auto_mapping": null,
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
+ "bias": "none",
+ "enabled_mlp": true,
+ "enabled_qkv": [
+ "q",
+ "k",
+ "v"
+ ],
+ "fan_in_fan_out": false,
+ "inference_mode": false,
+ "layers_pattern": null,
+ "layers_to_transform": null,
+ "mixture": false,
+ "modules_to_preserve_errors": null,
+ "modules_to_quantize": null,
+ "modules_to_save": null,
+ "onnx_export": false,
+ "optimization_level": 3,
+ "orthogonal_init": false,
+ "peft_type": "MARS",
+ "quant_n_bits": 8,
+ "r": 2,
+ "revision": null,
+ "seed": 42,
+ "shared_r": 2,
+ "target_modules": [
+ "down_proj",
+ "v_proj",
+ "k_proj",
+ "up_proj",
+ "gate_proj",
+ "o_proj",
+ "q_proj"
+ ],
+ "task_type": null,
+ "use_bnb": false
+ }
TinyLlama_v1.1-mars-opt0-q8/TinyLlama_v1.1-mars-winogrande-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
+ {
+ "task": "winogrande",
+ "results": 0.5043409629044988
+ }