martinkorelic commited on
Commit
66e58a5
·
verified ·
1 Parent(s): 21935ca

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/adapter_config.json +30 -0
  2. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/eval_results.json +4 -0
  3. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/training_configuration.json +38 -0
  4. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/training_logs.json +625 -0
  5. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/adapter_config.json +30 -0
  6. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/eval_results.json +4 -0
  7. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/training_configuration.json +38 -0
  8. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/training_logs.json +625 -0
  9. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/adapter_config.json +30 -0
  10. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/eval_results.json +4 -0
  11. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/training_configuration.json +38 -0
  12. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/training_logs.json +625 -0
  13. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/adapter_config.json +30 -0
  14. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/eval_results.json +4 -0
  15. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/training_configuration.json +38 -0
  16. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/training_logs.json +1273 -0
  17. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/adapter_config.json +30 -0
  18. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/eval_results.json +4 -0
  19. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/training_configuration.json +38 -0
  20. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/training_logs.json +1273 -0
  21. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/adapter_config.json +30 -0
  22. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/eval_results.json +4 -0
  23. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/training_configuration.json +38 -0
  24. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/training_logs.json +1273 -0
  25. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/adapter_config.json +30 -0
  26. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/eval_results.json +4 -0
  27. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/training_configuration.json +38 -0
  28. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/training_logs.json +2659 -0
  29. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/adapter_config.json +30 -0
  30. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/eval_results.json +4 -0
  31. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/training_configuration.json +38 -0
  32. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/training_logs.json +2659 -0
  33. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/adapter_config.json +30 -0
  34. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/eval_results.json +4 -0
  35. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/training_configuration.json +38 -0
  36. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/training_logs.json +2659 -0
  37. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/adapter_config.json +30 -0
  38. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/eval_results.json +4 -0
  39. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/training_configuration.json +38 -0
  40. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/training_logs.json +0 -0
  41. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/adapter_config.json +30 -0
  42. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/eval_results.json +4 -0
  43. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/training_configuration.json +38 -0
  44. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/training_logs.json +0 -0
  45. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/adapter_config.json +30 -0
  46. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/eval_results.json +4 -0
  47. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/training_configuration.json +38 -0
  48. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/training_logs.json +0 -0
  49. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/adapter_config.json +30 -0
  50. TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/eval_results.json +4 -0
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 2,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.4513651877133106
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1577576
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-arc_c-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T16:35:26.588040"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r2-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 1.48738048,
6
+ "gpu_mem": 4.423850496,
7
+ "loss": 4.4614,
8
+ "grad_norm": 329.5343017578125,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 1.492885504,
15
+ "gpu_mem": 4.436614144,
16
+ "loss": 4.6994,
17
+ "grad_norm": 335.7124328613281,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 1.492885504,
24
+ "gpu_mem": 4.436644864,
25
+ "loss": 2.1292,
26
+ "grad_norm": 166.02584838867188,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 1.493082112,
33
+ "gpu_mem": 4.436611072,
34
+ "loss": 1.5628,
35
+ "grad_norm": 19.919021606445312,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 1.493082112,
42
+ "gpu_mem": 4.436598784,
43
+ "loss": 1.4114,
44
+ "grad_norm": 11.660603523254395,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 1.493082112,
51
+ "gpu_mem": 4.43666176,
52
+ "loss": 1.4434,
53
+ "grad_norm": 21.82590675354004,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 1.493082112,
60
+ "gpu_mem": 4.436667904,
61
+ "loss": 1.5455,
62
+ "grad_norm": 18.506698608398438,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 1.493082112,
69
+ "gpu_mem": 4.436626432,
70
+ "loss": 1.3713,
71
+ "grad_norm": 6.629955768585205,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 1.493082112,
78
+ "gpu_mem": 4.436621824,
79
+ "loss": 1.3708,
80
+ "grad_norm": 20.72789192199707,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 1.493082112,
87
+ "gpu_mem": 4.436611072,
88
+ "loss": 1.4826,
89
+ "grad_norm": 16.597583770751953,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 1.493082112,
96
+ "gpu_mem": 4.436621824,
97
+ "loss": 1.3793,
98
+ "grad_norm": 8.454121589660645,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 1.493082112,
105
+ "gpu_mem": 4.4366464,
106
+ "loss": 1.4662,
107
+ "grad_norm": 15.033178329467773,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 1.493082112,
114
+ "gpu_mem": 4.4366464,
115
+ "loss": 1.3175,
116
+ "grad_norm": 9.481575965881348,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 1.493082112,
123
+ "gpu_mem": 4.436594176,
124
+ "loss": 1.4611,
125
+ "grad_norm": 11.892037391662598,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 1.493082112,
132
+ "gpu_mem": 4.43666944,
133
+ "loss": 1.3686,
134
+ "grad_norm": 5.933671474456787,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 1.493082112,
141
+ "gpu_mem": 4.436663296,
142
+ "loss": 1.6435,
143
+ "grad_norm": 23.32131576538086,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 1.493082112,
150
+ "gpu_mem": 4.436667904,
151
+ "loss": 1.6087,
152
+ "grad_norm": 17.27324676513672,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 1.493082112,
159
+ "gpu_mem": 4.443011584,
160
+ "loss": 2.1751,
161
+ "grad_norm": 17.91927719116211,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 1.493082112,
168
+ "gpu_mem": 4.443010048,
169
+ "loss": 1.4158,
170
+ "grad_norm": 5.83022928237915,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 1.493082112,
177
+ "gpu_mem": 4.442985472,
178
+ "loss": 1.3469,
179
+ "grad_norm": 6.347321510314941,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 1.493082112,
186
+ "gpu_mem": 4.442993152,
187
+ "loss": 1.3926,
188
+ "grad_norm": 6.69634485244751,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 1.493082112,
195
+ "gpu_mem": 4.443022336,
196
+ "loss": 1.4088,
197
+ "grad_norm": 9.436979293823242,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 1.493082112,
204
+ "gpu_mem": 4.44305152,
205
+ "loss": 1.3046,
206
+ "grad_norm": 3.0805585384368896,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 1.493082112,
213
+ "gpu_mem": 4.442994688,
214
+ "loss": 1.3668,
215
+ "grad_norm": 5.27305269241333,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 1.493082112,
222
+ "gpu_mem": 4.443063808,
223
+ "loss": 1.3333,
224
+ "grad_norm": 5.636720180511475,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 1.493082112,
231
+ "gpu_mem": 4.4430208,
232
+ "loss": 1.3928,
233
+ "grad_norm": 6.520215034484863,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 1.493082112,
240
+ "gpu_mem": 4.442979328,
241
+ "loss": 1.4101,
242
+ "grad_norm": 8.386139869689941,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 1.493082112,
249
+ "gpu_mem": 4.443025408,
250
+ "loss": 1.5088,
251
+ "grad_norm": 14.999929428100586,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 1.493082112,
258
+ "gpu_mem": 4.4430208,
259
+ "loss": 1.3617,
260
+ "grad_norm": 5.525674343109131,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 1.493082112,
267
+ "gpu_mem": 4.443010048,
268
+ "loss": 1.3591,
269
+ "grad_norm": 5.05485200881958,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 1.493082112,
276
+ "gpu_mem": 4.443040768,
277
+ "loss": 1.3461,
278
+ "grad_norm": 4.573202133178711,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 1.493082112,
285
+ "gpu_mem": 4.443049984,
286
+ "loss": 1.3656,
287
+ "grad_norm": 5.207383632659912,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 1.493082112,
294
+ "gpu_mem": 4.443030016,
295
+ "loss": 1.3748,
296
+ "grad_norm": 5.258096694946289,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 1.493082112,
303
+ "gpu_mem": 4.443008512,
304
+ "loss": 1.3884,
305
+ "grad_norm": 5.586267471313477,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 1.493082112,
312
+ "gpu_mem": 4.442896384,
313
+ "loss": 2.0668,
314
+ "grad_norm": 7.811915874481201,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 1.493082112,
321
+ "gpu_mem": 4.436640256,
322
+ "loss": 1.3123,
323
+ "grad_norm": 3.8615543842315674,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 1.493082112,
330
+ "gpu_mem": 4.436649472,
331
+ "loss": 1.3892,
332
+ "grad_norm": 8.06524658203125,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 1.493082112,
339
+ "gpu_mem": 4.436620288,
340
+ "loss": 1.336,
341
+ "grad_norm": 4.97226095199585,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 1.493082112,
348
+ "gpu_mem": 4.43663872,
349
+ "loss": 1.3288,
350
+ "grad_norm": 4.268495559692383,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 1.493082112,
357
+ "gpu_mem": 4.43661568,
358
+ "loss": 1.3649,
359
+ "grad_norm": 5.184421062469482,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 1.493082112,
366
+ "gpu_mem": 4.436617216,
367
+ "loss": 1.3795,
368
+ "grad_norm": 5.0870513916015625,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 1.493082112,
375
+ "gpu_mem": 4.4366464,
376
+ "loss": 1.3246,
377
+ "grad_norm": 6.057374000549316,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 1.493082112,
384
+ "gpu_mem": 4.43666176,
385
+ "loss": 1.3699,
386
+ "grad_norm": 8.358153343200684,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 1.493082112,
393
+ "gpu_mem": 4.436680192,
394
+ "loss": 1.3074,
395
+ "grad_norm": 4.818901062011719,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 1.493082112,
402
+ "gpu_mem": 4.436634112,
403
+ "loss": 1.2994,
404
+ "grad_norm": 5.064252853393555,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 1.493082112,
411
+ "gpu_mem": 4.436627968,
412
+ "loss": 1.2786,
413
+ "grad_norm": 4.9510273933410645,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 1.493082112,
420
+ "gpu_mem": 4.436621824,
421
+ "loss": 1.2801,
422
+ "grad_norm": 4.4370856285095215,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 1.493082112,
429
+ "gpu_mem": 4.436626432,
430
+ "loss": 1.2029,
431
+ "grad_norm": 4.3672943115234375,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 1.493082112,
438
+ "gpu_mem": 4.436617216,
439
+ "loss": 1.2484,
440
+ "grad_norm": 3.859243154525757,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 1.493082112,
447
+ "gpu_mem": 4.436598784,
448
+ "loss": 1.3265,
449
+ "grad_norm": 7.188536643981934,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 1.493082112,
456
+ "gpu_mem": 4.43662336,
457
+ "loss": 1.2704,
458
+ "grad_norm": 5.372682571411133,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 1.493082112,
465
+ "gpu_mem": 4.436651008,
466
+ "loss": 1.3735,
467
+ "grad_norm": 8.921072959899902,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 1.493082112,
474
+ "gpu_mem": 4.443006976,
475
+ "loss": 1.832,
476
+ "grad_norm": 8.732340812683105,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 1.493082112,
483
+ "gpu_mem": 4.442976256,
484
+ "loss": 1.3082,
485
+ "grad_norm": 7.223543167114258,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 1.493082112,
492
+ "gpu_mem": 4.443010048,
493
+ "loss": 1.3045,
494
+ "grad_norm": 8.533186912536621,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 1.493082112,
501
+ "gpu_mem": 4.443083776,
502
+ "loss": 1.2201,
503
+ "grad_norm": 5.6041579246521,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 1.493082112,
510
+ "gpu_mem": 4.443026944,
511
+ "loss": 1.2386,
512
+ "grad_norm": 5.491722583770752,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 1.493082112,
519
+ "gpu_mem": 4.4430208,
520
+ "loss": 1.142,
521
+ "grad_norm": 5.923037052154541,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 1.493082112,
528
+ "gpu_mem": 4.443071488,
529
+ "loss": 1.2387,
530
+ "grad_norm": 4.976379871368408,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 1.493082112,
537
+ "gpu_mem": 4.44299776,
538
+ "loss": 1.2988,
539
+ "grad_norm": 7.554241180419922,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 1.493082112,
546
+ "gpu_mem": 4.443011584,
547
+ "loss": 1.2697,
548
+ "grad_norm": 6.907799243927002,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 1.493082112,
555
+ "gpu_mem": 4.44301312,
556
+ "loss": 1.2166,
557
+ "grad_norm": 5.639773845672607,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 1.493082112,
564
+ "gpu_mem": 4.443002368,
565
+ "loss": 1.2443,
566
+ "grad_norm": 6.010934829711914,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 1.493082112,
573
+ "gpu_mem": 4.443019264,
574
+ "loss": 1.2085,
575
+ "grad_norm": 6.853599548339844,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 1.493082112,
582
+ "gpu_mem": 4.443040768,
583
+ "loss": 1.2106,
584
+ "grad_norm": 6.083324909210205,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 1.493082112,
591
+ "gpu_mem": 4.443031552,
592
+ "loss": 1.1881,
593
+ "grad_norm": 6.191532611846924,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 1.493082112,
600
+ "gpu_mem": 4.443057664,
601
+ "loss": 1.2379,
602
+ "grad_norm": 6.098973751068115,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 1.493082112,
609
+ "gpu_mem": 4.443008512,
610
+ "loss": 1.2206,
611
+ "grad_norm": 6.373558044433594,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 1.493082112,
618
+ "gpu_mem": 4.443008512,
619
+ "train_runtime": 374.6823,
620
+ "train_samples_per_second": 11.946,
621
+ "train_steps_per_second": 0.181,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.4825304623912363
624
+ }
625
+ ]
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 32,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.628839590443686
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 25389056
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-arc_c-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T06:31:01.002762"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r32-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 1.496281088,
6
+ "gpu_mem": 4.519020032,
7
+ "loss": 4.4614,
8
+ "grad_norm": 280.62310791015625,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 1.501589504,
15
+ "gpu_mem": 4.722122752,
16
+ "loss": 4.6994,
17
+ "grad_norm": 286.9012451171875,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 1.501786112,
24
+ "gpu_mem": 4.722153472,
25
+ "loss": 2.1324,
26
+ "grad_norm": 415.12750244140625,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 1.501786112,
33
+ "gpu_mem": 4.72211968,
34
+ "loss": 1.7543,
35
+ "grad_norm": 44.261512756347656,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 1.501786112,
42
+ "gpu_mem": 4.722107392,
43
+ "loss": 1.508,
44
+ "grad_norm": 22.360448837280273,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 1.501786112,
51
+ "gpu_mem": 4.722170368,
52
+ "loss": 1.4382,
53
+ "grad_norm": 9.388525009155273,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 1.50198272,
60
+ "gpu_mem": 4.722176512,
61
+ "loss": 1.4429,
62
+ "grad_norm": 13.978992462158203,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 1.50198272,
69
+ "gpu_mem": 4.72213504,
70
+ "loss": 1.564,
71
+ "grad_norm": 15.369060516357422,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 1.50198272,
78
+ "gpu_mem": 4.722130432,
79
+ "loss": 1.5394,
80
+ "grad_norm": 17.35812759399414,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 1.50198272,
87
+ "gpu_mem": 4.72211968,
88
+ "loss": 1.4216,
89
+ "grad_norm": 7.401285648345947,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 1.50198272,
96
+ "gpu_mem": 4.722130432,
97
+ "loss": 1.7083,
98
+ "grad_norm": 20.328474044799805,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 1.50198272,
105
+ "gpu_mem": 4.722155008,
106
+ "loss": 1.3558,
107
+ "grad_norm": 2.454993724822998,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 1.50198272,
114
+ "gpu_mem": 4.722155008,
115
+ "loss": 1.5307,
116
+ "grad_norm": 12.548927307128906,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 1.50198272,
123
+ "gpu_mem": 4.722102784,
124
+ "loss": 1.4728,
125
+ "grad_norm": 6.753164768218994,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 1.50198272,
132
+ "gpu_mem": 4.722178048,
133
+ "loss": 1.4047,
134
+ "grad_norm": 5.13401460647583,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 1.50198272,
141
+ "gpu_mem": 4.722171904,
142
+ "loss": 1.7088,
143
+ "grad_norm": 15.40377140045166,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 1.50198272,
150
+ "gpu_mem": 4.722176512,
151
+ "loss": 1.6083,
152
+ "grad_norm": 22.233034133911133,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 1.50198272,
159
+ "gpu_mem": 4.823689728,
160
+ "loss": 2.0617,
161
+ "grad_norm": 4.666072845458984,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 1.50198272,
168
+ "gpu_mem": 4.823688192,
169
+ "loss": 1.3905,
170
+ "grad_norm": 4.700724124908447,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 1.50198272,
177
+ "gpu_mem": 4.823663616,
178
+ "loss": 1.3482,
179
+ "grad_norm": 5.293838024139404,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 1.50198272,
186
+ "gpu_mem": 4.823671296,
187
+ "loss": 1.4783,
188
+ "grad_norm": 6.684973239898682,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 1.50198272,
195
+ "gpu_mem": 4.82370048,
196
+ "loss": 1.3886,
197
+ "grad_norm": 4.591466426849365,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 1.50198272,
204
+ "gpu_mem": 4.823729664,
205
+ "loss": 1.3456,
206
+ "grad_norm": 3.4432966709136963,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 1.50198272,
213
+ "gpu_mem": 4.823672832,
214
+ "loss": 1.3971,
215
+ "grad_norm": 4.504514694213867,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 1.50198272,
222
+ "gpu_mem": 4.823741952,
223
+ "loss": 1.3702,
224
+ "grad_norm": 4.352265357971191,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 1.50198272,
231
+ "gpu_mem": 4.823698944,
232
+ "loss": 1.3863,
233
+ "grad_norm": 3.940533399581909,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 1.50198272,
240
+ "gpu_mem": 4.823657472,
241
+ "loss": 1.3904,
242
+ "grad_norm": 3.843360662460327,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 1.50198272,
249
+ "gpu_mem": 4.823703552,
250
+ "loss": 1.6316,
251
+ "grad_norm": 9.981597900390625,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 1.50198272,
258
+ "gpu_mem": 4.823698944,
259
+ "loss": 1.5115,
260
+ "grad_norm": 6.392779350280762,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 1.50198272,
267
+ "gpu_mem": 4.823688192,
268
+ "loss": 1.4569,
269
+ "grad_norm": 6.152426242828369,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 1.50198272,
276
+ "gpu_mem": 4.823718912,
277
+ "loss": 1.3658,
278
+ "grad_norm": 3.233621835708618,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 1.50198272,
285
+ "gpu_mem": 4.823728128,
286
+ "loss": 1.3712,
287
+ "grad_norm": 2.499112129211426,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 1.50198272,
294
+ "gpu_mem": 4.82370816,
295
+ "loss": 1.4401,
296
+ "grad_norm": 4.652753829956055,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 1.50198272,
303
+ "gpu_mem": 4.823686656,
304
+ "loss": 1.4205,
305
+ "grad_norm": 3.1102354526519775,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 1.50198272,
312
+ "gpu_mem": 4.823574528,
313
+ "loss": 2.0597,
314
+ "grad_norm": 1.4106764793395996,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 1.50198272,
321
+ "gpu_mem": 4.722148864,
322
+ "loss": 1.3446,
323
+ "grad_norm": 2.1405179500579834,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 1.50198272,
330
+ "gpu_mem": 4.72215808,
331
+ "loss": 1.4145,
332
+ "grad_norm": 3.9450926780700684,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 1.50198272,
339
+ "gpu_mem": 4.722128896,
340
+ "loss": 1.3416,
341
+ "grad_norm": 2.131178617477417,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 1.50198272,
348
+ "gpu_mem": 4.722147328,
349
+ "loss": 1.3367,
350
+ "grad_norm": 2.133847713470459,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 1.50198272,
357
+ "gpu_mem": 4.722124288,
358
+ "loss": 1.3975,
359
+ "grad_norm": 3.288180112838745,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 1.50198272,
366
+ "gpu_mem": 4.722125824,
367
+ "loss": 1.3958,
368
+ "grad_norm": 3.1248035430908203,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 1.50198272,
375
+ "gpu_mem": 4.722155008,
376
+ "loss": 1.335,
377
+ "grad_norm": 3.862166404724121,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 1.50198272,
384
+ "gpu_mem": 4.722170368,
385
+ "loss": 1.3372,
386
+ "grad_norm": 3.2153024673461914,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 1.50198272,
393
+ "gpu_mem": 4.7221888,
394
+ "loss": 1.2856,
395
+ "grad_norm": 1.854359745979309,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 1.50198272,
402
+ "gpu_mem": 4.72214272,
403
+ "loss": 1.322,
404
+ "grad_norm": 2.8490543365478516,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 1.50198272,
411
+ "gpu_mem": 4.722136576,
412
+ "loss": 1.2721,
413
+ "grad_norm": 2.5255608558654785,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 1.50198272,
420
+ "gpu_mem": 4.722130432,
421
+ "loss": 1.2753,
422
+ "grad_norm": 2.5269887447357178,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 1.50198272,
429
+ "gpu_mem": 4.72213504,
430
+ "loss": 1.2814,
431
+ "grad_norm": 3.4005813598632812,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 1.50198272,
438
+ "gpu_mem": 4.722125824,
439
+ "loss": 1.2355,
440
+ "grad_norm": 2.77209734916687,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 1.50198272,
447
+ "gpu_mem": 4.722107392,
448
+ "loss": 1.2617,
449
+ "grad_norm": 2.584846258163452,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 1.50198272,
456
+ "gpu_mem": 4.722131968,
457
+ "loss": 1.2872,
458
+ "grad_norm": 3.403454542160034,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 1.50198272,
465
+ "gpu_mem": 4.722159616,
466
+ "loss": 1.323,
467
+ "grad_norm": 4.359912872314453,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 1.50198272,
474
+ "gpu_mem": 4.82368512,
475
+ "loss": 1.8447,
476
+ "grad_norm": 5.682162761688232,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 1.50198272,
483
+ "gpu_mem": 4.8236544,
484
+ "loss": 1.1505,
485
+ "grad_norm": 3.0071282386779785,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 1.50198272,
492
+ "gpu_mem": 4.823688192,
493
+ "loss": 1.151,
494
+ "grad_norm": 3.7012956142425537,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 1.50198272,
501
+ "gpu_mem": 4.82376192,
502
+ "loss": 1.1417,
503
+ "grad_norm": 4.017345905303955,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 1.50198272,
510
+ "gpu_mem": 4.823705088,
511
+ "loss": 1.0821,
512
+ "grad_norm": 3.950089454650879,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 1.50198272,
519
+ "gpu_mem": 4.823698944,
520
+ "loss": 1.0089,
521
+ "grad_norm": 4.133927345275879,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 1.50198272,
528
+ "gpu_mem": 4.823749632,
529
+ "loss": 1.0377,
530
+ "grad_norm": 4.262353420257568,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 1.50198272,
537
+ "gpu_mem": 4.823675904,
538
+ "loss": 1.0539,
539
+ "grad_norm": 5.903791904449463,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 1.50198272,
546
+ "gpu_mem": 4.823689728,
547
+ "loss": 1.1056,
548
+ "grad_norm": 5.543725490570068,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 1.50198272,
555
+ "gpu_mem": 4.823691264,
556
+ "loss": 1.0513,
557
+ "grad_norm": 5.472988128662109,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 1.50198272,
564
+ "gpu_mem": 4.823680512,
565
+ "loss": 1.0188,
566
+ "grad_norm": 5.62291955947876,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 1.50198272,
573
+ "gpu_mem": 4.823697408,
574
+ "loss": 1.0059,
575
+ "grad_norm": 5.603131294250488,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 1.50198272,
582
+ "gpu_mem": 4.823718912,
583
+ "loss": 1.0754,
584
+ "grad_norm": 6.040858268737793,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 1.50198272,
591
+ "gpu_mem": 4.823709696,
592
+ "loss": 0.9901,
593
+ "grad_norm": 5.5554585456848145,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 1.50198272,
600
+ "gpu_mem": 4.823735808,
601
+ "loss": 1.0235,
602
+ "grad_norm": 6.136469841003418,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 1.50198272,
609
+ "gpu_mem": 4.823686656,
610
+ "loss": 1.0859,
611
+ "grad_norm": 5.908904075622559,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 1.50198272,
618
+ "gpu_mem": 4.823686656,
619
+ "train_runtime": 378.6352,
620
+ "train_samples_per_second": 11.821,
621
+ "train_steps_per_second": 0.18,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.4682180960388744
624
+ }
625
+ ]
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 8,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.378839590443686
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 6317696
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-arc_c-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T23:32:37.041918"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_c-r8-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 1.489108992,
6
+ "gpu_mem": 4.442774016,
7
+ "loss": 4.4614,
8
+ "grad_norm": 272.1399230957031,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 1.494614016,
15
+ "gpu_mem": 4.493384704,
16
+ "loss": 4.6994,
17
+ "grad_norm": 279.0349426269531,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 1.494614016,
24
+ "gpu_mem": 4.493415424,
25
+ "loss": 2.3086,
26
+ "grad_norm": 260.66900634765625,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 1.494810624,
33
+ "gpu_mem": 4.493381632,
34
+ "loss": 1.572,
35
+ "grad_norm": 18.902830123901367,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 1.495007232,
42
+ "gpu_mem": 4.493369344,
43
+ "loss": 1.5805,
44
+ "grad_norm": 30.244815826416016,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 1.495007232,
51
+ "gpu_mem": 4.49343232,
52
+ "loss": 1.3975,
53
+ "grad_norm": 10.924633026123047,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 1.495007232,
60
+ "gpu_mem": 4.493438464,
61
+ "loss": 1.4746,
62
+ "grad_norm": 35.98440933227539,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 1.495007232,
69
+ "gpu_mem": 4.493396992,
70
+ "loss": 1.5768,
71
+ "grad_norm": 21.909250259399414,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 1.495007232,
78
+ "gpu_mem": 4.493392384,
79
+ "loss": 1.3486,
80
+ "grad_norm": 8.877981185913086,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 1.495007232,
87
+ "gpu_mem": 4.493381632,
88
+ "loss": 1.6285,
89
+ "grad_norm": 21.38736915588379,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 1.495007232,
96
+ "gpu_mem": 4.493392384,
97
+ "loss": 1.4523,
98
+ "grad_norm": 7.3010358810424805,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 1.495007232,
105
+ "gpu_mem": 4.49341696,
106
+ "loss": 1.4579,
107
+ "grad_norm": 8.112820625305176,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 1.495007232,
114
+ "gpu_mem": 4.49341696,
115
+ "loss": 1.3493,
116
+ "grad_norm": 11.578926086425781,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 1.495007232,
123
+ "gpu_mem": 4.493364736,
124
+ "loss": 1.6974,
125
+ "grad_norm": 17.703752517700195,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 1.495007232,
132
+ "gpu_mem": 4.49344,
133
+ "loss": 1.5182,
134
+ "grad_norm": 8.22641658782959,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 1.495007232,
141
+ "gpu_mem": 4.493433856,
142
+ "loss": 1.4541,
143
+ "grad_norm": 7.929551124572754,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 1.495007232,
150
+ "gpu_mem": 4.493438464,
151
+ "loss": 1.3941,
152
+ "grad_norm": 5.744842052459717,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 1.495007232,
159
+ "gpu_mem": 4.518705664,
160
+ "loss": 2.1397,
161
+ "grad_norm": 13.66163158416748,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 1.495007232,
168
+ "gpu_mem": 4.518704128,
169
+ "loss": 1.4262,
170
+ "grad_norm": 6.4307355880737305,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 1.495007232,
177
+ "gpu_mem": 4.518679552,
178
+ "loss": 1.3419,
179
+ "grad_norm": 18.498199462890625,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 1.495007232,
186
+ "gpu_mem": 4.518687232,
187
+ "loss": 1.3816,
188
+ "grad_norm": 3.349029541015625,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 1.495007232,
195
+ "gpu_mem": 4.518716416,
196
+ "loss": 1.3216,
197
+ "grad_norm": 3.663336753845215,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 1.495007232,
204
+ "gpu_mem": 4.5187456,
205
+ "loss": 1.3365,
206
+ "grad_norm": 6.960829734802246,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 1.495007232,
213
+ "gpu_mem": 4.518688768,
214
+ "loss": 1.3619,
215
+ "grad_norm": 5.14816427230835,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 1.495007232,
222
+ "gpu_mem": 4.518757888,
223
+ "loss": 1.3392,
224
+ "grad_norm": 4.903714656829834,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 1.495007232,
231
+ "gpu_mem": 4.51871488,
232
+ "loss": 1.3861,
233
+ "grad_norm": 6.120626926422119,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 1.495007232,
240
+ "gpu_mem": 4.518673408,
241
+ "loss": 1.3876,
242
+ "grad_norm": 6.803613662719727,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 1.495007232,
249
+ "gpu_mem": 4.518719488,
250
+ "loss": 1.6675,
251
+ "grad_norm": 14.671072006225586,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 1.495007232,
258
+ "gpu_mem": 4.51871488,
259
+ "loss": 1.4052,
260
+ "grad_norm": 6.3027143478393555,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 1.495007232,
267
+ "gpu_mem": 4.518704128,
268
+ "loss": 1.3962,
269
+ "grad_norm": 5.853539943695068,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 1.495007232,
276
+ "gpu_mem": 4.518734848,
277
+ "loss": 1.3627,
278
+ "grad_norm": 4.129302024841309,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 1.495007232,
285
+ "gpu_mem": 4.518744064,
286
+ "loss": 1.347,
287
+ "grad_norm": 6.030110836029053,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 1.495007232,
294
+ "gpu_mem": 4.518724096,
295
+ "loss": 1.4082,
296
+ "grad_norm": 8.369693756103516,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 1.495007232,
303
+ "gpu_mem": 4.518702592,
304
+ "loss": 1.4094,
305
+ "grad_norm": 14.688669204711914,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 1.495007232,
312
+ "gpu_mem": 4.518590464,
313
+ "loss": 2.1062,
314
+ "grad_norm": 14.797432899475098,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 1.495007232,
321
+ "gpu_mem": 4.493410816,
322
+ "loss": 1.3897,
323
+ "grad_norm": 10.996891021728516,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 1.495007232,
330
+ "gpu_mem": 4.493420032,
331
+ "loss": 1.4705,
332
+ "grad_norm": 15.965860366821289,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 1.495007232,
339
+ "gpu_mem": 4.493390848,
340
+ "loss": 1.3793,
341
+ "grad_norm": 5.473352432250977,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 1.495007232,
348
+ "gpu_mem": 4.49340928,
349
+ "loss": 1.3478,
350
+ "grad_norm": 2.9749255180358887,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 1.495007232,
357
+ "gpu_mem": 4.49338624,
358
+ "loss": 1.4282,
359
+ "grad_norm": 7.062312126159668,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 1.495007232,
366
+ "gpu_mem": 4.493387776,
367
+ "loss": 1.4169,
368
+ "grad_norm": 5.1576642990112305,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 1.495007232,
375
+ "gpu_mem": 4.49341696,
376
+ "loss": 1.4134,
377
+ "grad_norm": 9.255854606628418,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 1.495007232,
384
+ "gpu_mem": 4.49343232,
385
+ "loss": 1.3524,
386
+ "grad_norm": 5.755366802215576,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 1.495007232,
393
+ "gpu_mem": 4.493450752,
394
+ "loss": 1.3488,
395
+ "grad_norm": 4.835580348968506,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 1.495007232,
402
+ "gpu_mem": 4.493404672,
403
+ "loss": 1.3469,
404
+ "grad_norm": 6.548555374145508,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 1.495007232,
411
+ "gpu_mem": 4.493398528,
412
+ "loss": 1.2932,
413
+ "grad_norm": 3.8608975410461426,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 1.495007232,
420
+ "gpu_mem": 4.493392384,
421
+ "loss": 1.3043,
422
+ "grad_norm": 4.572495460510254,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 1.495007232,
429
+ "gpu_mem": 4.493396992,
430
+ "loss": 1.2866,
431
+ "grad_norm": 3.4438271522521973,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 1.495007232,
438
+ "gpu_mem": 4.493387776,
439
+ "loss": 1.3026,
440
+ "grad_norm": 3.738175868988037,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 1.495007232,
447
+ "gpu_mem": 4.493369344,
448
+ "loss": 1.3494,
449
+ "grad_norm": 4.942461967468262,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 1.495007232,
456
+ "gpu_mem": 4.49339392,
457
+ "loss": 1.3157,
458
+ "grad_norm": 4.040122985839844,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 1.495007232,
465
+ "gpu_mem": 4.493421568,
466
+ "loss": 1.4034,
467
+ "grad_norm": 4.740878105163574,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 1.495007232,
474
+ "gpu_mem": 4.518701056,
475
+ "loss": 1.9696,
476
+ "grad_norm": 6.076801300048828,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 1.495007232,
483
+ "gpu_mem": 4.518670336,
484
+ "loss": 1.3186,
485
+ "grad_norm": 6.321723937988281,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 1.495007232,
492
+ "gpu_mem": 4.518704128,
493
+ "loss": 1.3322,
494
+ "grad_norm": 4.518864154815674,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 1.495007232,
501
+ "gpu_mem": 4.518777856,
502
+ "loss": 1.3229,
503
+ "grad_norm": 3.9502453804016113,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 1.495007232,
510
+ "gpu_mem": 4.518721024,
511
+ "loss": 1.2922,
512
+ "grad_norm": 2.77620530128479,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 1.495007232,
519
+ "gpu_mem": 4.51871488,
520
+ "loss": 1.221,
521
+ "grad_norm": 5.571518898010254,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 1.495007232,
528
+ "gpu_mem": 4.518765568,
529
+ "loss": 1.2745,
530
+ "grad_norm": 4.341223239898682,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 1.495007232,
537
+ "gpu_mem": 4.51869184,
538
+ "loss": 1.3552,
539
+ "grad_norm": 5.9276251792907715,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 1.495007232,
546
+ "gpu_mem": 4.518705664,
547
+ "loss": 1.3201,
548
+ "grad_norm": 4.258768558502197,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 1.495007232,
555
+ "gpu_mem": 4.5187072,
556
+ "loss": 1.3376,
557
+ "grad_norm": 5.822268009185791,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 1.495007232,
564
+ "gpu_mem": 4.518696448,
565
+ "loss": 1.3397,
566
+ "grad_norm": 3.860724925994873,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 1.495007232,
573
+ "gpu_mem": 4.518713344,
574
+ "loss": 1.2999,
575
+ "grad_norm": 4.712264060974121,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 1.495007232,
582
+ "gpu_mem": 4.518734848,
583
+ "loss": 1.3846,
584
+ "grad_norm": 7.827590465545654,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 1.495007232,
591
+ "gpu_mem": 4.518725632,
592
+ "loss": 1.255,
593
+ "grad_norm": 4.00440788269043,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 1.495007232,
600
+ "gpu_mem": 4.518751744,
601
+ "loss": 1.2525,
602
+ "grad_norm": 4.9576640129089355,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 1.495007232,
609
+ "gpu_mem": 4.518702592,
610
+ "loss": 1.3006,
611
+ "grad_norm": 4.395829677581787,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 1.495007232,
618
+ "gpu_mem": 4.518702592,
619
+ "train_runtime": 376.1937,
620
+ "train_samples_per_second": 11.898,
621
+ "train_steps_per_second": 0.181,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.523840169696247
624
+ }
625
+ ]
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 2,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_e",
3
+ "results": 0.33375420875420875
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_E",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1577576
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-arc_e-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T15:57:00.430559"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r2-a2/training_logs.json ADDED
@@ -0,0 +1,1273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.028169014084507043,
5
+ "cpu_mem": 1.48697088,
6
+ "gpu_mem": 4.4237952,
7
+ "loss": 4.6319,
8
+ "grad_norm": 334.8832702636719,
9
+ "learning_rate": 2.1428571428571425e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.056338028169014086,
14
+ "cpu_mem": 1.492672512,
15
+ "gpu_mem": 4.436629504,
16
+ "loss": 4.4578,
17
+ "grad_norm": 338.71502685546875,
18
+ "learning_rate": 4.285714285714285e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.08450704225352113,
23
+ "cpu_mem": 1.49286912,
24
+ "gpu_mem": 4.436608,
25
+ "loss": 3.0613,
26
+ "grad_norm": 203.26577758789062,
27
+ "learning_rate": 6.428571428571427e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.11267605633802817,
32
+ "cpu_mem": 1.493065728,
33
+ "gpu_mem": 4.436586496,
34
+ "loss": 2.1672,
35
+ "grad_norm": 93.64673614501953,
36
+ "learning_rate": 8.57142857142857e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.14084507042253522,
41
+ "cpu_mem": 1.493065728,
42
+ "gpu_mem": 4.436627968,
43
+ "loss": 1.5508,
44
+ "grad_norm": 19.551036834716797,
45
+ "learning_rate": 0.00010714285714285714
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.16901408450704225,
50
+ "cpu_mem": 1.493262336,
51
+ "gpu_mem": 4.436603392,
52
+ "loss": 1.4936,
53
+ "grad_norm": 32.31931686401367,
54
+ "learning_rate": 0.00012857142857142855
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.19718309859154928,
59
+ "cpu_mem": 1.493262336,
60
+ "gpu_mem": 4.436626432,
61
+ "loss": 1.4445,
62
+ "grad_norm": 20.193700790405273,
63
+ "learning_rate": 0.00015
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.22535211267605634,
68
+ "cpu_mem": 1.493262336,
69
+ "gpu_mem": 4.43658496,
70
+ "loss": 1.3912,
71
+ "grad_norm": 15.018762588500977,
72
+ "learning_rate": 0.0001714285714285714
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.2535211267605634,
77
+ "cpu_mem": 1.493262336,
78
+ "gpu_mem": 4.436586496,
79
+ "loss": 1.3628,
80
+ "grad_norm": 12.806224822998047,
81
+ "learning_rate": 0.00019285714285714286
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.28169014084507044,
86
+ "cpu_mem": 1.493262336,
87
+ "gpu_mem": 4.436581888,
88
+ "loss": 1.6795,
89
+ "grad_norm": 60.71196746826172,
90
+ "learning_rate": 0.00021428571428571427
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.30985915492957744,
95
+ "cpu_mem": 1.493458944,
96
+ "gpu_mem": 4.436660224,
97
+ "loss": 1.3897,
98
+ "grad_norm": 14.609763145446777,
99
+ "learning_rate": 0.00023571428571428569
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.3380281690140845,
104
+ "cpu_mem": 1.493458944,
105
+ "gpu_mem": 4.436634112,
106
+ "loss": 1.3519,
107
+ "grad_norm": 10.01632308959961,
108
+ "learning_rate": 0.0002571428571428571
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.36619718309859156,
113
+ "cpu_mem": 1.493458944,
114
+ "gpu_mem": 4.43658496,
115
+ "loss": 1.3813,
116
+ "grad_norm": 9.067853927612305,
117
+ "learning_rate": 0.00027857142857142854
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.39436619718309857,
122
+ "cpu_mem": 1.493458944,
123
+ "gpu_mem": 4.436606464,
124
+ "loss": 1.4137,
125
+ "grad_norm": 11.54834270477295,
126
+ "learning_rate": 0.0003
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.4225352112676056,
131
+ "cpu_mem": 1.493458944,
132
+ "gpu_mem": 4.436583424,
133
+ "loss": 1.33,
134
+ "grad_norm": 4.984076499938965,
135
+ "learning_rate": 0.0002999533773001224
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.4507042253521127,
140
+ "cpu_mem": 1.493458944,
141
+ "gpu_mem": 4.436588032,
142
+ "loss": 1.3787,
143
+ "grad_norm": 4.959704875946045,
144
+ "learning_rate": 0.0002998135381828383
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.4788732394366197,
149
+ "cpu_mem": 1.493458944,
150
+ "gpu_mem": 4.436624896,
151
+ "loss": 1.3491,
152
+ "grad_norm": 6.165195465087891,
153
+ "learning_rate": 0.00029958056957717696
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.5070422535211268,
158
+ "cpu_mem": 1.493458944,
159
+ "gpu_mem": 4.436635648,
160
+ "loss": 1.3499,
161
+ "grad_norm": 6.145087242126465,
162
+ "learning_rate": 0.0002992546163048102
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.5352112676056338,
167
+ "cpu_mem": 1.493458944,
168
+ "gpu_mem": 4.436578816,
169
+ "loss": 1.3647,
170
+ "grad_norm": 14.54247760772705,
171
+ "learning_rate": 0.0002988358809900258
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.5633802816901409,
176
+ "cpu_mem": 1.493458944,
177
+ "gpu_mem": 4.436649472,
178
+ "loss": 1.3599,
179
+ "grad_norm": 6.127029895782471,
180
+ "learning_rate": 0.0002983246239337692
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.5915492957746479,
185
+ "cpu_mem": 1.493458944,
186
+ "gpu_mem": 4.436647936,
187
+ "loss": 1.3275,
188
+ "grad_norm": 5.688446521759033,
189
+ "learning_rate": 0.0002977211629518312
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.6197183098591549,
194
+ "cpu_mem": 1.493458944,
195
+ "gpu_mem": 4.436604928,
196
+ "loss": 1.3288,
197
+ "grad_norm": 7.160696029663086,
198
+ "learning_rate": 0.00029702587317728153
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.647887323943662,
203
+ "cpu_mem": 1.493458944,
204
+ "gpu_mem": 4.436621824,
205
+ "loss": 1.3048,
206
+ "grad_norm": 5.542471408843994,
207
+ "learning_rate": 0.0002962391868272735
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.676056338028169,
212
+ "cpu_mem": 1.493458944,
213
+ "gpu_mem": 4.436578816,
214
+ "loss": 1.3567,
215
+ "grad_norm": 9.522262573242188,
216
+ "learning_rate": 0.00029536159293436166
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.704225352112676,
221
+ "cpu_mem": 1.493458944,
222
+ "gpu_mem": 4.436608,
223
+ "loss": 1.3671,
224
+ "grad_norm": 9.449694633483887,
225
+ "learning_rate": 0.00029439363704250176
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.7323943661971831,
230
+ "cpu_mem": 1.493458944,
231
+ "gpu_mem": 4.436588032,
232
+ "loss": 1.4186,
233
+ "grad_norm": 7.2168426513671875,
234
+ "learning_rate": 0.00029333592086792107
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.7605633802816901,
239
+ "cpu_mem": 1.493458944,
240
+ "gpu_mem": 4.436614144,
241
+ "loss": 1.3558,
242
+ "grad_norm": 10.885024070739746,
243
+ "learning_rate": 0.0002921891019250697
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.7887323943661971,
248
+ "cpu_mem": 1.493458944,
249
+ "gpu_mem": 4.436614144,
250
+ "loss": 1.3774,
251
+ "grad_norm": 6.4214277267456055,
252
+ "learning_rate": 0.0002909538931178862
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.8169014084507042,
257
+ "cpu_mem": 1.493458944,
258
+ "gpu_mem": 4.43659264,
259
+ "loss": 1.2741,
260
+ "grad_norm": 6.542706489562988,
261
+ "learning_rate": 0.00028963106229663063
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.8450704225352113,
266
+ "cpu_mem": 1.493458944,
267
+ "gpu_mem": 4.436583424,
268
+ "loss": 1.3852,
269
+ "grad_norm": 9.458147048950195,
270
+ "learning_rate": 0.00028822143178056114
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.8732394366197183,
275
+ "cpu_mem": 1.493458944,
276
+ "gpu_mem": 4.436601856,
277
+ "loss": 1.3504,
278
+ "grad_norm": 16.29282569885254,
279
+ "learning_rate": 0.00028672587784675096
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.9014084507042254,
284
+ "cpu_mem": 1.493458944,
285
+ "gpu_mem": 4.436624896,
286
+ "loss": 1.3272,
287
+ "grad_norm": 7.9392499923706055,
288
+ "learning_rate": 0.0002851453301853628
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.9295774647887324,
293
+ "cpu_mem": 1.493458944,
294
+ "gpu_mem": 4.436621824,
295
+ "loss": 1.3411,
296
+ "grad_norm": 4.136773109436035,
297
+ "learning_rate": 0.00028348077132172027
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.9577464788732394,
302
+ "cpu_mem": 1.493458944,
303
+ "gpu_mem": 4.436624896,
304
+ "loss": 1.3751,
305
+ "grad_norm": 5.5571393966674805,
306
+ "learning_rate": 0.0002817332360055343
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.9859154929577465,
311
+ "cpu_mem": 1.493458944,
312
+ "gpu_mem": 4.436606464,
313
+ "loss": 1.2823,
314
+ "grad_norm": 3.0069351196289062,
315
+ "learning_rate": 0.0002799038105676658
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 1.0140845070422535,
320
+ "cpu_mem": 1.493458944,
321
+ "gpu_mem": 4.442983936,
322
+ "loss": 1.9088,
323
+ "grad_norm": 9.872271537780762,
324
+ "learning_rate": 0.0002779936322448233
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 1.0422535211267605,
329
+ "cpu_mem": 1.493458944,
330
+ "gpu_mem": 4.442988544,
331
+ "loss": 1.3898,
332
+ "grad_norm": 9.080979347229004,
333
+ "learning_rate": 0.0002760038884726157
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 1.0704225352112675,
338
+ "cpu_mem": 1.493458944,
339
+ "gpu_mem": 4.44296704,
340
+ "loss": 1.2297,
341
+ "grad_norm": 4.792629718780518,
342
+ "learning_rate": 0.00027393581614739923
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 1.0985915492957747,
347
+ "cpu_mem": 1.493458944,
348
+ "gpu_mem": 4.442956288,
349
+ "loss": 1.3502,
350
+ "grad_norm": 10.400541305541992,
351
+ "learning_rate": 0.0002717907008573785
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 1.1267605633802817,
356
+ "cpu_mem": 1.493458944,
357
+ "gpu_mem": 4.443019264,
358
+ "loss": 1.3253,
359
+ "grad_norm": 4.320893287658691,
360
+ "learning_rate": 0.0002695698760834384
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 1.1549295774647887,
365
+ "cpu_mem": 1.493458944,
366
+ "gpu_mem": 4.442979328,
367
+ "loss": 1.3928,
368
+ "grad_norm": 12.37241268157959,
369
+ "learning_rate": 0.00026727472237020447
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 1.1830985915492958,
374
+ "cpu_mem": 1.493458944,
375
+ "gpu_mem": 4.443022336,
376
+ "loss": 1.3985,
377
+ "grad_norm": 9.03736686706543,
378
+ "learning_rate": 0.00026490666646784665
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 1.2112676056338028,
383
+ "cpu_mem": 1.493458944,
384
+ "gpu_mem": 4.442971648,
385
+ "loss": 1.3813,
386
+ "grad_norm": 5.811436176300049,
387
+ "learning_rate": 0.0002624671804451601
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 1.2394366197183098,
392
+ "cpu_mem": 1.493458944,
393
+ "gpu_mem": 4.44303616,
394
+ "loss": 1.3624,
395
+ "grad_norm": 5.308187961578369,
396
+ "learning_rate": 0.0002599577807744739
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 1.267605633802817,
401
+ "cpu_mem": 1.493458944,
402
+ "gpu_mem": 4.443003904,
403
+ "loss": 1.3711,
404
+ "grad_norm": 4.097907066345215,
405
+ "learning_rate": 0.0002573800273889577
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 1.295774647887324,
410
+ "cpu_mem": 1.493458944,
411
+ "gpu_mem": 4.443008512,
412
+ "loss": 1.3712,
413
+ "grad_norm": 4.213882923126221,
414
+ "learning_rate": 0.0002547355227129109
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 1.323943661971831,
419
+ "cpu_mem": 1.493458944,
420
+ "gpu_mem": 4.442954752,
421
+ "loss": 1.3301,
422
+ "grad_norm": 7.293313503265381,
423
+ "learning_rate": 0.00025202591066563786
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 1.352112676056338,
428
+ "cpu_mem": 1.493458944,
429
+ "gpu_mem": 4.442968576,
430
+ "loss": 1.3347,
431
+ "grad_norm": 8.195301055908203,
432
+ "learning_rate": 0.0002492528756395289
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 1.380281690140845,
437
+ "cpu_mem": 1.493458944,
438
+ "gpu_mem": 4.442957824,
439
+ "loss": 1.3941,
440
+ "grad_norm": 25.371461868286133,
441
+ "learning_rate": 0.0002464181414529809
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 1.408450704225352,
446
+ "cpu_mem": 1.493458944,
447
+ "gpu_mem": 4.442971648,
448
+ "loss": 1.4377,
449
+ "grad_norm": 18.559114456176758,
450
+ "learning_rate": 0.00024352347027881003
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 1.436619718309859,
455
+ "cpu_mem": 1.493458944,
456
+ "gpu_mem": 4.443023872,
457
+ "loss": 1.364,
458
+ "grad_norm": 8.876504898071289,
459
+ "learning_rate": 0.0002405706615488216
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 1.4647887323943662,
464
+ "cpu_mem": 1.493458944,
465
+ "gpu_mem": 4.442971648,
466
+ "loss": 1.686,
467
+ "grad_norm": 56.97942352294922,
468
+ "learning_rate": 0.00023756155083521846
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 1.4929577464788732,
473
+ "cpu_mem": 1.493458944,
474
+ "gpu_mem": 4.443040768,
475
+ "loss": 1.4077,
476
+ "grad_norm": 22.452436447143555,
477
+ "learning_rate": 0.00023449800870954326
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 1.5211267605633803,
482
+ "cpu_mem": 1.493458944,
483
+ "gpu_mem": 4.443008512,
484
+ "loss": 1.3135,
485
+ "grad_norm": 6.5965576171875,
486
+ "learning_rate": 0.0002313819395798639
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 1.5492957746478875,
491
+ "cpu_mem": 1.493458944,
492
+ "gpu_mem": 4.443017728,
493
+ "loss": 1.3823,
494
+ "grad_norm": 6.214559078216553,
495
+ "learning_rate": 0.0002282152805069247
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 1.5774647887323945,
500
+ "cpu_mem": 1.493458944,
501
+ "gpu_mem": 4.442993152,
502
+ "loss": 1.3107,
503
+ "grad_norm": 2.944521903991699,
504
+ "learning_rate": 0.000225
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 1.6056338028169015,
509
+ "cpu_mem": 1.493458944,
510
+ "gpu_mem": 4.443026944,
511
+ "loss": 1.3232,
512
+ "grad_norm": 4.46520471572876,
513
+ "learning_rate": 0.00022173809679319772
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 1.6338028169014085,
518
+ "cpu_mem": 1.493458944,
519
+ "gpu_mem": 4.443008512,
520
+ "loss": 1.3273,
521
+ "grad_norm": 3.4397385120391846,
522
+ "learning_rate": 0.00021843159860297442
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 1.6619718309859155,
527
+ "cpu_mem": 1.493458944,
528
+ "gpu_mem": 4.442994688,
529
+ "loss": 1.3453,
530
+ "grad_norm": 3.8322765827178955,
531
+ "learning_rate": 0.00021508256086763368
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 1.6901408450704225,
536
+ "cpu_mem": 1.493458944,
537
+ "gpu_mem": 4.443033088,
538
+ "loss": 1.3242,
539
+ "grad_norm": 4.89071798324585,
540
+ "learning_rate": 0.00021169306546959174
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 1.7183098591549295,
545
+ "cpu_mem": 1.493458944,
546
+ "gpu_mem": 4.442965504,
547
+ "loss": 1.2936,
548
+ "grad_norm": 4.631297588348389,
549
+ "learning_rate": 0.0002082652194412042
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 1.7464788732394365,
554
+ "cpu_mem": 1.493458944,
555
+ "gpu_mem": 4.44301312,
556
+ "loss": 1.4406,
557
+ "grad_norm": 12.51733684539795,
558
+ "learning_rate": 0.00020480115365495926
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 1.7746478873239435,
563
+ "cpu_mem": 1.493458944,
564
+ "gpu_mem": 4.442962432,
565
+ "loss": 1.446,
566
+ "grad_norm": 15.611908912658691,
567
+ "learning_rate": 0.00020130302149885031
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 1.8028169014084507,
572
+ "cpu_mem": 1.493458944,
573
+ "gpu_mem": 4.443011584,
574
+ "loss": 1.4236,
575
+ "grad_norm": 12.190735816955566,
576
+ "learning_rate": 0.00019777299753775265
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 1.8309859154929577,
581
+ "cpu_mem": 1.493458944,
582
+ "gpu_mem": 4.443010048,
583
+ "loss": 1.3596,
584
+ "grad_norm": 5.446914196014404,
585
+ "learning_rate": 0.00019421327616163563
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 1.8591549295774648,
590
+ "cpu_mem": 1.493458944,
591
+ "gpu_mem": 4.44302848,
592
+ "loss": 1.2865,
593
+ "grad_norm": 4.165730953216553,
594
+ "learning_rate": 0.00019062607022145078
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 1.887323943661972,
599
+ "cpu_mem": 1.493458944,
600
+ "gpu_mem": 4.442970112,
601
+ "loss": 1.2999,
602
+ "grad_norm": 3.62854266166687,
603
+ "learning_rate": 0.00018701360965354402
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 1.915492957746479,
608
+ "cpu_mem": 1.493458944,
609
+ "gpu_mem": 4.4429824,
610
+ "loss": 1.3443,
611
+ "grad_norm": 3.0337541103363037,
612
+ "learning_rate": 0.00018337814009344714
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 1.943661971830986,
617
+ "cpu_mem": 1.493458944,
618
+ "gpu_mem": 4.443006976,
619
+ "loss": 1.3273,
620
+ "grad_norm": 4.934184551239014,
621
+ "learning_rate": 0.0001797219214799096
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 1.971830985915493,
626
+ "cpu_mem": 1.493458944,
627
+ "gpu_mem": 4.442983936,
628
+ "loss": 1.2715,
629
+ "grad_norm": 3.110337495803833,
630
+ "learning_rate": 0.00017604722665003956
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 2.0,
635
+ "cpu_mem": 1.493458944,
636
+ "gpu_mem": 4.442819584,
637
+ "loss": 1.9529,
638
+ "grad_norm": 7.656370639801025,
639
+ "learning_rate": 0.00017235633992642615
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 2.028169014084507,
644
+ "cpu_mem": 1.493458944,
645
+ "gpu_mem": 4.436617216,
646
+ "loss": 1.3812,
647
+ "grad_norm": 5.92775297164917,
648
+ "learning_rate": 0.00016865155569712278
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 2.056338028169014,
653
+ "cpu_mem": 1.493458944,
654
+ "gpu_mem": 4.436580352,
655
+ "loss": 1.3011,
656
+ "grad_norm": 7.2833356857299805,
657
+ "learning_rate": 0.0001649351769893725
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 2.084507042253521,
662
+ "cpu_mem": 1.493458944,
663
+ "gpu_mem": 4.436640256,
664
+ "loss": 1.3427,
665
+ "grad_norm": 14.072025299072266,
666
+ "learning_rate": 0.00016120951403796364
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 2.112676056338028,
671
+ "cpu_mem": 1.493458944,
672
+ "gpu_mem": 4.436608,
673
+ "loss": 1.3463,
674
+ "grad_norm": 7.366079807281494,
675
+ "learning_rate": 0.00015747688284910457
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 2.140845070422535,
680
+ "cpu_mem": 1.493458944,
681
+ "gpu_mem": 4.436618752,
682
+ "loss": 1.3097,
683
+ "grad_norm": 11.609695434570312,
684
+ "learning_rate": 0.00015373960376071093
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 2.169014084507042,
689
+ "cpu_mem": 1.493458944,
690
+ "gpu_mem": 4.436655616,
691
+ "loss": 1.3418,
692
+ "grad_norm": 11.972086906433105,
693
+ "learning_rate": 0.00015
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 2.1971830985915495,
698
+ "cpu_mem": 1.493458944,
699
+ "gpu_mem": 4.436640256,
700
+ "loss": 1.3932,
701
+ "grad_norm": 11.582221984863281,
702
+ "learning_rate": 0.00014626039623928907
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 2.2253521126760565,
707
+ "cpu_mem": 1.493458944,
708
+ "gpu_mem": 4.436591104,
709
+ "loss": 1.2812,
710
+ "grad_norm": 7.3847832679748535,
711
+ "learning_rate": 0.0001425231171508954
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 2.2535211267605635,
716
+ "cpu_mem": 1.493458944,
717
+ "gpu_mem": 4.436635648,
718
+ "loss": 1.3296,
719
+ "grad_norm": 6.362971782684326,
720
+ "learning_rate": 0.00013879048596203636
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 2.2816901408450705,
725
+ "cpu_mem": 1.493458944,
726
+ "gpu_mem": 4.436621824,
727
+ "loss": 1.3876,
728
+ "grad_norm": 11.030096054077148,
729
+ "learning_rate": 0.0001350648230106275
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 2.3098591549295775,
734
+ "cpu_mem": 1.493458944,
735
+ "gpu_mem": 4.436589568,
736
+ "loss": 1.4036,
737
+ "grad_norm": 9.106473922729492,
738
+ "learning_rate": 0.00013134844430287725
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 2.3380281690140845,
743
+ "cpu_mem": 1.493458944,
744
+ "gpu_mem": 4.436640256,
745
+ "loss": 1.3261,
746
+ "grad_norm": 11.944342613220215,
747
+ "learning_rate": 0.0001276436600735738
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 2.3661971830985915,
752
+ "cpu_mem": 1.493458944,
753
+ "gpu_mem": 4.436578816,
754
+ "loss": 1.3887,
755
+ "grad_norm": 12.192349433898926,
756
+ "learning_rate": 0.00012395277334996044
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 2.3943661971830985,
761
+ "cpu_mem": 1.493458944,
762
+ "gpu_mem": 4.436624896,
763
+ "loss": 1.3469,
764
+ "grad_norm": 7.058681488037109,
765
+ "learning_rate": 0.00012027807852009038
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 2.4225352112676055,
770
+ "cpu_mem": 1.493458944,
771
+ "gpu_mem": 4.436578816,
772
+ "loss": 1.3328,
773
+ "grad_norm": 5.5536932945251465,
774
+ "learning_rate": 0.00011662185990655284
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 2.4507042253521125,
779
+ "cpu_mem": 1.493458944,
780
+ "gpu_mem": 4.436609536,
781
+ "loss": 1.3939,
782
+ "grad_norm": 7.393185138702393,
783
+ "learning_rate": 0.00011298639034645593
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 2.4788732394366195,
788
+ "cpu_mem": 1.493458944,
789
+ "gpu_mem": 4.43658496,
790
+ "loss": 1.3269,
791
+ "grad_norm": 7.776304721832275,
792
+ "learning_rate": 0.00010937392977854923
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 2.507042253521127,
797
+ "cpu_mem": 1.493458944,
798
+ "gpu_mem": 4.43663872,
799
+ "loss": 1.3153,
800
+ "grad_norm": 5.972353935241699,
801
+ "learning_rate": 0.00010578672383836435
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 2.535211267605634,
806
+ "cpu_mem": 1.493458944,
807
+ "gpu_mem": 4.436620288,
808
+ "loss": 1.3751,
809
+ "grad_norm": 6.654686450958252,
810
+ "learning_rate": 0.00010222700246224735
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 2.563380281690141,
815
+ "cpu_mem": 1.493458944,
816
+ "gpu_mem": 4.4365696,
817
+ "loss": 1.3689,
818
+ "grad_norm": 8.81556224822998,
819
+ "learning_rate": 9.869697850114969e-05
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 2.591549295774648,
824
+ "cpu_mem": 1.493458944,
825
+ "gpu_mem": 4.436594176,
826
+ "loss": 1.3643,
827
+ "grad_norm": 6.586201190948486,
828
+ "learning_rate": 9.519884634504074e-05
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 2.619718309859155,
833
+ "cpu_mem": 1.493458944,
834
+ "gpu_mem": 4.436597248,
835
+ "loss": 1.2994,
836
+ "grad_norm": 4.92837381362915,
837
+ "learning_rate": 9.17347805587958e-05
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 2.647887323943662,
842
+ "cpu_mem": 1.493458944,
843
+ "gpu_mem": 4.436589568,
844
+ "loss": 1.2857,
845
+ "grad_norm": 4.152866840362549,
846
+ "learning_rate": 8.830693453040829e-05
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 2.676056338028169,
851
+ "cpu_mem": 1.493458944,
852
+ "gpu_mem": 4.436627968,
853
+ "loss": 1.3404,
854
+ "grad_norm": 4.53918981552124,
855
+ "learning_rate": 8.491743913236628e-05
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 2.704225352112676,
860
+ "cpu_mem": 1.493458944,
861
+ "gpu_mem": 4.436637184,
862
+ "loss": 1.2617,
863
+ "grad_norm": 6.160858631134033,
864
+ "learning_rate": 8.156840139702554e-05
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 2.732394366197183,
869
+ "cpu_mem": 1.493458944,
870
+ "gpu_mem": 4.436580352,
871
+ "loss": 1.3543,
872
+ "grad_norm": 4.770242691040039,
873
+ "learning_rate": 7.82619032068023e-05
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 2.76056338028169,
878
+ "cpu_mem": 1.493458944,
879
+ "gpu_mem": 4.436580352,
880
+ "loss": 1.327,
881
+ "grad_norm": 2.966948986053467,
882
+ "learning_rate": 7.500000000000002e-05
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 2.788732394366197,
887
+ "cpu_mem": 1.493458944,
888
+ "gpu_mem": 4.43657728,
889
+ "loss": 1.3015,
890
+ "grad_norm": 3.1356072425842285,
891
+ "learning_rate": 7.17847194930753e-05
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 2.816901408450704,
896
+ "cpu_mem": 1.493458944,
897
+ "gpu_mem": 4.436575744,
898
+ "loss": 1.272,
899
+ "grad_norm": 5.338186264038086,
900
+ "learning_rate": 6.86180604201361e-05
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 2.845070422535211,
905
+ "cpu_mem": 1.493458944,
906
+ "gpu_mem": 4.436618752,
907
+ "loss": 1.2616,
908
+ "grad_norm": 4.881860733032227,
909
+ "learning_rate": 6.550199129045668e-05
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 2.873239436619718,
914
+ "cpu_mem": 1.493458944,
915
+ "gpu_mem": 4.436557312,
916
+ "loss": 1.3467,
917
+ "grad_norm": 5.19181489944458,
918
+ "learning_rate": 6.243844916478155e-05
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 2.9014084507042255,
923
+ "cpu_mem": 1.493458944,
924
+ "gpu_mem": 4.436606464,
925
+ "loss": 1.309,
926
+ "grad_norm": 3.279639959335327,
927
+ "learning_rate": 5.9429338451178355e-05
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 2.9295774647887325,
932
+ "cpu_mem": 1.493458944,
933
+ "gpu_mem": 4.43666944,
934
+ "loss": 1.3868,
935
+ "grad_norm": 6.660994052886963,
936
+ "learning_rate": 5.6476529721189974e-05
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 2.9577464788732395,
941
+ "cpu_mem": 1.493458944,
942
+ "gpu_mem": 4.436621824,
943
+ "loss": 1.2845,
944
+ "grad_norm": 3.30313777923584,
945
+ "learning_rate": 5.358185854701909e-05
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 2.9859154929577465,
950
+ "cpu_mem": 1.493458944,
951
+ "gpu_mem": 4.436603392,
952
+ "loss": 1.3232,
953
+ "grad_norm": 3.6408579349517822,
954
+ "learning_rate": 5.074712436047112e-05
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 3.0140845070422535,
959
+ "cpu_mem": 1.493458944,
960
+ "gpu_mem": 4.443006976,
961
+ "loss": 1.8794,
962
+ "grad_norm": 7.724911212921143,
963
+ "learning_rate": 4.7974089334362057e-05
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 3.0422535211267605,
968
+ "cpu_mem": 1.493458944,
969
+ "gpu_mem": 4.442988544,
970
+ "loss": 1.3102,
971
+ "grad_norm": 3.513817310333252,
972
+ "learning_rate": 4.526447728708908e-05
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 3.0704225352112675,
977
+ "cpu_mem": 1.493458944,
978
+ "gpu_mem": 4.442977792,
979
+ "loss": 1.3073,
980
+ "grad_norm": 2.583456516265869,
981
+ "learning_rate": 4.261997261104223e-05
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 3.0985915492957745,
986
+ "cpu_mem": 1.493458944,
987
+ "gpu_mem": 4.443031552,
988
+ "loss": 1.327,
989
+ "grad_norm": 5.944295406341553,
990
+ "learning_rate": 4.004221922552608e-05
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 3.1267605633802815,
995
+ "cpu_mem": 1.493458944,
996
+ "gpu_mem": 4.442991616,
997
+ "loss": 1.3193,
998
+ "grad_norm": 3.914555072784424,
999
+ "learning_rate": 3.753281955483985e-05
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 3.1549295774647885,
1004
+ "cpu_mem": 1.493458944,
1005
+ "gpu_mem": 4.443010048,
1006
+ "loss": 1.2872,
1007
+ "grad_norm": 2.4625840187072754,
1008
+ "learning_rate": 3.509333353215331e-05
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 3.183098591549296,
1013
+ "cpu_mem": 1.493458944,
1014
+ "gpu_mem": 4.443073024,
1015
+ "loss": 1.3187,
1016
+ "grad_norm": 3.991567611694336,
1017
+ "learning_rate": 3.2725277629795526e-05
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 3.211267605633803,
1022
+ "cpu_mem": 1.493458944,
1023
+ "gpu_mem": 4.443000832,
1024
+ "loss": 1.3259,
1025
+ "grad_norm": 3.1732451915740967,
1026
+ "learning_rate": 3.0430123916561672e-05
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 3.23943661971831,
1031
+ "cpu_mem": 1.493458944,
1032
+ "gpu_mem": 4.442994688,
1033
+ "loss": 1.3577,
1034
+ "grad_norm": 4.750394821166992,
1035
+ "learning_rate": 2.8209299142621522e-05
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 3.267605633802817,
1040
+ "cpu_mem": 1.493458944,
1041
+ "gpu_mem": 4.443010048,
1042
+ "loss": 1.3211,
1043
+ "grad_norm": 3.380751132965088,
1044
+ "learning_rate": 2.6064183852600797e-05
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 3.295774647887324,
1049
+ "cpu_mem": 1.493458944,
1050
+ "gpu_mem": 4.443025408,
1051
+ "loss": 1.29,
1052
+ "grad_norm": 4.058185577392578,
1053
+ "learning_rate": 2.3996111527384288e-05
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 3.323943661971831,
1058
+ "cpu_mem": 1.493458944,
1059
+ "gpu_mem": 4.443016192,
1060
+ "loss": 1.2777,
1061
+ "grad_norm": 5.971620082855225,
1062
+ "learning_rate": 2.2006367755176655e-05
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 3.352112676056338,
1067
+ "cpu_mem": 1.493458944,
1068
+ "gpu_mem": 4.443006976,
1069
+ "loss": 1.2883,
1070
+ "grad_norm": 3.4085781574249268,
1071
+ "learning_rate": 2.009618943233419e-05
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 3.380281690140845,
1076
+ "cpu_mem": 1.493458944,
1077
+ "gpu_mem": 4.443025408,
1078
+ "loss": 1.2777,
1079
+ "grad_norm": 2.7961478233337402,
1080
+ "learning_rate": 1.82667639944657e-05
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 3.408450704225352,
1085
+ "cpu_mem": 1.493458944,
1086
+ "gpu_mem": 4.443023872,
1087
+ "loss": 1.2971,
1088
+ "grad_norm": 4.205790042877197,
1089
+ "learning_rate": 1.6519228678279718e-05
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 3.436619718309859,
1094
+ "cpu_mem": 1.493458944,
1095
+ "gpu_mem": 4.442980864,
1096
+ "loss": 1.2862,
1097
+ "grad_norm": 4.291749000549316,
1098
+ "learning_rate": 1.4854669814637143e-05
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 3.464788732394366,
1103
+ "cpu_mem": 1.493458944,
1104
+ "gpu_mem": 4.44301312,
1105
+ "loss": 1.2822,
1106
+ "grad_norm": 4.5682053565979,
1107
+ "learning_rate": 1.3274122153249028e-05
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 3.492957746478873,
1112
+ "cpu_mem": 1.493458944,
1113
+ "gpu_mem": 4.44296704,
1114
+ "loss": 1.3005,
1115
+ "grad_norm": 2.844740867614746,
1116
+ "learning_rate": 1.1778568219438839e-05
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 3.52112676056338,
1121
+ "cpu_mem": 1.493458944,
1122
+ "gpu_mem": 4.443011584,
1123
+ "loss": 1.2295,
1124
+ "grad_norm": 5.023967266082764,
1125
+ "learning_rate": 1.036893770336938e-05
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 3.5492957746478875,
1130
+ "cpu_mem": 1.493458944,
1131
+ "gpu_mem": 4.442962432,
1132
+ "loss": 1.2662,
1133
+ "grad_norm": 4.076255798339844,
1134
+ "learning_rate": 9.046106882113751e-06
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 3.5774647887323945,
1139
+ "cpu_mem": 1.493458944,
1140
+ "gpu_mem": 4.44297472,
1141
+ "loss": 1.2682,
1142
+ "grad_norm": 3.1658072471618652,
1143
+ "learning_rate": 7.810898074930243e-06
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 3.6056338028169015,
1148
+ "cpu_mem": 1.493458944,
1149
+ "gpu_mem": 4.442999296,
1150
+ "loss": 1.2845,
1151
+ "grad_norm": 3.069446325302124,
1152
+ "learning_rate": 6.664079132078881e-06
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 3.6338028169014085,
1157
+ "cpu_mem": 1.493458944,
1158
+ "gpu_mem": 4.442960896,
1159
+ "loss": 1.3036,
1160
+ "grad_norm": 3.197641372680664,
1161
+ "learning_rate": 5.606362957498195e-06
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 3.6619718309859155,
1166
+ "cpu_mem": 1.493458944,
1167
+ "gpu_mem": 4.442963968,
1168
+ "loss": 1.3179,
1169
+ "grad_norm": 3.2924671173095703,
1170
+ "learning_rate": 4.638407065638322e-06
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 3.6901408450704225,
1175
+ "cpu_mem": 1.493458944,
1176
+ "gpu_mem": 4.442976256,
1177
+ "loss": 1.2769,
1178
+ "grad_norm": 4.022733211517334,
1179
+ "learning_rate": 3.760813172726457e-06
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 3.7183098591549295,
1184
+ "cpu_mem": 1.493458944,
1185
+ "gpu_mem": 4.442940928,
1186
+ "loss": 1.2735,
1187
+ "grad_norm": 3.0946011543273926,
1188
+ "learning_rate": 2.9741268227184255e-06
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 3.7464788732394365,
1193
+ "cpu_mem": 1.493458944,
1194
+ "gpu_mem": 4.4429824,
1195
+ "loss": 1.2261,
1196
+ "grad_norm": 3.561920166015625,
1197
+ "learning_rate": 2.2788370481687965e-06
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 3.7746478873239435,
1202
+ "cpu_mem": 1.493458944,
1203
+ "gpu_mem": 4.44299776,
1204
+ "loss": 1.248,
1205
+ "grad_norm": 3.3285350799560547,
1206
+ "learning_rate": 1.6753760662307215e-06
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 3.802816901408451,
1211
+ "cpu_mem": 1.493458944,
1212
+ "gpu_mem": 4.442962432,
1213
+ "loss": 1.3244,
1214
+ "grad_norm": 4.943061828613281,
1215
+ "learning_rate": 1.1641190099741904e-06
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 3.830985915492958,
1220
+ "cpu_mem": 1.493458944,
1221
+ "gpu_mem": 4.442970112,
1222
+ "loss": 1.3367,
1223
+ "grad_norm": 4.610203266143799,
1224
+ "learning_rate": 7.453836951897885e-07
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 3.859154929577465,
1229
+ "cpu_mem": 1.493458944,
1230
+ "gpu_mem": 4.442991616,
1231
+ "loss": 1.3017,
1232
+ "grad_norm": 4.349905967712402,
1233
+ "learning_rate": 4.194304228229806e-07
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 3.887323943661972,
1238
+ "cpu_mem": 1.493458944,
1239
+ "gpu_mem": 4.443002368,
1240
+ "loss": 1.3214,
1241
+ "grad_norm": 3.8690969944000244,
1242
+ "learning_rate": 1.8646181716164831e-07
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 3.915492957746479,
1247
+ "cpu_mem": 1.493458944,
1248
+ "gpu_mem": 4.442994688,
1249
+ "loss": 1.3432,
1250
+ "grad_norm": 3.9236385822296143,
1251
+ "learning_rate": 4.662269987756317e-08
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 3.943661971830986,
1256
+ "cpu_mem": 1.493458944,
1257
+ "gpu_mem": 4.44302848,
1258
+ "loss": 1.3178,
1259
+ "grad_norm": 3.8790531158447266,
1260
+ "learning_rate": 0.0
1261
+ },
1262
+ {
1263
+ "step": 140,
1264
+ "epoch": 3.943661971830986,
1265
+ "cpu_mem": 1.493458944,
1266
+ "gpu_mem": 4.44302848,
1267
+ "train_runtime": 672.9448,
1268
+ "train_samples_per_second": 13.38,
1269
+ "train_steps_per_second": 0.208,
1270
+ "total_flos": 0.0,
1271
+ "train_loss": 1.4192385068961553
1272
+ }
1273
+ ]
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 32,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_e",
3
+ "results": 0.37247474747474746
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_E",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 25389056
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-arc_e-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T05:51:47.380869"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r32-a2/training_logs.json ADDED
@@ -0,0 +1,1273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.028169014084507043,
5
+ "cpu_mem": 1.496137728,
6
+ "gpu_mem": 4.518964736,
7
+ "loss": 4.6319,
8
+ "grad_norm": 285.1859436035156,
9
+ "learning_rate": 2.1428571428571425e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.056338028169014086,
14
+ "cpu_mem": 1.501642752,
15
+ "gpu_mem": 4.722138112,
16
+ "loss": 4.4578,
17
+ "grad_norm": 290.5561218261719,
18
+ "learning_rate": 4.285714285714285e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.08450704225352113,
23
+ "cpu_mem": 1.50183936,
24
+ "gpu_mem": 4.722116608,
25
+ "loss": 2.8478,
26
+ "grad_norm": 381.0905456542969,
27
+ "learning_rate": 6.428571428571427e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.11267605633802817,
32
+ "cpu_mem": 1.502035968,
33
+ "gpu_mem": 4.722095104,
34
+ "loss": 1.6686,
35
+ "grad_norm": 25.219541549682617,
36
+ "learning_rate": 8.57142857142857e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.14084507042253522,
41
+ "cpu_mem": 1.502035968,
42
+ "gpu_mem": 4.722136576,
43
+ "loss": 1.5693,
44
+ "grad_norm": 22.9230899810791,
45
+ "learning_rate": 0.00010714285714285714
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.16901408450704225,
50
+ "cpu_mem": 1.502035968,
51
+ "gpu_mem": 4.722112,
52
+ "loss": 1.4612,
53
+ "grad_norm": 21.357065200805664,
54
+ "learning_rate": 0.00012857142857142855
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.19718309859154928,
59
+ "cpu_mem": 1.502232576,
60
+ "gpu_mem": 4.72213504,
61
+ "loss": 1.6244,
62
+ "grad_norm": 26.6319637298584,
63
+ "learning_rate": 0.00015
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.22535211267605634,
68
+ "cpu_mem": 1.502232576,
69
+ "gpu_mem": 4.722093568,
70
+ "loss": 1.3759,
71
+ "grad_norm": 7.972470760345459,
72
+ "learning_rate": 0.0001714285714285714
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.2535211267605634,
77
+ "cpu_mem": 1.502429184,
78
+ "gpu_mem": 4.722095104,
79
+ "loss": 1.398,
80
+ "grad_norm": 12.75944995880127,
81
+ "learning_rate": 0.00019285714285714286
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.28169014084507044,
86
+ "cpu_mem": 1.502429184,
87
+ "gpu_mem": 4.722090496,
88
+ "loss": 1.7315,
89
+ "grad_norm": 20.16497802734375,
90
+ "learning_rate": 0.00021428571428571427
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.30985915492957744,
95
+ "cpu_mem": 1.502429184,
96
+ "gpu_mem": 4.722168832,
97
+ "loss": 1.5034,
98
+ "grad_norm": 12.408662796020508,
99
+ "learning_rate": 0.00023571428571428569
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.3380281690140845,
104
+ "cpu_mem": 1.502429184,
105
+ "gpu_mem": 4.72214272,
106
+ "loss": 1.3497,
107
+ "grad_norm": 6.996767044067383,
108
+ "learning_rate": 0.0002571428571428571
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.36619718309859156,
113
+ "cpu_mem": 1.502429184,
114
+ "gpu_mem": 4.722093568,
115
+ "loss": 2.105,
116
+ "grad_norm": 52.36497116088867,
117
+ "learning_rate": 0.00027857142857142854
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.39436619718309857,
122
+ "cpu_mem": 1.502429184,
123
+ "gpu_mem": 4.722115072,
124
+ "loss": 1.4537,
125
+ "grad_norm": 8.213821411132812,
126
+ "learning_rate": 0.0003
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.4225352112676056,
131
+ "cpu_mem": 1.502429184,
132
+ "gpu_mem": 4.722092032,
133
+ "loss": 1.4046,
134
+ "grad_norm": 8.018381118774414,
135
+ "learning_rate": 0.0002999533773001224
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.4507042253521127,
140
+ "cpu_mem": 1.502429184,
141
+ "gpu_mem": 4.72209664,
142
+ "loss": 1.4606,
143
+ "grad_norm": 8.81755256652832,
144
+ "learning_rate": 0.0002998135381828383
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.4788732394366197,
149
+ "cpu_mem": 1.502429184,
150
+ "gpu_mem": 4.722133504,
151
+ "loss": 1.535,
152
+ "grad_norm": 16.856678009033203,
153
+ "learning_rate": 0.00029958056957717696
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.5070422535211268,
158
+ "cpu_mem": 1.502429184,
159
+ "gpu_mem": 4.722144256,
160
+ "loss": 1.4464,
161
+ "grad_norm": 6.95269775390625,
162
+ "learning_rate": 0.0002992546163048102
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.5352112676056338,
167
+ "cpu_mem": 1.502429184,
168
+ "gpu_mem": 4.722087424,
169
+ "loss": 1.2935,
170
+ "grad_norm": 4.264415264129639,
171
+ "learning_rate": 0.0002988358809900258
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.5633802816901409,
176
+ "cpu_mem": 1.502429184,
177
+ "gpu_mem": 4.72215808,
178
+ "loss": 1.6446,
179
+ "grad_norm": 16.577444076538086,
180
+ "learning_rate": 0.0002983246239337692
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.5915492957746479,
185
+ "cpu_mem": 1.502429184,
186
+ "gpu_mem": 4.722156544,
187
+ "loss": 1.4246,
188
+ "grad_norm": 9.108543395996094,
189
+ "learning_rate": 0.0002977211629518312
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.6197183098591549,
194
+ "cpu_mem": 1.502429184,
195
+ "gpu_mem": 4.722113536,
196
+ "loss": 1.4405,
197
+ "grad_norm": 8.338932991027832,
198
+ "learning_rate": 0.00029702587317728153
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.647887323943662,
203
+ "cpu_mem": 1.502429184,
204
+ "gpu_mem": 4.722130432,
205
+ "loss": 1.3325,
206
+ "grad_norm": 4.5872039794921875,
207
+ "learning_rate": 0.0002962391868272735
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.676056338028169,
212
+ "cpu_mem": 1.502429184,
213
+ "gpu_mem": 4.722087424,
214
+ "loss": 1.346,
215
+ "grad_norm": 3.732668161392212,
216
+ "learning_rate": 0.00029536159293436166
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.704225352112676,
221
+ "cpu_mem": 1.502429184,
222
+ "gpu_mem": 4.722116608,
223
+ "loss": 1.4111,
224
+ "grad_norm": 3.657146692276001,
225
+ "learning_rate": 0.00029439363704250176
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.7323943661971831,
230
+ "cpu_mem": 1.502429184,
231
+ "gpu_mem": 4.72209664,
232
+ "loss": 1.5039,
233
+ "grad_norm": 3.2886135578155518,
234
+ "learning_rate": 0.00029333592086792107
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.7605633802816901,
239
+ "cpu_mem": 1.502429184,
240
+ "gpu_mem": 4.722122752,
241
+ "loss": 1.358,
242
+ "grad_norm": 2.674607276916504,
243
+ "learning_rate": 0.0002921891019250697
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.7887323943661971,
248
+ "cpu_mem": 1.502429184,
249
+ "gpu_mem": 4.722122752,
250
+ "loss": 1.4135,
251
+ "grad_norm": 3.7271816730499268,
252
+ "learning_rate": 0.0002909538931178862
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.8169014084507042,
257
+ "cpu_mem": 1.502625792,
258
+ "gpu_mem": 4.722101248,
259
+ "loss": 1.2997,
260
+ "grad_norm": 3.6826894283294678,
261
+ "learning_rate": 0.00028963106229663063
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.8450704225352113,
266
+ "cpu_mem": 1.502625792,
267
+ "gpu_mem": 4.722092032,
268
+ "loss": 1.3454,
269
+ "grad_norm": 2.921555757522583,
270
+ "learning_rate": 0.00028822143178056114
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.8732394366197183,
275
+ "cpu_mem": 1.502625792,
276
+ "gpu_mem": 4.722110464,
277
+ "loss": 1.4119,
278
+ "grad_norm": 6.010624885559082,
279
+ "learning_rate": 0.00028672587784675096
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.9014084507042254,
284
+ "cpu_mem": 1.502625792,
285
+ "gpu_mem": 4.722133504,
286
+ "loss": 1.3421,
287
+ "grad_norm": 3.9047536849975586,
288
+ "learning_rate": 0.0002851453301853628
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.9295774647887324,
293
+ "cpu_mem": 1.502625792,
294
+ "gpu_mem": 4.722130432,
295
+ "loss": 1.38,
296
+ "grad_norm": 3.3717494010925293,
297
+ "learning_rate": 0.00028348077132172027
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.9577464788732394,
302
+ "cpu_mem": 1.502625792,
303
+ "gpu_mem": 4.722133504,
304
+ "loss": 1.3892,
305
+ "grad_norm": 2.1161556243896484,
306
+ "learning_rate": 0.0002817332360055343
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.9859154929577465,
311
+ "cpu_mem": 1.502625792,
312
+ "gpu_mem": 4.722115072,
313
+ "loss": 1.3076,
314
+ "grad_norm": 1.9025923013687134,
315
+ "learning_rate": 0.0002799038105676658
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 1.0140845070422535,
320
+ "cpu_mem": 1.502625792,
321
+ "gpu_mem": 4.82366208,
322
+ "loss": 1.9438,
323
+ "grad_norm": 4.72139835357666,
324
+ "learning_rate": 0.0002779936322448233
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 1.0422535211267605,
329
+ "cpu_mem": 1.502625792,
330
+ "gpu_mem": 4.823666688,
331
+ "loss": 1.3326,
332
+ "grad_norm": 4.325397968292236,
333
+ "learning_rate": 0.0002760038884726157
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 1.0704225352112675,
338
+ "cpu_mem": 1.502625792,
339
+ "gpu_mem": 4.823645184,
340
+ "loss": 1.2542,
341
+ "grad_norm": 3.8574061393737793,
342
+ "learning_rate": 0.00027393581614739923
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 1.0985915492957747,
347
+ "cpu_mem": 1.502625792,
348
+ "gpu_mem": 4.823634432,
349
+ "loss": 1.3937,
350
+ "grad_norm": 8.146137237548828,
351
+ "learning_rate": 0.0002717907008573785
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 1.1267605633802817,
356
+ "cpu_mem": 1.502625792,
357
+ "gpu_mem": 4.823697408,
358
+ "loss": 1.398,
359
+ "grad_norm": 4.23073148727417,
360
+ "learning_rate": 0.0002695698760834384
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 1.1549295774647887,
365
+ "cpu_mem": 1.502625792,
366
+ "gpu_mem": 4.823657472,
367
+ "loss": 1.5805,
368
+ "grad_norm": 9.995660781860352,
369
+ "learning_rate": 0.00026727472237020447
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 1.1830985915492958,
374
+ "cpu_mem": 1.502625792,
375
+ "gpu_mem": 4.82370048,
376
+ "loss": 1.3628,
377
+ "grad_norm": 3.161339521408081,
378
+ "learning_rate": 0.00026490666646784665
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 1.2112676056338028,
383
+ "cpu_mem": 1.502625792,
384
+ "gpu_mem": 4.823649792,
385
+ "loss": 1.4042,
386
+ "grad_norm": 4.245759010314941,
387
+ "learning_rate": 0.0002624671804451601
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 1.2394366197183098,
392
+ "cpu_mem": 1.502625792,
393
+ "gpu_mem": 4.823714304,
394
+ "loss": 1.3295,
395
+ "grad_norm": 3.277341365814209,
396
+ "learning_rate": 0.0002599577807744739
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 1.267605633802817,
401
+ "cpu_mem": 1.502625792,
402
+ "gpu_mem": 4.823682048,
403
+ "loss": 1.4885,
404
+ "grad_norm": 4.2645487785339355,
405
+ "learning_rate": 0.0002573800273889577
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 1.295774647887324,
410
+ "cpu_mem": 1.502625792,
411
+ "gpu_mem": 4.823686656,
412
+ "loss": 1.4298,
413
+ "grad_norm": 5.765285015106201,
414
+ "learning_rate": 0.0002547355227129109
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 1.323943661971831,
419
+ "cpu_mem": 1.502625792,
420
+ "gpu_mem": 4.823632896,
421
+ "loss": 1.3327,
422
+ "grad_norm": 3.2660598754882812,
423
+ "learning_rate": 0.00025202591066563786
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 1.352112676056338,
428
+ "cpu_mem": 1.502625792,
429
+ "gpu_mem": 4.82364672,
430
+ "loss": 1.3981,
431
+ "grad_norm": 4.678296089172363,
432
+ "learning_rate": 0.0002492528756395289
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 1.380281690140845,
437
+ "cpu_mem": 1.502625792,
438
+ "gpu_mem": 4.823635968,
439
+ "loss": 2.184,
440
+ "grad_norm": 293.4337463378906,
441
+ "learning_rate": 0.0002464181414529809
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 1.408450704225352,
446
+ "cpu_mem": 1.502625792,
447
+ "gpu_mem": 4.823649792,
448
+ "loss": 1.4052,
449
+ "grad_norm": 5.673724174499512,
450
+ "learning_rate": 0.00024352347027881003
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 1.436619718309859,
455
+ "cpu_mem": 1.502625792,
456
+ "gpu_mem": 4.823702016,
457
+ "loss": 1.4252,
458
+ "grad_norm": 6.799444198608398,
459
+ "learning_rate": 0.0002405706615488216
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 1.4647887323943662,
464
+ "cpu_mem": 1.502625792,
465
+ "gpu_mem": 4.823649792,
466
+ "loss": 1.3539,
467
+ "grad_norm": 2.679464340209961,
468
+ "learning_rate": 0.00023756155083521846
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 1.4929577464788732,
473
+ "cpu_mem": 1.502625792,
474
+ "gpu_mem": 4.823718912,
475
+ "loss": 1.4057,
476
+ "grad_norm": 4.1786208152771,
477
+ "learning_rate": 0.00023449800870954326
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 1.5211267605633803,
482
+ "cpu_mem": 1.502625792,
483
+ "gpu_mem": 4.823686656,
484
+ "loss": 1.4722,
485
+ "grad_norm": 5.400444507598877,
486
+ "learning_rate": 0.0002313819395798639
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 1.5492957746478875,
491
+ "cpu_mem": 1.502625792,
492
+ "gpu_mem": 4.823695872,
493
+ "loss": 1.3521,
494
+ "grad_norm": 1.586466908454895,
495
+ "learning_rate": 0.0002282152805069247
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 1.5774647887323945,
500
+ "cpu_mem": 1.502625792,
501
+ "gpu_mem": 4.823671296,
502
+ "loss": 1.3238,
503
+ "grad_norm": 1.901183843612671,
504
+ "learning_rate": 0.000225
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 1.6056338028169015,
509
+ "cpu_mem": 1.502625792,
510
+ "gpu_mem": 4.823705088,
511
+ "loss": 1.3406,
512
+ "grad_norm": 2.266773223876953,
513
+ "learning_rate": 0.00022173809679319772
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 1.6338028169014085,
518
+ "cpu_mem": 1.502625792,
519
+ "gpu_mem": 4.823686656,
520
+ "loss": 1.3164,
521
+ "grad_norm": 0.935617983341217,
522
+ "learning_rate": 0.00021843159860297442
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 1.6619718309859155,
527
+ "cpu_mem": 1.502625792,
528
+ "gpu_mem": 4.823672832,
529
+ "loss": 1.3413,
530
+ "grad_norm": 1.1456469297409058,
531
+ "learning_rate": 0.00021508256086763368
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 1.6901408450704225,
536
+ "cpu_mem": 1.502625792,
537
+ "gpu_mem": 4.823711232,
538
+ "loss": 1.2904,
539
+ "grad_norm": 1.728401780128479,
540
+ "learning_rate": 0.00021169306546959174
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 1.7183098591549295,
545
+ "cpu_mem": 1.502625792,
546
+ "gpu_mem": 4.823643648,
547
+ "loss": 1.3529,
548
+ "grad_norm": 3.0350029468536377,
549
+ "learning_rate": 0.0002082652194412042
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 1.7464788732394365,
554
+ "cpu_mem": 1.502625792,
555
+ "gpu_mem": 4.823691264,
556
+ "loss": 1.4544,
557
+ "grad_norm": 3.896820068359375,
558
+ "learning_rate": 0.00020480115365495926
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 1.7746478873239435,
563
+ "cpu_mem": 1.502625792,
564
+ "gpu_mem": 4.823640576,
565
+ "loss": 1.4155,
566
+ "grad_norm": 3.300171136856079,
567
+ "learning_rate": 0.00020130302149885031
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 1.8028169014084507,
572
+ "cpu_mem": 1.502625792,
573
+ "gpu_mem": 4.823689728,
574
+ "loss": 1.3494,
575
+ "grad_norm": 2.4495275020599365,
576
+ "learning_rate": 0.00019777299753775265
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 1.8309859154929577,
581
+ "cpu_mem": 1.502625792,
582
+ "gpu_mem": 4.823688192,
583
+ "loss": 1.4032,
584
+ "grad_norm": 3.058351993560791,
585
+ "learning_rate": 0.00019421327616163563
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 1.8591549295774648,
590
+ "cpu_mem": 1.502625792,
591
+ "gpu_mem": 4.823706624,
592
+ "loss": 1.4233,
593
+ "grad_norm": 4.7667107582092285,
594
+ "learning_rate": 0.00019062607022145078
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 1.887323943661972,
599
+ "cpu_mem": 1.502625792,
600
+ "gpu_mem": 4.823648256,
601
+ "loss": 1.3687,
602
+ "grad_norm": 5.0993218421936035,
603
+ "learning_rate": 0.00018701360965354402
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 1.915492957746479,
608
+ "cpu_mem": 1.502625792,
609
+ "gpu_mem": 4.823660544,
610
+ "loss": 20.6412,
611
+ "grad_norm": 727.7623901367188,
612
+ "learning_rate": 0.00018337814009344714
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 1.943661971830986,
617
+ "cpu_mem": 1.502625792,
618
+ "gpu_mem": 4.82368512,
619
+ "loss": 5.2158,
620
+ "grad_norm": 216.47073364257812,
621
+ "learning_rate": 0.0001797219214799096
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 1.971830985915493,
626
+ "cpu_mem": 1.502625792,
627
+ "gpu_mem": 4.82366208,
628
+ "loss": 1.9586,
629
+ "grad_norm": 153.92445373535156,
630
+ "learning_rate": 0.00017604722665003956
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 2.0,
635
+ "cpu_mem": 1.502625792,
636
+ "gpu_mem": 4.823497728,
637
+ "loss": 2.0123,
638
+ "grad_norm": 5.384662628173828,
639
+ "learning_rate": 0.00017235633992642615
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 2.028169014084507,
644
+ "cpu_mem": 1.502625792,
645
+ "gpu_mem": 4.722125824,
646
+ "loss": 1.3851,
647
+ "grad_norm": 2.2103421688079834,
648
+ "learning_rate": 0.00016865155569712278
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 2.056338028169014,
653
+ "cpu_mem": 1.502625792,
654
+ "gpu_mem": 4.72208896,
655
+ "loss": 1.3777,
656
+ "grad_norm": 1.1181278228759766,
657
+ "learning_rate": 0.0001649351769893725
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 2.084507042253521,
662
+ "cpu_mem": 1.502625792,
663
+ "gpu_mem": 4.722148864,
664
+ "loss": 1.3305,
665
+ "grad_norm": 0.8732612133026123,
666
+ "learning_rate": 0.00016120951403796364
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 2.112676056338028,
671
+ "cpu_mem": 1.502625792,
672
+ "gpu_mem": 4.722116608,
673
+ "loss": 1.344,
674
+ "grad_norm": 1.5410144329071045,
675
+ "learning_rate": 0.00015747688284910457
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 2.140845070422535,
680
+ "cpu_mem": 1.502625792,
681
+ "gpu_mem": 4.72212736,
682
+ "loss": 1.2997,
683
+ "grad_norm": 0.754600465297699,
684
+ "learning_rate": 0.00015373960376071093
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 2.169014084507042,
689
+ "cpu_mem": 1.502625792,
690
+ "gpu_mem": 4.722164224,
691
+ "loss": 1.3713,
692
+ "grad_norm": 1.0481202602386475,
693
+ "learning_rate": 0.00015
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 2.1971830985915495,
698
+ "cpu_mem": 1.502625792,
699
+ "gpu_mem": 4.722148864,
700
+ "loss": 1.4209,
701
+ "grad_norm": 2.4341037273406982,
702
+ "learning_rate": 0.00014626039623928907
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 2.2253521126760565,
707
+ "cpu_mem": 1.502625792,
708
+ "gpu_mem": 4.722099712,
709
+ "loss": 1.291,
710
+ "grad_norm": 1.7542685270309448,
711
+ "learning_rate": 0.0001425231171508954
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 2.2535211267605635,
716
+ "cpu_mem": 1.502625792,
717
+ "gpu_mem": 4.722144256,
718
+ "loss": 1.346,
719
+ "grad_norm": 1.6453899145126343,
720
+ "learning_rate": 0.00013879048596203636
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 2.2816901408450705,
725
+ "cpu_mem": 1.502625792,
726
+ "gpu_mem": 4.722130432,
727
+ "loss": 1.4232,
728
+ "grad_norm": 2.9081246852874756,
729
+ "learning_rate": 0.0001350648230106275
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 2.3098591549295775,
734
+ "cpu_mem": 1.502625792,
735
+ "gpu_mem": 4.722098176,
736
+ "loss": 1.4118,
737
+ "grad_norm": 3.444525718688965,
738
+ "learning_rate": 0.00013134844430287725
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 2.3380281690140845,
743
+ "cpu_mem": 1.502625792,
744
+ "gpu_mem": 4.722148864,
745
+ "loss": 1.3006,
746
+ "grad_norm": 2.5711867809295654,
747
+ "learning_rate": 0.0001276436600735738
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 2.3661971830985915,
752
+ "cpu_mem": 1.502625792,
753
+ "gpu_mem": 4.722087424,
754
+ "loss": 1.4037,
755
+ "grad_norm": 2.308828115463257,
756
+ "learning_rate": 0.00012395277334996044
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 2.3943661971830985,
761
+ "cpu_mem": 1.502625792,
762
+ "gpu_mem": 4.722133504,
763
+ "loss": 1.3524,
764
+ "grad_norm": 2.286288261413574,
765
+ "learning_rate": 0.00012027807852009038
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 2.4225352112676055,
770
+ "cpu_mem": 1.502625792,
771
+ "gpu_mem": 4.722087424,
772
+ "loss": 1.3171,
773
+ "grad_norm": 0.8145406246185303,
774
+ "learning_rate": 0.00011662185990655284
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 2.4507042253521125,
779
+ "cpu_mem": 1.502625792,
780
+ "gpu_mem": 4.722118144,
781
+ "loss": 1.4024,
782
+ "grad_norm": 2.189497232437134,
783
+ "learning_rate": 0.00011298639034645593
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 2.4788732394366195,
788
+ "cpu_mem": 1.502625792,
789
+ "gpu_mem": 4.722093568,
790
+ "loss": 1.3276,
791
+ "grad_norm": 2.225213050842285,
792
+ "learning_rate": 0.00010937392977854923
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 2.507042253521127,
797
+ "cpu_mem": 1.502625792,
798
+ "gpu_mem": 4.722147328,
799
+ "loss": 1.3019,
800
+ "grad_norm": 1.4794793128967285,
801
+ "learning_rate": 0.00010578672383836435
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 2.535211267605634,
806
+ "cpu_mem": 1.502625792,
807
+ "gpu_mem": 4.722128896,
808
+ "loss": 1.3902,
809
+ "grad_norm": 2.832213878631592,
810
+ "learning_rate": 0.00010222700246224735
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 2.563380281690141,
815
+ "cpu_mem": 1.502625792,
816
+ "gpu_mem": 4.722078208,
817
+ "loss": 1.3622,
818
+ "grad_norm": 2.659364700317383,
819
+ "learning_rate": 9.869697850114969e-05
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 2.591549295774648,
824
+ "cpu_mem": 1.502625792,
825
+ "gpu_mem": 4.722102784,
826
+ "loss": 1.3862,
827
+ "grad_norm": 2.9223179817199707,
828
+ "learning_rate": 9.519884634504074e-05
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 2.619718309859155,
833
+ "cpu_mem": 1.502625792,
834
+ "gpu_mem": 4.722105856,
835
+ "loss": 1.3254,
836
+ "grad_norm": 2.7396914958953857,
837
+ "learning_rate": 9.17347805587958e-05
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 2.647887323943662,
842
+ "cpu_mem": 1.502625792,
843
+ "gpu_mem": 4.722098176,
844
+ "loss": 1.3504,
845
+ "grad_norm": 2.2619333267211914,
846
+ "learning_rate": 8.830693453040829e-05
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 2.676056338028169,
851
+ "cpu_mem": 1.502625792,
852
+ "gpu_mem": 4.722136576,
853
+ "loss": 1.375,
854
+ "grad_norm": 2.095527172088623,
855
+ "learning_rate": 8.491743913236628e-05
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 2.704225352112676,
860
+ "cpu_mem": 1.502625792,
861
+ "gpu_mem": 4.722145792,
862
+ "loss": 1.2863,
863
+ "grad_norm": 2.3975515365600586,
864
+ "learning_rate": 8.156840139702554e-05
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 2.732394366197183,
869
+ "cpu_mem": 1.502625792,
870
+ "gpu_mem": 4.72208896,
871
+ "loss": 1.3655,
872
+ "grad_norm": 2.111492395401001,
873
+ "learning_rate": 7.82619032068023e-05
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 2.76056338028169,
878
+ "cpu_mem": 1.502625792,
879
+ "gpu_mem": 4.72208896,
880
+ "loss": 1.3394,
881
+ "grad_norm": 0.9519637823104858,
882
+ "learning_rate": 7.500000000000002e-05
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 2.788732394366197,
887
+ "cpu_mem": 1.502625792,
888
+ "gpu_mem": 4.722085888,
889
+ "loss": 1.2922,
890
+ "grad_norm": 1.1138625144958496,
891
+ "learning_rate": 7.17847194930753e-05
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 2.816901408450704,
896
+ "cpu_mem": 1.502625792,
897
+ "gpu_mem": 4.722084352,
898
+ "loss": 1.2824,
899
+ "grad_norm": 1.63324773311615,
900
+ "learning_rate": 6.86180604201361e-05
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 2.845070422535211,
905
+ "cpu_mem": 1.502625792,
906
+ "gpu_mem": 4.72212736,
907
+ "loss": 1.2767,
908
+ "grad_norm": 1.6835689544677734,
909
+ "learning_rate": 6.550199129045668e-05
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 2.873239436619718,
914
+ "cpu_mem": 1.502625792,
915
+ "gpu_mem": 4.72206592,
916
+ "loss": 1.3731,
917
+ "grad_norm": 2.8863682746887207,
918
+ "learning_rate": 6.243844916478155e-05
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 2.9014084507042255,
923
+ "cpu_mem": 1.502625792,
924
+ "gpu_mem": 4.722115072,
925
+ "loss": 1.3284,
926
+ "grad_norm": 1.9598286151885986,
927
+ "learning_rate": 5.9429338451178355e-05
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 2.9295774647887325,
932
+ "cpu_mem": 1.502625792,
933
+ "gpu_mem": 4.722178048,
934
+ "loss": 1.3901,
935
+ "grad_norm": 3.4294772148132324,
936
+ "learning_rate": 5.6476529721189974e-05
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 2.9577464788732395,
941
+ "cpu_mem": 1.502625792,
942
+ "gpu_mem": 4.722130432,
943
+ "loss": 1.3027,
944
+ "grad_norm": 1.786590814590454,
945
+ "learning_rate": 5.358185854701909e-05
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 2.9859154929577465,
950
+ "cpu_mem": 1.502625792,
951
+ "gpu_mem": 4.722112,
952
+ "loss": 1.3242,
953
+ "grad_norm": 1.6533207893371582,
954
+ "learning_rate": 5.074712436047112e-05
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 3.0140845070422535,
959
+ "cpu_mem": 1.502625792,
960
+ "gpu_mem": 4.82368512,
961
+ "loss": 1.8733,
962
+ "grad_norm": 3.4330568313598633,
963
+ "learning_rate": 4.7974089334362057e-05
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 3.0422535211267605,
968
+ "cpu_mem": 1.502625792,
969
+ "gpu_mem": 4.823666688,
970
+ "loss": 1.2738,
971
+ "grad_norm": 1.685111403465271,
972
+ "learning_rate": 4.526447728708908e-05
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 3.0704225352112675,
977
+ "cpu_mem": 1.502625792,
978
+ "gpu_mem": 4.823655936,
979
+ "loss": 1.3244,
980
+ "grad_norm": 1.720697045326233,
981
+ "learning_rate": 4.261997261104223e-05
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 3.0985915492957745,
986
+ "cpu_mem": 1.502625792,
987
+ "gpu_mem": 4.823709696,
988
+ "loss": 1.2854,
989
+ "grad_norm": 2.801140308380127,
990
+ "learning_rate": 4.004221922552608e-05
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 3.1267605633802815,
995
+ "cpu_mem": 1.502625792,
996
+ "gpu_mem": 4.82366976,
997
+ "loss": 1.3167,
998
+ "grad_norm": 1.9198555946350098,
999
+ "learning_rate": 3.753281955483985e-05
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 3.1549295774647885,
1004
+ "cpu_mem": 1.502625792,
1005
+ "gpu_mem": 4.823688192,
1006
+ "loss": 1.2922,
1007
+ "grad_norm": 1.4882051944732666,
1008
+ "learning_rate": 3.509333353215331e-05
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 3.183098591549296,
1013
+ "cpu_mem": 1.502625792,
1014
+ "gpu_mem": 4.823751168,
1015
+ "loss": 1.2862,
1016
+ "grad_norm": 1.7628090381622314,
1017
+ "learning_rate": 3.2725277629795526e-05
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 3.211267605633803,
1022
+ "cpu_mem": 1.502625792,
1023
+ "gpu_mem": 4.823678976,
1024
+ "loss": 1.3162,
1025
+ "grad_norm": 1.6963146924972534,
1026
+ "learning_rate": 3.0430123916561672e-05
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 3.23943661971831,
1031
+ "cpu_mem": 1.502625792,
1032
+ "gpu_mem": 4.823672832,
1033
+ "loss": 1.3671,
1034
+ "grad_norm": 2.351639747619629,
1035
+ "learning_rate": 2.8209299142621522e-05
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 3.267605633802817,
1040
+ "cpu_mem": 1.502625792,
1041
+ "gpu_mem": 4.823688192,
1042
+ "loss": 1.3129,
1043
+ "grad_norm": 1.3395206928253174,
1044
+ "learning_rate": 2.6064183852600797e-05
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 3.295774647887324,
1049
+ "cpu_mem": 1.502625792,
1050
+ "gpu_mem": 4.823703552,
1051
+ "loss": 1.276,
1052
+ "grad_norm": 1.8355594873428345,
1053
+ "learning_rate": 2.3996111527384288e-05
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 3.323943661971831,
1058
+ "cpu_mem": 1.502625792,
1059
+ "gpu_mem": 4.823694336,
1060
+ "loss": 1.2855,
1061
+ "grad_norm": 1.5066239833831787,
1062
+ "learning_rate": 2.2006367755176655e-05
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 3.352112676056338,
1067
+ "cpu_mem": 1.502625792,
1068
+ "gpu_mem": 4.82368512,
1069
+ "loss": 1.2956,
1070
+ "grad_norm": 1.6072317361831665,
1071
+ "learning_rate": 2.009618943233419e-05
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 3.380281690140845,
1076
+ "cpu_mem": 1.502625792,
1077
+ "gpu_mem": 4.823703552,
1078
+ "loss": 1.2925,
1079
+ "grad_norm": 1.6089941263198853,
1080
+ "learning_rate": 1.82667639944657e-05
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 3.408450704225352,
1085
+ "cpu_mem": 1.502625792,
1086
+ "gpu_mem": 4.823702016,
1087
+ "loss": 1.2467,
1088
+ "grad_norm": 1.8599209785461426,
1089
+ "learning_rate": 1.6519228678279718e-05
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 3.436619718309859,
1094
+ "cpu_mem": 1.502625792,
1095
+ "gpu_mem": 4.823659008,
1096
+ "loss": 1.2741,
1097
+ "grad_norm": 1.5860140323638916,
1098
+ "learning_rate": 1.4854669814637143e-05
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 3.464788732394366,
1103
+ "cpu_mem": 1.502625792,
1104
+ "gpu_mem": 4.823691264,
1105
+ "loss": 1.2627,
1106
+ "grad_norm": 1.814361810684204,
1107
+ "learning_rate": 1.3274122153249028e-05
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 3.492957746478873,
1112
+ "cpu_mem": 1.502625792,
1113
+ "gpu_mem": 4.823645184,
1114
+ "loss": 1.2697,
1115
+ "grad_norm": 1.6075295209884644,
1116
+ "learning_rate": 1.1778568219438839e-05
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 3.52112676056338,
1121
+ "cpu_mem": 1.502625792,
1122
+ "gpu_mem": 4.823689728,
1123
+ "loss": 1.2082,
1124
+ "grad_norm": 2.1656956672668457,
1125
+ "learning_rate": 1.036893770336938e-05
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 3.5492957746478875,
1130
+ "cpu_mem": 1.502625792,
1131
+ "gpu_mem": 4.823640576,
1132
+ "loss": 1.2388,
1133
+ "grad_norm": 1.7753299474716187,
1134
+ "learning_rate": 9.046106882113751e-06
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 3.5774647887323945,
1139
+ "cpu_mem": 1.502625792,
1140
+ "gpu_mem": 4.823652864,
1141
+ "loss": 1.2669,
1142
+ "grad_norm": 1.6709108352661133,
1143
+ "learning_rate": 7.810898074930243e-06
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 3.6056338028169015,
1148
+ "cpu_mem": 1.502625792,
1149
+ "gpu_mem": 4.82367744,
1150
+ "loss": 1.28,
1151
+ "grad_norm": 2.1730709075927734,
1152
+ "learning_rate": 6.664079132078881e-06
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 3.6338028169014085,
1157
+ "cpu_mem": 1.502625792,
1158
+ "gpu_mem": 4.82363904,
1159
+ "loss": 1.2608,
1160
+ "grad_norm": 1.86732017993927,
1161
+ "learning_rate": 5.606362957498195e-06
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 3.6619718309859155,
1166
+ "cpu_mem": 1.502625792,
1167
+ "gpu_mem": 4.823642112,
1168
+ "loss": 1.3212,
1169
+ "grad_norm": 2.1604793071746826,
1170
+ "learning_rate": 4.638407065638322e-06
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 3.6901408450704225,
1175
+ "cpu_mem": 1.502625792,
1176
+ "gpu_mem": 4.8236544,
1177
+ "loss": 1.2794,
1178
+ "grad_norm": 2.1699371337890625,
1179
+ "learning_rate": 3.760813172726457e-06
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 3.7183098591549295,
1184
+ "cpu_mem": 1.502625792,
1185
+ "gpu_mem": 4.823619072,
1186
+ "loss": 1.2438,
1187
+ "grad_norm": 1.8384634256362915,
1188
+ "learning_rate": 2.9741268227184255e-06
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 3.7464788732394365,
1193
+ "cpu_mem": 1.502625792,
1194
+ "gpu_mem": 4.823660544,
1195
+ "loss": 1.206,
1196
+ "grad_norm": 1.811904788017273,
1197
+ "learning_rate": 2.2788370481687965e-06
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 3.7746478873239435,
1202
+ "cpu_mem": 1.502625792,
1203
+ "gpu_mem": 4.823675904,
1204
+ "loss": 1.2126,
1205
+ "grad_norm": 1.9175496101379395,
1206
+ "learning_rate": 1.6753760662307215e-06
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 3.802816901408451,
1211
+ "cpu_mem": 1.502625792,
1212
+ "gpu_mem": 4.823640576,
1213
+ "loss": 1.3218,
1214
+ "grad_norm": 2.67806077003479,
1215
+ "learning_rate": 1.1641190099741904e-06
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 3.830985915492958,
1220
+ "cpu_mem": 1.502625792,
1221
+ "gpu_mem": 4.823648256,
1222
+ "loss": 1.2947,
1223
+ "grad_norm": 2.2404983043670654,
1224
+ "learning_rate": 7.453836951897885e-07
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 3.859154929577465,
1229
+ "cpu_mem": 1.502625792,
1230
+ "gpu_mem": 4.82366976,
1231
+ "loss": 1.2894,
1232
+ "grad_norm": 2.716714382171631,
1233
+ "learning_rate": 4.194304228229806e-07
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 3.887323943661972,
1238
+ "cpu_mem": 1.502625792,
1239
+ "gpu_mem": 4.823680512,
1240
+ "loss": 1.2933,
1241
+ "grad_norm": 1.7514668703079224,
1242
+ "learning_rate": 1.8646181716164831e-07
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 3.915492957746479,
1247
+ "cpu_mem": 1.502625792,
1248
+ "gpu_mem": 4.823672832,
1249
+ "loss": 1.3346,
1250
+ "grad_norm": 1.9952417612075806,
1251
+ "learning_rate": 4.662269987756317e-08
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 3.943661971830986,
1256
+ "cpu_mem": 1.502625792,
1257
+ "gpu_mem": 4.823706624,
1258
+ "loss": 1.3088,
1259
+ "grad_norm": 2.4654135704040527,
1260
+ "learning_rate": 0.0
1261
+ },
1262
+ {
1263
+ "step": 140,
1264
+ "epoch": 3.943661971830986,
1265
+ "cpu_mem": 1.502625792,
1266
+ "gpu_mem": 4.823706624,
1267
+ "train_runtime": 678.9966,
1268
+ "train_samples_per_second": 13.261,
1269
+ "train_steps_per_second": 0.206,
1270
+ "total_flos": 0.0,
1271
+ "train_loss": 1.6106574450220381
1272
+ }
1273
+ ]
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 8,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_e",
3
+ "results": 0.6611952861952862
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_E",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 6317696
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-arc_e-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T22:53:40.430500"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-arc_e-r8-a2/training_logs.json ADDED
@@ -0,0 +1,1273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.028169014084507043,
5
+ "cpu_mem": 1.488003072,
6
+ "gpu_mem": 4.44271872,
7
+ "loss": 4.6319,
8
+ "grad_norm": 276.5605773925781,
9
+ "learning_rate": 2.1428571428571425e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.056338028169014086,
14
+ "cpu_mem": 1.493704704,
15
+ "gpu_mem": 4.493400064,
16
+ "loss": 4.4578,
17
+ "grad_norm": 282.3363952636719,
18
+ "learning_rate": 4.285714285714285e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.08450704225352113,
23
+ "cpu_mem": 1.493901312,
24
+ "gpu_mem": 4.49337856,
25
+ "loss": 2.9702,
26
+ "grad_norm": 613.773193359375,
27
+ "learning_rate": 6.428571428571427e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.11267605633802817,
32
+ "cpu_mem": 1.493901312,
33
+ "gpu_mem": 4.493357056,
34
+ "loss": 1.9283,
35
+ "grad_norm": 45.13872146606445,
36
+ "learning_rate": 8.57142857142857e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.14084507042253522,
41
+ "cpu_mem": 1.49409792,
42
+ "gpu_mem": 4.493398528,
43
+ "loss": 1.493,
44
+ "grad_norm": 16.81772232055664,
45
+ "learning_rate": 0.00010714285714285714
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.16901408450704225,
50
+ "cpu_mem": 1.494294528,
51
+ "gpu_mem": 4.493373952,
52
+ "loss": 1.3415,
53
+ "grad_norm": 12.790365219116211,
54
+ "learning_rate": 0.00012857142857142855
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.19718309859154928,
59
+ "cpu_mem": 1.494294528,
60
+ "gpu_mem": 4.493396992,
61
+ "loss": 1.5003,
62
+ "grad_norm": 25.754161834716797,
63
+ "learning_rate": 0.00015
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.22535211267605634,
68
+ "cpu_mem": 1.494294528,
69
+ "gpu_mem": 4.49335552,
70
+ "loss": 1.3258,
71
+ "grad_norm": 6.1254353523254395,
72
+ "learning_rate": 0.0001714285714285714
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.2535211267605634,
77
+ "cpu_mem": 1.494294528,
78
+ "gpu_mem": 4.493357056,
79
+ "loss": 1.4644,
80
+ "grad_norm": 34.32180404663086,
81
+ "learning_rate": 0.00019285714285714286
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.28169014084507044,
86
+ "cpu_mem": 1.494294528,
87
+ "gpu_mem": 4.493352448,
88
+ "loss": 1.4288,
89
+ "grad_norm": 23.392850875854492,
90
+ "learning_rate": 0.00021428571428571427
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.30985915492957744,
95
+ "cpu_mem": 1.494294528,
96
+ "gpu_mem": 4.493430784,
97
+ "loss": 1.5429,
98
+ "grad_norm": 20.585525512695312,
99
+ "learning_rate": 0.00023571428571428569
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.3380281690140845,
104
+ "cpu_mem": 1.494294528,
105
+ "gpu_mem": 4.493404672,
106
+ "loss": 1.305,
107
+ "grad_norm": 5.136800765991211,
108
+ "learning_rate": 0.0002571428571428571
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.36619718309859156,
113
+ "cpu_mem": 1.494294528,
114
+ "gpu_mem": 4.49335552,
115
+ "loss": 1.3883,
116
+ "grad_norm": 26.17745590209961,
117
+ "learning_rate": 0.00027857142857142854
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.39436619718309857,
122
+ "cpu_mem": 1.494294528,
123
+ "gpu_mem": 4.493377024,
124
+ "loss": 1.4056,
125
+ "grad_norm": 12.29943561553955,
126
+ "learning_rate": 0.0003
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.4225352112676056,
131
+ "cpu_mem": 1.494294528,
132
+ "gpu_mem": 4.493353984,
133
+ "loss": 1.3972,
134
+ "grad_norm": 5.9298176765441895,
135
+ "learning_rate": 0.0002999533773001224
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.4507042253521127,
140
+ "cpu_mem": 1.494294528,
141
+ "gpu_mem": 4.493358592,
142
+ "loss": 1.3433,
143
+ "grad_norm": 2.671994924545288,
144
+ "learning_rate": 0.0002998135381828383
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.4788732394366197,
149
+ "cpu_mem": 1.494294528,
150
+ "gpu_mem": 4.493395456,
151
+ "loss": 1.3402,
152
+ "grad_norm": 5.09276008605957,
153
+ "learning_rate": 0.00029958056957717696
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.5070422535211268,
158
+ "cpu_mem": 1.494491136,
159
+ "gpu_mem": 4.493406208,
160
+ "loss": 1.3272,
161
+ "grad_norm": 4.426205635070801,
162
+ "learning_rate": 0.0002992546163048102
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.5352112676056338,
167
+ "cpu_mem": 1.494491136,
168
+ "gpu_mem": 4.493349376,
169
+ "loss": 1.4243,
170
+ "grad_norm": 10.765657424926758,
171
+ "learning_rate": 0.0002988358809900258
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.5633802816901409,
176
+ "cpu_mem": 1.494491136,
177
+ "gpu_mem": 4.493420032,
178
+ "loss": 1.4339,
179
+ "grad_norm": 7.541872024536133,
180
+ "learning_rate": 0.0002983246239337692
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.5915492957746479,
185
+ "cpu_mem": 1.494491136,
186
+ "gpu_mem": 4.493418496,
187
+ "loss": 1.349,
188
+ "grad_norm": 5.185708522796631,
189
+ "learning_rate": 0.0002977211629518312
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.6197183098591549,
194
+ "cpu_mem": 1.494491136,
195
+ "gpu_mem": 4.493375488,
196
+ "loss": 1.3539,
197
+ "grad_norm": 6.7187676429748535,
198
+ "learning_rate": 0.00029702587317728153
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.647887323943662,
203
+ "cpu_mem": 1.494491136,
204
+ "gpu_mem": 4.493392384,
205
+ "loss": 1.3318,
206
+ "grad_norm": 6.468657493591309,
207
+ "learning_rate": 0.0002962391868272735
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.676056338028169,
212
+ "cpu_mem": 1.494491136,
213
+ "gpu_mem": 4.493349376,
214
+ "loss": 1.306,
215
+ "grad_norm": 3.4460861682891846,
216
+ "learning_rate": 0.00029536159293436166
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.704225352112676,
221
+ "cpu_mem": 1.494491136,
222
+ "gpu_mem": 4.49337856,
223
+ "loss": 1.5276,
224
+ "grad_norm": 12.144696235656738,
225
+ "learning_rate": 0.00029439363704250176
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.7323943661971831,
230
+ "cpu_mem": 1.494491136,
231
+ "gpu_mem": 4.493358592,
232
+ "loss": 1.4864,
233
+ "grad_norm": 5.351874351501465,
234
+ "learning_rate": 0.00029333592086792107
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.7605633802816901,
239
+ "cpu_mem": 1.494491136,
240
+ "gpu_mem": 4.493384704,
241
+ "loss": 1.3675,
242
+ "grad_norm": 5.478825092315674,
243
+ "learning_rate": 0.0002921891019250697
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.7887323943661971,
248
+ "cpu_mem": 1.494491136,
249
+ "gpu_mem": 4.493384704,
250
+ "loss": 1.4199,
251
+ "grad_norm": 5.453819274902344,
252
+ "learning_rate": 0.0002909538931178862
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.8169014084507042,
257
+ "cpu_mem": 1.494491136,
258
+ "gpu_mem": 4.4933632,
259
+ "loss": 1.2599,
260
+ "grad_norm": 2.9942517280578613,
261
+ "learning_rate": 0.00028963106229663063
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.8450704225352113,
266
+ "cpu_mem": 1.494491136,
267
+ "gpu_mem": 4.493353984,
268
+ "loss": 1.3457,
269
+ "grad_norm": 3.5960090160369873,
270
+ "learning_rate": 0.00028822143178056114
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.8732394366197183,
275
+ "cpu_mem": 1.494491136,
276
+ "gpu_mem": 4.493372416,
277
+ "loss": 1.3961,
278
+ "grad_norm": 5.403853416442871,
279
+ "learning_rate": 0.00028672587784675096
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.9014084507042254,
284
+ "cpu_mem": 1.494491136,
285
+ "gpu_mem": 4.493395456,
286
+ "loss": 1.3711,
287
+ "grad_norm": 6.018552303314209,
288
+ "learning_rate": 0.0002851453301853628
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.9295774647887324,
293
+ "cpu_mem": 1.494491136,
294
+ "gpu_mem": 4.493392384,
295
+ "loss": 1.3374,
296
+ "grad_norm": 2.8503122329711914,
297
+ "learning_rate": 0.00028348077132172027
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.9577464788732394,
302
+ "cpu_mem": 1.494491136,
303
+ "gpu_mem": 4.493395456,
304
+ "loss": 1.386,
305
+ "grad_norm": 4.291422367095947,
306
+ "learning_rate": 0.0002817332360055343
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.9859154929577465,
311
+ "cpu_mem": 1.494491136,
312
+ "gpu_mem": 4.493377024,
313
+ "loss": 1.3118,
314
+ "grad_norm": 3.1133341789245605,
315
+ "learning_rate": 0.0002799038105676658
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 1.0140845070422535,
320
+ "cpu_mem": 1.494491136,
321
+ "gpu_mem": 4.518678016,
322
+ "loss": 1.9673,
323
+ "grad_norm": 5.740923881530762,
324
+ "learning_rate": 0.0002779936322448233
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 1.0422535211267605,
329
+ "cpu_mem": 1.494491136,
330
+ "gpu_mem": 4.518682624,
331
+ "loss": 1.339,
332
+ "grad_norm": 3.4668984413146973,
333
+ "learning_rate": 0.0002760038884726157
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 1.0704225352112675,
338
+ "cpu_mem": 1.494491136,
339
+ "gpu_mem": 4.51866112,
340
+ "loss": 1.2514,
341
+ "grad_norm": 3.128293514251709,
342
+ "learning_rate": 0.00027393581614739923
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 1.0985915492957747,
347
+ "cpu_mem": 1.494491136,
348
+ "gpu_mem": 4.518650368,
349
+ "loss": 1.3329,
350
+ "grad_norm": 4.251654148101807,
351
+ "learning_rate": 0.0002717907008573785
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 1.1267605633802817,
356
+ "cpu_mem": 1.494491136,
357
+ "gpu_mem": 4.518713344,
358
+ "loss": 1.3431,
359
+ "grad_norm": 3.2362022399902344,
360
+ "learning_rate": 0.0002695698760834384
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 1.1549295774647887,
365
+ "cpu_mem": 1.494491136,
366
+ "gpu_mem": 4.518673408,
367
+ "loss": 1.3388,
368
+ "grad_norm": 3.327332019805908,
369
+ "learning_rate": 0.00026727472237020447
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 1.1830985915492958,
374
+ "cpu_mem": 1.494491136,
375
+ "gpu_mem": 4.518716416,
376
+ "loss": 1.3812,
377
+ "grad_norm": 3.0941553115844727,
378
+ "learning_rate": 0.00026490666646784665
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 1.2112676056338028,
383
+ "cpu_mem": 1.494491136,
384
+ "gpu_mem": 4.518665728,
385
+ "loss": 1.3735,
386
+ "grad_norm": 2.356278419494629,
387
+ "learning_rate": 0.0002624671804451601
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 1.2394366197183098,
392
+ "cpu_mem": 1.494491136,
393
+ "gpu_mem": 4.51873024,
394
+ "loss": 1.3766,
395
+ "grad_norm": 3.27545428276062,
396
+ "learning_rate": 0.0002599577807744739
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 1.267605633802817,
401
+ "cpu_mem": 1.494491136,
402
+ "gpu_mem": 4.518697984,
403
+ "loss": 1.3891,
404
+ "grad_norm": 2.333723306655884,
405
+ "learning_rate": 0.0002573800273889577
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 1.295774647887324,
410
+ "cpu_mem": 1.494491136,
411
+ "gpu_mem": 4.518702592,
412
+ "loss": 1.3937,
413
+ "grad_norm": 3.481844425201416,
414
+ "learning_rate": 0.0002547355227129109
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 1.323943661971831,
419
+ "cpu_mem": 1.494491136,
420
+ "gpu_mem": 4.518648832,
421
+ "loss": 1.3532,
422
+ "grad_norm": 4.956977844238281,
423
+ "learning_rate": 0.00025202591066563786
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 1.352112676056338,
428
+ "cpu_mem": 1.494491136,
429
+ "gpu_mem": 4.518662656,
430
+ "loss": 1.3068,
431
+ "grad_norm": 3.1343212127685547,
432
+ "learning_rate": 0.0002492528756395289
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 1.380281690140845,
437
+ "cpu_mem": 1.494491136,
438
+ "gpu_mem": 4.518651904,
439
+ "loss": 1.3936,
440
+ "grad_norm": 8.055434226989746,
441
+ "learning_rate": 0.0002464181414529809
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 1.408450704225352,
446
+ "cpu_mem": 1.494491136,
447
+ "gpu_mem": 4.518665728,
448
+ "loss": 1.4007,
449
+ "grad_norm": 16.459016799926758,
450
+ "learning_rate": 0.00024352347027881003
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 1.436619718309859,
455
+ "cpu_mem": 1.494491136,
456
+ "gpu_mem": 4.518717952,
457
+ "loss": 1.4211,
458
+ "grad_norm": 16.374134063720703,
459
+ "learning_rate": 0.0002405706615488216
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 1.4647887323943662,
464
+ "cpu_mem": 1.494491136,
465
+ "gpu_mem": 4.518665728,
466
+ "loss": 1.3547,
467
+ "grad_norm": 4.7861762046813965,
468
+ "learning_rate": 0.00023756155083521846
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 1.4929577464788732,
473
+ "cpu_mem": 1.494491136,
474
+ "gpu_mem": 4.518734848,
475
+ "loss": 1.3086,
476
+ "grad_norm": 5.092458724975586,
477
+ "learning_rate": 0.00023449800870954326
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 1.5211267605633803,
482
+ "cpu_mem": 1.494491136,
483
+ "gpu_mem": 4.518702592,
484
+ "loss": 1.3138,
485
+ "grad_norm": 3.623900890350342,
486
+ "learning_rate": 0.0002313819395798639
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 1.5492957746478875,
491
+ "cpu_mem": 1.494491136,
492
+ "gpu_mem": 4.518711808,
493
+ "loss": 1.3956,
494
+ "grad_norm": 4.312674522399902,
495
+ "learning_rate": 0.0002282152805069247
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 1.5774647887323945,
500
+ "cpu_mem": 1.494491136,
501
+ "gpu_mem": 4.518687232,
502
+ "loss": 1.341,
503
+ "grad_norm": 3.958331346511841,
504
+ "learning_rate": 0.000225
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 1.6056338028169015,
509
+ "cpu_mem": 1.494491136,
510
+ "gpu_mem": 4.518721024,
511
+ "loss": 1.3175,
512
+ "grad_norm": 3.097858190536499,
513
+ "learning_rate": 0.00022173809679319772
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 1.6338028169014085,
518
+ "cpu_mem": 1.494491136,
519
+ "gpu_mem": 4.518702592,
520
+ "loss": 1.3063,
521
+ "grad_norm": 3.0172431468963623,
522
+ "learning_rate": 0.00021843159860297442
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 1.6619718309859155,
527
+ "cpu_mem": 1.494491136,
528
+ "gpu_mem": 4.518688768,
529
+ "loss": 1.3144,
530
+ "grad_norm": 2.152859687805176,
531
+ "learning_rate": 0.00021508256086763368
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 1.6901408450704225,
536
+ "cpu_mem": 1.494491136,
537
+ "gpu_mem": 4.518727168,
538
+ "loss": 1.3108,
539
+ "grad_norm": 3.8040716648101807,
540
+ "learning_rate": 0.00021169306546959174
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 1.7183098591549295,
545
+ "cpu_mem": 1.494491136,
546
+ "gpu_mem": 4.518659584,
547
+ "loss": 1.3203,
548
+ "grad_norm": 3.8633904457092285,
549
+ "learning_rate": 0.0002082652194412042
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 1.7464788732394365,
554
+ "cpu_mem": 1.494491136,
555
+ "gpu_mem": 4.5187072,
556
+ "loss": 1.3983,
557
+ "grad_norm": 4.749051094055176,
558
+ "learning_rate": 0.00020480115365495926
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 1.7746478873239435,
563
+ "cpu_mem": 1.494491136,
564
+ "gpu_mem": 4.518656512,
565
+ "loss": 1.3682,
566
+ "grad_norm": 5.358827114105225,
567
+ "learning_rate": 0.00020130302149885031
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 1.8028169014084507,
572
+ "cpu_mem": 1.494491136,
573
+ "gpu_mem": 4.518705664,
574
+ "loss": 1.4047,
575
+ "grad_norm": 5.390880584716797,
576
+ "learning_rate": 0.00019777299753775265
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 1.8309859154929577,
581
+ "cpu_mem": 1.494491136,
582
+ "gpu_mem": 4.518704128,
583
+ "loss": 1.3214,
584
+ "grad_norm": 4.8978447914123535,
585
+ "learning_rate": 0.00019421327616163563
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 1.8591549295774648,
590
+ "cpu_mem": 1.494491136,
591
+ "gpu_mem": 4.51872256,
592
+ "loss": 1.3365,
593
+ "grad_norm": 4.024534225463867,
594
+ "learning_rate": 0.00019062607022145078
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 1.887323943661972,
599
+ "cpu_mem": 1.494491136,
600
+ "gpu_mem": 4.518664192,
601
+ "loss": 1.3593,
602
+ "grad_norm": 10.947258949279785,
603
+ "learning_rate": 0.00018701360965354402
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 1.915492957746479,
608
+ "cpu_mem": 1.494491136,
609
+ "gpu_mem": 4.51867648,
610
+ "loss": 1.3801,
611
+ "grad_norm": 7.698343276977539,
612
+ "learning_rate": 0.00018337814009344714
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 1.943661971830986,
617
+ "cpu_mem": 1.494491136,
618
+ "gpu_mem": 4.518701056,
619
+ "loss": 1.365,
620
+ "grad_norm": 6.346189022064209,
621
+ "learning_rate": 0.0001797219214799096
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 1.971830985915493,
626
+ "cpu_mem": 1.494491136,
627
+ "gpu_mem": 4.518678016,
628
+ "loss": 1.2689,
629
+ "grad_norm": 4.332313060760498,
630
+ "learning_rate": 0.00017604722665003956
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 2.0,
635
+ "cpu_mem": 1.494491136,
636
+ "gpu_mem": 4.518513664,
637
+ "loss": 1.9494,
638
+ "grad_norm": 61.389652252197266,
639
+ "learning_rate": 0.00017235633992642615
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 2.028169014084507,
644
+ "cpu_mem": 1.494491136,
645
+ "gpu_mem": 4.493387776,
646
+ "loss": 1.369,
647
+ "grad_norm": 5.659824848175049,
648
+ "learning_rate": 0.00016865155569712278
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 2.056338028169014,
653
+ "cpu_mem": 1.494491136,
654
+ "gpu_mem": 4.493350912,
655
+ "loss": 1.3145,
656
+ "grad_norm": 4.936837673187256,
657
+ "learning_rate": 0.0001649351769893725
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 2.084507042253521,
662
+ "cpu_mem": 1.494491136,
663
+ "gpu_mem": 4.493410816,
664
+ "loss": 1.2625,
665
+ "grad_norm": 3.075138568878174,
666
+ "learning_rate": 0.00016120951403796364
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 2.112676056338028,
671
+ "cpu_mem": 1.494491136,
672
+ "gpu_mem": 4.49337856,
673
+ "loss": 1.34,
674
+ "grad_norm": 2.8183681964874268,
675
+ "learning_rate": 0.00015747688284910457
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 2.140845070422535,
680
+ "cpu_mem": 1.494491136,
681
+ "gpu_mem": 4.493389312,
682
+ "loss": 1.2405,
683
+ "grad_norm": 2.041452169418335,
684
+ "learning_rate": 0.00015373960376071093
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 2.169014084507042,
689
+ "cpu_mem": 1.494491136,
690
+ "gpu_mem": 4.493426176,
691
+ "loss": 1.3003,
692
+ "grad_norm": 3.310304880142212,
693
+ "learning_rate": 0.00015
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 2.1971830985915495,
698
+ "cpu_mem": 1.494491136,
699
+ "gpu_mem": 4.493410816,
700
+ "loss": 1.3301,
701
+ "grad_norm": 4.006730079650879,
702
+ "learning_rate": 0.00014626039623928907
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 2.2253521126760565,
707
+ "cpu_mem": 1.494491136,
708
+ "gpu_mem": 4.493361664,
709
+ "loss": 1.2198,
710
+ "grad_norm": 3.7885594367980957,
711
+ "learning_rate": 0.0001425231171508954
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 2.2535211267605635,
716
+ "cpu_mem": 1.494491136,
717
+ "gpu_mem": 4.493406208,
718
+ "loss": 1.3053,
719
+ "grad_norm": 3.224207639694214,
720
+ "learning_rate": 0.00013879048596203636
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 2.2816901408450705,
725
+ "cpu_mem": 1.494491136,
726
+ "gpu_mem": 4.493392384,
727
+ "loss": 1.3576,
728
+ "grad_norm": 5.85601282119751,
729
+ "learning_rate": 0.0001350648230106275
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 2.3098591549295775,
734
+ "cpu_mem": 1.494491136,
735
+ "gpu_mem": 4.493360128,
736
+ "loss": 1.3497,
737
+ "grad_norm": 6.43959903717041,
738
+ "learning_rate": 0.00013134844430287725
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 2.3380281690140845,
743
+ "cpu_mem": 1.494491136,
744
+ "gpu_mem": 4.493410816,
745
+ "loss": 1.2215,
746
+ "grad_norm": 5.418457508087158,
747
+ "learning_rate": 0.0001276436600735738
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 2.3661971830985915,
752
+ "cpu_mem": 1.494491136,
753
+ "gpu_mem": 4.493349376,
754
+ "loss": 1.3708,
755
+ "grad_norm": 6.882091045379639,
756
+ "learning_rate": 0.00012395277334996044
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 2.3943661971830985,
761
+ "cpu_mem": 1.494491136,
762
+ "gpu_mem": 4.493395456,
763
+ "loss": 1.3319,
764
+ "grad_norm": 6.375625133514404,
765
+ "learning_rate": 0.00012027807852009038
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 2.4225352112676055,
770
+ "cpu_mem": 1.494491136,
771
+ "gpu_mem": 4.493349376,
772
+ "loss": 1.29,
773
+ "grad_norm": 4.537503719329834,
774
+ "learning_rate": 0.00011662185990655284
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 2.4507042253521125,
779
+ "cpu_mem": 1.494491136,
780
+ "gpu_mem": 4.493380096,
781
+ "loss": 1.2986,
782
+ "grad_norm": 5.286020278930664,
783
+ "learning_rate": 0.00011298639034645593
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 2.4788732394366195,
788
+ "cpu_mem": 1.494491136,
789
+ "gpu_mem": 4.49335552,
790
+ "loss": 1.2059,
791
+ "grad_norm": 4.801527500152588,
792
+ "learning_rate": 0.00010937392977854923
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 2.507042253521127,
797
+ "cpu_mem": 1.494491136,
798
+ "gpu_mem": 4.49340928,
799
+ "loss": 1.2417,
800
+ "grad_norm": 4.618600845336914,
801
+ "learning_rate": 0.00010578672383836435
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 2.535211267605634,
806
+ "cpu_mem": 1.494491136,
807
+ "gpu_mem": 4.493390848,
808
+ "loss": 1.3213,
809
+ "grad_norm": 5.864246845245361,
810
+ "learning_rate": 0.00010222700246224735
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 2.563380281690141,
815
+ "cpu_mem": 1.494491136,
816
+ "gpu_mem": 4.49334016,
817
+ "loss": 1.3045,
818
+ "grad_norm": 5.989525318145752,
819
+ "learning_rate": 9.869697850114969e-05
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 2.591549295774648,
824
+ "cpu_mem": 1.494491136,
825
+ "gpu_mem": 4.493364736,
826
+ "loss": 1.3411,
827
+ "grad_norm": 6.808497905731201,
828
+ "learning_rate": 9.519884634504074e-05
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 2.619718309859155,
833
+ "cpu_mem": 1.494491136,
834
+ "gpu_mem": 4.493367808,
835
+ "loss": 1.2999,
836
+ "grad_norm": 7.139451026916504,
837
+ "learning_rate": 9.17347805587958e-05
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 2.647887323943662,
842
+ "cpu_mem": 1.494491136,
843
+ "gpu_mem": 4.493360128,
844
+ "loss": 1.2241,
845
+ "grad_norm": 5.613903522491455,
846
+ "learning_rate": 8.830693453040829e-05
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 2.676056338028169,
851
+ "cpu_mem": 1.494491136,
852
+ "gpu_mem": 4.493398528,
853
+ "loss": 1.2218,
854
+ "grad_norm": 4.103054523468018,
855
+ "learning_rate": 8.491743913236628e-05
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 2.704225352112676,
860
+ "cpu_mem": 1.494491136,
861
+ "gpu_mem": 4.493407744,
862
+ "loss": 1.1553,
863
+ "grad_norm": 4.9666829109191895,
864
+ "learning_rate": 8.156840139702554e-05
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 2.732394366197183,
869
+ "cpu_mem": 1.494491136,
870
+ "gpu_mem": 4.493350912,
871
+ "loss": 1.2102,
872
+ "grad_norm": 4.077977180480957,
873
+ "learning_rate": 7.82619032068023e-05
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 2.76056338028169,
878
+ "cpu_mem": 1.494491136,
879
+ "gpu_mem": 4.493350912,
880
+ "loss": 1.276,
881
+ "grad_norm": 4.605331897735596,
882
+ "learning_rate": 7.500000000000002e-05
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 2.788732394366197,
887
+ "cpu_mem": 1.494491136,
888
+ "gpu_mem": 4.49334784,
889
+ "loss": 1.2602,
890
+ "grad_norm": 7.203280448913574,
891
+ "learning_rate": 7.17847194930753e-05
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 2.816901408450704,
896
+ "cpu_mem": 1.494491136,
897
+ "gpu_mem": 4.493346304,
898
+ "loss": 1.2185,
899
+ "grad_norm": 4.943838119506836,
900
+ "learning_rate": 6.86180604201361e-05
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 2.845070422535211,
905
+ "cpu_mem": 1.494491136,
906
+ "gpu_mem": 4.493389312,
907
+ "loss": 1.1767,
908
+ "grad_norm": 4.598587512969971,
909
+ "learning_rate": 6.550199129045668e-05
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 2.873239436619718,
914
+ "cpu_mem": 1.494491136,
915
+ "gpu_mem": 4.493327872,
916
+ "loss": 1.2838,
917
+ "grad_norm": 4.879561901092529,
918
+ "learning_rate": 6.243844916478155e-05
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 2.9014084507042255,
923
+ "cpu_mem": 1.494491136,
924
+ "gpu_mem": 4.493377024,
925
+ "loss": 1.2128,
926
+ "grad_norm": 4.067531108856201,
927
+ "learning_rate": 5.9429338451178355e-05
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 2.9295774647887325,
932
+ "cpu_mem": 1.494491136,
933
+ "gpu_mem": 4.49344,
934
+ "loss": 1.2998,
935
+ "grad_norm": 5.858630180358887,
936
+ "learning_rate": 5.6476529721189974e-05
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 2.9577464788732395,
941
+ "cpu_mem": 1.494491136,
942
+ "gpu_mem": 4.493392384,
943
+ "loss": 1.1456,
944
+ "grad_norm": 4.226877689361572,
945
+ "learning_rate": 5.358185854701909e-05
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 2.9859154929577465,
950
+ "cpu_mem": 1.494491136,
951
+ "gpu_mem": 4.493373952,
952
+ "loss": 1.2223,
953
+ "grad_norm": 4.734609127044678,
954
+ "learning_rate": 5.074712436047112e-05
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 3.0140845070422535,
959
+ "cpu_mem": 1.494491136,
960
+ "gpu_mem": 4.518701056,
961
+ "loss": 1.625,
962
+ "grad_norm": 12.016558647155762,
963
+ "learning_rate": 4.7974089334362057e-05
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 3.0422535211267605,
968
+ "cpu_mem": 1.494491136,
969
+ "gpu_mem": 4.518682624,
970
+ "loss": 1.025,
971
+ "grad_norm": 5.136899948120117,
972
+ "learning_rate": 4.526447728708908e-05
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 3.0704225352112675,
977
+ "cpu_mem": 1.494491136,
978
+ "gpu_mem": 4.518671872,
979
+ "loss": 1.0919,
980
+ "grad_norm": 6.250692844390869,
981
+ "learning_rate": 4.261997261104223e-05
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 3.0985915492957745,
986
+ "cpu_mem": 1.494491136,
987
+ "gpu_mem": 4.518725632,
988
+ "loss": 0.9629,
989
+ "grad_norm": 6.343587875366211,
990
+ "learning_rate": 4.004221922552608e-05
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 3.1267605633802815,
995
+ "cpu_mem": 1.494491136,
996
+ "gpu_mem": 4.518685696,
997
+ "loss": 1.0155,
998
+ "grad_norm": 6.334781169891357,
999
+ "learning_rate": 3.753281955483985e-05
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 3.1549295774647885,
1004
+ "cpu_mem": 1.494491136,
1005
+ "gpu_mem": 4.518704128,
1006
+ "loss": 1.039,
1007
+ "grad_norm": 8.391195297241211,
1008
+ "learning_rate": 3.509333353215331e-05
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 3.183098591549296,
1013
+ "cpu_mem": 1.494491136,
1014
+ "gpu_mem": 4.518767104,
1015
+ "loss": 0.9537,
1016
+ "grad_norm": 7.252392768859863,
1017
+ "learning_rate": 3.2725277629795526e-05
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 3.211267605633803,
1022
+ "cpu_mem": 1.494491136,
1023
+ "gpu_mem": 4.518694912,
1024
+ "loss": 1.0189,
1025
+ "grad_norm": 8.310603141784668,
1026
+ "learning_rate": 3.0430123916561672e-05
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 3.23943661971831,
1031
+ "cpu_mem": 1.494491136,
1032
+ "gpu_mem": 4.518688768,
1033
+ "loss": 1.0358,
1034
+ "grad_norm": 7.496110916137695,
1035
+ "learning_rate": 2.8209299142621522e-05
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 3.267605633802817,
1040
+ "cpu_mem": 1.494491136,
1041
+ "gpu_mem": 4.518704128,
1042
+ "loss": 0.962,
1043
+ "grad_norm": 8.337830543518066,
1044
+ "learning_rate": 2.6064183852600797e-05
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 3.295774647887324,
1049
+ "cpu_mem": 1.494491136,
1050
+ "gpu_mem": 4.518719488,
1051
+ "loss": 0.9699,
1052
+ "grad_norm": 7.644598007202148,
1053
+ "learning_rate": 2.3996111527384288e-05
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 3.323943661971831,
1058
+ "cpu_mem": 1.494491136,
1059
+ "gpu_mem": 4.518710272,
1060
+ "loss": 0.89,
1061
+ "grad_norm": 6.858006000518799,
1062
+ "learning_rate": 2.2006367755176655e-05
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 3.352112676056338,
1067
+ "cpu_mem": 1.494491136,
1068
+ "gpu_mem": 4.518701056,
1069
+ "loss": 0.8971,
1070
+ "grad_norm": 8.39448356628418,
1071
+ "learning_rate": 2.009618943233419e-05
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 3.380281690140845,
1076
+ "cpu_mem": 1.494491136,
1077
+ "gpu_mem": 4.518719488,
1078
+ "loss": 0.9552,
1079
+ "grad_norm": 8.790838241577148,
1080
+ "learning_rate": 1.82667639944657e-05
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 3.408450704225352,
1085
+ "cpu_mem": 1.494491136,
1086
+ "gpu_mem": 4.518717952,
1087
+ "loss": 0.7973,
1088
+ "grad_norm": 9.548598289489746,
1089
+ "learning_rate": 1.6519228678279718e-05
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 3.436619718309859,
1094
+ "cpu_mem": 1.494491136,
1095
+ "gpu_mem": 4.518674944,
1096
+ "loss": 0.8216,
1097
+ "grad_norm": 9.652661323547363,
1098
+ "learning_rate": 1.4854669814637143e-05
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 3.464788732394366,
1103
+ "cpu_mem": 1.494491136,
1104
+ "gpu_mem": 4.5187072,
1105
+ "loss": 0.804,
1106
+ "grad_norm": 8.80784797668457,
1107
+ "learning_rate": 1.3274122153249028e-05
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 3.492957746478873,
1112
+ "cpu_mem": 1.494491136,
1113
+ "gpu_mem": 4.51866112,
1114
+ "loss": 0.8326,
1115
+ "grad_norm": 9.57839584350586,
1116
+ "learning_rate": 1.1778568219438839e-05
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 3.52112676056338,
1121
+ "cpu_mem": 1.494491136,
1122
+ "gpu_mem": 4.518705664,
1123
+ "loss": 0.8423,
1124
+ "grad_norm": 11.19555377960205,
1125
+ "learning_rate": 1.036893770336938e-05
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 3.5492957746478875,
1130
+ "cpu_mem": 1.494491136,
1131
+ "gpu_mem": 4.518656512,
1132
+ "loss": 0.7886,
1133
+ "grad_norm": 10.461869239807129,
1134
+ "learning_rate": 9.046106882113751e-06
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 3.5774647887323945,
1139
+ "cpu_mem": 1.494491136,
1140
+ "gpu_mem": 4.5186688,
1141
+ "loss": 0.8942,
1142
+ "grad_norm": 10.252334594726562,
1143
+ "learning_rate": 7.810898074930243e-06
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 3.6056338028169015,
1148
+ "cpu_mem": 1.494491136,
1149
+ "gpu_mem": 4.518693376,
1150
+ "loss": 0.9278,
1151
+ "grad_norm": 11.462838172912598,
1152
+ "learning_rate": 6.664079132078881e-06
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 3.6338028169014085,
1157
+ "cpu_mem": 1.494491136,
1158
+ "gpu_mem": 4.518654976,
1159
+ "loss": 0.7816,
1160
+ "grad_norm": 10.681913375854492,
1161
+ "learning_rate": 5.606362957498195e-06
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 3.6619718309859155,
1166
+ "cpu_mem": 1.494491136,
1167
+ "gpu_mem": 4.518658048,
1168
+ "loss": 0.9345,
1169
+ "grad_norm": 15.080108642578125,
1170
+ "learning_rate": 4.638407065638322e-06
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 3.6901408450704225,
1175
+ "cpu_mem": 1.494491136,
1176
+ "gpu_mem": 4.518670336,
1177
+ "loss": 0.8457,
1178
+ "grad_norm": 11.008662223815918,
1179
+ "learning_rate": 3.760813172726457e-06
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 3.7183098591549295,
1184
+ "cpu_mem": 1.494491136,
1185
+ "gpu_mem": 4.518635008,
1186
+ "loss": 0.8203,
1187
+ "grad_norm": 9.407642364501953,
1188
+ "learning_rate": 2.9741268227184255e-06
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 3.7464788732394365,
1193
+ "cpu_mem": 1.494491136,
1194
+ "gpu_mem": 4.51867648,
1195
+ "loss": 0.8339,
1196
+ "grad_norm": 9.52961254119873,
1197
+ "learning_rate": 2.2788370481687965e-06
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 3.7746478873239435,
1202
+ "cpu_mem": 1.494491136,
1203
+ "gpu_mem": 4.51869184,
1204
+ "loss": 0.8828,
1205
+ "grad_norm": 9.80649185180664,
1206
+ "learning_rate": 1.6753760662307215e-06
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 3.802816901408451,
1211
+ "cpu_mem": 1.494491136,
1212
+ "gpu_mem": 4.518656512,
1213
+ "loss": 0.9178,
1214
+ "grad_norm": 12.667458534240723,
1215
+ "learning_rate": 1.1641190099741904e-06
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 3.830985915492958,
1220
+ "cpu_mem": 1.494491136,
1221
+ "gpu_mem": 4.518664192,
1222
+ "loss": 0.7627,
1223
+ "grad_norm": 11.412312507629395,
1224
+ "learning_rate": 7.453836951897885e-07
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 3.859154929577465,
1229
+ "cpu_mem": 1.494491136,
1230
+ "gpu_mem": 4.518685696,
1231
+ "loss": 0.7683,
1232
+ "grad_norm": 11.488083839416504,
1233
+ "learning_rate": 4.194304228229806e-07
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 3.887323943661972,
1238
+ "cpu_mem": 1.494491136,
1239
+ "gpu_mem": 4.518696448,
1240
+ "loss": 0.816,
1241
+ "grad_norm": 10.21458911895752,
1242
+ "learning_rate": 1.8646181716164831e-07
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 3.915492957746479,
1247
+ "cpu_mem": 1.494491136,
1248
+ "gpu_mem": 4.518688768,
1249
+ "loss": 0.9006,
1250
+ "grad_norm": 12.040121078491211,
1251
+ "learning_rate": 4.662269987756317e-08
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 3.943661971830986,
1256
+ "cpu_mem": 1.494491136,
1257
+ "gpu_mem": 4.51872256,
1258
+ "loss": 0.8363,
1259
+ "grad_norm": 12.032706260681152,
1260
+ "learning_rate": 0.0
1261
+ },
1262
+ {
1263
+ "step": 140,
1264
+ "epoch": 3.943661971830986,
1265
+ "cpu_mem": 1.494491136,
1266
+ "gpu_mem": 4.51872256,
1267
+ "train_runtime": 674.8012,
1268
+ "train_samples_per_second": 13.343,
1269
+ "train_steps_per_second": 0.207,
1270
+ "total_flos": 0.0,
1271
+ "train_loss": 1.3040791460445949
1272
+ }
1273
+ ]
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 2,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "boolq",
3
+ "results": 0.7926605504587156
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "BOOLQ",
5
+ "dataset_id": "google/boolq",
6
+ "preprocess_id": "boolq_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1577576
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 2,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-boolq-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T12:13:11.031630"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r2-a2/training_logs.json ADDED
@@ -0,0 +1,2659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.006779661016949152,
5
+ "cpu_mem": 1.4856192,
6
+ "gpu_mem": 4.424159232,
7
+ "loss": 8.869,
8
+ "grad_norm": 265.7165832519531,
9
+ "learning_rate": 9.999999999999999e-06
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.013559322033898305,
14
+ "cpu_mem": 1.491910656,
15
+ "gpu_mem": 4.437070336,
16
+ "loss": 8.9376,
17
+ "grad_norm": 272.0975036621094,
18
+ "learning_rate": 1.9999999999999998e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.020338983050847456,
23
+ "cpu_mem": 1.492697088,
24
+ "gpu_mem": 4.436988928,
25
+ "loss": 8.2439,
26
+ "grad_norm": 279.63671875,
27
+ "learning_rate": 2.9999999999999997e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.02711864406779661,
32
+ "cpu_mem": 1.49348352,
33
+ "gpu_mem": 4.436988928,
34
+ "loss": 6.7912,
35
+ "grad_norm": 283.7794494628906,
36
+ "learning_rate": 3.9999999999999996e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.03389830508474576,
41
+ "cpu_mem": 1.493876736,
42
+ "gpu_mem": 4.436924416,
43
+ "loss": 4.9316,
44
+ "grad_norm": 254.95008850097656,
45
+ "learning_rate": 4.9999999999999996e-05
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.04067796610169491,
50
+ "cpu_mem": 1.49446656,
51
+ "gpu_mem": 4.436944384,
52
+ "loss": 3.114,
53
+ "grad_norm": 200.49691772460938,
54
+ "learning_rate": 5.9999999999999995e-05
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.04745762711864407,
59
+ "cpu_mem": 1.495056384,
60
+ "gpu_mem": 4.436996608,
61
+ "loss": 1.6758,
62
+ "grad_norm": 93.27091217041016,
63
+ "learning_rate": 7e-05
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.05423728813559322,
68
+ "cpu_mem": 1.495646208,
69
+ "gpu_mem": 4.437082624,
70
+ "loss": 1.039,
71
+ "grad_norm": 57.26808547973633,
72
+ "learning_rate": 7.999999999999999e-05
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.061016949152542375,
77
+ "cpu_mem": 1.496039424,
78
+ "gpu_mem": 4.436990464,
79
+ "loss": 1.0551,
80
+ "grad_norm": 59.7805061340332,
81
+ "learning_rate": 8.999999999999999e-05
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.06779661016949153,
86
+ "cpu_mem": 1.49643264,
87
+ "gpu_mem": 4.436890624,
88
+ "loss": 0.7487,
89
+ "grad_norm": 19.48933982849121,
90
+ "learning_rate": 9.999999999999999e-05
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.07457627118644068,
95
+ "cpu_mem": 1.496825856,
96
+ "gpu_mem": 4.436995072,
97
+ "loss": 1.1287,
98
+ "grad_norm": 91.75373840332031,
99
+ "learning_rate": 0.00010999999999999998
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.08135593220338982,
104
+ "cpu_mem": 1.49741568,
105
+ "gpu_mem": 4.437366784,
106
+ "loss": 1.0336,
107
+ "grad_norm": 73.30252838134766,
108
+ "learning_rate": 0.00011999999999999999
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.08813559322033898,
113
+ "cpu_mem": 1.497808896,
114
+ "gpu_mem": 4.436970496,
115
+ "loss": 0.6842,
116
+ "grad_norm": 6.66005802154541,
117
+ "learning_rate": 0.00013
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.09491525423728814,
122
+ "cpu_mem": 1.498202112,
123
+ "gpu_mem": 4.436947456,
124
+ "loss": 0.745,
125
+ "grad_norm": 33.216796875,
126
+ "learning_rate": 0.00014
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.1016949152542373,
131
+ "cpu_mem": 1.49839872,
132
+ "gpu_mem": 4.436886016,
133
+ "loss": 0.8204,
134
+ "grad_norm": 36.137210845947266,
135
+ "learning_rate": 0.00015
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.10847457627118644,
140
+ "cpu_mem": 1.498791936,
141
+ "gpu_mem": 4.436970496,
142
+ "loss": 0.7792,
143
+ "grad_norm": 31.646080017089844,
144
+ "learning_rate": 0.00015999999999999999
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.1152542372881356,
149
+ "cpu_mem": 1.499185152,
150
+ "gpu_mem": 4.437010432,
151
+ "loss": 0.7334,
152
+ "grad_norm": 24.66205596923828,
153
+ "learning_rate": 0.00016999999999999999
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.12203389830508475,
158
+ "cpu_mem": 1.499578368,
159
+ "gpu_mem": 4.437073408,
160
+ "loss": 0.6744,
161
+ "grad_norm": 3.4665486812591553,
162
+ "learning_rate": 0.00017999999999999998
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.1288135593220339,
167
+ "cpu_mem": 1.499971584,
168
+ "gpu_mem": 4.436910592,
169
+ "loss": 0.8234,
170
+ "grad_norm": 28.424806594848633,
171
+ "learning_rate": 0.00018999999999999998
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.13559322033898305,
176
+ "cpu_mem": 1.500168192,
177
+ "gpu_mem": 4.43702272,
178
+ "loss": 0.6248,
179
+ "grad_norm": 3.359628677368164,
180
+ "learning_rate": 0.00019999999999999998
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.1423728813559322,
185
+ "cpu_mem": 1.500561408,
186
+ "gpu_mem": 4.437180928,
187
+ "loss": 0.6311,
188
+ "grad_norm": 10.9364595413208,
189
+ "learning_rate": 0.00020999999999999998
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.14915254237288136,
194
+ "cpu_mem": 1.500954624,
195
+ "gpu_mem": 4.437073408,
196
+ "loss": 0.7505,
197
+ "grad_norm": 14.361480712890625,
198
+ "learning_rate": 0.00021999999999999995
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.15593220338983052,
203
+ "cpu_mem": 1.501151232,
204
+ "gpu_mem": 4.43704576,
205
+ "loss": 0.6607,
206
+ "grad_norm": 11.354830741882324,
207
+ "learning_rate": 0.00023
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.16271186440677965,
212
+ "cpu_mem": 1.50134784,
213
+ "gpu_mem": 4.437102592,
214
+ "loss": 0.608,
215
+ "grad_norm": 5.800236225128174,
216
+ "learning_rate": 0.00023999999999999998
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.1694915254237288,
221
+ "cpu_mem": 1.501741056,
222
+ "gpu_mem": 4.436887552,
223
+ "loss": 0.8785,
224
+ "grad_norm": 32.040443420410156,
225
+ "learning_rate": 0.00025
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.17627118644067796,
230
+ "cpu_mem": 1.501937664,
231
+ "gpu_mem": 4.436942848,
232
+ "loss": 1.0202,
233
+ "grad_norm": 42.82334899902344,
234
+ "learning_rate": 0.00026
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.18305084745762712,
239
+ "cpu_mem": 1.50233088,
240
+ "gpu_mem": 4.437234688,
241
+ "loss": 0.6503,
242
+ "grad_norm": 10.639617919921875,
243
+ "learning_rate": 0.00027
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.18983050847457628,
248
+ "cpu_mem": 1.502527488,
249
+ "gpu_mem": 4.436913664,
250
+ "loss": 0.7747,
251
+ "grad_norm": 29.44213104248047,
252
+ "learning_rate": 0.00028
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.19661016949152543,
257
+ "cpu_mem": 1.502920704,
258
+ "gpu_mem": 4.436978176,
259
+ "loss": 1.0303,
260
+ "grad_norm": 55.7458381652832,
261
+ "learning_rate": 0.00029
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.2033898305084746,
266
+ "cpu_mem": 1.503117312,
267
+ "gpu_mem": 4.437056512,
268
+ "loss": 0.7492,
269
+ "grad_norm": 22.058603286743164,
270
+ "learning_rate": 0.0003
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.21016949152542372,
275
+ "cpu_mem": 1.50331392,
276
+ "gpu_mem": 4.436859904,
277
+ "loss": 0.5481,
278
+ "grad_norm": 3.686203956604004,
279
+ "learning_rate": 0.0002999893794250036
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.21694915254237288,
284
+ "cpu_mem": 1.503510528,
285
+ "gpu_mem": 4.436973568,
286
+ "loss": 0.834,
287
+ "grad_norm": 23.762094497680664,
288
+ "learning_rate": 0.00029995751920396937
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.22372881355932203,
293
+ "cpu_mem": 1.503707136,
294
+ "gpu_mem": 4.437211648,
295
+ "loss": 0.8735,
296
+ "grad_norm": 20.501628875732422,
297
+ "learning_rate": 0.00029990442384854874
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.2305084745762712,
302
+ "cpu_mem": 1.503903744,
303
+ "gpu_mem": 4.436913664,
304
+ "loss": 0.5833,
305
+ "grad_norm": 5.7299933433532715,
306
+ "learning_rate": 0.0002998301008774512
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.23728813559322035,
311
+ "cpu_mem": 1.504100352,
312
+ "gpu_mem": 4.437124096,
313
+ "loss": 0.6658,
314
+ "grad_norm": 7.160278797149658,
315
+ "learning_rate": 0.0002997345608153792
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 0.2440677966101695,
320
+ "cpu_mem": 1.50429696,
321
+ "gpu_mem": 4.437074944,
322
+ "loss": 0.711,
323
+ "grad_norm": 18.156116485595703,
324
+ "learning_rate": 0.000299617817191538
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 0.25084745762711863,
329
+ "cpu_mem": 1.504493568,
330
+ "gpu_mem": 4.436886016,
331
+ "loss": 0.5941,
332
+ "grad_norm": 4.312148094177246,
333
+ "learning_rate": 0.0002994798865377198
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 0.2576271186440678,
338
+ "cpu_mem": 1.504690176,
339
+ "gpu_mem": 4.437133312,
340
+ "loss": 0.8683,
341
+ "grad_norm": 32.92335891723633,
342
+ "learning_rate": 0.0002993207883859627
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 0.26440677966101694,
347
+ "cpu_mem": 1.504886784,
348
+ "gpu_mem": 4.437512704,
349
+ "loss": 0.7286,
350
+ "grad_norm": 17.68904685974121,
351
+ "learning_rate": 0.0002991405452657846
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 0.2711864406779661,
356
+ "cpu_mem": 1.505083392,
357
+ "gpu_mem": 4.437082624,
358
+ "loss": 0.5857,
359
+ "grad_norm": 4.258547782897949,
360
+ "learning_rate": 0.00029893918270099324
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 0.27796610169491526,
365
+ "cpu_mem": 1.50528,
366
+ "gpu_mem": 4.437309952,
367
+ "loss": 0.6943,
368
+ "grad_norm": 15.192357063293457,
369
+ "learning_rate": 0.00029871672920607153
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 0.2847457627118644,
374
+ "cpu_mem": 1.505476608,
375
+ "gpu_mem": 4.43720704,
376
+ "loss": 0.5717,
377
+ "grad_norm": 10.226648330688477,
378
+ "learning_rate": 0.0002984732162821399
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 0.29152542372881357,
383
+ "cpu_mem": 1.505673216,
384
+ "gpu_mem": 4.437028864,
385
+ "loss": 0.6568,
386
+ "grad_norm": 8.291679382324219,
387
+ "learning_rate": 0.0002982086784124952
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 0.2983050847457627,
392
+ "cpu_mem": 1.505869824,
393
+ "gpu_mem": 4.437171712,
394
+ "loss": 0.6187,
395
+ "grad_norm": 9.696277618408203,
396
+ "learning_rate": 0.00029792315305772796
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 0.3050847457627119,
401
+ "cpu_mem": 1.505869824,
402
+ "gpu_mem": 4.436952064,
403
+ "loss": 0.8114,
404
+ "grad_norm": 17.778474807739258,
405
+ "learning_rate": 0.0002976166806504174
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 0.31186440677966104,
410
+ "cpu_mem": 1.506066432,
411
+ "gpu_mem": 4.437194752,
412
+ "loss": 0.7589,
413
+ "grad_norm": 25.64463996887207,
414
+ "learning_rate": 0.00029728930458940595
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 0.31864406779661014,
419
+ "cpu_mem": 1.50626304,
420
+ "gpu_mem": 4.436918272,
421
+ "loss": 0.8276,
422
+ "grad_norm": 28.21428108215332,
423
+ "learning_rate": 0.00029694107123365385
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 0.3254237288135593,
428
+ "cpu_mem": 1.50626304,
429
+ "gpu_mem": 4.436995072,
430
+ "loss": 0.6789,
431
+ "grad_norm": 26.740333557128906,
432
+ "learning_rate": 0.00029657202989567393
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 0.33220338983050846,
437
+ "cpu_mem": 1.506459648,
438
+ "gpu_mem": 4.437011968,
439
+ "loss": 0.7783,
440
+ "grad_norm": 16.810571670532227,
441
+ "learning_rate": 0.00029618223283454893
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 0.3389830508474576,
446
+ "cpu_mem": 1.506459648,
447
+ "gpu_mem": 4.436950528,
448
+ "loss": 0.6132,
449
+ "grad_norm": 5.90757942199707,
450
+ "learning_rate": 0.00029577173524853123
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 0.34576271186440677,
455
+ "cpu_mem": 1.506656256,
456
+ "gpu_mem": 4.436955136,
457
+ "loss": 0.5682,
458
+ "grad_norm": 6.385296821594238,
459
+ "learning_rate": 0.0002953405952672261
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 0.3525423728813559,
464
+ "cpu_mem": 1.506656256,
465
+ "gpu_mem": 4.437035008,
466
+ "loss": 0.5911,
467
+ "grad_norm": 4.772059917449951,
468
+ "learning_rate": 0.0002948888739433602
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 0.3593220338983051,
473
+ "cpu_mem": 1.506852864,
474
+ "gpu_mem": 4.437058048,
475
+ "loss": 0.5848,
476
+ "grad_norm": 9.717635154724121,
477
+ "learning_rate": 0.0002944166352441363
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 0.36610169491525424,
482
+ "cpu_mem": 1.507049472,
483
+ "gpu_mem": 4.436985856,
484
+ "loss": 0.6352,
485
+ "grad_norm": 10.524502754211426,
486
+ "learning_rate": 0.0002939239460421746
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 0.3728813559322034,
491
+ "cpu_mem": 1.507049472,
492
+ "gpu_mem": 4.437256192,
493
+ "loss": 0.6172,
494
+ "grad_norm": 6.137028694152832,
495
+ "learning_rate": 0.00029341087610604337
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 0.37966101694915255,
500
+ "cpu_mem": 1.50724608,
501
+ "gpu_mem": 4.437042688,
502
+ "loss": 0.7273,
503
+ "grad_norm": 11.40707015991211,
504
+ "learning_rate": 0.00029287749809037904
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 0.3864406779661017,
509
+ "cpu_mem": 1.50724608,
510
+ "gpu_mem": 4.437036544,
511
+ "loss": 0.5833,
512
+ "grad_norm": 8.116436958312988,
513
+ "learning_rate": 0.0002923238875255979
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 0.39322033898305087,
518
+ "cpu_mem": 1.507442688,
519
+ "gpu_mem": 4.436932096,
520
+ "loss": 0.5561,
521
+ "grad_norm": 4.918622016906738,
522
+ "learning_rate": 0.00029175012280720024
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 0.4,
527
+ "cpu_mem": 1.507442688,
528
+ "gpu_mem": 4.436948992,
529
+ "loss": 0.6541,
530
+ "grad_norm": 8.79553508758545,
531
+ "learning_rate": 0.000291156285184669
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 0.4067796610169492,
536
+ "cpu_mem": 1.507639296,
537
+ "gpu_mem": 4.437042688,
538
+ "loss": 0.5448,
539
+ "grad_norm": 4.7404704093933105,
540
+ "learning_rate": 0.00029054245874996426
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 0.4135593220338983,
545
+ "cpu_mem": 1.507639296,
546
+ "gpu_mem": 4.43705344,
547
+ "loss": 0.5885,
548
+ "grad_norm": 4.77510929107666,
549
+ "learning_rate": 0.0002899087304256151
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 0.42033898305084744,
554
+ "cpu_mem": 1.507835904,
555
+ "gpu_mem": 4.437041152,
556
+ "loss": 0.7054,
557
+ "grad_norm": 11.460348129272461,
558
+ "learning_rate": 0.0002892551899524109
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 0.4271186440677966,
563
+ "cpu_mem": 1.507835904,
564
+ "gpu_mem": 4.437033472,
565
+ "loss": 0.5369,
566
+ "grad_norm": 31.070810317993164,
567
+ "learning_rate": 0.000288581929876693
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 0.43389830508474575,
572
+ "cpu_mem": 1.507835904,
573
+ "gpu_mem": 4.436962816,
574
+ "loss": 0.609,
575
+ "grad_norm": 15.022584915161133,
576
+ "learning_rate": 0.0002878890455372498
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 0.4406779661016949,
581
+ "cpu_mem": 1.507835904,
582
+ "gpu_mem": 4.43700736,
583
+ "loss": 0.5679,
584
+ "grad_norm": 5.917741775512695,
585
+ "learning_rate": 0.0002871766350518159
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 0.44745762711864406,
590
+ "cpu_mem": 1.507835904,
591
+ "gpu_mem": 4.437200896,
592
+ "loss": 0.5544,
593
+ "grad_norm": 6.815539836883545,
594
+ "learning_rate": 0.00028644479930317775
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 0.4542372881355932,
599
+ "cpu_mem": 1.508032512,
600
+ "gpu_mem": 4.436910592,
601
+ "loss": 0.6308,
602
+ "grad_norm": 13.951557159423828,
603
+ "learning_rate": 0.00028569364192488803
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 0.4610169491525424,
608
+ "cpu_mem": 1.508032512,
609
+ "gpu_mem": 4.436878336,
610
+ "loss": 0.7345,
611
+ "grad_norm": 16.037248611450195,
612
+ "learning_rate": 0.00028492326928659045
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 0.46779661016949153,
617
+ "cpu_mem": 1.50822912,
618
+ "gpu_mem": 4.436944384,
619
+ "loss": 0.6223,
620
+ "grad_norm": 8.664552688598633,
621
+ "learning_rate": 0.00028413379047895665
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 0.4745762711864407,
626
+ "cpu_mem": 1.50822912,
627
+ "gpu_mem": 4.43693824,
628
+ "loss": 0.5409,
629
+ "grad_norm": 9.303351402282715,
630
+ "learning_rate": 0.0002833253172982385
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 0.48135593220338985,
635
+ "cpu_mem": 1.50822912,
636
+ "gpu_mem": 4.437167104,
637
+ "loss": 0.5304,
638
+ "grad_norm": 7.759465217590332,
639
+ "learning_rate": 0.0002824979642304366
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 0.488135593220339,
644
+ "cpu_mem": 1.50822912,
645
+ "gpu_mem": 4.437159424,
646
+ "loss": 0.5726,
647
+ "grad_norm": 8.323700904846191,
648
+ "learning_rate": 0.0002816518484350883
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 0.49491525423728816,
653
+ "cpu_mem": 1.508425728,
654
+ "gpu_mem": 4.437125632,
655
+ "loss": 0.8418,
656
+ "grad_norm": 20.5197696685791,
657
+ "learning_rate": 0.0002807870897286772
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 0.5016949152542373,
662
+ "cpu_mem": 1.508622336,
663
+ "gpu_mem": 4.436985856,
664
+ "loss": 0.506,
665
+ "grad_norm": 8.913507461547852,
666
+ "learning_rate": 0.0002799038105676658
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 0.5084745762711864,
671
+ "cpu_mem": 1.508622336,
672
+ "gpu_mem": 4.436910592,
673
+ "loss": 0.5309,
674
+ "grad_norm": 10.124161720275879,
675
+ "learning_rate": 0.000279002136031155
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 0.5152542372881356,
680
+ "cpu_mem": 1.508622336,
681
+ "gpu_mem": 4.436850688,
682
+ "loss": 0.6009,
683
+ "grad_norm": 12.703954696655273,
684
+ "learning_rate": 0.00027808219380317216
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 0.5220338983050847,
689
+ "cpu_mem": 1.508622336,
690
+ "gpu_mem": 4.436924416,
691
+ "loss": 0.5558,
692
+ "grad_norm": 10.466872215270996,
693
+ "learning_rate": 0.0002771441141545895
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 0.5288135593220339,
698
+ "cpu_mem": 1.508622336,
699
+ "gpu_mem": 4.43697664,
700
+ "loss": 0.7043,
701
+ "grad_norm": 20.635786056518555,
702
+ "learning_rate": 0.0002761880299246772
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 0.535593220338983,
707
+ "cpu_mem": 1.508622336,
708
+ "gpu_mem": 4.437108736,
709
+ "loss": 0.6018,
710
+ "grad_norm": 11.645002365112305,
711
+ "learning_rate": 0.000275214076502292
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 0.5423728813559322,
716
+ "cpu_mem": 1.508622336,
717
+ "gpu_mem": 4.43699968,
718
+ "loss": 0.5279,
719
+ "grad_norm": 6.659698963165283,
720
+ "learning_rate": 0.0002742223918067056
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 0.5491525423728814,
725
+ "cpu_mem": 1.508622336,
726
+ "gpu_mem": 4.436879872,
727
+ "loss": 0.5791,
728
+ "grad_norm": 6.4264235496521,
729
+ "learning_rate": 0.00027321311626807374
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 0.5559322033898305,
734
+ "cpu_mem": 1.508622336,
735
+ "gpu_mem": 4.436948992,
736
+ "loss": 0.6501,
737
+ "grad_norm": 9.969060897827148,
738
+ "learning_rate": 0.0002721863928075503
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 0.5627118644067797,
743
+ "cpu_mem": 1.508818944,
744
+ "gpu_mem": 4.437048832,
745
+ "loss": 0.6573,
746
+ "grad_norm": 8.324270248413086,
747
+ "learning_rate": 0.000271142366817049
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 0.5694915254237288,
752
+ "cpu_mem": 1.508818944,
753
+ "gpu_mem": 4.437011968,
754
+ "loss": 0.5463,
755
+ "grad_norm": 8.097661972045898,
756
+ "learning_rate": 0.00027008118613865406
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 0.576271186440678,
761
+ "cpu_mem": 1.508818944,
762
+ "gpu_mem": 4.437044224,
763
+ "loss": 0.5767,
764
+ "grad_norm": 6.6934494972229,
765
+ "learning_rate": 0.00026900300104368524
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 0.5830508474576271,
770
+ "cpu_mem": 1.509015552,
771
+ "gpu_mem": 4.436995072,
772
+ "loss": 0.6116,
773
+ "grad_norm": 9.769342422485352,
774
+ "learning_rate": 0.00026790796421141813
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 0.5898305084745763,
779
+ "cpu_mem": 1.509015552,
780
+ "gpu_mem": 4.437002752,
781
+ "loss": 0.564,
782
+ "grad_norm": 9.166972160339355,
783
+ "learning_rate": 0.00026679623070746325
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 0.5966101694915255,
788
+ "cpu_mem": 1.509015552,
789
+ "gpu_mem": 4.437147136,
790
+ "loss": 0.5411,
791
+ "grad_norm": 5.252224445343018,
792
+ "learning_rate": 0.0002656679579618081
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 0.6033898305084746,
797
+ "cpu_mem": 1.509015552,
798
+ "gpu_mem": 4.436929024,
799
+ "loss": 0.6234,
800
+ "grad_norm": 6.647994518280029,
801
+ "learning_rate": 0.0002645233057465235
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 0.6101694915254238,
806
+ "cpu_mem": 1.509015552,
807
+ "gpu_mem": 4.436982784,
808
+ "loss": 0.5256,
809
+ "grad_norm": 7.916544437408447,
810
+ "learning_rate": 0.00026336243615313873
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 0.6169491525423729,
815
+ "cpu_mem": 1.509015552,
816
+ "gpu_mem": 4.436950528,
817
+ "loss": 0.4528,
818
+ "grad_norm": 6.592220306396484,
819
+ "learning_rate": 0.00026218551356968814
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 0.6237288135593221,
824
+ "cpu_mem": 1.509015552,
825
+ "gpu_mem": 4.437031936,
826
+ "loss": 0.6788,
827
+ "grad_norm": 12.278592109680176,
828
+ "learning_rate": 0.00026099270465743254
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 0.6305084745762712,
833
+ "cpu_mem": 1.509015552,
834
+ "gpu_mem": 4.436835328,
835
+ "loss": 0.8175,
836
+ "grad_norm": 13.33436393737793,
837
+ "learning_rate": 0.0002597841783272588
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 0.6372881355932203,
842
+ "cpu_mem": 1.509015552,
843
+ "gpu_mem": 4.436948992,
844
+ "loss": 0.5207,
845
+ "grad_norm": 6.806389331817627,
846
+ "learning_rate": 0.0002585601057157605
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 0.6440677966101694,
851
+ "cpu_mem": 1.509015552,
852
+ "gpu_mem": 4.43696896,
853
+ "loss": 0.5228,
854
+ "grad_norm": 6.125219821929932,
855
+ "learning_rate": 0.00025732066016100394
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 0.6508474576271186,
860
+ "cpu_mem": 1.50921216,
861
+ "gpu_mem": 4.43700736,
862
+ "loss": 0.4348,
863
+ "grad_norm": 6.964541912078857,
864
+ "learning_rate": 0.00025606601717798207
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 0.6576271186440678,
869
+ "cpu_mem": 1.50921216,
870
+ "gpu_mem": 4.436992,
871
+ "loss": 0.6261,
872
+ "grad_norm": 8.247956275939941,
873
+ "learning_rate": 0.0002547963544337602
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 0.6644067796610169,
878
+ "cpu_mem": 1.50921216,
879
+ "gpu_mem": 4.436904448,
880
+ "loss": 0.4827,
881
+ "grad_norm": 8.204977035522461,
882
+ "learning_rate": 0.0002535118517223168
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 0.6711864406779661,
887
+ "cpu_mem": 1.50921216,
888
+ "gpu_mem": 4.43685376,
889
+ "loss": 0.5029,
890
+ "grad_norm": 11.964973449707031,
891
+ "learning_rate": 0.00025221269093908365
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 0.6779661016949152,
896
+ "cpu_mem": 1.50921216,
897
+ "gpu_mem": 4.436970496,
898
+ "loss": 0.51,
899
+ "grad_norm": 11.552522659301758,
900
+ "learning_rate": 0.0002508990560551879
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 0.6847457627118644,
905
+ "cpu_mem": 1.50921216,
906
+ "gpu_mem": 4.437002752,
907
+ "loss": 0.5467,
908
+ "grad_norm": 13.213567733764648,
909
+ "learning_rate": 0.0002495711330914001
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 0.6915254237288135,
914
+ "cpu_mem": 1.50921216,
915
+ "gpu_mem": 4.437036544,
916
+ "loss": 0.5372,
917
+ "grad_norm": 8.773509979248047,
918
+ "learning_rate": 0.00024822911009179276
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 0.6983050847457627,
923
+ "cpu_mem": 1.50921216,
924
+ "gpu_mem": 4.437087232,
925
+ "loss": 0.6076,
926
+ "grad_norm": 12.28891372680664,
927
+ "learning_rate": 0.0002468731770971113
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 0.7050847457627119,
932
+ "cpu_mem": 1.50921216,
933
+ "gpu_mem": 4.436992,
934
+ "loss": 0.5038,
935
+ "grad_norm": 11.784523010253906,
936
+ "learning_rate": 0.0002455035261178632
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 0.711864406779661,
941
+ "cpu_mem": 1.50921216,
942
+ "gpu_mem": 4.437093376,
943
+ "loss": 0.4978,
944
+ "grad_norm": 8.252092361450195,
945
+ "learning_rate": 0.0002441203511071278
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 0.7186440677966102,
950
+ "cpu_mem": 1.50921216,
951
+ "gpu_mem": 4.437044224,
952
+ "loss": 0.5585,
953
+ "grad_norm": 9.146671295166016,
954
+ "learning_rate": 0.00024272384793309077
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 0.7254237288135593,
959
+ "cpu_mem": 1.509408768,
960
+ "gpu_mem": 4.436932096,
961
+ "loss": 0.512,
962
+ "grad_norm": 7.901447772979736,
963
+ "learning_rate": 0.00024131421435130807
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 0.7322033898305085,
968
+ "cpu_mem": 1.509408768,
969
+ "gpu_mem": 4.437116416,
970
+ "loss": 0.5794,
971
+ "grad_norm": 7.828333377838135,
972
+ "learning_rate": 0.00023989164997670202
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 0.7389830508474576,
977
+ "cpu_mem": 1.509408768,
978
+ "gpu_mem": 4.436970496,
979
+ "loss": 0.5699,
980
+ "grad_norm": 5.465837478637695,
981
+ "learning_rate": 0.0002384563562552943
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 0.7457627118644068,
986
+ "cpu_mem": 1.509408768,
987
+ "gpu_mem": 4.436973568,
988
+ "loss": 0.5263,
989
+ "grad_norm": 5.203726768493652,
990
+ "learning_rate": 0.0002370085364356797
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 0.752542372881356,
995
+ "cpu_mem": 1.509408768,
996
+ "gpu_mem": 4.436942848,
997
+ "loss": 0.5348,
998
+ "grad_norm": 6.427229881286621,
999
+ "learning_rate": 0.0002355483955402446
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 0.7593220338983051,
1004
+ "cpu_mem": 1.509408768,
1005
+ "gpu_mem": 4.436988928,
1006
+ "loss": 0.5472,
1007
+ "grad_norm": 6.567240238189697,
1008
+ "learning_rate": 0.00023407614033613407
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 0.7661016949152543,
1013
+ "cpu_mem": 1.509408768,
1014
+ "gpu_mem": 4.436979712,
1015
+ "loss": 0.5146,
1016
+ "grad_norm": 5.874990463256836,
1017
+ "learning_rate": 0.0002325919793059723
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 0.7728813559322034,
1022
+ "cpu_mem": 1.509408768,
1023
+ "gpu_mem": 4.43696128,
1024
+ "loss": 0.4087,
1025
+ "grad_norm": 6.523895740509033,
1026
+ "learning_rate": 0.00023109612261833963
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 0.7796610169491526,
1031
+ "cpu_mem": 1.509408768,
1032
+ "gpu_mem": 4.437036544,
1033
+ "loss": 0.4837,
1034
+ "grad_norm": 6.8895134925842285,
1035
+ "learning_rate": 0.0002295887820980112
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 0.7864406779661017,
1040
+ "cpu_mem": 1.509408768,
1041
+ "gpu_mem": 4.436956672,
1042
+ "loss": 0.53,
1043
+ "grad_norm": 12.884737968444824,
1044
+ "learning_rate": 0.0002280701711959608
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 0.7932203389830509,
1049
+ "cpu_mem": 1.509408768,
1050
+ "gpu_mem": 4.436847616,
1051
+ "loss": 0.418,
1052
+ "grad_norm": 8.543777465820312,
1053
+ "learning_rate": 0.00022654050495913495
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 0.8,
1058
+ "cpu_mem": 1.509408768,
1059
+ "gpu_mem": 4.437085696,
1060
+ "loss": 0.6238,
1061
+ "grad_norm": 18.599990844726562,
1062
+ "learning_rate": 0.000225
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 0.8067796610169492,
1067
+ "cpu_mem": 1.509408768,
1068
+ "gpu_mem": 4.437256192,
1069
+ "loss": 0.4314,
1070
+ "grad_norm": 8.732511520385742,
1071
+ "learning_rate": 0.00022344887446586865
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 0.8135593220338984,
1076
+ "cpu_mem": 1.509408768,
1077
+ "gpu_mem": 4.436988928,
1078
+ "loss": 0.4056,
1079
+ "grad_norm": 9.370827674865723,
1080
+ "learning_rate": 0.00022188734800800852
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 0.8203389830508474,
1085
+ "cpu_mem": 1.509408768,
1086
+ "gpu_mem": 4.437016576,
1087
+ "loss": 0.6384,
1088
+ "grad_norm": 15.165224075317383,
1089
+ "learning_rate": 0.00022031564175053754
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 0.8271186440677966,
1094
+ "cpu_mem": 1.509408768,
1095
+ "gpu_mem": 4.437067264,
1096
+ "loss": 0.5406,
1097
+ "grad_norm": 14.913131713867188,
1098
+ "learning_rate": 0.00021873397825911153
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 0.8338983050847457,
1103
+ "cpu_mem": 1.509408768,
1104
+ "gpu_mem": 4.4368768,
1105
+ "loss": 0.429,
1106
+ "grad_norm": 10.716595649719238,
1107
+ "learning_rate": 0.00021714258150940685
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 0.8406779661016949,
1112
+ "cpu_mem": 1.509408768,
1113
+ "gpu_mem": 4.437319168,
1114
+ "loss": 0.3996,
1115
+ "grad_norm": 7.652353286743164,
1116
+ "learning_rate": 0.0002155416768554039
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 0.847457627118644,
1121
+ "cpu_mem": 1.509408768,
1122
+ "gpu_mem": 4.43704576,
1123
+ "loss": 0.5075,
1124
+ "grad_norm": 13.265007972717285,
1125
+ "learning_rate": 0.00021393149099747523
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 0.8542372881355932,
1130
+ "cpu_mem": 1.509408768,
1131
+ "gpu_mem": 4.436929024,
1132
+ "loss": 0.4667,
1133
+ "grad_norm": 9.696459770202637,
1134
+ "learning_rate": 0.00021231225195028297
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 0.8610169491525423,
1139
+ "cpu_mem": 1.509408768,
1140
+ "gpu_mem": 4.43736832,
1141
+ "loss": 0.5595,
1142
+ "grad_norm": 15.351218223571777,
1143
+ "learning_rate": 0.00021068418901049025
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 0.8677966101694915,
1148
+ "cpu_mem": 1.509408768,
1149
+ "gpu_mem": 4.437144064,
1150
+ "loss": 0.5056,
1151
+ "grad_norm": 10.441043853759766,
1152
+ "learning_rate": 0.0002090475327242912
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 0.8745762711864407,
1157
+ "cpu_mem": 1.509408768,
1158
+ "gpu_mem": 4.437184,
1159
+ "loss": 0.6595,
1160
+ "grad_norm": 13.417473793029785,
1161
+ "learning_rate": 0.00020740251485476345
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 0.8813559322033898,
1166
+ "cpu_mem": 1.509408768,
1167
+ "gpu_mem": 4.436965888,
1168
+ "loss": 0.6462,
1169
+ "grad_norm": 7.923618316650391,
1170
+ "learning_rate": 0.0002057493683490491
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 0.888135593220339,
1175
+ "cpu_mem": 1.509408768,
1176
+ "gpu_mem": 4.437094912,
1177
+ "loss": 0.5114,
1178
+ "grad_norm": 6.110599994659424,
1179
+ "learning_rate": 0.00020408832730536746
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 0.8949152542372881,
1184
+ "cpu_mem": 1.509408768,
1185
+ "gpu_mem": 4.43717632,
1186
+ "loss": 0.4793,
1187
+ "grad_norm": 12.383698463439941,
1188
+ "learning_rate": 0.00020241962693986476
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 0.9016949152542373,
1193
+ "cpu_mem": 1.509408768,
1194
+ "gpu_mem": 4.436959744,
1195
+ "loss": 0.5583,
1196
+ "grad_norm": 12.624692916870117,
1197
+ "learning_rate": 0.0002007435035533061
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 0.9084745762711864,
1202
+ "cpu_mem": 1.509408768,
1203
+ "gpu_mem": 4.437093376,
1204
+ "loss": 0.4981,
1205
+ "grad_norm": 12.406174659729004,
1206
+ "learning_rate": 0.00019906019449761325
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 0.9152542372881356,
1211
+ "cpu_mem": 1.509605376,
1212
+ "gpu_mem": 4.437116416,
1213
+ "loss": 0.4536,
1214
+ "grad_norm": 7.415020942687988,
1215
+ "learning_rate": 0.00019736993814225374
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 0.9220338983050848,
1220
+ "cpu_mem": 1.509605376,
1221
+ "gpu_mem": 4.4369536,
1222
+ "loss": 0.5096,
1223
+ "grad_norm": 9.677729606628418,
1224
+ "learning_rate": 0.00019567297384048604
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 0.9288135593220339,
1229
+ "cpu_mem": 1.509605376,
1230
+ "gpu_mem": 4.436833792,
1231
+ "loss": 0.5439,
1232
+ "grad_norm": 11.361948013305664,
1233
+ "learning_rate": 0.0001939695418954653
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 0.9355932203389831,
1238
+ "cpu_mem": 1.509605376,
1239
+ "gpu_mem": 4.43701504,
1240
+ "loss": 0.5307,
1241
+ "grad_norm": 11.554671287536621,
1242
+ "learning_rate": 0.00019225988352621445
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 0.9423728813559322,
1247
+ "cpu_mem": 1.509605376,
1248
+ "gpu_mem": 4.436913664,
1249
+ "loss": 0.4409,
1250
+ "grad_norm": 7.895120620727539,
1251
+ "learning_rate": 0.00019054424083346592
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 0.9491525423728814,
1256
+ "cpu_mem": 1.509605376,
1257
+ "gpu_mem": 4.436965888,
1258
+ "loss": 0.4858,
1259
+ "grad_norm": 10.334193229675293,
1260
+ "learning_rate": 0.0001888228567653781
1261
+ },
1262
+ {
1263
+ "step": 141,
1264
+ "epoch": 0.9559322033898305,
1265
+ "cpu_mem": 1.509605376,
1266
+ "gpu_mem": 4.436998144,
1267
+ "loss": 0.6513,
1268
+ "grad_norm": 19.94317626953125,
1269
+ "learning_rate": 0.0001870959750831323
1270
+ },
1271
+ {
1272
+ "step": 142,
1273
+ "epoch": 0.9627118644067797,
1274
+ "cpu_mem": 1.509605376,
1275
+ "gpu_mem": 4.43713792,
1276
+ "loss": 0.5677,
1277
+ "grad_norm": 16.330734252929688,
1278
+ "learning_rate": 0.0001853638403264141
1279
+ },
1280
+ {
1281
+ "step": 143,
1282
+ "epoch": 0.9694915254237289,
1283
+ "cpu_mem": 1.509605376,
1284
+ "gpu_mem": 4.437121024,
1285
+ "loss": 0.5919,
1286
+ "grad_norm": 11.14167308807373,
1287
+ "learning_rate": 0.00018362669777878453
1288
+ },
1289
+ {
1290
+ "step": 144,
1291
+ "epoch": 0.976271186440678,
1292
+ "cpu_mem": 1.509605376,
1293
+ "gpu_mem": 4.437313024,
1294
+ "loss": 0.5742,
1295
+ "grad_norm": 7.925104141235352,
1296
+ "learning_rate": 0.00018188479343294648
1297
+ },
1298
+ {
1299
+ "step": 145,
1300
+ "epoch": 0.9830508474576272,
1301
+ "cpu_mem": 1.509605376,
1302
+ "gpu_mem": 4.437024256,
1303
+ "loss": 0.4559,
1304
+ "grad_norm": 6.353579044342041,
1305
+ "learning_rate": 0.0001801383739559098
1306
+ },
1307
+ {
1308
+ "step": 146,
1309
+ "epoch": 0.9898305084745763,
1310
+ "cpu_mem": 1.509605376,
1311
+ "gpu_mem": 4.437059584,
1312
+ "loss": 0.579,
1313
+ "grad_norm": 9.637496948242188,
1314
+ "learning_rate": 0.0001783876866540615
1315
+ },
1316
+ {
1317
+ "step": 147,
1318
+ "epoch": 0.9966101694915255,
1319
+ "cpu_mem": 1.509605376,
1320
+ "gpu_mem": 4.436958208,
1321
+ "loss": 0.5382,
1322
+ "grad_norm": 7.5189595222473145,
1323
+ "learning_rate": 0.00017663297943814552
1324
+ },
1325
+ {
1326
+ "step": 148,
1327
+ "epoch": 1.0033898305084745,
1328
+ "cpu_mem": 1.509605376,
1329
+ "gpu_mem": 4.443509248,
1330
+ "loss": 0.6157,
1331
+ "grad_norm": 11.473037719726562,
1332
+ "learning_rate": 0.0001748745007881561
1333
+ },
1334
+ {
1335
+ "step": 149,
1336
+ "epoch": 1.0101694915254238,
1337
+ "cpu_mem": 1.509605376,
1338
+ "gpu_mem": 4.443444736,
1339
+ "loss": 0.3944,
1340
+ "grad_norm": 5.674694538116455,
1341
+ "learning_rate": 0.00017311249971815185
1342
+ },
1343
+ {
1344
+ "step": 150,
1345
+ "epoch": 1.0169491525423728,
1346
+ "cpu_mem": 1.509605376,
1347
+ "gpu_mem": 4.44328192,
1348
+ "loss": 0.3795,
1349
+ "grad_norm": 6.071469783782959,
1350
+ "learning_rate": 0.00017134722574099276
1351
+ },
1352
+ {
1353
+ "step": 151,
1354
+ "epoch": 1.023728813559322,
1355
+ "cpu_mem": 1.509605376,
1356
+ "gpu_mem": 4.443354112,
1357
+ "loss": 0.3988,
1358
+ "grad_norm": 5.714435577392578,
1359
+ "learning_rate": 0.00016957892883300775
1360
+ },
1361
+ {
1362
+ "step": 152,
1363
+ "epoch": 1.0305084745762711,
1364
+ "cpu_mem": 1.509605376,
1365
+ "gpu_mem": 4.44338944,
1366
+ "loss": 0.3796,
1367
+ "grad_norm": 6.586073875427246,
1368
+ "learning_rate": 0.00016780785939859576
1369
+ },
1370
+ {
1371
+ "step": 153,
1372
+ "epoch": 1.0372881355932204,
1373
+ "cpu_mem": 1.509605376,
1374
+ "gpu_mem": 4.443414016,
1375
+ "loss": 0.452,
1376
+ "grad_norm": 6.130358695983887,
1377
+ "learning_rate": 0.00016603426823476693
1378
+ },
1379
+ {
1380
+ "step": 154,
1381
+ "epoch": 1.0440677966101695,
1382
+ "cpu_mem": 1.509605376,
1383
+ "gpu_mem": 4.443375616,
1384
+ "loss": 0.3782,
1385
+ "grad_norm": 6.469320774078369,
1386
+ "learning_rate": 0.00016425840649562736
1387
+ },
1388
+ {
1389
+ "step": 155,
1390
+ "epoch": 1.0508474576271187,
1391
+ "cpu_mem": 1.509605376,
1392
+ "gpu_mem": 4.4435968,
1393
+ "loss": 0.4447,
1394
+ "grad_norm": 9.796608924865723,
1395
+ "learning_rate": 0.00016248052565681436
1396
+ },
1397
+ {
1398
+ "step": 156,
1399
+ "epoch": 1.0576271186440678,
1400
+ "cpu_mem": 1.509605376,
1401
+ "gpu_mem": 4.44350464,
1402
+ "loss": 0.3952,
1403
+ "grad_norm": 10.653168678283691,
1404
+ "learning_rate": 0.00016070087747988482
1405
+ },
1406
+ {
1407
+ "step": 157,
1408
+ "epoch": 1.064406779661017,
1409
+ "cpu_mem": 1.509605376,
1410
+ "gpu_mem": 4.443410944,
1411
+ "loss": 0.4352,
1412
+ "grad_norm": 10.047937393188477,
1413
+ "learning_rate": 0.00015891971397666464
1414
+ },
1415
+ {
1416
+ "step": 158,
1417
+ "epoch": 1.071186440677966,
1418
+ "cpu_mem": 1.509605376,
1419
+ "gpu_mem": 4.443337216,
1420
+ "loss": 0.3561,
1421
+ "grad_norm": 7.982423782348633,
1422
+ "learning_rate": 0.00015713728737356137
1423
+ },
1424
+ {
1425
+ "step": 159,
1426
+ "epoch": 1.0779661016949154,
1427
+ "cpu_mem": 1.509605376,
1428
+ "gpu_mem": 4.443685888,
1429
+ "loss": 0.3552,
1430
+ "grad_norm": 8.629858016967773,
1431
+ "learning_rate": 0.00015535385007584706
1432
+ },
1433
+ {
1434
+ "step": 160,
1435
+ "epoch": 1.0847457627118644,
1436
+ "cpu_mem": 1.509605376,
1437
+ "gpu_mem": 4.443280384,
1438
+ "loss": 0.3805,
1439
+ "grad_norm": 10.6151123046875,
1440
+ "learning_rate": 0.0001535696546319161
1441
+ },
1442
+ {
1443
+ "step": 161,
1444
+ "epoch": 1.0915254237288137,
1445
+ "cpu_mem": 1.509605376,
1446
+ "gpu_mem": 4.443226624,
1447
+ "loss": 0.38,
1448
+ "grad_norm": 8.782222747802734,
1449
+ "learning_rate": 0.00015178495369752213
1450
+ },
1451
+ {
1452
+ "step": 162,
1453
+ "epoch": 1.0983050847457627,
1454
+ "cpu_mem": 1.509605376,
1455
+ "gpu_mem": 4.444002304,
1456
+ "loss": 0.26,
1457
+ "grad_norm": 7.612800598144531,
1458
+ "learning_rate": 0.00015
1459
+ },
1460
+ {
1461
+ "step": 163,
1462
+ "epoch": 1.1050847457627118,
1463
+ "cpu_mem": 1.509605376,
1464
+ "gpu_mem": 4.443478528,
1465
+ "loss": 0.3393,
1466
+ "grad_norm": 7.5802998542785645,
1467
+ "learning_rate": 0.00014821504630247785
1468
+ },
1469
+ {
1470
+ "step": 164,
1471
+ "epoch": 1.111864406779661,
1472
+ "cpu_mem": 1.509605376,
1473
+ "gpu_mem": 4.443390976,
1474
+ "loss": 0.429,
1475
+ "grad_norm": 9.032646179199219,
1476
+ "learning_rate": 0.00014643034536808387
1477
+ },
1478
+ {
1479
+ "step": 165,
1480
+ "epoch": 1.11864406779661,
1481
+ "cpu_mem": 1.509605376,
1482
+ "gpu_mem": 4.443340288,
1483
+ "loss": 0.3151,
1484
+ "grad_norm": 8.619991302490234,
1485
+ "learning_rate": 0.00014464614992415294
1486
+ },
1487
+ {
1488
+ "step": 166,
1489
+ "epoch": 1.1254237288135593,
1490
+ "cpu_mem": 1.509605376,
1491
+ "gpu_mem": 4.44343552,
1492
+ "loss": 0.3742,
1493
+ "grad_norm": 9.681302070617676,
1494
+ "learning_rate": 0.00014286271262643866
1495
+ },
1496
+ {
1497
+ "step": 167,
1498
+ "epoch": 1.1322033898305084,
1499
+ "cpu_mem": 1.509605376,
1500
+ "gpu_mem": 4.443352576,
1501
+ "loss": 0.4062,
1502
+ "grad_norm": 14.850049018859863,
1503
+ "learning_rate": 0.00014108028602333536
1504
+ },
1505
+ {
1506
+ "step": 168,
1507
+ "epoch": 1.1389830508474577,
1508
+ "cpu_mem": 1.509605376,
1509
+ "gpu_mem": 4.443371008,
1510
+ "loss": 0.4535,
1511
+ "grad_norm": 10.279086112976074,
1512
+ "learning_rate": 0.00013929912252011516
1513
+ },
1514
+ {
1515
+ "step": 169,
1516
+ "epoch": 1.1457627118644067,
1517
+ "cpu_mem": 1.509605376,
1518
+ "gpu_mem": 4.44345856,
1519
+ "loss": 0.3463,
1520
+ "grad_norm": 9.80820369720459,
1521
+ "learning_rate": 0.00013751947434318564
1522
+ },
1523
+ {
1524
+ "step": 170,
1525
+ "epoch": 1.152542372881356,
1526
+ "cpu_mem": 1.509605376,
1527
+ "gpu_mem": 4.44334336,
1528
+ "loss": 0.3783,
1529
+ "grad_norm": 10.852489471435547,
1530
+ "learning_rate": 0.00013574159350437261
1531
+ },
1532
+ {
1533
+ "step": 171,
1534
+ "epoch": 1.159322033898305,
1535
+ "cpu_mem": 1.509605376,
1536
+ "gpu_mem": 4.443406336,
1537
+ "loss": 0.4453,
1538
+ "grad_norm": 11.156643867492676,
1539
+ "learning_rate": 0.0001339657317652331
1540
+ },
1541
+ {
1542
+ "step": 172,
1543
+ "epoch": 1.1661016949152543,
1544
+ "cpu_mem": 1.509605376,
1545
+ "gpu_mem": 4.443314176,
1546
+ "loss": 0.3344,
1547
+ "grad_norm": 8.870190620422363,
1548
+ "learning_rate": 0.00013219214060140424
1549
+ },
1550
+ {
1551
+ "step": 173,
1552
+ "epoch": 1.1728813559322033,
1553
+ "cpu_mem": 1.509605376,
1554
+ "gpu_mem": 4.443613696,
1555
+ "loss": 0.4352,
1556
+ "grad_norm": 9.64120101928711,
1557
+ "learning_rate": 0.00013042107116699228
1558
+ },
1559
+ {
1560
+ "step": 174,
1561
+ "epoch": 1.1796610169491526,
1562
+ "cpu_mem": 1.509605376,
1563
+ "gpu_mem": 4.443337216,
1564
+ "loss": 0.333,
1565
+ "grad_norm": 8.450601577758789,
1566
+ "learning_rate": 0.00012865277425900724
1567
+ },
1568
+ {
1569
+ "step": 175,
1570
+ "epoch": 1.1864406779661016,
1571
+ "cpu_mem": 1.509605376,
1572
+ "gpu_mem": 4.443303424,
1573
+ "loss": 0.3363,
1574
+ "grad_norm": 7.288397312164307,
1575
+ "learning_rate": 0.00012688750028184818
1576
+ },
1577
+ {
1578
+ "step": 176,
1579
+ "epoch": 1.193220338983051,
1580
+ "cpu_mem": 1.509605376,
1581
+ "gpu_mem": 4.443441664,
1582
+ "loss": 0.3218,
1583
+ "grad_norm": 10.227561950683594,
1584
+ "learning_rate": 0.0001251254992118439
1585
+ },
1586
+ {
1587
+ "step": 177,
1588
+ "epoch": 1.2,
1589
+ "cpu_mem": 1.509605376,
1590
+ "gpu_mem": 4.443539968,
1591
+ "loss": 0.351,
1592
+ "grad_norm": 7.246642112731934,
1593
+ "learning_rate": 0.00012336702056185453
1594
+ },
1595
+ {
1596
+ "step": 178,
1597
+ "epoch": 1.2067796610169492,
1598
+ "cpu_mem": 1.509605376,
1599
+ "gpu_mem": 4.443286528,
1600
+ "loss": 0.5062,
1601
+ "grad_norm": 15.948833465576172,
1602
+ "learning_rate": 0.00012161231334593851
1603
+ },
1604
+ {
1605
+ "step": 179,
1606
+ "epoch": 1.2135593220338983,
1607
+ "cpu_mem": 1.509605376,
1608
+ "gpu_mem": 4.443386368,
1609
+ "loss": 0.413,
1610
+ "grad_norm": 9.143070220947266,
1611
+ "learning_rate": 0.00011986162604409015
1612
+ },
1613
+ {
1614
+ "step": 180,
1615
+ "epoch": 1.2203389830508475,
1616
+ "cpu_mem": 1.509605376,
1617
+ "gpu_mem": 4.44335872,
1618
+ "loss": 0.314,
1619
+ "grad_norm": 9.495607376098633,
1620
+ "learning_rate": 0.00011811520656705348
1621
+ },
1622
+ {
1623
+ "step": 181,
1624
+ "epoch": 1.2271186440677966,
1625
+ "cpu_mem": 1.509605376,
1626
+ "gpu_mem": 4.443295744,
1627
+ "loss": 0.3162,
1628
+ "grad_norm": 9.015178680419922,
1629
+ "learning_rate": 0.00011637330222121543
1630
+ },
1631
+ {
1632
+ "step": 182,
1633
+ "epoch": 1.2338983050847459,
1634
+ "cpu_mem": 1.509605376,
1635
+ "gpu_mem": 4.443513856,
1636
+ "loss": 0.4318,
1637
+ "grad_norm": 10.653160095214844,
1638
+ "learning_rate": 0.00011463615967358588
1639
+ },
1640
+ {
1641
+ "step": 183,
1642
+ "epoch": 1.240677966101695,
1643
+ "cpu_mem": 1.509605376,
1644
+ "gpu_mem": 4.443410944,
1645
+ "loss": 0.3551,
1646
+ "grad_norm": 11.33315372467041,
1647
+ "learning_rate": 0.00011290402491686766
1648
+ },
1649
+ {
1650
+ "step": 184,
1651
+ "epoch": 1.2474576271186442,
1652
+ "cpu_mem": 1.509605376,
1653
+ "gpu_mem": 4.44335872,
1654
+ "loss": 0.3593,
1655
+ "grad_norm": 9.929872512817383,
1656
+ "learning_rate": 0.00011117714323462186
1657
+ },
1658
+ {
1659
+ "step": 185,
1660
+ "epoch": 1.2542372881355932,
1661
+ "cpu_mem": 1.509605376,
1662
+ "gpu_mem": 4.443337216,
1663
+ "loss": 0.3663,
1664
+ "grad_norm": 9.877488136291504,
1665
+ "learning_rate": 0.00010945575916653407
1666
+ },
1667
+ {
1668
+ "step": 186,
1669
+ "epoch": 1.2610169491525425,
1670
+ "cpu_mem": 1.509605376,
1671
+ "gpu_mem": 4.443346432,
1672
+ "loss": 0.2439,
1673
+ "grad_norm": 6.803407192230225,
1674
+ "learning_rate": 0.00010774011647378553
1675
+ },
1676
+ {
1677
+ "step": 187,
1678
+ "epoch": 1.2677966101694915,
1679
+ "cpu_mem": 1.509605376,
1680
+ "gpu_mem": 4.443278848,
1681
+ "loss": 0.4327,
1682
+ "grad_norm": 13.585451126098633,
1683
+ "learning_rate": 0.00010603045810453468
1684
+ },
1685
+ {
1686
+ "step": 188,
1687
+ "epoch": 1.2745762711864406,
1688
+ "cpu_mem": 1.509605376,
1689
+ "gpu_mem": 4.443441664,
1690
+ "loss": 0.245,
1691
+ "grad_norm": 10.62351131439209,
1692
+ "learning_rate": 0.00010432702615951396
1693
+ },
1694
+ {
1695
+ "step": 189,
1696
+ "epoch": 1.2813559322033898,
1697
+ "cpu_mem": 1.509605376,
1698
+ "gpu_mem": 4.443311104,
1699
+ "loss": 0.4337,
1700
+ "grad_norm": 9.73697280883789,
1701
+ "learning_rate": 0.00010263006185774627
1702
+ },
1703
+ {
1704
+ "step": 190,
1705
+ "epoch": 1.288135593220339,
1706
+ "cpu_mem": 1.509605376,
1707
+ "gpu_mem": 4.443430912,
1708
+ "loss": 0.3561,
1709
+ "grad_norm": 11.984561920166016,
1710
+ "learning_rate": 0.00010093980550238675
1711
+ },
1712
+ {
1713
+ "step": 191,
1714
+ "epoch": 1.2949152542372881,
1715
+ "cpu_mem": 1.509605376,
1716
+ "gpu_mem": 4.443249664,
1717
+ "loss": 0.3225,
1718
+ "grad_norm": 12.810022354125977,
1719
+ "learning_rate": 9.925649644669391e-05
1720
+ },
1721
+ {
1722
+ "step": 192,
1723
+ "epoch": 1.3016949152542372,
1724
+ "cpu_mem": 1.509605376,
1725
+ "gpu_mem": 4.44338176,
1726
+ "loss": 0.2664,
1727
+ "grad_norm": 10.05370807647705,
1728
+ "learning_rate": 9.758037306013526e-05
1729
+ },
1730
+ {
1731
+ "step": 193,
1732
+ "epoch": 1.3084745762711865,
1733
+ "cpu_mem": 1.509605376,
1734
+ "gpu_mem": 4.443355648,
1735
+ "loss": 0.4455,
1736
+ "grad_norm": 14.030529975891113,
1737
+ "learning_rate": 9.591167269463255e-05
1738
+ },
1739
+ {
1740
+ "step": 194,
1741
+ "epoch": 1.3152542372881357,
1742
+ "cpu_mem": 1.509605376,
1743
+ "gpu_mem": 4.443321856,
1744
+ "loss": 0.3602,
1745
+ "grad_norm": 12.235883712768555,
1746
+ "learning_rate": 9.425063165095088e-05
1747
+ },
1748
+ {
1749
+ "step": 195,
1750
+ "epoch": 1.3220338983050848,
1751
+ "cpu_mem": 1.509605376,
1752
+ "gpu_mem": 4.443426304,
1753
+ "loss": 0.2762,
1754
+ "grad_norm": 9.612601280212402,
1755
+ "learning_rate": 9.259748514523653e-05
1756
+ },
1757
+ {
1758
+ "step": 196,
1759
+ "epoch": 1.3288135593220338,
1760
+ "cpu_mem": 1.509605376,
1761
+ "gpu_mem": 4.443421696,
1762
+ "loss": 0.3528,
1763
+ "grad_norm": 8.862492561340332,
1764
+ "learning_rate": 9.095246727570879e-05
1765
+ },
1766
+ {
1767
+ "step": 197,
1768
+ "epoch": 1.335593220338983,
1769
+ "cpu_mem": 1.509605376,
1770
+ "gpu_mem": 4.443280384,
1771
+ "loss": 0.2842,
1772
+ "grad_norm": 11.436196327209473,
1773
+ "learning_rate": 8.931581098950973e-05
1774
+ },
1775
+ {
1776
+ "step": 198,
1777
+ "epoch": 1.3423728813559321,
1778
+ "cpu_mem": 1.509605376,
1779
+ "gpu_mem": 4.443472384,
1780
+ "loss": 0.3027,
1781
+ "grad_norm": 10.520874977111816,
1782
+ "learning_rate": 8.768774804971705e-05
1783
+ },
1784
+ {
1785
+ "step": 199,
1786
+ "epoch": 1.3491525423728814,
1787
+ "cpu_mem": 1.509605376,
1788
+ "gpu_mem": 4.443323392,
1789
+ "loss": 0.3717,
1790
+ "grad_norm": 14.724334716796875,
1791
+ "learning_rate": 8.606850900252478e-05
1792
+ },
1793
+ {
1794
+ "step": 200,
1795
+ "epoch": 1.3559322033898304,
1796
+ "cpu_mem": 1.509605376,
1797
+ "gpu_mem": 4.443426304,
1798
+ "loss": 0.3074,
1799
+ "grad_norm": 12.475892066955566,
1800
+ "learning_rate": 8.445832314459608e-05
1801
+ },
1802
+ {
1803
+ "step": 201,
1804
+ "epoch": 1.3627118644067797,
1805
+ "cpu_mem": 1.509605376,
1806
+ "gpu_mem": 4.443629056,
1807
+ "loss": 0.2782,
1808
+ "grad_norm": 8.169771194458008,
1809
+ "learning_rate": 8.285741849059311e-05
1810
+ },
1811
+ {
1812
+ "step": 202,
1813
+ "epoch": 1.3694915254237288,
1814
+ "cpu_mem": 1.509605376,
1815
+ "gpu_mem": 4.443430912,
1816
+ "loss": 0.2923,
1817
+ "grad_norm": 9.363910675048828,
1818
+ "learning_rate": 8.126602174088843e-05
1819
+ },
1820
+ {
1821
+ "step": 203,
1822
+ "epoch": 1.376271186440678,
1823
+ "cpu_mem": 1.509605376,
1824
+ "gpu_mem": 4.443317248,
1825
+ "loss": 0.2826,
1826
+ "grad_norm": 7.223607063293457,
1827
+ "learning_rate": 7.968435824946242e-05
1828
+ },
1829
+ {
1830
+ "step": 204,
1831
+ "epoch": 1.383050847457627,
1832
+ "cpu_mem": 1.509605376,
1833
+ "gpu_mem": 4.443331072,
1834
+ "loss": 0.262,
1835
+ "grad_norm": 8.399836540222168,
1836
+ "learning_rate": 7.811265199199152e-05
1837
+ },
1838
+ {
1839
+ "step": 205,
1840
+ "epoch": 1.3898305084745763,
1841
+ "cpu_mem": 1.509605376,
1842
+ "gpu_mem": 4.443375616,
1843
+ "loss": 0.2814,
1844
+ "grad_norm": 8.11514663696289,
1845
+ "learning_rate": 7.655112553413135e-05
1846
+ },
1847
+ {
1848
+ "step": 206,
1849
+ "epoch": 1.3966101694915254,
1850
+ "cpu_mem": 1.509605376,
1851
+ "gpu_mem": 4.443317248,
1852
+ "loss": 0.2771,
1853
+ "grad_norm": 9.344612121582031,
1854
+ "learning_rate": 7.500000000000002e-05
1855
+ },
1856
+ {
1857
+ "step": 207,
1858
+ "epoch": 1.4033898305084747,
1859
+ "cpu_mem": 1.509605376,
1860
+ "gpu_mem": 4.44355072,
1861
+ "loss": 0.3327,
1862
+ "grad_norm": 10.335131645202637,
1863
+ "learning_rate": 7.345949504086507e-05
1864
+ },
1865
+ {
1866
+ "step": 208,
1867
+ "epoch": 1.4101694915254237,
1868
+ "cpu_mem": 1.509605376,
1869
+ "gpu_mem": 4.44358144,
1870
+ "loss": 0.2319,
1871
+ "grad_norm": 12.18466567993164,
1872
+ "learning_rate": 7.192982880403917e-05
1873
+ },
1874
+ {
1875
+ "step": 209,
1876
+ "epoch": 1.4169491525423727,
1877
+ "cpu_mem": 1.509605376,
1878
+ "gpu_mem": 4.443507712,
1879
+ "loss": 0.3699,
1880
+ "grad_norm": 11.099276542663574,
1881
+ "learning_rate": 7.041121790198881e-05
1882
+ },
1883
+ {
1884
+ "step": 210,
1885
+ "epoch": 1.423728813559322,
1886
+ "cpu_mem": 1.509605376,
1887
+ "gpu_mem": 4.443395584,
1888
+ "loss": 0.4507,
1889
+ "grad_norm": 12.702630996704102,
1890
+ "learning_rate": 6.890387738166041e-05
1891
+ },
1892
+ {
1893
+ "step": 211,
1894
+ "epoch": 1.4305084745762713,
1895
+ "cpu_mem": 1.509605376,
1896
+ "gpu_mem": 4.443344896,
1897
+ "loss": 0.4079,
1898
+ "grad_norm": 10.903487205505371,
1899
+ "learning_rate": 6.740802069402771e-05
1900
+ },
1901
+ {
1902
+ "step": 212,
1903
+ "epoch": 1.4372881355932203,
1904
+ "cpu_mem": 1.509605376,
1905
+ "gpu_mem": 4.443314176,
1906
+ "loss": 0.3,
1907
+ "grad_norm": 10.615644454956055,
1908
+ "learning_rate": 6.592385966386588e-05
1909
+ },
1910
+ {
1911
+ "step": 213,
1912
+ "epoch": 1.4440677966101694,
1913
+ "cpu_mem": 1.509605376,
1914
+ "gpu_mem": 4.443337216,
1915
+ "loss": 0.3518,
1916
+ "grad_norm": 10.656424522399902,
1917
+ "learning_rate": 6.445160445975536e-05
1918
+ },
1919
+ {
1920
+ "step": 214,
1921
+ "epoch": 1.4508474576271186,
1922
+ "cpu_mem": 1.509605376,
1923
+ "gpu_mem": 4.44342016,
1924
+ "loss": 0.4096,
1925
+ "grad_norm": 16.58029556274414,
1926
+ "learning_rate": 6.299146356432029e-05
1927
+ },
1928
+ {
1929
+ "step": 215,
1930
+ "epoch": 1.457627118644068,
1931
+ "cpu_mem": 1.509605376,
1932
+ "gpu_mem": 4.443347968,
1933
+ "loss": 0.4284,
1934
+ "grad_norm": 16.37483024597168,
1935
+ "learning_rate": 6.154364374470568e-05
1936
+ },
1937
+ {
1938
+ "step": 216,
1939
+ "epoch": 1.464406779661017,
1940
+ "cpu_mem": 1.509605376,
1941
+ "gpu_mem": 4.443513856,
1942
+ "loss": 0.2537,
1943
+ "grad_norm": 8.520498275756836,
1944
+ "learning_rate": 6.010835002329795e-05
1945
+ },
1946
+ {
1947
+ "step": 217,
1948
+ "epoch": 1.471186440677966,
1949
+ "cpu_mem": 1.509605376,
1950
+ "gpu_mem": 4.443355648,
1951
+ "loss": 0.3216,
1952
+ "grad_norm": 15.882022857666016,
1953
+ "learning_rate": 5.8685785648691894e-05
1954
+ },
1955
+ {
1956
+ "step": 218,
1957
+ "epoch": 1.4779661016949153,
1958
+ "cpu_mem": 1.509801984,
1959
+ "gpu_mem": 4.443332608,
1960
+ "loss": 0.3527,
1961
+ "grad_norm": 11.748701095581055,
1962
+ "learning_rate": 5.72761520669092e-05
1963
+ },
1964
+ {
1965
+ "step": 219,
1966
+ "epoch": 1.4847457627118645,
1967
+ "cpu_mem": 1.509801984,
1968
+ "gpu_mem": 4.44345856,
1969
+ "loss": 0.4225,
1970
+ "grad_norm": 10.945619583129883,
1971
+ "learning_rate": 5.587964889287218e-05
1972
+ },
1973
+ {
1974
+ "step": 220,
1975
+ "epoch": 1.4915254237288136,
1976
+ "cpu_mem": 1.509801984,
1977
+ "gpu_mem": 4.443492352,
1978
+ "loss": 0.3201,
1979
+ "grad_norm": 10.223454475402832,
1980
+ "learning_rate": 5.449647388213678e-05
1981
+ },
1982
+ {
1983
+ "step": 221,
1984
+ "epoch": 1.4983050847457626,
1985
+ "cpu_mem": 1.509801984,
1986
+ "gpu_mem": 4.443360256,
1987
+ "loss": 0.2756,
1988
+ "grad_norm": 8.79985237121582,
1989
+ "learning_rate": 5.312682290288869e-05
1990
+ },
1991
+ {
1992
+ "step": 222,
1993
+ "epoch": 1.505084745762712,
1994
+ "cpu_mem": 1.509801984,
1995
+ "gpu_mem": 4.44349696,
1996
+ "loss": 0.3005,
1997
+ "grad_norm": 9.963611602783203,
1998
+ "learning_rate": 5.1770889908207245e-05
1999
+ },
2000
+ {
2001
+ "step": 223,
2002
+ "epoch": 1.5118644067796612,
2003
+ "cpu_mem": 1.509801984,
2004
+ "gpu_mem": 4.443410944,
2005
+ "loss": 0.2828,
2006
+ "grad_norm": 13.369943618774414,
2007
+ "learning_rate": 5.0428866908599864e-05
2008
+ },
2009
+ {
2010
+ "step": 224,
2011
+ "epoch": 1.5186440677966102,
2012
+ "cpu_mem": 1.509801984,
2013
+ "gpu_mem": 4.443375616,
2014
+ "loss": 0.2504,
2015
+ "grad_norm": 8.19646167755127,
2016
+ "learning_rate": 4.9100943944812114e-05
2017
+ },
2018
+ {
2019
+ "step": 225,
2020
+ "epoch": 1.5254237288135593,
2021
+ "cpu_mem": 1.509801984,
2022
+ "gpu_mem": 4.443340288,
2023
+ "loss": 0.2712,
2024
+ "grad_norm": 10.986623764038086,
2025
+ "learning_rate": 4.778730906091632e-05
2026
+ },
2027
+ {
2028
+ "step": 226,
2029
+ "epoch": 1.5322033898305085,
2030
+ "cpu_mem": 1.509801984,
2031
+ "gpu_mem": 4.44348928,
2032
+ "loss": 0.3418,
2033
+ "grad_norm": 9.064874649047852,
2034
+ "learning_rate": 4.648814827768322e-05
2035
+ },
2036
+ {
2037
+ "step": 227,
2038
+ "epoch": 1.5389830508474578,
2039
+ "cpu_mem": 1.509801984,
2040
+ "gpu_mem": 4.443378688,
2041
+ "loss": 0.3776,
2042
+ "grad_norm": 13.318199157714844,
2043
+ "learning_rate": 4.5203645566239816e-05
2044
+ },
2045
+ {
2046
+ "step": 228,
2047
+ "epoch": 1.5457627118644068,
2048
+ "cpu_mem": 1.509801984,
2049
+ "gpu_mem": 4.443323392,
2050
+ "loss": 0.4031,
2051
+ "grad_norm": 11.604644775390625,
2052
+ "learning_rate": 4.3933982822017876e-05
2053
+ },
2054
+ {
2055
+ "step": 229,
2056
+ "epoch": 1.5525423728813559,
2057
+ "cpu_mem": 1.509801984,
2058
+ "gpu_mem": 4.443265024,
2059
+ "loss": 0.2382,
2060
+ "grad_norm": 7.073112487792969,
2061
+ "learning_rate": 4.267933983899601e-05
2062
+ },
2063
+ {
2064
+ "step": 230,
2065
+ "epoch": 1.559322033898305,
2066
+ "cpu_mem": 1.509801984,
2067
+ "gpu_mem": 4.443321856,
2068
+ "loss": 0.2682,
2069
+ "grad_norm": 8.373688697814941,
2070
+ "learning_rate": 4.143989428423947e-05
2071
+ },
2072
+ {
2073
+ "step": 231,
2074
+ "epoch": 1.5661016949152542,
2075
+ "cpu_mem": 1.509801984,
2076
+ "gpu_mem": 4.443599872,
2077
+ "loss": 0.4171,
2078
+ "grad_norm": 13.975024223327637,
2079
+ "learning_rate": 4.0215821672741213e-05
2080
+ },
2081
+ {
2082
+ "step": 232,
2083
+ "epoch": 1.5728813559322035,
2084
+ "cpu_mem": 1.509801984,
2085
+ "gpu_mem": 4.443323392,
2086
+ "loss": 0.3271,
2087
+ "grad_norm": 9.13338851928711,
2088
+ "learning_rate": 3.900729534256745e-05
2089
+ },
2090
+ {
2091
+ "step": 233,
2092
+ "epoch": 1.5796610169491525,
2093
+ "cpu_mem": 1.509801984,
2094
+ "gpu_mem": 4.443636736,
2095
+ "loss": 0.3622,
2096
+ "grad_norm": 12.784040451049805,
2097
+ "learning_rate": 3.781448643031187e-05
2098
+ },
2099
+ {
2100
+ "step": 234,
2101
+ "epoch": 1.5864406779661016,
2102
+ "cpu_mem": 1.509801984,
2103
+ "gpu_mem": 4.44351232,
2104
+ "loss": 0.2907,
2105
+ "grad_norm": 9.858100891113281,
2106
+ "learning_rate": 3.663756384686127e-05
2107
+ },
2108
+ {
2109
+ "step": 235,
2110
+ "epoch": 1.5932203389830508,
2111
+ "cpu_mem": 1.509801984,
2112
+ "gpu_mem": 4.443268096,
2113
+ "loss": 0.2375,
2114
+ "grad_norm": 7.437261581420898,
2115
+ "learning_rate": 3.547669425347647e-05
2116
+ },
2117
+ {
2118
+ "step": 236,
2119
+ "epoch": 1.6,
2120
+ "cpu_mem": 1.509801984,
2121
+ "gpu_mem": 4.443328,
2122
+ "loss": 0.3846,
2123
+ "grad_norm": 15.018045425415039,
2124
+ "learning_rate": 3.433204203819185e-05
2125
+ },
2126
+ {
2127
+ "step": 237,
2128
+ "epoch": 1.6067796610169491,
2129
+ "cpu_mem": 1.509801984,
2130
+ "gpu_mem": 4.44338944,
2131
+ "loss": 0.4068,
2132
+ "grad_norm": 13.178804397583008,
2133
+ "learning_rate": 3.3203769292536764e-05
2134
+ },
2135
+ {
2136
+ "step": 238,
2137
+ "epoch": 1.6135593220338982,
2138
+ "cpu_mem": 1.509801984,
2139
+ "gpu_mem": 4.443390976,
2140
+ "loss": 0.3033,
2141
+ "grad_norm": 11.403144836425781,
2142
+ "learning_rate": 3.209203578858191e-05
2143
+ },
2144
+ {
2145
+ "step": 239,
2146
+ "epoch": 1.6203389830508474,
2147
+ "cpu_mem": 1.509801984,
2148
+ "gpu_mem": 4.443644416,
2149
+ "loss": 0.3641,
2150
+ "grad_norm": 11.864053726196289,
2151
+ "learning_rate": 3.099699895631474e-05
2152
+ },
2153
+ {
2154
+ "step": 240,
2155
+ "epoch": 1.6271186440677967,
2156
+ "cpu_mem": 1.509801984,
2157
+ "gpu_mem": 4.443294208,
2158
+ "loss": 0.5409,
2159
+ "grad_norm": 16.487295150756836,
2160
+ "learning_rate": 2.9918813861345952e-05
2161
+ },
2162
+ {
2163
+ "step": 241,
2164
+ "epoch": 1.6338983050847458,
2165
+ "cpu_mem": 1.509801984,
2166
+ "gpu_mem": 4.443590656,
2167
+ "loss": 0.2953,
2168
+ "grad_norm": 10.208303451538086,
2169
+ "learning_rate": 2.885763318295102e-05
2170
+ },
2171
+ {
2172
+ "step": 242,
2173
+ "epoch": 1.6406779661016948,
2174
+ "cpu_mem": 1.509801984,
2175
+ "gpu_mem": 4.443452416,
2176
+ "loss": 0.3304,
2177
+ "grad_norm": 12.945493698120117,
2178
+ "learning_rate": 2.781360719244964e-05
2179
+ },
2180
+ {
2181
+ "step": 243,
2182
+ "epoch": 1.647457627118644,
2183
+ "cpu_mem": 1.509801984,
2184
+ "gpu_mem": 4.44330496,
2185
+ "loss": 0.4223,
2186
+ "grad_norm": 10.953478813171387,
2187
+ "learning_rate": 2.6786883731926306e-05
2188
+ },
2189
+ {
2190
+ "step": 244,
2191
+ "epoch": 1.6542372881355933,
2192
+ "cpu_mem": 1.509801984,
2193
+ "gpu_mem": 4.443444736,
2194
+ "loss": 0.2251,
2195
+ "grad_norm": 8.908162117004395,
2196
+ "learning_rate": 2.5777608193294396e-05
2197
+ },
2198
+ {
2199
+ "step": 245,
2200
+ "epoch": 1.6610169491525424,
2201
+ "cpu_mem": 1.509801984,
2202
+ "gpu_mem": 4.443323392,
2203
+ "loss": 0.3956,
2204
+ "grad_norm": 11.671682357788086,
2205
+ "learning_rate": 2.4785923497707956e-05
2206
+ },
2207
+ {
2208
+ "step": 246,
2209
+ "epoch": 1.6677966101694914,
2210
+ "cpu_mem": 1.509801984,
2211
+ "gpu_mem": 4.443417088,
2212
+ "loss": 0.4087,
2213
+ "grad_norm": 10.720558166503906,
2214
+ "learning_rate": 2.38119700753228e-05
2215
+ },
2216
+ {
2217
+ "step": 247,
2218
+ "epoch": 1.6745762711864407,
2219
+ "cpu_mem": 1.509801984,
2220
+ "gpu_mem": 4.44343552,
2221
+ "loss": 0.2392,
2222
+ "grad_norm": 7.951722621917725,
2223
+ "learning_rate": 2.285588584541047e-05
2224
+ },
2225
+ {
2226
+ "step": 248,
2227
+ "epoch": 1.68135593220339,
2228
+ "cpu_mem": 1.509801984,
2229
+ "gpu_mem": 4.443387904,
2230
+ "loss": 0.286,
2231
+ "grad_norm": 8.448762893676758,
2232
+ "learning_rate": 2.1917806196827792e-05
2233
+ },
2234
+ {
2235
+ "step": 249,
2236
+ "epoch": 1.688135593220339,
2237
+ "cpu_mem": 1.509801984,
2238
+ "gpu_mem": 4.443294208,
2239
+ "loss": 0.2903,
2240
+ "grad_norm": 7.876997947692871,
2241
+ "learning_rate": 2.0997863968844914e-05
2242
+ },
2243
+ {
2244
+ "step": 250,
2245
+ "epoch": 1.694915254237288,
2246
+ "cpu_mem": 1.510785024,
2247
+ "gpu_mem": 4.443386368,
2248
+ "loss": 0.3307,
2249
+ "grad_norm": 11.870707511901855,
2250
+ "learning_rate": 2.009618943233419e-05
2251
+ },
2252
+ {
2253
+ "step": 251,
2254
+ "epoch": 1.7016949152542373,
2255
+ "cpu_mem": 1.510785024,
2256
+ "gpu_mem": 4.443298816,
2257
+ "loss": 0.2187,
2258
+ "grad_norm": 9.706609725952148,
2259
+ "learning_rate": 1.921291027132278e-05
2260
+ },
2261
+ {
2262
+ "step": 252,
2263
+ "epoch": 1.7084745762711866,
2264
+ "cpu_mem": 1.510785024,
2265
+ "gpu_mem": 4.443341824,
2266
+ "loss": 0.3021,
2267
+ "grad_norm": 8.621807098388672,
2268
+ "learning_rate": 1.834815156491165e-05
2269
+ },
2270
+ {
2271
+ "step": 253,
2272
+ "epoch": 1.7152542372881356,
2273
+ "cpu_mem": 1.510785024,
2274
+ "gpu_mem": 4.44353536,
2275
+ "loss": 0.3015,
2276
+ "grad_norm": 10.553934097290039,
2277
+ "learning_rate": 1.750203576956341e-05
2278
+ },
2279
+ {
2280
+ "step": 254,
2281
+ "epoch": 1.7220338983050847,
2282
+ "cpu_mem": 1.510785024,
2283
+ "gpu_mem": 4.443331072,
2284
+ "loss": 0.3854,
2285
+ "grad_norm": 11.682550430297852,
2286
+ "learning_rate": 1.6674682701761493e-05
2287
+ },
2288
+ {
2289
+ "step": 255,
2290
+ "epoch": 1.7288135593220337,
2291
+ "cpu_mem": 1.510785024,
2292
+ "gpu_mem": 4.443487744,
2293
+ "loss": 0.3721,
2294
+ "grad_norm": 12.575425148010254,
2295
+ "learning_rate": 1.5866209521043304e-05
2296
+ },
2297
+ {
2298
+ "step": 256,
2299
+ "epoch": 1.735593220338983,
2300
+ "cpu_mem": 1.510785024,
2301
+ "gpu_mem": 4.443314176,
2302
+ "loss": 0.3563,
2303
+ "grad_norm": 13.305479049682617,
2304
+ "learning_rate": 1.5076730713409523e-05
2305
+ },
2306
+ {
2307
+ "step": 257,
2308
+ "epoch": 1.7423728813559323,
2309
+ "cpu_mem": 1.510785024,
2310
+ "gpu_mem": 4.44372736,
2311
+ "loss": 0.372,
2312
+ "grad_norm": 8.018106460571289,
2313
+ "learning_rate": 1.4306358075111923e-05
2314
+ },
2315
+ {
2316
+ "step": 258,
2317
+ "epoch": 1.7491525423728813,
2318
+ "cpu_mem": 1.510785024,
2319
+ "gpu_mem": 4.443386368,
2320
+ "loss": 0.3647,
2321
+ "grad_norm": 11.9230375289917,
2322
+ "learning_rate": 1.3555200696822232e-05
2323
+ },
2324
+ {
2325
+ "step": 259,
2326
+ "epoch": 1.7559322033898304,
2327
+ "cpu_mem": 1.510785024,
2328
+ "gpu_mem": 4.443303424,
2329
+ "loss": 0.3188,
2330
+ "grad_norm": 9.143546104431152,
2331
+ "learning_rate": 1.2823364948184095e-05
2332
+ },
2333
+ {
2334
+ "step": 260,
2335
+ "epoch": 1.7627118644067796,
2336
+ "cpu_mem": 1.510785024,
2337
+ "gpu_mem": 4.44342016,
2338
+ "loss": 0.2007,
2339
+ "grad_norm": 7.097804069519043,
2340
+ "learning_rate": 1.2110954462750166e-05
2341
+ },
2342
+ {
2343
+ "step": 261,
2344
+ "epoch": 1.769491525423729,
2345
+ "cpu_mem": 1.510785024,
2346
+ "gpu_mem": 4.443375616,
2347
+ "loss": 0.1669,
2348
+ "grad_norm": 6.277010440826416,
2349
+ "learning_rate": 1.1418070123306989e-05
2350
+ },
2351
+ {
2352
+ "step": 262,
2353
+ "epoch": 1.776271186440678,
2354
+ "cpu_mem": 1.510785024,
2355
+ "gpu_mem": 4.443332608,
2356
+ "loss": 0.2355,
2357
+ "grad_norm": 8.6841459274292,
2358
+ "learning_rate": 1.0744810047589115e-05
2359
+ },
2360
+ {
2361
+ "step": 263,
2362
+ "epoch": 1.783050847457627,
2363
+ "cpu_mem": 1.510785024,
2364
+ "gpu_mem": 4.443369472,
2365
+ "loss": 0.273,
2366
+ "grad_norm": 7.298412322998047,
2367
+ "learning_rate": 1.0091269574384874e-05
2368
+ },
2369
+ {
2370
+ "step": 264,
2371
+ "epoch": 1.7898305084745763,
2372
+ "cpu_mem": 1.510785024,
2373
+ "gpu_mem": 4.443457024,
2374
+ "loss": 0.2702,
2375
+ "grad_norm": 10.429343223571777,
2376
+ "learning_rate": 9.45754125003576e-06
2377
+ },
2378
+ {
2379
+ "step": 265,
2380
+ "epoch": 1.7966101694915255,
2381
+ "cpu_mem": 1.510785024,
2382
+ "gpu_mem": 4.443375616,
2383
+ "loss": 0.3714,
2384
+ "grad_norm": 10.106574058532715,
2385
+ "learning_rate": 8.843714815330987e-06
2386
+ },
2387
+ {
2388
+ "step": 266,
2389
+ "epoch": 1.8033898305084746,
2390
+ "cpu_mem": 1.510785024,
2391
+ "gpu_mem": 4.443590656,
2392
+ "loss": 0.3821,
2393
+ "grad_norm": 12.320114135742188,
2394
+ "learning_rate": 8.249877192799731e-06
2395
+ },
2396
+ {
2397
+ "step": 267,
2398
+ "epoch": 1.8101694915254236,
2399
+ "cpu_mem": 1.510785024,
2400
+ "gpu_mem": 4.443383296,
2401
+ "loss": 0.2547,
2402
+ "grad_norm": 9.98828125,
2403
+ "learning_rate": 7.676112474402068e-06
2404
+ },
2405
+ {
2406
+ "step": 268,
2407
+ "epoch": 1.8169491525423729,
2408
+ "cpu_mem": 1.510785024,
2409
+ "gpu_mem": 4.443387904,
2410
+ "loss": 0.2823,
2411
+ "grad_norm": 10.371786117553711,
2412
+ "learning_rate": 7.122501909620926e-06
2413
+ },
2414
+ {
2415
+ "step": 269,
2416
+ "epoch": 1.8237288135593221,
2417
+ "cpu_mem": 1.510785024,
2418
+ "gpu_mem": 4.443398656,
2419
+ "loss": 0.3287,
2420
+ "grad_norm": 9.897567749023438,
2421
+ "learning_rate": 6.5891238939566275e-06
2422
+ },
2423
+ {
2424
+ "step": 270,
2425
+ "epoch": 1.8305084745762712,
2426
+ "cpu_mem": 1.510785024,
2427
+ "gpu_mem": 4.443437056,
2428
+ "loss": 0.2544,
2429
+ "grad_norm": 10.063066482543945,
2430
+ "learning_rate": 6.076053957825411e-06
2431
+ },
2432
+ {
2433
+ "step": 271,
2434
+ "epoch": 1.8372881355932202,
2435
+ "cpu_mem": 1.510785024,
2436
+ "gpu_mem": 4.44348928,
2437
+ "loss": 0.3142,
2438
+ "grad_norm": 10.186683654785156,
2439
+ "learning_rate": 5.583364755863701e-06
2440
+ },
2441
+ {
2442
+ "step": 272,
2443
+ "epoch": 1.8440677966101695,
2444
+ "cpu_mem": 1.510785024,
2445
+ "gpu_mem": 4.443347968,
2446
+ "loss": 0.3402,
2447
+ "grad_norm": 8.863958358764648,
2448
+ "learning_rate": 5.11112605663977e-06
2449
+ },
2450
+ {
2451
+ "step": 273,
2452
+ "epoch": 1.8508474576271188,
2453
+ "cpu_mem": 1.510785024,
2454
+ "gpu_mem": 4.44322816,
2455
+ "loss": 0.2973,
2456
+ "grad_norm": 9.513245582580566,
2457
+ "learning_rate": 4.659404732773908e-06
2458
+ },
2459
+ {
2460
+ "step": 274,
2461
+ "epoch": 1.8576271186440678,
2462
+ "cpu_mem": 1.510785024,
2463
+ "gpu_mem": 4.443455488,
2464
+ "loss": 0.279,
2465
+ "grad_norm": 10.836873054504395,
2466
+ "learning_rate": 4.228264751468752e-06
2467
+ },
2468
+ {
2469
+ "step": 275,
2470
+ "epoch": 1.8644067796610169,
2471
+ "cpu_mem": 1.510785024,
2472
+ "gpu_mem": 4.443699712,
2473
+ "loss": 0.32,
2474
+ "grad_norm": 11.420705795288086,
2475
+ "learning_rate": 3.817767165451041e-06
2476
+ },
2477
+ {
2478
+ "step": 276,
2479
+ "epoch": 1.8711864406779661,
2480
+ "cpu_mem": 1.510785024,
2481
+ "gpu_mem": 4.443360256,
2482
+ "loss": 0.3155,
2483
+ "grad_norm": 10.466375350952148,
2484
+ "learning_rate": 3.4279701043260886e-06
2485
+ },
2486
+ {
2487
+ "step": 277,
2488
+ "epoch": 1.8779661016949154,
2489
+ "cpu_mem": 1.510785024,
2490
+ "gpu_mem": 4.443306496,
2491
+ "loss": 0.3167,
2492
+ "grad_norm": 10.945162773132324,
2493
+ "learning_rate": 3.0589287663461472e-06
2494
+ },
2495
+ {
2496
+ "step": 278,
2497
+ "epoch": 1.8847457627118644,
2498
+ "cpu_mem": 1.510785024,
2499
+ "gpu_mem": 4.443469312,
2500
+ "loss": 0.4518,
2501
+ "grad_norm": 12.45777416229248,
2502
+ "learning_rate": 2.710695410593994e-06
2503
+ },
2504
+ {
2505
+ "step": 279,
2506
+ "epoch": 1.8915254237288135,
2507
+ "cpu_mem": 1.510785024,
2508
+ "gpu_mem": 4.443409408,
2509
+ "loss": 0.2909,
2510
+ "grad_norm": 8.404402732849121,
2511
+ "learning_rate": 2.3833193495825853e-06
2512
+ },
2513
+ {
2514
+ "step": 280,
2515
+ "epoch": 1.8983050847457628,
2516
+ "cpu_mem": 1.510785024,
2517
+ "gpu_mem": 4.44338944,
2518
+ "loss": 0.3295,
2519
+ "grad_norm": 10.143949508666992,
2520
+ "learning_rate": 2.076846942272026e-06
2521
+ },
2522
+ {
2523
+ "step": 281,
2524
+ "epoch": 1.905084745762712,
2525
+ "cpu_mem": 1.510785024,
2526
+ "gpu_mem": 4.443324928,
2527
+ "loss": 0.3426,
2528
+ "grad_norm": 10.10682201385498,
2529
+ "learning_rate": 1.791321587504768e-06
2530
+ },
2531
+ {
2532
+ "step": 282,
2533
+ "epoch": 1.911864406779661,
2534
+ "cpu_mem": 1.510785024,
2535
+ "gpu_mem": 4.443753472,
2536
+ "loss": 0.2746,
2537
+ "grad_norm": 8.048744201660156,
2538
+ "learning_rate": 1.5267837178600972e-06
2539
+ },
2540
+ {
2541
+ "step": 283,
2542
+ "epoch": 1.9186440677966101,
2543
+ "cpu_mem": 1.510785024,
2544
+ "gpu_mem": 4.443460096,
2545
+ "loss": 0.3007,
2546
+ "grad_norm": 10.570841789245605,
2547
+ "learning_rate": 1.2832707939284427e-06
2548
+ },
2549
+ {
2550
+ "step": 284,
2551
+ "epoch": 1.9254237288135592,
2552
+ "cpu_mem": 1.510785024,
2553
+ "gpu_mem": 4.443315712,
2554
+ "loss": 0.3486,
2555
+ "grad_norm": 8.057785987854004,
2556
+ "learning_rate": 1.0608172990067553e-06
2557
+ },
2558
+ {
2559
+ "step": 285,
2560
+ "epoch": 1.9322033898305084,
2561
+ "cpu_mem": 1.510785024,
2562
+ "gpu_mem": 4.443369472,
2563
+ "loss": 0.3287,
2564
+ "grad_norm": 9.795562744140625,
2565
+ "learning_rate": 8.594547342153979e-07
2566
+ },
2567
+ {
2568
+ "step": 286,
2569
+ "epoch": 1.9389830508474577,
2570
+ "cpu_mem": 1.510785024,
2571
+ "gpu_mem": 4.443787264,
2572
+ "loss": 0.2857,
2573
+ "grad_norm": 13.503580093383789,
2574
+ "learning_rate": 6.792116140373116e-07
2575
+ },
2576
+ {
2577
+ "step": 287,
2578
+ "epoch": 1.9457627118644067,
2579
+ "cpu_mem": 1.510785024,
2580
+ "gpu_mem": 4.443556864,
2581
+ "loss": 0.2571,
2582
+ "grad_norm": 9.61729621887207,
2583
+ "learning_rate": 5.201134622801473e-07
2584
+ },
2585
+ {
2586
+ "step": 288,
2587
+ "epoch": 1.9525423728813558,
2588
+ "cpu_mem": 1.510785024,
2589
+ "gpu_mem": 4.443341824,
2590
+ "loss": 0.4314,
2591
+ "grad_norm": 11.989745140075684,
2592
+ "learning_rate": 3.821828084619727e-07
2593
+ },
2594
+ {
2595
+ "step": 289,
2596
+ "epoch": 1.959322033898305,
2597
+ "cpu_mem": 1.510785024,
2598
+ "gpu_mem": 4.443426304,
2599
+ "loss": 0.3231,
2600
+ "grad_norm": 10.468039512634277,
2601
+ "learning_rate": 2.654391846207915e-07
2602
+ },
2603
+ {
2604
+ "step": 290,
2605
+ "epoch": 1.9661016949152543,
2606
+ "cpu_mem": 1.510785024,
2607
+ "gpu_mem": 4.44335104,
2608
+ "loss": 0.3028,
2609
+ "grad_norm": 13.236854553222656,
2610
+ "learning_rate": 1.6989912254880556e-07
2611
+ },
2612
+ {
2613
+ "step": 291,
2614
+ "epoch": 1.9728813559322034,
2615
+ "cpu_mem": 1.510785024,
2616
+ "gpu_mem": 4.443386368,
2617
+ "loss": 0.3675,
2618
+ "grad_norm": 10.517420768737793,
2619
+ "learning_rate": 9.557615145123765e-08
2620
+ },
2621
+ {
2622
+ "step": 292,
2623
+ "epoch": 1.9796610169491524,
2624
+ "cpu_mem": 1.510785024,
2625
+ "gpu_mem": 4.443469312,
2626
+ "loss": 0.3511,
2627
+ "grad_norm": 9.854143142700195,
2628
+ "learning_rate": 4.248079603064724e-08
2629
+ },
2630
+ {
2631
+ "step": 293,
2632
+ "epoch": 1.9864406779661017,
2633
+ "cpu_mem": 1.510785024,
2634
+ "gpu_mem": 4.443386368,
2635
+ "loss": 0.4175,
2636
+ "grad_norm": 13.601873397827148,
2637
+ "learning_rate": 1.0620574996372811e-08
2638
+ },
2639
+ {
2640
+ "step": 294,
2641
+ "epoch": 1.993220338983051,
2642
+ "cpu_mem": 1.510785024,
2643
+ "gpu_mem": 4.44341248,
2644
+ "loss": 0.3715,
2645
+ "grad_norm": 12.147957801818848,
2646
+ "learning_rate": 0.0
2647
+ },
2648
+ {
2649
+ "step": 294,
2650
+ "epoch": 1.993220338983051,
2651
+ "cpu_mem": 1.510785024,
2652
+ "gpu_mem": 4.44341248,
2653
+ "train_runtime": 4455.2502,
2654
+ "train_samples_per_second": 4.232,
2655
+ "train_steps_per_second": 0.066,
2656
+ "total_flos": 0.0,
2657
+ "train_loss": 0.610709656562124
2658
+ }
2659
+ ]
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 32,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "boolq",
3
+ "results": 0.7929663608562691
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "BOOLQ",
5
+ "dataset_id": "google/boolq",
6
+ "preprocess_id": "boolq_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 25389056
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 2,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-boolq-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T02:06:30.838829"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r32-a2/training_logs.json ADDED
@@ -0,0 +1,2659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.006779661016949152,
5
+ "cpu_mem": 1.503981568,
6
+ "gpu_mem": 4.519328768,
7
+ "loss": 8.869,
8
+ "grad_norm": 233.80860900878906,
9
+ "learning_rate": 9.999999999999999e-06
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.013559322033898305,
14
+ "cpu_mem": 1.510076416,
15
+ "gpu_mem": 4.722578944,
16
+ "loss": 8.9376,
17
+ "grad_norm": 239.66294860839844,
18
+ "learning_rate": 1.9999999999999998e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.020338983050847456,
23
+ "cpu_mem": 1.510862848,
24
+ "gpu_mem": 4.722497536,
25
+ "loss": 6.3632,
26
+ "grad_norm": 227.827392578125,
27
+ "learning_rate": 2.9999999999999997e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.02711864406779661,
32
+ "cpu_mem": 1.511452672,
33
+ "gpu_mem": 4.722497536,
34
+ "loss": 2.6529,
35
+ "grad_norm": 144.43919372558594,
36
+ "learning_rate": 3.9999999999999996e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.03389830508474576,
41
+ "cpu_mem": 1.512042496,
42
+ "gpu_mem": 4.722433024,
43
+ "loss": 1.212,
44
+ "grad_norm": 47.84184265136719,
45
+ "learning_rate": 4.9999999999999996e-05
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.04067796610169491,
50
+ "cpu_mem": 1.51263232,
51
+ "gpu_mem": 4.722452992,
52
+ "loss": 0.8617,
53
+ "grad_norm": 43.27740478515625,
54
+ "learning_rate": 5.9999999999999995e-05
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.04745762711864407,
59
+ "cpu_mem": 1.513222144,
60
+ "gpu_mem": 4.722505216,
61
+ "loss": 1.1695,
62
+ "grad_norm": 69.79517364501953,
63
+ "learning_rate": 7e-05
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.05423728813559322,
68
+ "cpu_mem": 1.51361536,
69
+ "gpu_mem": 4.722591232,
70
+ "loss": 0.6559,
71
+ "grad_norm": 18.012210845947266,
72
+ "learning_rate": 7.999999999999999e-05
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.061016949152542375,
77
+ "cpu_mem": 1.514008576,
78
+ "gpu_mem": 4.722499072,
79
+ "loss": 2.141,
80
+ "grad_norm": 79.17948913574219,
81
+ "learning_rate": 8.999999999999999e-05
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.06779661016949153,
86
+ "cpu_mem": 1.5145984,
87
+ "gpu_mem": 4.722399232,
88
+ "loss": 1.456,
89
+ "grad_norm": 58.11606979370117,
90
+ "learning_rate": 9.999999999999999e-05
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.07457627118644068,
95
+ "cpu_mem": 1.515188224,
96
+ "gpu_mem": 4.72250368,
97
+ "loss": 0.6457,
98
+ "grad_norm": 6.229933738708496,
99
+ "learning_rate": 0.00010999999999999998
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.08135593220338982,
104
+ "cpu_mem": 1.51558144,
105
+ "gpu_mem": 4.722875392,
106
+ "loss": 1.2234,
107
+ "grad_norm": 116.1498794555664,
108
+ "learning_rate": 0.00011999999999999999
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.08813559322033898,
113
+ "cpu_mem": 1.515974656,
114
+ "gpu_mem": 4.722479104,
115
+ "loss": 1.2587,
116
+ "grad_norm": 52.9954719543457,
117
+ "learning_rate": 0.00013
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.09491525423728814,
122
+ "cpu_mem": 1.516367872,
123
+ "gpu_mem": 4.722456064,
124
+ "loss": 0.686,
125
+ "grad_norm": 9.985751152038574,
126
+ "learning_rate": 0.00014
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.1016949152542373,
131
+ "cpu_mem": 1.516761088,
132
+ "gpu_mem": 4.722394624,
133
+ "loss": 1.1792,
134
+ "grad_norm": 29.390972137451172,
135
+ "learning_rate": 0.00015
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.10847457627118644,
140
+ "cpu_mem": 1.516957696,
141
+ "gpu_mem": 4.722479104,
142
+ "loss": 1.8027,
143
+ "grad_norm": 77.97930908203125,
144
+ "learning_rate": 0.00015999999999999999
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.1152542372881356,
149
+ "cpu_mem": 1.517350912,
150
+ "gpu_mem": 4.72251904,
151
+ "loss": 0.9339,
152
+ "grad_norm": 31.851720809936523,
153
+ "learning_rate": 0.00016999999999999999
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.12203389830508475,
158
+ "cpu_mem": 1.517744128,
159
+ "gpu_mem": 4.722582016,
160
+ "loss": 1.2629,
161
+ "grad_norm": 45.89571762084961,
162
+ "learning_rate": 0.00017999999999999998
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.1288135593220339,
167
+ "cpu_mem": 1.518137344,
168
+ "gpu_mem": 4.7224192,
169
+ "loss": 1.2291,
170
+ "grad_norm": 32.983150482177734,
171
+ "learning_rate": 0.00018999999999999998
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.13559322033898305,
176
+ "cpu_mem": 1.51853056,
177
+ "gpu_mem": 4.722531328,
178
+ "loss": 0.8234,
179
+ "grad_norm": 22.25924301147461,
180
+ "learning_rate": 0.00019999999999999998
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.1423728813559322,
185
+ "cpu_mem": 1.518727168,
186
+ "gpu_mem": 4.722689536,
187
+ "loss": 0.6795,
188
+ "grad_norm": 8.868040084838867,
189
+ "learning_rate": 0.00020999999999999998
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.14915254237288136,
194
+ "cpu_mem": 1.519120384,
195
+ "gpu_mem": 4.722582016,
196
+ "loss": 1.0734,
197
+ "grad_norm": 22.563232421875,
198
+ "learning_rate": 0.00021999999999999995
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.15593220338983052,
203
+ "cpu_mem": 1.519316992,
204
+ "gpu_mem": 4.722554368,
205
+ "loss": 0.8224,
206
+ "grad_norm": 13.91633415222168,
207
+ "learning_rate": 0.00023
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.16271186440677965,
212
+ "cpu_mem": 1.5195136,
213
+ "gpu_mem": 4.7226112,
214
+ "loss": 0.7573,
215
+ "grad_norm": 16.691375732421875,
216
+ "learning_rate": 0.00023999999999999998
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.1694915254237288,
221
+ "cpu_mem": 1.519906816,
222
+ "gpu_mem": 4.72239616,
223
+ "loss": 0.6975,
224
+ "grad_norm": 6.51262092590332,
225
+ "learning_rate": 0.00025
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.17627118644067796,
230
+ "cpu_mem": 1.520300032,
231
+ "gpu_mem": 4.722451456,
232
+ "loss": 0.882,
233
+ "grad_norm": 14.44697093963623,
234
+ "learning_rate": 0.00026
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.18305084745762712,
239
+ "cpu_mem": 1.52049664,
240
+ "gpu_mem": 4.722743296,
241
+ "loss": 0.6362,
242
+ "grad_norm": 3.7360706329345703,
243
+ "learning_rate": 0.00027
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.18983050847457628,
248
+ "cpu_mem": 1.520693248,
249
+ "gpu_mem": 4.722422272,
250
+ "loss": 0.7593,
251
+ "grad_norm": 8.032002449035645,
252
+ "learning_rate": 0.00028
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.19661016949152543,
257
+ "cpu_mem": 1.521086464,
258
+ "gpu_mem": 4.722486784,
259
+ "loss": 0.7451,
260
+ "grad_norm": 12.261842727661133,
261
+ "learning_rate": 0.00029
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.2033898305084746,
266
+ "cpu_mem": 1.521283072,
267
+ "gpu_mem": 4.72256512,
268
+ "loss": 0.7261,
269
+ "grad_norm": 7.222959518432617,
270
+ "learning_rate": 0.0003
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.21016949152542372,
275
+ "cpu_mem": 1.52147968,
276
+ "gpu_mem": 4.722368512,
277
+ "loss": 0.6136,
278
+ "grad_norm": 2.5524110794067383,
279
+ "learning_rate": 0.0002999893794250036
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.21694915254237288,
284
+ "cpu_mem": 1.521676288,
285
+ "gpu_mem": 4.722482176,
286
+ "loss": 0.7509,
287
+ "grad_norm": 9.033954620361328,
288
+ "learning_rate": 0.00029995751920396937
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.22372881355932203,
293
+ "cpu_mem": 1.521872896,
294
+ "gpu_mem": 4.722720256,
295
+ "loss": 0.7194,
296
+ "grad_norm": 8.025568008422852,
297
+ "learning_rate": 0.00029990442384854874
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.2305084745762712,
302
+ "cpu_mem": 1.522069504,
303
+ "gpu_mem": 4.722422272,
304
+ "loss": 0.5858,
305
+ "grad_norm": 2.7295961380004883,
306
+ "learning_rate": 0.0002998301008774512
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.23728813559322035,
311
+ "cpu_mem": 1.522266112,
312
+ "gpu_mem": 4.722632704,
313
+ "loss": 0.8056,
314
+ "grad_norm": 10.636892318725586,
315
+ "learning_rate": 0.0002997345608153792
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 0.2440677966101695,
320
+ "cpu_mem": 1.52246272,
321
+ "gpu_mem": 4.722583552,
322
+ "loss": 0.6826,
323
+ "grad_norm": 2.589643955230713,
324
+ "learning_rate": 0.000299617817191538
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 0.25084745762711863,
329
+ "cpu_mem": 1.522659328,
330
+ "gpu_mem": 4.722394624,
331
+ "loss": 0.6468,
332
+ "grad_norm": 4.602322101593018,
333
+ "learning_rate": 0.0002994798865377198
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 0.2576271186440678,
338
+ "cpu_mem": 1.522855936,
339
+ "gpu_mem": 4.72264192,
340
+ "loss": 0.8638,
341
+ "grad_norm": 11.864049911499023,
342
+ "learning_rate": 0.0002993207883859627
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 0.26440677966101694,
347
+ "cpu_mem": 1.523052544,
348
+ "gpu_mem": 4.723021312,
349
+ "loss": 0.7293,
350
+ "grad_norm": 7.218650817871094,
351
+ "learning_rate": 0.0002991405452657846
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 0.2711864406779661,
356
+ "cpu_mem": 1.523249152,
357
+ "gpu_mem": 4.722591232,
358
+ "loss": 0.6592,
359
+ "grad_norm": 2.3266420364379883,
360
+ "learning_rate": 0.00029893918270099324
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 0.27796610169491526,
365
+ "cpu_mem": 1.52344576,
366
+ "gpu_mem": 4.72281856,
367
+ "loss": 0.6739,
368
+ "grad_norm": 2.876038074493408,
369
+ "learning_rate": 0.00029871672920607153
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 0.2847457627118644,
374
+ "cpu_mem": 1.523642368,
375
+ "gpu_mem": 4.722715648,
376
+ "loss": 0.6569,
377
+ "grad_norm": 5.236875057220459,
378
+ "learning_rate": 0.0002984732162821399
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 0.29152542372881357,
383
+ "cpu_mem": 1.523838976,
384
+ "gpu_mem": 4.722537472,
385
+ "loss": 0.6302,
386
+ "grad_norm": 2.8164334297180176,
387
+ "learning_rate": 0.0002982086784124952
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 0.2983050847457627,
392
+ "cpu_mem": 1.524035584,
393
+ "gpu_mem": 4.72268032,
394
+ "loss": 0.6166,
395
+ "grad_norm": 4.41773796081543,
396
+ "learning_rate": 0.00029792315305772796
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 0.3050847457627119,
401
+ "cpu_mem": 1.524232192,
402
+ "gpu_mem": 4.722460672,
403
+ "loss": 0.8092,
404
+ "grad_norm": 9.20035457611084,
405
+ "learning_rate": 0.0002976166806504174
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 0.31186440677966104,
410
+ "cpu_mem": 1.524232192,
411
+ "gpu_mem": 4.72270336,
412
+ "loss": 0.8993,
413
+ "grad_norm": 15.634708404541016,
414
+ "learning_rate": 0.00029728930458940595
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 0.31864406779661014,
419
+ "cpu_mem": 1.5244288,
420
+ "gpu_mem": 4.72242688,
421
+ "loss": 0.8146,
422
+ "grad_norm": 14.326847076416016,
423
+ "learning_rate": 0.00029694107123365385
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 0.3254237288135593,
428
+ "cpu_mem": 1.524625408,
429
+ "gpu_mem": 4.72250368,
430
+ "loss": 0.6621,
431
+ "grad_norm": 7.587663650512695,
432
+ "learning_rate": 0.00029657202989567393
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 0.33220338983050846,
437
+ "cpu_mem": 1.524822016,
438
+ "gpu_mem": 4.722520576,
439
+ "loss": 0.9905,
440
+ "grad_norm": 13.618229866027832,
441
+ "learning_rate": 0.00029618223283454893
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 0.3389830508474576,
446
+ "cpu_mem": 1.524822016,
447
+ "gpu_mem": 4.722459136,
448
+ "loss": 0.8435,
449
+ "grad_norm": 9.786823272705078,
450
+ "learning_rate": 0.00029577173524853123
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 0.34576271186440677,
455
+ "cpu_mem": 1.524822016,
456
+ "gpu_mem": 4.722463744,
457
+ "loss": 0.6079,
458
+ "grad_norm": 3.4515862464904785,
459
+ "learning_rate": 0.0002953405952672261
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 0.3525423728813559,
464
+ "cpu_mem": 1.525018624,
465
+ "gpu_mem": 4.722543616,
466
+ "loss": 0.6645,
467
+ "grad_norm": 2.619760751724243,
468
+ "learning_rate": 0.0002948888739433602
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 0.3593220338983051,
473
+ "cpu_mem": 1.525018624,
474
+ "gpu_mem": 4.722566656,
475
+ "loss": 0.7463,
476
+ "grad_norm": 10.795845031738281,
477
+ "learning_rate": 0.0002944166352441363
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 0.36610169491525424,
482
+ "cpu_mem": 1.525215232,
483
+ "gpu_mem": 4.722494464,
484
+ "loss": 0.6756,
485
+ "grad_norm": 3.1487743854522705,
486
+ "learning_rate": 0.0002939239460421746
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 0.3728813559322034,
491
+ "cpu_mem": 1.52541184,
492
+ "gpu_mem": 4.7227648,
493
+ "loss": 0.6088,
494
+ "grad_norm": 2.5326249599456787,
495
+ "learning_rate": 0.00029341087610604337
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 0.37966101694915255,
500
+ "cpu_mem": 1.525608448,
501
+ "gpu_mem": 4.722551296,
502
+ "loss": 0.8536,
503
+ "grad_norm": 9.061946868896484,
504
+ "learning_rate": 0.00029287749809037904
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 0.3864406779661017,
509
+ "cpu_mem": 1.525608448,
510
+ "gpu_mem": 4.722545152,
511
+ "loss": 0.6668,
512
+ "grad_norm": 4.727614879608154,
513
+ "learning_rate": 0.0002923238875255979
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 0.39322033898305087,
518
+ "cpu_mem": 1.525805056,
519
+ "gpu_mem": 4.722440704,
520
+ "loss": 0.6361,
521
+ "grad_norm": 3.2904443740844727,
522
+ "learning_rate": 0.00029175012280720024
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 0.4,
527
+ "cpu_mem": 1.525805056,
528
+ "gpu_mem": 4.7224576,
529
+ "loss": 0.6821,
530
+ "grad_norm": 1.5682965517044067,
531
+ "learning_rate": 0.000291156285184669
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 0.4067796610169492,
536
+ "cpu_mem": 1.526001664,
537
+ "gpu_mem": 4.722551296,
538
+ "loss": 0.6106,
539
+ "grad_norm": 5.173000812530518,
540
+ "learning_rate": 0.00029054245874996426
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 0.4135593220338983,
545
+ "cpu_mem": 1.526001664,
546
+ "gpu_mem": 4.722562048,
547
+ "loss": 0.6116,
548
+ "grad_norm": 2.3430614471435547,
549
+ "learning_rate": 0.0002899087304256151
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 0.42033898305084744,
554
+ "cpu_mem": 1.526198272,
555
+ "gpu_mem": 4.72254976,
556
+ "loss": 0.7339,
557
+ "grad_norm": 6.764347553253174,
558
+ "learning_rate": 0.0002892551899524109
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 0.4271186440677966,
563
+ "cpu_mem": 1.526198272,
564
+ "gpu_mem": 4.72254208,
565
+ "loss": 0.5956,
566
+ "grad_norm": 6.654665946960449,
567
+ "learning_rate": 0.000288581929876693
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 0.43389830508474575,
572
+ "cpu_mem": 1.526198272,
573
+ "gpu_mem": 4.722471424,
574
+ "loss": 0.6023,
575
+ "grad_norm": 3.233610153198242,
576
+ "learning_rate": 0.0002878890455372498
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 0.4406779661016949,
581
+ "cpu_mem": 1.52639488,
582
+ "gpu_mem": 4.722515968,
583
+ "loss": 0.6724,
584
+ "grad_norm": 5.837782382965088,
585
+ "learning_rate": 0.0002871766350518159
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 0.44745762711864406,
590
+ "cpu_mem": 1.52639488,
591
+ "gpu_mem": 4.722709504,
592
+ "loss": 0.6014,
593
+ "grad_norm": 4.1243205070495605,
594
+ "learning_rate": 0.00028644479930317775
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 0.4542372881355932,
599
+ "cpu_mem": 1.52639488,
600
+ "gpu_mem": 4.7224192,
601
+ "loss": 0.6198,
602
+ "grad_norm": 4.467631816864014,
603
+ "learning_rate": 0.00028569364192488803
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 0.4610169491525424,
608
+ "cpu_mem": 1.526591488,
609
+ "gpu_mem": 4.722386944,
610
+ "loss": 0.6373,
611
+ "grad_norm": 3.114952802658081,
612
+ "learning_rate": 0.00028492326928659045
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 0.46779661016949153,
617
+ "cpu_mem": 1.526591488,
618
+ "gpu_mem": 4.722452992,
619
+ "loss": 0.6229,
620
+ "grad_norm": 4.952956199645996,
621
+ "learning_rate": 0.00028413379047895665
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 0.4745762711864407,
626
+ "cpu_mem": 1.526591488,
627
+ "gpu_mem": 4.722446848,
628
+ "loss": 0.6326,
629
+ "grad_norm": 3.901777744293213,
630
+ "learning_rate": 0.0002833253172982385
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 0.48135593220338985,
635
+ "cpu_mem": 1.526591488,
636
+ "gpu_mem": 4.722675712,
637
+ "loss": 0.525,
638
+ "grad_norm": 4.247412204742432,
639
+ "learning_rate": 0.0002824979642304366
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 0.488135593220339,
644
+ "cpu_mem": 1.526788096,
645
+ "gpu_mem": 4.722668032,
646
+ "loss": 0.5622,
647
+ "grad_norm": 5.4284772872924805,
648
+ "learning_rate": 0.0002816518484350883
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 0.49491525423728816,
653
+ "cpu_mem": 1.526984704,
654
+ "gpu_mem": 4.72263424,
655
+ "loss": 0.7902,
656
+ "grad_norm": 8.788385391235352,
657
+ "learning_rate": 0.0002807870897286772
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 0.5016949152542373,
662
+ "cpu_mem": 1.526984704,
663
+ "gpu_mem": 4.722494464,
664
+ "loss": 0.5742,
665
+ "grad_norm": 8.913131713867188,
666
+ "learning_rate": 0.0002799038105676658
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 0.5084745762711864,
671
+ "cpu_mem": 1.526984704,
672
+ "gpu_mem": 4.7224192,
673
+ "loss": 0.6146,
674
+ "grad_norm": 8.38949203491211,
675
+ "learning_rate": 0.000279002136031155
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 0.5152542372881356,
680
+ "cpu_mem": 1.526984704,
681
+ "gpu_mem": 4.722359296,
682
+ "loss": 0.5725,
683
+ "grad_norm": 4.2171711921691895,
684
+ "learning_rate": 0.00027808219380317216
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 0.5220338983050847,
689
+ "cpu_mem": 1.526984704,
690
+ "gpu_mem": 4.722433024,
691
+ "loss": 0.6514,
692
+ "grad_norm": 6.086472988128662,
693
+ "learning_rate": 0.0002771441141545895
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 0.5288135593220339,
698
+ "cpu_mem": 1.526984704,
699
+ "gpu_mem": 4.722485248,
700
+ "loss": 0.8012,
701
+ "grad_norm": 10.957486152648926,
702
+ "learning_rate": 0.0002761880299246772
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 0.535593220338983,
707
+ "cpu_mem": 1.527181312,
708
+ "gpu_mem": 4.722617344,
709
+ "loss": 0.5858,
710
+ "grad_norm": 3.74336576461792,
711
+ "learning_rate": 0.000275214076502292
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 0.5423728813559322,
716
+ "cpu_mem": 1.527181312,
717
+ "gpu_mem": 4.722508288,
718
+ "loss": 0.6445,
719
+ "grad_norm": 7.315963268280029,
720
+ "learning_rate": 0.0002742223918067056
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 0.5491525423728814,
725
+ "cpu_mem": 1.527181312,
726
+ "gpu_mem": 4.72238848,
727
+ "loss": 0.6349,
728
+ "grad_norm": 4.654501438140869,
729
+ "learning_rate": 0.00027321311626807374
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 0.5559322033898305,
734
+ "cpu_mem": 1.527181312,
735
+ "gpu_mem": 4.7224576,
736
+ "loss": 0.6408,
737
+ "grad_norm": 4.707687854766846,
738
+ "learning_rate": 0.0002721863928075503
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 0.5627118644067797,
743
+ "cpu_mem": 1.527181312,
744
+ "gpu_mem": 4.72255744,
745
+ "loss": 0.7144,
746
+ "grad_norm": 7.812034606933594,
747
+ "learning_rate": 0.000271142366817049
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 0.5694915254237288,
752
+ "cpu_mem": 1.527181312,
753
+ "gpu_mem": 4.722520576,
754
+ "loss": 0.5678,
755
+ "grad_norm": 3.4686105251312256,
756
+ "learning_rate": 0.00027008118613865406
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 0.576271186440678,
761
+ "cpu_mem": 1.527181312,
762
+ "gpu_mem": 4.722552832,
763
+ "loss": 0.5893,
764
+ "grad_norm": 5.409703254699707,
765
+ "learning_rate": 0.00026900300104368524
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 0.5830508474576271,
770
+ "cpu_mem": 1.52737792,
771
+ "gpu_mem": 4.72250368,
772
+ "loss": 0.6168,
773
+ "grad_norm": 7.087602138519287,
774
+ "learning_rate": 0.00026790796421141813
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 0.5898305084745763,
779
+ "cpu_mem": 1.52737792,
780
+ "gpu_mem": 4.72251136,
781
+ "loss": 0.5399,
782
+ "grad_norm": 5.04774284362793,
783
+ "learning_rate": 0.00026679623070746325
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 0.5966101694915255,
788
+ "cpu_mem": 1.52737792,
789
+ "gpu_mem": 4.722655744,
790
+ "loss": 0.512,
791
+ "grad_norm": 4.10666561126709,
792
+ "learning_rate": 0.0002656679579618081
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 0.6033898305084746,
797
+ "cpu_mem": 1.52737792,
798
+ "gpu_mem": 4.722437632,
799
+ "loss": 0.5958,
800
+ "grad_norm": 5.078695774078369,
801
+ "learning_rate": 0.0002645233057465235
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 0.6101694915254238,
806
+ "cpu_mem": 1.527574528,
807
+ "gpu_mem": 4.722491392,
808
+ "loss": 0.5342,
809
+ "grad_norm": 4.536576747894287,
810
+ "learning_rate": 0.00026336243615313873
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 0.6169491525423729,
815
+ "cpu_mem": 1.527574528,
816
+ "gpu_mem": 4.722459136,
817
+ "loss": 0.4877,
818
+ "grad_norm": 4.9868693351745605,
819
+ "learning_rate": 0.00026218551356968814
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 0.6237288135593221,
824
+ "cpu_mem": 1.527574528,
825
+ "gpu_mem": 4.722540544,
826
+ "loss": 0.7501,
827
+ "grad_norm": 11.705029487609863,
828
+ "learning_rate": 0.00026099270465743254
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 0.6305084745762712,
833
+ "cpu_mem": 1.527574528,
834
+ "gpu_mem": 4.722343936,
835
+ "loss": 0.7142,
836
+ "grad_norm": 9.641088485717773,
837
+ "learning_rate": 0.0002597841783272588
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 0.6372881355932203,
842
+ "cpu_mem": 1.527574528,
843
+ "gpu_mem": 4.7224576,
844
+ "loss": 0.5724,
845
+ "grad_norm": 6.110875129699707,
846
+ "learning_rate": 0.0002585601057157605
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 0.6440677966101694,
851
+ "cpu_mem": 1.527574528,
852
+ "gpu_mem": 4.722477568,
853
+ "loss": 0.6165,
854
+ "grad_norm": 4.48118257522583,
855
+ "learning_rate": 0.00025732066016100394
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 0.6508474576271186,
860
+ "cpu_mem": 1.527574528,
861
+ "gpu_mem": 4.722515968,
862
+ "loss": 0.5351,
863
+ "grad_norm": 2.5242867469787598,
864
+ "learning_rate": 0.00025606601717798207
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 0.6576271186440678,
869
+ "cpu_mem": 1.527574528,
870
+ "gpu_mem": 4.722500608,
871
+ "loss": 0.7382,
872
+ "grad_norm": 7.024951457977295,
873
+ "learning_rate": 0.0002547963544337602
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 0.6644067796610169,
878
+ "cpu_mem": 1.527574528,
879
+ "gpu_mem": 4.722413056,
880
+ "loss": 0.6812,
881
+ "grad_norm": 5.02927827835083,
882
+ "learning_rate": 0.0002535118517223168
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 0.6711864406779661,
887
+ "cpu_mem": 1.527574528,
888
+ "gpu_mem": 4.722362368,
889
+ "loss": 0.5979,
890
+ "grad_norm": 5.04942512512207,
891
+ "learning_rate": 0.00025221269093908365
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 0.6779661016949152,
896
+ "cpu_mem": 1.527574528,
897
+ "gpu_mem": 4.722479104,
898
+ "loss": 0.6131,
899
+ "grad_norm": 6.651904106140137,
900
+ "learning_rate": 0.0002508990560551879
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 0.6847457627118644,
905
+ "cpu_mem": 1.527574528,
906
+ "gpu_mem": 4.72251136,
907
+ "loss": 0.6984,
908
+ "grad_norm": 3.8917150497436523,
909
+ "learning_rate": 0.0002495711330914001
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 0.6915254237288135,
914
+ "cpu_mem": 1.527574528,
915
+ "gpu_mem": 4.722545152,
916
+ "loss": 0.5914,
917
+ "grad_norm": 2.322913885116577,
918
+ "learning_rate": 0.00024822911009179276
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 0.6983050847457627,
923
+ "cpu_mem": 1.527574528,
924
+ "gpu_mem": 4.72259584,
925
+ "loss": 0.6322,
926
+ "grad_norm": 3.5212390422821045,
927
+ "learning_rate": 0.0002468731770971113
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 0.7050847457627119,
932
+ "cpu_mem": 1.527574528,
933
+ "gpu_mem": 4.722500608,
934
+ "loss": 0.5647,
935
+ "grad_norm": 3.077224016189575,
936
+ "learning_rate": 0.0002455035261178632
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 0.711864406779661,
941
+ "cpu_mem": 1.527574528,
942
+ "gpu_mem": 4.722601984,
943
+ "loss": 0.5667,
944
+ "grad_norm": 3.436150550842285,
945
+ "learning_rate": 0.0002441203511071278
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 0.7186440677966102,
950
+ "cpu_mem": 1.527574528,
951
+ "gpu_mem": 4.722552832,
952
+ "loss": 0.5603,
953
+ "grad_norm": 5.609046936035156,
954
+ "learning_rate": 0.00024272384793309077
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 0.7254237288135593,
959
+ "cpu_mem": 1.527574528,
960
+ "gpu_mem": 4.722440704,
961
+ "loss": 0.4929,
962
+ "grad_norm": 4.49297571182251,
963
+ "learning_rate": 0.00024131421435130807
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 0.7322033898305085,
968
+ "cpu_mem": 1.527574528,
969
+ "gpu_mem": 4.722625024,
970
+ "loss": 0.5647,
971
+ "grad_norm": 5.14377498626709,
972
+ "learning_rate": 0.00023989164997670202
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 0.7389830508474576,
977
+ "cpu_mem": 1.527574528,
978
+ "gpu_mem": 4.722479104,
979
+ "loss": 0.591,
980
+ "grad_norm": 4.947544574737549,
981
+ "learning_rate": 0.0002384563562552943
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 0.7457627118644068,
986
+ "cpu_mem": 1.527574528,
987
+ "gpu_mem": 4.722482176,
988
+ "loss": 0.5745,
989
+ "grad_norm": 4.043002605438232,
990
+ "learning_rate": 0.0002370085364356797
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 0.752542372881356,
995
+ "cpu_mem": 1.527574528,
996
+ "gpu_mem": 4.722451456,
997
+ "loss": 0.5688,
998
+ "grad_norm": 6.002414703369141,
999
+ "learning_rate": 0.0002355483955402446
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 0.7593220338983051,
1004
+ "cpu_mem": 1.527771136,
1005
+ "gpu_mem": 4.722497536,
1006
+ "loss": 0.6035,
1007
+ "grad_norm": 5.334151268005371,
1008
+ "learning_rate": 0.00023407614033613407
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 0.7661016949152543,
1013
+ "cpu_mem": 1.527771136,
1014
+ "gpu_mem": 4.72248832,
1015
+ "loss": 0.5538,
1016
+ "grad_norm": 4.7518310546875,
1017
+ "learning_rate": 0.0002325919793059723
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 0.7728813559322034,
1022
+ "cpu_mem": 1.527771136,
1023
+ "gpu_mem": 4.722469888,
1024
+ "loss": 0.4641,
1025
+ "grad_norm": 4.555304527282715,
1026
+ "learning_rate": 0.00023109612261833963
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 0.7796610169491526,
1031
+ "cpu_mem": 1.527967744,
1032
+ "gpu_mem": 4.722545152,
1033
+ "loss": 0.6187,
1034
+ "grad_norm": 5.561794281005859,
1035
+ "learning_rate": 0.0002295887820980112
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 0.7864406779661017,
1040
+ "cpu_mem": 1.527967744,
1041
+ "gpu_mem": 4.72246528,
1042
+ "loss": 0.5549,
1043
+ "grad_norm": 4.324403285980225,
1044
+ "learning_rate": 0.0002280701711959608
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 0.7932203389830509,
1049
+ "cpu_mem": 1.527967744,
1050
+ "gpu_mem": 4.722356224,
1051
+ "loss": 0.4801,
1052
+ "grad_norm": 2.37454891204834,
1053
+ "learning_rate": 0.00022654050495913495
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 0.8,
1058
+ "cpu_mem": 1.527967744,
1059
+ "gpu_mem": 4.722594304,
1060
+ "loss": 0.6734,
1061
+ "grad_norm": 4.633813858032227,
1062
+ "learning_rate": 0.000225
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 0.8067796610169492,
1067
+ "cpu_mem": 1.527967744,
1068
+ "gpu_mem": 4.7227648,
1069
+ "loss": 0.4906,
1070
+ "grad_norm": 4.976076126098633,
1071
+ "learning_rate": 0.00022344887446586865
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 0.8135593220338984,
1076
+ "cpu_mem": 1.527967744,
1077
+ "gpu_mem": 4.722497536,
1078
+ "loss": 0.4778,
1079
+ "grad_norm": 4.014054775238037,
1080
+ "learning_rate": 0.00022188734800800852
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 0.8203389830508474,
1085
+ "cpu_mem": 1.527967744,
1086
+ "gpu_mem": 4.722525184,
1087
+ "loss": 0.5773,
1088
+ "grad_norm": 4.499019622802734,
1089
+ "learning_rate": 0.00022031564175053754
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 0.8271186440677966,
1094
+ "cpu_mem": 1.527967744,
1095
+ "gpu_mem": 4.722575872,
1096
+ "loss": 0.4382,
1097
+ "grad_norm": 4.889860153198242,
1098
+ "learning_rate": 0.00021873397825911153
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 0.8338983050847457,
1103
+ "cpu_mem": 1.527967744,
1104
+ "gpu_mem": 4.722385408,
1105
+ "loss": 0.5738,
1106
+ "grad_norm": 6.226327419281006,
1107
+ "learning_rate": 0.00021714258150940685
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 0.8406779661016949,
1112
+ "cpu_mem": 1.527967744,
1113
+ "gpu_mem": 4.722827776,
1114
+ "loss": 0.5588,
1115
+ "grad_norm": 5.272500514984131,
1116
+ "learning_rate": 0.0002155416768554039
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 0.847457627118644,
1121
+ "cpu_mem": 1.527967744,
1122
+ "gpu_mem": 4.722554368,
1123
+ "loss": 0.6352,
1124
+ "grad_norm": 8.107681274414062,
1125
+ "learning_rate": 0.00021393149099747523
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 0.8542372881355932,
1130
+ "cpu_mem": 1.527967744,
1131
+ "gpu_mem": 4.722437632,
1132
+ "loss": 0.5789,
1133
+ "grad_norm": 6.859676837921143,
1134
+ "learning_rate": 0.00021231225195028297
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 0.8610169491525423,
1139
+ "cpu_mem": 1.530130432,
1140
+ "gpu_mem": 4.722876928,
1141
+ "loss": 0.5435,
1142
+ "grad_norm": 4.632746696472168,
1143
+ "learning_rate": 0.00021068418901049025
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 0.8677966101694915,
1148
+ "cpu_mem": 1.530130432,
1149
+ "gpu_mem": 4.722652672,
1150
+ "loss": 0.4812,
1151
+ "grad_norm": 3.7110488414764404,
1152
+ "learning_rate": 0.0002090475327242912
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 0.8745762711864407,
1157
+ "cpu_mem": 1.530130432,
1158
+ "gpu_mem": 4.722692608,
1159
+ "loss": 0.5938,
1160
+ "grad_norm": 6.665708541870117,
1161
+ "learning_rate": 0.00020740251485476345
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 0.8813559322033898,
1166
+ "cpu_mem": 1.530130432,
1167
+ "gpu_mem": 4.722474496,
1168
+ "loss": 0.5986,
1169
+ "grad_norm": 3.7718665599823,
1170
+ "learning_rate": 0.0002057493683490491
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 0.888135593220339,
1175
+ "cpu_mem": 1.530130432,
1176
+ "gpu_mem": 4.72260352,
1177
+ "loss": 0.5023,
1178
+ "grad_norm": 3.4843013286590576,
1179
+ "learning_rate": 0.00020408832730536746
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 0.8949152542372881,
1184
+ "cpu_mem": 1.530130432,
1185
+ "gpu_mem": 4.722684928,
1186
+ "loss": 0.5568,
1187
+ "grad_norm": 6.419014930725098,
1188
+ "learning_rate": 0.00020241962693986476
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 0.9016949152542373,
1193
+ "cpu_mem": 1.530130432,
1194
+ "gpu_mem": 4.722468352,
1195
+ "loss": 0.4996,
1196
+ "grad_norm": 4.196140766143799,
1197
+ "learning_rate": 0.0002007435035533061
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 0.9084745762711864,
1202
+ "cpu_mem": 1.530130432,
1203
+ "gpu_mem": 4.722601984,
1204
+ "loss": 0.4804,
1205
+ "grad_norm": 3.8392629623413086,
1206
+ "learning_rate": 0.00019906019449761325
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 0.9152542372881356,
1211
+ "cpu_mem": 1.530130432,
1212
+ "gpu_mem": 4.722625024,
1213
+ "loss": 0.664,
1214
+ "grad_norm": 9.533957481384277,
1215
+ "learning_rate": 0.00019736993814225374
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 0.9220338983050848,
1220
+ "cpu_mem": 1.530130432,
1221
+ "gpu_mem": 4.722462208,
1222
+ "loss": 0.526,
1223
+ "grad_norm": 6.26369047164917,
1224
+ "learning_rate": 0.00019567297384048604
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 0.9288135593220339,
1229
+ "cpu_mem": 1.530130432,
1230
+ "gpu_mem": 4.7223424,
1231
+ "loss": 0.5442,
1232
+ "grad_norm": 5.538697719573975,
1233
+ "learning_rate": 0.0001939695418954653
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 0.9355932203389831,
1238
+ "cpu_mem": 1.530130432,
1239
+ "gpu_mem": 4.722523648,
1240
+ "loss": 0.4846,
1241
+ "grad_norm": 3.0243053436279297,
1242
+ "learning_rate": 0.00019225988352621445
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 0.9423728813559322,
1247
+ "cpu_mem": 1.530130432,
1248
+ "gpu_mem": 4.722422272,
1249
+ "loss": 0.5617,
1250
+ "grad_norm": 5.228503704071045,
1251
+ "learning_rate": 0.00019054424083346592
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 0.9491525423728814,
1256
+ "cpu_mem": 1.530130432,
1257
+ "gpu_mem": 4.722474496,
1258
+ "loss": 0.5831,
1259
+ "grad_norm": 6.361831188201904,
1260
+ "learning_rate": 0.0001888228567653781
1261
+ },
1262
+ {
1263
+ "step": 141,
1264
+ "epoch": 0.9559322033898305,
1265
+ "cpu_mem": 1.530130432,
1266
+ "gpu_mem": 4.722506752,
1267
+ "loss": 0.5879,
1268
+ "grad_norm": 5.449702262878418,
1269
+ "learning_rate": 0.0001870959750831323
1270
+ },
1271
+ {
1272
+ "step": 142,
1273
+ "epoch": 0.9627118644067797,
1274
+ "cpu_mem": 1.530130432,
1275
+ "gpu_mem": 4.722646528,
1276
+ "loss": 0.4928,
1277
+ "grad_norm": 4.112429141998291,
1278
+ "learning_rate": 0.0001853638403264141
1279
+ },
1280
+ {
1281
+ "step": 143,
1282
+ "epoch": 0.9694915254237289,
1283
+ "cpu_mem": 1.530130432,
1284
+ "gpu_mem": 4.722629632,
1285
+ "loss": 0.6331,
1286
+ "grad_norm": 4.983449459075928,
1287
+ "learning_rate": 0.00018362669777878453
1288
+ },
1289
+ {
1290
+ "step": 144,
1291
+ "epoch": 0.976271186440678,
1292
+ "cpu_mem": 1.530130432,
1293
+ "gpu_mem": 4.722821632,
1294
+ "loss": 0.6526,
1295
+ "grad_norm": 6.455733299255371,
1296
+ "learning_rate": 0.00018188479343294648
1297
+ },
1298
+ {
1299
+ "step": 145,
1300
+ "epoch": 0.9830508474576272,
1301
+ "cpu_mem": 1.530130432,
1302
+ "gpu_mem": 4.722532864,
1303
+ "loss": 0.505,
1304
+ "grad_norm": 3.729292392730713,
1305
+ "learning_rate": 0.0001801383739559098
1306
+ },
1307
+ {
1308
+ "step": 146,
1309
+ "epoch": 0.9898305084745763,
1310
+ "cpu_mem": 1.530130432,
1311
+ "gpu_mem": 4.722568192,
1312
+ "loss": 0.5684,
1313
+ "grad_norm": 3.6822805404663086,
1314
+ "learning_rate": 0.0001783876866540615
1315
+ },
1316
+ {
1317
+ "step": 147,
1318
+ "epoch": 0.9966101694915255,
1319
+ "cpu_mem": 1.530130432,
1320
+ "gpu_mem": 4.722466816,
1321
+ "loss": 0.5114,
1322
+ "grad_norm": 3.8303308486938477,
1323
+ "learning_rate": 0.00017663297943814552
1324
+ },
1325
+ {
1326
+ "step": 148,
1327
+ "epoch": 1.0033898305084745,
1328
+ "cpu_mem": 1.530130432,
1329
+ "gpu_mem": 4.824187392,
1330
+ "loss": 0.7086,
1331
+ "grad_norm": 4.2697529792785645,
1332
+ "learning_rate": 0.0001748745007881561
1333
+ },
1334
+ {
1335
+ "step": 149,
1336
+ "epoch": 1.0101694915254238,
1337
+ "cpu_mem": 1.530130432,
1338
+ "gpu_mem": 4.82412288,
1339
+ "loss": 0.4824,
1340
+ "grad_norm": 3.1086933612823486,
1341
+ "learning_rate": 0.00017311249971815185
1342
+ },
1343
+ {
1344
+ "step": 150,
1345
+ "epoch": 1.0169491525423728,
1346
+ "cpu_mem": 1.530130432,
1347
+ "gpu_mem": 4.823960064,
1348
+ "loss": 0.4912,
1349
+ "grad_norm": 3.0585293769836426,
1350
+ "learning_rate": 0.00017134722574099276
1351
+ },
1352
+ {
1353
+ "step": 151,
1354
+ "epoch": 1.023728813559322,
1355
+ "cpu_mem": 1.530130432,
1356
+ "gpu_mem": 4.824032256,
1357
+ "loss": 0.5404,
1358
+ "grad_norm": 4.751421928405762,
1359
+ "learning_rate": 0.00016957892883300775
1360
+ },
1361
+ {
1362
+ "step": 152,
1363
+ "epoch": 1.0305084745762711,
1364
+ "cpu_mem": 1.530130432,
1365
+ "gpu_mem": 4.824067584,
1366
+ "loss": 0.4155,
1367
+ "grad_norm": 3.7576749324798584,
1368
+ "learning_rate": 0.00016780785939859576
1369
+ },
1370
+ {
1371
+ "step": 153,
1372
+ "epoch": 1.0372881355932204,
1373
+ "cpu_mem": 1.530130432,
1374
+ "gpu_mem": 4.82409216,
1375
+ "loss": 0.5912,
1376
+ "grad_norm": 4.714956283569336,
1377
+ "learning_rate": 0.00016603426823476693
1378
+ },
1379
+ {
1380
+ "step": 154,
1381
+ "epoch": 1.0440677966101695,
1382
+ "cpu_mem": 1.530130432,
1383
+ "gpu_mem": 4.82405376,
1384
+ "loss": 0.4068,
1385
+ "grad_norm": 6.185878753662109,
1386
+ "learning_rate": 0.00016425840649562736
1387
+ },
1388
+ {
1389
+ "step": 155,
1390
+ "epoch": 1.0508474576271187,
1391
+ "cpu_mem": 1.530130432,
1392
+ "gpu_mem": 4.824274944,
1393
+ "loss": 0.5478,
1394
+ "grad_norm": 5.617146015167236,
1395
+ "learning_rate": 0.00016248052565681436
1396
+ },
1397
+ {
1398
+ "step": 156,
1399
+ "epoch": 1.0576271186440678,
1400
+ "cpu_mem": 1.530130432,
1401
+ "gpu_mem": 4.824182784,
1402
+ "loss": 0.3949,
1403
+ "grad_norm": 3.8420050144195557,
1404
+ "learning_rate": 0.00016070087747988482
1405
+ },
1406
+ {
1407
+ "step": 157,
1408
+ "epoch": 1.064406779661017,
1409
+ "cpu_mem": 1.530130432,
1410
+ "gpu_mem": 4.824089088,
1411
+ "loss": 0.3574,
1412
+ "grad_norm": 3.955946683883667,
1413
+ "learning_rate": 0.00015891971397666464
1414
+ },
1415
+ {
1416
+ "step": 158,
1417
+ "epoch": 1.071186440677966,
1418
+ "cpu_mem": 1.530130432,
1419
+ "gpu_mem": 4.82401536,
1420
+ "loss": 0.478,
1421
+ "grad_norm": 5.138974189758301,
1422
+ "learning_rate": 0.00015713728737356137
1423
+ },
1424
+ {
1425
+ "step": 159,
1426
+ "epoch": 1.0779661016949154,
1427
+ "cpu_mem": 1.530130432,
1428
+ "gpu_mem": 4.824364032,
1429
+ "loss": 0.3684,
1430
+ "grad_norm": 5.789059638977051,
1431
+ "learning_rate": 0.00015535385007584706
1432
+ },
1433
+ {
1434
+ "step": 160,
1435
+ "epoch": 1.0847457627118644,
1436
+ "cpu_mem": 1.530130432,
1437
+ "gpu_mem": 4.823958528,
1438
+ "loss": 0.417,
1439
+ "grad_norm": 4.242332935333252,
1440
+ "learning_rate": 0.0001535696546319161
1441
+ },
1442
+ {
1443
+ "step": 161,
1444
+ "epoch": 1.0915254237288137,
1445
+ "cpu_mem": 1.530130432,
1446
+ "gpu_mem": 4.823904768,
1447
+ "loss": 0.4055,
1448
+ "grad_norm": 5.762329578399658,
1449
+ "learning_rate": 0.00015178495369752213
1450
+ },
1451
+ {
1452
+ "step": 162,
1453
+ "epoch": 1.0983050847457627,
1454
+ "cpu_mem": 1.530130432,
1455
+ "gpu_mem": 4.824680448,
1456
+ "loss": 0.3548,
1457
+ "grad_norm": 4.2064528465271,
1458
+ "learning_rate": 0.00015
1459
+ },
1460
+ {
1461
+ "step": 163,
1462
+ "epoch": 1.1050847457627118,
1463
+ "cpu_mem": 1.530130432,
1464
+ "gpu_mem": 4.824156672,
1465
+ "loss": 0.4353,
1466
+ "grad_norm": 7.034615993499756,
1467
+ "learning_rate": 0.00014821504630247785
1468
+ },
1469
+ {
1470
+ "step": 164,
1471
+ "epoch": 1.111864406779661,
1472
+ "cpu_mem": 1.530130432,
1473
+ "gpu_mem": 4.82406912,
1474
+ "loss": 0.4521,
1475
+ "grad_norm": 5.314269065856934,
1476
+ "learning_rate": 0.00014643034536808387
1477
+ },
1478
+ {
1479
+ "step": 165,
1480
+ "epoch": 1.11864406779661,
1481
+ "cpu_mem": 1.530130432,
1482
+ "gpu_mem": 4.824018432,
1483
+ "loss": 0.4232,
1484
+ "grad_norm": 4.84352970123291,
1485
+ "learning_rate": 0.00014464614992415294
1486
+ },
1487
+ {
1488
+ "step": 166,
1489
+ "epoch": 1.1254237288135593,
1490
+ "cpu_mem": 1.530130432,
1491
+ "gpu_mem": 4.824113664,
1492
+ "loss": 0.2749,
1493
+ "grad_norm": 6.184368133544922,
1494
+ "learning_rate": 0.00014286271262643866
1495
+ },
1496
+ {
1497
+ "step": 167,
1498
+ "epoch": 1.1322033898305084,
1499
+ "cpu_mem": 1.530130432,
1500
+ "gpu_mem": 4.82403072,
1501
+ "loss": 0.5079,
1502
+ "grad_norm": 6.006751537322998,
1503
+ "learning_rate": 0.00014108028602333536
1504
+ },
1505
+ {
1506
+ "step": 168,
1507
+ "epoch": 1.1389830508474577,
1508
+ "cpu_mem": 1.530130432,
1509
+ "gpu_mem": 4.824049152,
1510
+ "loss": 0.4548,
1511
+ "grad_norm": 6.466878414154053,
1512
+ "learning_rate": 0.00013929912252011516
1513
+ },
1514
+ {
1515
+ "step": 169,
1516
+ "epoch": 1.1457627118644067,
1517
+ "cpu_mem": 1.530130432,
1518
+ "gpu_mem": 4.824136704,
1519
+ "loss": 0.4594,
1520
+ "grad_norm": 6.34998083114624,
1521
+ "learning_rate": 0.00013751947434318564
1522
+ },
1523
+ {
1524
+ "step": 170,
1525
+ "epoch": 1.152542372881356,
1526
+ "cpu_mem": 1.530130432,
1527
+ "gpu_mem": 4.824021504,
1528
+ "loss": 0.5691,
1529
+ "grad_norm": 7.32921028137207,
1530
+ "learning_rate": 0.00013574159350437261
1531
+ },
1532
+ {
1533
+ "step": 171,
1534
+ "epoch": 1.159322033898305,
1535
+ "cpu_mem": 1.530130432,
1536
+ "gpu_mem": 4.82408448,
1537
+ "loss": 0.5156,
1538
+ "grad_norm": 7.228818416595459,
1539
+ "learning_rate": 0.0001339657317652331
1540
+ },
1541
+ {
1542
+ "step": 172,
1543
+ "epoch": 1.1661016949152543,
1544
+ "cpu_mem": 1.530130432,
1545
+ "gpu_mem": 4.82399232,
1546
+ "loss": 0.3484,
1547
+ "grad_norm": 5.381630897521973,
1548
+ "learning_rate": 0.00013219214060140424
1549
+ },
1550
+ {
1551
+ "step": 173,
1552
+ "epoch": 1.1728813559322033,
1553
+ "cpu_mem": 1.530130432,
1554
+ "gpu_mem": 4.82429184,
1555
+ "loss": 0.4047,
1556
+ "grad_norm": 7.392773151397705,
1557
+ "learning_rate": 0.00013042107116699228
1558
+ },
1559
+ {
1560
+ "step": 174,
1561
+ "epoch": 1.1796610169491526,
1562
+ "cpu_mem": 1.530130432,
1563
+ "gpu_mem": 4.82401536,
1564
+ "loss": 0.5447,
1565
+ "grad_norm": 7.2776360511779785,
1566
+ "learning_rate": 0.00012865277425900724
1567
+ },
1568
+ {
1569
+ "step": 175,
1570
+ "epoch": 1.1864406779661016,
1571
+ "cpu_mem": 1.530130432,
1572
+ "gpu_mem": 4.823981568,
1573
+ "loss": 0.422,
1574
+ "grad_norm": 5.06213903427124,
1575
+ "learning_rate": 0.00012688750028184818
1576
+ },
1577
+ {
1578
+ "step": 176,
1579
+ "epoch": 1.193220338983051,
1580
+ "cpu_mem": 1.530130432,
1581
+ "gpu_mem": 4.824119808,
1582
+ "loss": 0.4156,
1583
+ "grad_norm": 7.205942630767822,
1584
+ "learning_rate": 0.0001251254992118439
1585
+ },
1586
+ {
1587
+ "step": 177,
1588
+ "epoch": 1.2,
1589
+ "cpu_mem": 1.530130432,
1590
+ "gpu_mem": 4.824218112,
1591
+ "loss": 0.4374,
1592
+ "grad_norm": 4.606486797332764,
1593
+ "learning_rate": 0.00012336702056185453
1594
+ },
1595
+ {
1596
+ "step": 178,
1597
+ "epoch": 1.2067796610169492,
1598
+ "cpu_mem": 1.530130432,
1599
+ "gpu_mem": 4.823964672,
1600
+ "loss": 0.4501,
1601
+ "grad_norm": 5.773846626281738,
1602
+ "learning_rate": 0.00012161231334593851
1603
+ },
1604
+ {
1605
+ "step": 179,
1606
+ "epoch": 1.2135593220338983,
1607
+ "cpu_mem": 1.530130432,
1608
+ "gpu_mem": 4.824064512,
1609
+ "loss": 0.4543,
1610
+ "grad_norm": 5.905703544616699,
1611
+ "learning_rate": 0.00011986162604409015
1612
+ },
1613
+ {
1614
+ "step": 180,
1615
+ "epoch": 1.2203389830508475,
1616
+ "cpu_mem": 1.530130432,
1617
+ "gpu_mem": 4.824036864,
1618
+ "loss": 0.4158,
1619
+ "grad_norm": 5.3787360191345215,
1620
+ "learning_rate": 0.00011811520656705348
1621
+ },
1622
+ {
1623
+ "step": 181,
1624
+ "epoch": 1.2271186440677966,
1625
+ "cpu_mem": 1.530130432,
1626
+ "gpu_mem": 4.823973888,
1627
+ "loss": 0.3065,
1628
+ "grad_norm": 4.495090007781982,
1629
+ "learning_rate": 0.00011637330222121543
1630
+ },
1631
+ {
1632
+ "step": 182,
1633
+ "epoch": 1.2338983050847459,
1634
+ "cpu_mem": 1.530130432,
1635
+ "gpu_mem": 4.824192,
1636
+ "loss": 0.5224,
1637
+ "grad_norm": 7.384599685668945,
1638
+ "learning_rate": 0.00011463615967358588
1639
+ },
1640
+ {
1641
+ "step": 183,
1642
+ "epoch": 1.240677966101695,
1643
+ "cpu_mem": 1.530130432,
1644
+ "gpu_mem": 4.824089088,
1645
+ "loss": 0.3435,
1646
+ "grad_norm": 5.406001091003418,
1647
+ "learning_rate": 0.00011290402491686766
1648
+ },
1649
+ {
1650
+ "step": 184,
1651
+ "epoch": 1.2474576271186442,
1652
+ "cpu_mem": 1.530130432,
1653
+ "gpu_mem": 4.824036864,
1654
+ "loss": 0.396,
1655
+ "grad_norm": 5.245143890380859,
1656
+ "learning_rate": 0.00011117714323462186
1657
+ },
1658
+ {
1659
+ "step": 185,
1660
+ "epoch": 1.2542372881355932,
1661
+ "cpu_mem": 1.530130432,
1662
+ "gpu_mem": 4.82401536,
1663
+ "loss": 0.5028,
1664
+ "grad_norm": 8.920019149780273,
1665
+ "learning_rate": 0.00010945575916653407
1666
+ },
1667
+ {
1668
+ "step": 186,
1669
+ "epoch": 1.2610169491525425,
1670
+ "cpu_mem": 1.530130432,
1671
+ "gpu_mem": 4.824024576,
1672
+ "loss": 0.315,
1673
+ "grad_norm": 6.18515682220459,
1674
+ "learning_rate": 0.00010774011647378553
1675
+ },
1676
+ {
1677
+ "step": 187,
1678
+ "epoch": 1.2677966101694915,
1679
+ "cpu_mem": 1.530130432,
1680
+ "gpu_mem": 4.823956992,
1681
+ "loss": 0.4535,
1682
+ "grad_norm": 10.364043235778809,
1683
+ "learning_rate": 0.00010603045810453468
1684
+ },
1685
+ {
1686
+ "step": 188,
1687
+ "epoch": 1.2745762711864406,
1688
+ "cpu_mem": 1.530130432,
1689
+ "gpu_mem": 4.824119808,
1690
+ "loss": 0.2713,
1691
+ "grad_norm": 4.690507888793945,
1692
+ "learning_rate": 0.00010432702615951396
1693
+ },
1694
+ {
1695
+ "step": 189,
1696
+ "epoch": 1.2813559322033898,
1697
+ "cpu_mem": 1.530130432,
1698
+ "gpu_mem": 4.823989248,
1699
+ "loss": 0.4559,
1700
+ "grad_norm": 5.3982133865356445,
1701
+ "learning_rate": 0.00010263006185774627
1702
+ },
1703
+ {
1704
+ "step": 190,
1705
+ "epoch": 1.288135593220339,
1706
+ "cpu_mem": 1.530130432,
1707
+ "gpu_mem": 4.824109056,
1708
+ "loss": 0.3843,
1709
+ "grad_norm": 4.67440128326416,
1710
+ "learning_rate": 0.00010093980550238675
1711
+ },
1712
+ {
1713
+ "step": 191,
1714
+ "epoch": 1.2949152542372881,
1715
+ "cpu_mem": 1.530130432,
1716
+ "gpu_mem": 4.823927808,
1717
+ "loss": 0.3162,
1718
+ "grad_norm": 5.031257152557373,
1719
+ "learning_rate": 9.925649644669391e-05
1720
+ },
1721
+ {
1722
+ "step": 192,
1723
+ "epoch": 1.3016949152542372,
1724
+ "cpu_mem": 1.530130432,
1725
+ "gpu_mem": 4.824059904,
1726
+ "loss": 0.2999,
1727
+ "grad_norm": 5.343701362609863,
1728
+ "learning_rate": 9.758037306013526e-05
1729
+ },
1730
+ {
1731
+ "step": 193,
1732
+ "epoch": 1.3084745762711865,
1733
+ "cpu_mem": 1.530130432,
1734
+ "gpu_mem": 4.824033792,
1735
+ "loss": 0.3915,
1736
+ "grad_norm": 7.458854675292969,
1737
+ "learning_rate": 9.591167269463255e-05
1738
+ },
1739
+ {
1740
+ "step": 194,
1741
+ "epoch": 1.3152542372881357,
1742
+ "cpu_mem": 1.530130432,
1743
+ "gpu_mem": 4.824,
1744
+ "loss": 0.345,
1745
+ "grad_norm": 7.649142265319824,
1746
+ "learning_rate": 9.425063165095088e-05
1747
+ },
1748
+ {
1749
+ "step": 195,
1750
+ "epoch": 1.3220338983050848,
1751
+ "cpu_mem": 1.530130432,
1752
+ "gpu_mem": 4.824104448,
1753
+ "loss": 0.2545,
1754
+ "grad_norm": 6.707334518432617,
1755
+ "learning_rate": 9.259748514523653e-05
1756
+ },
1757
+ {
1758
+ "step": 196,
1759
+ "epoch": 1.3288135593220338,
1760
+ "cpu_mem": 1.530130432,
1761
+ "gpu_mem": 4.82409984,
1762
+ "loss": 0.4273,
1763
+ "grad_norm": 6.2424116134643555,
1764
+ "learning_rate": 9.095246727570879e-05
1765
+ },
1766
+ {
1767
+ "step": 197,
1768
+ "epoch": 1.335593220338983,
1769
+ "cpu_mem": 1.530130432,
1770
+ "gpu_mem": 4.823958528,
1771
+ "loss": 0.3421,
1772
+ "grad_norm": 6.736060619354248,
1773
+ "learning_rate": 8.931581098950973e-05
1774
+ },
1775
+ {
1776
+ "step": 198,
1777
+ "epoch": 1.3423728813559321,
1778
+ "cpu_mem": 1.530130432,
1779
+ "gpu_mem": 4.824150528,
1780
+ "loss": 0.409,
1781
+ "grad_norm": 6.864956378936768,
1782
+ "learning_rate": 8.768774804971705e-05
1783
+ },
1784
+ {
1785
+ "step": 199,
1786
+ "epoch": 1.3491525423728814,
1787
+ "cpu_mem": 1.530130432,
1788
+ "gpu_mem": 4.824001536,
1789
+ "loss": 0.4033,
1790
+ "grad_norm": 8.16348934173584,
1791
+ "learning_rate": 8.606850900252478e-05
1792
+ },
1793
+ {
1794
+ "step": 200,
1795
+ "epoch": 1.3559322033898304,
1796
+ "cpu_mem": 1.530130432,
1797
+ "gpu_mem": 4.824104448,
1798
+ "loss": 0.2264,
1799
+ "grad_norm": 4.169793605804443,
1800
+ "learning_rate": 8.445832314459608e-05
1801
+ },
1802
+ {
1803
+ "step": 201,
1804
+ "epoch": 1.3627118644067797,
1805
+ "cpu_mem": 1.530130432,
1806
+ "gpu_mem": 4.8243072,
1807
+ "loss": 0.301,
1808
+ "grad_norm": 5.401573657989502,
1809
+ "learning_rate": 8.285741849059311e-05
1810
+ },
1811
+ {
1812
+ "step": 202,
1813
+ "epoch": 1.3694915254237288,
1814
+ "cpu_mem": 1.530130432,
1815
+ "gpu_mem": 4.824109056,
1816
+ "loss": 0.2946,
1817
+ "grad_norm": 5.444881439208984,
1818
+ "learning_rate": 8.126602174088843e-05
1819
+ },
1820
+ {
1821
+ "step": 203,
1822
+ "epoch": 1.376271186440678,
1823
+ "cpu_mem": 1.530130432,
1824
+ "gpu_mem": 4.823995392,
1825
+ "loss": 0.3586,
1826
+ "grad_norm": 7.087218284606934,
1827
+ "learning_rate": 7.968435824946242e-05
1828
+ },
1829
+ {
1830
+ "step": 204,
1831
+ "epoch": 1.383050847457627,
1832
+ "cpu_mem": 1.530130432,
1833
+ "gpu_mem": 4.824009216,
1834
+ "loss": 0.2954,
1835
+ "grad_norm": 5.234076976776123,
1836
+ "learning_rate": 7.811265199199152e-05
1837
+ },
1838
+ {
1839
+ "step": 205,
1840
+ "epoch": 1.3898305084745763,
1841
+ "cpu_mem": 1.530130432,
1842
+ "gpu_mem": 4.82405376,
1843
+ "loss": 0.3908,
1844
+ "grad_norm": 6.1905012130737305,
1845
+ "learning_rate": 7.655112553413135e-05
1846
+ },
1847
+ {
1848
+ "step": 206,
1849
+ "epoch": 1.3966101694915254,
1850
+ "cpu_mem": 1.530130432,
1851
+ "gpu_mem": 4.823995392,
1852
+ "loss": 0.3348,
1853
+ "grad_norm": 5.874136447906494,
1854
+ "learning_rate": 7.500000000000002e-05
1855
+ },
1856
+ {
1857
+ "step": 207,
1858
+ "epoch": 1.4033898305084747,
1859
+ "cpu_mem": 1.530130432,
1860
+ "gpu_mem": 4.824228864,
1861
+ "loss": 0.3414,
1862
+ "grad_norm": 5.023383617401123,
1863
+ "learning_rate": 7.345949504086507e-05
1864
+ },
1865
+ {
1866
+ "step": 208,
1867
+ "epoch": 1.4101694915254237,
1868
+ "cpu_mem": 1.530130432,
1869
+ "gpu_mem": 4.824259584,
1870
+ "loss": 0.2438,
1871
+ "grad_norm": 7.677697658538818,
1872
+ "learning_rate": 7.192982880403917e-05
1873
+ },
1874
+ {
1875
+ "step": 209,
1876
+ "epoch": 1.4169491525423727,
1877
+ "cpu_mem": 1.530130432,
1878
+ "gpu_mem": 4.824185856,
1879
+ "loss": 0.3552,
1880
+ "grad_norm": 5.987409591674805,
1881
+ "learning_rate": 7.041121790198881e-05
1882
+ },
1883
+ {
1884
+ "step": 210,
1885
+ "epoch": 1.423728813559322,
1886
+ "cpu_mem": 1.530130432,
1887
+ "gpu_mem": 4.824073728,
1888
+ "loss": 0.4027,
1889
+ "grad_norm": 5.519845485687256,
1890
+ "learning_rate": 6.890387738166041e-05
1891
+ },
1892
+ {
1893
+ "step": 211,
1894
+ "epoch": 1.4305084745762713,
1895
+ "cpu_mem": 1.530130432,
1896
+ "gpu_mem": 4.82402304,
1897
+ "loss": 0.3283,
1898
+ "grad_norm": 6.361313819885254,
1899
+ "learning_rate": 6.740802069402771e-05
1900
+ },
1901
+ {
1902
+ "step": 212,
1903
+ "epoch": 1.4372881355932203,
1904
+ "cpu_mem": 1.530130432,
1905
+ "gpu_mem": 4.82399232,
1906
+ "loss": 0.3592,
1907
+ "grad_norm": 5.429263591766357,
1908
+ "learning_rate": 6.592385966386588e-05
1909
+ },
1910
+ {
1911
+ "step": 213,
1912
+ "epoch": 1.4440677966101694,
1913
+ "cpu_mem": 1.530130432,
1914
+ "gpu_mem": 4.82401536,
1915
+ "loss": 0.4217,
1916
+ "grad_norm": 8.665788650512695,
1917
+ "learning_rate": 6.445160445975536e-05
1918
+ },
1919
+ {
1920
+ "step": 214,
1921
+ "epoch": 1.4508474576271186,
1922
+ "cpu_mem": 1.530130432,
1923
+ "gpu_mem": 4.824098304,
1924
+ "loss": 0.421,
1925
+ "grad_norm": 7.960748195648193,
1926
+ "learning_rate": 6.299146356432029e-05
1927
+ },
1928
+ {
1929
+ "step": 215,
1930
+ "epoch": 1.457627118644068,
1931
+ "cpu_mem": 1.530130432,
1932
+ "gpu_mem": 4.824026112,
1933
+ "loss": 0.397,
1934
+ "grad_norm": 8.969430923461914,
1935
+ "learning_rate": 6.154364374470568e-05
1936
+ },
1937
+ {
1938
+ "step": 216,
1939
+ "epoch": 1.464406779661017,
1940
+ "cpu_mem": 1.530130432,
1941
+ "gpu_mem": 4.824192,
1942
+ "loss": 0.373,
1943
+ "grad_norm": 5.677824020385742,
1944
+ "learning_rate": 6.010835002329795e-05
1945
+ },
1946
+ {
1947
+ "step": 217,
1948
+ "epoch": 1.471186440677966,
1949
+ "cpu_mem": 1.530130432,
1950
+ "gpu_mem": 4.824033792,
1951
+ "loss": 0.4046,
1952
+ "grad_norm": 6.211999893188477,
1953
+ "learning_rate": 5.8685785648691894e-05
1954
+ },
1955
+ {
1956
+ "step": 218,
1957
+ "epoch": 1.4779661016949153,
1958
+ "cpu_mem": 1.530130432,
1959
+ "gpu_mem": 4.824010752,
1960
+ "loss": 0.3678,
1961
+ "grad_norm": 5.141634464263916,
1962
+ "learning_rate": 5.72761520669092e-05
1963
+ },
1964
+ {
1965
+ "step": 219,
1966
+ "epoch": 1.4847457627118645,
1967
+ "cpu_mem": 1.530130432,
1968
+ "gpu_mem": 4.824136704,
1969
+ "loss": 0.4164,
1970
+ "grad_norm": 5.972975254058838,
1971
+ "learning_rate": 5.587964889287218e-05
1972
+ },
1973
+ {
1974
+ "step": 220,
1975
+ "epoch": 1.4915254237288136,
1976
+ "cpu_mem": 1.530130432,
1977
+ "gpu_mem": 4.824170496,
1978
+ "loss": 0.3468,
1979
+ "grad_norm": 5.6360673904418945,
1980
+ "learning_rate": 5.449647388213678e-05
1981
+ },
1982
+ {
1983
+ "step": 221,
1984
+ "epoch": 1.4983050847457626,
1985
+ "cpu_mem": 1.530130432,
1986
+ "gpu_mem": 4.8240384,
1987
+ "loss": 0.3753,
1988
+ "grad_norm": 5.116311550140381,
1989
+ "learning_rate": 5.312682290288869e-05
1990
+ },
1991
+ {
1992
+ "step": 222,
1993
+ "epoch": 1.505084745762712,
1994
+ "cpu_mem": 1.530130432,
1995
+ "gpu_mem": 4.824175104,
1996
+ "loss": 0.3698,
1997
+ "grad_norm": 6.675261974334717,
1998
+ "learning_rate": 5.1770889908207245e-05
1999
+ },
2000
+ {
2001
+ "step": 223,
2002
+ "epoch": 1.5118644067796612,
2003
+ "cpu_mem": 1.530130432,
2004
+ "gpu_mem": 4.824089088,
2005
+ "loss": 0.4058,
2006
+ "grad_norm": 6.807044982910156,
2007
+ "learning_rate": 5.0428866908599864e-05
2008
+ },
2009
+ {
2010
+ "step": 224,
2011
+ "epoch": 1.5186440677966102,
2012
+ "cpu_mem": 1.530130432,
2013
+ "gpu_mem": 4.82405376,
2014
+ "loss": 0.2952,
2015
+ "grad_norm": 4.836634635925293,
2016
+ "learning_rate": 4.9100943944812114e-05
2017
+ },
2018
+ {
2019
+ "step": 225,
2020
+ "epoch": 1.5254237288135593,
2021
+ "cpu_mem": 1.530130432,
2022
+ "gpu_mem": 4.824018432,
2023
+ "loss": 0.2557,
2024
+ "grad_norm": 3.945059299468994,
2025
+ "learning_rate": 4.778730906091632e-05
2026
+ },
2027
+ {
2028
+ "step": 226,
2029
+ "epoch": 1.5322033898305085,
2030
+ "cpu_mem": 1.530130432,
2031
+ "gpu_mem": 4.824167424,
2032
+ "loss": 0.3338,
2033
+ "grad_norm": 4.126738548278809,
2034
+ "learning_rate": 4.648814827768322e-05
2035
+ },
2036
+ {
2037
+ "step": 227,
2038
+ "epoch": 1.5389830508474578,
2039
+ "cpu_mem": 1.530130432,
2040
+ "gpu_mem": 4.824056832,
2041
+ "loss": 0.2903,
2042
+ "grad_norm": 6.6302809715271,
2043
+ "learning_rate": 4.5203645566239816e-05
2044
+ },
2045
+ {
2046
+ "step": 228,
2047
+ "epoch": 1.5457627118644068,
2048
+ "cpu_mem": 1.530130432,
2049
+ "gpu_mem": 4.824001536,
2050
+ "loss": 0.4299,
2051
+ "grad_norm": 7.667603015899658,
2052
+ "learning_rate": 4.3933982822017876e-05
2053
+ },
2054
+ {
2055
+ "step": 229,
2056
+ "epoch": 1.5525423728813559,
2057
+ "cpu_mem": 1.530130432,
2058
+ "gpu_mem": 4.823943168,
2059
+ "loss": 0.2388,
2060
+ "grad_norm": 4.576896667480469,
2061
+ "learning_rate": 4.267933983899601e-05
2062
+ },
2063
+ {
2064
+ "step": 230,
2065
+ "epoch": 1.559322033898305,
2066
+ "cpu_mem": 1.530130432,
2067
+ "gpu_mem": 4.824,
2068
+ "loss": 0.3806,
2069
+ "grad_norm": 6.581130027770996,
2070
+ "learning_rate": 4.143989428423947e-05
2071
+ },
2072
+ {
2073
+ "step": 231,
2074
+ "epoch": 1.5661016949152542,
2075
+ "cpu_mem": 1.530130432,
2076
+ "gpu_mem": 4.824278016,
2077
+ "loss": 0.3558,
2078
+ "grad_norm": 4.444263458251953,
2079
+ "learning_rate": 4.0215821672741213e-05
2080
+ },
2081
+ {
2082
+ "step": 232,
2083
+ "epoch": 1.5728813559322035,
2084
+ "cpu_mem": 1.530130432,
2085
+ "gpu_mem": 4.824001536,
2086
+ "loss": 0.3733,
2087
+ "grad_norm": 6.31781005859375,
2088
+ "learning_rate": 3.900729534256745e-05
2089
+ },
2090
+ {
2091
+ "step": 233,
2092
+ "epoch": 1.5796610169491525,
2093
+ "cpu_mem": 1.530130432,
2094
+ "gpu_mem": 4.82431488,
2095
+ "loss": 0.2761,
2096
+ "grad_norm": 4.112189769744873,
2097
+ "learning_rate": 3.781448643031187e-05
2098
+ },
2099
+ {
2100
+ "step": 234,
2101
+ "epoch": 1.5864406779661016,
2102
+ "cpu_mem": 1.530130432,
2103
+ "gpu_mem": 4.824190464,
2104
+ "loss": 0.218,
2105
+ "grad_norm": 3.6443135738372803,
2106
+ "learning_rate": 3.663756384686127e-05
2107
+ },
2108
+ {
2109
+ "step": 235,
2110
+ "epoch": 1.5932203389830508,
2111
+ "cpu_mem": 1.530130432,
2112
+ "gpu_mem": 4.82394624,
2113
+ "loss": 0.3588,
2114
+ "grad_norm": 6.56972599029541,
2115
+ "learning_rate": 3.547669425347647e-05
2116
+ },
2117
+ {
2118
+ "step": 236,
2119
+ "epoch": 1.6,
2120
+ "cpu_mem": 1.530130432,
2121
+ "gpu_mem": 4.824006144,
2122
+ "loss": 0.3269,
2123
+ "grad_norm": 5.762430191040039,
2124
+ "learning_rate": 3.433204203819185e-05
2125
+ },
2126
+ {
2127
+ "step": 237,
2128
+ "epoch": 1.6067796610169491,
2129
+ "cpu_mem": 1.530130432,
2130
+ "gpu_mem": 4.824067584,
2131
+ "loss": 0.4199,
2132
+ "grad_norm": 6.846770763397217,
2133
+ "learning_rate": 3.3203769292536764e-05
2134
+ },
2135
+ {
2136
+ "step": 238,
2137
+ "epoch": 1.6135593220338982,
2138
+ "cpu_mem": 1.530130432,
2139
+ "gpu_mem": 4.82406912,
2140
+ "loss": 0.3317,
2141
+ "grad_norm": 5.823359489440918,
2142
+ "learning_rate": 3.209203578858191e-05
2143
+ },
2144
+ {
2145
+ "step": 239,
2146
+ "epoch": 1.6203389830508474,
2147
+ "cpu_mem": 1.530130432,
2148
+ "gpu_mem": 4.82432256,
2149
+ "loss": 0.3896,
2150
+ "grad_norm": 7.253147125244141,
2151
+ "learning_rate": 3.099699895631474e-05
2152
+ },
2153
+ {
2154
+ "step": 240,
2155
+ "epoch": 1.6271186440677967,
2156
+ "cpu_mem": 1.530130432,
2157
+ "gpu_mem": 4.823972352,
2158
+ "loss": 0.5372,
2159
+ "grad_norm": 7.62632417678833,
2160
+ "learning_rate": 2.9918813861345952e-05
2161
+ },
2162
+ {
2163
+ "step": 241,
2164
+ "epoch": 1.6338983050847458,
2165
+ "cpu_mem": 1.530130432,
2166
+ "gpu_mem": 4.8242688,
2167
+ "loss": 0.2841,
2168
+ "grad_norm": 5.374094009399414,
2169
+ "learning_rate": 2.885763318295102e-05
2170
+ },
2171
+ {
2172
+ "step": 242,
2173
+ "epoch": 1.6406779661016948,
2174
+ "cpu_mem": 1.530130432,
2175
+ "gpu_mem": 4.82413056,
2176
+ "loss": 0.2746,
2177
+ "grad_norm": 5.998660564422607,
2178
+ "learning_rate": 2.781360719244964e-05
2179
+ },
2180
+ {
2181
+ "step": 243,
2182
+ "epoch": 1.647457627118644,
2183
+ "cpu_mem": 1.530130432,
2184
+ "gpu_mem": 4.823983104,
2185
+ "loss": 0.4833,
2186
+ "grad_norm": 7.127227306365967,
2187
+ "learning_rate": 2.6786883731926306e-05
2188
+ },
2189
+ {
2190
+ "step": 244,
2191
+ "epoch": 1.6542372881355933,
2192
+ "cpu_mem": 1.530130432,
2193
+ "gpu_mem": 4.82412288,
2194
+ "loss": 0.1932,
2195
+ "grad_norm": 3.3074233531951904,
2196
+ "learning_rate": 2.5777608193294396e-05
2197
+ },
2198
+ {
2199
+ "step": 245,
2200
+ "epoch": 1.6610169491525424,
2201
+ "cpu_mem": 1.530130432,
2202
+ "gpu_mem": 4.824001536,
2203
+ "loss": 0.4678,
2204
+ "grad_norm": 7.167958736419678,
2205
+ "learning_rate": 2.4785923497707956e-05
2206
+ },
2207
+ {
2208
+ "step": 246,
2209
+ "epoch": 1.6677966101694914,
2210
+ "cpu_mem": 1.530130432,
2211
+ "gpu_mem": 4.824095232,
2212
+ "loss": 0.3263,
2213
+ "grad_norm": 5.361066818237305,
2214
+ "learning_rate": 2.38119700753228e-05
2215
+ },
2216
+ {
2217
+ "step": 247,
2218
+ "epoch": 1.6745762711864407,
2219
+ "cpu_mem": 1.530130432,
2220
+ "gpu_mem": 4.824113664,
2221
+ "loss": 0.3142,
2222
+ "grad_norm": 7.1523284912109375,
2223
+ "learning_rate": 2.285588584541047e-05
2224
+ },
2225
+ {
2226
+ "step": 248,
2227
+ "epoch": 1.68135593220339,
2228
+ "cpu_mem": 1.530130432,
2229
+ "gpu_mem": 4.824066048,
2230
+ "loss": 0.4077,
2231
+ "grad_norm": 5.50556755065918,
2232
+ "learning_rate": 2.1917806196827792e-05
2233
+ },
2234
+ {
2235
+ "step": 249,
2236
+ "epoch": 1.688135593220339,
2237
+ "cpu_mem": 1.530130432,
2238
+ "gpu_mem": 4.823972352,
2239
+ "loss": 0.2624,
2240
+ "grad_norm": 4.57589864730835,
2241
+ "learning_rate": 2.0997863968844914e-05
2242
+ },
2243
+ {
2244
+ "step": 250,
2245
+ "epoch": 1.694915254237288,
2246
+ "cpu_mem": 1.530130432,
2247
+ "gpu_mem": 4.824064512,
2248
+ "loss": 0.2678,
2249
+ "grad_norm": 6.028458118438721,
2250
+ "learning_rate": 2.009618943233419e-05
2251
+ },
2252
+ {
2253
+ "step": 251,
2254
+ "epoch": 1.7016949152542373,
2255
+ "cpu_mem": 1.530130432,
2256
+ "gpu_mem": 4.82397696,
2257
+ "loss": 0.2385,
2258
+ "grad_norm": 9.162259101867676,
2259
+ "learning_rate": 1.921291027132278e-05
2260
+ },
2261
+ {
2262
+ "step": 252,
2263
+ "epoch": 1.7084745762711866,
2264
+ "cpu_mem": 1.530130432,
2265
+ "gpu_mem": 4.824019968,
2266
+ "loss": 0.3512,
2267
+ "grad_norm": 4.852019309997559,
2268
+ "learning_rate": 1.834815156491165e-05
2269
+ },
2270
+ {
2271
+ "step": 253,
2272
+ "epoch": 1.7152542372881356,
2273
+ "cpu_mem": 1.530130432,
2274
+ "gpu_mem": 4.824213504,
2275
+ "loss": 0.3405,
2276
+ "grad_norm": 6.2241082191467285,
2277
+ "learning_rate": 1.750203576956341e-05
2278
+ },
2279
+ {
2280
+ "step": 254,
2281
+ "epoch": 1.7220338983050847,
2282
+ "cpu_mem": 1.530130432,
2283
+ "gpu_mem": 4.824009216,
2284
+ "loss": 0.3576,
2285
+ "grad_norm": 5.062891960144043,
2286
+ "learning_rate": 1.6674682701761493e-05
2287
+ },
2288
+ {
2289
+ "step": 255,
2290
+ "epoch": 1.7288135593220337,
2291
+ "cpu_mem": 1.530130432,
2292
+ "gpu_mem": 4.824165888,
2293
+ "loss": 0.4266,
2294
+ "grad_norm": 6.995367527008057,
2295
+ "learning_rate": 1.5866209521043304e-05
2296
+ },
2297
+ {
2298
+ "step": 256,
2299
+ "epoch": 1.735593220338983,
2300
+ "cpu_mem": 1.530130432,
2301
+ "gpu_mem": 4.82399232,
2302
+ "loss": 0.2653,
2303
+ "grad_norm": 4.953624248504639,
2304
+ "learning_rate": 1.5076730713409523e-05
2305
+ },
2306
+ {
2307
+ "step": 257,
2308
+ "epoch": 1.7423728813559323,
2309
+ "cpu_mem": 1.530130432,
2310
+ "gpu_mem": 4.824405504,
2311
+ "loss": 0.4008,
2312
+ "grad_norm": 4.404302597045898,
2313
+ "learning_rate": 1.4306358075111923e-05
2314
+ },
2315
+ {
2316
+ "step": 258,
2317
+ "epoch": 1.7491525423728813,
2318
+ "cpu_mem": 1.530130432,
2319
+ "gpu_mem": 4.824064512,
2320
+ "loss": 0.4191,
2321
+ "grad_norm": 7.855671405792236,
2322
+ "learning_rate": 1.3555200696822232e-05
2323
+ },
2324
+ {
2325
+ "step": 259,
2326
+ "epoch": 1.7559322033898304,
2327
+ "cpu_mem": 1.530130432,
2328
+ "gpu_mem": 4.823981568,
2329
+ "loss": 0.2937,
2330
+ "grad_norm": 7.6881279945373535,
2331
+ "learning_rate": 1.2823364948184095e-05
2332
+ },
2333
+ {
2334
+ "step": 260,
2335
+ "epoch": 1.7627118644067796,
2336
+ "cpu_mem": 1.530130432,
2337
+ "gpu_mem": 4.824098304,
2338
+ "loss": 0.1671,
2339
+ "grad_norm": 4.152520179748535,
2340
+ "learning_rate": 1.2110954462750166e-05
2341
+ },
2342
+ {
2343
+ "step": 261,
2344
+ "epoch": 1.769491525423729,
2345
+ "cpu_mem": 1.530130432,
2346
+ "gpu_mem": 4.82405376,
2347
+ "loss": 0.2918,
2348
+ "grad_norm": 4.768893241882324,
2349
+ "learning_rate": 1.1418070123306989e-05
2350
+ },
2351
+ {
2352
+ "step": 262,
2353
+ "epoch": 1.776271186440678,
2354
+ "cpu_mem": 1.530130432,
2355
+ "gpu_mem": 4.824010752,
2356
+ "loss": 0.2511,
2357
+ "grad_norm": 3.7594785690307617,
2358
+ "learning_rate": 1.0744810047589115e-05
2359
+ },
2360
+ {
2361
+ "step": 263,
2362
+ "epoch": 1.783050847457627,
2363
+ "cpu_mem": 1.530130432,
2364
+ "gpu_mem": 4.824047616,
2365
+ "loss": 0.2518,
2366
+ "grad_norm": 4.245553970336914,
2367
+ "learning_rate": 1.0091269574384874e-05
2368
+ },
2369
+ {
2370
+ "step": 264,
2371
+ "epoch": 1.7898305084745763,
2372
+ "cpu_mem": 1.530130432,
2373
+ "gpu_mem": 4.824135168,
2374
+ "loss": 0.2495,
2375
+ "grad_norm": 4.321831226348877,
2376
+ "learning_rate": 9.45754125003576e-06
2377
+ },
2378
+ {
2379
+ "step": 265,
2380
+ "epoch": 1.7966101694915255,
2381
+ "cpu_mem": 1.530130432,
2382
+ "gpu_mem": 4.82405376,
2383
+ "loss": 0.3896,
2384
+ "grad_norm": 5.726314067840576,
2385
+ "learning_rate": 8.843714815330987e-06
2386
+ },
2387
+ {
2388
+ "step": 266,
2389
+ "epoch": 1.8033898305084746,
2390
+ "cpu_mem": 1.530130432,
2391
+ "gpu_mem": 4.8242688,
2392
+ "loss": 0.4383,
2393
+ "grad_norm": 5.080480098724365,
2394
+ "learning_rate": 8.249877192799731e-06
2395
+ },
2396
+ {
2397
+ "step": 267,
2398
+ "epoch": 1.8101694915254236,
2399
+ "cpu_mem": 1.530130432,
2400
+ "gpu_mem": 4.82406144,
2401
+ "loss": 0.3601,
2402
+ "grad_norm": 5.688658714294434,
2403
+ "learning_rate": 7.676112474402068e-06
2404
+ },
2405
+ {
2406
+ "step": 268,
2407
+ "epoch": 1.8169491525423729,
2408
+ "cpu_mem": 1.530130432,
2409
+ "gpu_mem": 4.824066048,
2410
+ "loss": 0.3501,
2411
+ "grad_norm": 6.421919345855713,
2412
+ "learning_rate": 7.122501909620926e-06
2413
+ },
2414
+ {
2415
+ "step": 269,
2416
+ "epoch": 1.8237288135593221,
2417
+ "cpu_mem": 1.530130432,
2418
+ "gpu_mem": 4.8240768,
2419
+ "loss": 0.3546,
2420
+ "grad_norm": 6.48486328125,
2421
+ "learning_rate": 6.5891238939566275e-06
2422
+ },
2423
+ {
2424
+ "step": 270,
2425
+ "epoch": 1.8305084745762712,
2426
+ "cpu_mem": 1.530130432,
2427
+ "gpu_mem": 4.8241152,
2428
+ "loss": 0.2864,
2429
+ "grad_norm": 5.4872260093688965,
2430
+ "learning_rate": 6.076053957825411e-06
2431
+ },
2432
+ {
2433
+ "step": 271,
2434
+ "epoch": 1.8372881355932202,
2435
+ "cpu_mem": 1.530130432,
2436
+ "gpu_mem": 4.824167424,
2437
+ "loss": 0.3182,
2438
+ "grad_norm": 5.292596817016602,
2439
+ "learning_rate": 5.583364755863701e-06
2440
+ },
2441
+ {
2442
+ "step": 272,
2443
+ "epoch": 1.8440677966101695,
2444
+ "cpu_mem": 1.530130432,
2445
+ "gpu_mem": 4.824026112,
2446
+ "loss": 0.3154,
2447
+ "grad_norm": 4.313265800476074,
2448
+ "learning_rate": 5.11112605663977e-06
2449
+ },
2450
+ {
2451
+ "step": 273,
2452
+ "epoch": 1.8508474576271188,
2453
+ "cpu_mem": 1.530130432,
2454
+ "gpu_mem": 4.823906304,
2455
+ "loss": 0.2951,
2456
+ "grad_norm": 5.555436134338379,
2457
+ "learning_rate": 4.659404732773908e-06
2458
+ },
2459
+ {
2460
+ "step": 274,
2461
+ "epoch": 1.8576271186440678,
2462
+ "cpu_mem": 1.530130432,
2463
+ "gpu_mem": 4.824133632,
2464
+ "loss": 0.2837,
2465
+ "grad_norm": 4.105805397033691,
2466
+ "learning_rate": 4.228264751468752e-06
2467
+ },
2468
+ {
2469
+ "step": 275,
2470
+ "epoch": 1.8644067796610169,
2471
+ "cpu_mem": 1.530130432,
2472
+ "gpu_mem": 4.824377856,
2473
+ "loss": 0.2878,
2474
+ "grad_norm": 6.073948860168457,
2475
+ "learning_rate": 3.817767165451041e-06
2476
+ },
2477
+ {
2478
+ "step": 276,
2479
+ "epoch": 1.8711864406779661,
2480
+ "cpu_mem": 1.530130432,
2481
+ "gpu_mem": 4.8240384,
2482
+ "loss": 0.2768,
2483
+ "grad_norm": 4.70308256149292,
2484
+ "learning_rate": 3.4279701043260886e-06
2485
+ },
2486
+ {
2487
+ "step": 277,
2488
+ "epoch": 1.8779661016949154,
2489
+ "cpu_mem": 1.530130432,
2490
+ "gpu_mem": 4.82398464,
2491
+ "loss": 0.3124,
2492
+ "grad_norm": 4.56823205947876,
2493
+ "learning_rate": 3.0589287663461472e-06
2494
+ },
2495
+ {
2496
+ "step": 278,
2497
+ "epoch": 1.8847457627118644,
2498
+ "cpu_mem": 1.530130432,
2499
+ "gpu_mem": 4.824147456,
2500
+ "loss": 0.3178,
2501
+ "grad_norm": 6.935946464538574,
2502
+ "learning_rate": 2.710695410593994e-06
2503
+ },
2504
+ {
2505
+ "step": 279,
2506
+ "epoch": 1.8915254237288135,
2507
+ "cpu_mem": 1.530130432,
2508
+ "gpu_mem": 4.824087552,
2509
+ "loss": 0.4199,
2510
+ "grad_norm": 5.427887439727783,
2511
+ "learning_rate": 2.3833193495825853e-06
2512
+ },
2513
+ {
2514
+ "step": 280,
2515
+ "epoch": 1.8983050847457628,
2516
+ "cpu_mem": 1.530130432,
2517
+ "gpu_mem": 4.824067584,
2518
+ "loss": 0.3266,
2519
+ "grad_norm": 4.603908538818359,
2520
+ "learning_rate": 2.076846942272026e-06
2521
+ },
2522
+ {
2523
+ "step": 281,
2524
+ "epoch": 1.905084745762712,
2525
+ "cpu_mem": 1.530130432,
2526
+ "gpu_mem": 4.824003072,
2527
+ "loss": 0.3995,
2528
+ "grad_norm": 4.938188076019287,
2529
+ "learning_rate": 1.791321587504768e-06
2530
+ },
2531
+ {
2532
+ "step": 282,
2533
+ "epoch": 1.911864406779661,
2534
+ "cpu_mem": 1.530130432,
2535
+ "gpu_mem": 4.824431616,
2536
+ "loss": 0.2409,
2537
+ "grad_norm": 4.201538562774658,
2538
+ "learning_rate": 1.5267837178600972e-06
2539
+ },
2540
+ {
2541
+ "step": 283,
2542
+ "epoch": 1.9186440677966101,
2543
+ "cpu_mem": 1.530130432,
2544
+ "gpu_mem": 4.82413824,
2545
+ "loss": 0.3625,
2546
+ "grad_norm": 6.524600982666016,
2547
+ "learning_rate": 1.2832707939284427e-06
2548
+ },
2549
+ {
2550
+ "step": 284,
2551
+ "epoch": 1.9254237288135592,
2552
+ "cpu_mem": 1.530130432,
2553
+ "gpu_mem": 4.823993856,
2554
+ "loss": 0.3915,
2555
+ "grad_norm": 4.94792366027832,
2556
+ "learning_rate": 1.0608172990067553e-06
2557
+ },
2558
+ {
2559
+ "step": 285,
2560
+ "epoch": 1.9322033898305084,
2561
+ "cpu_mem": 1.530130432,
2562
+ "gpu_mem": 4.824047616,
2563
+ "loss": 0.3007,
2564
+ "grad_norm": 6.609028339385986,
2565
+ "learning_rate": 8.594547342153979e-07
2566
+ },
2567
+ {
2568
+ "step": 286,
2569
+ "epoch": 1.9389830508474577,
2570
+ "cpu_mem": 1.530130432,
2571
+ "gpu_mem": 4.824465408,
2572
+ "loss": 0.293,
2573
+ "grad_norm": 4.769420623779297,
2574
+ "learning_rate": 6.792116140373116e-07
2575
+ },
2576
+ {
2577
+ "step": 287,
2578
+ "epoch": 1.9457627118644067,
2579
+ "cpu_mem": 1.530130432,
2580
+ "gpu_mem": 4.824235008,
2581
+ "loss": 0.3999,
2582
+ "grad_norm": 8.176187515258789,
2583
+ "learning_rate": 5.201134622801473e-07
2584
+ },
2585
+ {
2586
+ "step": 288,
2587
+ "epoch": 1.9525423728813558,
2588
+ "cpu_mem": 1.530130432,
2589
+ "gpu_mem": 4.824019968,
2590
+ "loss": 0.4427,
2591
+ "grad_norm": 6.034461975097656,
2592
+ "learning_rate": 3.821828084619727e-07
2593
+ },
2594
+ {
2595
+ "step": 289,
2596
+ "epoch": 1.959322033898305,
2597
+ "cpu_mem": 1.530130432,
2598
+ "gpu_mem": 4.824104448,
2599
+ "loss": 0.254,
2600
+ "grad_norm": 7.134485721588135,
2601
+ "learning_rate": 2.654391846207915e-07
2602
+ },
2603
+ {
2604
+ "step": 290,
2605
+ "epoch": 1.9661016949152543,
2606
+ "cpu_mem": 1.530130432,
2607
+ "gpu_mem": 4.824029184,
2608
+ "loss": 0.405,
2609
+ "grad_norm": 6.609726905822754,
2610
+ "learning_rate": 1.6989912254880556e-07
2611
+ },
2612
+ {
2613
+ "step": 291,
2614
+ "epoch": 1.9728813559322034,
2615
+ "cpu_mem": 1.530130432,
2616
+ "gpu_mem": 4.824064512,
2617
+ "loss": 0.4325,
2618
+ "grad_norm": 6.219490051269531,
2619
+ "learning_rate": 9.557615145123765e-08
2620
+ },
2621
+ {
2622
+ "step": 292,
2623
+ "epoch": 1.9796610169491524,
2624
+ "cpu_mem": 1.530130432,
2625
+ "gpu_mem": 4.824147456,
2626
+ "loss": 0.3779,
2627
+ "grad_norm": 4.7131218910217285,
2628
+ "learning_rate": 4.248079603064724e-08
2629
+ },
2630
+ {
2631
+ "step": 293,
2632
+ "epoch": 1.9864406779661017,
2633
+ "cpu_mem": 1.530130432,
2634
+ "gpu_mem": 4.824064512,
2635
+ "loss": 0.4359,
2636
+ "grad_norm": 6.299029350280762,
2637
+ "learning_rate": 1.0620574996372811e-08
2638
+ },
2639
+ {
2640
+ "step": 294,
2641
+ "epoch": 1.993220338983051,
2642
+ "cpu_mem": 1.530130432,
2643
+ "gpu_mem": 4.824090624,
2644
+ "loss": 0.3799,
2645
+ "grad_norm": 6.89005184173584,
2646
+ "learning_rate": 0.0
2647
+ },
2648
+ {
2649
+ "step": 294,
2650
+ "epoch": 1.993220338983051,
2651
+ "cpu_mem": 1.530130432,
2652
+ "gpu_mem": 4.824090624,
2653
+ "train_runtime": 4484.6084,
2654
+ "train_samples_per_second": 4.204,
2655
+ "train_steps_per_second": 0.066,
2656
+ "total_flos": 0.0,
2657
+ "train_loss": 0.6128277448671204
2658
+ }
2659
+ ]
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 8,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "boolq",
3
+ "results": 0.7033639143730887
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "BOOLQ",
5
+ "dataset_id": "google/boolq",
6
+ "preprocess_id": "boolq_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 6317696
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 2,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-boolq-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T19:09:28.617533"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-boolq-r8-a2/training_logs.json ADDED
@@ -0,0 +1,2659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.006779661016949152,
5
+ "cpu_mem": 1.48830208,
6
+ "gpu_mem": 4.443082752,
7
+ "loss": 8.869,
8
+ "grad_norm": 234.86416625976562,
9
+ "learning_rate": 9.999999999999999e-06
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.013559322033898305,
14
+ "cpu_mem": 1.494396928,
15
+ "gpu_mem": 4.493840896,
16
+ "loss": 8.9376,
17
+ "grad_norm": 240.33407592773438,
18
+ "learning_rate": 1.9999999999999998e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.020338983050847456,
23
+ "cpu_mem": 1.49518336,
24
+ "gpu_mem": 4.493759488,
25
+ "loss": 7.5679,
26
+ "grad_norm": 243.47679138183594,
27
+ "learning_rate": 2.9999999999999997e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.02711864406779661,
32
+ "cpu_mem": 1.495773184,
33
+ "gpu_mem": 4.493759488,
34
+ "loss": 4.959,
35
+ "grad_norm": 228.1814727783203,
36
+ "learning_rate": 3.9999999999999996e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.03389830508474576,
41
+ "cpu_mem": 1.496363008,
42
+ "gpu_mem": 4.493694976,
43
+ "loss": 2.537,
44
+ "grad_norm": 137.45384216308594,
45
+ "learning_rate": 4.9999999999999996e-05
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.04067796610169491,
50
+ "cpu_mem": 1.496952832,
51
+ "gpu_mem": 4.493714944,
52
+ "loss": 1.4387,
53
+ "grad_norm": 56.679893493652344,
54
+ "learning_rate": 5.9999999999999995e-05
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.04745762711864407,
59
+ "cpu_mem": 1.497542656,
60
+ "gpu_mem": 4.493767168,
61
+ "loss": 0.8578,
62
+ "grad_norm": 21.124313354492188,
63
+ "learning_rate": 7e-05
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.05423728813559322,
68
+ "cpu_mem": 1.497935872,
69
+ "gpu_mem": 4.493853184,
70
+ "loss": 0.6193,
71
+ "grad_norm": 10.238547325134277,
72
+ "learning_rate": 7.999999999999999e-05
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.061016949152542375,
77
+ "cpu_mem": 1.498329088,
78
+ "gpu_mem": 4.493761024,
79
+ "loss": 0.6998,
80
+ "grad_norm": 18.19664764404297,
81
+ "learning_rate": 8.999999999999999e-05
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.06779661016949153,
86
+ "cpu_mem": 1.498918912,
87
+ "gpu_mem": 4.493661184,
88
+ "loss": 1.6846,
89
+ "grad_norm": 192.40335083007812,
90
+ "learning_rate": 9.999999999999999e-05
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.07457627118644068,
95
+ "cpu_mem": 1.499312128,
96
+ "gpu_mem": 4.493765632,
97
+ "loss": 1.0755,
98
+ "grad_norm": 123.23554229736328,
99
+ "learning_rate": 0.00010999999999999998
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.08135593220338982,
104
+ "cpu_mem": 1.499705344,
105
+ "gpu_mem": 4.494137344,
106
+ "loss": 1.2442,
107
+ "grad_norm": 281.9166259765625,
108
+ "learning_rate": 0.00011999999999999999
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.08813559322033898,
113
+ "cpu_mem": 1.50009856,
114
+ "gpu_mem": 4.493741056,
115
+ "loss": 1.585,
116
+ "grad_norm": 92.95726013183594,
117
+ "learning_rate": 0.00013
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.09491525423728814,
122
+ "cpu_mem": 1.500688384,
123
+ "gpu_mem": 4.493718016,
124
+ "loss": 0.7672,
125
+ "grad_norm": 36.01921081542969,
126
+ "learning_rate": 0.00014
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.1016949152542373,
131
+ "cpu_mem": 1.500884992,
132
+ "gpu_mem": 4.493656576,
133
+ "loss": 1.018,
134
+ "grad_norm": 49.682037353515625,
135
+ "learning_rate": 0.00015
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.10847457627118644,
140
+ "cpu_mem": 1.501278208,
141
+ "gpu_mem": 4.493741056,
142
+ "loss": 0.9599,
143
+ "grad_norm": 113.08747100830078,
144
+ "learning_rate": 0.00015999999999999999
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.1152542372881356,
149
+ "cpu_mem": 1.501671424,
150
+ "gpu_mem": 4.493780992,
151
+ "loss": 0.6904,
152
+ "grad_norm": 5.700827598571777,
153
+ "learning_rate": 0.00016999999999999999
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.12203389830508475,
158
+ "cpu_mem": 1.50206464,
159
+ "gpu_mem": 4.493843968,
160
+ "loss": 1.0158,
161
+ "grad_norm": 47.9433479309082,
162
+ "learning_rate": 0.00017999999999999998
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.1288135593220339,
167
+ "cpu_mem": 1.502261248,
168
+ "gpu_mem": 4.493681152,
169
+ "loss": 1.2045,
170
+ "grad_norm": 46.0986213684082,
171
+ "learning_rate": 0.00018999999999999998
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.13559322033898305,
176
+ "cpu_mem": 1.502654464,
177
+ "gpu_mem": 4.49379328,
178
+ "loss": 0.617,
179
+ "grad_norm": 4.902522087097168,
180
+ "learning_rate": 0.00019999999999999998
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.1423728813559322,
185
+ "cpu_mem": 1.50304768,
186
+ "gpu_mem": 4.493951488,
187
+ "loss": 0.6933,
188
+ "grad_norm": 14.804486274719238,
189
+ "learning_rate": 0.00020999999999999998
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.14915254237288136,
194
+ "cpu_mem": 1.503244288,
195
+ "gpu_mem": 4.493843968,
196
+ "loss": 0.7679,
197
+ "grad_norm": 14.584829330444336,
198
+ "learning_rate": 0.00021999999999999995
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.15593220338983052,
203
+ "cpu_mem": 1.503440896,
204
+ "gpu_mem": 4.49381632,
205
+ "loss": 0.6614,
206
+ "grad_norm": 6.266756057739258,
207
+ "learning_rate": 0.00023
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.16271186440677965,
212
+ "cpu_mem": 1.503834112,
213
+ "gpu_mem": 4.493873152,
214
+ "loss": 0.6063,
215
+ "grad_norm": 5.272337913513184,
216
+ "learning_rate": 0.00023999999999999998
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.1694915254237288,
221
+ "cpu_mem": 1.504227328,
222
+ "gpu_mem": 4.493658112,
223
+ "loss": 0.7254,
224
+ "grad_norm": 11.210253715515137,
225
+ "learning_rate": 0.00025
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.17627118644067796,
230
+ "cpu_mem": 1.504423936,
231
+ "gpu_mem": 4.493713408,
232
+ "loss": 0.7634,
233
+ "grad_norm": 18.455121994018555,
234
+ "learning_rate": 0.00026
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.18305084745762712,
239
+ "cpu_mem": 1.504620544,
240
+ "gpu_mem": 4.494005248,
241
+ "loss": 0.7479,
242
+ "grad_norm": 19.921911239624023,
243
+ "learning_rate": 0.00027
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.18983050847457628,
248
+ "cpu_mem": 1.50501376,
249
+ "gpu_mem": 4.493684224,
250
+ "loss": 0.845,
251
+ "grad_norm": 19.626916885375977,
252
+ "learning_rate": 0.00028
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.19661016949152543,
257
+ "cpu_mem": 1.505210368,
258
+ "gpu_mem": 4.493748736,
259
+ "loss": 0.678,
260
+ "grad_norm": 8.194727897644043,
261
+ "learning_rate": 0.00029
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.2033898305084746,
266
+ "cpu_mem": 1.505406976,
267
+ "gpu_mem": 4.493827072,
268
+ "loss": 0.7646,
269
+ "grad_norm": 10.00369644165039,
270
+ "learning_rate": 0.0003
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.21016949152542372,
275
+ "cpu_mem": 1.505603584,
276
+ "gpu_mem": 4.493630464,
277
+ "loss": 0.6244,
278
+ "grad_norm": 6.776846885681152,
279
+ "learning_rate": 0.0002999893794250036
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.21694915254237288,
284
+ "cpu_mem": 1.505800192,
285
+ "gpu_mem": 4.493744128,
286
+ "loss": 0.7586,
287
+ "grad_norm": 16.354310989379883,
288
+ "learning_rate": 0.00029995751920396937
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.22372881355932203,
293
+ "cpu_mem": 1.506193408,
294
+ "gpu_mem": 4.493982208,
295
+ "loss": 0.7397,
296
+ "grad_norm": 12.160492897033691,
297
+ "learning_rate": 0.00029990442384854874
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.2305084745762712,
302
+ "cpu_mem": 1.506390016,
303
+ "gpu_mem": 4.493684224,
304
+ "loss": 0.6239,
305
+ "grad_norm": 9.15272331237793,
306
+ "learning_rate": 0.0002998301008774512
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.23728813559322035,
311
+ "cpu_mem": 1.506586624,
312
+ "gpu_mem": 4.493894656,
313
+ "loss": 0.6729,
314
+ "grad_norm": 5.571009635925293,
315
+ "learning_rate": 0.0002997345608153792
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 0.2440677966101695,
320
+ "cpu_mem": 1.506783232,
321
+ "gpu_mem": 4.493845504,
322
+ "loss": 0.7223,
323
+ "grad_norm": 25.372941970825195,
324
+ "learning_rate": 0.000299617817191538
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 0.25084745762711863,
329
+ "cpu_mem": 1.50697984,
330
+ "gpu_mem": 4.493656576,
331
+ "loss": 0.8172,
332
+ "grad_norm": 16.308820724487305,
333
+ "learning_rate": 0.0002994798865377198
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 0.2576271186440678,
338
+ "cpu_mem": 1.507373056,
339
+ "gpu_mem": 4.493903872,
340
+ "loss": 0.6857,
341
+ "grad_norm": 4.158070087432861,
342
+ "learning_rate": 0.0002993207883859627
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 0.26440677966101694,
347
+ "cpu_mem": 1.507569664,
348
+ "gpu_mem": 4.494283264,
349
+ "loss": 0.664,
350
+ "grad_norm": 3.092892646789551,
351
+ "learning_rate": 0.0002991405452657846
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 0.2711864406779661,
356
+ "cpu_mem": 1.507569664,
357
+ "gpu_mem": 4.493853184,
358
+ "loss": 0.6499,
359
+ "grad_norm": 4.837502956390381,
360
+ "learning_rate": 0.00029893918270099324
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 0.27796610169491526,
365
+ "cpu_mem": 1.507766272,
366
+ "gpu_mem": 4.494080512,
367
+ "loss": 0.6745,
368
+ "grad_norm": 2.230825424194336,
369
+ "learning_rate": 0.00029871672920607153
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 0.2847457627118644,
374
+ "cpu_mem": 1.507766272,
375
+ "gpu_mem": 4.4939776,
376
+ "loss": 0.6285,
377
+ "grad_norm": 6.971991539001465,
378
+ "learning_rate": 0.0002984732162821399
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 0.29152542372881357,
383
+ "cpu_mem": 1.50796288,
384
+ "gpu_mem": 4.493799424,
385
+ "loss": 0.6839,
386
+ "grad_norm": 7.030607223510742,
387
+ "learning_rate": 0.0002982086784124952
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 0.2983050847457627,
392
+ "cpu_mem": 1.508159488,
393
+ "gpu_mem": 4.493942272,
394
+ "loss": 0.6815,
395
+ "grad_norm": 9.800080299377441,
396
+ "learning_rate": 0.00029792315305772796
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 0.3050847457627119,
401
+ "cpu_mem": 1.508356096,
402
+ "gpu_mem": 4.493722624,
403
+ "loss": 1.188,
404
+ "grad_norm": 21.56556510925293,
405
+ "learning_rate": 0.0002976166806504174
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 0.31186440677966104,
410
+ "cpu_mem": 1.508552704,
411
+ "gpu_mem": 4.493965312,
412
+ "loss": 0.7325,
413
+ "grad_norm": 7.097214221954346,
414
+ "learning_rate": 0.00029728930458940595
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 0.31864406779661014,
419
+ "cpu_mem": 1.508552704,
420
+ "gpu_mem": 4.493688832,
421
+ "loss": 0.686,
422
+ "grad_norm": 10.250021934509277,
423
+ "learning_rate": 0.00029694107123365385
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 0.3254237288135593,
428
+ "cpu_mem": 1.508749312,
429
+ "gpu_mem": 4.493765632,
430
+ "loss": 0.8689,
431
+ "grad_norm": 17.215639114379883,
432
+ "learning_rate": 0.00029657202989567393
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 0.33220338983050846,
437
+ "cpu_mem": 1.50894592,
438
+ "gpu_mem": 4.493782528,
439
+ "loss": 0.7259,
440
+ "grad_norm": 3.048801898956299,
441
+ "learning_rate": 0.00029618223283454893
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 0.3389830508474576,
446
+ "cpu_mem": 1.509142528,
447
+ "gpu_mem": 4.493721088,
448
+ "loss": 0.6172,
449
+ "grad_norm": 2.2575485706329346,
450
+ "learning_rate": 0.00029577173524853123
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 0.34576271186440677,
455
+ "cpu_mem": 1.509142528,
456
+ "gpu_mem": 4.493725696,
457
+ "loss": 0.5723,
458
+ "grad_norm": 2.044959306716919,
459
+ "learning_rate": 0.0002953405952672261
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 0.3525423728813559,
464
+ "cpu_mem": 1.509339136,
465
+ "gpu_mem": 4.493805568,
466
+ "loss": 0.6726,
467
+ "grad_norm": 4.235073089599609,
468
+ "learning_rate": 0.0002948888739433602
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 0.3593220338983051,
473
+ "cpu_mem": 1.509339136,
474
+ "gpu_mem": 4.493828608,
475
+ "loss": 0.6754,
476
+ "grad_norm": 10.029523849487305,
477
+ "learning_rate": 0.0002944166352441363
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 0.36610169491525424,
482
+ "cpu_mem": 1.509535744,
483
+ "gpu_mem": 4.493756416,
484
+ "loss": 0.6683,
485
+ "grad_norm": 4.766758918762207,
486
+ "learning_rate": 0.0002939239460421746
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 0.3728813559322034,
491
+ "cpu_mem": 1.509535744,
492
+ "gpu_mem": 4.494026752,
493
+ "loss": 0.6831,
494
+ "grad_norm": 3.753432273864746,
495
+ "learning_rate": 0.00029341087610604337
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 0.37966101694915255,
500
+ "cpu_mem": 1.509732352,
501
+ "gpu_mem": 4.493813248,
502
+ "loss": 0.926,
503
+ "grad_norm": 12.049140930175781,
504
+ "learning_rate": 0.00029287749809037904
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 0.3864406779661017,
509
+ "cpu_mem": 1.509732352,
510
+ "gpu_mem": 4.493807104,
511
+ "loss": 0.7591,
512
+ "grad_norm": 7.700575351715088,
513
+ "learning_rate": 0.0002923238875255979
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 0.39322033898305087,
518
+ "cpu_mem": 1.50992896,
519
+ "gpu_mem": 4.493702656,
520
+ "loss": 0.6608,
521
+ "grad_norm": 2.9501571655273438,
522
+ "learning_rate": 0.00029175012280720024
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 0.4,
527
+ "cpu_mem": 1.50992896,
528
+ "gpu_mem": 4.493719552,
529
+ "loss": 0.7055,
530
+ "grad_norm": 4.570174217224121,
531
+ "learning_rate": 0.000291156285184669
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 0.4067796610169492,
536
+ "cpu_mem": 1.510125568,
537
+ "gpu_mem": 4.493813248,
538
+ "loss": 0.7155,
539
+ "grad_norm": 10.250066757202148,
540
+ "learning_rate": 0.00029054245874996426
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 0.4135593220338983,
545
+ "cpu_mem": 1.510125568,
546
+ "gpu_mem": 4.493824,
547
+ "loss": 0.6198,
548
+ "grad_norm": 1.4150381088256836,
549
+ "learning_rate": 0.0002899087304256151
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 0.42033898305084744,
554
+ "cpu_mem": 1.510125568,
555
+ "gpu_mem": 4.493811712,
556
+ "loss": 0.8658,
557
+ "grad_norm": 11.572601318359375,
558
+ "learning_rate": 0.0002892551899524109
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 0.4271186440677966,
563
+ "cpu_mem": 1.510125568,
564
+ "gpu_mem": 4.493804032,
565
+ "loss": 0.5689,
566
+ "grad_norm": 2.9226982593536377,
567
+ "learning_rate": 0.000288581929876693
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 0.43389830508474575,
572
+ "cpu_mem": 1.510518784,
573
+ "gpu_mem": 4.493733376,
574
+ "loss": 0.6423,
575
+ "grad_norm": 1.547162413597107,
576
+ "learning_rate": 0.0002878890455372498
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 0.4406779661016949,
581
+ "cpu_mem": 1.510715392,
582
+ "gpu_mem": 4.49377792,
583
+ "loss": 0.6797,
584
+ "grad_norm": 3.6416873931884766,
585
+ "learning_rate": 0.0002871766350518159
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 0.44745762711864406,
590
+ "cpu_mem": 1.510715392,
591
+ "gpu_mem": 4.493971456,
592
+ "loss": 0.6036,
593
+ "grad_norm": 3.8238625526428223,
594
+ "learning_rate": 0.00028644479930317775
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 0.4542372881355932,
599
+ "cpu_mem": 1.510715392,
600
+ "gpu_mem": 4.493681152,
601
+ "loss": 0.7277,
602
+ "grad_norm": 8.09846019744873,
603
+ "learning_rate": 0.00028569364192488803
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 0.4610169491525424,
608
+ "cpu_mem": 1.510912,
609
+ "gpu_mem": 4.493648896,
610
+ "loss": 0.8994,
611
+ "grad_norm": 13.207178115844727,
612
+ "learning_rate": 0.00028492326928659045
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 0.46779661016949153,
617
+ "cpu_mem": 1.510912,
618
+ "gpu_mem": 4.493714944,
619
+ "loss": 0.5864,
620
+ "grad_norm": 1.9375393390655518,
621
+ "learning_rate": 0.00028413379047895665
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 0.4745762711864407,
626
+ "cpu_mem": 1.510912,
627
+ "gpu_mem": 4.4937088,
628
+ "loss": 0.7792,
629
+ "grad_norm": 10.860440254211426,
630
+ "learning_rate": 0.0002833253172982385
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 0.48135593220338985,
635
+ "cpu_mem": 1.511108608,
636
+ "gpu_mem": 4.493937664,
637
+ "loss": 0.8567,
638
+ "grad_norm": 15.644757270812988,
639
+ "learning_rate": 0.0002824979642304366
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 0.488135593220339,
644
+ "cpu_mem": 1.511108608,
645
+ "gpu_mem": 4.493929984,
646
+ "loss": 0.6608,
647
+ "grad_norm": 6.629893779754639,
648
+ "learning_rate": 0.0002816518484350883
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 0.49491525423728816,
653
+ "cpu_mem": 1.511108608,
654
+ "gpu_mem": 4.493896192,
655
+ "loss": 0.8023,
656
+ "grad_norm": 10.461833953857422,
657
+ "learning_rate": 0.0002807870897286772
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 0.5016949152542373,
662
+ "cpu_mem": 1.511305216,
663
+ "gpu_mem": 4.493756416,
664
+ "loss": 0.6224,
665
+ "grad_norm": 7.1698713302612305,
666
+ "learning_rate": 0.0002799038105676658
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 0.5084745762711864,
671
+ "cpu_mem": 1.511305216,
672
+ "gpu_mem": 4.493681152,
673
+ "loss": 0.5662,
674
+ "grad_norm": 2.1262013912200928,
675
+ "learning_rate": 0.000279002136031155
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 0.5152542372881356,
680
+ "cpu_mem": 1.511305216,
681
+ "gpu_mem": 4.493621248,
682
+ "loss": 0.6138,
683
+ "grad_norm": 3.743492841720581,
684
+ "learning_rate": 0.00027808219380317216
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 0.5220338983050847,
689
+ "cpu_mem": 1.511305216,
690
+ "gpu_mem": 4.493694976,
691
+ "loss": 0.6539,
692
+ "grad_norm": 6.320612907409668,
693
+ "learning_rate": 0.0002771441141545895
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 0.5288135593220339,
698
+ "cpu_mem": 1.511305216,
699
+ "gpu_mem": 4.4937472,
700
+ "loss": 0.7095,
701
+ "grad_norm": 5.929784774780273,
702
+ "learning_rate": 0.0002761880299246772
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 0.535593220338983,
707
+ "cpu_mem": 1.511305216,
708
+ "gpu_mem": 4.493879296,
709
+ "loss": 0.6698,
710
+ "grad_norm": 1.6754264831542969,
711
+ "learning_rate": 0.000275214076502292
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 0.5423728813559322,
716
+ "cpu_mem": 1.511305216,
717
+ "gpu_mem": 4.49377024,
718
+ "loss": 0.6258,
719
+ "grad_norm": 1.8663870096206665,
720
+ "learning_rate": 0.0002742223918067056
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 0.5491525423728814,
725
+ "cpu_mem": 1.511305216,
726
+ "gpu_mem": 4.493650432,
727
+ "loss": 0.6314,
728
+ "grad_norm": 2.4315168857574463,
729
+ "learning_rate": 0.00027321311626807374
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 0.5559322033898305,
734
+ "cpu_mem": 1.511305216,
735
+ "gpu_mem": 4.493719552,
736
+ "loss": 0.635,
737
+ "grad_norm": 1.932876467704773,
738
+ "learning_rate": 0.0002721863928075503
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 0.5627118644067797,
743
+ "cpu_mem": 1.511305216,
744
+ "gpu_mem": 4.493819392,
745
+ "loss": 0.6554,
746
+ "grad_norm": 3.4201409816741943,
747
+ "learning_rate": 0.000271142366817049
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 0.5694915254237288,
752
+ "cpu_mem": 1.511501824,
753
+ "gpu_mem": 4.493782528,
754
+ "loss": 0.7156,
755
+ "grad_norm": 7.90298318862915,
756
+ "learning_rate": 0.00027008118613865406
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 0.576271186440678,
761
+ "cpu_mem": 1.511698432,
762
+ "gpu_mem": 4.493814784,
763
+ "loss": 0.5996,
764
+ "grad_norm": 3.2398500442504883,
765
+ "learning_rate": 0.00026900300104368524
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 0.5830508474576271,
770
+ "cpu_mem": 1.511698432,
771
+ "gpu_mem": 4.493765632,
772
+ "loss": 0.731,
773
+ "grad_norm": 5.571208953857422,
774
+ "learning_rate": 0.00026790796421141813
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 0.5898305084745763,
779
+ "cpu_mem": 1.511698432,
780
+ "gpu_mem": 4.493773312,
781
+ "loss": 0.7147,
782
+ "grad_norm": 7.965809345245361,
783
+ "learning_rate": 0.00026679623070746325
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 0.5966101694915255,
788
+ "cpu_mem": 1.511698432,
789
+ "gpu_mem": 4.493917696,
790
+ "loss": 0.5825,
791
+ "grad_norm": 2.4796321392059326,
792
+ "learning_rate": 0.0002656679579618081
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 0.6033898305084746,
797
+ "cpu_mem": 1.511698432,
798
+ "gpu_mem": 4.493699584,
799
+ "loss": 0.6858,
800
+ "grad_norm": 3.1727724075317383,
801
+ "learning_rate": 0.0002645233057465235
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 0.6101694915254238,
806
+ "cpu_mem": 1.511698432,
807
+ "gpu_mem": 4.493753344,
808
+ "loss": 0.7181,
809
+ "grad_norm": 8.902009010314941,
810
+ "learning_rate": 0.00026336243615313873
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 0.6169491525423729,
815
+ "cpu_mem": 1.511698432,
816
+ "gpu_mem": 4.493721088,
817
+ "loss": 0.7394,
818
+ "grad_norm": 10.157062530517578,
819
+ "learning_rate": 0.00026218551356968814
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 0.6237288135593221,
824
+ "cpu_mem": 1.511698432,
825
+ "gpu_mem": 4.493802496,
826
+ "loss": 0.6819,
827
+ "grad_norm": 2.2884974479675293,
828
+ "learning_rate": 0.00026099270465743254
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 0.6305084745762712,
833
+ "cpu_mem": 1.511698432,
834
+ "gpu_mem": 4.493605888,
835
+ "loss": 0.7171,
836
+ "grad_norm": 3.9411673545837402,
837
+ "learning_rate": 0.0002597841783272588
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 0.6372881355932203,
842
+ "cpu_mem": 1.511698432,
843
+ "gpu_mem": 4.493719552,
844
+ "loss": 0.5627,
845
+ "grad_norm": 2.8555171489715576,
846
+ "learning_rate": 0.0002585601057157605
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 0.6440677966101694,
851
+ "cpu_mem": 1.511698432,
852
+ "gpu_mem": 4.49373952,
853
+ "loss": 0.6865,
854
+ "grad_norm": 5.10888147354126,
855
+ "learning_rate": 0.00025732066016100394
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 0.6508474576271186,
860
+ "cpu_mem": 1.511698432,
861
+ "gpu_mem": 4.49377792,
862
+ "loss": 0.631,
863
+ "grad_norm": 4.621267318725586,
864
+ "learning_rate": 0.00025606601717798207
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 0.6576271186440678,
869
+ "cpu_mem": 1.51189504,
870
+ "gpu_mem": 4.49376256,
871
+ "loss": 0.6521,
872
+ "grad_norm": 3.2591891288757324,
873
+ "learning_rate": 0.0002547963544337602
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 0.6644067796610169,
878
+ "cpu_mem": 1.51189504,
879
+ "gpu_mem": 4.493675008,
880
+ "loss": 0.6631,
881
+ "grad_norm": 3.0139002799987793,
882
+ "learning_rate": 0.0002535118517223168
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 0.6711864406779661,
887
+ "cpu_mem": 1.51189504,
888
+ "gpu_mem": 4.49362432,
889
+ "loss": 0.6299,
890
+ "grad_norm": 2.386324405670166,
891
+ "learning_rate": 0.00025221269093908365
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 0.6779661016949152,
896
+ "cpu_mem": 1.51189504,
897
+ "gpu_mem": 4.493741056,
898
+ "loss": 0.6204,
899
+ "grad_norm": 1.986992359161377,
900
+ "learning_rate": 0.0002508990560551879
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 0.6847457627118644,
905
+ "cpu_mem": 1.51189504,
906
+ "gpu_mem": 4.493773312,
907
+ "loss": 0.7108,
908
+ "grad_norm": 6.049999237060547,
909
+ "learning_rate": 0.0002495711330914001
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 0.6915254237288135,
914
+ "cpu_mem": 1.51189504,
915
+ "gpu_mem": 4.493807104,
916
+ "loss": 0.6755,
917
+ "grad_norm": 5.4609575271606445,
918
+ "learning_rate": 0.00024822911009179276
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 0.6983050847457627,
923
+ "cpu_mem": 1.51189504,
924
+ "gpu_mem": 4.493857792,
925
+ "loss": 0.6327,
926
+ "grad_norm": 2.271766185760498,
927
+ "learning_rate": 0.0002468731770971113
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 0.7050847457627119,
932
+ "cpu_mem": 1.51189504,
933
+ "gpu_mem": 4.49376256,
934
+ "loss": 0.6587,
935
+ "grad_norm": 3.347034454345703,
936
+ "learning_rate": 0.0002455035261178632
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 0.711864406779661,
941
+ "cpu_mem": 1.51189504,
942
+ "gpu_mem": 4.493863936,
943
+ "loss": 0.6336,
944
+ "grad_norm": 4.408857822418213,
945
+ "learning_rate": 0.0002441203511071278
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 0.7186440677966102,
950
+ "cpu_mem": 1.51189504,
951
+ "gpu_mem": 4.493814784,
952
+ "loss": 0.6127,
953
+ "grad_norm": 4.522818088531494,
954
+ "learning_rate": 0.00024272384793309077
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 0.7254237288135593,
959
+ "cpu_mem": 1.51189504,
960
+ "gpu_mem": 4.493702656,
961
+ "loss": 0.5933,
962
+ "grad_norm": 1.8976800441741943,
963
+ "learning_rate": 0.00024131421435130807
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 0.7322033898305085,
968
+ "cpu_mem": 1.51189504,
969
+ "gpu_mem": 4.493886976,
970
+ "loss": 0.6641,
971
+ "grad_norm": 3.7306277751922607,
972
+ "learning_rate": 0.00023989164997670202
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 0.7389830508474576,
977
+ "cpu_mem": 1.512091648,
978
+ "gpu_mem": 4.493741056,
979
+ "loss": 0.7569,
980
+ "grad_norm": 5.521296501159668,
981
+ "learning_rate": 0.0002384563562552943
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 0.7457627118644068,
986
+ "cpu_mem": 1.512091648,
987
+ "gpu_mem": 4.493744128,
988
+ "loss": 0.6737,
989
+ "grad_norm": 4.360266208648682,
990
+ "learning_rate": 0.0002370085364356797
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 0.752542372881356,
995
+ "cpu_mem": 1.512091648,
996
+ "gpu_mem": 4.493713408,
997
+ "loss": 0.5683,
998
+ "grad_norm": 2.3229658603668213,
999
+ "learning_rate": 0.0002355483955402446
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 0.7593220338983051,
1004
+ "cpu_mem": 1.512091648,
1005
+ "gpu_mem": 4.493759488,
1006
+ "loss": 0.6631,
1007
+ "grad_norm": 6.712233543395996,
1008
+ "learning_rate": 0.00023407614033613407
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 0.7661016949152543,
1013
+ "cpu_mem": 1.512091648,
1014
+ "gpu_mem": 4.493750272,
1015
+ "loss": 0.7655,
1016
+ "grad_norm": 6.895766258239746,
1017
+ "learning_rate": 0.0002325919793059723
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 0.7728813559322034,
1022
+ "cpu_mem": 1.512091648,
1023
+ "gpu_mem": 4.49373184,
1024
+ "loss": 0.7358,
1025
+ "grad_norm": 8.146341323852539,
1026
+ "learning_rate": 0.00023109612261833963
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 0.7796610169491526,
1031
+ "cpu_mem": 1.512091648,
1032
+ "gpu_mem": 4.493807104,
1033
+ "loss": 0.601,
1034
+ "grad_norm": 3.9480652809143066,
1035
+ "learning_rate": 0.0002295887820980112
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 0.7864406779661017,
1040
+ "cpu_mem": 1.512091648,
1041
+ "gpu_mem": 4.493727232,
1042
+ "loss": 0.5963,
1043
+ "grad_norm": 2.6513514518737793,
1044
+ "learning_rate": 0.0002280701711959608
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 0.7932203389830509,
1049
+ "cpu_mem": 1.512091648,
1050
+ "gpu_mem": 4.493618176,
1051
+ "loss": 0.7067,
1052
+ "grad_norm": 5.624129772186279,
1053
+ "learning_rate": 0.00022654050495913495
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 0.8,
1058
+ "cpu_mem": 1.512091648,
1059
+ "gpu_mem": 4.493856256,
1060
+ "loss": 0.7582,
1061
+ "grad_norm": 6.480310916900635,
1062
+ "learning_rate": 0.000225
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 0.8067796610169492,
1067
+ "cpu_mem": 1.512091648,
1068
+ "gpu_mem": 4.494026752,
1069
+ "loss": 0.5894,
1070
+ "grad_norm": 2.979290723800659,
1071
+ "learning_rate": 0.00022344887446586865
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 0.8135593220338984,
1076
+ "cpu_mem": 1.512091648,
1077
+ "gpu_mem": 4.493759488,
1078
+ "loss": 0.5885,
1079
+ "grad_norm": 1.9377977848052979,
1080
+ "learning_rate": 0.00022188734800800852
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 0.8203389830508474,
1085
+ "cpu_mem": 1.512091648,
1086
+ "gpu_mem": 4.493787136,
1087
+ "loss": 0.6238,
1088
+ "grad_norm": 3.6376171112060547,
1089
+ "learning_rate": 0.00022031564175053754
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 0.8271186440677966,
1094
+ "cpu_mem": 1.512091648,
1095
+ "gpu_mem": 4.493837824,
1096
+ "loss": 0.5311,
1097
+ "grad_norm": 4.018235206604004,
1098
+ "learning_rate": 0.00021873397825911153
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 0.8338983050847457,
1103
+ "cpu_mem": 1.512091648,
1104
+ "gpu_mem": 4.49364736,
1105
+ "loss": 0.6679,
1106
+ "grad_norm": 5.683865070343018,
1107
+ "learning_rate": 0.00021714258150940685
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 0.8406779661016949,
1112
+ "cpu_mem": 1.512091648,
1113
+ "gpu_mem": 4.494089728,
1114
+ "loss": 0.6208,
1115
+ "grad_norm": 5.339485168457031,
1116
+ "learning_rate": 0.0002155416768554039
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 0.847457627118644,
1121
+ "cpu_mem": 1.512091648,
1122
+ "gpu_mem": 4.49381632,
1123
+ "loss": 0.5845,
1124
+ "grad_norm": 3.5815553665161133,
1125
+ "learning_rate": 0.00021393149099747523
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 0.8542372881355932,
1130
+ "cpu_mem": 1.512091648,
1131
+ "gpu_mem": 4.493699584,
1132
+ "loss": 0.5642,
1133
+ "grad_norm": 4.037660121917725,
1134
+ "learning_rate": 0.00021231225195028297
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 0.8610169491525423,
1139
+ "cpu_mem": 1.512288256,
1140
+ "gpu_mem": 4.49413888,
1141
+ "loss": 0.6483,
1142
+ "grad_norm": 5.6473846435546875,
1143
+ "learning_rate": 0.00021068418901049025
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 0.8677966101694915,
1148
+ "cpu_mem": 1.512288256,
1149
+ "gpu_mem": 4.493914624,
1150
+ "loss": 0.5481,
1151
+ "grad_norm": 3.1490492820739746,
1152
+ "learning_rate": 0.0002090475327242912
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 0.8745762711864407,
1157
+ "cpu_mem": 1.512288256,
1158
+ "gpu_mem": 4.49395456,
1159
+ "loss": 0.6907,
1160
+ "grad_norm": 3.3728561401367188,
1161
+ "learning_rate": 0.00020740251485476345
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 0.8813559322033898,
1166
+ "cpu_mem": 1.512288256,
1167
+ "gpu_mem": 4.493736448,
1168
+ "loss": 0.6776,
1169
+ "grad_norm": 2.8839058876037598,
1170
+ "learning_rate": 0.0002057493683490491
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 0.888135593220339,
1175
+ "cpu_mem": 1.512288256,
1176
+ "gpu_mem": 4.493865472,
1177
+ "loss": 0.6277,
1178
+ "grad_norm": 2.4115381240844727,
1179
+ "learning_rate": 0.00020408832730536746
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 0.8949152542372881,
1184
+ "cpu_mem": 1.512288256,
1185
+ "gpu_mem": 4.49394688,
1186
+ "loss": 0.6367,
1187
+ "grad_norm": 3.60898494720459,
1188
+ "learning_rate": 0.00020241962693986476
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 0.9016949152542373,
1193
+ "cpu_mem": 1.512288256,
1194
+ "gpu_mem": 4.493730304,
1195
+ "loss": 0.5515,
1196
+ "grad_norm": 2.1373813152313232,
1197
+ "learning_rate": 0.0002007435035533061
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 0.9084745762711864,
1202
+ "cpu_mem": 1.512288256,
1203
+ "gpu_mem": 4.493863936,
1204
+ "loss": 0.5556,
1205
+ "grad_norm": 2.2262206077575684,
1206
+ "learning_rate": 0.00019906019449761325
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 0.9152542372881356,
1211
+ "cpu_mem": 1.512288256,
1212
+ "gpu_mem": 4.493886976,
1213
+ "loss": 0.6743,
1214
+ "grad_norm": 6.190613746643066,
1215
+ "learning_rate": 0.00019736993814225374
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 0.9220338983050848,
1220
+ "cpu_mem": 1.512288256,
1221
+ "gpu_mem": 4.49372416,
1222
+ "loss": 0.601,
1223
+ "grad_norm": 4.490257263183594,
1224
+ "learning_rate": 0.00019567297384048604
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 0.9288135593220339,
1229
+ "cpu_mem": 1.512288256,
1230
+ "gpu_mem": 4.493604352,
1231
+ "loss": 0.6619,
1232
+ "grad_norm": 4.613885402679443,
1233
+ "learning_rate": 0.0001939695418954653
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 0.9355932203389831,
1238
+ "cpu_mem": 1.512288256,
1239
+ "gpu_mem": 4.4937856,
1240
+ "loss": 0.5927,
1241
+ "grad_norm": 2.2556755542755127,
1242
+ "learning_rate": 0.00019225988352621445
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 0.9423728813559322,
1247
+ "cpu_mem": 1.512288256,
1248
+ "gpu_mem": 4.493684224,
1249
+ "loss": 0.6136,
1250
+ "grad_norm": 3.3856916427612305,
1251
+ "learning_rate": 0.00019054424083346592
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 0.9491525423728814,
1256
+ "cpu_mem": 1.512288256,
1257
+ "gpu_mem": 4.493736448,
1258
+ "loss": 0.6362,
1259
+ "grad_norm": 5.717785835266113,
1260
+ "learning_rate": 0.0001888228567653781
1261
+ },
1262
+ {
1263
+ "step": 141,
1264
+ "epoch": 0.9559322033898305,
1265
+ "cpu_mem": 1.512288256,
1266
+ "gpu_mem": 4.493768704,
1267
+ "loss": 0.6383,
1268
+ "grad_norm": 5.319495677947998,
1269
+ "learning_rate": 0.0001870959750831323
1270
+ },
1271
+ {
1272
+ "step": 142,
1273
+ "epoch": 0.9627118644067797,
1274
+ "cpu_mem": 1.512288256,
1275
+ "gpu_mem": 4.49390848,
1276
+ "loss": 0.5894,
1277
+ "grad_norm": 4.640230178833008,
1278
+ "learning_rate": 0.0001853638403264141
1279
+ },
1280
+ {
1281
+ "step": 143,
1282
+ "epoch": 0.9694915254237289,
1283
+ "cpu_mem": 1.512288256,
1284
+ "gpu_mem": 4.493891584,
1285
+ "loss": 0.6378,
1286
+ "grad_norm": 3.016573905944824,
1287
+ "learning_rate": 0.00018362669777878453
1288
+ },
1289
+ {
1290
+ "step": 144,
1291
+ "epoch": 0.976271186440678,
1292
+ "cpu_mem": 1.512288256,
1293
+ "gpu_mem": 4.494083584,
1294
+ "loss": 0.6572,
1295
+ "grad_norm": 4.5237603187561035,
1296
+ "learning_rate": 0.00018188479343294648
1297
+ },
1298
+ {
1299
+ "step": 145,
1300
+ "epoch": 0.9830508474576272,
1301
+ "cpu_mem": 1.512288256,
1302
+ "gpu_mem": 4.493794816,
1303
+ "loss": 0.5759,
1304
+ "grad_norm": 3.0536630153656006,
1305
+ "learning_rate": 0.0001801383739559098
1306
+ },
1307
+ {
1308
+ "step": 146,
1309
+ "epoch": 0.9898305084745763,
1310
+ "cpu_mem": 1.512288256,
1311
+ "gpu_mem": 4.493830144,
1312
+ "loss": 0.6313,
1313
+ "grad_norm": 3.5982203483581543,
1314
+ "learning_rate": 0.0001783876866540615
1315
+ },
1316
+ {
1317
+ "step": 147,
1318
+ "epoch": 0.9966101694915255,
1319
+ "cpu_mem": 1.512288256,
1320
+ "gpu_mem": 4.493728768,
1321
+ "loss": 0.5625,
1322
+ "grad_norm": 3.026538133621216,
1323
+ "learning_rate": 0.00017663297943814552
1324
+ },
1325
+ {
1326
+ "step": 148,
1327
+ "epoch": 1.0033898305084745,
1328
+ "cpu_mem": 1.512288256,
1329
+ "gpu_mem": 4.519203328,
1330
+ "loss": 0.884,
1331
+ "grad_norm": 3.9491984844207764,
1332
+ "learning_rate": 0.0001748745007881561
1333
+ },
1334
+ {
1335
+ "step": 149,
1336
+ "epoch": 1.0101694915254238,
1337
+ "cpu_mem": 1.512288256,
1338
+ "gpu_mem": 4.519138816,
1339
+ "loss": 0.6115,
1340
+ "grad_norm": 2.8990418910980225,
1341
+ "learning_rate": 0.00017311249971815185
1342
+ },
1343
+ {
1344
+ "step": 150,
1345
+ "epoch": 1.0169491525423728,
1346
+ "cpu_mem": 1.512288256,
1347
+ "gpu_mem": 4.518976,
1348
+ "loss": 0.5619,
1349
+ "grad_norm": 3.0098154544830322,
1350
+ "learning_rate": 0.00017134722574099276
1351
+ },
1352
+ {
1353
+ "step": 151,
1354
+ "epoch": 1.023728813559322,
1355
+ "cpu_mem": 1.512288256,
1356
+ "gpu_mem": 4.519048192,
1357
+ "loss": 0.6179,
1358
+ "grad_norm": 3.9371068477630615,
1359
+ "learning_rate": 0.00016957892883300775
1360
+ },
1361
+ {
1362
+ "step": 152,
1363
+ "epoch": 1.0305084745762711,
1364
+ "cpu_mem": 1.512288256,
1365
+ "gpu_mem": 4.51908352,
1366
+ "loss": 0.4941,
1367
+ "grad_norm": 2.8057267665863037,
1368
+ "learning_rate": 0.00016780785939859576
1369
+ },
1370
+ {
1371
+ "step": 153,
1372
+ "epoch": 1.0372881355932204,
1373
+ "cpu_mem": 1.512288256,
1374
+ "gpu_mem": 4.519108096,
1375
+ "loss": 0.5268,
1376
+ "grad_norm": 4.285440444946289,
1377
+ "learning_rate": 0.00016603426823476693
1378
+ },
1379
+ {
1380
+ "step": 154,
1381
+ "epoch": 1.0440677966101695,
1382
+ "cpu_mem": 1.512288256,
1383
+ "gpu_mem": 4.519069696,
1384
+ "loss": 0.5617,
1385
+ "grad_norm": 4.94078254699707,
1386
+ "learning_rate": 0.00016425840649562736
1387
+ },
1388
+ {
1389
+ "step": 155,
1390
+ "epoch": 1.0508474576271187,
1391
+ "cpu_mem": 1.512288256,
1392
+ "gpu_mem": 4.51929088,
1393
+ "loss": 0.5337,
1394
+ "grad_norm": 3.763066291809082,
1395
+ "learning_rate": 0.00016248052565681436
1396
+ },
1397
+ {
1398
+ "step": 156,
1399
+ "epoch": 1.0576271186440678,
1400
+ "cpu_mem": 1.512288256,
1401
+ "gpu_mem": 4.51919872,
1402
+ "loss": 0.5724,
1403
+ "grad_norm": 5.346607685089111,
1404
+ "learning_rate": 0.00016070087747988482
1405
+ },
1406
+ {
1407
+ "step": 157,
1408
+ "epoch": 1.064406779661017,
1409
+ "cpu_mem": 1.512288256,
1410
+ "gpu_mem": 4.519105024,
1411
+ "loss": 0.5009,
1412
+ "grad_norm": 5.106917381286621,
1413
+ "learning_rate": 0.00015891971397666464
1414
+ },
1415
+ {
1416
+ "step": 158,
1417
+ "epoch": 1.071186440677966,
1418
+ "cpu_mem": 1.512288256,
1419
+ "gpu_mem": 4.519031296,
1420
+ "loss": 0.5795,
1421
+ "grad_norm": 4.663048267364502,
1422
+ "learning_rate": 0.00015713728737356137
1423
+ },
1424
+ {
1425
+ "step": 159,
1426
+ "epoch": 1.0779661016949154,
1427
+ "cpu_mem": 1.512288256,
1428
+ "gpu_mem": 4.519379968,
1429
+ "loss": 0.4231,
1430
+ "grad_norm": 4.06447696685791,
1431
+ "learning_rate": 0.00015535385007584706
1432
+ },
1433
+ {
1434
+ "step": 160,
1435
+ "epoch": 1.0847457627118644,
1436
+ "cpu_mem": 1.512288256,
1437
+ "gpu_mem": 4.518974464,
1438
+ "loss": 0.6225,
1439
+ "grad_norm": 5.922128200531006,
1440
+ "learning_rate": 0.0001535696546319161
1441
+ },
1442
+ {
1443
+ "step": 161,
1444
+ "epoch": 1.0915254237288137,
1445
+ "cpu_mem": 1.512288256,
1446
+ "gpu_mem": 4.518920704,
1447
+ "loss": 0.5124,
1448
+ "grad_norm": 5.51249885559082,
1449
+ "learning_rate": 0.00015178495369752213
1450
+ },
1451
+ {
1452
+ "step": 162,
1453
+ "epoch": 1.0983050847457627,
1454
+ "cpu_mem": 1.512288256,
1455
+ "gpu_mem": 4.519696384,
1456
+ "loss": 0.5305,
1457
+ "grad_norm": 4.262174129486084,
1458
+ "learning_rate": 0.00015
1459
+ },
1460
+ {
1461
+ "step": 163,
1462
+ "epoch": 1.1050847457627118,
1463
+ "cpu_mem": 1.512288256,
1464
+ "gpu_mem": 4.519172608,
1465
+ "loss": 0.5712,
1466
+ "grad_norm": 6.790377140045166,
1467
+ "learning_rate": 0.00014821504630247785
1468
+ },
1469
+ {
1470
+ "step": 164,
1471
+ "epoch": 1.111864406779661,
1472
+ "cpu_mem": 1.512288256,
1473
+ "gpu_mem": 4.519085056,
1474
+ "loss": 0.6282,
1475
+ "grad_norm": 5.247696876525879,
1476
+ "learning_rate": 0.00014643034536808387
1477
+ },
1478
+ {
1479
+ "step": 165,
1480
+ "epoch": 1.11864406779661,
1481
+ "cpu_mem": 1.512288256,
1482
+ "gpu_mem": 4.519034368,
1483
+ "loss": 0.5203,
1484
+ "grad_norm": 5.485547065734863,
1485
+ "learning_rate": 0.00014464614992415294
1486
+ },
1487
+ {
1488
+ "step": 166,
1489
+ "epoch": 1.1254237288135593,
1490
+ "cpu_mem": 1.512288256,
1491
+ "gpu_mem": 4.5191296,
1492
+ "loss": 0.5093,
1493
+ "grad_norm": 4.293337821960449,
1494
+ "learning_rate": 0.00014286271262643866
1495
+ },
1496
+ {
1497
+ "step": 167,
1498
+ "epoch": 1.1322033898305084,
1499
+ "cpu_mem": 1.512288256,
1500
+ "gpu_mem": 4.519046656,
1501
+ "loss": 0.5422,
1502
+ "grad_norm": 4.634438991546631,
1503
+ "learning_rate": 0.00014108028602333536
1504
+ },
1505
+ {
1506
+ "step": 168,
1507
+ "epoch": 1.1389830508474577,
1508
+ "cpu_mem": 1.512288256,
1509
+ "gpu_mem": 4.519065088,
1510
+ "loss": 0.5884,
1511
+ "grad_norm": 6.198184490203857,
1512
+ "learning_rate": 0.00013929912252011516
1513
+ },
1514
+ {
1515
+ "step": 169,
1516
+ "epoch": 1.1457627118644067,
1517
+ "cpu_mem": 1.512288256,
1518
+ "gpu_mem": 4.51915264,
1519
+ "loss": 0.5658,
1520
+ "grad_norm": 6.395980358123779,
1521
+ "learning_rate": 0.00013751947434318564
1522
+ },
1523
+ {
1524
+ "step": 170,
1525
+ "epoch": 1.152542372881356,
1526
+ "cpu_mem": 1.512288256,
1527
+ "gpu_mem": 4.51903744,
1528
+ "loss": 0.4778,
1529
+ "grad_norm": 4.4228129386901855,
1530
+ "learning_rate": 0.00013574159350437261
1531
+ },
1532
+ {
1533
+ "step": 171,
1534
+ "epoch": 1.159322033898305,
1535
+ "cpu_mem": 1.512288256,
1536
+ "gpu_mem": 4.519100416,
1537
+ "loss": 0.5222,
1538
+ "grad_norm": 6.1567864418029785,
1539
+ "learning_rate": 0.0001339657317652331
1540
+ },
1541
+ {
1542
+ "step": 172,
1543
+ "epoch": 1.1661016949152543,
1544
+ "cpu_mem": 1.512288256,
1545
+ "gpu_mem": 4.519008256,
1546
+ "loss": 0.6052,
1547
+ "grad_norm": 6.764645099639893,
1548
+ "learning_rate": 0.00013219214060140424
1549
+ },
1550
+ {
1551
+ "step": 173,
1552
+ "epoch": 1.1728813559322033,
1553
+ "cpu_mem": 1.512288256,
1554
+ "gpu_mem": 4.519307776,
1555
+ "loss": 0.4933,
1556
+ "grad_norm": 4.290213108062744,
1557
+ "learning_rate": 0.00013042107116699228
1558
+ },
1559
+ {
1560
+ "step": 174,
1561
+ "epoch": 1.1796610169491526,
1562
+ "cpu_mem": 1.512288256,
1563
+ "gpu_mem": 4.519031296,
1564
+ "loss": 0.579,
1565
+ "grad_norm": 4.750498294830322,
1566
+ "learning_rate": 0.00012865277425900724
1567
+ },
1568
+ {
1569
+ "step": 175,
1570
+ "epoch": 1.1864406779661016,
1571
+ "cpu_mem": 1.512288256,
1572
+ "gpu_mem": 4.518997504,
1573
+ "loss": 0.4567,
1574
+ "grad_norm": 4.482128620147705,
1575
+ "learning_rate": 0.00012688750028184818
1576
+ },
1577
+ {
1578
+ "step": 176,
1579
+ "epoch": 1.193220338983051,
1580
+ "cpu_mem": 1.512288256,
1581
+ "gpu_mem": 4.519135744,
1582
+ "loss": 0.5528,
1583
+ "grad_norm": 7.500720500946045,
1584
+ "learning_rate": 0.0001251254992118439
1585
+ },
1586
+ {
1587
+ "step": 177,
1588
+ "epoch": 1.2,
1589
+ "cpu_mem": 1.512288256,
1590
+ "gpu_mem": 4.519234048,
1591
+ "loss": 0.4816,
1592
+ "grad_norm": 5.709393501281738,
1593
+ "learning_rate": 0.00012336702056185453
1594
+ },
1595
+ {
1596
+ "step": 178,
1597
+ "epoch": 1.2067796610169492,
1598
+ "cpu_mem": 1.512288256,
1599
+ "gpu_mem": 4.518980608,
1600
+ "loss": 0.6219,
1601
+ "grad_norm": 5.3650712966918945,
1602
+ "learning_rate": 0.00012161231334593851
1603
+ },
1604
+ {
1605
+ "step": 179,
1606
+ "epoch": 1.2135593220338983,
1607
+ "cpu_mem": 1.512288256,
1608
+ "gpu_mem": 4.519080448,
1609
+ "loss": 0.6314,
1610
+ "grad_norm": 7.034570693969727,
1611
+ "learning_rate": 0.00011986162604409015
1612
+ },
1613
+ {
1614
+ "step": 180,
1615
+ "epoch": 1.2203389830508475,
1616
+ "cpu_mem": 1.512288256,
1617
+ "gpu_mem": 4.5190528,
1618
+ "loss": 0.5441,
1619
+ "grad_norm": 4.4938225746154785,
1620
+ "learning_rate": 0.00011811520656705348
1621
+ },
1622
+ {
1623
+ "step": 181,
1624
+ "epoch": 1.2271186440677966,
1625
+ "cpu_mem": 1.512288256,
1626
+ "gpu_mem": 4.518989824,
1627
+ "loss": 0.4821,
1628
+ "grad_norm": 5.907620906829834,
1629
+ "learning_rate": 0.00011637330222121543
1630
+ },
1631
+ {
1632
+ "step": 182,
1633
+ "epoch": 1.2338983050847459,
1634
+ "cpu_mem": 1.512288256,
1635
+ "gpu_mem": 4.519207936,
1636
+ "loss": 0.7578,
1637
+ "grad_norm": 9.750370025634766,
1638
+ "learning_rate": 0.00011463615967358588
1639
+ },
1640
+ {
1641
+ "step": 183,
1642
+ "epoch": 1.240677966101695,
1643
+ "cpu_mem": 1.512288256,
1644
+ "gpu_mem": 4.519105024,
1645
+ "loss": 0.5486,
1646
+ "grad_norm": 5.949014663696289,
1647
+ "learning_rate": 0.00011290402491686766
1648
+ },
1649
+ {
1650
+ "step": 184,
1651
+ "epoch": 1.2474576271186442,
1652
+ "cpu_mem": 1.512288256,
1653
+ "gpu_mem": 4.5190528,
1654
+ "loss": 0.4688,
1655
+ "grad_norm": 4.6593017578125,
1656
+ "learning_rate": 0.00011117714323462186
1657
+ },
1658
+ {
1659
+ "step": 185,
1660
+ "epoch": 1.2542372881355932,
1661
+ "cpu_mem": 1.512288256,
1662
+ "gpu_mem": 4.519031296,
1663
+ "loss": 0.5701,
1664
+ "grad_norm": 4.530482769012451,
1665
+ "learning_rate": 0.00010945575916653407
1666
+ },
1667
+ {
1668
+ "step": 186,
1669
+ "epoch": 1.2610169491525425,
1670
+ "cpu_mem": 1.512288256,
1671
+ "gpu_mem": 4.519040512,
1672
+ "loss": 0.4945,
1673
+ "grad_norm": 3.4325568675994873,
1674
+ "learning_rate": 0.00010774011647378553
1675
+ },
1676
+ {
1677
+ "step": 187,
1678
+ "epoch": 1.2677966101694915,
1679
+ "cpu_mem": 1.512288256,
1680
+ "gpu_mem": 4.518972928,
1681
+ "loss": 0.6189,
1682
+ "grad_norm": 4.750690460205078,
1683
+ "learning_rate": 0.00010603045810453468
1684
+ },
1685
+ {
1686
+ "step": 188,
1687
+ "epoch": 1.2745762711864406,
1688
+ "cpu_mem": 1.512288256,
1689
+ "gpu_mem": 4.519135744,
1690
+ "loss": 0.5404,
1691
+ "grad_norm": 4.004016399383545,
1692
+ "learning_rate": 0.00010432702615951396
1693
+ },
1694
+ {
1695
+ "step": 189,
1696
+ "epoch": 1.2813559322033898,
1697
+ "cpu_mem": 1.512288256,
1698
+ "gpu_mem": 4.519005184,
1699
+ "loss": 0.547,
1700
+ "grad_norm": 4.070260524749756,
1701
+ "learning_rate": 0.00010263006185774627
1702
+ },
1703
+ {
1704
+ "step": 190,
1705
+ "epoch": 1.288135593220339,
1706
+ "cpu_mem": 1.512288256,
1707
+ "gpu_mem": 4.519124992,
1708
+ "loss": 0.5295,
1709
+ "grad_norm": 5.301217555999756,
1710
+ "learning_rate": 0.00010093980550238675
1711
+ },
1712
+ {
1713
+ "step": 191,
1714
+ "epoch": 1.2949152542372881,
1715
+ "cpu_mem": 1.512288256,
1716
+ "gpu_mem": 4.518943744,
1717
+ "loss": 0.4816,
1718
+ "grad_norm": 3.699140787124634,
1719
+ "learning_rate": 9.925649644669391e-05
1720
+ },
1721
+ {
1722
+ "step": 192,
1723
+ "epoch": 1.3016949152542372,
1724
+ "cpu_mem": 1.512288256,
1725
+ "gpu_mem": 4.51907584,
1726
+ "loss": 0.4443,
1727
+ "grad_norm": 4.096963882446289,
1728
+ "learning_rate": 9.758037306013526e-05
1729
+ },
1730
+ {
1731
+ "step": 193,
1732
+ "epoch": 1.3084745762711865,
1733
+ "cpu_mem": 1.512288256,
1734
+ "gpu_mem": 4.519049728,
1735
+ "loss": 0.5866,
1736
+ "grad_norm": 4.5070319175720215,
1737
+ "learning_rate": 9.591167269463255e-05
1738
+ },
1739
+ {
1740
+ "step": 194,
1741
+ "epoch": 1.3152542372881357,
1742
+ "cpu_mem": 1.512288256,
1743
+ "gpu_mem": 4.519015936,
1744
+ "loss": 0.5797,
1745
+ "grad_norm": 5.173367500305176,
1746
+ "learning_rate": 9.425063165095088e-05
1747
+ },
1748
+ {
1749
+ "step": 195,
1750
+ "epoch": 1.3220338983050848,
1751
+ "cpu_mem": 1.512288256,
1752
+ "gpu_mem": 4.519120384,
1753
+ "loss": 0.4506,
1754
+ "grad_norm": 5.948993682861328,
1755
+ "learning_rate": 9.259748514523653e-05
1756
+ },
1757
+ {
1758
+ "step": 196,
1759
+ "epoch": 1.3288135593220338,
1760
+ "cpu_mem": 1.512288256,
1761
+ "gpu_mem": 4.519115776,
1762
+ "loss": 0.5417,
1763
+ "grad_norm": 4.922159671783447,
1764
+ "learning_rate": 9.095246727570879e-05
1765
+ },
1766
+ {
1767
+ "step": 197,
1768
+ "epoch": 1.335593220338983,
1769
+ "cpu_mem": 1.512288256,
1770
+ "gpu_mem": 4.518974464,
1771
+ "loss": 0.4126,
1772
+ "grad_norm": 4.2993011474609375,
1773
+ "learning_rate": 8.931581098950973e-05
1774
+ },
1775
+ {
1776
+ "step": 198,
1777
+ "epoch": 1.3423728813559321,
1778
+ "cpu_mem": 1.512288256,
1779
+ "gpu_mem": 4.519166464,
1780
+ "loss": 0.4516,
1781
+ "grad_norm": 3.3898727893829346,
1782
+ "learning_rate": 8.768774804971705e-05
1783
+ },
1784
+ {
1785
+ "step": 199,
1786
+ "epoch": 1.3491525423728814,
1787
+ "cpu_mem": 1.512288256,
1788
+ "gpu_mem": 4.519017472,
1789
+ "loss": 0.5272,
1790
+ "grad_norm": 4.7902374267578125,
1791
+ "learning_rate": 8.606850900252478e-05
1792
+ },
1793
+ {
1794
+ "step": 200,
1795
+ "epoch": 1.3559322033898304,
1796
+ "cpu_mem": 1.512288256,
1797
+ "gpu_mem": 4.519120384,
1798
+ "loss": 0.3785,
1799
+ "grad_norm": 3.5034830570220947,
1800
+ "learning_rate": 8.445832314459608e-05
1801
+ },
1802
+ {
1803
+ "step": 201,
1804
+ "epoch": 1.3627118644067797,
1805
+ "cpu_mem": 1.512288256,
1806
+ "gpu_mem": 4.519323136,
1807
+ "loss": 0.4795,
1808
+ "grad_norm": 4.421779632568359,
1809
+ "learning_rate": 8.285741849059311e-05
1810
+ },
1811
+ {
1812
+ "step": 202,
1813
+ "epoch": 1.3694915254237288,
1814
+ "cpu_mem": 1.512288256,
1815
+ "gpu_mem": 4.519124992,
1816
+ "loss": 0.4514,
1817
+ "grad_norm": 4.375877380371094,
1818
+ "learning_rate": 8.126602174088843e-05
1819
+ },
1820
+ {
1821
+ "step": 203,
1822
+ "epoch": 1.376271186440678,
1823
+ "cpu_mem": 1.512288256,
1824
+ "gpu_mem": 4.519011328,
1825
+ "loss": 0.4037,
1826
+ "grad_norm": 5.991669654846191,
1827
+ "learning_rate": 7.968435824946242e-05
1828
+ },
1829
+ {
1830
+ "step": 204,
1831
+ "epoch": 1.383050847457627,
1832
+ "cpu_mem": 1.512288256,
1833
+ "gpu_mem": 4.519025152,
1834
+ "loss": 0.4836,
1835
+ "grad_norm": 5.3182172775268555,
1836
+ "learning_rate": 7.811265199199152e-05
1837
+ },
1838
+ {
1839
+ "step": 205,
1840
+ "epoch": 1.3898305084745763,
1841
+ "cpu_mem": 1.512288256,
1842
+ "gpu_mem": 4.519069696,
1843
+ "loss": 0.4796,
1844
+ "grad_norm": 5.581629276275635,
1845
+ "learning_rate": 7.655112553413135e-05
1846
+ },
1847
+ {
1848
+ "step": 206,
1849
+ "epoch": 1.3966101694915254,
1850
+ "cpu_mem": 1.512288256,
1851
+ "gpu_mem": 4.519011328,
1852
+ "loss": 0.481,
1853
+ "grad_norm": 5.053388595581055,
1854
+ "learning_rate": 7.500000000000002e-05
1855
+ },
1856
+ {
1857
+ "step": 207,
1858
+ "epoch": 1.4033898305084747,
1859
+ "cpu_mem": 1.512288256,
1860
+ "gpu_mem": 4.5192448,
1861
+ "loss": 0.4202,
1862
+ "grad_norm": 5.716559886932373,
1863
+ "learning_rate": 7.345949504086507e-05
1864
+ },
1865
+ {
1866
+ "step": 208,
1867
+ "epoch": 1.4101694915254237,
1868
+ "cpu_mem": 1.512288256,
1869
+ "gpu_mem": 4.51927552,
1870
+ "loss": 0.3872,
1871
+ "grad_norm": 6.213109493255615,
1872
+ "learning_rate": 7.192982880403917e-05
1873
+ },
1874
+ {
1875
+ "step": 209,
1876
+ "epoch": 1.4169491525423727,
1877
+ "cpu_mem": 1.512288256,
1878
+ "gpu_mem": 4.519201792,
1879
+ "loss": 0.5066,
1880
+ "grad_norm": 6.249429225921631,
1881
+ "learning_rate": 7.041121790198881e-05
1882
+ },
1883
+ {
1884
+ "step": 210,
1885
+ "epoch": 1.423728813559322,
1886
+ "cpu_mem": 1.512288256,
1887
+ "gpu_mem": 4.519089664,
1888
+ "loss": 0.4701,
1889
+ "grad_norm": 5.749514102935791,
1890
+ "learning_rate": 6.890387738166041e-05
1891
+ },
1892
+ {
1893
+ "step": 211,
1894
+ "epoch": 1.4305084745762713,
1895
+ "cpu_mem": 1.512288256,
1896
+ "gpu_mem": 4.519038976,
1897
+ "loss": 0.42,
1898
+ "grad_norm": 6.426360130310059,
1899
+ "learning_rate": 6.740802069402771e-05
1900
+ },
1901
+ {
1902
+ "step": 212,
1903
+ "epoch": 1.4372881355932203,
1904
+ "cpu_mem": 1.512288256,
1905
+ "gpu_mem": 4.519008256,
1906
+ "loss": 0.4786,
1907
+ "grad_norm": 5.285887241363525,
1908
+ "learning_rate": 6.592385966386588e-05
1909
+ },
1910
+ {
1911
+ "step": 213,
1912
+ "epoch": 1.4440677966101694,
1913
+ "cpu_mem": 1.512288256,
1914
+ "gpu_mem": 4.519031296,
1915
+ "loss": 0.5431,
1916
+ "grad_norm": 8.069002151489258,
1917
+ "learning_rate": 6.445160445975536e-05
1918
+ },
1919
+ {
1920
+ "step": 214,
1921
+ "epoch": 1.4508474576271186,
1922
+ "cpu_mem": 1.512288256,
1923
+ "gpu_mem": 4.51911424,
1924
+ "loss": 0.527,
1925
+ "grad_norm": 6.687179088592529,
1926
+ "learning_rate": 6.299146356432029e-05
1927
+ },
1928
+ {
1929
+ "step": 215,
1930
+ "epoch": 1.457627118644068,
1931
+ "cpu_mem": 1.512288256,
1932
+ "gpu_mem": 4.519042048,
1933
+ "loss": 0.538,
1934
+ "grad_norm": 7.980434894561768,
1935
+ "learning_rate": 6.154364374470568e-05
1936
+ },
1937
+ {
1938
+ "step": 216,
1939
+ "epoch": 1.464406779661017,
1940
+ "cpu_mem": 1.512288256,
1941
+ "gpu_mem": 4.519207936,
1942
+ "loss": 0.4374,
1943
+ "grad_norm": 5.38814640045166,
1944
+ "learning_rate": 6.010835002329795e-05
1945
+ },
1946
+ {
1947
+ "step": 217,
1948
+ "epoch": 1.471186440677966,
1949
+ "cpu_mem": 1.512288256,
1950
+ "gpu_mem": 4.519049728,
1951
+ "loss": 0.5355,
1952
+ "grad_norm": 8.008475303649902,
1953
+ "learning_rate": 5.8685785648691894e-05
1954
+ },
1955
+ {
1956
+ "step": 218,
1957
+ "epoch": 1.4779661016949153,
1958
+ "cpu_mem": 1.512288256,
1959
+ "gpu_mem": 4.519026688,
1960
+ "loss": 0.4467,
1961
+ "grad_norm": 5.458549499511719,
1962
+ "learning_rate": 5.72761520669092e-05
1963
+ },
1964
+ {
1965
+ "step": 219,
1966
+ "epoch": 1.4847457627118645,
1967
+ "cpu_mem": 1.512288256,
1968
+ "gpu_mem": 4.51915264,
1969
+ "loss": 0.477,
1970
+ "grad_norm": 6.730329990386963,
1971
+ "learning_rate": 5.587964889287218e-05
1972
+ },
1973
+ {
1974
+ "step": 220,
1975
+ "epoch": 1.4915254237288136,
1976
+ "cpu_mem": 1.512288256,
1977
+ "gpu_mem": 4.519186432,
1978
+ "loss": 0.5097,
1979
+ "grad_norm": 6.172441005706787,
1980
+ "learning_rate": 5.449647388213678e-05
1981
+ },
1982
+ {
1983
+ "step": 221,
1984
+ "epoch": 1.4983050847457626,
1985
+ "cpu_mem": 1.512288256,
1986
+ "gpu_mem": 4.519054336,
1987
+ "loss": 0.5732,
1988
+ "grad_norm": 7.062180519104004,
1989
+ "learning_rate": 5.312682290288869e-05
1990
+ },
1991
+ {
1992
+ "step": 222,
1993
+ "epoch": 1.505084745762712,
1994
+ "cpu_mem": 1.512288256,
1995
+ "gpu_mem": 4.51919104,
1996
+ "loss": 0.4604,
1997
+ "grad_norm": 6.362549781799316,
1998
+ "learning_rate": 5.1770889908207245e-05
1999
+ },
2000
+ {
2001
+ "step": 223,
2002
+ "epoch": 1.5118644067796612,
2003
+ "cpu_mem": 1.512288256,
2004
+ "gpu_mem": 4.519105024,
2005
+ "loss": 0.4812,
2006
+ "grad_norm": 6.742758274078369,
2007
+ "learning_rate": 5.0428866908599864e-05
2008
+ },
2009
+ {
2010
+ "step": 224,
2011
+ "epoch": 1.5186440677966102,
2012
+ "cpu_mem": 1.512288256,
2013
+ "gpu_mem": 4.519069696,
2014
+ "loss": 0.5005,
2015
+ "grad_norm": 7.343570709228516,
2016
+ "learning_rate": 4.9100943944812114e-05
2017
+ },
2018
+ {
2019
+ "step": 225,
2020
+ "epoch": 1.5254237288135593,
2021
+ "cpu_mem": 1.512288256,
2022
+ "gpu_mem": 4.519034368,
2023
+ "loss": 0.5068,
2024
+ "grad_norm": 5.180023670196533,
2025
+ "learning_rate": 4.778730906091632e-05
2026
+ },
2027
+ {
2028
+ "step": 226,
2029
+ "epoch": 1.5322033898305085,
2030
+ "cpu_mem": 1.512288256,
2031
+ "gpu_mem": 4.51918336,
2032
+ "loss": 0.379,
2033
+ "grad_norm": 5.091320991516113,
2034
+ "learning_rate": 4.648814827768322e-05
2035
+ },
2036
+ {
2037
+ "step": 227,
2038
+ "epoch": 1.5389830508474578,
2039
+ "cpu_mem": 1.512288256,
2040
+ "gpu_mem": 4.519072768,
2041
+ "loss": 0.4426,
2042
+ "grad_norm": 5.11602258682251,
2043
+ "learning_rate": 4.5203645566239816e-05
2044
+ },
2045
+ {
2046
+ "step": 228,
2047
+ "epoch": 1.5457627118644068,
2048
+ "cpu_mem": 1.512288256,
2049
+ "gpu_mem": 4.519017472,
2050
+ "loss": 0.5256,
2051
+ "grad_norm": 5.844997882843018,
2052
+ "learning_rate": 4.3933982822017876e-05
2053
+ },
2054
+ {
2055
+ "step": 229,
2056
+ "epoch": 1.5525423728813559,
2057
+ "cpu_mem": 1.512288256,
2058
+ "gpu_mem": 4.518959104,
2059
+ "loss": 0.5054,
2060
+ "grad_norm": 5.75474214553833,
2061
+ "learning_rate": 4.267933983899601e-05
2062
+ },
2063
+ {
2064
+ "step": 230,
2065
+ "epoch": 1.559322033898305,
2066
+ "cpu_mem": 1.512288256,
2067
+ "gpu_mem": 4.519015936,
2068
+ "loss": 0.5413,
2069
+ "grad_norm": 6.338455677032471,
2070
+ "learning_rate": 4.143989428423947e-05
2071
+ },
2072
+ {
2073
+ "step": 231,
2074
+ "epoch": 1.5661016949152542,
2075
+ "cpu_mem": 1.512288256,
2076
+ "gpu_mem": 4.519293952,
2077
+ "loss": 0.4582,
2078
+ "grad_norm": 5.49623966217041,
2079
+ "learning_rate": 4.0215821672741213e-05
2080
+ },
2081
+ {
2082
+ "step": 232,
2083
+ "epoch": 1.5728813559322035,
2084
+ "cpu_mem": 1.512288256,
2085
+ "gpu_mem": 4.519017472,
2086
+ "loss": 0.5658,
2087
+ "grad_norm": 5.572755813598633,
2088
+ "learning_rate": 3.900729534256745e-05
2089
+ },
2090
+ {
2091
+ "step": 233,
2092
+ "epoch": 1.5796610169491525,
2093
+ "cpu_mem": 1.512288256,
2094
+ "gpu_mem": 4.519330816,
2095
+ "loss": 0.5023,
2096
+ "grad_norm": 5.0739946365356445,
2097
+ "learning_rate": 3.781448643031187e-05
2098
+ },
2099
+ {
2100
+ "step": 234,
2101
+ "epoch": 1.5864406779661016,
2102
+ "cpu_mem": 1.512288256,
2103
+ "gpu_mem": 4.5192064,
2104
+ "loss": 0.4126,
2105
+ "grad_norm": 5.143454074859619,
2106
+ "learning_rate": 3.663756384686127e-05
2107
+ },
2108
+ {
2109
+ "step": 235,
2110
+ "epoch": 1.5932203389830508,
2111
+ "cpu_mem": 1.512288256,
2112
+ "gpu_mem": 4.518962176,
2113
+ "loss": 0.4623,
2114
+ "grad_norm": 5.956307888031006,
2115
+ "learning_rate": 3.547669425347647e-05
2116
+ },
2117
+ {
2118
+ "step": 236,
2119
+ "epoch": 1.6,
2120
+ "cpu_mem": 1.512288256,
2121
+ "gpu_mem": 4.51902208,
2122
+ "loss": 0.4677,
2123
+ "grad_norm": 5.250843524932861,
2124
+ "learning_rate": 3.433204203819185e-05
2125
+ },
2126
+ {
2127
+ "step": 237,
2128
+ "epoch": 1.6067796610169491,
2129
+ "cpu_mem": 1.512288256,
2130
+ "gpu_mem": 4.51908352,
2131
+ "loss": 0.4254,
2132
+ "grad_norm": 4.902020454406738,
2133
+ "learning_rate": 3.3203769292536764e-05
2134
+ },
2135
+ {
2136
+ "step": 238,
2137
+ "epoch": 1.6135593220338982,
2138
+ "cpu_mem": 1.512288256,
2139
+ "gpu_mem": 4.519085056,
2140
+ "loss": 0.4989,
2141
+ "grad_norm": 5.615092754364014,
2142
+ "learning_rate": 3.209203578858191e-05
2143
+ },
2144
+ {
2145
+ "step": 239,
2146
+ "epoch": 1.6203389830508474,
2147
+ "cpu_mem": 1.512288256,
2148
+ "gpu_mem": 4.519338496,
2149
+ "loss": 0.5546,
2150
+ "grad_norm": 5.185640335083008,
2151
+ "learning_rate": 3.099699895631474e-05
2152
+ },
2153
+ {
2154
+ "step": 240,
2155
+ "epoch": 1.6271186440677967,
2156
+ "cpu_mem": 1.512288256,
2157
+ "gpu_mem": 4.518988288,
2158
+ "loss": 0.6922,
2159
+ "grad_norm": 7.394616603851318,
2160
+ "learning_rate": 2.9918813861345952e-05
2161
+ },
2162
+ {
2163
+ "step": 241,
2164
+ "epoch": 1.6338983050847458,
2165
+ "cpu_mem": 1.512288256,
2166
+ "gpu_mem": 4.519284736,
2167
+ "loss": 0.4441,
2168
+ "grad_norm": 5.41588020324707,
2169
+ "learning_rate": 2.885763318295102e-05
2170
+ },
2171
+ {
2172
+ "step": 242,
2173
+ "epoch": 1.6406779661016948,
2174
+ "cpu_mem": 1.512288256,
2175
+ "gpu_mem": 4.519146496,
2176
+ "loss": 0.5156,
2177
+ "grad_norm": 6.340667247772217,
2178
+ "learning_rate": 2.781360719244964e-05
2179
+ },
2180
+ {
2181
+ "step": 243,
2182
+ "epoch": 1.647457627118644,
2183
+ "cpu_mem": 1.512288256,
2184
+ "gpu_mem": 4.51899904,
2185
+ "loss": 0.5521,
2186
+ "grad_norm": 5.889646053314209,
2187
+ "learning_rate": 2.6786883731926306e-05
2188
+ },
2189
+ {
2190
+ "step": 244,
2191
+ "epoch": 1.6542372881355933,
2192
+ "cpu_mem": 1.512288256,
2193
+ "gpu_mem": 4.519138816,
2194
+ "loss": 0.4731,
2195
+ "grad_norm": 5.065695285797119,
2196
+ "learning_rate": 2.5777608193294396e-05
2197
+ },
2198
+ {
2199
+ "step": 245,
2200
+ "epoch": 1.6610169491525424,
2201
+ "cpu_mem": 1.512288256,
2202
+ "gpu_mem": 4.519017472,
2203
+ "loss": 0.4947,
2204
+ "grad_norm": 5.03191614151001,
2205
+ "learning_rate": 2.4785923497707956e-05
2206
+ },
2207
+ {
2208
+ "step": 246,
2209
+ "epoch": 1.6677966101694914,
2210
+ "cpu_mem": 1.512288256,
2211
+ "gpu_mem": 4.519111168,
2212
+ "loss": 0.4658,
2213
+ "grad_norm": 4.863046169281006,
2214
+ "learning_rate": 2.38119700753228e-05
2215
+ },
2216
+ {
2217
+ "step": 247,
2218
+ "epoch": 1.6745762711864407,
2219
+ "cpu_mem": 1.512288256,
2220
+ "gpu_mem": 4.5191296,
2221
+ "loss": 0.5575,
2222
+ "grad_norm": 5.03303337097168,
2223
+ "learning_rate": 2.285588584541047e-05
2224
+ },
2225
+ {
2226
+ "step": 248,
2227
+ "epoch": 1.68135593220339,
2228
+ "cpu_mem": 1.512288256,
2229
+ "gpu_mem": 4.519081984,
2230
+ "loss": 0.4857,
2231
+ "grad_norm": 4.907963752746582,
2232
+ "learning_rate": 2.1917806196827792e-05
2233
+ },
2234
+ {
2235
+ "step": 249,
2236
+ "epoch": 1.688135593220339,
2237
+ "cpu_mem": 1.512288256,
2238
+ "gpu_mem": 4.518988288,
2239
+ "loss": 0.4642,
2240
+ "grad_norm": 5.556704521179199,
2241
+ "learning_rate": 2.0997863968844914e-05
2242
+ },
2243
+ {
2244
+ "step": 250,
2245
+ "epoch": 1.694915254237288,
2246
+ "cpu_mem": 1.512288256,
2247
+ "gpu_mem": 4.519080448,
2248
+ "loss": 0.458,
2249
+ "grad_norm": 4.857274055480957,
2250
+ "learning_rate": 2.009618943233419e-05
2251
+ },
2252
+ {
2253
+ "step": 251,
2254
+ "epoch": 1.7016949152542373,
2255
+ "cpu_mem": 1.512288256,
2256
+ "gpu_mem": 4.518992896,
2257
+ "loss": 0.417,
2258
+ "grad_norm": 5.613741874694824,
2259
+ "learning_rate": 1.921291027132278e-05
2260
+ },
2261
+ {
2262
+ "step": 252,
2263
+ "epoch": 1.7084745762711866,
2264
+ "cpu_mem": 1.512288256,
2265
+ "gpu_mem": 4.519035904,
2266
+ "loss": 0.4969,
2267
+ "grad_norm": 4.631195068359375,
2268
+ "learning_rate": 1.834815156491165e-05
2269
+ },
2270
+ {
2271
+ "step": 253,
2272
+ "epoch": 1.7152542372881356,
2273
+ "cpu_mem": 1.512288256,
2274
+ "gpu_mem": 4.51922944,
2275
+ "loss": 0.4858,
2276
+ "grad_norm": 5.4093122482299805,
2277
+ "learning_rate": 1.750203576956341e-05
2278
+ },
2279
+ {
2280
+ "step": 254,
2281
+ "epoch": 1.7220338983050847,
2282
+ "cpu_mem": 1.512288256,
2283
+ "gpu_mem": 4.519025152,
2284
+ "loss": 0.5449,
2285
+ "grad_norm": 5.517231464385986,
2286
+ "learning_rate": 1.6674682701761493e-05
2287
+ },
2288
+ {
2289
+ "step": 255,
2290
+ "epoch": 1.7288135593220337,
2291
+ "cpu_mem": 1.512288256,
2292
+ "gpu_mem": 4.519181824,
2293
+ "loss": 0.5259,
2294
+ "grad_norm": 6.029206275939941,
2295
+ "learning_rate": 1.5866209521043304e-05
2296
+ },
2297
+ {
2298
+ "step": 256,
2299
+ "epoch": 1.735593220338983,
2300
+ "cpu_mem": 1.512288256,
2301
+ "gpu_mem": 4.519008256,
2302
+ "loss": 0.392,
2303
+ "grad_norm": 4.221251010894775,
2304
+ "learning_rate": 1.5076730713409523e-05
2305
+ },
2306
+ {
2307
+ "step": 257,
2308
+ "epoch": 1.7423728813559323,
2309
+ "cpu_mem": 1.512288256,
2310
+ "gpu_mem": 4.51942144,
2311
+ "loss": 0.5529,
2312
+ "grad_norm": 6.13136100769043,
2313
+ "learning_rate": 1.4306358075111923e-05
2314
+ },
2315
+ {
2316
+ "step": 258,
2317
+ "epoch": 1.7491525423728813,
2318
+ "cpu_mem": 1.512288256,
2319
+ "gpu_mem": 4.519080448,
2320
+ "loss": 0.4382,
2321
+ "grad_norm": 5.44303035736084,
2322
+ "learning_rate": 1.3555200696822232e-05
2323
+ },
2324
+ {
2325
+ "step": 259,
2326
+ "epoch": 1.7559322033898304,
2327
+ "cpu_mem": 1.512288256,
2328
+ "gpu_mem": 4.518997504,
2329
+ "loss": 0.5205,
2330
+ "grad_norm": 4.832195281982422,
2331
+ "learning_rate": 1.2823364948184095e-05
2332
+ },
2333
+ {
2334
+ "step": 260,
2335
+ "epoch": 1.7627118644067796,
2336
+ "cpu_mem": 1.512288256,
2337
+ "gpu_mem": 4.51911424,
2338
+ "loss": 0.3783,
2339
+ "grad_norm": 4.001471519470215,
2340
+ "learning_rate": 1.2110954462750166e-05
2341
+ },
2342
+ {
2343
+ "step": 261,
2344
+ "epoch": 1.769491525423729,
2345
+ "cpu_mem": 1.512288256,
2346
+ "gpu_mem": 4.519069696,
2347
+ "loss": 0.3939,
2348
+ "grad_norm": 4.915110111236572,
2349
+ "learning_rate": 1.1418070123306989e-05
2350
+ },
2351
+ {
2352
+ "step": 262,
2353
+ "epoch": 1.776271186440678,
2354
+ "cpu_mem": 1.512288256,
2355
+ "gpu_mem": 4.519026688,
2356
+ "loss": 0.381,
2357
+ "grad_norm": 3.9966530799865723,
2358
+ "learning_rate": 1.0744810047589115e-05
2359
+ },
2360
+ {
2361
+ "step": 263,
2362
+ "epoch": 1.783050847457627,
2363
+ "cpu_mem": 1.512288256,
2364
+ "gpu_mem": 4.519063552,
2365
+ "loss": 0.4564,
2366
+ "grad_norm": 4.660472393035889,
2367
+ "learning_rate": 1.0091269574384874e-05
2368
+ },
2369
+ {
2370
+ "step": 264,
2371
+ "epoch": 1.7898305084745763,
2372
+ "cpu_mem": 1.512288256,
2373
+ "gpu_mem": 4.519151104,
2374
+ "loss": 0.4723,
2375
+ "grad_norm": 5.000064849853516,
2376
+ "learning_rate": 9.45754125003576e-06
2377
+ },
2378
+ {
2379
+ "step": 265,
2380
+ "epoch": 1.7966101694915255,
2381
+ "cpu_mem": 1.512288256,
2382
+ "gpu_mem": 4.519069696,
2383
+ "loss": 0.5149,
2384
+ "grad_norm": 5.103734970092773,
2385
+ "learning_rate": 8.843714815330987e-06
2386
+ },
2387
+ {
2388
+ "step": 266,
2389
+ "epoch": 1.8033898305084746,
2390
+ "cpu_mem": 1.512288256,
2391
+ "gpu_mem": 4.519284736,
2392
+ "loss": 0.4547,
2393
+ "grad_norm": 5.005780220031738,
2394
+ "learning_rate": 8.249877192799731e-06
2395
+ },
2396
+ {
2397
+ "step": 267,
2398
+ "epoch": 1.8101694915254236,
2399
+ "cpu_mem": 1.512288256,
2400
+ "gpu_mem": 4.519077376,
2401
+ "loss": 0.4271,
2402
+ "grad_norm": 5.692404270172119,
2403
+ "learning_rate": 7.676112474402068e-06
2404
+ },
2405
+ {
2406
+ "step": 268,
2407
+ "epoch": 1.8169491525423729,
2408
+ "cpu_mem": 1.512288256,
2409
+ "gpu_mem": 4.519081984,
2410
+ "loss": 0.405,
2411
+ "grad_norm": 5.452561855316162,
2412
+ "learning_rate": 7.122501909620926e-06
2413
+ },
2414
+ {
2415
+ "step": 269,
2416
+ "epoch": 1.8237288135593221,
2417
+ "cpu_mem": 1.512288256,
2418
+ "gpu_mem": 4.519092736,
2419
+ "loss": 0.4592,
2420
+ "grad_norm": 5.44649600982666,
2421
+ "learning_rate": 6.5891238939566275e-06
2422
+ },
2423
+ {
2424
+ "step": 270,
2425
+ "epoch": 1.8305084745762712,
2426
+ "cpu_mem": 1.512288256,
2427
+ "gpu_mem": 4.519131136,
2428
+ "loss": 0.4592,
2429
+ "grad_norm": 5.148582935333252,
2430
+ "learning_rate": 6.076053957825411e-06
2431
+ },
2432
+ {
2433
+ "step": 271,
2434
+ "epoch": 1.8372881355932202,
2435
+ "cpu_mem": 1.512288256,
2436
+ "gpu_mem": 4.51918336,
2437
+ "loss": 0.414,
2438
+ "grad_norm": 5.350499153137207,
2439
+ "learning_rate": 5.583364755863701e-06
2440
+ },
2441
+ {
2442
+ "step": 272,
2443
+ "epoch": 1.8440677966101695,
2444
+ "cpu_mem": 1.512288256,
2445
+ "gpu_mem": 4.519042048,
2446
+ "loss": 0.4478,
2447
+ "grad_norm": 4.392088413238525,
2448
+ "learning_rate": 5.11112605663977e-06
2449
+ },
2450
+ {
2451
+ "step": 273,
2452
+ "epoch": 1.8508474576271188,
2453
+ "cpu_mem": 1.512288256,
2454
+ "gpu_mem": 4.51892224,
2455
+ "loss": 0.4744,
2456
+ "grad_norm": 5.41953706741333,
2457
+ "learning_rate": 4.659404732773908e-06
2458
+ },
2459
+ {
2460
+ "step": 274,
2461
+ "epoch": 1.8576271186440678,
2462
+ "cpu_mem": 1.512288256,
2463
+ "gpu_mem": 4.519149568,
2464
+ "loss": 0.4541,
2465
+ "grad_norm": 4.928563594818115,
2466
+ "learning_rate": 4.228264751468752e-06
2467
+ },
2468
+ {
2469
+ "step": 275,
2470
+ "epoch": 1.8644067796610169,
2471
+ "cpu_mem": 1.512288256,
2472
+ "gpu_mem": 4.519393792,
2473
+ "loss": 0.396,
2474
+ "grad_norm": 5.382067680358887,
2475
+ "learning_rate": 3.817767165451041e-06
2476
+ },
2477
+ {
2478
+ "step": 276,
2479
+ "epoch": 1.8711864406779661,
2480
+ "cpu_mem": 1.512288256,
2481
+ "gpu_mem": 4.519054336,
2482
+ "loss": 0.4373,
2483
+ "grad_norm": 4.41796875,
2484
+ "learning_rate": 3.4279701043260886e-06
2485
+ },
2486
+ {
2487
+ "step": 277,
2488
+ "epoch": 1.8779661016949154,
2489
+ "cpu_mem": 1.512288256,
2490
+ "gpu_mem": 4.519000576,
2491
+ "loss": 0.6379,
2492
+ "grad_norm": 5.958452224731445,
2493
+ "learning_rate": 3.0589287663461472e-06
2494
+ },
2495
+ {
2496
+ "step": 278,
2497
+ "epoch": 1.8847457627118644,
2498
+ "cpu_mem": 1.512288256,
2499
+ "gpu_mem": 4.519163392,
2500
+ "loss": 0.4834,
2501
+ "grad_norm": 4.944537162780762,
2502
+ "learning_rate": 2.710695410593994e-06
2503
+ },
2504
+ {
2505
+ "step": 279,
2506
+ "epoch": 1.8915254237288135,
2507
+ "cpu_mem": 1.512288256,
2508
+ "gpu_mem": 4.519103488,
2509
+ "loss": 0.4909,
2510
+ "grad_norm": 4.551985740661621,
2511
+ "learning_rate": 2.3833193495825853e-06
2512
+ },
2513
+ {
2514
+ "step": 280,
2515
+ "epoch": 1.8983050847457628,
2516
+ "cpu_mem": 1.512288256,
2517
+ "gpu_mem": 4.51908352,
2518
+ "loss": 0.5207,
2519
+ "grad_norm": 4.847994327545166,
2520
+ "learning_rate": 2.076846942272026e-06
2521
+ },
2522
+ {
2523
+ "step": 281,
2524
+ "epoch": 1.905084745762712,
2525
+ "cpu_mem": 1.512288256,
2526
+ "gpu_mem": 4.519019008,
2527
+ "loss": 0.5324,
2528
+ "grad_norm": 5.380536079406738,
2529
+ "learning_rate": 1.791321587504768e-06
2530
+ },
2531
+ {
2532
+ "step": 282,
2533
+ "epoch": 1.911864406779661,
2534
+ "cpu_mem": 1.512288256,
2535
+ "gpu_mem": 4.519447552,
2536
+ "loss": 0.4495,
2537
+ "grad_norm": 5.5009050369262695,
2538
+ "learning_rate": 1.5267837178600972e-06
2539
+ },
2540
+ {
2541
+ "step": 283,
2542
+ "epoch": 1.9186440677966101,
2543
+ "cpu_mem": 1.512288256,
2544
+ "gpu_mem": 4.519154176,
2545
+ "loss": 0.4852,
2546
+ "grad_norm": 5.446907997131348,
2547
+ "learning_rate": 1.2832707939284427e-06
2548
+ },
2549
+ {
2550
+ "step": 284,
2551
+ "epoch": 1.9254237288135592,
2552
+ "cpu_mem": 1.512288256,
2553
+ "gpu_mem": 4.519009792,
2554
+ "loss": 0.4832,
2555
+ "grad_norm": 4.65576171875,
2556
+ "learning_rate": 1.0608172990067553e-06
2557
+ },
2558
+ {
2559
+ "step": 285,
2560
+ "epoch": 1.9322033898305084,
2561
+ "cpu_mem": 1.512288256,
2562
+ "gpu_mem": 4.519063552,
2563
+ "loss": 0.4847,
2564
+ "grad_norm": 4.889958381652832,
2565
+ "learning_rate": 8.594547342153979e-07
2566
+ },
2567
+ {
2568
+ "step": 286,
2569
+ "epoch": 1.9389830508474577,
2570
+ "cpu_mem": 1.512288256,
2571
+ "gpu_mem": 4.519481344,
2572
+ "loss": 0.4748,
2573
+ "grad_norm": 5.272000789642334,
2574
+ "learning_rate": 6.792116140373116e-07
2575
+ },
2576
+ {
2577
+ "step": 287,
2578
+ "epoch": 1.9457627118644067,
2579
+ "cpu_mem": 1.512288256,
2580
+ "gpu_mem": 4.519250944,
2581
+ "loss": 0.4557,
2582
+ "grad_norm": 4.938218116760254,
2583
+ "learning_rate": 5.201134622801473e-07
2584
+ },
2585
+ {
2586
+ "step": 288,
2587
+ "epoch": 1.9525423728813558,
2588
+ "cpu_mem": 1.512288256,
2589
+ "gpu_mem": 4.519035904,
2590
+ "loss": 0.481,
2591
+ "grad_norm": 5.558644771575928,
2592
+ "learning_rate": 3.821828084619727e-07
2593
+ },
2594
+ {
2595
+ "step": 289,
2596
+ "epoch": 1.959322033898305,
2597
+ "cpu_mem": 1.512288256,
2598
+ "gpu_mem": 4.519120384,
2599
+ "loss": 0.4614,
2600
+ "grad_norm": 5.039109706878662,
2601
+ "learning_rate": 2.654391846207915e-07
2602
+ },
2603
+ {
2604
+ "step": 290,
2605
+ "epoch": 1.9661016949152543,
2606
+ "cpu_mem": 1.512288256,
2607
+ "gpu_mem": 4.51904512,
2608
+ "loss": 0.5343,
2609
+ "grad_norm": 5.333126068115234,
2610
+ "learning_rate": 1.6989912254880556e-07
2611
+ },
2612
+ {
2613
+ "step": 291,
2614
+ "epoch": 1.9728813559322034,
2615
+ "cpu_mem": 1.512288256,
2616
+ "gpu_mem": 4.519080448,
2617
+ "loss": 0.6398,
2618
+ "grad_norm": 5.9029459953308105,
2619
+ "learning_rate": 9.557615145123765e-08
2620
+ },
2621
+ {
2622
+ "step": 292,
2623
+ "epoch": 1.9796610169491524,
2624
+ "cpu_mem": 1.512288256,
2625
+ "gpu_mem": 4.519163392,
2626
+ "loss": 0.4624,
2627
+ "grad_norm": 5.084424018859863,
2628
+ "learning_rate": 4.248079603064724e-08
2629
+ },
2630
+ {
2631
+ "step": 293,
2632
+ "epoch": 1.9864406779661017,
2633
+ "cpu_mem": 1.512288256,
2634
+ "gpu_mem": 4.519080448,
2635
+ "loss": 0.5694,
2636
+ "grad_norm": 5.880096435546875,
2637
+ "learning_rate": 1.0620574996372811e-08
2638
+ },
2639
+ {
2640
+ "step": 294,
2641
+ "epoch": 1.993220338983051,
2642
+ "cpu_mem": 1.512288256,
2643
+ "gpu_mem": 4.51910656,
2644
+ "loss": 0.5335,
2645
+ "grad_norm": 5.264451503753662,
2646
+ "learning_rate": 0.0
2647
+ },
2648
+ {
2649
+ "step": 294,
2650
+ "epoch": 1.993220338983051,
2651
+ "cpu_mem": 1.512288256,
2652
+ "gpu_mem": 4.51910656,
2653
+ "train_runtime": 4458.5149,
2654
+ "train_samples_per_second": 4.229,
2655
+ "train_steps_per_second": 0.066,
2656
+ "total_flos": 0.0,
2657
+ "train_loss": 0.7076091230118355
2658
+ }
2659
+ ]
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 2,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "hellaswag",
3
+ "results": 0.7826130252937662
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "HELLASWAG",
5
+ "dataset_id": "Rowan/hellaswag",
6
+ "preprocess_id": "hellaswag_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1577576
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 1,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-hellaswag-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T16:43:23.732951"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r2-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 32,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "hellaswag",
3
+ "results": 0.33917546305516827
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "HELLASWAG",
5
+ "dataset_id": "Rowan/hellaswag",
6
+ "preprocess_id": "hellaswag_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 25389056
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 1,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-hellaswag-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T06:39:18.710581"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r32-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 8,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "hellaswag",
3
+ "results": 0.2504481179047998
4
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "HELLASWAG",
5
+ "dataset_id": "Rowan/hellaswag",
6
+ "preprocess_id": "hellaswag_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_A",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 6317696
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 1,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_A-hellaswag-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-30T23:40:44.452046"
38
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-hellaswag-r8-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 2,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "A"
30
+ }
TinyLlama_v1.1-abl_A/TinyLlama_v1.1-abl_A-logiqa-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "logiqa",
3
+ "results": 0.28465193141912826
4
+ }