martinkorelic commited on
Commit
21935ca
·
verified ·
1 Parent(s): 5464e23

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r2-a2/adapter_config.json +30 -0
  2. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r2-a2/eval_results.json +4 -0
  3. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r2-a2/training_configuration.json +38 -0
  4. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r2-a2/training_logs.json +625 -0
  5. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r32-a2/adapter_config.json +30 -0
  6. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r32-a2/eval_results.json +4 -0
  7. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r32-a2/training_configuration.json +38 -0
  8. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r32-a2/training_logs.json +625 -0
  9. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r8-a2/adapter_config.json +30 -0
  10. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r8-a2/eval_results.json +4 -0
  11. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r8-a2/training_configuration.json +38 -0
  12. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r8-a2/training_logs.json +625 -0
  13. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r2-a2/adapter_config.json +30 -0
  14. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r2-a2/eval_results.json +4 -0
  15. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r2-a2/training_configuration.json +38 -0
  16. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r2-a2/training_logs.json +1273 -0
  17. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r32-a2/adapter_config.json +30 -0
  18. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r32-a2/eval_results.json +4 -0
  19. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r32-a2/training_configuration.json +38 -0
  20. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r32-a2/training_logs.json +1273 -0
  21. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r8-a2/adapter_config.json +30 -0
  22. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r8-a2/eval_results.json +4 -0
  23. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r8-a2/training_configuration.json +38 -0
  24. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r8-a2/training_logs.json +1273 -0
  25. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r2-a2/adapter_config.json +30 -0
  26. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r2-a2/eval_results.json +4 -0
  27. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r2-a2/training_configuration.json +38 -0
  28. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r2-a2/training_logs.json +2659 -0
  29. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r32-a2/adapter_config.json +30 -0
  30. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r32-a2/eval_results.json +4 -0
  31. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r32-a2/training_configuration.json +38 -0
  32. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r32-a2/training_logs.json +2659 -0
  33. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r8-a2/adapter_config.json +30 -0
  34. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r8-a2/training_configuration.json +38 -0
  35. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r2-a2/adapter_config.json +30 -0
  36. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r2-a2/eval_results.json +4 -0
  37. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r2-a2/training_configuration.json +38 -0
  38. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r2-a2/training_logs.json +0 -0
  39. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r8-a2/adapter_config.json +30 -0
  40. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r8-a2/eval_results.json +4 -0
  41. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r8-a2/training_configuration.json +38 -0
  42. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r8-a2/training_logs.json +0 -0
  43. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r32-a2/adapter_config.json +30 -0
  44. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r32-a2/eval_results.json +4 -0
  45. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r32-a2/training_configuration.json +38 -0
  46. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r32-a2/training_logs.json +0 -0
  47. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r8-a2/adapter_config.json +30 -0
  48. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r8-a2/eval_results.json +4 -0
  49. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r8-a2/training_configuration.json +38 -0
  50. TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r8-a2/training_logs.json +0 -0
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 2,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.38139931740614336
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1182720
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-arc_c-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T13:31:46.211850"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r2-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 1.506254848,
6
+ "gpu_mem": 4.425348608,
7
+ "loss": 4.4614,
8
+ "grad_norm": 256.6861267089844,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 1.506451456,
15
+ "gpu_mem": 4.43480064,
16
+ "loss": 4.6994,
17
+ "grad_norm": 261.9680480957031,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 1.506451456,
24
+ "gpu_mem": 4.43483136,
25
+ "loss": 2.2137,
26
+ "grad_norm": 135.3577423095703,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 1.506648064,
33
+ "gpu_mem": 4.434797568,
34
+ "loss": 1.5723,
35
+ "grad_norm": 16.541139602661133,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 1.506648064,
42
+ "gpu_mem": 4.43478528,
43
+ "loss": 1.4149,
44
+ "grad_norm": 9.431467056274414,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 1.506648064,
51
+ "gpu_mem": 4.434848256,
52
+ "loss": 1.4493,
53
+ "grad_norm": 18.17936134338379,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 1.506648064,
60
+ "gpu_mem": 4.4348544,
61
+ "loss": 1.5623,
62
+ "grad_norm": 17.265527725219727,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 1.506648064,
69
+ "gpu_mem": 4.434812928,
70
+ "loss": 1.3595,
71
+ "grad_norm": 3.7705695629119873,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 1.506648064,
78
+ "gpu_mem": 4.43480832,
79
+ "loss": 1.3274,
80
+ "grad_norm": 7.432311058044434,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 1.506648064,
87
+ "gpu_mem": 4.434797568,
88
+ "loss": 1.6472,
89
+ "grad_norm": 23.43598747253418,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 1.506648064,
96
+ "gpu_mem": 4.43480832,
97
+ "loss": 1.5246,
98
+ "grad_norm": 11.669015884399414,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 1.506648064,
105
+ "gpu_mem": 4.434832896,
106
+ "loss": 1.4175,
107
+ "grad_norm": 7.646242618560791,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 1.506648064,
114
+ "gpu_mem": 4.434832896,
115
+ "loss": 1.3104,
116
+ "grad_norm": 16.06755828857422,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 1.506648064,
123
+ "gpu_mem": 4.434780672,
124
+ "loss": 1.5926,
125
+ "grad_norm": 24.16870880126953,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 1.506648064,
132
+ "gpu_mem": 4.434855936,
133
+ "loss": 1.5066,
134
+ "grad_norm": 12.364516258239746,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 1.506648064,
141
+ "gpu_mem": 4.434849792,
142
+ "loss": 1.4177,
143
+ "grad_norm": 11.54814624786377,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 1.506648064,
150
+ "gpu_mem": 4.4348544,
151
+ "loss": 1.3599,
152
+ "grad_norm": 5.233391284942627,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 1.506648064,
159
+ "gpu_mem": 4.439542272,
160
+ "loss": 2.0643,
161
+ "grad_norm": 10.010595321655273,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 1.506648064,
168
+ "gpu_mem": 4.439540736,
169
+ "loss": 1.4071,
170
+ "grad_norm": 7.176828861236572,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 1.506648064,
177
+ "gpu_mem": 4.43951616,
178
+ "loss": 1.3075,
179
+ "grad_norm": 5.2276411056518555,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 1.506648064,
186
+ "gpu_mem": 4.43952384,
187
+ "loss": 1.4644,
188
+ "grad_norm": 11.008788108825684,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 1.506648064,
195
+ "gpu_mem": 4.439553024,
196
+ "loss": 1.4464,
197
+ "grad_norm": 10.353914260864258,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 1.506648064,
204
+ "gpu_mem": 4.439582208,
205
+ "loss": 1.3721,
206
+ "grad_norm": 5.877532482147217,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 1.506648064,
213
+ "gpu_mem": 4.439525376,
214
+ "loss": 1.3644,
215
+ "grad_norm": 4.342014312744141,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 1.506648064,
222
+ "gpu_mem": 4.439594496,
223
+ "loss": 1.3209,
224
+ "grad_norm": 3.078639507293701,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 1.506648064,
231
+ "gpu_mem": 4.439551488,
232
+ "loss": 1.3689,
233
+ "grad_norm": 3.996753454208374,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 1.506648064,
240
+ "gpu_mem": 4.439510016,
241
+ "loss": 1.415,
242
+ "grad_norm": 4.786011219024658,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 1.506648064,
249
+ "gpu_mem": 4.439556096,
250
+ "loss": 1.5722,
251
+ "grad_norm": 8.570415496826172,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 1.506648064,
258
+ "gpu_mem": 4.439551488,
259
+ "loss": 1.3872,
260
+ "grad_norm": 2.8165605068206787,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 1.506648064,
267
+ "gpu_mem": 4.439540736,
268
+ "loss": 1.3614,
269
+ "grad_norm": 3.3478944301605225,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 1.506648064,
276
+ "gpu_mem": 4.439571456,
277
+ "loss": 1.3404,
278
+ "grad_norm": 2.628307819366455,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 1.506648064,
285
+ "gpu_mem": 4.439580672,
286
+ "loss": 1.3803,
287
+ "grad_norm": 4.98179292678833,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 1.506648064,
294
+ "gpu_mem": 4.439560704,
295
+ "loss": 1.4267,
296
+ "grad_norm": 6.194660663604736,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 1.506648064,
303
+ "gpu_mem": 4.4395392,
304
+ "loss": 1.4209,
305
+ "grad_norm": 5.258482456207275,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 1.506648064,
312
+ "gpu_mem": 4.439427072,
313
+ "loss": 2.1003,
314
+ "grad_norm": 6.661643981933594,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 1.506648064,
321
+ "gpu_mem": 4.434826752,
322
+ "loss": 1.399,
323
+ "grad_norm": 5.453505039215088,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 1.506648064,
330
+ "gpu_mem": 4.434835968,
331
+ "loss": 1.3992,
332
+ "grad_norm": 4.897593975067139,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 1.506648064,
339
+ "gpu_mem": 4.434806784,
340
+ "loss": 1.3695,
341
+ "grad_norm": 4.509614944458008,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 1.506648064,
348
+ "gpu_mem": 4.434825216,
349
+ "loss": 1.3894,
350
+ "grad_norm": 4.451006889343262,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 1.506648064,
357
+ "gpu_mem": 4.434802176,
358
+ "loss": 1.3774,
359
+ "grad_norm": 2.4717953205108643,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 1.506648064,
366
+ "gpu_mem": 4.434803712,
367
+ "loss": 1.3856,
368
+ "grad_norm": 2.598550796508789,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 1.506648064,
375
+ "gpu_mem": 4.434832896,
376
+ "loss": 1.3837,
377
+ "grad_norm": 5.809165954589844,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 1.506648064,
384
+ "gpu_mem": 4.434848256,
385
+ "loss": 1.3518,
386
+ "grad_norm": 4.0668511390686035,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 1.506648064,
393
+ "gpu_mem": 4.434866688,
394
+ "loss": 1.3572,
395
+ "grad_norm": 3.543848991394043,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 1.506648064,
402
+ "gpu_mem": 4.434820608,
403
+ "loss": 1.3099,
404
+ "grad_norm": 2.2563862800598145,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 1.506648064,
411
+ "gpu_mem": 4.434814464,
412
+ "loss": 1.2907,
413
+ "grad_norm": 3.035221815109253,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 1.506648064,
420
+ "gpu_mem": 4.43480832,
421
+ "loss": 1.2951,
422
+ "grad_norm": 2.453887462615967,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 1.506648064,
429
+ "gpu_mem": 4.434812928,
430
+ "loss": 1.2507,
431
+ "grad_norm": 2.245753526687622,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 1.506648064,
438
+ "gpu_mem": 4.434803712,
439
+ "loss": 1.3112,
440
+ "grad_norm": 2.7252495288848877,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 1.506648064,
447
+ "gpu_mem": 4.43478528,
448
+ "loss": 1.3279,
449
+ "grad_norm": 2.834775447845459,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 1.506648064,
456
+ "gpu_mem": 4.434809856,
457
+ "loss": 1.3215,
458
+ "grad_norm": 3.576658010482788,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 1.506648064,
465
+ "gpu_mem": 4.434837504,
466
+ "loss": 1.351,
467
+ "grad_norm": 2.464062452316284,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 1.506648064,
474
+ "gpu_mem": 4.439537664,
475
+ "loss": 1.9408,
476
+ "grad_norm": 4.601680755615234,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 1.506648064,
483
+ "gpu_mem": 4.439506944,
484
+ "loss": 1.3573,
485
+ "grad_norm": 3.305359363555908,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 1.506648064,
492
+ "gpu_mem": 4.439540736,
493
+ "loss": 1.3641,
494
+ "grad_norm": 4.420423984527588,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 1.506648064,
501
+ "gpu_mem": 4.439614464,
502
+ "loss": 1.3082,
503
+ "grad_norm": 2.968301296234131,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 1.506648064,
510
+ "gpu_mem": 4.439557632,
511
+ "loss": 1.287,
512
+ "grad_norm": 2.3714208602905273,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 1.506648064,
519
+ "gpu_mem": 4.439551488,
520
+ "loss": 1.2157,
521
+ "grad_norm": 3.4141745567321777,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 1.506648064,
528
+ "gpu_mem": 4.439602176,
529
+ "loss": 1.2699,
530
+ "grad_norm": 3.1746935844421387,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 1.506648064,
537
+ "gpu_mem": 4.439528448,
538
+ "loss": 1.3339,
539
+ "grad_norm": 4.8189496994018555,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 1.506648064,
546
+ "gpu_mem": 4.439542272,
547
+ "loss": 1.341,
548
+ "grad_norm": 4.947413921356201,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 1.506648064,
555
+ "gpu_mem": 4.439543808,
556
+ "loss": 1.2974,
557
+ "grad_norm": 3.260141611099243,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 1.506648064,
564
+ "gpu_mem": 4.439533056,
565
+ "loss": 1.3065,
566
+ "grad_norm": 3.7296063899993896,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 1.506648064,
573
+ "gpu_mem": 4.439549952,
574
+ "loss": 1.2641,
575
+ "grad_norm": 4.162058353424072,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 1.506648064,
582
+ "gpu_mem": 4.439571456,
583
+ "loss": 1.2935,
584
+ "grad_norm": 3.655007839202881,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 1.506648064,
591
+ "gpu_mem": 4.43956224,
592
+ "loss": 1.2682,
593
+ "grad_norm": 3.4037258625030518,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 1.506648064,
600
+ "gpu_mem": 4.439588352,
601
+ "loss": 1.2646,
602
+ "grad_norm": 3.657987117767334,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 1.506648064,
609
+ "gpu_mem": 4.4395392,
610
+ "loss": 1.2839,
611
+ "grad_norm": 4.297943115234375,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 1.506648064,
618
+ "gpu_mem": 4.4395392,
619
+ "train_runtime": 380.012,
620
+ "train_samples_per_second": 11.779,
621
+ "train_steps_per_second": 0.179,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.5091714701231789
624
+ }
625
+ ]
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 32,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.3293515358361775
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 13009920
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-arc_c-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-09-01T03:38:45.959130"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r32-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 1.49262336,
6
+ "gpu_mem": 4.519966208,
7
+ "loss": 4.4614,
8
+ "grad_norm": 15.242693901062012,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 1.492819968,
15
+ "gpu_mem": 4.62403584,
16
+ "loss": 4.6994,
17
+ "grad_norm": 15.576388359069824,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 1.492819968,
24
+ "gpu_mem": 4.62406656,
25
+ "loss": 3.5902,
26
+ "grad_norm": 14.862706184387207,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 1.492819968,
33
+ "gpu_mem": 4.624032768,
34
+ "loss": 2.4093,
35
+ "grad_norm": 8.196996688842773,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 1.492819968,
42
+ "gpu_mem": 4.62402048,
43
+ "loss": 1.8535,
44
+ "grad_norm": 4.226119518280029,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 1.492819968,
51
+ "gpu_mem": 4.624083456,
52
+ "loss": 1.7171,
53
+ "grad_norm": 1.8783652782440186,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 1.492819968,
60
+ "gpu_mem": 4.6240896,
61
+ "loss": 1.5023,
62
+ "grad_norm": 0.8889955282211304,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 1.492819968,
69
+ "gpu_mem": 4.624048128,
70
+ "loss": 1.4388,
71
+ "grad_norm": 0.607149064540863,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 1.492819968,
78
+ "gpu_mem": 4.62404352,
79
+ "loss": 1.4488,
80
+ "grad_norm": 1.5999906063079834,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 1.492819968,
87
+ "gpu_mem": 4.624032768,
88
+ "loss": 1.4183,
89
+ "grad_norm": 0.8587966561317444,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 1.492819968,
96
+ "gpu_mem": 4.62404352,
97
+ "loss": 1.3995,
98
+ "grad_norm": 0.7488240599632263,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 1.492819968,
105
+ "gpu_mem": 4.624068096,
106
+ "loss": 1.4221,
107
+ "grad_norm": 0.9820311665534973,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 1.492819968,
114
+ "gpu_mem": 4.624068096,
115
+ "loss": 1.3384,
116
+ "grad_norm": 0.888247013092041,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 1.492819968,
123
+ "gpu_mem": 4.624015872,
124
+ "loss": 1.5276,
125
+ "grad_norm": 1.6412670612335205,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 1.492819968,
132
+ "gpu_mem": 4.624091136,
133
+ "loss": 1.6764,
134
+ "grad_norm": 2.260718822479248,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 1.492819968,
141
+ "gpu_mem": 4.624084992,
142
+ "loss": 1.4155,
143
+ "grad_norm": 0.7070791125297546,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 1.492414464,
150
+ "gpu_mem": 4.6240896,
151
+ "loss": 1.3948,
152
+ "grad_norm": 0.6293061971664429,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 1.492414464,
159
+ "gpu_mem": 4.676086272,
160
+ "loss": 1.9828,
161
+ "grad_norm": 0.523653507232666,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 1.492414464,
168
+ "gpu_mem": 4.676084736,
169
+ "loss": 1.3436,
170
+ "grad_norm": 0.35439494252204895,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 1.492414464,
177
+ "gpu_mem": 4.67606016,
178
+ "loss": 1.4187,
179
+ "grad_norm": 0.9338923096656799,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 1.492414464,
186
+ "gpu_mem": 4.67606784,
187
+ "loss": 1.3475,
188
+ "grad_norm": 0.28095149993896484,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 1.492414464,
195
+ "gpu_mem": 4.676097024,
196
+ "loss": 1.3209,
197
+ "grad_norm": 0.2404586225748062,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 1.492414464,
204
+ "gpu_mem": 4.676126208,
205
+ "loss": 1.3192,
206
+ "grad_norm": 0.388563334941864,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 1.492414464,
213
+ "gpu_mem": 4.676069376,
214
+ "loss": 1.3566,
215
+ "grad_norm": 0.3809518814086914,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 1.492414464,
222
+ "gpu_mem": 4.676138496,
223
+ "loss": 1.2867,
224
+ "grad_norm": 0.30288708209991455,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 1.492414464,
231
+ "gpu_mem": 4.676095488,
232
+ "loss": 1.3138,
233
+ "grad_norm": 0.23125135898590088,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 1.492414464,
240
+ "gpu_mem": 4.676054016,
241
+ "loss": 1.36,
242
+ "grad_norm": 0.4470292925834656,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 1.492414464,
249
+ "gpu_mem": 4.676100096,
250
+ "loss": 1.5448,
251
+ "grad_norm": 1.1614753007888794,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 1.492414464,
258
+ "gpu_mem": 4.676095488,
259
+ "loss": 1.3743,
260
+ "grad_norm": 0.4348692297935486,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 1.492414464,
267
+ "gpu_mem": 4.676084736,
268
+ "loss": 1.3902,
269
+ "grad_norm": 0.5657250285148621,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 1.492414464,
276
+ "gpu_mem": 4.676115456,
277
+ "loss": 1.3738,
278
+ "grad_norm": 0.4655613303184509,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 1.492414464,
285
+ "gpu_mem": 4.676124672,
286
+ "loss": 1.3621,
287
+ "grad_norm": 0.28164371848106384,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 1.492414464,
294
+ "gpu_mem": 4.676104704,
295
+ "loss": 1.3797,
296
+ "grad_norm": 0.26845723390579224,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 1.492414464,
303
+ "gpu_mem": 4.6760832,
304
+ "loss": 1.3641,
305
+ "grad_norm": 0.3237358331680298,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 1.492414464,
312
+ "gpu_mem": 4.675971072,
313
+ "loss": 2.0906,
314
+ "grad_norm": 0.5221468210220337,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 1.492414464,
321
+ "gpu_mem": 4.624061952,
322
+ "loss": 1.4109,
323
+ "grad_norm": 0.6460707187652588,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 1.492414464,
330
+ "gpu_mem": 4.624071168,
331
+ "loss": 1.3558,
332
+ "grad_norm": 0.27621448040008545,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 1.492414464,
339
+ "gpu_mem": 4.624041984,
340
+ "loss": 1.3643,
341
+ "grad_norm": 0.31860432028770447,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 1.492414464,
348
+ "gpu_mem": 4.624060416,
349
+ "loss": 1.3291,
350
+ "grad_norm": 0.14847567677497864,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 1.492414464,
357
+ "gpu_mem": 4.624037376,
358
+ "loss": 1.3953,
359
+ "grad_norm": 0.37305229902267456,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 1.492414464,
366
+ "gpu_mem": 4.624038912,
367
+ "loss": 1.3988,
368
+ "grad_norm": 0.34824004769325256,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 1.492414464,
375
+ "gpu_mem": 4.624068096,
376
+ "loss": 1.3218,
377
+ "grad_norm": 0.4003921449184418,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 1.492414464,
384
+ "gpu_mem": 4.624083456,
385
+ "loss": 1.3708,
386
+ "grad_norm": 0.5072891116142273,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 1.492414464,
393
+ "gpu_mem": 4.624101888,
394
+ "loss": 1.3432,
395
+ "grad_norm": 0.2869243025779724,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 1.492414464,
402
+ "gpu_mem": 4.624055808,
403
+ "loss": 1.326,
404
+ "grad_norm": 0.1572774201631546,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 1.492414464,
411
+ "gpu_mem": 4.624049664,
412
+ "loss": 1.3203,
413
+ "grad_norm": 0.38407614827156067,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 1.492414464,
420
+ "gpu_mem": 4.62404352,
421
+ "loss": 1.3093,
422
+ "grad_norm": 0.3907293677330017,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 1.492414464,
429
+ "gpu_mem": 4.624048128,
430
+ "loss": 1.2786,
431
+ "grad_norm": 0.27290043234825134,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 1.492414464,
438
+ "gpu_mem": 4.624038912,
439
+ "loss": 1.3194,
440
+ "grad_norm": 0.35774123668670654,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 1.492414464,
447
+ "gpu_mem": 4.62402048,
448
+ "loss": 1.3196,
449
+ "grad_norm": 0.16639018058776855,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 1.492414464,
456
+ "gpu_mem": 4.624045056,
457
+ "loss": 1.3523,
458
+ "grad_norm": 0.514198899269104,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 1.492414464,
465
+ "gpu_mem": 4.624072704,
466
+ "loss": 1.3465,
467
+ "grad_norm": 0.18968720734119415,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 1.492414464,
474
+ "gpu_mem": 4.676081664,
475
+ "loss": 1.9683,
476
+ "grad_norm": 0.4923784136772156,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 1.492414464,
483
+ "gpu_mem": 4.676050944,
484
+ "loss": 1.3609,
485
+ "grad_norm": 0.3217475414276123,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 1.492414464,
492
+ "gpu_mem": 4.676084736,
493
+ "loss": 1.3716,
494
+ "grad_norm": 0.44956937432289124,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 1.492414464,
501
+ "gpu_mem": 4.676158464,
502
+ "loss": 1.3291,
503
+ "grad_norm": 0.2953140139579773,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 1.492414464,
510
+ "gpu_mem": 4.676101632,
511
+ "loss": 1.3283,
512
+ "grad_norm": 0.19999608397483826,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 1.492414464,
519
+ "gpu_mem": 4.676095488,
520
+ "loss": 1.2358,
521
+ "grad_norm": 0.26531586050987244,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 1.492414464,
528
+ "gpu_mem": 4.676146176,
529
+ "loss": 1.3186,
530
+ "grad_norm": 0.301336407661438,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 1.492414464,
537
+ "gpu_mem": 4.676072448,
538
+ "loss": 1.367,
539
+ "grad_norm": 0.4051586091518402,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 1.492414464,
546
+ "gpu_mem": 4.676086272,
547
+ "loss": 1.3862,
548
+ "grad_norm": 0.36729010939598083,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 1.492414464,
555
+ "gpu_mem": 4.676087808,
556
+ "loss": 1.328,
557
+ "grad_norm": 0.2522547245025635,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 1.492414464,
564
+ "gpu_mem": 4.676077056,
565
+ "loss": 1.3359,
566
+ "grad_norm": 0.25604918599128723,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 1.492414464,
573
+ "gpu_mem": 4.676093952,
574
+ "loss": 1.3089,
575
+ "grad_norm": 0.3017137348651886,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 1.492414464,
582
+ "gpu_mem": 4.676115456,
583
+ "loss": 1.3253,
584
+ "grad_norm": 0.31271353363990784,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 1.492414464,
591
+ "gpu_mem": 4.67610624,
592
+ "loss": 1.2868,
593
+ "grad_norm": 0.2766549587249756,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 1.492414464,
600
+ "gpu_mem": 4.676132352,
601
+ "loss": 1.3103,
602
+ "grad_norm": 0.2598274350166321,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 1.492414464,
609
+ "gpu_mem": 4.6760832,
610
+ "loss": 1.3174,
611
+ "grad_norm": 0.3739108741283417,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 1.492414464,
618
+ "gpu_mem": 4.6760832,
619
+ "train_runtime": 383.1823,
620
+ "train_samples_per_second": 11.681,
621
+ "train_steps_per_second": 0.177,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.5497514447745155
624
+ }
625
+ ]
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 8,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_c",
3
+ "results": 0.38139931740614336
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_C",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 3548160
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-arc_c-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T20:33:13.257777"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_c-r8-a2/training_logs.json ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.05714285714285714,
5
+ "cpu_mem": 1.491714048,
6
+ "gpu_mem": 4.444272128,
7
+ "loss": 4.4614,
8
+ "grad_norm": 87.54730224609375,
9
+ "learning_rate": 4.285714285714285e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.11428571428571428,
14
+ "cpu_mem": 1.491714048,
15
+ "gpu_mem": 4.47264768,
16
+ "loss": 4.6994,
17
+ "grad_norm": 88.6588363647461,
18
+ "learning_rate": 8.57142857142857e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.17142857142857143,
23
+ "cpu_mem": 1.491714048,
24
+ "gpu_mem": 4.4726784,
25
+ "loss": 2.9296,
26
+ "grad_norm": 59.65691375732422,
27
+ "learning_rate": 0.00012857142857142855
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.22857142857142856,
32
+ "cpu_mem": 1.491910656,
33
+ "gpu_mem": 4.472644608,
34
+ "loss": 1.9248,
35
+ "grad_norm": 44.71550750732422,
36
+ "learning_rate": 0.0001714285714285714
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.2857142857142857,
41
+ "cpu_mem": 1.491910656,
42
+ "gpu_mem": 4.47263232,
43
+ "loss": 1.5841,
44
+ "grad_norm": 6.350735664367676,
45
+ "learning_rate": 0.00021428571428571427
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.34285714285714286,
50
+ "cpu_mem": 1.491910656,
51
+ "gpu_mem": 4.472695296,
52
+ "loss": 1.4618,
53
+ "grad_norm": 3.4582574367523193,
54
+ "learning_rate": 0.0002571428571428571
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.4,
59
+ "cpu_mem": 1.491910656,
60
+ "gpu_mem": 4.47270144,
61
+ "loss": 1.468,
62
+ "grad_norm": 4.226593971252441,
63
+ "learning_rate": 0.0003
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.45714285714285713,
68
+ "cpu_mem": 1.491910656,
69
+ "gpu_mem": 4.472659968,
70
+ "loss": 1.4115,
71
+ "grad_norm": 4.548079490661621,
72
+ "learning_rate": 0.00029980111348272456
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.5142857142857142,
77
+ "cpu_mem": 1.491910656,
78
+ "gpu_mem": 4.47265536,
79
+ "loss": 1.3269,
80
+ "grad_norm": 3.2502329349517822,
81
+ "learning_rate": 0.00029920498134218835
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.5714285714285714,
86
+ "cpu_mem": 1.491910656,
87
+ "gpu_mem": 4.472644608,
88
+ "loss": 1.5354,
89
+ "grad_norm": 5.945767402648926,
90
+ "learning_rate": 0.0002982131844136615
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.6285714285714286,
95
+ "cpu_mem": 1.491910656,
96
+ "gpu_mem": 4.47265536,
97
+ "loss": 1.4694,
98
+ "grad_norm": 4.522561550140381,
99
+ "learning_rate": 0.0002968283527643036
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.6857142857142857,
104
+ "cpu_mem": 1.491910656,
105
+ "gpu_mem": 4.472679936,
106
+ "loss": 1.3612,
107
+ "grad_norm": 2.172001361846924,
108
+ "learning_rate": 0.000295054158718698
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.7428571428571429,
113
+ "cpu_mem": 1.491910656,
114
+ "gpu_mem": 4.472679936,
115
+ "loss": 1.3933,
116
+ "grad_norm": 4.974757194519043,
117
+ "learning_rate": 0.00029289530712050735
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.8,
122
+ "cpu_mem": 1.491910656,
123
+ "gpu_mem": 4.472627712,
124
+ "loss": 1.6926,
125
+ "grad_norm": 8.53721809387207,
126
+ "learning_rate": 0.000290357522856074
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.8571428571428571,
131
+ "cpu_mem": 1.491910656,
132
+ "gpu_mem": 4.472702976,
133
+ "loss": 1.682,
134
+ "grad_norm": 7.365475654602051,
135
+ "learning_rate": 0.0002874475356730507
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.9142857142857143,
140
+ "cpu_mem": 1.491910656,
141
+ "gpu_mem": 4.472696832,
142
+ "loss": 1.3926,
143
+ "grad_norm": 1.4433025121688843,
144
+ "learning_rate": 0.0002841730623343193
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.9714285714285714,
149
+ "cpu_mem": 1.491910656,
150
+ "gpu_mem": 4.47270144,
151
+ "loss": 1.3892,
152
+ "grad_norm": 2.147620677947998,
153
+ "learning_rate": 0.00028054278615452326
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 1.0285714285714285,
158
+ "cpu_mem": 1.491910656,
159
+ "gpu_mem": 4.486851072,
160
+ "loss": 2.0418,
161
+ "grad_norm": 2.8316280841827393,
162
+ "learning_rate": 0.0002765663339734778
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 1.0857142857142856,
167
+ "cpu_mem": 1.491910656,
168
+ "gpu_mem": 4.486849536,
169
+ "loss": 1.3974,
170
+ "grad_norm": 2.231830358505249,
171
+ "learning_rate": 0.00027225425062752165
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 1.1428571428571428,
176
+ "cpu_mem": 1.491910656,
177
+ "gpu_mem": 4.48682496,
178
+ "loss": 1.4072,
179
+ "grad_norm": 3.0868797302246094,
180
+ "learning_rate": 0.0002676179709865066
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 1.2,
185
+ "cpu_mem": 1.491910656,
186
+ "gpu_mem": 4.48683264,
187
+ "loss": 1.362,
188
+ "grad_norm": 1.2280853986740112,
189
+ "learning_rate": 0.0002626697896305779
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 1.2571428571428571,
194
+ "cpu_mem": 1.491910656,
195
+ "gpu_mem": 4.486861824,
196
+ "loss": 1.3332,
197
+ "grad_norm": 1.2861239910125732,
198
+ "learning_rate": 0.000257422828247159
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 1.3142857142857143,
203
+ "cpu_mem": 1.491910656,
204
+ "gpu_mem": 4.486891008,
205
+ "loss": 1.2994,
206
+ "grad_norm": 0.9643853306770325,
207
+ "learning_rate": 0.00025189100083459397
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 1.3714285714285714,
212
+ "cpu_mem": 1.491910656,
213
+ "gpu_mem": 4.486834176,
214
+ "loss": 1.3951,
215
+ "grad_norm": 2.0752029418945312,
216
+ "learning_rate": 0.0002460889768047263
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 1.4285714285714286,
221
+ "cpu_mem": 1.491910656,
222
+ "gpu_mem": 4.486903296,
223
+ "loss": 1.3398,
224
+ "grad_norm": 1.7827640771865845,
225
+ "learning_rate": 0.00024003214208225522
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 1.4857142857142858,
230
+ "cpu_mem": 1.491910656,
231
+ "gpu_mem": 4.486860288,
232
+ "loss": 1.3324,
233
+ "grad_norm": 1.2238781452178955,
234
+ "learning_rate": 0.00023373655830402968
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 1.5428571428571427,
239
+ "cpu_mem": 1.491910656,
240
+ "gpu_mem": 4.486818816,
241
+ "loss": 1.3679,
242
+ "grad_norm": 1.499398112297058,
243
+ "learning_rate": 0.00022721892022647462
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 1.6,
248
+ "cpu_mem": 1.491910656,
249
+ "gpu_mem": 4.486864896,
250
+ "loss": 1.4935,
251
+ "grad_norm": 3.0713043212890625,
252
+ "learning_rate": 0.000220496511454098
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 1.657142857142857,
257
+ "cpu_mem": 1.491910656,
258
+ "gpu_mem": 4.486860288,
259
+ "loss": 1.3437,
260
+ "grad_norm": 0.9689168930053711,
261
+ "learning_rate": 0.0002135871586064791
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 1.7142857142857144,
266
+ "cpu_mem": 1.491910656,
267
+ "gpu_mem": 4.486849536,
268
+ "loss": 1.3452,
269
+ "grad_norm": 1.1040581464767456,
270
+ "learning_rate": 0.00020650918404527775
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 1.7714285714285714,
275
+ "cpu_mem": 1.491910656,
276
+ "gpu_mem": 4.486880256,
277
+ "loss": 1.3348,
278
+ "grad_norm": 1.1760023832321167,
279
+ "learning_rate": 0.00019928135728662522
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 1.8285714285714287,
284
+ "cpu_mem": 1.491910656,
285
+ "gpu_mem": 4.486889472,
286
+ "loss": 1.3825,
287
+ "grad_norm": 1.7875124216079712,
288
+ "learning_rate": 0.00019192284522774142
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 1.8857142857142857,
293
+ "cpu_mem": 1.491910656,
294
+ "gpu_mem": 4.486869504,
295
+ "loss": 1.4018,
296
+ "grad_norm": 2.0181446075439453,
297
+ "learning_rate": 0.00018445316131976934
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 1.9428571428571428,
302
+ "cpu_mem": 1.491910656,
303
+ "gpu_mem": 4.486848,
304
+ "loss": 1.362,
305
+ "grad_norm": 1.2658624649047852,
306
+ "learning_rate": 0.00017689211382161034
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 2.0,
311
+ "cpu_mem": 1.491910656,
312
+ "gpu_mem": 4.486735872,
313
+ "loss": 2.084,
314
+ "grad_norm": 2.159017562866211,
315
+ "learning_rate": 0.00016925975327198266
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 2.057142857142857,
320
+ "cpu_mem": 1.491910656,
321
+ "gpu_mem": 4.472673792,
322
+ "loss": 1.3323,
323
+ "grad_norm": 1.2686223983764648,
324
+ "learning_rate": 0.00016157631931899697
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 2.1142857142857143,
329
+ "cpu_mem": 1.491910656,
330
+ "gpu_mem": 4.472683008,
331
+ "loss": 1.4262,
332
+ "grad_norm": 2.404953956604004,
333
+ "learning_rate": 0.0001538621870482483
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 2.1714285714285713,
338
+ "cpu_mem": 1.491910656,
339
+ "gpu_mem": 4.472653824,
340
+ "loss": 1.3707,
341
+ "grad_norm": 1.484898567199707,
342
+ "learning_rate": 0.00014613781295175172
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 2.2285714285714286,
347
+ "cpu_mem": 1.491910656,
348
+ "gpu_mem": 4.472672256,
349
+ "loss": 1.4011,
350
+ "grad_norm": 1.994706153869629,
351
+ "learning_rate": 0.00013842368068100303
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 2.2857142857142856,
356
+ "cpu_mem": 1.491910656,
357
+ "gpu_mem": 4.472649216,
358
+ "loss": 1.3828,
359
+ "grad_norm": 1.137582540512085,
360
+ "learning_rate": 0.00013074024672801731
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 2.342857142857143,
365
+ "cpu_mem": 1.491910656,
366
+ "gpu_mem": 4.472650752,
367
+ "loss": 1.3832,
368
+ "grad_norm": 1.2223880290985107,
369
+ "learning_rate": 0.00012310788617838966
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 2.4,
374
+ "cpu_mem": 1.491910656,
375
+ "gpu_mem": 4.472679936,
376
+ "loss": 1.3495,
377
+ "grad_norm": 1.9196335077285767,
378
+ "learning_rate": 0.00011554683868023067
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 2.4571428571428573,
383
+ "cpu_mem": 1.491910656,
384
+ "gpu_mem": 4.472695296,
385
+ "loss": 1.3356,
386
+ "grad_norm": 1.193792700767517,
387
+ "learning_rate": 0.00010807715477225858
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 2.5142857142857142,
392
+ "cpu_mem": 1.491910656,
393
+ "gpu_mem": 4.472713728,
394
+ "loss": 1.3222,
395
+ "grad_norm": 0.7033736705780029,
396
+ "learning_rate": 0.00010071864271337478
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 2.571428571428571,
401
+ "cpu_mem": 1.491910656,
402
+ "gpu_mem": 4.472667648,
403
+ "loss": 1.2936,
404
+ "grad_norm": 0.6882470846176147,
405
+ "learning_rate": 9.34908159547222e-05
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 2.6285714285714286,
410
+ "cpu_mem": 1.491910656,
411
+ "gpu_mem": 4.472661504,
412
+ "loss": 1.2888,
413
+ "grad_norm": 1.0970045328140259,
414
+ "learning_rate": 8.641284139352091e-05
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 2.685714285714286,
419
+ "cpu_mem": 1.491910656,
420
+ "gpu_mem": 4.47265536,
421
+ "loss": 1.2839,
422
+ "grad_norm": 0.9996795058250427,
423
+ "learning_rate": 7.950348854590204e-05
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 2.742857142857143,
428
+ "cpu_mem": 1.491910656,
429
+ "gpu_mem": 4.472659968,
430
+ "loss": 1.2405,
431
+ "grad_norm": 0.8805912137031555,
432
+ "learning_rate": 7.278107977352543e-05
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 2.8,
437
+ "cpu_mem": 1.491910656,
438
+ "gpu_mem": 4.472650752,
439
+ "loss": 1.2877,
440
+ "grad_norm": 1.0169543027877808,
441
+ "learning_rate": 6.626344169597031e-05
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 2.857142857142857,
446
+ "cpu_mem": 1.491910656,
447
+ "gpu_mem": 4.47263232,
448
+ "loss": 1.3195,
449
+ "grad_norm": 1.2063992023468018,
450
+ "learning_rate": 5.996785791774478e-05
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 2.914285714285714,
455
+ "cpu_mem": 1.491910656,
456
+ "gpu_mem": 4.472656896,
457
+ "loss": 1.2716,
458
+ "grad_norm": 1.298764944076538,
459
+ "learning_rate": 5.391102319527373e-05
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 2.9714285714285715,
464
+ "cpu_mem": 1.491910656,
465
+ "gpu_mem": 4.472684544,
466
+ "loss": 1.3397,
467
+ "grad_norm": 1.3159348964691162,
468
+ "learning_rate": 4.8108999165406026e-05
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 3.0285714285714285,
473
+ "cpu_mem": 1.491910656,
474
+ "gpu_mem": 4.486846464,
475
+ "loss": 1.9005,
476
+ "grad_norm": 1.5513451099395752,
477
+ "learning_rate": 4.257717175284103e-05
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 3.085714285714286,
482
+ "cpu_mem": 1.491910656,
483
+ "gpu_mem": 4.486815744,
484
+ "loss": 1.3357,
485
+ "grad_norm": 1.5055204629898071,
486
+ "learning_rate": 3.733021036942205e-05
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 3.142857142857143,
491
+ "cpu_mem": 1.491910656,
492
+ "gpu_mem": 4.486849536,
493
+ "loss": 1.338,
494
+ "grad_norm": 1.9263288974761963,
495
+ "learning_rate": 3.238202901349345e-05
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 3.2,
500
+ "cpu_mem": 1.491910656,
501
+ "gpu_mem": 4.486923264,
502
+ "loss": 1.2948,
503
+ "grad_norm": 1.05913245677948,
504
+ "learning_rate": 2.774574937247831e-05
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 3.257142857142857,
509
+ "cpu_mem": 1.491910656,
510
+ "gpu_mem": 4.486866432,
511
+ "loss": 1.2813,
512
+ "grad_norm": 1.0687165260314941,
513
+ "learning_rate": 2.3433666026522153e-05
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 3.314285714285714,
518
+ "cpu_mem": 1.491910656,
519
+ "gpu_mem": 4.486860288,
520
+ "loss": 1.1917,
521
+ "grad_norm": 1.3125996589660645,
522
+ "learning_rate": 1.945721384547671e-05
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 3.3714285714285714,
527
+ "cpu_mem": 1.491910656,
528
+ "gpu_mem": 4.486910976,
529
+ "loss": 1.2695,
530
+ "grad_norm": 1.1107628345489502,
531
+ "learning_rate": 1.5826937665680693e-05
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 3.4285714285714284,
536
+ "cpu_mem": 1.491910656,
537
+ "gpu_mem": 4.486837248,
538
+ "loss": 1.3273,
539
+ "grad_norm": 1.5512784719467163,
540
+ "learning_rate": 1.2552464326949302e-05
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 3.4857142857142858,
545
+ "cpu_mem": 1.491910656,
546
+ "gpu_mem": 4.486851072,
547
+ "loss": 1.3364,
548
+ "grad_norm": 1.1313272714614868,
549
+ "learning_rate": 9.64247714392597e-06
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 3.5428571428571427,
554
+ "cpu_mem": 1.491910656,
555
+ "gpu_mem": 4.486852608,
556
+ "loss": 1.3142,
557
+ "grad_norm": 1.285913109779358,
558
+ "learning_rate": 7.104692879492624e-06
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 3.6,
563
+ "cpu_mem": 1.491910656,
564
+ "gpu_mem": 4.486841856,
565
+ "loss": 1.3043,
566
+ "grad_norm": 1.1523516178131104,
567
+ "learning_rate": 4.945841281301943e-06
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 3.657142857142857,
572
+ "cpu_mem": 1.491910656,
573
+ "gpu_mem": 4.486858752,
574
+ "loss": 1.2581,
575
+ "grad_norm": 1.2959764003753662,
576
+ "learning_rate": 3.1716472356963286e-06
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 3.7142857142857144,
581
+ "cpu_mem": 1.491910656,
582
+ "gpu_mem": 4.486880256,
583
+ "loss": 1.307,
584
+ "grad_norm": 1.4972169399261475,
585
+ "learning_rate": 1.7868155863384415e-06
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 3.7714285714285714,
590
+ "cpu_mem": 1.491910656,
591
+ "gpu_mem": 4.48687104,
592
+ "loss": 1.2616,
593
+ "grad_norm": 1.236932635307312,
594
+ "learning_rate": 7.950186578116413e-07
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 3.8285714285714287,
599
+ "cpu_mem": 1.491910656,
600
+ "gpu_mem": 4.486897152,
601
+ "loss": 1.2551,
602
+ "grad_norm": 1.1077874898910522,
603
+ "learning_rate": 1.988865172754206e-07
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 3.8857142857142857,
608
+ "cpu_mem": 1.491910656,
609
+ "gpu_mem": 4.486848,
610
+ "loss": 1.2744,
611
+ "grad_norm": 1.5151118040084839,
612
+ "learning_rate": 0.0
613
+ },
614
+ {
615
+ "step": 68,
616
+ "epoch": 3.8857142857142857,
617
+ "cpu_mem": 1.491910656,
618
+ "gpu_mem": 4.486848,
619
+ "train_runtime": 380.8275,
620
+ "train_samples_per_second": 11.753,
621
+ "train_steps_per_second": 0.179,
622
+ "total_flos": 0.0,
623
+ "train_loss": 1.5159650935846216
624
+ }
625
+ ]
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 2,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_e",
3
+ "results": 0.48695286195286197
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_E",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1182720
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-arc_e-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T12:54:20.156634"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r2-a2/training_logs.json ADDED
@@ -0,0 +1,1273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.028169014084507043,
5
+ "cpu_mem": 1.50663168,
6
+ "gpu_mem": 4.425293312,
7
+ "loss": 4.6319,
8
+ "grad_norm": 259.6687927246094,
9
+ "learning_rate": 2.1428571428571425e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.056338028169014086,
14
+ "cpu_mem": 1.506828288,
15
+ "gpu_mem": 4.434816,
16
+ "loss": 4.4578,
17
+ "grad_norm": 265.4711608886719,
18
+ "learning_rate": 4.285714285714285e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.08450704225352113,
23
+ "cpu_mem": 1.507024896,
24
+ "gpu_mem": 4.434794496,
25
+ "loss": 3.1953,
26
+ "grad_norm": 169.68458557128906,
27
+ "learning_rate": 6.428571428571427e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.11267605633802817,
32
+ "cpu_mem": 1.507221504,
33
+ "gpu_mem": 4.434772992,
34
+ "loss": 2.375,
35
+ "grad_norm": 182.9171600341797,
36
+ "learning_rate": 8.57142857142857e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.14084507042253522,
41
+ "cpu_mem": 1.507418112,
42
+ "gpu_mem": 4.434814464,
43
+ "loss": 1.6471,
44
+ "grad_norm": 24.682857513427734,
45
+ "learning_rate": 0.00010714285714285714
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.16901408450704225,
50
+ "cpu_mem": 1.507418112,
51
+ "gpu_mem": 4.434789888,
52
+ "loss": 1.5066,
53
+ "grad_norm": 23.628555297851562,
54
+ "learning_rate": 0.00012857142857142855
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.19718309859154928,
59
+ "cpu_mem": 1.50761472,
60
+ "gpu_mem": 4.434812928,
61
+ "loss": 1.4403,
62
+ "grad_norm": 15.431472778320312,
63
+ "learning_rate": 0.00015
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.22535211267605634,
68
+ "cpu_mem": 1.50761472,
69
+ "gpu_mem": 4.434771456,
70
+ "loss": 1.3832,
71
+ "grad_norm": 12.077364921569824,
72
+ "learning_rate": 0.0001714285714285714
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.2535211267605634,
77
+ "cpu_mem": 1.50761472,
78
+ "gpu_mem": 4.434772992,
79
+ "loss": 1.37,
80
+ "grad_norm": 13.536406517028809,
81
+ "learning_rate": 0.00019285714285714286
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.28169014084507044,
86
+ "cpu_mem": 1.50761472,
87
+ "gpu_mem": 4.434768384,
88
+ "loss": 1.6613,
89
+ "grad_norm": 55.036895751953125,
90
+ "learning_rate": 0.00021428571428571427
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.30985915492957744,
95
+ "cpu_mem": 1.50761472,
96
+ "gpu_mem": 4.43484672,
97
+ "loss": 1.4147,
98
+ "grad_norm": 16.728759765625,
99
+ "learning_rate": 0.00023571428571428569
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.3380281690140845,
104
+ "cpu_mem": 1.50761472,
105
+ "gpu_mem": 4.434820608,
106
+ "loss": 1.3499,
107
+ "grad_norm": 8.593183517456055,
108
+ "learning_rate": 0.0002571428571428571
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.36619718309859156,
113
+ "cpu_mem": 1.50761472,
114
+ "gpu_mem": 4.434771456,
115
+ "loss": 1.4139,
116
+ "grad_norm": 10.364399909973145,
117
+ "learning_rate": 0.00027857142857142854
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.39436619718309857,
122
+ "cpu_mem": 1.50761472,
123
+ "gpu_mem": 4.43479296,
124
+ "loss": 1.3907,
125
+ "grad_norm": 8.241915702819824,
126
+ "learning_rate": 0.0003
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.4225352112676056,
131
+ "cpu_mem": 1.50761472,
132
+ "gpu_mem": 4.43476992,
133
+ "loss": 1.3206,
134
+ "grad_norm": 3.446953296661377,
135
+ "learning_rate": 0.0002999533773001224
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.4507042253521127,
140
+ "cpu_mem": 1.50761472,
141
+ "gpu_mem": 4.434774528,
142
+ "loss": 1.4047,
143
+ "grad_norm": 7.349297523498535,
144
+ "learning_rate": 0.0002998135381828383
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.4788732394366197,
149
+ "cpu_mem": 1.50761472,
150
+ "gpu_mem": 4.434811392,
151
+ "loss": 1.3294,
152
+ "grad_norm": 4.483480453491211,
153
+ "learning_rate": 0.00029958056957717696
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.5070422535211268,
158
+ "cpu_mem": 1.50761472,
159
+ "gpu_mem": 4.434822144,
160
+ "loss": 1.339,
161
+ "grad_norm": 5.043764114379883,
162
+ "learning_rate": 0.0002992546163048102
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.5352112676056338,
167
+ "cpu_mem": 1.50761472,
168
+ "gpu_mem": 4.434765312,
169
+ "loss": 1.3036,
170
+ "grad_norm": 5.584592819213867,
171
+ "learning_rate": 0.0002988358809900258
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.5633802816901409,
176
+ "cpu_mem": 1.50761472,
177
+ "gpu_mem": 4.434835968,
178
+ "loss": 1.3402,
179
+ "grad_norm": 2.576645851135254,
180
+ "learning_rate": 0.0002983246239337692
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.5915492957746479,
185
+ "cpu_mem": 1.50761472,
186
+ "gpu_mem": 4.434834432,
187
+ "loss": 1.3322,
188
+ "grad_norm": 4.193614959716797,
189
+ "learning_rate": 0.0002977211629518312
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.6197183098591549,
194
+ "cpu_mem": 1.50761472,
195
+ "gpu_mem": 4.434791424,
196
+ "loss": 1.3737,
197
+ "grad_norm": 4.681839942932129,
198
+ "learning_rate": 0.00029702587317728153
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.647887323943662,
203
+ "cpu_mem": 1.50761472,
204
+ "gpu_mem": 4.43480832,
205
+ "loss": 1.33,
206
+ "grad_norm": 3.810232400894165,
207
+ "learning_rate": 0.0002962391868272735
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.676056338028169,
212
+ "cpu_mem": 1.50761472,
213
+ "gpu_mem": 4.434765312,
214
+ "loss": 1.3134,
215
+ "grad_norm": 3.2136316299438477,
216
+ "learning_rate": 0.00029536159293436166
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.704225352112676,
221
+ "cpu_mem": 1.50761472,
222
+ "gpu_mem": 4.434794496,
223
+ "loss": 1.512,
224
+ "grad_norm": 13.608572006225586,
225
+ "learning_rate": 0.00029439363704250176
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.7323943661971831,
230
+ "cpu_mem": 1.50761472,
231
+ "gpu_mem": 4.434774528,
232
+ "loss": 1.6166,
233
+ "grad_norm": 14.765206336975098,
234
+ "learning_rate": 0.00029333592086792107
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.7605633802816901,
239
+ "cpu_mem": 1.50761472,
240
+ "gpu_mem": 4.43480064,
241
+ "loss": 1.3597,
242
+ "grad_norm": 4.8024702072143555,
243
+ "learning_rate": 0.0002921891019250697
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.7887323943661971,
248
+ "cpu_mem": 1.50761472,
249
+ "gpu_mem": 4.43480064,
250
+ "loss": 1.3922,
251
+ "grad_norm": 3.2379608154296875,
252
+ "learning_rate": 0.0002909538931178862
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.8169014084507042,
257
+ "cpu_mem": 1.50761472,
258
+ "gpu_mem": 4.434779136,
259
+ "loss": 1.3246,
260
+ "grad_norm": 4.987905025482178,
261
+ "learning_rate": 0.00028963106229663063
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.8450704225352113,
266
+ "cpu_mem": 1.50761472,
267
+ "gpu_mem": 4.43476992,
268
+ "loss": 1.3724,
269
+ "grad_norm": 4.277651786804199,
270
+ "learning_rate": 0.00028822143178056114
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.8732394366197183,
275
+ "cpu_mem": 1.50761472,
276
+ "gpu_mem": 4.434788352,
277
+ "loss": 1.3908,
278
+ "grad_norm": 3.794233560562134,
279
+ "learning_rate": 0.00028672587784675096
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.9014084507042254,
284
+ "cpu_mem": 1.50761472,
285
+ "gpu_mem": 4.434811392,
286
+ "loss": 1.35,
287
+ "grad_norm": 3.4455714225769043,
288
+ "learning_rate": 0.0002851453301853628
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.9295774647887324,
293
+ "cpu_mem": 1.50761472,
294
+ "gpu_mem": 4.43480832,
295
+ "loss": 1.3438,
296
+ "grad_norm": 2.1978235244750977,
297
+ "learning_rate": 0.00028348077132172027
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.9577464788732394,
302
+ "cpu_mem": 1.50761472,
303
+ "gpu_mem": 4.434811392,
304
+ "loss": 1.3851,
305
+ "grad_norm": 3.6687681674957275,
306
+ "learning_rate": 0.0002817332360055343
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.9859154929577465,
311
+ "cpu_mem": 1.50761472,
312
+ "gpu_mem": 4.43479296,
313
+ "loss": 1.3028,
314
+ "grad_norm": 2.3399291038513184,
315
+ "learning_rate": 0.0002799038105676658
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 1.0140845070422535,
320
+ "cpu_mem": 1.50761472,
321
+ "gpu_mem": 4.439514624,
322
+ "loss": 1.9693,
323
+ "grad_norm": 6.734819412231445,
324
+ "learning_rate": 0.0002779936322448233
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 1.0422535211267605,
329
+ "cpu_mem": 1.50761472,
330
+ "gpu_mem": 4.439519232,
331
+ "loss": 1.3661,
332
+ "grad_norm": 3.9805550575256348,
333
+ "learning_rate": 0.0002760038884726157
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 1.0704225352112675,
338
+ "cpu_mem": 1.50761472,
339
+ "gpu_mem": 4.439497728,
340
+ "loss": 1.2265,
341
+ "grad_norm": 3.169705390930176,
342
+ "learning_rate": 0.00027393581614739923
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 1.0985915492957747,
347
+ "cpu_mem": 1.50761472,
348
+ "gpu_mem": 4.439486976,
349
+ "loss": 1.4149,
350
+ "grad_norm": 7.221917629241943,
351
+ "learning_rate": 0.0002717907008573785
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 1.1267605633802817,
356
+ "cpu_mem": 1.50761472,
357
+ "gpu_mem": 4.439549952,
358
+ "loss": 1.4342,
359
+ "grad_norm": 6.082188129425049,
360
+ "learning_rate": 0.0002695698760834384
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 1.1549295774647887,
365
+ "cpu_mem": 1.50761472,
366
+ "gpu_mem": 4.439510016,
367
+ "loss": 1.3285,
368
+ "grad_norm": 3.6209311485290527,
369
+ "learning_rate": 0.00026727472237020447
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 1.1830985915492958,
374
+ "cpu_mem": 1.50761472,
375
+ "gpu_mem": 4.439553024,
376
+ "loss": 1.3293,
377
+ "grad_norm": 2.048436164855957,
378
+ "learning_rate": 0.00026490666646784665
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 1.2112676056338028,
383
+ "cpu_mem": 1.50761472,
384
+ "gpu_mem": 4.439502336,
385
+ "loss": 1.3793,
386
+ "grad_norm": 2.7470545768737793,
387
+ "learning_rate": 0.0002624671804451601
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 1.2394366197183098,
392
+ "cpu_mem": 1.50761472,
393
+ "gpu_mem": 4.439566848,
394
+ "loss": 1.3528,
395
+ "grad_norm": 2.686230421066284,
396
+ "learning_rate": 0.0002599577807744739
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 1.267605633802817,
401
+ "cpu_mem": 1.50761472,
402
+ "gpu_mem": 4.439534592,
403
+ "loss": 1.3625,
404
+ "grad_norm": 2.582090139389038,
405
+ "learning_rate": 0.0002573800273889577
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 1.295774647887324,
410
+ "cpu_mem": 1.50761472,
411
+ "gpu_mem": 4.4395392,
412
+ "loss": 1.3958,
413
+ "grad_norm": 3.0206172466278076,
414
+ "learning_rate": 0.0002547355227129109
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 1.323943661971831,
419
+ "cpu_mem": 1.50761472,
420
+ "gpu_mem": 4.43948544,
421
+ "loss": 1.3788,
422
+ "grad_norm": 4.6008501052856445,
423
+ "learning_rate": 0.00025202591066563786
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 1.352112676056338,
428
+ "cpu_mem": 1.50761472,
429
+ "gpu_mem": 4.439499264,
430
+ "loss": 1.3214,
431
+ "grad_norm": 2.9762120246887207,
432
+ "learning_rate": 0.0002492528756395289
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 1.380281690140845,
437
+ "cpu_mem": 1.50761472,
438
+ "gpu_mem": 4.439488512,
439
+ "loss": 1.306,
440
+ "grad_norm": 2.677886724472046,
441
+ "learning_rate": 0.0002464181414529809
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 1.408450704225352,
446
+ "cpu_mem": 1.50761472,
447
+ "gpu_mem": 4.439502336,
448
+ "loss": 1.3691,
449
+ "grad_norm": 4.243171691894531,
450
+ "learning_rate": 0.00024352347027881003
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 1.436619718309859,
455
+ "cpu_mem": 1.50761472,
456
+ "gpu_mem": 4.43955456,
457
+ "loss": 1.4131,
458
+ "grad_norm": 5.571732521057129,
459
+ "learning_rate": 0.0002405706615488216
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 1.4647887323943662,
464
+ "cpu_mem": 1.50761472,
465
+ "gpu_mem": 4.439502336,
466
+ "loss": 1.375,
467
+ "grad_norm": 4.6382060050964355,
468
+ "learning_rate": 0.00023756155083521846
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 1.4929577464788732,
473
+ "cpu_mem": 1.50761472,
474
+ "gpu_mem": 4.439571456,
475
+ "loss": 1.3311,
476
+ "grad_norm": 3.3071107864379883,
477
+ "learning_rate": 0.00023449800870954326
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 1.5211267605633803,
482
+ "cpu_mem": 1.50761472,
483
+ "gpu_mem": 4.4395392,
484
+ "loss": 1.3042,
485
+ "grad_norm": 3.131740093231201,
486
+ "learning_rate": 0.0002313819395798639
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 1.5492957746478875,
491
+ "cpu_mem": 1.50761472,
492
+ "gpu_mem": 4.439548416,
493
+ "loss": 1.3975,
494
+ "grad_norm": 4.488919258117676,
495
+ "learning_rate": 0.0002282152805069247
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 1.5774647887323945,
500
+ "cpu_mem": 1.50761472,
501
+ "gpu_mem": 4.43952384,
502
+ "loss": 1.3192,
503
+ "grad_norm": 3.3439676761627197,
504
+ "learning_rate": 0.000225
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 1.6056338028169015,
509
+ "cpu_mem": 1.50761472,
510
+ "gpu_mem": 4.439557632,
511
+ "loss": 1.299,
512
+ "grad_norm": 1.8695595264434814,
513
+ "learning_rate": 0.00022173809679319772
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 1.6338028169014085,
518
+ "cpu_mem": 1.50761472,
519
+ "gpu_mem": 4.4395392,
520
+ "loss": 1.3204,
521
+ "grad_norm": 2.3153836727142334,
522
+ "learning_rate": 0.00021843159860297442
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 1.6619718309859155,
527
+ "cpu_mem": 1.50761472,
528
+ "gpu_mem": 4.439525376,
529
+ "loss": 1.3336,
530
+ "grad_norm": 2.244190216064453,
531
+ "learning_rate": 0.00021508256086763368
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 1.6901408450704225,
536
+ "cpu_mem": 1.50761472,
537
+ "gpu_mem": 4.439563776,
538
+ "loss": 1.3321,
539
+ "grad_norm": 3.2910544872283936,
540
+ "learning_rate": 0.00021169306546959174
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 1.7183098591549295,
545
+ "cpu_mem": 1.50761472,
546
+ "gpu_mem": 4.439496192,
547
+ "loss": 1.3344,
548
+ "grad_norm": 3.4403202533721924,
549
+ "learning_rate": 0.0002082652194412042
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 1.7464788732394365,
554
+ "cpu_mem": 1.50761472,
555
+ "gpu_mem": 4.439543808,
556
+ "loss": 1.3526,
557
+ "grad_norm": 1.968464732170105,
558
+ "learning_rate": 0.00020480115365495926
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 1.7746478873239435,
563
+ "cpu_mem": 1.50761472,
564
+ "gpu_mem": 4.43949312,
565
+ "loss": 1.3002,
566
+ "grad_norm": 2.35099196434021,
567
+ "learning_rate": 0.00020130302149885031
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 1.8028169014084507,
572
+ "cpu_mem": 1.50761472,
573
+ "gpu_mem": 4.439542272,
574
+ "loss": 1.3641,
575
+ "grad_norm": 3.8307111263275146,
576
+ "learning_rate": 0.00019777299753775265
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 1.8309859154929577,
581
+ "cpu_mem": 1.50761472,
582
+ "gpu_mem": 4.439540736,
583
+ "loss": 1.337,
584
+ "grad_norm": 2.769908905029297,
585
+ "learning_rate": 0.00019421327616163563
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 1.8591549295774648,
590
+ "cpu_mem": 1.50761472,
591
+ "gpu_mem": 4.439559168,
592
+ "loss": 1.3166,
593
+ "grad_norm": 3.008100748062134,
594
+ "learning_rate": 0.00019062607022145078
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 1.887323943661972,
599
+ "cpu_mem": 1.50761472,
600
+ "gpu_mem": 4.4395008,
601
+ "loss": 1.3218,
602
+ "grad_norm": 2.7646050453186035,
603
+ "learning_rate": 0.00018701360965354402
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 1.915492957746479,
608
+ "cpu_mem": 1.50761472,
609
+ "gpu_mem": 4.439513088,
610
+ "loss": 1.343,
611
+ "grad_norm": 2.6678524017333984,
612
+ "learning_rate": 0.00018337814009344714
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 1.943661971830986,
617
+ "cpu_mem": 1.50761472,
618
+ "gpu_mem": 4.439537664,
619
+ "loss": 1.3152,
620
+ "grad_norm": 3.518205404281616,
621
+ "learning_rate": 0.0001797219214799096
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 1.971830985915493,
626
+ "cpu_mem": 1.50761472,
627
+ "gpu_mem": 4.439514624,
628
+ "loss": 1.2606,
629
+ "grad_norm": 1.7116825580596924,
630
+ "learning_rate": 0.00017604722665003956
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 2.0,
635
+ "cpu_mem": 1.50761472,
636
+ "gpu_mem": 4.439350272,
637
+ "loss": 2.0035,
638
+ "grad_norm": 7.806818962097168,
639
+ "learning_rate": 0.00017235633992642615
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 2.028169014084507,
644
+ "cpu_mem": 1.50761472,
645
+ "gpu_mem": 4.434803712,
646
+ "loss": 1.3873,
647
+ "grad_norm": 4.118950843811035,
648
+ "learning_rate": 0.00016865155569712278
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 2.056338028169014,
653
+ "cpu_mem": 1.50761472,
654
+ "gpu_mem": 4.434766848,
655
+ "loss": 1.2968,
656
+ "grad_norm": 2.314663887023926,
657
+ "learning_rate": 0.0001649351769893725
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 2.084507042253521,
662
+ "cpu_mem": 1.50761472,
663
+ "gpu_mem": 4.434826752,
664
+ "loss": 1.2931,
665
+ "grad_norm": 2.5138306617736816,
666
+ "learning_rate": 0.00016120951403796364
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 2.112676056338028,
671
+ "cpu_mem": 1.50761472,
672
+ "gpu_mem": 4.434794496,
673
+ "loss": 1.2959,
674
+ "grad_norm": 2.421915054321289,
675
+ "learning_rate": 0.00015747688284910457
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 2.140845070422535,
680
+ "cpu_mem": 1.50761472,
681
+ "gpu_mem": 4.434805248,
682
+ "loss": 1.2741,
683
+ "grad_norm": 1.9817582368850708,
684
+ "learning_rate": 0.00015373960376071093
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 2.169014084507042,
689
+ "cpu_mem": 1.50761472,
690
+ "gpu_mem": 4.434842112,
691
+ "loss": 1.3254,
692
+ "grad_norm": 2.0456702709198,
693
+ "learning_rate": 0.00015
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 2.1971830985915495,
698
+ "cpu_mem": 1.50761472,
699
+ "gpu_mem": 4.434826752,
700
+ "loss": 1.3843,
701
+ "grad_norm": 2.987546443939209,
702
+ "learning_rate": 0.00014626039623928907
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 2.2253521126760565,
707
+ "cpu_mem": 1.50761472,
708
+ "gpu_mem": 4.4347776,
709
+ "loss": 1.2465,
710
+ "grad_norm": 2.78094744682312,
711
+ "learning_rate": 0.0001425231171508954
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 2.2535211267605635,
716
+ "cpu_mem": 1.50761472,
717
+ "gpu_mem": 4.434822144,
718
+ "loss": 1.3265,
719
+ "grad_norm": 2.5948455333709717,
720
+ "learning_rate": 0.00013879048596203636
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 2.2816901408450705,
725
+ "cpu_mem": 1.50761472,
726
+ "gpu_mem": 4.43480832,
727
+ "loss": 1.3044,
728
+ "grad_norm": 3.1513984203338623,
729
+ "learning_rate": 0.0001350648230106275
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 2.3098591549295775,
734
+ "cpu_mem": 1.50761472,
735
+ "gpu_mem": 4.434776064,
736
+ "loss": 1.2917,
737
+ "grad_norm": 3.952747344970703,
738
+ "learning_rate": 0.00013134844430287725
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 2.3380281690140845,
743
+ "cpu_mem": 1.50761472,
744
+ "gpu_mem": 4.434826752,
745
+ "loss": 1.295,
746
+ "grad_norm": 4.772861480712891,
747
+ "learning_rate": 0.0001276436600735738
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 2.3661971830985915,
752
+ "cpu_mem": 1.50761472,
753
+ "gpu_mem": 4.434765312,
754
+ "loss": 1.35,
755
+ "grad_norm": 5.37269401550293,
756
+ "learning_rate": 0.00012395277334996044
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 2.3943661971830985,
761
+ "cpu_mem": 1.50761472,
762
+ "gpu_mem": 4.434811392,
763
+ "loss": 1.3514,
764
+ "grad_norm": 4.7276458740234375,
765
+ "learning_rate": 0.00012027807852009038
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 2.4225352112676055,
770
+ "cpu_mem": 1.50761472,
771
+ "gpu_mem": 4.434765312,
772
+ "loss": 1.3044,
773
+ "grad_norm": 4.575231075286865,
774
+ "learning_rate": 0.00011662185990655284
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 2.4507042253521125,
779
+ "cpu_mem": 1.50761472,
780
+ "gpu_mem": 4.434796032,
781
+ "loss": 1.3396,
782
+ "grad_norm": 5.390990257263184,
783
+ "learning_rate": 0.00011298639034645593
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 2.4788732394366195,
788
+ "cpu_mem": 1.50761472,
789
+ "gpu_mem": 4.434771456,
790
+ "loss": 1.273,
791
+ "grad_norm": 7.428049564361572,
792
+ "learning_rate": 0.00010937392977854923
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 2.507042253521127,
797
+ "cpu_mem": 1.50761472,
798
+ "gpu_mem": 4.434825216,
799
+ "loss": 1.2715,
800
+ "grad_norm": 5.177944660186768,
801
+ "learning_rate": 0.00010578672383836435
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 2.535211267605634,
806
+ "cpu_mem": 1.50761472,
807
+ "gpu_mem": 4.434806784,
808
+ "loss": 1.2945,
809
+ "grad_norm": 4.6876220703125,
810
+ "learning_rate": 0.00010222700246224735
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 2.563380281690141,
815
+ "cpu_mem": 1.50761472,
816
+ "gpu_mem": 4.434756096,
817
+ "loss": 1.362,
818
+ "grad_norm": 6.015515327453613,
819
+ "learning_rate": 9.869697850114969e-05
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 2.591549295774648,
824
+ "cpu_mem": 1.50761472,
825
+ "gpu_mem": 4.434780672,
826
+ "loss": 1.2821,
827
+ "grad_norm": 4.289063453674316,
828
+ "learning_rate": 9.519884634504074e-05
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 2.619718309859155,
833
+ "cpu_mem": 1.50761472,
834
+ "gpu_mem": 4.434783744,
835
+ "loss": 1.2382,
836
+ "grad_norm": 4.802374362945557,
837
+ "learning_rate": 9.17347805587958e-05
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 2.647887323943662,
842
+ "cpu_mem": 1.50761472,
843
+ "gpu_mem": 4.434776064,
844
+ "loss": 1.224,
845
+ "grad_norm": 4.2559494972229,
846
+ "learning_rate": 8.830693453040829e-05
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 2.676056338028169,
851
+ "cpu_mem": 1.50761472,
852
+ "gpu_mem": 4.434814464,
853
+ "loss": 1.2576,
854
+ "grad_norm": 4.885746955871582,
855
+ "learning_rate": 8.491743913236628e-05
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 2.704225352112676,
860
+ "cpu_mem": 1.50761472,
861
+ "gpu_mem": 4.43482368,
862
+ "loss": 1.1952,
863
+ "grad_norm": 5.2525129318237305,
864
+ "learning_rate": 8.156840139702554e-05
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 2.732394366197183,
869
+ "cpu_mem": 1.50761472,
870
+ "gpu_mem": 4.434766848,
871
+ "loss": 1.313,
872
+ "grad_norm": 6.298973083496094,
873
+ "learning_rate": 7.82619032068023e-05
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 2.76056338028169,
878
+ "cpu_mem": 1.50761472,
879
+ "gpu_mem": 4.434766848,
880
+ "loss": 1.2644,
881
+ "grad_norm": 5.16410493850708,
882
+ "learning_rate": 7.500000000000002e-05
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 2.788732394366197,
887
+ "cpu_mem": 1.50761472,
888
+ "gpu_mem": 4.434763776,
889
+ "loss": 1.2443,
890
+ "grad_norm": 5.660140514373779,
891
+ "learning_rate": 7.17847194930753e-05
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 2.816901408450704,
896
+ "cpu_mem": 1.50761472,
897
+ "gpu_mem": 4.43476224,
898
+ "loss": 1.1987,
899
+ "grad_norm": 6.567827224731445,
900
+ "learning_rate": 6.86180604201361e-05
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 2.845070422535211,
905
+ "cpu_mem": 1.50761472,
906
+ "gpu_mem": 4.434805248,
907
+ "loss": 1.1556,
908
+ "grad_norm": 4.935121059417725,
909
+ "learning_rate": 6.550199129045668e-05
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 2.873239436619718,
914
+ "cpu_mem": 1.50761472,
915
+ "gpu_mem": 4.434743808,
916
+ "loss": 1.2275,
917
+ "grad_norm": 6.281479358673096,
918
+ "learning_rate": 6.243844916478155e-05
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 2.9014084507042255,
923
+ "cpu_mem": 1.50761472,
924
+ "gpu_mem": 4.43479296,
925
+ "loss": 1.2214,
926
+ "grad_norm": 5.953158855438232,
927
+ "learning_rate": 5.9429338451178355e-05
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 2.9295774647887325,
932
+ "cpu_mem": 1.50761472,
933
+ "gpu_mem": 4.434855936,
934
+ "loss": 1.2748,
935
+ "grad_norm": 9.715459823608398,
936
+ "learning_rate": 5.6476529721189974e-05
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 2.9577464788732395,
941
+ "cpu_mem": 1.50761472,
942
+ "gpu_mem": 4.43480832,
943
+ "loss": 1.2011,
944
+ "grad_norm": 5.6507368087768555,
945
+ "learning_rate": 5.358185854701909e-05
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 2.9859154929577465,
950
+ "cpu_mem": 1.50761472,
951
+ "gpu_mem": 4.434789888,
952
+ "loss": 1.2578,
953
+ "grad_norm": 6.601775646209717,
954
+ "learning_rate": 5.074712436047112e-05
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 3.0140845070422535,
959
+ "cpu_mem": 1.50761472,
960
+ "gpu_mem": 4.439537664,
961
+ "loss": 1.7458,
962
+ "grad_norm": 18.245891571044922,
963
+ "learning_rate": 4.7974089334362057e-05
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 3.0422535211267605,
968
+ "cpu_mem": 1.50761472,
969
+ "gpu_mem": 4.439519232,
970
+ "loss": 1.2056,
971
+ "grad_norm": 6.946835994720459,
972
+ "learning_rate": 4.526447728708908e-05
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 3.0704225352112675,
977
+ "cpu_mem": 1.50761472,
978
+ "gpu_mem": 4.43950848,
979
+ "loss": 1.1129,
980
+ "grad_norm": 6.792351245880127,
981
+ "learning_rate": 4.261997261104223e-05
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 3.0985915492957745,
986
+ "cpu_mem": 1.50761472,
987
+ "gpu_mem": 4.43956224,
988
+ "loss": 1.1966,
989
+ "grad_norm": 10.979827880859375,
990
+ "learning_rate": 4.004221922552608e-05
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 3.1267605633802815,
995
+ "cpu_mem": 1.50761472,
996
+ "gpu_mem": 4.439522304,
997
+ "loss": 1.1583,
998
+ "grad_norm": 8.557021141052246,
999
+ "learning_rate": 3.753281955483985e-05
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 3.1549295774647885,
1004
+ "cpu_mem": 1.50761472,
1005
+ "gpu_mem": 4.439540736,
1006
+ "loss": 1.1496,
1007
+ "grad_norm": 7.465101718902588,
1008
+ "learning_rate": 3.509333353215331e-05
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 3.183098591549296,
1013
+ "cpu_mem": 1.50761472,
1014
+ "gpu_mem": 4.439603712,
1015
+ "loss": 1.1222,
1016
+ "grad_norm": 7.832935333251953,
1017
+ "learning_rate": 3.2725277629795526e-05
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 3.211267605633803,
1022
+ "cpu_mem": 1.50761472,
1023
+ "gpu_mem": 4.43953152,
1024
+ "loss": 1.1831,
1025
+ "grad_norm": 14.592466354370117,
1026
+ "learning_rate": 3.0430123916561672e-05
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 3.23943661971831,
1031
+ "cpu_mem": 1.50761472,
1032
+ "gpu_mem": 4.439525376,
1033
+ "loss": 1.2385,
1034
+ "grad_norm": 8.725632667541504,
1035
+ "learning_rate": 2.8209299142621522e-05
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 3.267605633802817,
1040
+ "cpu_mem": 1.50761472,
1041
+ "gpu_mem": 4.439540736,
1042
+ "loss": 1.0783,
1043
+ "grad_norm": 9.574886322021484,
1044
+ "learning_rate": 2.6064183852600797e-05
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 3.295774647887324,
1049
+ "cpu_mem": 1.50761472,
1050
+ "gpu_mem": 4.439556096,
1051
+ "loss": 1.126,
1052
+ "grad_norm": 9.345331192016602,
1053
+ "learning_rate": 2.3996111527384288e-05
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 3.323943661971831,
1058
+ "cpu_mem": 1.50761472,
1059
+ "gpu_mem": 4.43954688,
1060
+ "loss": 1.1187,
1061
+ "grad_norm": 10.025165557861328,
1062
+ "learning_rate": 2.2006367755176655e-05
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 3.352112676056338,
1067
+ "cpu_mem": 1.50761472,
1068
+ "gpu_mem": 4.439537664,
1069
+ "loss": 1.108,
1070
+ "grad_norm": 8.818310737609863,
1071
+ "learning_rate": 2.009618943233419e-05
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 3.380281690140845,
1076
+ "cpu_mem": 1.50761472,
1077
+ "gpu_mem": 4.439556096,
1078
+ "loss": 1.1482,
1079
+ "grad_norm": 9.437005043029785,
1080
+ "learning_rate": 1.82667639944657e-05
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 3.408450704225352,
1085
+ "cpu_mem": 1.50761472,
1086
+ "gpu_mem": 4.43955456,
1087
+ "loss": 1.0596,
1088
+ "grad_norm": 9.051286697387695,
1089
+ "learning_rate": 1.6519228678279718e-05
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 3.436619718309859,
1094
+ "cpu_mem": 1.50761472,
1095
+ "gpu_mem": 4.439511552,
1096
+ "loss": 0.9866,
1097
+ "grad_norm": 8.84517765045166,
1098
+ "learning_rate": 1.4854669814637143e-05
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 3.464788732394366,
1103
+ "cpu_mem": 1.50761472,
1104
+ "gpu_mem": 4.439543808,
1105
+ "loss": 1.1725,
1106
+ "grad_norm": 13.230541229248047,
1107
+ "learning_rate": 1.3274122153249028e-05
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 3.492957746478873,
1112
+ "cpu_mem": 1.50761472,
1113
+ "gpu_mem": 4.439497728,
1114
+ "loss": 1.1315,
1115
+ "grad_norm": 9.880842208862305,
1116
+ "learning_rate": 1.1778568219438839e-05
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 3.52112676056338,
1121
+ "cpu_mem": 1.50761472,
1122
+ "gpu_mem": 4.439542272,
1123
+ "loss": 1.0279,
1124
+ "grad_norm": 12.509747505187988,
1125
+ "learning_rate": 1.036893770336938e-05
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 3.5492957746478875,
1130
+ "cpu_mem": 1.50761472,
1131
+ "gpu_mem": 4.43949312,
1132
+ "loss": 1.0526,
1133
+ "grad_norm": 14.200373649597168,
1134
+ "learning_rate": 9.046106882113751e-06
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 3.5774647887323945,
1139
+ "cpu_mem": 1.50761472,
1140
+ "gpu_mem": 4.439505408,
1141
+ "loss": 1.0684,
1142
+ "grad_norm": 12.265142440795898,
1143
+ "learning_rate": 7.810898074930243e-06
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 3.6056338028169015,
1148
+ "cpu_mem": 1.50761472,
1149
+ "gpu_mem": 4.439529984,
1150
+ "loss": 1.0435,
1151
+ "grad_norm": 9.855721473693848,
1152
+ "learning_rate": 6.664079132078881e-06
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 3.6338028169014085,
1157
+ "cpu_mem": 1.50761472,
1158
+ "gpu_mem": 4.439491584,
1159
+ "loss": 1.0585,
1160
+ "grad_norm": 9.79635238647461,
1161
+ "learning_rate": 5.606362957498195e-06
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 3.6619718309859155,
1166
+ "cpu_mem": 1.50761472,
1167
+ "gpu_mem": 4.439494656,
1168
+ "loss": 1.1007,
1169
+ "grad_norm": 12.417722702026367,
1170
+ "learning_rate": 4.638407065638322e-06
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 3.6901408450704225,
1175
+ "cpu_mem": 1.50761472,
1176
+ "gpu_mem": 4.439506944,
1177
+ "loss": 1.0559,
1178
+ "grad_norm": 9.638754844665527,
1179
+ "learning_rate": 3.760813172726457e-06
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 3.7183098591549295,
1184
+ "cpu_mem": 1.50761472,
1185
+ "gpu_mem": 4.439471616,
1186
+ "loss": 1.1001,
1187
+ "grad_norm": 12.243162155151367,
1188
+ "learning_rate": 2.9741268227184255e-06
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 3.7464788732394365,
1193
+ "cpu_mem": 1.50761472,
1194
+ "gpu_mem": 4.439513088,
1195
+ "loss": 1.0142,
1196
+ "grad_norm": 8.843420028686523,
1197
+ "learning_rate": 2.2788370481687965e-06
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 3.7746478873239435,
1202
+ "cpu_mem": 1.50761472,
1203
+ "gpu_mem": 4.439528448,
1204
+ "loss": 1.1127,
1205
+ "grad_norm": 17.717222213745117,
1206
+ "learning_rate": 1.6753760662307215e-06
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 3.802816901408451,
1211
+ "cpu_mem": 1.50761472,
1212
+ "gpu_mem": 4.43949312,
1213
+ "loss": 1.1962,
1214
+ "grad_norm": 12.415128707885742,
1215
+ "learning_rate": 1.1641190099741904e-06
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 3.830985915492958,
1220
+ "cpu_mem": 1.50761472,
1221
+ "gpu_mem": 4.4395008,
1222
+ "loss": 1.1189,
1223
+ "grad_norm": 13.41920280456543,
1224
+ "learning_rate": 7.453836951897885e-07
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 3.859154929577465,
1229
+ "cpu_mem": 1.50761472,
1230
+ "gpu_mem": 4.439522304,
1231
+ "loss": 1.0789,
1232
+ "grad_norm": 12.066742897033691,
1233
+ "learning_rate": 4.194304228229806e-07
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 3.887323943661972,
1238
+ "cpu_mem": 1.50761472,
1239
+ "gpu_mem": 4.439533056,
1240
+ "loss": 1.1993,
1241
+ "grad_norm": 11.856720924377441,
1242
+ "learning_rate": 1.8646181716164831e-07
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 3.915492957746479,
1247
+ "cpu_mem": 1.50761472,
1248
+ "gpu_mem": 4.439525376,
1249
+ "loss": 1.1628,
1250
+ "grad_norm": 14.922451972961426,
1251
+ "learning_rate": 4.662269987756317e-08
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 3.943661971830986,
1256
+ "cpu_mem": 1.50761472,
1257
+ "gpu_mem": 4.439559168,
1258
+ "loss": 1.0973,
1259
+ "grad_norm": 9.704615592956543,
1260
+ "learning_rate": 0.0
1261
+ },
1262
+ {
1263
+ "step": 140,
1264
+ "epoch": 3.943661971830986,
1265
+ "cpu_mem": 1.50761472,
1266
+ "gpu_mem": 4.439559168,
1267
+ "train_runtime": 683.2424,
1268
+ "train_samples_per_second": 13.178,
1269
+ "train_steps_per_second": 0.205,
1270
+ "total_flos": 0.0,
1271
+ "train_loss": 1.3638307809829713
1272
+ }
1273
+ ]
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 32,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_e",
3
+ "results": 0.32196969696969696
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_E",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 13009920
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-arc_e-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-09-01T02:59:40.533088"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r32-a2/training_logs.json ADDED
@@ -0,0 +1,1273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.028169014084507043,
5
+ "cpu_mem": 1.506336768,
6
+ "gpu_mem": 4.519910912,
7
+ "loss": 4.6319,
8
+ "grad_norm": 15.45500373840332,
9
+ "learning_rate": 2.1428571428571425e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.056338028169014086,
14
+ "cpu_mem": 1.506533376,
15
+ "gpu_mem": 4.6240512,
16
+ "loss": 4.4578,
17
+ "grad_norm": 15.729421615600586,
18
+ "learning_rate": 4.285714285714285e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.08450704225352113,
23
+ "cpu_mem": 1.506729984,
24
+ "gpu_mem": 4.624029696,
25
+ "loss": 4.2421,
26
+ "grad_norm": 15.735064506530762,
27
+ "learning_rate": 6.428571428571427e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.11267605633802817,
32
+ "cpu_mem": 1.506926592,
33
+ "gpu_mem": 4.624008192,
34
+ "loss": 3.7264,
35
+ "grad_norm": 12.702402114868164,
36
+ "learning_rate": 8.57142857142857e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.14084507042253522,
41
+ "cpu_mem": 1.506926592,
42
+ "gpu_mem": 4.624049664,
43
+ "loss": 2.8628,
44
+ "grad_norm": 16.878061294555664,
45
+ "learning_rate": 0.00010714285714285714
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.16901408450704225,
50
+ "cpu_mem": 1.5071232,
51
+ "gpu_mem": 4.624025088,
52
+ "loss": 2.1096,
53
+ "grad_norm": 6.79240083694458,
54
+ "learning_rate": 0.00012857142857142855
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.19718309859154928,
59
+ "cpu_mem": 1.507319808,
60
+ "gpu_mem": 4.624048128,
61
+ "loss": 1.9211,
62
+ "grad_norm": 2.7695512771606445,
63
+ "learning_rate": 0.00015
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.22535211267605634,
68
+ "cpu_mem": 1.507319808,
69
+ "gpu_mem": 4.624006656,
70
+ "loss": 1.6771,
71
+ "grad_norm": 1.5878878831863403,
72
+ "learning_rate": 0.0001714285714285714
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.2535211267605634,
77
+ "cpu_mem": 1.507319808,
78
+ "gpu_mem": 4.624008192,
79
+ "loss": 1.552,
80
+ "grad_norm": 1.507412075996399,
81
+ "learning_rate": 0.00019285714285714286
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.28169014084507044,
86
+ "cpu_mem": 1.507319808,
87
+ "gpu_mem": 4.624003584,
88
+ "loss": 1.5163,
89
+ "grad_norm": 0.9610570073127747,
90
+ "learning_rate": 0.00021428571428571427
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.30985915492957744,
95
+ "cpu_mem": 1.507319808,
96
+ "gpu_mem": 4.62408192,
97
+ "loss": 1.4057,
98
+ "grad_norm": 0.9633939862251282,
99
+ "learning_rate": 0.00023571428571428569
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.3380281690140845,
104
+ "cpu_mem": 1.507319808,
105
+ "gpu_mem": 4.624055808,
106
+ "loss": 1.3628,
107
+ "grad_norm": 0.8077499270439148,
108
+ "learning_rate": 0.0002571428571428571
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.36619718309859156,
113
+ "cpu_mem": 1.507319808,
114
+ "gpu_mem": 4.624006656,
115
+ "loss": 1.3515,
116
+ "grad_norm": 0.8354933261871338,
117
+ "learning_rate": 0.00027857142857142854
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.39436619718309857,
122
+ "cpu_mem": 1.507319808,
123
+ "gpu_mem": 4.62402816,
124
+ "loss": 1.3241,
125
+ "grad_norm": 0.6401271820068359,
126
+ "learning_rate": 0.0003
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.4225352112676056,
131
+ "cpu_mem": 1.507319808,
132
+ "gpu_mem": 4.62400512,
133
+ "loss": 1.3536,
134
+ "grad_norm": 0.915156364440918,
135
+ "learning_rate": 0.0002999533773001224
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.4507042253521127,
140
+ "cpu_mem": 1.507319808,
141
+ "gpu_mem": 4.624009728,
142
+ "loss": 1.386,
143
+ "grad_norm": 1.0561822652816772,
144
+ "learning_rate": 0.0002998135381828383
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.4788732394366197,
149
+ "cpu_mem": 1.507319808,
150
+ "gpu_mem": 4.624046592,
151
+ "loss": 1.3383,
152
+ "grad_norm": 0.8924583196640015,
153
+ "learning_rate": 0.00029958056957717696
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.5070422535211268,
158
+ "cpu_mem": 1.507319808,
159
+ "gpu_mem": 4.624057344,
160
+ "loss": 1.3418,
161
+ "grad_norm": 1.1358929872512817,
162
+ "learning_rate": 0.0002992546163048102
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.5352112676056338,
167
+ "cpu_mem": 1.507319808,
168
+ "gpu_mem": 4.624000512,
169
+ "loss": 1.3633,
170
+ "grad_norm": 1.5113840103149414,
171
+ "learning_rate": 0.0002988358809900258
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.5633802816901409,
176
+ "cpu_mem": 1.507319808,
177
+ "gpu_mem": 4.624071168,
178
+ "loss": 1.3698,
179
+ "grad_norm": 0.9107031226158142,
180
+ "learning_rate": 0.0002983246239337692
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.5915492957746479,
185
+ "cpu_mem": 1.507319808,
186
+ "gpu_mem": 4.624069632,
187
+ "loss": 1.3323,
188
+ "grad_norm": 0.9094489216804504,
189
+ "learning_rate": 0.0002977211629518312
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.6197183098591549,
194
+ "cpu_mem": 1.507319808,
195
+ "gpu_mem": 4.624026624,
196
+ "loss": 1.3659,
197
+ "grad_norm": 1.0449373722076416,
198
+ "learning_rate": 0.00029702587317728153
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.647887323943662,
203
+ "cpu_mem": 1.507319808,
204
+ "gpu_mem": 4.62404352,
205
+ "loss": 1.3191,
206
+ "grad_norm": 0.6375622153282166,
207
+ "learning_rate": 0.0002962391868272735
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.676056338028169,
212
+ "cpu_mem": 1.507319808,
213
+ "gpu_mem": 4.624000512,
214
+ "loss": 1.3682,
215
+ "grad_norm": 0.6815070509910583,
216
+ "learning_rate": 0.00029536159293436166
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.704225352112676,
221
+ "cpu_mem": 1.507319808,
222
+ "gpu_mem": 4.624029696,
223
+ "loss": 1.3646,
224
+ "grad_norm": 0.43889474868774414,
225
+ "learning_rate": 0.00029439363704250176
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.7323943661971831,
230
+ "cpu_mem": 1.507319808,
231
+ "gpu_mem": 4.624009728,
232
+ "loss": 1.4326,
233
+ "grad_norm": 0.6083115935325623,
234
+ "learning_rate": 0.00029333592086792107
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.7605633802816901,
239
+ "cpu_mem": 1.507319808,
240
+ "gpu_mem": 4.62403584,
241
+ "loss": 1.3713,
242
+ "grad_norm": 0.5347280502319336,
243
+ "learning_rate": 0.0002921891019250697
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.7887323943661971,
248
+ "cpu_mem": 1.507319808,
249
+ "gpu_mem": 4.62403584,
250
+ "loss": 1.342,
251
+ "grad_norm": 0.31040897965431213,
252
+ "learning_rate": 0.0002909538931178862
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.8169014084507042,
257
+ "cpu_mem": 1.507319808,
258
+ "gpu_mem": 4.624014336,
259
+ "loss": 1.2572,
260
+ "grad_norm": 0.41217169165611267,
261
+ "learning_rate": 0.00028963106229663063
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.8450704225352113,
266
+ "cpu_mem": 1.507319808,
267
+ "gpu_mem": 4.62400512,
268
+ "loss": 1.3938,
269
+ "grad_norm": 0.6330100893974304,
270
+ "learning_rate": 0.00028822143178056114
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.8732394366197183,
275
+ "cpu_mem": 1.507319808,
276
+ "gpu_mem": 4.624023552,
277
+ "loss": 1.3735,
278
+ "grad_norm": 0.5508550405502319,
279
+ "learning_rate": 0.00028672587784675096
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.9014084507042254,
284
+ "cpu_mem": 1.507319808,
285
+ "gpu_mem": 4.624046592,
286
+ "loss": 1.3095,
287
+ "grad_norm": 0.24852244555950165,
288
+ "learning_rate": 0.0002851453301853628
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.9295774647887324,
293
+ "cpu_mem": 1.507319808,
294
+ "gpu_mem": 4.62404352,
295
+ "loss": 1.3292,
296
+ "grad_norm": 0.25524747371673584,
297
+ "learning_rate": 0.00028348077132172027
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.9577464788732394,
302
+ "cpu_mem": 1.507319808,
303
+ "gpu_mem": 4.624046592,
304
+ "loss": 1.4214,
305
+ "grad_norm": 0.6475468873977661,
306
+ "learning_rate": 0.0002817332360055343
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.9859154929577465,
311
+ "cpu_mem": 1.507319808,
312
+ "gpu_mem": 4.62402816,
313
+ "loss": 1.3162,
314
+ "grad_norm": 0.3794228434562683,
315
+ "learning_rate": 0.0002799038105676658
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 1.0140845070422535,
320
+ "cpu_mem": 1.507319808,
321
+ "gpu_mem": 4.676058624,
322
+ "loss": 2.0002,
323
+ "grad_norm": 0.7675129175186157,
324
+ "learning_rate": 0.0002779936322448233
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 1.0422535211267605,
329
+ "cpu_mem": 1.507319808,
330
+ "gpu_mem": 4.676063232,
331
+ "loss": 1.3351,
332
+ "grad_norm": 0.1956872045993805,
333
+ "learning_rate": 0.0002760038884726157
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 1.0704225352112675,
338
+ "cpu_mem": 1.507319808,
339
+ "gpu_mem": 4.676041728,
340
+ "loss": 1.245,
341
+ "grad_norm": 0.3838285803794861,
342
+ "learning_rate": 0.00027393581614739923
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 1.0985915492957747,
347
+ "cpu_mem": 1.507319808,
348
+ "gpu_mem": 4.676030976,
349
+ "loss": 1.3188,
350
+ "grad_norm": 0.49647337198257446,
351
+ "learning_rate": 0.0002717907008573785
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 1.1267605633802817,
356
+ "cpu_mem": 1.507319808,
357
+ "gpu_mem": 4.676093952,
358
+ "loss": 1.3697,
359
+ "grad_norm": 0.4313946068286896,
360
+ "learning_rate": 0.0002695698760834384
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 1.1549295774647887,
365
+ "cpu_mem": 1.507319808,
366
+ "gpu_mem": 4.676054016,
367
+ "loss": 1.3231,
368
+ "grad_norm": 0.2102261781692505,
369
+ "learning_rate": 0.00026727472237020447
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 1.1830985915492958,
374
+ "cpu_mem": 1.507319808,
375
+ "gpu_mem": 4.676097024,
376
+ "loss": 1.3535,
377
+ "grad_norm": 0.3038178086280823,
378
+ "learning_rate": 0.00026490666646784665
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 1.2112676056338028,
383
+ "cpu_mem": 1.507319808,
384
+ "gpu_mem": 4.676046336,
385
+ "loss": 1.3822,
386
+ "grad_norm": 0.22646814584732056,
387
+ "learning_rate": 0.0002624671804451601
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 1.2394366197183098,
392
+ "cpu_mem": 1.507319808,
393
+ "gpu_mem": 4.676110848,
394
+ "loss": 1.3555,
395
+ "grad_norm": 0.2905891537666321,
396
+ "learning_rate": 0.0002599577807744739
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 1.267605633802817,
401
+ "cpu_mem": 1.507319808,
402
+ "gpu_mem": 4.676078592,
403
+ "loss": 1.3617,
404
+ "grad_norm": 0.22614973783493042,
405
+ "learning_rate": 0.0002573800273889577
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 1.295774647887324,
410
+ "cpu_mem": 1.507319808,
411
+ "gpu_mem": 4.6760832,
412
+ "loss": 1.3523,
413
+ "grad_norm": 0.19625195860862732,
414
+ "learning_rate": 0.0002547355227129109
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 1.323943661971831,
419
+ "cpu_mem": 1.507319808,
420
+ "gpu_mem": 4.67602944,
421
+ "loss": 1.3234,
422
+ "grad_norm": 0.5174495577812195,
423
+ "learning_rate": 0.00025202591066563786
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 1.352112676056338,
428
+ "cpu_mem": 1.507319808,
429
+ "gpu_mem": 4.676043264,
430
+ "loss": 1.3128,
431
+ "grad_norm": 0.23632147908210754,
432
+ "learning_rate": 0.0002492528756395289
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 1.380281690140845,
437
+ "cpu_mem": 1.507319808,
438
+ "gpu_mem": 4.676032512,
439
+ "loss": 1.3276,
440
+ "grad_norm": 0.3750535249710083,
441
+ "learning_rate": 0.0002464181414529809
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 1.408450704225352,
446
+ "cpu_mem": 1.507319808,
447
+ "gpu_mem": 4.676046336,
448
+ "loss": 1.3333,
449
+ "grad_norm": 0.326159805059433,
450
+ "learning_rate": 0.00024352347027881003
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 1.436619718309859,
455
+ "cpu_mem": 1.507319808,
456
+ "gpu_mem": 4.67609856,
457
+ "loss": 1.3605,
458
+ "grad_norm": 0.43731603026390076,
459
+ "learning_rate": 0.0002405706615488216
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 1.4647887323943662,
464
+ "cpu_mem": 1.507319808,
465
+ "gpu_mem": 4.676046336,
466
+ "loss": 1.3531,
467
+ "grad_norm": 0.3836096525192261,
468
+ "learning_rate": 0.00023756155083521846
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 1.4929577464788732,
473
+ "cpu_mem": 1.507319808,
474
+ "gpu_mem": 4.676115456,
475
+ "loss": 1.3079,
476
+ "grad_norm": 0.3928913474082947,
477
+ "learning_rate": 0.00023449800870954326
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 1.5211267605633803,
482
+ "cpu_mem": 1.507319808,
483
+ "gpu_mem": 4.6760832,
484
+ "loss": 1.3158,
485
+ "grad_norm": 0.4037013351917267,
486
+ "learning_rate": 0.0002313819395798639
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 1.5492957746478875,
491
+ "cpu_mem": 1.507319808,
492
+ "gpu_mem": 4.676092416,
493
+ "loss": 1.3535,
494
+ "grad_norm": 0.22461530566215515,
495
+ "learning_rate": 0.0002282152805069247
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 1.5774647887323945,
500
+ "cpu_mem": 1.507319808,
501
+ "gpu_mem": 4.67606784,
502
+ "loss": 1.3064,
503
+ "grad_norm": 0.1745154857635498,
504
+ "learning_rate": 0.000225
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 1.6056338028169015,
509
+ "cpu_mem": 1.507319808,
510
+ "gpu_mem": 4.676101632,
511
+ "loss": 1.311,
512
+ "grad_norm": 0.1602332592010498,
513
+ "learning_rate": 0.00022173809679319772
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 1.6338028169014085,
518
+ "cpu_mem": 1.507319808,
519
+ "gpu_mem": 4.6760832,
520
+ "loss": 1.3149,
521
+ "grad_norm": 0.2614760994911194,
522
+ "learning_rate": 0.00021843159860297442
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 1.6619718309859155,
527
+ "cpu_mem": 1.507319808,
528
+ "gpu_mem": 4.676069376,
529
+ "loss": 1.3111,
530
+ "grad_norm": 0.15861870348453522,
531
+ "learning_rate": 0.00021508256086763368
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 1.6901408450704225,
536
+ "cpu_mem": 1.507319808,
537
+ "gpu_mem": 4.676107776,
538
+ "loss": 1.3024,
539
+ "grad_norm": 0.4162784814834595,
540
+ "learning_rate": 0.00021169306546959174
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 1.7183098591549295,
545
+ "cpu_mem": 1.507319808,
546
+ "gpu_mem": 4.676040192,
547
+ "loss": 1.3036,
548
+ "grad_norm": 0.376035213470459,
549
+ "learning_rate": 0.0002082652194412042
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 1.7464788732394365,
554
+ "cpu_mem": 1.507319808,
555
+ "gpu_mem": 4.676087808,
556
+ "loss": 1.3625,
557
+ "grad_norm": 0.26167038083076477,
558
+ "learning_rate": 0.00020480115365495926
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 1.7746478873239435,
563
+ "cpu_mem": 1.507319808,
564
+ "gpu_mem": 4.67603712,
565
+ "loss": 1.3322,
566
+ "grad_norm": 0.36649367213249207,
567
+ "learning_rate": 0.00020130302149885031
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 1.8028169014084507,
572
+ "cpu_mem": 1.507319808,
573
+ "gpu_mem": 4.676086272,
574
+ "loss": 1.3437,
575
+ "grad_norm": 0.45341187715530396,
576
+ "learning_rate": 0.00019777299753775265
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 1.8309859154929577,
581
+ "cpu_mem": 1.507319808,
582
+ "gpu_mem": 4.676084736,
583
+ "loss": 1.3348,
584
+ "grad_norm": 0.34720247983932495,
585
+ "learning_rate": 0.00019421327616163563
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 1.8591549295774648,
590
+ "cpu_mem": 1.507319808,
591
+ "gpu_mem": 4.676103168,
592
+ "loss": 1.2949,
593
+ "grad_norm": 0.29440757632255554,
594
+ "learning_rate": 0.00019062607022145078
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 1.887323943661972,
599
+ "cpu_mem": 1.507319808,
600
+ "gpu_mem": 4.6760448,
601
+ "loss": 1.3086,
602
+ "grad_norm": 0.22650545835494995,
603
+ "learning_rate": 0.00018701360965354402
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 1.915492957746479,
608
+ "cpu_mem": 1.507319808,
609
+ "gpu_mem": 4.676057088,
610
+ "loss": 1.3501,
611
+ "grad_norm": 0.18094563484191895,
612
+ "learning_rate": 0.00018337814009344714
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 1.943661971830986,
617
+ "cpu_mem": 1.507319808,
618
+ "gpu_mem": 4.676081664,
619
+ "loss": 1.3019,
620
+ "grad_norm": 0.4657556116580963,
621
+ "learning_rate": 0.0001797219214799096
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 1.971830985915493,
626
+ "cpu_mem": 1.507319808,
627
+ "gpu_mem": 4.676058624,
628
+ "loss": 1.2815,
629
+ "grad_norm": 0.32910600304603577,
630
+ "learning_rate": 0.00017604722665003956
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 2.0,
635
+ "cpu_mem": 1.507319808,
636
+ "gpu_mem": 4.675894272,
637
+ "loss": 1.9533,
638
+ "grad_norm": 0.9542618989944458,
639
+ "learning_rate": 0.00017235633992642615
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 2.028169014084507,
644
+ "cpu_mem": 1.507319808,
645
+ "gpu_mem": 4.624038912,
646
+ "loss": 1.3393,
647
+ "grad_norm": 0.26049795746803284,
648
+ "learning_rate": 0.00016865155569712278
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 2.056338028169014,
653
+ "cpu_mem": 1.507319808,
654
+ "gpu_mem": 4.624002048,
655
+ "loss": 1.3217,
656
+ "grad_norm": 0.27294543385505676,
657
+ "learning_rate": 0.0001649351769893725
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 2.084507042253521,
662
+ "cpu_mem": 1.507319808,
663
+ "gpu_mem": 4.624061952,
664
+ "loss": 1.3088,
665
+ "grad_norm": 0.2529256045818329,
666
+ "learning_rate": 0.00016120951403796364
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 2.112676056338028,
671
+ "cpu_mem": 1.507319808,
672
+ "gpu_mem": 4.624029696,
673
+ "loss": 1.3425,
674
+ "grad_norm": 0.39007511734962463,
675
+ "learning_rate": 0.00015747688284910457
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 2.140845070422535,
680
+ "cpu_mem": 1.507319808,
681
+ "gpu_mem": 4.624040448,
682
+ "loss": 1.2894,
683
+ "grad_norm": 0.2318618893623352,
684
+ "learning_rate": 0.00015373960376071093
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 2.169014084507042,
689
+ "cpu_mem": 1.507319808,
690
+ "gpu_mem": 4.624077312,
691
+ "loss": 1.3408,
692
+ "grad_norm": 0.17781618237495422,
693
+ "learning_rate": 0.00015
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 2.1971830985915495,
698
+ "cpu_mem": 1.507319808,
699
+ "gpu_mem": 4.624061952,
700
+ "loss": 1.3582,
701
+ "grad_norm": 0.3542206287384033,
702
+ "learning_rate": 0.00014626039623928907
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 2.2253521126760565,
707
+ "cpu_mem": 1.507319808,
708
+ "gpu_mem": 4.6240128,
709
+ "loss": 1.2668,
710
+ "grad_norm": 0.3510591387748718,
711
+ "learning_rate": 0.0001425231171508954
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 2.2535211267605635,
716
+ "cpu_mem": 1.507319808,
717
+ "gpu_mem": 4.624057344,
718
+ "loss": 1.3527,
719
+ "grad_norm": 0.4764325022697449,
720
+ "learning_rate": 0.00013879048596203636
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 2.2816901408450705,
725
+ "cpu_mem": 1.507319808,
726
+ "gpu_mem": 4.62404352,
727
+ "loss": 1.3285,
728
+ "grad_norm": 0.29078051447868347,
729
+ "learning_rate": 0.0001350648230106275
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 2.3098591549295775,
734
+ "cpu_mem": 1.507319808,
735
+ "gpu_mem": 4.624011264,
736
+ "loss": 1.3261,
737
+ "grad_norm": 0.329904705286026,
738
+ "learning_rate": 0.00013134844430287725
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 2.3380281690140845,
743
+ "cpu_mem": 1.507319808,
744
+ "gpu_mem": 4.624061952,
745
+ "loss": 1.3449,
746
+ "grad_norm": 0.671593427658081,
747
+ "learning_rate": 0.0001276436600735738
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 2.3661971830985915,
752
+ "cpu_mem": 1.507319808,
753
+ "gpu_mem": 4.624000512,
754
+ "loss": 1.3563,
755
+ "grad_norm": 0.3557969629764557,
756
+ "learning_rate": 0.00012395277334996044
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 2.3943661971830985,
761
+ "cpu_mem": 1.507319808,
762
+ "gpu_mem": 4.624046592,
763
+ "loss": 1.3714,
764
+ "grad_norm": 0.5320531129837036,
765
+ "learning_rate": 0.00012027807852009038
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 2.4225352112676055,
770
+ "cpu_mem": 1.507319808,
771
+ "gpu_mem": 4.624000512,
772
+ "loss": 1.337,
773
+ "grad_norm": 0.38929489254951477,
774
+ "learning_rate": 0.00011662185990655284
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 2.4507042253521125,
779
+ "cpu_mem": 1.507319808,
780
+ "gpu_mem": 4.624031232,
781
+ "loss": 1.3616,
782
+ "grad_norm": 0.3265232741832733,
783
+ "learning_rate": 0.00011298639034645593
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 2.4788732394366195,
788
+ "cpu_mem": 1.507319808,
789
+ "gpu_mem": 4.624006656,
790
+ "loss": 1.3341,
791
+ "grad_norm": 0.5538458824157715,
792
+ "learning_rate": 0.00010937392977854923
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 2.507042253521127,
797
+ "cpu_mem": 1.507319808,
798
+ "gpu_mem": 4.624060416,
799
+ "loss": 1.2784,
800
+ "grad_norm": 0.30906733870506287,
801
+ "learning_rate": 0.00010578672383836435
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 2.535211267605634,
806
+ "cpu_mem": 1.507319808,
807
+ "gpu_mem": 4.624041984,
808
+ "loss": 1.3251,
809
+ "grad_norm": 0.30996736884117126,
810
+ "learning_rate": 0.00010222700246224735
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 2.563380281690141,
815
+ "cpu_mem": 1.507319808,
816
+ "gpu_mem": 4.623991296,
817
+ "loss": 1.4019,
818
+ "grad_norm": 0.8204323053359985,
819
+ "learning_rate": 9.869697850114969e-05
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 2.591549295774648,
824
+ "cpu_mem": 1.507319808,
825
+ "gpu_mem": 4.624015872,
826
+ "loss": 1.3102,
827
+ "grad_norm": 0.2713940143585205,
828
+ "learning_rate": 9.519884634504074e-05
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 2.619718309859155,
833
+ "cpu_mem": 1.507319808,
834
+ "gpu_mem": 4.624018944,
835
+ "loss": 1.2545,
836
+ "grad_norm": 0.43170469999313354,
837
+ "learning_rate": 9.17347805587958e-05
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 2.647887323943662,
842
+ "cpu_mem": 1.507319808,
843
+ "gpu_mem": 4.624011264,
844
+ "loss": 1.279,
845
+ "grad_norm": 0.260023832321167,
846
+ "learning_rate": 8.830693453040829e-05
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 2.676056338028169,
851
+ "cpu_mem": 1.507319808,
852
+ "gpu_mem": 4.624049664,
853
+ "loss": 1.3035,
854
+ "grad_norm": 0.2027701735496521,
855
+ "learning_rate": 8.491743913236628e-05
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 2.704225352112676,
860
+ "cpu_mem": 1.507319808,
861
+ "gpu_mem": 4.62405888,
862
+ "loss": 1.2398,
863
+ "grad_norm": 0.5005522966384888,
864
+ "learning_rate": 8.156840139702554e-05
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 2.732394366197183,
869
+ "cpu_mem": 1.507319808,
870
+ "gpu_mem": 4.624002048,
871
+ "loss": 1.3283,
872
+ "grad_norm": 0.4746371805667877,
873
+ "learning_rate": 7.82619032068023e-05
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 2.76056338028169,
878
+ "cpu_mem": 1.507319808,
879
+ "gpu_mem": 4.624002048,
880
+ "loss": 1.3273,
881
+ "grad_norm": 0.1946246474981308,
882
+ "learning_rate": 7.500000000000002e-05
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 2.788732394366197,
887
+ "cpu_mem": 1.507319808,
888
+ "gpu_mem": 4.623998976,
889
+ "loss": 1.2802,
890
+ "grad_norm": 0.2766704559326172,
891
+ "learning_rate": 7.17847194930753e-05
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 2.816901408450704,
896
+ "cpu_mem": 1.507319808,
897
+ "gpu_mem": 4.62399744,
898
+ "loss": 1.2638,
899
+ "grad_norm": 0.29240837693214417,
900
+ "learning_rate": 6.86180604201361e-05
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 2.845070422535211,
905
+ "cpu_mem": 1.507319808,
906
+ "gpu_mem": 4.624040448,
907
+ "loss": 1.2422,
908
+ "grad_norm": 0.3155769407749176,
909
+ "learning_rate": 6.550199129045668e-05
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 2.873239436619718,
914
+ "cpu_mem": 1.507319808,
915
+ "gpu_mem": 4.623979008,
916
+ "loss": 1.2926,
917
+ "grad_norm": 0.27102938294410706,
918
+ "learning_rate": 6.243844916478155e-05
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 2.9014084507042255,
923
+ "cpu_mem": 1.507319808,
924
+ "gpu_mem": 4.62402816,
925
+ "loss": 1.2868,
926
+ "grad_norm": 0.2362990826368332,
927
+ "learning_rate": 5.9429338451178355e-05
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 2.9295774647887325,
932
+ "cpu_mem": 1.507319808,
933
+ "gpu_mem": 4.624091136,
934
+ "loss": 1.323,
935
+ "grad_norm": 0.4770653247833252,
936
+ "learning_rate": 5.6476529721189974e-05
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 2.9577464788732395,
941
+ "cpu_mem": 1.507319808,
942
+ "gpu_mem": 4.62404352,
943
+ "loss": 1.2533,
944
+ "grad_norm": 0.3264782726764679,
945
+ "learning_rate": 5.358185854701909e-05
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 2.9859154929577465,
950
+ "cpu_mem": 1.507319808,
951
+ "gpu_mem": 4.624025088,
952
+ "loss": 1.3119,
953
+ "grad_norm": 0.20415638387203217,
954
+ "learning_rate": 5.074712436047112e-05
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 3.0140845070422535,
959
+ "cpu_mem": 1.507319808,
960
+ "gpu_mem": 4.676081664,
961
+ "loss": 1.8464,
962
+ "grad_norm": 0.7195807695388794,
963
+ "learning_rate": 4.7974089334362057e-05
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 3.0422535211267605,
968
+ "cpu_mem": 1.507319808,
969
+ "gpu_mem": 4.676063232,
970
+ "loss": 1.2953,
971
+ "grad_norm": 0.45035746693611145,
972
+ "learning_rate": 4.526447728708908e-05
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 3.0704225352112675,
977
+ "cpu_mem": 1.507319808,
978
+ "gpu_mem": 4.67605248,
979
+ "loss": 1.2954,
980
+ "grad_norm": 0.2725832462310791,
981
+ "learning_rate": 4.261997261104223e-05
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 3.0985915492957745,
986
+ "cpu_mem": 1.507319808,
987
+ "gpu_mem": 4.67610624,
988
+ "loss": 1.3324,
989
+ "grad_norm": 0.6822441816329956,
990
+ "learning_rate": 4.004221922552608e-05
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 3.1267605633802815,
995
+ "cpu_mem": 1.507319808,
996
+ "gpu_mem": 4.676066304,
997
+ "loss": 1.3058,
998
+ "grad_norm": 0.3432196378707886,
999
+ "learning_rate": 3.753281955483985e-05
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 3.1549295774647885,
1004
+ "cpu_mem": 1.507319808,
1005
+ "gpu_mem": 4.676084736,
1006
+ "loss": 1.2886,
1007
+ "grad_norm": 0.21611112356185913,
1008
+ "learning_rate": 3.509333353215331e-05
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 3.183098591549296,
1013
+ "cpu_mem": 1.507319808,
1014
+ "gpu_mem": 4.676147712,
1015
+ "loss": 1.2902,
1016
+ "grad_norm": 0.24663004279136658,
1017
+ "learning_rate": 3.2725277629795526e-05
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 3.211267605633803,
1022
+ "cpu_mem": 1.507319808,
1023
+ "gpu_mem": 4.67607552,
1024
+ "loss": 1.3149,
1025
+ "grad_norm": 0.3060291111469269,
1026
+ "learning_rate": 3.0430123916561672e-05
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 3.23943661971831,
1031
+ "cpu_mem": 1.507319808,
1032
+ "gpu_mem": 4.676069376,
1033
+ "loss": 1.3368,
1034
+ "grad_norm": 0.4497460722923279,
1035
+ "learning_rate": 2.8209299142621522e-05
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 3.267605633802817,
1040
+ "cpu_mem": 1.507319808,
1041
+ "gpu_mem": 4.676084736,
1042
+ "loss": 1.3084,
1043
+ "grad_norm": 0.27042055130004883,
1044
+ "learning_rate": 2.6064183852600797e-05
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 3.295774647887324,
1049
+ "cpu_mem": 1.507319808,
1050
+ "gpu_mem": 4.676100096,
1051
+ "loss": 1.2573,
1052
+ "grad_norm": 0.2981705367565155,
1053
+ "learning_rate": 2.3996111527384288e-05
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 3.323943661971831,
1058
+ "cpu_mem": 1.507319808,
1059
+ "gpu_mem": 4.67609088,
1060
+ "loss": 1.2871,
1061
+ "grad_norm": 0.305347740650177,
1062
+ "learning_rate": 2.2006367755176655e-05
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 3.352112676056338,
1067
+ "cpu_mem": 1.507319808,
1068
+ "gpu_mem": 4.676081664,
1069
+ "loss": 1.3115,
1070
+ "grad_norm": 0.23850424587726593,
1071
+ "learning_rate": 2.009618943233419e-05
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 3.380281690140845,
1076
+ "cpu_mem": 1.507319808,
1077
+ "gpu_mem": 4.676100096,
1078
+ "loss": 1.2868,
1079
+ "grad_norm": 0.288796603679657,
1080
+ "learning_rate": 1.82667639944657e-05
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 3.408450704225352,
1085
+ "cpu_mem": 1.507319808,
1086
+ "gpu_mem": 4.67609856,
1087
+ "loss": 1.2793,
1088
+ "grad_norm": 0.31622397899627686,
1089
+ "learning_rate": 1.6519228678279718e-05
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 3.436619718309859,
1094
+ "cpu_mem": 1.507319808,
1095
+ "gpu_mem": 4.676055552,
1096
+ "loss": 1.279,
1097
+ "grad_norm": 0.3226993680000305,
1098
+ "learning_rate": 1.4854669814637143e-05
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 3.464788732394366,
1103
+ "cpu_mem": 1.507319808,
1104
+ "gpu_mem": 4.676087808,
1105
+ "loss": 1.2784,
1106
+ "grad_norm": 0.3610694706439972,
1107
+ "learning_rate": 1.3274122153249028e-05
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 3.492957746478873,
1112
+ "cpu_mem": 1.507319808,
1113
+ "gpu_mem": 4.676041728,
1114
+ "loss": 1.3062,
1115
+ "grad_norm": 0.27508386969566345,
1116
+ "learning_rate": 1.1778568219438839e-05
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 3.52112676056338,
1121
+ "cpu_mem": 1.507319808,
1122
+ "gpu_mem": 4.676086272,
1123
+ "loss": 1.2735,
1124
+ "grad_norm": 0.5249335169792175,
1125
+ "learning_rate": 1.036893770336938e-05
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 3.5492957746478875,
1130
+ "cpu_mem": 1.507319808,
1131
+ "gpu_mem": 4.67603712,
1132
+ "loss": 1.267,
1133
+ "grad_norm": 0.382548063993454,
1134
+ "learning_rate": 9.046106882113751e-06
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 3.5774647887323945,
1139
+ "cpu_mem": 1.507319808,
1140
+ "gpu_mem": 4.676049408,
1141
+ "loss": 1.2886,
1142
+ "grad_norm": 0.2308499962091446,
1143
+ "learning_rate": 7.810898074930243e-06
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 3.6056338028169015,
1148
+ "cpu_mem": 1.507319808,
1149
+ "gpu_mem": 4.676073984,
1150
+ "loss": 1.2877,
1151
+ "grad_norm": 0.2855219841003418,
1152
+ "learning_rate": 6.664079132078881e-06
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 3.6338028169014085,
1157
+ "cpu_mem": 1.507319808,
1158
+ "gpu_mem": 4.676035584,
1159
+ "loss": 1.2868,
1160
+ "grad_norm": 0.21983076632022858,
1161
+ "learning_rate": 5.606362957498195e-06
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 3.6619718309859155,
1166
+ "cpu_mem": 1.507319808,
1167
+ "gpu_mem": 4.676038656,
1168
+ "loss": 1.3086,
1169
+ "grad_norm": 0.26207077503204346,
1170
+ "learning_rate": 4.638407065638322e-06
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 3.6901408450704225,
1175
+ "cpu_mem": 1.507319808,
1176
+ "gpu_mem": 4.676050944,
1177
+ "loss": 1.2698,
1178
+ "grad_norm": 0.31314849853515625,
1179
+ "learning_rate": 3.760813172726457e-06
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 3.7183098591549295,
1184
+ "cpu_mem": 1.507319808,
1185
+ "gpu_mem": 4.676015616,
1186
+ "loss": 1.2776,
1187
+ "grad_norm": 0.25790277123451233,
1188
+ "learning_rate": 2.9741268227184255e-06
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 3.7464788732394365,
1193
+ "cpu_mem": 1.507319808,
1194
+ "gpu_mem": 4.676057088,
1195
+ "loss": 1.2492,
1196
+ "grad_norm": 0.2964359223842621,
1197
+ "learning_rate": 2.2788370481687965e-06
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 3.7746478873239435,
1202
+ "cpu_mem": 1.507319808,
1203
+ "gpu_mem": 4.676072448,
1204
+ "loss": 1.251,
1205
+ "grad_norm": 0.3126276433467865,
1206
+ "learning_rate": 1.6753760662307215e-06
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 3.802816901408451,
1211
+ "cpu_mem": 1.507319808,
1212
+ "gpu_mem": 4.67603712,
1213
+ "loss": 1.3024,
1214
+ "grad_norm": 0.4269898235797882,
1215
+ "learning_rate": 1.1641190099741904e-06
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 3.830985915492958,
1220
+ "cpu_mem": 1.507319808,
1221
+ "gpu_mem": 4.6760448,
1222
+ "loss": 1.3213,
1223
+ "grad_norm": 0.368845134973526,
1224
+ "learning_rate": 7.453836951897885e-07
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 3.859154929577465,
1229
+ "cpu_mem": 1.507319808,
1230
+ "gpu_mem": 4.676066304,
1231
+ "loss": 1.2884,
1232
+ "grad_norm": 0.4196622669696808,
1233
+ "learning_rate": 4.194304228229806e-07
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 3.887323943661972,
1238
+ "cpu_mem": 1.507319808,
1239
+ "gpu_mem": 4.676077056,
1240
+ "loss": 1.3246,
1241
+ "grad_norm": 0.25930461287498474,
1242
+ "learning_rate": 1.8646181716164831e-07
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 3.915492957746479,
1247
+ "cpu_mem": 1.507319808,
1248
+ "gpu_mem": 4.676069376,
1249
+ "loss": 1.351,
1250
+ "grad_norm": 0.3037481904029846,
1251
+ "learning_rate": 4.662269987756317e-08
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 3.943661971830986,
1256
+ "cpu_mem": 1.507319808,
1257
+ "gpu_mem": 4.676103168,
1258
+ "loss": 1.312,
1259
+ "grad_norm": 0.31067949533462524,
1260
+ "learning_rate": 0.0
1261
+ },
1262
+ {
1263
+ "step": 140,
1264
+ "epoch": 3.943661971830986,
1265
+ "cpu_mem": 1.507319808,
1266
+ "gpu_mem": 4.676103168,
1267
+ "train_runtime": 689.3565,
1268
+ "train_samples_per_second": 13.061,
1269
+ "train_steps_per_second": 0.203,
1270
+ "total_flos": 0.0,
1271
+ "train_loss": 1.444502578462873
1272
+ }
1273
+ ]
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 8,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "arc_e",
3
+ "results": 0.4718013468013468
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "ARC_E",
5
+ "dataset_id": "allenai/ai2_arc",
6
+ "preprocess_id": "arc_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 3548160
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 4,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-arc_e-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T19:54:43.238015"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-arc_e-r8-a2/training_logs.json ADDED
@@ -0,0 +1,1273 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.028169014084507043,
5
+ "cpu_mem": 1.497096192,
6
+ "gpu_mem": 4.444216832,
7
+ "loss": 4.6319,
8
+ "grad_norm": 90.68135070800781,
9
+ "learning_rate": 2.1428571428571425e-05
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.056338028169014086,
14
+ "cpu_mem": 1.500438528,
15
+ "gpu_mem": 4.47266304,
16
+ "loss": 4.4578,
17
+ "grad_norm": 88.92479705810547,
18
+ "learning_rate": 4.285714285714285e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.08450704225352113,
23
+ "cpu_mem": 1.500635136,
24
+ "gpu_mem": 4.472641536,
25
+ "loss": 3.8569,
26
+ "grad_norm": 78.79303741455078,
27
+ "learning_rate": 6.428571428571427e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.11267605633802817,
32
+ "cpu_mem": 1.500831744,
33
+ "gpu_mem": 4.472620032,
34
+ "loss": 3.0224,
35
+ "grad_norm": 53.755924224853516,
36
+ "learning_rate": 8.57142857142857e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.14084507042253522,
41
+ "cpu_mem": 1.500831744,
42
+ "gpu_mem": 4.472661504,
43
+ "loss": 2.0876,
44
+ "grad_norm": 23.539710998535156,
45
+ "learning_rate": 0.00010714285714285714
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.16901408450704225,
50
+ "cpu_mem": 1.500831744,
51
+ "gpu_mem": 4.472636928,
52
+ "loss": 1.6383,
53
+ "grad_norm": 9.254476547241211,
54
+ "learning_rate": 0.00012857142857142855
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.19718309859154928,
59
+ "cpu_mem": 1.500831744,
60
+ "gpu_mem": 4.472659968,
61
+ "loss": 1.5543,
62
+ "grad_norm": 5.9162468910217285,
63
+ "learning_rate": 0.00015
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.22535211267605634,
68
+ "cpu_mem": 1.500831744,
69
+ "gpu_mem": 4.472618496,
70
+ "loss": 1.4366,
71
+ "grad_norm": 4.133505344390869,
72
+ "learning_rate": 0.0001714285714285714
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.2535211267605634,
77
+ "cpu_mem": 1.500831744,
78
+ "gpu_mem": 4.472620032,
79
+ "loss": 1.4241,
80
+ "grad_norm": 8.360941886901855,
81
+ "learning_rate": 0.00019285714285714286
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.28169014084507044,
86
+ "cpu_mem": 1.501028352,
87
+ "gpu_mem": 4.472615424,
88
+ "loss": 1.533,
89
+ "grad_norm": 9.691158294677734,
90
+ "learning_rate": 0.00021428571428571427
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.30985915492957744,
95
+ "cpu_mem": 1.50122496,
96
+ "gpu_mem": 4.47269376,
97
+ "loss": 1.4012,
98
+ "grad_norm": 4.719605922698975,
99
+ "learning_rate": 0.00023571428571428569
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.3380281690140845,
104
+ "cpu_mem": 1.50122496,
105
+ "gpu_mem": 4.472667648,
106
+ "loss": 1.3237,
107
+ "grad_norm": 3.155473470687866,
108
+ "learning_rate": 0.0002571428571428571
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.36619718309859156,
113
+ "cpu_mem": 1.50122496,
114
+ "gpu_mem": 4.472618496,
115
+ "loss": 1.4359,
116
+ "grad_norm": 5.259221076965332,
117
+ "learning_rate": 0.00027857142857142854
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.39436619718309857,
122
+ "cpu_mem": 1.50122496,
123
+ "gpu_mem": 4.47264,
124
+ "loss": 1.3195,
125
+ "grad_norm": 1.7507895231246948,
126
+ "learning_rate": 0.0003
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.4225352112676056,
131
+ "cpu_mem": 1.50122496,
132
+ "gpu_mem": 4.47261696,
133
+ "loss": 1.406,
134
+ "grad_norm": 4.849987030029297,
135
+ "learning_rate": 0.0002999533773001224
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.4507042253521127,
140
+ "cpu_mem": 1.50122496,
141
+ "gpu_mem": 4.472621568,
142
+ "loss": 1.4401,
143
+ "grad_norm": 4.631597518920898,
144
+ "learning_rate": 0.0002998135381828383
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.4788732394366197,
149
+ "cpu_mem": 1.50122496,
150
+ "gpu_mem": 4.472658432,
151
+ "loss": 1.3536,
152
+ "grad_norm": 2.7577764987945557,
153
+ "learning_rate": 0.00029958056957717696
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.5070422535211268,
158
+ "cpu_mem": 1.50122496,
159
+ "gpu_mem": 4.472669184,
160
+ "loss": 1.3395,
161
+ "grad_norm": 2.595319986343384,
162
+ "learning_rate": 0.0002992546163048102
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.5352112676056338,
167
+ "cpu_mem": 1.50122496,
168
+ "gpu_mem": 4.472612352,
169
+ "loss": 1.2984,
170
+ "grad_norm": 2.2393436431884766,
171
+ "learning_rate": 0.0002988358809900258
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.5633802816901409,
176
+ "cpu_mem": 1.50122496,
177
+ "gpu_mem": 4.472683008,
178
+ "loss": 1.3524,
179
+ "grad_norm": 1.4507402181625366,
180
+ "learning_rate": 0.0002983246239337692
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.5915492957746479,
185
+ "cpu_mem": 1.50122496,
186
+ "gpu_mem": 4.472681472,
187
+ "loss": 1.3479,
188
+ "grad_norm": 2.475274085998535,
189
+ "learning_rate": 0.0002977211629518312
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.6197183098591549,
194
+ "cpu_mem": 1.50122496,
195
+ "gpu_mem": 4.472638464,
196
+ "loss": 1.3795,
197
+ "grad_norm": 2.8394975662231445,
198
+ "learning_rate": 0.00029702587317728153
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.647887323943662,
203
+ "cpu_mem": 1.50122496,
204
+ "gpu_mem": 4.47265536,
205
+ "loss": 1.3218,
206
+ "grad_norm": 1.7564839124679565,
207
+ "learning_rate": 0.0002962391868272735
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.676056338028169,
212
+ "cpu_mem": 1.50122496,
213
+ "gpu_mem": 4.472612352,
214
+ "loss": 1.3391,
215
+ "grad_norm": 1.4141991138458252,
216
+ "learning_rate": 0.00029536159293436166
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.704225352112676,
221
+ "cpu_mem": 1.50122496,
222
+ "gpu_mem": 4.472641536,
223
+ "loss": 1.3794,
224
+ "grad_norm": 1.3704174757003784,
225
+ "learning_rate": 0.00029439363704250176
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.7323943661971831,
230
+ "cpu_mem": 1.50122496,
231
+ "gpu_mem": 4.472621568,
232
+ "loss": 1.4543,
233
+ "grad_norm": 1.8014308214187622,
234
+ "learning_rate": 0.00029333592086792107
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.7605633802816901,
239
+ "cpu_mem": 1.50122496,
240
+ "gpu_mem": 4.47264768,
241
+ "loss": 1.3353,
242
+ "grad_norm": 1.1347827911376953,
243
+ "learning_rate": 0.0002921891019250697
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.7887323943661971,
248
+ "cpu_mem": 1.50122496,
249
+ "gpu_mem": 4.47264768,
250
+ "loss": 1.386,
251
+ "grad_norm": 1.4631065130233765,
252
+ "learning_rate": 0.0002909538931178862
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.8169014084507042,
257
+ "cpu_mem": 1.50122496,
258
+ "gpu_mem": 4.472626176,
259
+ "loss": 1.2431,
260
+ "grad_norm": 1.1491742134094238,
261
+ "learning_rate": 0.00028963106229663063
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.8450704225352113,
266
+ "cpu_mem": 1.50122496,
267
+ "gpu_mem": 4.47261696,
268
+ "loss": 1.3533,
269
+ "grad_norm": 1.2432610988616943,
270
+ "learning_rate": 0.00028822143178056114
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.8732394366197183,
275
+ "cpu_mem": 1.50122496,
276
+ "gpu_mem": 4.472635392,
277
+ "loss": 1.3643,
278
+ "grad_norm": 1.3368273973464966,
279
+ "learning_rate": 0.00028672587784675096
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.9014084507042254,
284
+ "cpu_mem": 1.50122496,
285
+ "gpu_mem": 4.472658432,
286
+ "loss": 1.3383,
287
+ "grad_norm": 1.5168676376342773,
288
+ "learning_rate": 0.0002851453301853628
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.9295774647887324,
293
+ "cpu_mem": 1.50122496,
294
+ "gpu_mem": 4.47265536,
295
+ "loss": 1.356,
296
+ "grad_norm": 0.9831933975219727,
297
+ "learning_rate": 0.00028348077132172027
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.9577464788732394,
302
+ "cpu_mem": 1.50122496,
303
+ "gpu_mem": 4.472658432,
304
+ "loss": 1.4134,
305
+ "grad_norm": 1.6918983459472656,
306
+ "learning_rate": 0.0002817332360055343
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.9859154929577465,
311
+ "cpu_mem": 1.50122496,
312
+ "gpu_mem": 4.47264,
313
+ "loss": 1.328,
314
+ "grad_norm": 1.3503168821334839,
315
+ "learning_rate": 0.0002799038105676658
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 1.0140845070422535,
320
+ "cpu_mem": 1.50122496,
321
+ "gpu_mem": 4.486823424,
322
+ "loss": 1.9758,
323
+ "grad_norm": 2.2334275245666504,
324
+ "learning_rate": 0.0002779936322448233
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 1.0422535211267605,
329
+ "cpu_mem": 1.50122496,
330
+ "gpu_mem": 4.486828032,
331
+ "loss": 1.3284,
332
+ "grad_norm": 0.5176252722740173,
333
+ "learning_rate": 0.0002760038884726157
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 1.0704225352112675,
338
+ "cpu_mem": 1.50122496,
339
+ "gpu_mem": 4.486806528,
340
+ "loss": 1.23,
341
+ "grad_norm": 0.9931122660636902,
342
+ "learning_rate": 0.00027393581614739923
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 1.0985915492957747,
347
+ "cpu_mem": 1.50122496,
348
+ "gpu_mem": 4.486795776,
349
+ "loss": 1.3856,
350
+ "grad_norm": 2.389493227005005,
351
+ "learning_rate": 0.0002717907008573785
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 1.1267605633802817,
356
+ "cpu_mem": 1.50122496,
357
+ "gpu_mem": 4.486858752,
358
+ "loss": 1.4521,
359
+ "grad_norm": 2.60158109664917,
360
+ "learning_rate": 0.0002695698760834384
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 1.1549295774647887,
365
+ "cpu_mem": 1.50122496,
366
+ "gpu_mem": 4.486818816,
367
+ "loss": 1.3356,
368
+ "grad_norm": 1.25742769241333,
369
+ "learning_rate": 0.00026727472237020447
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 1.1830985915492958,
374
+ "cpu_mem": 1.50122496,
375
+ "gpu_mem": 4.486861824,
376
+ "loss": 1.3599,
377
+ "grad_norm": 1.3035224676132202,
378
+ "learning_rate": 0.00026490666646784665
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 1.2112676056338028,
383
+ "cpu_mem": 1.50122496,
384
+ "gpu_mem": 4.486811136,
385
+ "loss": 1.4067,
386
+ "grad_norm": 1.275063157081604,
387
+ "learning_rate": 0.0002624671804451601
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 1.2394366197183098,
392
+ "cpu_mem": 1.50122496,
393
+ "gpu_mem": 4.486875648,
394
+ "loss": 1.3442,
395
+ "grad_norm": 1.034511685371399,
396
+ "learning_rate": 0.0002599577807744739
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 1.267605633802817,
401
+ "cpu_mem": 1.50122496,
402
+ "gpu_mem": 4.486843392,
403
+ "loss": 1.3551,
404
+ "grad_norm": 0.8410855531692505,
405
+ "learning_rate": 0.0002573800273889577
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 1.295774647887324,
410
+ "cpu_mem": 1.50122496,
411
+ "gpu_mem": 4.486848,
412
+ "loss": 1.3701,
413
+ "grad_norm": 0.9161557555198669,
414
+ "learning_rate": 0.0002547355227129109
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 1.323943661971831,
419
+ "cpu_mem": 1.50122496,
420
+ "gpu_mem": 4.48679424,
421
+ "loss": 1.3437,
422
+ "grad_norm": 1.512771725654602,
423
+ "learning_rate": 0.00025202591066563786
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 1.352112676056338,
428
+ "cpu_mem": 1.50122496,
429
+ "gpu_mem": 4.486808064,
430
+ "loss": 1.346,
431
+ "grad_norm": 1.1189930438995361,
432
+ "learning_rate": 0.0002492528756395289
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 1.380281690140845,
437
+ "cpu_mem": 1.50122496,
438
+ "gpu_mem": 4.486797312,
439
+ "loss": 1.3254,
440
+ "grad_norm": 0.9964473247528076,
441
+ "learning_rate": 0.0002464181414529809
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 1.408450704225352,
446
+ "cpu_mem": 1.50122496,
447
+ "gpu_mem": 4.486811136,
448
+ "loss": 1.3244,
449
+ "grad_norm": 0.8401857018470764,
450
+ "learning_rate": 0.00024352347027881003
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 1.436619718309859,
455
+ "cpu_mem": 1.50122496,
456
+ "gpu_mem": 4.48686336,
457
+ "loss": 1.3491,
458
+ "grad_norm": 0.9280771017074585,
459
+ "learning_rate": 0.0002405706615488216
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 1.4647887323943662,
464
+ "cpu_mem": 1.50122496,
465
+ "gpu_mem": 4.486811136,
466
+ "loss": 1.3614,
467
+ "grad_norm": 1.0336897373199463,
468
+ "learning_rate": 0.00023756155083521846
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 1.4929577464788732,
473
+ "cpu_mem": 1.50122496,
474
+ "gpu_mem": 4.486880256,
475
+ "loss": 1.3085,
476
+ "grad_norm": 0.844360888004303,
477
+ "learning_rate": 0.00023449800870954326
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 1.5211267605633803,
482
+ "cpu_mem": 1.50122496,
483
+ "gpu_mem": 4.486848,
484
+ "loss": 1.2801,
485
+ "grad_norm": 0.8565107583999634,
486
+ "learning_rate": 0.0002313819395798639
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 1.5492957746478875,
491
+ "cpu_mem": 1.50122496,
492
+ "gpu_mem": 4.486857216,
493
+ "loss": 1.3645,
494
+ "grad_norm": 0.8742291331291199,
495
+ "learning_rate": 0.0002282152805069247
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 1.5774647887323945,
500
+ "cpu_mem": 1.50122496,
501
+ "gpu_mem": 4.48683264,
502
+ "loss": 1.3072,
503
+ "grad_norm": 0.6819919943809509,
504
+ "learning_rate": 0.000225
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 1.6056338028169015,
509
+ "cpu_mem": 1.50122496,
510
+ "gpu_mem": 4.486866432,
511
+ "loss": 1.2991,
512
+ "grad_norm": 0.5670233964920044,
513
+ "learning_rate": 0.00022173809679319772
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 1.6338028169014085,
518
+ "cpu_mem": 1.50122496,
519
+ "gpu_mem": 4.486848,
520
+ "loss": 1.3135,
521
+ "grad_norm": 0.7747777700424194,
522
+ "learning_rate": 0.00021843159860297442
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 1.6619718309859155,
527
+ "cpu_mem": 1.50122496,
528
+ "gpu_mem": 4.486834176,
529
+ "loss": 1.3134,
530
+ "grad_norm": 0.5816622972488403,
531
+ "learning_rate": 0.00021508256086763368
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 1.6901408450704225,
536
+ "cpu_mem": 1.50122496,
537
+ "gpu_mem": 4.486872576,
538
+ "loss": 1.2977,
539
+ "grad_norm": 1.1774585247039795,
540
+ "learning_rate": 0.00021169306546959174
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 1.7183098591549295,
545
+ "cpu_mem": 1.50122496,
546
+ "gpu_mem": 4.486804992,
547
+ "loss": 1.2838,
548
+ "grad_norm": 0.9861280918121338,
549
+ "learning_rate": 0.0002082652194412042
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 1.7464788732394365,
554
+ "cpu_mem": 1.50122496,
555
+ "gpu_mem": 4.486852608,
556
+ "loss": 1.3528,
557
+ "grad_norm": 1.0061546564102173,
558
+ "learning_rate": 0.00020480115365495926
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 1.7746478873239435,
563
+ "cpu_mem": 1.50122496,
564
+ "gpu_mem": 4.48680192,
565
+ "loss": 1.316,
566
+ "grad_norm": 1.0553845167160034,
567
+ "learning_rate": 0.00020130302149885031
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 1.8028169014084507,
572
+ "cpu_mem": 1.50122496,
573
+ "gpu_mem": 4.486851072,
574
+ "loss": 1.3355,
575
+ "grad_norm": 1.1395550966262817,
576
+ "learning_rate": 0.00019777299753775265
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 1.8309859154929577,
581
+ "cpu_mem": 1.50122496,
582
+ "gpu_mem": 4.486849536,
583
+ "loss": 1.3297,
584
+ "grad_norm": 1.126117467880249,
585
+ "learning_rate": 0.00019421327616163563
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 1.8591549295774648,
590
+ "cpu_mem": 1.50122496,
591
+ "gpu_mem": 4.486867968,
592
+ "loss": 1.2832,
593
+ "grad_norm": 1.099063754081726,
594
+ "learning_rate": 0.00019062607022145078
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 1.887323943661972,
599
+ "cpu_mem": 1.50122496,
600
+ "gpu_mem": 4.4868096,
601
+ "loss": 1.314,
602
+ "grad_norm": 1.0150798559188843,
603
+ "learning_rate": 0.00018701360965354402
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 1.915492957746479,
608
+ "cpu_mem": 1.50122496,
609
+ "gpu_mem": 4.486821888,
610
+ "loss": 1.3478,
611
+ "grad_norm": 1.0355052947998047,
612
+ "learning_rate": 0.00018337814009344714
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 1.943661971830986,
617
+ "cpu_mem": 1.50122496,
618
+ "gpu_mem": 4.486846464,
619
+ "loss": 1.3066,
620
+ "grad_norm": 1.6115487813949585,
621
+ "learning_rate": 0.0001797219214799096
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 1.971830985915493,
626
+ "cpu_mem": 1.50122496,
627
+ "gpu_mem": 4.486823424,
628
+ "loss": 1.2561,
629
+ "grad_norm": 0.8908656239509583,
630
+ "learning_rate": 0.00017604722665003956
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 2.0,
635
+ "cpu_mem": 1.50122496,
636
+ "gpu_mem": 4.486659072,
637
+ "loss": 1.9607,
638
+ "grad_norm": 3.0582046508789062,
639
+ "learning_rate": 0.00017235633992642615
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 2.028169014084507,
644
+ "cpu_mem": 1.50122496,
645
+ "gpu_mem": 4.472650752,
646
+ "loss": 1.3384,
647
+ "grad_norm": 1.3399873971939087,
648
+ "learning_rate": 0.00016865155569712278
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 2.056338028169014,
653
+ "cpu_mem": 1.50122496,
654
+ "gpu_mem": 4.472613888,
655
+ "loss": 1.2917,
656
+ "grad_norm": 0.8912840485572815,
657
+ "learning_rate": 0.0001649351769893725
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 2.084507042253521,
662
+ "cpu_mem": 1.50122496,
663
+ "gpu_mem": 4.472673792,
664
+ "loss": 1.278,
665
+ "grad_norm": 0.9684709906578064,
666
+ "learning_rate": 0.00016120951403796364
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 2.112676056338028,
671
+ "cpu_mem": 1.50122496,
672
+ "gpu_mem": 4.472641536,
673
+ "loss": 1.3237,
674
+ "grad_norm": 1.1640229225158691,
675
+ "learning_rate": 0.00015747688284910457
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 2.140845070422535,
680
+ "cpu_mem": 1.50122496,
681
+ "gpu_mem": 4.472652288,
682
+ "loss": 1.2621,
683
+ "grad_norm": 0.9244154095649719,
684
+ "learning_rate": 0.00015373960376071093
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 2.169014084507042,
689
+ "cpu_mem": 1.50122496,
690
+ "gpu_mem": 4.472689152,
691
+ "loss": 1.3137,
692
+ "grad_norm": 1.2289559841156006,
693
+ "learning_rate": 0.00015
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 2.1971830985915495,
698
+ "cpu_mem": 1.50122496,
699
+ "gpu_mem": 4.472673792,
700
+ "loss": 1.3245,
701
+ "grad_norm": 1.2233545780181885,
702
+ "learning_rate": 0.00014626039623928907
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 2.2253521126760565,
707
+ "cpu_mem": 1.50122496,
708
+ "gpu_mem": 4.47262464,
709
+ "loss": 1.2031,
710
+ "grad_norm": 1.283607006072998,
711
+ "learning_rate": 0.0001425231171508954
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 2.2535211267605635,
716
+ "cpu_mem": 1.50122496,
717
+ "gpu_mem": 4.472669184,
718
+ "loss": 1.299,
719
+ "grad_norm": 1.1909815073013306,
720
+ "learning_rate": 0.00013879048596203636
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 2.2816901408450705,
725
+ "cpu_mem": 1.50122496,
726
+ "gpu_mem": 4.47265536,
727
+ "loss": 1.3448,
728
+ "grad_norm": 2.1881954669952393,
729
+ "learning_rate": 0.0001350648230106275
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 2.3098591549295775,
734
+ "cpu_mem": 1.50122496,
735
+ "gpu_mem": 4.472623104,
736
+ "loss": 1.3249,
737
+ "grad_norm": 2.604688882827759,
738
+ "learning_rate": 0.00013134844430287725
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 2.3380281690140845,
743
+ "cpu_mem": 1.50122496,
744
+ "gpu_mem": 4.472673792,
745
+ "loss": 1.2162,
746
+ "grad_norm": 2.0677669048309326,
747
+ "learning_rate": 0.0001276436600735738
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 2.3661971830985915,
752
+ "cpu_mem": 1.50122496,
753
+ "gpu_mem": 4.472612352,
754
+ "loss": 1.3604,
755
+ "grad_norm": 2.706294536590576,
756
+ "learning_rate": 0.00012395277334996044
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 2.3943661971830985,
761
+ "cpu_mem": 1.50122496,
762
+ "gpu_mem": 4.472658432,
763
+ "loss": 1.2945,
764
+ "grad_norm": 1.944639801979065,
765
+ "learning_rate": 0.00012027807852009038
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 2.4225352112676055,
770
+ "cpu_mem": 1.50122496,
771
+ "gpu_mem": 4.472612352,
772
+ "loss": 1.2707,
773
+ "grad_norm": 1.4408667087554932,
774
+ "learning_rate": 0.00011662185990655284
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 2.4507042253521125,
779
+ "cpu_mem": 1.50122496,
780
+ "gpu_mem": 4.472643072,
781
+ "loss": 1.3298,
782
+ "grad_norm": 2.4344043731689453,
783
+ "learning_rate": 0.00011298639034645593
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 2.4788732394366195,
788
+ "cpu_mem": 1.50122496,
789
+ "gpu_mem": 4.472618496,
790
+ "loss": 1.2685,
791
+ "grad_norm": 1.9301280975341797,
792
+ "learning_rate": 0.00010937392977854923
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 2.507042253521127,
797
+ "cpu_mem": 1.50122496,
798
+ "gpu_mem": 4.472672256,
799
+ "loss": 1.2436,
800
+ "grad_norm": 1.8113797903060913,
801
+ "learning_rate": 0.00010578672383836435
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 2.535211267605634,
806
+ "cpu_mem": 1.50122496,
807
+ "gpu_mem": 4.472653824,
808
+ "loss": 1.2687,
809
+ "grad_norm": 1.6126199960708618,
810
+ "learning_rate": 0.00010222700246224735
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 2.563380281690141,
815
+ "cpu_mem": 1.50122496,
816
+ "gpu_mem": 4.472603136,
817
+ "loss": 1.3103,
818
+ "grad_norm": 2.3668925762176514,
819
+ "learning_rate": 9.869697850114969e-05
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 2.591549295774648,
824
+ "cpu_mem": 1.50122496,
825
+ "gpu_mem": 4.472627712,
826
+ "loss": 1.282,
827
+ "grad_norm": 1.9559086561203003,
828
+ "learning_rate": 9.519884634504074e-05
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 2.619718309859155,
833
+ "cpu_mem": 1.50122496,
834
+ "gpu_mem": 4.472630784,
835
+ "loss": 1.2231,
836
+ "grad_norm": 1.8475598096847534,
837
+ "learning_rate": 9.17347805587958e-05
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 2.647887323943662,
842
+ "cpu_mem": 1.50122496,
843
+ "gpu_mem": 4.472623104,
844
+ "loss": 1.2242,
845
+ "grad_norm": 2.266403913497925,
846
+ "learning_rate": 8.830693453040829e-05
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 2.676056338028169,
851
+ "cpu_mem": 1.50122496,
852
+ "gpu_mem": 4.472661504,
853
+ "loss": 1.2711,
854
+ "grad_norm": 1.83024263381958,
855
+ "learning_rate": 8.491743913236628e-05
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 2.704225352112676,
860
+ "cpu_mem": 1.50122496,
861
+ "gpu_mem": 4.47267072,
862
+ "loss": 1.202,
863
+ "grad_norm": 2.0701658725738525,
864
+ "learning_rate": 8.156840139702554e-05
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 2.732394366197183,
869
+ "cpu_mem": 1.50122496,
870
+ "gpu_mem": 4.472613888,
871
+ "loss": 1.2861,
872
+ "grad_norm": 2.78592848777771,
873
+ "learning_rate": 7.82619032068023e-05
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 2.76056338028169,
878
+ "cpu_mem": 1.50122496,
879
+ "gpu_mem": 4.472613888,
880
+ "loss": 1.2864,
881
+ "grad_norm": 1.7303667068481445,
882
+ "learning_rate": 7.500000000000002e-05
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 2.788732394366197,
887
+ "cpu_mem": 1.50122496,
888
+ "gpu_mem": 4.472610816,
889
+ "loss": 1.2419,
890
+ "grad_norm": 1.7865890264511108,
891
+ "learning_rate": 7.17847194930753e-05
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 2.816901408450704,
896
+ "cpu_mem": 1.50122496,
897
+ "gpu_mem": 4.47260928,
898
+ "loss": 1.1842,
899
+ "grad_norm": 2.1414265632629395,
900
+ "learning_rate": 6.86180604201361e-05
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 2.845070422535211,
905
+ "cpu_mem": 1.50122496,
906
+ "gpu_mem": 4.472652288,
907
+ "loss": 1.1673,
908
+ "grad_norm": 2.0667600631713867,
909
+ "learning_rate": 6.550199129045668e-05
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 2.873239436619718,
914
+ "cpu_mem": 1.50122496,
915
+ "gpu_mem": 4.472590848,
916
+ "loss": 1.2343,
917
+ "grad_norm": 1.725559949874878,
918
+ "learning_rate": 6.243844916478155e-05
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 2.9014084507042255,
923
+ "cpu_mem": 1.50122496,
924
+ "gpu_mem": 4.47264,
925
+ "loss": 1.2293,
926
+ "grad_norm": 2.002014636993408,
927
+ "learning_rate": 5.9429338451178355e-05
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 2.9295774647887325,
932
+ "cpu_mem": 1.50122496,
933
+ "gpu_mem": 4.472702976,
934
+ "loss": 1.3138,
935
+ "grad_norm": 3.1678977012634277,
936
+ "learning_rate": 5.6476529721189974e-05
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 2.9577464788732395,
941
+ "cpu_mem": 1.50122496,
942
+ "gpu_mem": 4.47265536,
943
+ "loss": 1.167,
944
+ "grad_norm": 2.2888123989105225,
945
+ "learning_rate": 5.358185854701909e-05
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 2.9859154929577465,
950
+ "cpu_mem": 1.50122496,
951
+ "gpu_mem": 4.472636928,
952
+ "loss": 1.2732,
953
+ "grad_norm": 2.5358312129974365,
954
+ "learning_rate": 5.074712436047112e-05
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 3.0140845070422535,
959
+ "cpu_mem": 1.50122496,
960
+ "gpu_mem": 4.486846464,
961
+ "loss": 1.681,
962
+ "grad_norm": 4.78402042388916,
963
+ "learning_rate": 4.7974089334362057e-05
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 3.0422535211267605,
968
+ "cpu_mem": 1.50122496,
969
+ "gpu_mem": 4.486828032,
970
+ "loss": 1.1895,
971
+ "grad_norm": 2.0640642642974854,
972
+ "learning_rate": 4.526447728708908e-05
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 3.0704225352112675,
977
+ "cpu_mem": 1.50122496,
978
+ "gpu_mem": 4.48681728,
979
+ "loss": 1.151,
980
+ "grad_norm": 2.4174320697784424,
981
+ "learning_rate": 4.261997261104223e-05
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 3.0985915492957745,
986
+ "cpu_mem": 1.50122496,
987
+ "gpu_mem": 4.48687104,
988
+ "loss": 1.2597,
989
+ "grad_norm": 3.956610679626465,
990
+ "learning_rate": 4.004221922552608e-05
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 3.1267605633802815,
995
+ "cpu_mem": 1.50122496,
996
+ "gpu_mem": 4.486831104,
997
+ "loss": 1.1957,
998
+ "grad_norm": 1.9434964656829834,
999
+ "learning_rate": 3.753281955483985e-05
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 3.1549295774647885,
1004
+ "cpu_mem": 1.50122496,
1005
+ "gpu_mem": 4.486849536,
1006
+ "loss": 1.1921,
1007
+ "grad_norm": 1.951103925704956,
1008
+ "learning_rate": 3.509333353215331e-05
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 3.183098591549296,
1013
+ "cpu_mem": 1.50122496,
1014
+ "gpu_mem": 4.486912512,
1015
+ "loss": 1.1653,
1016
+ "grad_norm": 1.9486435651779175,
1017
+ "learning_rate": 3.2725277629795526e-05
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 3.211267605633803,
1022
+ "cpu_mem": 1.50122496,
1023
+ "gpu_mem": 4.48684032,
1024
+ "loss": 1.2145,
1025
+ "grad_norm": 2.336803913116455,
1026
+ "learning_rate": 3.0430123916561672e-05
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 3.23943661971831,
1031
+ "cpu_mem": 1.50122496,
1032
+ "gpu_mem": 4.486834176,
1033
+ "loss": 1.234,
1034
+ "grad_norm": 2.942000389099121,
1035
+ "learning_rate": 2.8209299142621522e-05
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 3.267605633802817,
1040
+ "cpu_mem": 1.50122496,
1041
+ "gpu_mem": 4.486849536,
1042
+ "loss": 1.1358,
1043
+ "grad_norm": 1.8644171953201294,
1044
+ "learning_rate": 2.6064183852600797e-05
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 3.295774647887324,
1049
+ "cpu_mem": 1.50122496,
1050
+ "gpu_mem": 4.486864896,
1051
+ "loss": 1.1317,
1052
+ "grad_norm": 2.1343953609466553,
1053
+ "learning_rate": 2.3996111527384288e-05
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 3.323943661971831,
1058
+ "cpu_mem": 1.50122496,
1059
+ "gpu_mem": 4.48685568,
1060
+ "loss": 1.1576,
1061
+ "grad_norm": 2.370473861694336,
1062
+ "learning_rate": 2.2006367755176655e-05
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 3.352112676056338,
1067
+ "cpu_mem": 1.50122496,
1068
+ "gpu_mem": 4.486846464,
1069
+ "loss": 1.1716,
1070
+ "grad_norm": 2.426577091217041,
1071
+ "learning_rate": 2.009618943233419e-05
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 3.380281690140845,
1076
+ "cpu_mem": 1.50122496,
1077
+ "gpu_mem": 4.486864896,
1078
+ "loss": 1.176,
1079
+ "grad_norm": 2.2006006240844727,
1080
+ "learning_rate": 1.82667639944657e-05
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 3.408450704225352,
1085
+ "cpu_mem": 1.50122496,
1086
+ "gpu_mem": 4.48686336,
1087
+ "loss": 1.1531,
1088
+ "grad_norm": 2.4512929916381836,
1089
+ "learning_rate": 1.6519228678279718e-05
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 3.436619718309859,
1094
+ "cpu_mem": 1.50122496,
1095
+ "gpu_mem": 4.486820352,
1096
+ "loss": 1.0892,
1097
+ "grad_norm": 3.1564323902130127,
1098
+ "learning_rate": 1.4854669814637143e-05
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 3.464788732394366,
1103
+ "cpu_mem": 1.50122496,
1104
+ "gpu_mem": 4.486852608,
1105
+ "loss": 1.144,
1106
+ "grad_norm": 2.4003117084503174,
1107
+ "learning_rate": 1.3274122153249028e-05
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 3.492957746478873,
1112
+ "cpu_mem": 1.50122496,
1113
+ "gpu_mem": 4.486806528,
1114
+ "loss": 1.1869,
1115
+ "grad_norm": 2.464019298553467,
1116
+ "learning_rate": 1.1778568219438839e-05
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 3.52112676056338,
1121
+ "cpu_mem": 1.50122496,
1122
+ "gpu_mem": 4.486851072,
1123
+ "loss": 1.1083,
1124
+ "grad_norm": 3.2846879959106445,
1125
+ "learning_rate": 1.036893770336938e-05
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 3.5492957746478875,
1130
+ "cpu_mem": 1.50122496,
1131
+ "gpu_mem": 4.48680192,
1132
+ "loss": 1.0998,
1133
+ "grad_norm": 2.879990577697754,
1134
+ "learning_rate": 9.046106882113751e-06
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 3.5774647887323945,
1139
+ "cpu_mem": 1.50122496,
1140
+ "gpu_mem": 4.486814208,
1141
+ "loss": 1.1369,
1142
+ "grad_norm": 2.237708568572998,
1143
+ "learning_rate": 7.810898074930243e-06
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 3.6056338028169015,
1148
+ "cpu_mem": 1.50122496,
1149
+ "gpu_mem": 4.486838784,
1150
+ "loss": 1.1598,
1151
+ "grad_norm": 2.6036179065704346,
1152
+ "learning_rate": 6.664079132078881e-06
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 3.6338028169014085,
1157
+ "cpu_mem": 1.50122496,
1158
+ "gpu_mem": 4.486800384,
1159
+ "loss": 1.1029,
1160
+ "grad_norm": 2.8772053718566895,
1161
+ "learning_rate": 5.606362957498195e-06
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 3.6619718309859155,
1166
+ "cpu_mem": 1.50122496,
1167
+ "gpu_mem": 4.486803456,
1168
+ "loss": 1.167,
1169
+ "grad_norm": 2.4127655029296875,
1170
+ "learning_rate": 4.638407065638322e-06
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 3.6901408450704225,
1175
+ "cpu_mem": 1.50122496,
1176
+ "gpu_mem": 4.486815744,
1177
+ "loss": 1.0721,
1178
+ "grad_norm": 2.208256721496582,
1179
+ "learning_rate": 3.760813172726457e-06
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 3.7183098591549295,
1184
+ "cpu_mem": 1.50122496,
1185
+ "gpu_mem": 4.486780416,
1186
+ "loss": 1.1263,
1187
+ "grad_norm": 2.983234167098999,
1188
+ "learning_rate": 2.9741268227184255e-06
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 3.7464788732394365,
1193
+ "cpu_mem": 1.50122496,
1194
+ "gpu_mem": 4.486821888,
1195
+ "loss": 1.1618,
1196
+ "grad_norm": 2.294952630996704,
1197
+ "learning_rate": 2.2788370481687965e-06
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 3.7746478873239435,
1202
+ "cpu_mem": 1.50122496,
1203
+ "gpu_mem": 4.486837248,
1204
+ "loss": 1.1226,
1205
+ "grad_norm": 2.202094078063965,
1206
+ "learning_rate": 1.6753760662307215e-06
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 3.802816901408451,
1211
+ "cpu_mem": 1.50122496,
1212
+ "gpu_mem": 4.48680192,
1213
+ "loss": 1.1488,
1214
+ "grad_norm": 2.804492712020874,
1215
+ "learning_rate": 1.1641190099741904e-06
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 3.830985915492958,
1220
+ "cpu_mem": 1.50122496,
1221
+ "gpu_mem": 4.4868096,
1222
+ "loss": 1.173,
1223
+ "grad_norm": 3.213949203491211,
1224
+ "learning_rate": 7.453836951897885e-07
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 3.859154929577465,
1229
+ "cpu_mem": 1.50122496,
1230
+ "gpu_mem": 4.486831104,
1231
+ "loss": 1.112,
1232
+ "grad_norm": 3.0322439670562744,
1233
+ "learning_rate": 4.194304228229806e-07
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 3.887323943661972,
1238
+ "cpu_mem": 1.50122496,
1239
+ "gpu_mem": 4.486841856,
1240
+ "loss": 1.1919,
1241
+ "grad_norm": 2.2161591053009033,
1242
+ "learning_rate": 1.8646181716164831e-07
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 3.915492957746479,
1247
+ "cpu_mem": 1.50122496,
1248
+ "gpu_mem": 4.486834176,
1249
+ "loss": 1.1989,
1250
+ "grad_norm": 2.8926453590393066,
1251
+ "learning_rate": 4.662269987756317e-08
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 3.943661971830986,
1256
+ "cpu_mem": 1.50122496,
1257
+ "gpu_mem": 4.486867968,
1258
+ "loss": 1.1475,
1259
+ "grad_norm": 2.6210014820098877,
1260
+ "learning_rate": 0.0
1261
+ },
1262
+ {
1263
+ "step": 140,
1264
+ "epoch": 3.943661971830986,
1265
+ "cpu_mem": 1.50122496,
1266
+ "gpu_mem": 4.486867968,
1267
+ "train_runtime": 684.2008,
1268
+ "train_samples_per_second": 13.16,
1269
+ "train_steps_per_second": 0.205,
1270
+ "total_flos": 0.0,
1271
+ "train_loss": 1.3794820129871368
1272
+ }
1273
+ ]
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 2,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "boolq",
3
+ "results": 0.745565749235474
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "BOOLQ",
5
+ "dataset_id": "google/boolq",
6
+ "preprocess_id": "boolq_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1182720
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 2,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-boolq-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T09:05:54.792561"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r2-a2/training_logs.json ADDED
@@ -0,0 +1,2659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.006779661016949152,
5
+ "cpu_mem": 1.497100288,
6
+ "gpu_mem": 4.425657344,
7
+ "loss": 8.869,
8
+ "grad_norm": 207.28607177734375,
9
+ "learning_rate": 9.999999999999999e-06
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.013559322033898305,
14
+ "cpu_mem": 1.503391744,
15
+ "gpu_mem": 4.435256832,
16
+ "loss": 8.9376,
17
+ "grad_norm": 212.0312957763672,
18
+ "learning_rate": 1.9999999999999998e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.020338983050847456,
23
+ "cpu_mem": 1.504178176,
24
+ "gpu_mem": 4.435175424,
25
+ "loss": 8.332,
26
+ "grad_norm": 210.19793701171875,
27
+ "learning_rate": 2.9999999999999997e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.02711864406779661,
32
+ "cpu_mem": 1.504768,
33
+ "gpu_mem": 4.435175424,
34
+ "loss": 7.0801,
35
+ "grad_norm": 211.4102783203125,
36
+ "learning_rate": 3.9999999999999996e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.03389830508474576,
41
+ "cpu_mem": 1.505161216,
42
+ "gpu_mem": 4.435110912,
43
+ "loss": 5.4329,
44
+ "grad_norm": 188.5390167236328,
45
+ "learning_rate": 4.9999999999999996e-05
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.04067796610169491,
50
+ "cpu_mem": 1.50575104,
51
+ "gpu_mem": 4.43513088,
52
+ "loss": 3.7775,
53
+ "grad_norm": 175.3853759765625,
54
+ "learning_rate": 5.9999999999999995e-05
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.04745762711864407,
59
+ "cpu_mem": 1.506340864,
60
+ "gpu_mem": 4.435183104,
61
+ "loss": 2.0568,
62
+ "grad_norm": 97.72562408447266,
63
+ "learning_rate": 7e-05
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.05423728813559322,
68
+ "cpu_mem": 1.506930688,
69
+ "gpu_mem": 4.43526912,
70
+ "loss": 1.3082,
71
+ "grad_norm": 58.757537841796875,
72
+ "learning_rate": 7.999999999999999e-05
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.061016949152542375,
77
+ "cpu_mem": 1.507323904,
78
+ "gpu_mem": 4.43517696,
79
+ "loss": 1.0698,
80
+ "grad_norm": 30.670557022094727,
81
+ "learning_rate": 8.999999999999999e-05
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.06779661016949153,
86
+ "cpu_mem": 1.50771712,
87
+ "gpu_mem": 4.43507712,
88
+ "loss": 0.8257,
89
+ "grad_norm": 19.861032485961914,
90
+ "learning_rate": 9.999999999999999e-05
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.07457627118644068,
95
+ "cpu_mem": 1.508110336,
96
+ "gpu_mem": 4.435181568,
97
+ "loss": 0.7766,
98
+ "grad_norm": 37.903358459472656,
99
+ "learning_rate": 0.00010999999999999998
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.08135593220338982,
104
+ "cpu_mem": 1.50870016,
105
+ "gpu_mem": 4.43555328,
106
+ "loss": 0.7157,
107
+ "grad_norm": 27.937366485595703,
108
+ "learning_rate": 0.00011999999999999999
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.08813559322033898,
113
+ "cpu_mem": 1.509093376,
114
+ "gpu_mem": 4.435156992,
115
+ "loss": 0.8872,
116
+ "grad_norm": 60.55340576171875,
117
+ "learning_rate": 0.00013
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.09491525423728814,
122
+ "cpu_mem": 1.509486592,
123
+ "gpu_mem": 4.435133952,
124
+ "loss": 0.8207,
125
+ "grad_norm": 46.76423263549805,
126
+ "learning_rate": 0.00014
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.1016949152542373,
131
+ "cpu_mem": 1.509879808,
132
+ "gpu_mem": 4.435072512,
133
+ "loss": 0.7102,
134
+ "grad_norm": 11.082263946533203,
135
+ "learning_rate": 0.00015
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.10847457627118644,
140
+ "cpu_mem": 1.510273024,
141
+ "gpu_mem": 4.435156992,
142
+ "loss": 1.083,
143
+ "grad_norm": 63.073631286621094,
144
+ "learning_rate": 0.00015999999999999999
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.1152542372881356,
149
+ "cpu_mem": 1.51066624,
150
+ "gpu_mem": 4.435196928,
151
+ "loss": 0.8847,
152
+ "grad_norm": 41.545772552490234,
153
+ "learning_rate": 0.00016999999999999999
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.12203389830508475,
158
+ "cpu_mem": 1.510862848,
159
+ "gpu_mem": 4.435259904,
160
+ "loss": 0.6895,
161
+ "grad_norm": 7.644906997680664,
162
+ "learning_rate": 0.00017999999999999998
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.1288135593220339,
167
+ "cpu_mem": 1.511256064,
168
+ "gpu_mem": 4.435097088,
169
+ "loss": 1.0504,
170
+ "grad_norm": 38.996315002441406,
171
+ "learning_rate": 0.00018999999999999998
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.13559322033898305,
176
+ "cpu_mem": 1.51164928,
177
+ "gpu_mem": 4.435209216,
178
+ "loss": 0.7336,
179
+ "grad_norm": 17.267147064208984,
180
+ "learning_rate": 0.00019999999999999998
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.1423728813559322,
185
+ "cpu_mem": 1.511845888,
186
+ "gpu_mem": 4.435367424,
187
+ "loss": 0.6753,
188
+ "grad_norm": 6.746187210083008,
189
+ "learning_rate": 0.00020999999999999998
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.14915254237288136,
194
+ "cpu_mem": 1.512239104,
195
+ "gpu_mem": 4.435259904,
196
+ "loss": 0.7545,
197
+ "grad_norm": 8.339751243591309,
198
+ "learning_rate": 0.00021999999999999995
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.15593220338983052,
203
+ "cpu_mem": 1.512435712,
204
+ "gpu_mem": 4.435232256,
205
+ "loss": 0.7935,
206
+ "grad_norm": 18.06570053100586,
207
+ "learning_rate": 0.00023
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.16271186440677965,
212
+ "cpu_mem": 1.512828928,
213
+ "gpu_mem": 4.435289088,
214
+ "loss": 0.6579,
215
+ "grad_norm": 11.259378433227539,
216
+ "learning_rate": 0.00023999999999999998
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.1694915254237288,
221
+ "cpu_mem": 1.513025536,
222
+ "gpu_mem": 4.435074048,
223
+ "loss": 0.8052,
224
+ "grad_norm": 18.03961181640625,
225
+ "learning_rate": 0.00025
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.17627118644067796,
230
+ "cpu_mem": 1.513418752,
231
+ "gpu_mem": 4.435129344,
232
+ "loss": 0.8655,
233
+ "grad_norm": 23.352096557617188,
234
+ "learning_rate": 0.00026
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.18305084745762712,
239
+ "cpu_mem": 1.51361536,
240
+ "gpu_mem": 4.435421184,
241
+ "loss": 0.718,
242
+ "grad_norm": 33.54894256591797,
243
+ "learning_rate": 0.00027
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.18983050847457628,
248
+ "cpu_mem": 1.513811968,
249
+ "gpu_mem": 4.43510016,
250
+ "loss": 0.7011,
251
+ "grad_norm": 5.752996444702148,
252
+ "learning_rate": 0.00028
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.19661016949152543,
257
+ "cpu_mem": 1.514205184,
258
+ "gpu_mem": 4.435164672,
259
+ "loss": 0.7024,
260
+ "grad_norm": 24.037891387939453,
261
+ "learning_rate": 0.00029
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.2033898305084746,
266
+ "cpu_mem": 1.514401792,
267
+ "gpu_mem": 4.435243008,
268
+ "loss": 0.9994,
269
+ "grad_norm": 46.48765563964844,
270
+ "learning_rate": 0.0003
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.21016949152542372,
275
+ "cpu_mem": 1.5145984,
276
+ "gpu_mem": 4.4350464,
277
+ "loss": 0.6251,
278
+ "grad_norm": 3.328892946243286,
279
+ "learning_rate": 0.0002999893794250036
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.21694915254237288,
284
+ "cpu_mem": 1.514795008,
285
+ "gpu_mem": 4.435160064,
286
+ "loss": 0.6636,
287
+ "grad_norm": 2.738682746887207,
288
+ "learning_rate": 0.00029995751920396937
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.22372881355932203,
293
+ "cpu_mem": 1.514991616,
294
+ "gpu_mem": 4.435398144,
295
+ "loss": 0.7977,
296
+ "grad_norm": 45.13636779785156,
297
+ "learning_rate": 0.00029990442384854874
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.2305084745762712,
302
+ "cpu_mem": 1.515188224,
303
+ "gpu_mem": 4.43510016,
304
+ "loss": 0.5935,
305
+ "grad_norm": 10.75675106048584,
306
+ "learning_rate": 0.0002998301008774512
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.23728813559322035,
311
+ "cpu_mem": 1.51558144,
312
+ "gpu_mem": 4.435310592,
313
+ "loss": 1.3179,
314
+ "grad_norm": 55.147945404052734,
315
+ "learning_rate": 0.0002997345608153792
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 0.2440677966101695,
320
+ "cpu_mem": 1.51558144,
321
+ "gpu_mem": 4.43526144,
322
+ "loss": 0.8981,
323
+ "grad_norm": 45.676902770996094,
324
+ "learning_rate": 0.000299617817191538
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 0.25084745762711863,
329
+ "cpu_mem": 1.515778048,
330
+ "gpu_mem": 4.435072512,
331
+ "loss": 0.6745,
332
+ "grad_norm": 13.375166893005371,
333
+ "learning_rate": 0.0002994798865377198
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 0.2576271186440678,
338
+ "cpu_mem": 1.516171264,
339
+ "gpu_mem": 4.435319808,
340
+ "loss": 0.7386,
341
+ "grad_norm": 9.287933349609375,
342
+ "learning_rate": 0.0002993207883859627
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 0.26440677966101694,
347
+ "cpu_mem": 1.516367872,
348
+ "gpu_mem": 4.4356992,
349
+ "loss": 0.6679,
350
+ "grad_norm": 2.182807445526123,
351
+ "learning_rate": 0.0002991405452657846
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 0.2711864406779661,
356
+ "cpu_mem": 1.51656448,
357
+ "gpu_mem": 4.43526912,
358
+ "loss": 0.664,
359
+ "grad_norm": 4.061365127563477,
360
+ "learning_rate": 0.00029893918270099324
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 0.27796610169491526,
365
+ "cpu_mem": 1.516761088,
366
+ "gpu_mem": 4.435496448,
367
+ "loss": 0.6778,
368
+ "grad_norm": 4.150041580200195,
369
+ "learning_rate": 0.00029871672920607153
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 0.2847457627118644,
374
+ "cpu_mem": 1.516957696,
375
+ "gpu_mem": 4.435393536,
376
+ "loss": 0.6646,
377
+ "grad_norm": 1.6074423789978027,
378
+ "learning_rate": 0.0002984732162821399
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 0.29152542372881357,
383
+ "cpu_mem": 1.516957696,
384
+ "gpu_mem": 4.43521536,
385
+ "loss": 0.6099,
386
+ "grad_norm": 3.170870780944824,
387
+ "learning_rate": 0.0002982086784124952
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 0.2983050847457627,
392
+ "cpu_mem": 1.517154304,
393
+ "gpu_mem": 4.435358208,
394
+ "loss": 0.6507,
395
+ "grad_norm": 9.807236671447754,
396
+ "learning_rate": 0.00029792315305772796
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 0.3050847457627119,
401
+ "cpu_mem": 1.517154304,
402
+ "gpu_mem": 4.43513856,
403
+ "loss": 0.9994,
404
+ "grad_norm": 25.055511474609375,
405
+ "learning_rate": 0.0002976166806504174
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 0.31186440677966104,
410
+ "cpu_mem": 1.517350912,
411
+ "gpu_mem": 4.435381248,
412
+ "loss": 0.7107,
413
+ "grad_norm": 7.407582759857178,
414
+ "learning_rate": 0.00029728930458940595
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 0.31864406779661014,
419
+ "cpu_mem": 1.51754752,
420
+ "gpu_mem": 4.435104768,
421
+ "loss": 0.6873,
422
+ "grad_norm": 13.582527160644531,
423
+ "learning_rate": 0.00029694107123365385
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 0.3254237288135593,
428
+ "cpu_mem": 1.51754752,
429
+ "gpu_mem": 4.435181568,
430
+ "loss": 0.8356,
431
+ "grad_norm": 24.060340881347656,
432
+ "learning_rate": 0.00029657202989567393
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 0.33220338983050846,
437
+ "cpu_mem": 1.517744128,
438
+ "gpu_mem": 4.435198464,
439
+ "loss": 0.7423,
440
+ "grad_norm": 3.9435641765594482,
441
+ "learning_rate": 0.00029618223283454893
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 0.3389830508474576,
446
+ "cpu_mem": 1.517940736,
447
+ "gpu_mem": 4.435137024,
448
+ "loss": 0.6187,
449
+ "grad_norm": 2.644559860229492,
450
+ "learning_rate": 0.00029577173524853123
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 0.34576271186440677,
455
+ "cpu_mem": 1.517940736,
456
+ "gpu_mem": 4.435141632,
457
+ "loss": 0.5859,
458
+ "grad_norm": 2.845573663711548,
459
+ "learning_rate": 0.0002953405952672261
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 0.3525423728813559,
464
+ "cpu_mem": 1.518137344,
465
+ "gpu_mem": 4.435221504,
466
+ "loss": 0.7151,
467
+ "grad_norm": 9.970312118530273,
468
+ "learning_rate": 0.0002948888739433602
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 0.3593220338983051,
473
+ "cpu_mem": 1.518333952,
474
+ "gpu_mem": 4.435244544,
475
+ "loss": 0.5678,
476
+ "grad_norm": 5.6443681716918945,
477
+ "learning_rate": 0.0002944166352441363
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 0.36610169491525424,
482
+ "cpu_mem": 1.518333952,
483
+ "gpu_mem": 4.435172352,
484
+ "loss": 0.7156,
485
+ "grad_norm": 6.423981189727783,
486
+ "learning_rate": 0.0002939239460421746
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 0.3728813559322034,
491
+ "cpu_mem": 1.51853056,
492
+ "gpu_mem": 4.435442688,
493
+ "loss": 0.6943,
494
+ "grad_norm": 8.590938568115234,
495
+ "learning_rate": 0.00029341087610604337
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 0.37966101694915255,
500
+ "cpu_mem": 1.51853056,
501
+ "gpu_mem": 4.435229184,
502
+ "loss": 0.6623,
503
+ "grad_norm": 2.5790529251098633,
504
+ "learning_rate": 0.00029287749809037904
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 0.3864406779661017,
509
+ "cpu_mem": 1.518727168,
510
+ "gpu_mem": 4.43522304,
511
+ "loss": 0.6916,
512
+ "grad_norm": 7.401466369628906,
513
+ "learning_rate": 0.0002923238875255979
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 0.39322033898305087,
518
+ "cpu_mem": 1.518923776,
519
+ "gpu_mem": 4.435118592,
520
+ "loss": 0.6048,
521
+ "grad_norm": 2.3391366004943848,
522
+ "learning_rate": 0.00029175012280720024
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 0.4,
527
+ "cpu_mem": 1.518923776,
528
+ "gpu_mem": 4.435135488,
529
+ "loss": 0.808,
530
+ "grad_norm": 12.574873924255371,
531
+ "learning_rate": 0.000291156285184669
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 0.4067796610169492,
536
+ "cpu_mem": 1.519120384,
537
+ "gpu_mem": 4.435229184,
538
+ "loss": 0.6018,
539
+ "grad_norm": 4.364191055297852,
540
+ "learning_rate": 0.00029054245874996426
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 0.4135593220338983,
545
+ "cpu_mem": 1.519120384,
546
+ "gpu_mem": 4.435239936,
547
+ "loss": 0.5963,
548
+ "grad_norm": 2.257843255996704,
549
+ "learning_rate": 0.0002899087304256151
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 0.42033898305084744,
554
+ "cpu_mem": 1.519316992,
555
+ "gpu_mem": 4.435227648,
556
+ "loss": 0.7185,
557
+ "grad_norm": 5.300965309143066,
558
+ "learning_rate": 0.0002892551899524109
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 0.4271186440677966,
563
+ "cpu_mem": 1.5195136,
564
+ "gpu_mem": 4.435219968,
565
+ "loss": 0.7026,
566
+ "grad_norm": 18.619279861450195,
567
+ "learning_rate": 0.000288581929876693
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 0.43389830508474575,
572
+ "cpu_mem": 1.5195136,
573
+ "gpu_mem": 4.435149312,
574
+ "loss": 0.7096,
575
+ "grad_norm": 15.257659912109375,
576
+ "learning_rate": 0.0002878890455372498
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 0.4406779661016949,
581
+ "cpu_mem": 1.519710208,
582
+ "gpu_mem": 4.435193856,
583
+ "loss": 0.6736,
584
+ "grad_norm": 3.462217330932617,
585
+ "learning_rate": 0.0002871766350518159
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 0.44745762711864406,
590
+ "cpu_mem": 1.519710208,
591
+ "gpu_mem": 4.435387392,
592
+ "loss": 0.6095,
593
+ "grad_norm": 5.371577262878418,
594
+ "learning_rate": 0.00028644479930317775
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 0.4542372881355932,
599
+ "cpu_mem": 1.519710208,
600
+ "gpu_mem": 4.435097088,
601
+ "loss": 0.6853,
602
+ "grad_norm": 7.674093246459961,
603
+ "learning_rate": 0.00028569364192488803
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 0.4610169491525424,
608
+ "cpu_mem": 1.519710208,
609
+ "gpu_mem": 4.435064832,
610
+ "loss": 0.7368,
611
+ "grad_norm": 8.521327018737793,
612
+ "learning_rate": 0.00028492326928659045
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 0.46779661016949153,
617
+ "cpu_mem": 1.519906816,
618
+ "gpu_mem": 4.43513088,
619
+ "loss": 0.6601,
620
+ "grad_norm": 9.785889625549316,
621
+ "learning_rate": 0.00028413379047895665
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 0.4745762711864407,
626
+ "cpu_mem": 1.519906816,
627
+ "gpu_mem": 4.435124736,
628
+ "loss": 0.6927,
629
+ "grad_norm": 10.841557502746582,
630
+ "learning_rate": 0.0002833253172982385
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 0.48135593220338985,
635
+ "cpu_mem": 1.519906816,
636
+ "gpu_mem": 4.4353536,
637
+ "loss": 0.6767,
638
+ "grad_norm": 10.955390930175781,
639
+ "learning_rate": 0.0002824979642304366
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 0.488135593220339,
644
+ "cpu_mem": 1.519906816,
645
+ "gpu_mem": 4.43534592,
646
+ "loss": 0.6142,
647
+ "grad_norm": 2.390930414199829,
648
+ "learning_rate": 0.0002816518484350883
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 0.49491525423728816,
653
+ "cpu_mem": 1.520103424,
654
+ "gpu_mem": 4.435312128,
655
+ "loss": 0.7588,
656
+ "grad_norm": 11.52270793914795,
657
+ "learning_rate": 0.0002807870897286772
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 0.5016949152542373,
662
+ "cpu_mem": 1.520103424,
663
+ "gpu_mem": 4.435172352,
664
+ "loss": 0.6045,
665
+ "grad_norm": 3.1932780742645264,
666
+ "learning_rate": 0.0002799038105676658
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 0.5084745762711864,
671
+ "cpu_mem": 1.520103424,
672
+ "gpu_mem": 4.435097088,
673
+ "loss": 0.589,
674
+ "grad_norm": 7.821681022644043,
675
+ "learning_rate": 0.000279002136031155
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 0.5152542372881356,
680
+ "cpu_mem": 1.520103424,
681
+ "gpu_mem": 4.435037184,
682
+ "loss": 0.6281,
683
+ "grad_norm": 4.9963226318359375,
684
+ "learning_rate": 0.00027808219380317216
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 0.5220338983050847,
689
+ "cpu_mem": 1.520103424,
690
+ "gpu_mem": 4.435110912,
691
+ "loss": 0.5632,
692
+ "grad_norm": 2.8097784519195557,
693
+ "learning_rate": 0.0002771441141545895
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 0.5288135593220339,
698
+ "cpu_mem": 1.520103424,
699
+ "gpu_mem": 4.435163136,
700
+ "loss": 0.9578,
701
+ "grad_norm": 19.26412582397461,
702
+ "learning_rate": 0.0002761880299246772
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 0.535593220338983,
707
+ "cpu_mem": 1.520300032,
708
+ "gpu_mem": 4.435295232,
709
+ "loss": 0.7552,
710
+ "grad_norm": 11.019858360290527,
711
+ "learning_rate": 0.000275214076502292
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 0.5423728813559322,
716
+ "cpu_mem": 1.520300032,
717
+ "gpu_mem": 4.435186176,
718
+ "loss": 0.6377,
719
+ "grad_norm": 6.720715045928955,
720
+ "learning_rate": 0.0002742223918067056
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 0.5491525423728814,
725
+ "cpu_mem": 1.52049664,
726
+ "gpu_mem": 4.435066368,
727
+ "loss": 0.5784,
728
+ "grad_norm": 3.100093364715576,
729
+ "learning_rate": 0.00027321311626807374
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 0.5559322033898305,
734
+ "cpu_mem": 1.52049664,
735
+ "gpu_mem": 4.435135488,
736
+ "loss": 0.6546,
737
+ "grad_norm": 7.847732067108154,
738
+ "learning_rate": 0.0002721863928075503
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 0.5627118644067797,
743
+ "cpu_mem": 1.52049664,
744
+ "gpu_mem": 4.435235328,
745
+ "loss": 0.6415,
746
+ "grad_norm": 7.157805442810059,
747
+ "learning_rate": 0.000271142366817049
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 0.5694915254237288,
752
+ "cpu_mem": 1.52049664,
753
+ "gpu_mem": 4.435198464,
754
+ "loss": 0.6897,
755
+ "grad_norm": 10.05351734161377,
756
+ "learning_rate": 0.00027008118613865406
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 0.576271186440678,
761
+ "cpu_mem": 1.52049664,
762
+ "gpu_mem": 4.43523072,
763
+ "loss": 0.5937,
764
+ "grad_norm": 3.456000804901123,
765
+ "learning_rate": 0.00026900300104368524
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 0.5830508474576271,
770
+ "cpu_mem": 1.52049664,
771
+ "gpu_mem": 4.435181568,
772
+ "loss": 0.6938,
773
+ "grad_norm": 6.894677639007568,
774
+ "learning_rate": 0.00026790796421141813
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 0.5898305084745763,
779
+ "cpu_mem": 1.52049664,
780
+ "gpu_mem": 4.435189248,
781
+ "loss": 0.6673,
782
+ "grad_norm": 9.324849128723145,
783
+ "learning_rate": 0.00026679623070746325
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 0.5966101694915255,
788
+ "cpu_mem": 1.520693248,
789
+ "gpu_mem": 4.435333632,
790
+ "loss": 0.5659,
791
+ "grad_norm": 5.584108352661133,
792
+ "learning_rate": 0.0002656679579618081
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 0.6033898305084746,
797
+ "cpu_mem": 1.520693248,
798
+ "gpu_mem": 4.43511552,
799
+ "loss": 0.6689,
800
+ "grad_norm": 5.593686580657959,
801
+ "learning_rate": 0.0002645233057465235
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 0.6101694915254238,
806
+ "cpu_mem": 1.520693248,
807
+ "gpu_mem": 4.43516928,
808
+ "loss": 0.6263,
809
+ "grad_norm": 5.61871862411499,
810
+ "learning_rate": 0.00026336243615313873
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 0.6169491525423729,
815
+ "cpu_mem": 1.520693248,
816
+ "gpu_mem": 4.435137024,
817
+ "loss": 0.6892,
818
+ "grad_norm": 10.633136749267578,
819
+ "learning_rate": 0.00026218551356968814
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 0.6237288135593221,
824
+ "cpu_mem": 1.520693248,
825
+ "gpu_mem": 4.435218432,
826
+ "loss": 0.6887,
827
+ "grad_norm": 3.8378069400787354,
828
+ "learning_rate": 0.00026099270465743254
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 0.6305084745762712,
833
+ "cpu_mem": 1.520693248,
834
+ "gpu_mem": 4.435021824,
835
+ "loss": 0.7163,
836
+ "grad_norm": 3.294466018676758,
837
+ "learning_rate": 0.0002597841783272588
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 0.6372881355932203,
842
+ "cpu_mem": 1.520693248,
843
+ "gpu_mem": 4.435135488,
844
+ "loss": 0.5549,
845
+ "grad_norm": 4.280175685882568,
846
+ "learning_rate": 0.0002585601057157605
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 0.6440677966101694,
851
+ "cpu_mem": 1.520693248,
852
+ "gpu_mem": 4.435155456,
853
+ "loss": 0.6492,
854
+ "grad_norm": 5.376728057861328,
855
+ "learning_rate": 0.00025732066016100394
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 0.6508474576271186,
860
+ "cpu_mem": 1.520693248,
861
+ "gpu_mem": 4.435193856,
862
+ "loss": 0.6214,
863
+ "grad_norm": 5.850833415985107,
864
+ "learning_rate": 0.00025606601717798207
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 0.6576271186440678,
869
+ "cpu_mem": 1.520889856,
870
+ "gpu_mem": 4.435178496,
871
+ "loss": 0.6908,
872
+ "grad_norm": 6.41079568862915,
873
+ "learning_rate": 0.0002547963544337602
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 0.6644067796610169,
878
+ "cpu_mem": 1.520889856,
879
+ "gpu_mem": 4.435090944,
880
+ "loss": 0.6782,
881
+ "grad_norm": 3.8216965198516846,
882
+ "learning_rate": 0.0002535118517223168
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 0.6711864406779661,
887
+ "cpu_mem": 1.520889856,
888
+ "gpu_mem": 4.435040256,
889
+ "loss": 0.6113,
890
+ "grad_norm": 2.445533037185669,
891
+ "learning_rate": 0.00025221269093908365
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 0.6779661016949152,
896
+ "cpu_mem": 1.520889856,
897
+ "gpu_mem": 4.435156992,
898
+ "loss": 0.6634,
899
+ "grad_norm": 7.595210552215576,
900
+ "learning_rate": 0.0002508990560551879
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 0.6847457627118644,
905
+ "cpu_mem": 1.520889856,
906
+ "gpu_mem": 4.435189248,
907
+ "loss": 0.6559,
908
+ "grad_norm": 6.5138936042785645,
909
+ "learning_rate": 0.0002495711330914001
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 0.6915254237288135,
914
+ "cpu_mem": 1.520889856,
915
+ "gpu_mem": 4.43522304,
916
+ "loss": 0.635,
917
+ "grad_norm": 4.007444858551025,
918
+ "learning_rate": 0.00024822911009179276
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 0.6983050847457627,
923
+ "cpu_mem": 1.520889856,
924
+ "gpu_mem": 4.435273728,
925
+ "loss": 0.6088,
926
+ "grad_norm": 2.9485924243927,
927
+ "learning_rate": 0.0002468731770971113
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 0.7050847457627119,
932
+ "cpu_mem": 1.520889856,
933
+ "gpu_mem": 4.435178496,
934
+ "loss": 0.6598,
935
+ "grad_norm": 5.93380880355835,
936
+ "learning_rate": 0.0002455035261178632
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 0.711864406779661,
941
+ "cpu_mem": 1.520889856,
942
+ "gpu_mem": 4.435279872,
943
+ "loss": 0.657,
944
+ "grad_norm": 6.0811567306518555,
945
+ "learning_rate": 0.0002441203511071278
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 0.7186440677966102,
950
+ "cpu_mem": 1.520889856,
951
+ "gpu_mem": 4.43523072,
952
+ "loss": 0.5668,
953
+ "grad_norm": 3.6183319091796875,
954
+ "learning_rate": 0.00024272384793309077
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 0.7254237288135593,
959
+ "cpu_mem": 1.520889856,
960
+ "gpu_mem": 4.435118592,
961
+ "loss": 0.5207,
962
+ "grad_norm": 3.5147411823272705,
963
+ "learning_rate": 0.00024131421435130807
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 0.7322033898305085,
968
+ "cpu_mem": 1.520889856,
969
+ "gpu_mem": 4.435302912,
970
+ "loss": 0.603,
971
+ "grad_norm": 5.246246814727783,
972
+ "learning_rate": 0.00023989164997670202
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 0.7389830508474576,
977
+ "cpu_mem": 1.520889856,
978
+ "gpu_mem": 4.435156992,
979
+ "loss": 0.6161,
980
+ "grad_norm": 4.675862789154053,
981
+ "learning_rate": 0.0002384563562552943
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 0.7457627118644068,
986
+ "cpu_mem": 1.520889856,
987
+ "gpu_mem": 4.435160064,
988
+ "loss": 0.5996,
989
+ "grad_norm": 3.654783010482788,
990
+ "learning_rate": 0.0002370085364356797
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 0.752542372881356,
995
+ "cpu_mem": 1.520889856,
996
+ "gpu_mem": 4.435129344,
997
+ "loss": 0.5225,
998
+ "grad_norm": 3.8030014038085938,
999
+ "learning_rate": 0.0002355483955402446
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 0.7593220338983051,
1004
+ "cpu_mem": 1.520889856,
1005
+ "gpu_mem": 4.435175424,
1006
+ "loss": 0.5475,
1007
+ "grad_norm": 5.957515239715576,
1008
+ "learning_rate": 0.00023407614033613407
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 0.7661016949152543,
1013
+ "cpu_mem": 1.520889856,
1014
+ "gpu_mem": 4.435166208,
1015
+ "loss": 0.626,
1016
+ "grad_norm": 5.172255039215088,
1017
+ "learning_rate": 0.0002325919793059723
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 0.7728813559322034,
1022
+ "cpu_mem": 1.520889856,
1023
+ "gpu_mem": 4.435147776,
1024
+ "loss": 0.5508,
1025
+ "grad_norm": 4.3366289138793945,
1026
+ "learning_rate": 0.00023109612261833963
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 0.7796610169491526,
1031
+ "cpu_mem": 1.520889856,
1032
+ "gpu_mem": 4.43522304,
1033
+ "loss": 0.5571,
1034
+ "grad_norm": 4.991285800933838,
1035
+ "learning_rate": 0.0002295887820980112
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 0.7864406779661017,
1040
+ "cpu_mem": 1.520889856,
1041
+ "gpu_mem": 4.435143168,
1042
+ "loss": 0.5788,
1043
+ "grad_norm": 4.1896491050720215,
1044
+ "learning_rate": 0.0002280701711959608
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 0.7932203389830509,
1049
+ "cpu_mem": 1.520889856,
1050
+ "gpu_mem": 4.435034112,
1051
+ "loss": 0.5795,
1052
+ "grad_norm": 8.061439514160156,
1053
+ "learning_rate": 0.00022654050495913495
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 0.8,
1058
+ "cpu_mem": 1.520889856,
1059
+ "gpu_mem": 4.435272192,
1060
+ "loss": 0.5868,
1061
+ "grad_norm": 8.318702697753906,
1062
+ "learning_rate": 0.000225
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 0.8067796610169492,
1067
+ "cpu_mem": 1.520889856,
1068
+ "gpu_mem": 4.435442688,
1069
+ "loss": 0.5089,
1070
+ "grad_norm": 7.266751766204834,
1071
+ "learning_rate": 0.00022344887446586865
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 0.8135593220338984,
1076
+ "cpu_mem": 1.520889856,
1077
+ "gpu_mem": 4.435175424,
1078
+ "loss": 0.6088,
1079
+ "grad_norm": 9.481569290161133,
1080
+ "learning_rate": 0.00022188734800800852
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 0.8203389830508474,
1085
+ "cpu_mem": 1.520889856,
1086
+ "gpu_mem": 4.435203072,
1087
+ "loss": 0.5676,
1088
+ "grad_norm": 6.377309322357178,
1089
+ "learning_rate": 0.00022031564175053754
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 0.8271186440677966,
1094
+ "cpu_mem": 1.520889856,
1095
+ "gpu_mem": 4.43525376,
1096
+ "loss": 0.534,
1097
+ "grad_norm": 6.072964191436768,
1098
+ "learning_rate": 0.00021873397825911153
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 0.8338983050847457,
1103
+ "cpu_mem": 1.520889856,
1104
+ "gpu_mem": 4.435063296,
1105
+ "loss": 0.6156,
1106
+ "grad_norm": 11.986638069152832,
1107
+ "learning_rate": 0.00021714258150940685
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 0.8406779661016949,
1112
+ "cpu_mem": 1.521086464,
1113
+ "gpu_mem": 4.435505664,
1114
+ "loss": 0.5422,
1115
+ "grad_norm": 7.253986835479736,
1116
+ "learning_rate": 0.0002155416768554039
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 0.847457627118644,
1121
+ "cpu_mem": 1.521086464,
1122
+ "gpu_mem": 4.435232256,
1123
+ "loss": 0.5348,
1124
+ "grad_norm": 5.5946431159973145,
1125
+ "learning_rate": 0.00021393149099747523
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 0.8542372881355932,
1130
+ "cpu_mem": 1.521086464,
1131
+ "gpu_mem": 4.43511552,
1132
+ "loss": 0.5548,
1133
+ "grad_norm": 6.215771198272705,
1134
+ "learning_rate": 0.00021231225195028297
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 0.8610169491525423,
1139
+ "cpu_mem": 1.521086464,
1140
+ "gpu_mem": 4.435554816,
1141
+ "loss": 0.5796,
1142
+ "grad_norm": 6.4818949699401855,
1143
+ "learning_rate": 0.00021068418901049025
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 0.8677966101694915,
1148
+ "cpu_mem": 1.521086464,
1149
+ "gpu_mem": 4.43533056,
1150
+ "loss": 0.5044,
1151
+ "grad_norm": 5.554439067840576,
1152
+ "learning_rate": 0.0002090475327242912
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 0.8745762711864407,
1157
+ "cpu_mem": 1.521086464,
1158
+ "gpu_mem": 4.435370496,
1159
+ "loss": 0.6452,
1160
+ "grad_norm": 7.500157356262207,
1161
+ "learning_rate": 0.00020740251485476345
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 0.8813559322033898,
1166
+ "cpu_mem": 1.521086464,
1167
+ "gpu_mem": 4.435152384,
1168
+ "loss": 0.6325,
1169
+ "grad_norm": 4.366447925567627,
1170
+ "learning_rate": 0.0002057493683490491
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 0.888135593220339,
1175
+ "cpu_mem": 1.521086464,
1176
+ "gpu_mem": 4.435281408,
1177
+ "loss": 0.5926,
1178
+ "grad_norm": 4.608772277832031,
1179
+ "learning_rate": 0.00020408832730536746
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 0.8949152542372881,
1184
+ "cpu_mem": 1.521086464,
1185
+ "gpu_mem": 4.435362816,
1186
+ "loss": 0.5173,
1187
+ "grad_norm": 6.282446384429932,
1188
+ "learning_rate": 0.00020241962693986476
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 0.9016949152542373,
1193
+ "cpu_mem": 1.521086464,
1194
+ "gpu_mem": 4.43514624,
1195
+ "loss": 0.5126,
1196
+ "grad_norm": 5.64429235458374,
1197
+ "learning_rate": 0.0002007435035533061
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 0.9084745762711864,
1202
+ "cpu_mem": 1.521086464,
1203
+ "gpu_mem": 4.435279872,
1204
+ "loss": 0.5368,
1205
+ "grad_norm": 7.870690822601318,
1206
+ "learning_rate": 0.00019906019449761325
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 0.9152542372881356,
1211
+ "cpu_mem": 1.521086464,
1212
+ "gpu_mem": 4.435302912,
1213
+ "loss": 0.563,
1214
+ "grad_norm": 7.819180011749268,
1215
+ "learning_rate": 0.00019736993814225374
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 0.9220338983050848,
1220
+ "cpu_mem": 1.521086464,
1221
+ "gpu_mem": 4.435140096,
1222
+ "loss": 0.5316,
1223
+ "grad_norm": 7.46820592880249,
1224
+ "learning_rate": 0.00019567297384048604
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 0.9288135593220339,
1229
+ "cpu_mem": 1.521086464,
1230
+ "gpu_mem": 4.435020288,
1231
+ "loss": 0.6543,
1232
+ "grad_norm": 12.327062606811523,
1233
+ "learning_rate": 0.0001939695418954653
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 0.9355932203389831,
1238
+ "cpu_mem": 1.521086464,
1239
+ "gpu_mem": 4.435201536,
1240
+ "loss": 0.5471,
1241
+ "grad_norm": 7.879848957061768,
1242
+ "learning_rate": 0.00019225988352621445
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 0.9423728813559322,
1247
+ "cpu_mem": 1.521086464,
1248
+ "gpu_mem": 4.43510016,
1249
+ "loss": 0.5362,
1250
+ "grad_norm": 4.619059085845947,
1251
+ "learning_rate": 0.00019054424083346592
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 0.9491525423728814,
1256
+ "cpu_mem": 1.521086464,
1257
+ "gpu_mem": 4.435152384,
1258
+ "loss": 0.5643,
1259
+ "grad_norm": 8.845608711242676,
1260
+ "learning_rate": 0.0001888228567653781
1261
+ },
1262
+ {
1263
+ "step": 141,
1264
+ "epoch": 0.9559322033898305,
1265
+ "cpu_mem": 1.521086464,
1266
+ "gpu_mem": 4.43518464,
1267
+ "loss": 0.6876,
1268
+ "grad_norm": 13.390810012817383,
1269
+ "learning_rate": 0.0001870959750831323
1270
+ },
1271
+ {
1272
+ "step": 142,
1273
+ "epoch": 0.9627118644067797,
1274
+ "cpu_mem": 1.521086464,
1275
+ "gpu_mem": 4.435324416,
1276
+ "loss": 0.5868,
1277
+ "grad_norm": 14.687874794006348,
1278
+ "learning_rate": 0.0001853638403264141
1279
+ },
1280
+ {
1281
+ "step": 143,
1282
+ "epoch": 0.9694915254237289,
1283
+ "cpu_mem": 1.521086464,
1284
+ "gpu_mem": 4.43530752,
1285
+ "loss": 0.6293,
1286
+ "grad_norm": 8.600774765014648,
1287
+ "learning_rate": 0.00018362669777878453
1288
+ },
1289
+ {
1290
+ "step": 144,
1291
+ "epoch": 0.976271186440678,
1292
+ "cpu_mem": 1.521086464,
1293
+ "gpu_mem": 4.43549952,
1294
+ "loss": 0.5202,
1295
+ "grad_norm": 4.7456254959106445,
1296
+ "learning_rate": 0.00018188479343294648
1297
+ },
1298
+ {
1299
+ "step": 145,
1300
+ "epoch": 0.9830508474576272,
1301
+ "cpu_mem": 1.521086464,
1302
+ "gpu_mem": 4.435210752,
1303
+ "loss": 0.4946,
1304
+ "grad_norm": 5.0703959465026855,
1305
+ "learning_rate": 0.0001801383739559098
1306
+ },
1307
+ {
1308
+ "step": 146,
1309
+ "epoch": 0.9898305084745763,
1310
+ "cpu_mem": 1.521086464,
1311
+ "gpu_mem": 4.43524608,
1312
+ "loss": 0.5895,
1313
+ "grad_norm": 9.480493545532227,
1314
+ "learning_rate": 0.0001783876866540615
1315
+ },
1316
+ {
1317
+ "step": 147,
1318
+ "epoch": 0.9966101694915255,
1319
+ "cpu_mem": 1.521086464,
1320
+ "gpu_mem": 4.435144704,
1321
+ "loss": 0.6059,
1322
+ "grad_norm": 10.175168991088867,
1323
+ "learning_rate": 0.00017663297943814552
1324
+ },
1325
+ {
1326
+ "step": 148,
1327
+ "epoch": 1.0033898305084745,
1328
+ "cpu_mem": 1.521086464,
1329
+ "gpu_mem": 4.440039936,
1330
+ "loss": 0.9197,
1331
+ "grad_norm": 15.65149211883545,
1332
+ "learning_rate": 0.0001748745007881561
1333
+ },
1334
+ {
1335
+ "step": 149,
1336
+ "epoch": 1.0101694915254238,
1337
+ "cpu_mem": 1.521086464,
1338
+ "gpu_mem": 4.439975424,
1339
+ "loss": 0.4799,
1340
+ "grad_norm": 6.842282295227051,
1341
+ "learning_rate": 0.00017311249971815185
1342
+ },
1343
+ {
1344
+ "step": 150,
1345
+ "epoch": 1.0169491525423728,
1346
+ "cpu_mem": 1.521086464,
1347
+ "gpu_mem": 4.439812608,
1348
+ "loss": 0.4679,
1349
+ "grad_norm": 4.733060359954834,
1350
+ "learning_rate": 0.00017134722574099276
1351
+ },
1352
+ {
1353
+ "step": 151,
1354
+ "epoch": 1.023728813559322,
1355
+ "cpu_mem": 1.521086464,
1356
+ "gpu_mem": 4.4398848,
1357
+ "loss": 0.5091,
1358
+ "grad_norm": 4.760961055755615,
1359
+ "learning_rate": 0.00016957892883300775
1360
+ },
1361
+ {
1362
+ "step": 152,
1363
+ "epoch": 1.0305084745762711,
1364
+ "cpu_mem": 1.521086464,
1365
+ "gpu_mem": 4.439920128,
1366
+ "loss": 0.4298,
1367
+ "grad_norm": 6.078965187072754,
1368
+ "learning_rate": 0.00016780785939859576
1369
+ },
1370
+ {
1371
+ "step": 153,
1372
+ "epoch": 1.0372881355932204,
1373
+ "cpu_mem": 1.521086464,
1374
+ "gpu_mem": 4.439944704,
1375
+ "loss": 0.5997,
1376
+ "grad_norm": 11.618675231933594,
1377
+ "learning_rate": 0.00016603426823476693
1378
+ },
1379
+ {
1380
+ "step": 154,
1381
+ "epoch": 1.0440677966101695,
1382
+ "cpu_mem": 1.521086464,
1383
+ "gpu_mem": 4.439906304,
1384
+ "loss": 0.4825,
1385
+ "grad_norm": 8.579179763793945,
1386
+ "learning_rate": 0.00016425840649562736
1387
+ },
1388
+ {
1389
+ "step": 155,
1390
+ "epoch": 1.0508474576271187,
1391
+ "cpu_mem": 1.521086464,
1392
+ "gpu_mem": 4.440127488,
1393
+ "loss": 0.4975,
1394
+ "grad_norm": 6.922858238220215,
1395
+ "learning_rate": 0.00016248052565681436
1396
+ },
1397
+ {
1398
+ "step": 156,
1399
+ "epoch": 1.0576271186440678,
1400
+ "cpu_mem": 1.521086464,
1401
+ "gpu_mem": 4.440035328,
1402
+ "loss": 0.463,
1403
+ "grad_norm": 6.798807621002197,
1404
+ "learning_rate": 0.00016070087747988482
1405
+ },
1406
+ {
1407
+ "step": 157,
1408
+ "epoch": 1.064406779661017,
1409
+ "cpu_mem": 1.521086464,
1410
+ "gpu_mem": 4.439941632,
1411
+ "loss": 0.4941,
1412
+ "grad_norm": 8.606608390808105,
1413
+ "learning_rate": 0.00015891971397666464
1414
+ },
1415
+ {
1416
+ "step": 158,
1417
+ "epoch": 1.071186440677966,
1418
+ "cpu_mem": 1.521086464,
1419
+ "gpu_mem": 4.439867904,
1420
+ "loss": 0.4964,
1421
+ "grad_norm": 8.071224212646484,
1422
+ "learning_rate": 0.00015713728737356137
1423
+ },
1424
+ {
1425
+ "step": 159,
1426
+ "epoch": 1.0779661016949154,
1427
+ "cpu_mem": 1.521086464,
1428
+ "gpu_mem": 4.440216576,
1429
+ "loss": 0.4681,
1430
+ "grad_norm": 8.937488555908203,
1431
+ "learning_rate": 0.00015535385007584706
1432
+ },
1433
+ {
1434
+ "step": 160,
1435
+ "epoch": 1.0847457627118644,
1436
+ "cpu_mem": 1.521086464,
1437
+ "gpu_mem": 4.439811072,
1438
+ "loss": 0.5065,
1439
+ "grad_norm": 10.181997299194336,
1440
+ "learning_rate": 0.0001535696546319161
1441
+ },
1442
+ {
1443
+ "step": 161,
1444
+ "epoch": 1.0915254237288137,
1445
+ "cpu_mem": 1.521086464,
1446
+ "gpu_mem": 4.439757312,
1447
+ "loss": 0.4166,
1448
+ "grad_norm": 8.312394142150879,
1449
+ "learning_rate": 0.00015178495369752213
1450
+ },
1451
+ {
1452
+ "step": 162,
1453
+ "epoch": 1.0983050847457627,
1454
+ "cpu_mem": 1.521086464,
1455
+ "gpu_mem": 4.440532992,
1456
+ "loss": 0.4284,
1457
+ "grad_norm": 6.323586463928223,
1458
+ "learning_rate": 0.00015
1459
+ },
1460
+ {
1461
+ "step": 163,
1462
+ "epoch": 1.1050847457627118,
1463
+ "cpu_mem": 1.521086464,
1464
+ "gpu_mem": 4.440009216,
1465
+ "loss": 0.4603,
1466
+ "grad_norm": 5.615723133087158,
1467
+ "learning_rate": 0.00014821504630247785
1468
+ },
1469
+ {
1470
+ "step": 164,
1471
+ "epoch": 1.111864406779661,
1472
+ "cpu_mem": 1.521086464,
1473
+ "gpu_mem": 4.439921664,
1474
+ "loss": 0.5386,
1475
+ "grad_norm": 8.030407905578613,
1476
+ "learning_rate": 0.00014643034536808387
1477
+ },
1478
+ {
1479
+ "step": 165,
1480
+ "epoch": 1.11864406779661,
1481
+ "cpu_mem": 1.521086464,
1482
+ "gpu_mem": 4.439870976,
1483
+ "loss": 0.398,
1484
+ "grad_norm": 6.317580223083496,
1485
+ "learning_rate": 0.00014464614992415294
1486
+ },
1487
+ {
1488
+ "step": 166,
1489
+ "epoch": 1.1254237288135593,
1490
+ "cpu_mem": 1.521086464,
1491
+ "gpu_mem": 4.439966208,
1492
+ "loss": 0.3961,
1493
+ "grad_norm": 6.63275146484375,
1494
+ "learning_rate": 0.00014286271262643866
1495
+ },
1496
+ {
1497
+ "step": 167,
1498
+ "epoch": 1.1322033898305084,
1499
+ "cpu_mem": 1.521086464,
1500
+ "gpu_mem": 4.439883264,
1501
+ "loss": 0.5081,
1502
+ "grad_norm": 9.496688842773438,
1503
+ "learning_rate": 0.00014108028602333536
1504
+ },
1505
+ {
1506
+ "step": 168,
1507
+ "epoch": 1.1389830508474577,
1508
+ "cpu_mem": 1.521086464,
1509
+ "gpu_mem": 4.439901696,
1510
+ "loss": 0.5905,
1511
+ "grad_norm": 8.804789543151855,
1512
+ "learning_rate": 0.00013929912252011516
1513
+ },
1514
+ {
1515
+ "step": 169,
1516
+ "epoch": 1.1457627118644067,
1517
+ "cpu_mem": 1.521086464,
1518
+ "gpu_mem": 4.439989248,
1519
+ "loss": 0.4639,
1520
+ "grad_norm": 11.636336326599121,
1521
+ "learning_rate": 0.00013751947434318564
1522
+ },
1523
+ {
1524
+ "step": 170,
1525
+ "epoch": 1.152542372881356,
1526
+ "cpu_mem": 1.521086464,
1527
+ "gpu_mem": 4.439874048,
1528
+ "loss": 0.4202,
1529
+ "grad_norm": 11.674962997436523,
1530
+ "learning_rate": 0.00013574159350437261
1531
+ },
1532
+ {
1533
+ "step": 171,
1534
+ "epoch": 1.159322033898305,
1535
+ "cpu_mem": 1.521086464,
1536
+ "gpu_mem": 4.439937024,
1537
+ "loss": 0.6057,
1538
+ "grad_norm": 12.196778297424316,
1539
+ "learning_rate": 0.0001339657317652331
1540
+ },
1541
+ {
1542
+ "step": 172,
1543
+ "epoch": 1.1661016949152543,
1544
+ "cpu_mem": 1.521086464,
1545
+ "gpu_mem": 4.439844864,
1546
+ "loss": 0.4275,
1547
+ "grad_norm": 7.398139953613281,
1548
+ "learning_rate": 0.00013219214060140424
1549
+ },
1550
+ {
1551
+ "step": 173,
1552
+ "epoch": 1.1728813559322033,
1553
+ "cpu_mem": 1.521086464,
1554
+ "gpu_mem": 4.440144384,
1555
+ "loss": 0.4748,
1556
+ "grad_norm": 7.616250514984131,
1557
+ "learning_rate": 0.00013042107116699228
1558
+ },
1559
+ {
1560
+ "step": 174,
1561
+ "epoch": 1.1796610169491526,
1562
+ "cpu_mem": 1.521086464,
1563
+ "gpu_mem": 4.439867904,
1564
+ "loss": 0.5088,
1565
+ "grad_norm": 9.336560249328613,
1566
+ "learning_rate": 0.00012865277425900724
1567
+ },
1568
+ {
1569
+ "step": 175,
1570
+ "epoch": 1.1864406779661016,
1571
+ "cpu_mem": 1.521086464,
1572
+ "gpu_mem": 4.439834112,
1573
+ "loss": 0.4445,
1574
+ "grad_norm": 6.723942279815674,
1575
+ "learning_rate": 0.00012688750028184818
1576
+ },
1577
+ {
1578
+ "step": 176,
1579
+ "epoch": 1.193220338983051,
1580
+ "cpu_mem": 1.521086464,
1581
+ "gpu_mem": 4.439972352,
1582
+ "loss": 0.4937,
1583
+ "grad_norm": 11.034165382385254,
1584
+ "learning_rate": 0.0001251254992118439
1585
+ },
1586
+ {
1587
+ "step": 177,
1588
+ "epoch": 1.2,
1589
+ "cpu_mem": 1.521086464,
1590
+ "gpu_mem": 4.440070656,
1591
+ "loss": 0.4964,
1592
+ "grad_norm": 8.4466552734375,
1593
+ "learning_rate": 0.00012336702056185453
1594
+ },
1595
+ {
1596
+ "step": 178,
1597
+ "epoch": 1.2067796610169492,
1598
+ "cpu_mem": 1.521086464,
1599
+ "gpu_mem": 4.439817216,
1600
+ "loss": 0.5088,
1601
+ "grad_norm": 7.810116767883301,
1602
+ "learning_rate": 0.00012161231334593851
1603
+ },
1604
+ {
1605
+ "step": 179,
1606
+ "epoch": 1.2135593220338983,
1607
+ "cpu_mem": 1.521086464,
1608
+ "gpu_mem": 4.439917056,
1609
+ "loss": 0.5458,
1610
+ "grad_norm": 12.21668529510498,
1611
+ "learning_rate": 0.00011986162604409015
1612
+ },
1613
+ {
1614
+ "step": 180,
1615
+ "epoch": 1.2203389830508475,
1616
+ "cpu_mem": 1.521086464,
1617
+ "gpu_mem": 4.439889408,
1618
+ "loss": 0.4019,
1619
+ "grad_norm": 6.613919734954834,
1620
+ "learning_rate": 0.00011811520656705348
1621
+ },
1622
+ {
1623
+ "step": 181,
1624
+ "epoch": 1.2271186440677966,
1625
+ "cpu_mem": 1.521086464,
1626
+ "gpu_mem": 4.439826432,
1627
+ "loss": 0.3733,
1628
+ "grad_norm": 6.443704128265381,
1629
+ "learning_rate": 0.00011637330222121543
1630
+ },
1631
+ {
1632
+ "step": 182,
1633
+ "epoch": 1.2338983050847459,
1634
+ "cpu_mem": 1.521086464,
1635
+ "gpu_mem": 4.440044544,
1636
+ "loss": 0.589,
1637
+ "grad_norm": 11.146683692932129,
1638
+ "learning_rate": 0.00011463615967358588
1639
+ },
1640
+ {
1641
+ "step": 183,
1642
+ "epoch": 1.240677966101695,
1643
+ "cpu_mem": 1.521086464,
1644
+ "gpu_mem": 4.439941632,
1645
+ "loss": 0.4674,
1646
+ "grad_norm": 6.876543998718262,
1647
+ "learning_rate": 0.00011290402491686766
1648
+ },
1649
+ {
1650
+ "step": 184,
1651
+ "epoch": 1.2474576271186442,
1652
+ "cpu_mem": 1.521086464,
1653
+ "gpu_mem": 4.439889408,
1654
+ "loss": 0.4037,
1655
+ "grad_norm": 6.874241352081299,
1656
+ "learning_rate": 0.00011117714323462186
1657
+ },
1658
+ {
1659
+ "step": 185,
1660
+ "epoch": 1.2542372881355932,
1661
+ "cpu_mem": 1.521086464,
1662
+ "gpu_mem": 4.439867904,
1663
+ "loss": 0.472,
1664
+ "grad_norm": 8.139419555664062,
1665
+ "learning_rate": 0.00010945575916653407
1666
+ },
1667
+ {
1668
+ "step": 186,
1669
+ "epoch": 1.2610169491525425,
1670
+ "cpu_mem": 1.521086464,
1671
+ "gpu_mem": 4.43987712,
1672
+ "loss": 0.3631,
1673
+ "grad_norm": 6.349623203277588,
1674
+ "learning_rate": 0.00010774011647378553
1675
+ },
1676
+ {
1677
+ "step": 187,
1678
+ "epoch": 1.2677966101694915,
1679
+ "cpu_mem": 1.521086464,
1680
+ "gpu_mem": 4.439809536,
1681
+ "loss": 0.599,
1682
+ "grad_norm": 10.569090843200684,
1683
+ "learning_rate": 0.00010603045810453468
1684
+ },
1685
+ {
1686
+ "step": 188,
1687
+ "epoch": 1.2745762711864406,
1688
+ "cpu_mem": 1.521086464,
1689
+ "gpu_mem": 4.439972352,
1690
+ "loss": 0.387,
1691
+ "grad_norm": 6.98309850692749,
1692
+ "learning_rate": 0.00010432702615951396
1693
+ },
1694
+ {
1695
+ "step": 189,
1696
+ "epoch": 1.2813559322033898,
1697
+ "cpu_mem": 1.521086464,
1698
+ "gpu_mem": 4.439841792,
1699
+ "loss": 0.5156,
1700
+ "grad_norm": 7.907104015350342,
1701
+ "learning_rate": 0.00010263006185774627
1702
+ },
1703
+ {
1704
+ "step": 190,
1705
+ "epoch": 1.288135593220339,
1706
+ "cpu_mem": 1.521086464,
1707
+ "gpu_mem": 4.4399616,
1708
+ "loss": 0.4805,
1709
+ "grad_norm": 8.252829551696777,
1710
+ "learning_rate": 0.00010093980550238675
1711
+ },
1712
+ {
1713
+ "step": 191,
1714
+ "epoch": 1.2949152542372881,
1715
+ "cpu_mem": 1.521086464,
1716
+ "gpu_mem": 4.439780352,
1717
+ "loss": 0.4315,
1718
+ "grad_norm": 8.242355346679688,
1719
+ "learning_rate": 9.925649644669391e-05
1720
+ },
1721
+ {
1722
+ "step": 192,
1723
+ "epoch": 1.3016949152542372,
1724
+ "cpu_mem": 1.521086464,
1725
+ "gpu_mem": 4.439912448,
1726
+ "loss": 0.4163,
1727
+ "grad_norm": 13.093923568725586,
1728
+ "learning_rate": 9.758037306013526e-05
1729
+ },
1730
+ {
1731
+ "step": 193,
1732
+ "epoch": 1.3084745762711865,
1733
+ "cpu_mem": 1.521086464,
1734
+ "gpu_mem": 4.439886336,
1735
+ "loss": 0.4201,
1736
+ "grad_norm": 6.36842679977417,
1737
+ "learning_rate": 9.591167269463255e-05
1738
+ },
1739
+ {
1740
+ "step": 194,
1741
+ "epoch": 1.3152542372881357,
1742
+ "cpu_mem": 1.521086464,
1743
+ "gpu_mem": 4.439852544,
1744
+ "loss": 0.4756,
1745
+ "grad_norm": 6.584967613220215,
1746
+ "learning_rate": 9.425063165095088e-05
1747
+ },
1748
+ {
1749
+ "step": 195,
1750
+ "epoch": 1.3220338983050848,
1751
+ "cpu_mem": 1.521086464,
1752
+ "gpu_mem": 4.439956992,
1753
+ "loss": 0.3519,
1754
+ "grad_norm": 6.934253215789795,
1755
+ "learning_rate": 9.259748514523653e-05
1756
+ },
1757
+ {
1758
+ "step": 196,
1759
+ "epoch": 1.3288135593220338,
1760
+ "cpu_mem": 1.521086464,
1761
+ "gpu_mem": 4.439952384,
1762
+ "loss": 0.3706,
1763
+ "grad_norm": 7.2520880699157715,
1764
+ "learning_rate": 9.095246727570879e-05
1765
+ },
1766
+ {
1767
+ "step": 197,
1768
+ "epoch": 1.335593220338983,
1769
+ "cpu_mem": 1.521086464,
1770
+ "gpu_mem": 4.439811072,
1771
+ "loss": 0.3915,
1772
+ "grad_norm": 10.085671424865723,
1773
+ "learning_rate": 8.931581098950973e-05
1774
+ },
1775
+ {
1776
+ "step": 198,
1777
+ "epoch": 1.3423728813559321,
1778
+ "cpu_mem": 1.521086464,
1779
+ "gpu_mem": 4.440003072,
1780
+ "loss": 0.4238,
1781
+ "grad_norm": 6.6942572593688965,
1782
+ "learning_rate": 8.768774804971705e-05
1783
+ },
1784
+ {
1785
+ "step": 199,
1786
+ "epoch": 1.3491525423728814,
1787
+ "cpu_mem": 1.521086464,
1788
+ "gpu_mem": 4.43985408,
1789
+ "loss": 0.4759,
1790
+ "grad_norm": 10.992294311523438,
1791
+ "learning_rate": 8.606850900252478e-05
1792
+ },
1793
+ {
1794
+ "step": 200,
1795
+ "epoch": 1.3559322033898304,
1796
+ "cpu_mem": 1.521086464,
1797
+ "gpu_mem": 4.439956992,
1798
+ "loss": 0.4289,
1799
+ "grad_norm": 9.138057708740234,
1800
+ "learning_rate": 8.445832314459608e-05
1801
+ },
1802
+ {
1803
+ "step": 201,
1804
+ "epoch": 1.3627118644067797,
1805
+ "cpu_mem": 1.521086464,
1806
+ "gpu_mem": 4.440159744,
1807
+ "loss": 0.4458,
1808
+ "grad_norm": 10.073532104492188,
1809
+ "learning_rate": 8.285741849059311e-05
1810
+ },
1811
+ {
1812
+ "step": 202,
1813
+ "epoch": 1.3694915254237288,
1814
+ "cpu_mem": 1.521086464,
1815
+ "gpu_mem": 4.4399616,
1816
+ "loss": 0.4531,
1817
+ "grad_norm": 8.583861351013184,
1818
+ "learning_rate": 8.126602174088843e-05
1819
+ },
1820
+ {
1821
+ "step": 203,
1822
+ "epoch": 1.376271186440678,
1823
+ "cpu_mem": 1.521086464,
1824
+ "gpu_mem": 4.439847936,
1825
+ "loss": 0.3384,
1826
+ "grad_norm": 7.238431930541992,
1827
+ "learning_rate": 7.968435824946242e-05
1828
+ },
1829
+ {
1830
+ "step": 204,
1831
+ "epoch": 1.383050847457627,
1832
+ "cpu_mem": 1.521086464,
1833
+ "gpu_mem": 4.43986176,
1834
+ "loss": 0.3823,
1835
+ "grad_norm": 7.507495403289795,
1836
+ "learning_rate": 7.811265199199152e-05
1837
+ },
1838
+ {
1839
+ "step": 205,
1840
+ "epoch": 1.3898305084745763,
1841
+ "cpu_mem": 1.521086464,
1842
+ "gpu_mem": 4.439906304,
1843
+ "loss": 0.5007,
1844
+ "grad_norm": 10.150589942932129,
1845
+ "learning_rate": 7.655112553413135e-05
1846
+ },
1847
+ {
1848
+ "step": 206,
1849
+ "epoch": 1.3966101694915254,
1850
+ "cpu_mem": 1.521086464,
1851
+ "gpu_mem": 4.439847936,
1852
+ "loss": 0.383,
1853
+ "grad_norm": 7.802042484283447,
1854
+ "learning_rate": 7.500000000000002e-05
1855
+ },
1856
+ {
1857
+ "step": 207,
1858
+ "epoch": 1.4033898305084747,
1859
+ "cpu_mem": 1.521086464,
1860
+ "gpu_mem": 4.440081408,
1861
+ "loss": 0.4301,
1862
+ "grad_norm": 10.05980396270752,
1863
+ "learning_rate": 7.345949504086507e-05
1864
+ },
1865
+ {
1866
+ "step": 208,
1867
+ "epoch": 1.4101694915254237,
1868
+ "cpu_mem": 1.521086464,
1869
+ "gpu_mem": 4.440112128,
1870
+ "loss": 0.3913,
1871
+ "grad_norm": 12.579257011413574,
1872
+ "learning_rate": 7.192982880403917e-05
1873
+ },
1874
+ {
1875
+ "step": 209,
1876
+ "epoch": 1.4169491525423727,
1877
+ "cpu_mem": 1.521086464,
1878
+ "gpu_mem": 4.4400384,
1879
+ "loss": 0.4284,
1880
+ "grad_norm": 9.47547721862793,
1881
+ "learning_rate": 7.041121790198881e-05
1882
+ },
1883
+ {
1884
+ "step": 210,
1885
+ "epoch": 1.423728813559322,
1886
+ "cpu_mem": 1.521086464,
1887
+ "gpu_mem": 4.439926272,
1888
+ "loss": 0.4698,
1889
+ "grad_norm": 10.045745849609375,
1890
+ "learning_rate": 6.890387738166041e-05
1891
+ },
1892
+ {
1893
+ "step": 211,
1894
+ "epoch": 1.4305084745762713,
1895
+ "cpu_mem": 1.521086464,
1896
+ "gpu_mem": 4.439875584,
1897
+ "loss": 0.3637,
1898
+ "grad_norm": 8.85647964477539,
1899
+ "learning_rate": 6.740802069402771e-05
1900
+ },
1901
+ {
1902
+ "step": 212,
1903
+ "epoch": 1.4372881355932203,
1904
+ "cpu_mem": 1.521086464,
1905
+ "gpu_mem": 4.439844864,
1906
+ "loss": 0.4818,
1907
+ "grad_norm": 9.431801795959473,
1908
+ "learning_rate": 6.592385966386588e-05
1909
+ },
1910
+ {
1911
+ "step": 213,
1912
+ "epoch": 1.4440677966101694,
1913
+ "cpu_mem": 1.521086464,
1914
+ "gpu_mem": 4.439867904,
1915
+ "loss": 0.5163,
1916
+ "grad_norm": 10.95721435546875,
1917
+ "learning_rate": 6.445160445975536e-05
1918
+ },
1919
+ {
1920
+ "step": 214,
1921
+ "epoch": 1.4508474576271186,
1922
+ "cpu_mem": 1.521086464,
1923
+ "gpu_mem": 4.439950848,
1924
+ "loss": 0.3934,
1925
+ "grad_norm": 8.903146743774414,
1926
+ "learning_rate": 6.299146356432029e-05
1927
+ },
1928
+ {
1929
+ "step": 215,
1930
+ "epoch": 1.457627118644068,
1931
+ "cpu_mem": 1.521086464,
1932
+ "gpu_mem": 4.439878656,
1933
+ "loss": 0.511,
1934
+ "grad_norm": 13.773554801940918,
1935
+ "learning_rate": 6.154364374470568e-05
1936
+ },
1937
+ {
1938
+ "step": 216,
1939
+ "epoch": 1.464406779661017,
1940
+ "cpu_mem": 1.521086464,
1941
+ "gpu_mem": 4.440044544,
1942
+ "loss": 0.4131,
1943
+ "grad_norm": 8.684189796447754,
1944
+ "learning_rate": 6.010835002329795e-05
1945
+ },
1946
+ {
1947
+ "step": 217,
1948
+ "epoch": 1.471186440677966,
1949
+ "cpu_mem": 1.521086464,
1950
+ "gpu_mem": 4.439886336,
1951
+ "loss": 0.4951,
1952
+ "grad_norm": 12.174453735351562,
1953
+ "learning_rate": 5.8685785648691894e-05
1954
+ },
1955
+ {
1956
+ "step": 218,
1957
+ "epoch": 1.4779661016949153,
1958
+ "cpu_mem": 1.521086464,
1959
+ "gpu_mem": 4.439863296,
1960
+ "loss": 0.4811,
1961
+ "grad_norm": 9.615564346313477,
1962
+ "learning_rate": 5.72761520669092e-05
1963
+ },
1964
+ {
1965
+ "step": 219,
1966
+ "epoch": 1.4847457627118645,
1967
+ "cpu_mem": 1.521086464,
1968
+ "gpu_mem": 4.439989248,
1969
+ "loss": 0.4467,
1970
+ "grad_norm": 9.505061149597168,
1971
+ "learning_rate": 5.587964889287218e-05
1972
+ },
1973
+ {
1974
+ "step": 220,
1975
+ "epoch": 1.4915254237288136,
1976
+ "cpu_mem": 1.521086464,
1977
+ "gpu_mem": 4.44002304,
1978
+ "loss": 0.4204,
1979
+ "grad_norm": 7.657809734344482,
1980
+ "learning_rate": 5.449647388213678e-05
1981
+ },
1982
+ {
1983
+ "step": 221,
1984
+ "epoch": 1.4983050847457626,
1985
+ "cpu_mem": 1.521086464,
1986
+ "gpu_mem": 4.439890944,
1987
+ "loss": 0.4082,
1988
+ "grad_norm": 8.211697578430176,
1989
+ "learning_rate": 5.312682290288869e-05
1990
+ },
1991
+ {
1992
+ "step": 222,
1993
+ "epoch": 1.505084745762712,
1994
+ "cpu_mem": 1.521086464,
1995
+ "gpu_mem": 4.440027648,
1996
+ "loss": 0.3794,
1997
+ "grad_norm": 8.586983680725098,
1998
+ "learning_rate": 5.1770889908207245e-05
1999
+ },
2000
+ {
2001
+ "step": 223,
2002
+ "epoch": 1.5118644067796612,
2003
+ "cpu_mem": 1.521086464,
2004
+ "gpu_mem": 4.439941632,
2005
+ "loss": 0.409,
2006
+ "grad_norm": 13.101545333862305,
2007
+ "learning_rate": 5.0428866908599864e-05
2008
+ },
2009
+ {
2010
+ "step": 224,
2011
+ "epoch": 1.5186440677966102,
2012
+ "cpu_mem": 1.521086464,
2013
+ "gpu_mem": 4.439906304,
2014
+ "loss": 0.3994,
2015
+ "grad_norm": 9.212977409362793,
2016
+ "learning_rate": 4.9100943944812114e-05
2017
+ },
2018
+ {
2019
+ "step": 225,
2020
+ "epoch": 1.5254237288135593,
2021
+ "cpu_mem": 1.521086464,
2022
+ "gpu_mem": 4.439870976,
2023
+ "loss": 0.4026,
2024
+ "grad_norm": 7.517599105834961,
2025
+ "learning_rate": 4.778730906091632e-05
2026
+ },
2027
+ {
2028
+ "step": 226,
2029
+ "epoch": 1.5322033898305085,
2030
+ "cpu_mem": 1.521086464,
2031
+ "gpu_mem": 4.440019968,
2032
+ "loss": 0.3835,
2033
+ "grad_norm": 9.062960624694824,
2034
+ "learning_rate": 4.648814827768322e-05
2035
+ },
2036
+ {
2037
+ "step": 227,
2038
+ "epoch": 1.5389830508474578,
2039
+ "cpu_mem": 1.521086464,
2040
+ "gpu_mem": 4.439909376,
2041
+ "loss": 0.4001,
2042
+ "grad_norm": 8.712440490722656,
2043
+ "learning_rate": 4.5203645566239816e-05
2044
+ },
2045
+ {
2046
+ "step": 228,
2047
+ "epoch": 1.5457627118644068,
2048
+ "cpu_mem": 1.521086464,
2049
+ "gpu_mem": 4.43985408,
2050
+ "loss": 0.5073,
2051
+ "grad_norm": 8.387016296386719,
2052
+ "learning_rate": 4.3933982822017876e-05
2053
+ },
2054
+ {
2055
+ "step": 229,
2056
+ "epoch": 1.5525423728813559,
2057
+ "cpu_mem": 1.521086464,
2058
+ "gpu_mem": 4.439795712,
2059
+ "loss": 0.3872,
2060
+ "grad_norm": 7.554939270019531,
2061
+ "learning_rate": 4.267933983899601e-05
2062
+ },
2063
+ {
2064
+ "step": 230,
2065
+ "epoch": 1.559322033898305,
2066
+ "cpu_mem": 1.521086464,
2067
+ "gpu_mem": 4.439852544,
2068
+ "loss": 0.3522,
2069
+ "grad_norm": 9.5506010055542,
2070
+ "learning_rate": 4.143989428423947e-05
2071
+ },
2072
+ {
2073
+ "step": 231,
2074
+ "epoch": 1.5661016949152542,
2075
+ "cpu_mem": 1.521086464,
2076
+ "gpu_mem": 4.44013056,
2077
+ "loss": 0.4594,
2078
+ "grad_norm": 9.223462104797363,
2079
+ "learning_rate": 4.0215821672741213e-05
2080
+ },
2081
+ {
2082
+ "step": 232,
2083
+ "epoch": 1.5728813559322035,
2084
+ "cpu_mem": 1.521086464,
2085
+ "gpu_mem": 4.43985408,
2086
+ "loss": 0.4328,
2087
+ "grad_norm": 8.588035583496094,
2088
+ "learning_rate": 3.900729534256745e-05
2089
+ },
2090
+ {
2091
+ "step": 233,
2092
+ "epoch": 1.5796610169491525,
2093
+ "cpu_mem": 1.521086464,
2094
+ "gpu_mem": 4.440167424,
2095
+ "loss": 0.344,
2096
+ "grad_norm": 8.180073738098145,
2097
+ "learning_rate": 3.781448643031187e-05
2098
+ },
2099
+ {
2100
+ "step": 234,
2101
+ "epoch": 1.5864406779661016,
2102
+ "cpu_mem": 1.521086464,
2103
+ "gpu_mem": 4.440043008,
2104
+ "loss": 0.3725,
2105
+ "grad_norm": 11.422308921813965,
2106
+ "learning_rate": 3.663756384686127e-05
2107
+ },
2108
+ {
2109
+ "step": 235,
2110
+ "epoch": 1.5932203389830508,
2111
+ "cpu_mem": 1.521086464,
2112
+ "gpu_mem": 4.439798784,
2113
+ "loss": 0.3704,
2114
+ "grad_norm": 10.164522171020508,
2115
+ "learning_rate": 3.547669425347647e-05
2116
+ },
2117
+ {
2118
+ "step": 236,
2119
+ "epoch": 1.6,
2120
+ "cpu_mem": 1.521086464,
2121
+ "gpu_mem": 4.439858688,
2122
+ "loss": 0.3934,
2123
+ "grad_norm": 9.219282150268555,
2124
+ "learning_rate": 3.433204203819185e-05
2125
+ },
2126
+ {
2127
+ "step": 237,
2128
+ "epoch": 1.6067796610169491,
2129
+ "cpu_mem": 1.521086464,
2130
+ "gpu_mem": 4.439920128,
2131
+ "loss": 0.4544,
2132
+ "grad_norm": 9.418452262878418,
2133
+ "learning_rate": 3.3203769292536764e-05
2134
+ },
2135
+ {
2136
+ "step": 238,
2137
+ "epoch": 1.6135593220338982,
2138
+ "cpu_mem": 1.521086464,
2139
+ "gpu_mem": 4.439921664,
2140
+ "loss": 0.4074,
2141
+ "grad_norm": 8.276749610900879,
2142
+ "learning_rate": 3.209203578858191e-05
2143
+ },
2144
+ {
2145
+ "step": 239,
2146
+ "epoch": 1.6203389830508474,
2147
+ "cpu_mem": 1.521086464,
2148
+ "gpu_mem": 4.440175104,
2149
+ "loss": 0.4748,
2150
+ "grad_norm": 9.793453216552734,
2151
+ "learning_rate": 3.099699895631474e-05
2152
+ },
2153
+ {
2154
+ "step": 240,
2155
+ "epoch": 1.6271186440677967,
2156
+ "cpu_mem": 1.521086464,
2157
+ "gpu_mem": 4.439824896,
2158
+ "loss": 0.6822,
2159
+ "grad_norm": 12.661322593688965,
2160
+ "learning_rate": 2.9918813861345952e-05
2161
+ },
2162
+ {
2163
+ "step": 241,
2164
+ "epoch": 1.6338983050847458,
2165
+ "cpu_mem": 1.521086464,
2166
+ "gpu_mem": 4.440121344,
2167
+ "loss": 0.4709,
2168
+ "grad_norm": 10.91852855682373,
2169
+ "learning_rate": 2.885763318295102e-05
2170
+ },
2171
+ {
2172
+ "step": 242,
2173
+ "epoch": 1.6406779661016948,
2174
+ "cpu_mem": 1.521086464,
2175
+ "gpu_mem": 4.439983104,
2176
+ "loss": 0.3463,
2177
+ "grad_norm": 7.739848613739014,
2178
+ "learning_rate": 2.781360719244964e-05
2179
+ },
2180
+ {
2181
+ "step": 243,
2182
+ "epoch": 1.647457627118644,
2183
+ "cpu_mem": 1.521086464,
2184
+ "gpu_mem": 4.439835648,
2185
+ "loss": 0.3889,
2186
+ "grad_norm": 10.5182523727417,
2187
+ "learning_rate": 2.6786883731926306e-05
2188
+ },
2189
+ {
2190
+ "step": 244,
2191
+ "epoch": 1.6542372881355933,
2192
+ "cpu_mem": 1.521086464,
2193
+ "gpu_mem": 4.439975424,
2194
+ "loss": 0.3747,
2195
+ "grad_norm": 8.963127136230469,
2196
+ "learning_rate": 2.5777608193294396e-05
2197
+ },
2198
+ {
2199
+ "step": 245,
2200
+ "epoch": 1.6610169491525424,
2201
+ "cpu_mem": 1.521086464,
2202
+ "gpu_mem": 4.43985408,
2203
+ "loss": 0.4079,
2204
+ "grad_norm": 7.328197479248047,
2205
+ "learning_rate": 2.4785923497707956e-05
2206
+ },
2207
+ {
2208
+ "step": 246,
2209
+ "epoch": 1.6677966101694914,
2210
+ "cpu_mem": 1.521086464,
2211
+ "gpu_mem": 4.439947776,
2212
+ "loss": 0.4373,
2213
+ "grad_norm": 8.23412799835205,
2214
+ "learning_rate": 2.38119700753228e-05
2215
+ },
2216
+ {
2217
+ "step": 247,
2218
+ "epoch": 1.6745762711864407,
2219
+ "cpu_mem": 1.521086464,
2220
+ "gpu_mem": 4.439966208,
2221
+ "loss": 0.2948,
2222
+ "grad_norm": 9.671659469604492,
2223
+ "learning_rate": 2.285588584541047e-05
2224
+ },
2225
+ {
2226
+ "step": 248,
2227
+ "epoch": 1.68135593220339,
2228
+ "cpu_mem": 1.521086464,
2229
+ "gpu_mem": 4.439918592,
2230
+ "loss": 0.3806,
2231
+ "grad_norm": 8.410043716430664,
2232
+ "learning_rate": 2.1917806196827792e-05
2233
+ },
2234
+ {
2235
+ "step": 249,
2236
+ "epoch": 1.688135593220339,
2237
+ "cpu_mem": 1.521086464,
2238
+ "gpu_mem": 4.439824896,
2239
+ "loss": 0.3278,
2240
+ "grad_norm": 8.285307884216309,
2241
+ "learning_rate": 2.0997863968844914e-05
2242
+ },
2243
+ {
2244
+ "step": 250,
2245
+ "epoch": 1.694915254237288,
2246
+ "cpu_mem": 1.521086464,
2247
+ "gpu_mem": 4.439917056,
2248
+ "loss": 0.4161,
2249
+ "grad_norm": 9.953680038452148,
2250
+ "learning_rate": 2.009618943233419e-05
2251
+ },
2252
+ {
2253
+ "step": 251,
2254
+ "epoch": 1.7016949152542373,
2255
+ "cpu_mem": 1.521086464,
2256
+ "gpu_mem": 4.439829504,
2257
+ "loss": 0.3733,
2258
+ "grad_norm": 10.67496395111084,
2259
+ "learning_rate": 1.921291027132278e-05
2260
+ },
2261
+ {
2262
+ "step": 252,
2263
+ "epoch": 1.7084745762711866,
2264
+ "cpu_mem": 1.521086464,
2265
+ "gpu_mem": 4.439872512,
2266
+ "loss": 0.3729,
2267
+ "grad_norm": 7.934334754943848,
2268
+ "learning_rate": 1.834815156491165e-05
2269
+ },
2270
+ {
2271
+ "step": 253,
2272
+ "epoch": 1.7152542372881356,
2273
+ "cpu_mem": 1.521086464,
2274
+ "gpu_mem": 4.440066048,
2275
+ "loss": 0.4745,
2276
+ "grad_norm": 10.878839492797852,
2277
+ "learning_rate": 1.750203576956341e-05
2278
+ },
2279
+ {
2280
+ "step": 254,
2281
+ "epoch": 1.7220338983050847,
2282
+ "cpu_mem": 1.521086464,
2283
+ "gpu_mem": 4.43986176,
2284
+ "loss": 0.4278,
2285
+ "grad_norm": 9.874471664428711,
2286
+ "learning_rate": 1.6674682701761493e-05
2287
+ },
2288
+ {
2289
+ "step": 255,
2290
+ "epoch": 1.7288135593220337,
2291
+ "cpu_mem": 1.521086464,
2292
+ "gpu_mem": 4.440018432,
2293
+ "loss": 0.4858,
2294
+ "grad_norm": 10.910809516906738,
2295
+ "learning_rate": 1.5866209521043304e-05
2296
+ },
2297
+ {
2298
+ "step": 256,
2299
+ "epoch": 1.735593220338983,
2300
+ "cpu_mem": 1.521086464,
2301
+ "gpu_mem": 4.439844864,
2302
+ "loss": 0.4624,
2303
+ "grad_norm": 9.551429748535156,
2304
+ "learning_rate": 1.5076730713409523e-05
2305
+ },
2306
+ {
2307
+ "step": 257,
2308
+ "epoch": 1.7423728813559323,
2309
+ "cpu_mem": 1.521086464,
2310
+ "gpu_mem": 4.440258048,
2311
+ "loss": 0.4309,
2312
+ "grad_norm": 8.495111465454102,
2313
+ "learning_rate": 1.4306358075111923e-05
2314
+ },
2315
+ {
2316
+ "step": 258,
2317
+ "epoch": 1.7491525423728813,
2318
+ "cpu_mem": 1.521086464,
2319
+ "gpu_mem": 4.439917056,
2320
+ "loss": 0.5244,
2321
+ "grad_norm": 15.041364669799805,
2322
+ "learning_rate": 1.3555200696822232e-05
2323
+ },
2324
+ {
2325
+ "step": 259,
2326
+ "epoch": 1.7559322033898304,
2327
+ "cpu_mem": 1.521086464,
2328
+ "gpu_mem": 4.439834112,
2329
+ "loss": 0.3973,
2330
+ "grad_norm": 8.967065811157227,
2331
+ "learning_rate": 1.2823364948184095e-05
2332
+ },
2333
+ {
2334
+ "step": 260,
2335
+ "epoch": 1.7627118644067796,
2336
+ "cpu_mem": 1.521086464,
2337
+ "gpu_mem": 4.439950848,
2338
+ "loss": 0.3158,
2339
+ "grad_norm": 7.493344306945801,
2340
+ "learning_rate": 1.2110954462750166e-05
2341
+ },
2342
+ {
2343
+ "step": 261,
2344
+ "epoch": 1.769491525423729,
2345
+ "cpu_mem": 1.521086464,
2346
+ "gpu_mem": 4.439906304,
2347
+ "loss": 0.326,
2348
+ "grad_norm": 8.3322114944458,
2349
+ "learning_rate": 1.1418070123306989e-05
2350
+ },
2351
+ {
2352
+ "step": 262,
2353
+ "epoch": 1.776271186440678,
2354
+ "cpu_mem": 1.521086464,
2355
+ "gpu_mem": 4.439863296,
2356
+ "loss": 0.3214,
2357
+ "grad_norm": 7.001258850097656,
2358
+ "learning_rate": 1.0744810047589115e-05
2359
+ },
2360
+ {
2361
+ "step": 263,
2362
+ "epoch": 1.783050847457627,
2363
+ "cpu_mem": 1.521086464,
2364
+ "gpu_mem": 4.43990016,
2365
+ "loss": 0.3677,
2366
+ "grad_norm": 7.1652116775512695,
2367
+ "learning_rate": 1.0091269574384874e-05
2368
+ },
2369
+ {
2370
+ "step": 264,
2371
+ "epoch": 1.7898305084745763,
2372
+ "cpu_mem": 1.521086464,
2373
+ "gpu_mem": 4.439987712,
2374
+ "loss": 0.3891,
2375
+ "grad_norm": 9.185032844543457,
2376
+ "learning_rate": 9.45754125003576e-06
2377
+ },
2378
+ {
2379
+ "step": 265,
2380
+ "epoch": 1.7966101694915255,
2381
+ "cpu_mem": 1.521086464,
2382
+ "gpu_mem": 4.439906304,
2383
+ "loss": 0.4603,
2384
+ "grad_norm": 9.57519245147705,
2385
+ "learning_rate": 8.843714815330987e-06
2386
+ },
2387
+ {
2388
+ "step": 266,
2389
+ "epoch": 1.8033898305084746,
2390
+ "cpu_mem": 1.521086464,
2391
+ "gpu_mem": 4.440121344,
2392
+ "loss": 0.4583,
2393
+ "grad_norm": 10.558945655822754,
2394
+ "learning_rate": 8.249877192799731e-06
2395
+ },
2396
+ {
2397
+ "step": 267,
2398
+ "epoch": 1.8101694915254236,
2399
+ "cpu_mem": 1.521086464,
2400
+ "gpu_mem": 4.439913984,
2401
+ "loss": 0.4191,
2402
+ "grad_norm": 10.343015670776367,
2403
+ "learning_rate": 7.676112474402068e-06
2404
+ },
2405
+ {
2406
+ "step": 268,
2407
+ "epoch": 1.8169491525423729,
2408
+ "cpu_mem": 1.521086464,
2409
+ "gpu_mem": 4.439918592,
2410
+ "loss": 0.3232,
2411
+ "grad_norm": 8.387310028076172,
2412
+ "learning_rate": 7.122501909620926e-06
2413
+ },
2414
+ {
2415
+ "step": 269,
2416
+ "epoch": 1.8237288135593221,
2417
+ "cpu_mem": 1.521086464,
2418
+ "gpu_mem": 4.439929344,
2419
+ "loss": 0.4041,
2420
+ "grad_norm": 11.825252532958984,
2421
+ "learning_rate": 6.5891238939566275e-06
2422
+ },
2423
+ {
2424
+ "step": 270,
2425
+ "epoch": 1.8305084745762712,
2426
+ "cpu_mem": 1.521086464,
2427
+ "gpu_mem": 4.439967744,
2428
+ "loss": 0.365,
2429
+ "grad_norm": 10.832479476928711,
2430
+ "learning_rate": 6.076053957825411e-06
2431
+ },
2432
+ {
2433
+ "step": 271,
2434
+ "epoch": 1.8372881355932202,
2435
+ "cpu_mem": 1.521086464,
2436
+ "gpu_mem": 4.440019968,
2437
+ "loss": 0.4345,
2438
+ "grad_norm": 10.482426643371582,
2439
+ "learning_rate": 5.583364755863701e-06
2440
+ },
2441
+ {
2442
+ "step": 272,
2443
+ "epoch": 1.8440677966101695,
2444
+ "cpu_mem": 1.521086464,
2445
+ "gpu_mem": 4.439878656,
2446
+ "loss": 0.4865,
2447
+ "grad_norm": 8.570451736450195,
2448
+ "learning_rate": 5.11112605663977e-06
2449
+ },
2450
+ {
2451
+ "step": 273,
2452
+ "epoch": 1.8508474576271188,
2453
+ "cpu_mem": 1.521086464,
2454
+ "gpu_mem": 4.439758848,
2455
+ "loss": 0.425,
2456
+ "grad_norm": 9.54675579071045,
2457
+ "learning_rate": 4.659404732773908e-06
2458
+ },
2459
+ {
2460
+ "step": 274,
2461
+ "epoch": 1.8576271186440678,
2462
+ "cpu_mem": 1.521086464,
2463
+ "gpu_mem": 4.439986176,
2464
+ "loss": 0.4173,
2465
+ "grad_norm": 8.669840812683105,
2466
+ "learning_rate": 4.228264751468752e-06
2467
+ },
2468
+ {
2469
+ "step": 275,
2470
+ "epoch": 1.8644067796610169,
2471
+ "cpu_mem": 1.521086464,
2472
+ "gpu_mem": 4.4402304,
2473
+ "loss": 0.3489,
2474
+ "grad_norm": 7.460136890411377,
2475
+ "learning_rate": 3.817767165451041e-06
2476
+ },
2477
+ {
2478
+ "step": 276,
2479
+ "epoch": 1.8711864406779661,
2480
+ "cpu_mem": 1.521086464,
2481
+ "gpu_mem": 4.439890944,
2482
+ "loss": 0.4088,
2483
+ "grad_norm": 8.213056564331055,
2484
+ "learning_rate": 3.4279701043260886e-06
2485
+ },
2486
+ {
2487
+ "step": 277,
2488
+ "epoch": 1.8779661016949154,
2489
+ "cpu_mem": 1.521086464,
2490
+ "gpu_mem": 4.439837184,
2491
+ "loss": 0.5171,
2492
+ "grad_norm": 11.188621520996094,
2493
+ "learning_rate": 3.0589287663461472e-06
2494
+ },
2495
+ {
2496
+ "step": 278,
2497
+ "epoch": 1.8847457627118644,
2498
+ "cpu_mem": 1.521086464,
2499
+ "gpu_mem": 4.44,
2500
+ "loss": 0.4212,
2501
+ "grad_norm": 8.904882431030273,
2502
+ "learning_rate": 2.710695410593994e-06
2503
+ },
2504
+ {
2505
+ "step": 279,
2506
+ "epoch": 1.8915254237288135,
2507
+ "cpu_mem": 1.521086464,
2508
+ "gpu_mem": 4.439940096,
2509
+ "loss": 0.4414,
2510
+ "grad_norm": 9.937751770019531,
2511
+ "learning_rate": 2.3833193495825853e-06
2512
+ },
2513
+ {
2514
+ "step": 280,
2515
+ "epoch": 1.8983050847457628,
2516
+ "cpu_mem": 1.521086464,
2517
+ "gpu_mem": 4.439920128,
2518
+ "loss": 0.4172,
2519
+ "grad_norm": 8.643899917602539,
2520
+ "learning_rate": 2.076846942272026e-06
2521
+ },
2522
+ {
2523
+ "step": 281,
2524
+ "epoch": 1.905084745762712,
2525
+ "cpu_mem": 1.521086464,
2526
+ "gpu_mem": 4.439855616,
2527
+ "loss": 0.3792,
2528
+ "grad_norm": 8.005178451538086,
2529
+ "learning_rate": 1.791321587504768e-06
2530
+ },
2531
+ {
2532
+ "step": 282,
2533
+ "epoch": 1.911864406779661,
2534
+ "cpu_mem": 1.521086464,
2535
+ "gpu_mem": 4.44028416,
2536
+ "loss": 0.3363,
2537
+ "grad_norm": 10.754042625427246,
2538
+ "learning_rate": 1.5267837178600972e-06
2539
+ },
2540
+ {
2541
+ "step": 283,
2542
+ "epoch": 1.9186440677966101,
2543
+ "cpu_mem": 1.521086464,
2544
+ "gpu_mem": 4.439990784,
2545
+ "loss": 0.3576,
2546
+ "grad_norm": 9.666711807250977,
2547
+ "learning_rate": 1.2832707939284427e-06
2548
+ },
2549
+ {
2550
+ "step": 284,
2551
+ "epoch": 1.9254237288135592,
2552
+ "cpu_mem": 1.521086464,
2553
+ "gpu_mem": 4.4398464,
2554
+ "loss": 0.3908,
2555
+ "grad_norm": 9.546576499938965,
2556
+ "learning_rate": 1.0608172990067553e-06
2557
+ },
2558
+ {
2559
+ "step": 285,
2560
+ "epoch": 1.9322033898305084,
2561
+ "cpu_mem": 1.521086464,
2562
+ "gpu_mem": 4.43990016,
2563
+ "loss": 0.4081,
2564
+ "grad_norm": 8.594533920288086,
2565
+ "learning_rate": 8.594547342153979e-07
2566
+ },
2567
+ {
2568
+ "step": 286,
2569
+ "epoch": 1.9389830508474577,
2570
+ "cpu_mem": 1.521086464,
2571
+ "gpu_mem": 4.440317952,
2572
+ "loss": 0.4529,
2573
+ "grad_norm": 8.963507652282715,
2574
+ "learning_rate": 6.792116140373116e-07
2575
+ },
2576
+ {
2577
+ "step": 287,
2578
+ "epoch": 1.9457627118644067,
2579
+ "cpu_mem": 1.521086464,
2580
+ "gpu_mem": 4.440087552,
2581
+ "loss": 0.3062,
2582
+ "grad_norm": 8.207653045654297,
2583
+ "learning_rate": 5.201134622801473e-07
2584
+ },
2585
+ {
2586
+ "step": 288,
2587
+ "epoch": 1.9525423728813558,
2588
+ "cpu_mem": 1.521086464,
2589
+ "gpu_mem": 4.439872512,
2590
+ "loss": 0.5091,
2591
+ "grad_norm": 11.16333293914795,
2592
+ "learning_rate": 3.821828084619727e-07
2593
+ },
2594
+ {
2595
+ "step": 289,
2596
+ "epoch": 1.959322033898305,
2597
+ "cpu_mem": 1.521086464,
2598
+ "gpu_mem": 4.439956992,
2599
+ "loss": 0.4421,
2600
+ "grad_norm": 8.138298988342285,
2601
+ "learning_rate": 2.654391846207915e-07
2602
+ },
2603
+ {
2604
+ "step": 290,
2605
+ "epoch": 1.9661016949152543,
2606
+ "cpu_mem": 1.521086464,
2607
+ "gpu_mem": 4.439881728,
2608
+ "loss": 0.4773,
2609
+ "grad_norm": 10.596417427062988,
2610
+ "learning_rate": 1.6989912254880556e-07
2611
+ },
2612
+ {
2613
+ "step": 291,
2614
+ "epoch": 1.9728813559322034,
2615
+ "cpu_mem": 1.521086464,
2616
+ "gpu_mem": 4.439917056,
2617
+ "loss": 0.4893,
2618
+ "grad_norm": 8.895089149475098,
2619
+ "learning_rate": 9.557615145123765e-08
2620
+ },
2621
+ {
2622
+ "step": 292,
2623
+ "epoch": 1.9796610169491524,
2624
+ "cpu_mem": 1.521086464,
2625
+ "gpu_mem": 4.44,
2626
+ "loss": 0.3718,
2627
+ "grad_norm": 7.455649375915527,
2628
+ "learning_rate": 4.248079603064724e-08
2629
+ },
2630
+ {
2631
+ "step": 293,
2632
+ "epoch": 1.9864406779661017,
2633
+ "cpu_mem": 1.521086464,
2634
+ "gpu_mem": 4.439917056,
2635
+ "loss": 0.5375,
2636
+ "grad_norm": 9.682267189025879,
2637
+ "learning_rate": 1.0620574996372811e-08
2638
+ },
2639
+ {
2640
+ "step": 294,
2641
+ "epoch": 1.993220338983051,
2642
+ "cpu_mem": 1.521086464,
2643
+ "gpu_mem": 4.439943168,
2644
+ "loss": 0.4929,
2645
+ "grad_norm": 11.033202171325684,
2646
+ "learning_rate": 0.0
2647
+ },
2648
+ {
2649
+ "step": 294,
2650
+ "epoch": 1.993220338983051,
2651
+ "cpu_mem": 1.521086464,
2652
+ "gpu_mem": 4.439943168,
2653
+ "train_runtime": 4551.7265,
2654
+ "train_samples_per_second": 4.142,
2655
+ "train_steps_per_second": 0.065,
2656
+ "total_flos": 0.0,
2657
+ "train_loss": 0.6919786211381964
2658
+ }
2659
+ ]
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 32,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "boolq",
3
+ "results": 0.7718654434250765
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "BOOLQ",
5
+ "dataset_id": "google/boolq",
6
+ "preprocess_id": "boolq_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 13009920
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 2,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-boolq-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T23:08:42.953080"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r32-a2/training_logs.json ADDED
@@ -0,0 +1,2659 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "step": 1,
4
+ "epoch": 0.006779661016949152,
5
+ "cpu_mem": 1.493827584,
6
+ "gpu_mem": 4.520274944,
7
+ "loss": 8.869,
8
+ "grad_norm": 13.0520601272583,
9
+ "learning_rate": 9.999999999999999e-06
10
+ },
11
+ {
12
+ "step": 2,
13
+ "epoch": 0.013559322033898305,
14
+ "cpu_mem": 1.494614016,
15
+ "gpu_mem": 4.624492032,
16
+ "loss": 8.9376,
17
+ "grad_norm": 13.392369270324707,
18
+ "learning_rate": 1.9999999999999998e-05
19
+ },
20
+ {
21
+ "step": 3,
22
+ "epoch": 0.020338983050847456,
23
+ "cpu_mem": 1.49520384,
24
+ "gpu_mem": 4.624410624,
25
+ "loss": 8.8022,
26
+ "grad_norm": 13.587163925170898,
27
+ "learning_rate": 2.9999999999999997e-05
28
+ },
29
+ {
30
+ "step": 4,
31
+ "epoch": 0.02711864406779661,
32
+ "cpu_mem": 1.495793664,
33
+ "gpu_mem": 4.624410624,
34
+ "loss": 8.4715,
35
+ "grad_norm": 13.895402908325195,
36
+ "learning_rate": 3.9999999999999996e-05
37
+ },
38
+ {
39
+ "step": 5,
40
+ "epoch": 0.03389830508474576,
41
+ "cpu_mem": 1.49618688,
42
+ "gpu_mem": 4.624346112,
43
+ "loss": 7.9105,
44
+ "grad_norm": 13.423394203186035,
45
+ "learning_rate": 4.9999999999999996e-05
46
+ },
47
+ {
48
+ "step": 6,
49
+ "epoch": 0.04067796610169491,
50
+ "cpu_mem": 1.496776704,
51
+ "gpu_mem": 4.62436608,
52
+ "loss": 7.5511,
53
+ "grad_norm": 12.783663749694824,
54
+ "learning_rate": 5.9999999999999995e-05
55
+ },
56
+ {
57
+ "step": 7,
58
+ "epoch": 0.04745762711864407,
59
+ "cpu_mem": 1.497366528,
60
+ "gpu_mem": 4.624418304,
61
+ "loss": 6.6874,
62
+ "grad_norm": 13.757925987243652,
63
+ "learning_rate": 7e-05
64
+ },
65
+ {
66
+ "step": 8,
67
+ "epoch": 0.05423728813559322,
68
+ "cpu_mem": 1.497759744,
69
+ "gpu_mem": 4.62450432,
70
+ "loss": 5.8091,
71
+ "grad_norm": 13.570146560668945,
72
+ "learning_rate": 7.999999999999999e-05
73
+ },
74
+ {
75
+ "step": 9,
76
+ "epoch": 0.061016949152542375,
77
+ "cpu_mem": 1.49815296,
78
+ "gpu_mem": 4.62441216,
79
+ "loss": 4.8142,
80
+ "grad_norm": 12.81284236907959,
81
+ "learning_rate": 8.999999999999999e-05
82
+ },
83
+ {
84
+ "step": 10,
85
+ "epoch": 0.06779661016949153,
86
+ "cpu_mem": 1.498546176,
87
+ "gpu_mem": 4.62431232,
88
+ "loss": 3.9045,
89
+ "grad_norm": 12.29382610321045,
90
+ "learning_rate": 9.999999999999999e-05
91
+ },
92
+ {
93
+ "step": 11,
94
+ "epoch": 0.07457627118644068,
95
+ "cpu_mem": 1.499136,
96
+ "gpu_mem": 4.624416768,
97
+ "loss": 2.7592,
98
+ "grad_norm": 10.226658821105957,
99
+ "learning_rate": 0.00010999999999999998
100
+ },
101
+ {
102
+ "step": 12,
103
+ "epoch": 0.08135593220338982,
104
+ "cpu_mem": 1.499529216,
105
+ "gpu_mem": 4.62478848,
106
+ "loss": 2.0805,
107
+ "grad_norm": 7.122352600097656,
108
+ "learning_rate": 0.00011999999999999999
109
+ },
110
+ {
111
+ "step": 13,
112
+ "epoch": 0.08813559322033898,
113
+ "cpu_mem": 1.50011904,
114
+ "gpu_mem": 4.624392192,
115
+ "loss": 1.7604,
116
+ "grad_norm": 4.6575140953063965,
117
+ "learning_rate": 0.00013
118
+ },
119
+ {
120
+ "step": 14,
121
+ "epoch": 0.09491525423728814,
122
+ "cpu_mem": 1.500512256,
123
+ "gpu_mem": 4.624369152,
124
+ "loss": 1.1904,
125
+ "grad_norm": 2.9982383251190186,
126
+ "learning_rate": 0.00014
127
+ },
128
+ {
129
+ "step": 15,
130
+ "epoch": 0.1016949152542373,
131
+ "cpu_mem": 1.500708864,
132
+ "gpu_mem": 4.624307712,
133
+ "loss": 1.148,
134
+ "grad_norm": 2.116892099380493,
135
+ "learning_rate": 0.00015
136
+ },
137
+ {
138
+ "step": 16,
139
+ "epoch": 0.10847457627118644,
140
+ "cpu_mem": 1.50110208,
141
+ "gpu_mem": 4.624392192,
142
+ "loss": 0.9269,
143
+ "grad_norm": 1.62022066116333,
144
+ "learning_rate": 0.00015999999999999999
145
+ },
146
+ {
147
+ "step": 17,
148
+ "epoch": 0.1152542372881356,
149
+ "cpu_mem": 1.501495296,
150
+ "gpu_mem": 4.624432128,
151
+ "loss": 0.8529,
152
+ "grad_norm": 1.9826555252075195,
153
+ "learning_rate": 0.00016999999999999999
154
+ },
155
+ {
156
+ "step": 18,
157
+ "epoch": 0.12203389830508475,
158
+ "cpu_mem": 1.501691904,
159
+ "gpu_mem": 4.624495104,
160
+ "loss": 0.7057,
161
+ "grad_norm": 0.8324311971664429,
162
+ "learning_rate": 0.00017999999999999998
163
+ },
164
+ {
165
+ "step": 19,
166
+ "epoch": 0.1288135593220339,
167
+ "cpu_mem": 1.50208512,
168
+ "gpu_mem": 4.624332288,
169
+ "loss": 0.7184,
170
+ "grad_norm": 0.6730552911758423,
171
+ "learning_rate": 0.00018999999999999998
172
+ },
173
+ {
174
+ "step": 20,
175
+ "epoch": 0.13559322033898305,
176
+ "cpu_mem": 1.502478336,
177
+ "gpu_mem": 4.624444416,
178
+ "loss": 0.7847,
179
+ "grad_norm": 3.902228593826294,
180
+ "learning_rate": 0.00019999999999999998
181
+ },
182
+ {
183
+ "step": 21,
184
+ "epoch": 0.1423728813559322,
185
+ "cpu_mem": 1.502674944,
186
+ "gpu_mem": 4.624602624,
187
+ "loss": 0.6757,
188
+ "grad_norm": 1.8044828176498413,
189
+ "learning_rate": 0.00020999999999999998
190
+ },
191
+ {
192
+ "step": 22,
193
+ "epoch": 0.14915254237288136,
194
+ "cpu_mem": 1.50306816,
195
+ "gpu_mem": 4.624495104,
196
+ "loss": 0.8293,
197
+ "grad_norm": 3.8413350582122803,
198
+ "learning_rate": 0.00021999999999999995
199
+ },
200
+ {
201
+ "step": 23,
202
+ "epoch": 0.15593220338983052,
203
+ "cpu_mem": 1.503264768,
204
+ "gpu_mem": 4.624467456,
205
+ "loss": 0.8114,
206
+ "grad_norm": 3.3119895458221436,
207
+ "learning_rate": 0.00023
208
+ },
209
+ {
210
+ "step": 24,
211
+ "epoch": 0.16271186440677965,
212
+ "cpu_mem": 1.503657984,
213
+ "gpu_mem": 4.624524288,
214
+ "loss": 0.6281,
215
+ "grad_norm": 1.0927964448928833,
216
+ "learning_rate": 0.00023999999999999998
217
+ },
218
+ {
219
+ "step": 25,
220
+ "epoch": 0.1694915254237288,
221
+ "cpu_mem": 1.503854592,
222
+ "gpu_mem": 4.624309248,
223
+ "loss": 0.6868,
224
+ "grad_norm": 1.6278636455535889,
225
+ "learning_rate": 0.00025
226
+ },
227
+ {
228
+ "step": 26,
229
+ "epoch": 0.17627118644067796,
230
+ "cpu_mem": 1.5040512,
231
+ "gpu_mem": 4.624364544,
232
+ "loss": 0.6885,
233
+ "grad_norm": 1.0237977504730225,
234
+ "learning_rate": 0.00026
235
+ },
236
+ {
237
+ "step": 27,
238
+ "epoch": 0.18305084745762712,
239
+ "cpu_mem": 1.504247808,
240
+ "gpu_mem": 4.624656384,
241
+ "loss": 0.6395,
242
+ "grad_norm": 1.4592667818069458,
243
+ "learning_rate": 0.00027
244
+ },
245
+ {
246
+ "step": 28,
247
+ "epoch": 0.18983050847457628,
248
+ "cpu_mem": 1.504641024,
249
+ "gpu_mem": 4.62433536,
250
+ "loss": 0.8711,
251
+ "grad_norm": 3.5112271308898926,
252
+ "learning_rate": 0.00028
253
+ },
254
+ {
255
+ "step": 29,
256
+ "epoch": 0.19661016949152543,
257
+ "cpu_mem": 1.50503424,
258
+ "gpu_mem": 4.624399872,
259
+ "loss": 0.7362,
260
+ "grad_norm": 2.1286396980285645,
261
+ "learning_rate": 0.00029
262
+ },
263
+ {
264
+ "step": 30,
265
+ "epoch": 0.2033898305084746,
266
+ "cpu_mem": 1.505230848,
267
+ "gpu_mem": 4.624478208,
268
+ "loss": 0.7353,
269
+ "grad_norm": 1.9305531978607178,
270
+ "learning_rate": 0.0003
271
+ },
272
+ {
273
+ "step": 31,
274
+ "epoch": 0.21016949152542372,
275
+ "cpu_mem": 1.505427456,
276
+ "gpu_mem": 4.6242816,
277
+ "loss": 0.6234,
278
+ "grad_norm": 0.9298433661460876,
279
+ "learning_rate": 0.0002999893794250036
280
+ },
281
+ {
282
+ "step": 32,
283
+ "epoch": 0.21694915254237288,
284
+ "cpu_mem": 1.505624064,
285
+ "gpu_mem": 4.624395264,
286
+ "loss": 0.6989,
287
+ "grad_norm": 0.8718124032020569,
288
+ "learning_rate": 0.00029995751920396937
289
+ },
290
+ {
291
+ "step": 33,
292
+ "epoch": 0.22372881355932203,
293
+ "cpu_mem": 1.505820672,
294
+ "gpu_mem": 4.624633344,
295
+ "loss": 0.6819,
296
+ "grad_norm": 0.6011456847190857,
297
+ "learning_rate": 0.00029990442384854874
298
+ },
299
+ {
300
+ "step": 34,
301
+ "epoch": 0.2305084745762712,
302
+ "cpu_mem": 1.50601728,
303
+ "gpu_mem": 4.62433536,
304
+ "loss": 0.589,
305
+ "grad_norm": 0.5000481009483337,
306
+ "learning_rate": 0.0002998301008774512
307
+ },
308
+ {
309
+ "step": 35,
310
+ "epoch": 0.23728813559322035,
311
+ "cpu_mem": 1.506213888,
312
+ "gpu_mem": 4.624545792,
313
+ "loss": 0.7987,
314
+ "grad_norm": 1.9291328191757202,
315
+ "learning_rate": 0.0002997345608153792
316
+ },
317
+ {
318
+ "step": 36,
319
+ "epoch": 0.2440677966101695,
320
+ "cpu_mem": 1.506410496,
321
+ "gpu_mem": 4.62449664,
322
+ "loss": 0.7698,
323
+ "grad_norm": 1.7601375579833984,
324
+ "learning_rate": 0.000299617817191538
325
+ },
326
+ {
327
+ "step": 37,
328
+ "epoch": 0.25084745762711863,
329
+ "cpu_mem": 1.506607104,
330
+ "gpu_mem": 4.624307712,
331
+ "loss": 0.6271,
332
+ "grad_norm": 0.5662587285041809,
333
+ "learning_rate": 0.0002994798865377198
334
+ },
335
+ {
336
+ "step": 38,
337
+ "epoch": 0.2576271186440678,
338
+ "cpu_mem": 1.506803712,
339
+ "gpu_mem": 4.624555008,
340
+ "loss": 0.7151,
341
+ "grad_norm": 0.951106071472168,
342
+ "learning_rate": 0.0002993207883859627
343
+ },
344
+ {
345
+ "step": 39,
346
+ "epoch": 0.26440677966101694,
347
+ "cpu_mem": 1.50700032,
348
+ "gpu_mem": 4.6249344,
349
+ "loss": 0.7289,
350
+ "grad_norm": 1.2373714447021484,
351
+ "learning_rate": 0.0002991405452657846
352
+ },
353
+ {
354
+ "step": 40,
355
+ "epoch": 0.2711864406779661,
356
+ "cpu_mem": 1.507196928,
357
+ "gpu_mem": 4.62450432,
358
+ "loss": 0.6913,
359
+ "grad_norm": 1.2386014461517334,
360
+ "learning_rate": 0.00029893918270099324
361
+ },
362
+ {
363
+ "step": 41,
364
+ "epoch": 0.27796610169491526,
365
+ "cpu_mem": 1.507393536,
366
+ "gpu_mem": 4.624731648,
367
+ "loss": 0.6724,
368
+ "grad_norm": 0.6577208638191223,
369
+ "learning_rate": 0.00029871672920607153
370
+ },
371
+ {
372
+ "step": 42,
373
+ "epoch": 0.2847457627118644,
374
+ "cpu_mem": 1.507590144,
375
+ "gpu_mem": 4.624628736,
376
+ "loss": 0.6555,
377
+ "grad_norm": 0.25326287746429443,
378
+ "learning_rate": 0.0002984732162821399
379
+ },
380
+ {
381
+ "step": 43,
382
+ "epoch": 0.29152542372881357,
383
+ "cpu_mem": 1.507786752,
384
+ "gpu_mem": 4.62445056,
385
+ "loss": 0.6011,
386
+ "grad_norm": 0.29461658000946045,
387
+ "learning_rate": 0.0002982086784124952
388
+ },
389
+ {
390
+ "step": 44,
391
+ "epoch": 0.2983050847457627,
392
+ "cpu_mem": 1.507786752,
393
+ "gpu_mem": 4.624593408,
394
+ "loss": 0.6203,
395
+ "grad_norm": 0.5558605790138245,
396
+ "learning_rate": 0.00029792315305772796
397
+ },
398
+ {
399
+ "step": 45,
400
+ "epoch": 0.3050847457627119,
401
+ "cpu_mem": 1.507786752,
402
+ "gpu_mem": 4.62437376,
403
+ "loss": 0.8654,
404
+ "grad_norm": 2.026333808898926,
405
+ "learning_rate": 0.0002976166806504174
406
+ },
407
+ {
408
+ "step": 46,
409
+ "epoch": 0.31186440677966104,
410
+ "cpu_mem": 1.50798336,
411
+ "gpu_mem": 4.624616448,
412
+ "loss": 0.6738,
413
+ "grad_norm": 1.057332992553711,
414
+ "learning_rate": 0.00029728930458940595
415
+ },
416
+ {
417
+ "step": 47,
418
+ "epoch": 0.31864406779661014,
419
+ "cpu_mem": 1.508179968,
420
+ "gpu_mem": 4.624339968,
421
+ "loss": 0.5998,
422
+ "grad_norm": 0.4258666932582855,
423
+ "learning_rate": 0.00029694107123365385
424
+ },
425
+ {
426
+ "step": 48,
427
+ "epoch": 0.3254237288135593,
428
+ "cpu_mem": 1.508376576,
429
+ "gpu_mem": 4.624416768,
430
+ "loss": 0.6132,
431
+ "grad_norm": 1.7531728744506836,
432
+ "learning_rate": 0.00029657202989567393
433
+ },
434
+ {
435
+ "step": 49,
436
+ "epoch": 0.33220338983050846,
437
+ "cpu_mem": 1.508573184,
438
+ "gpu_mem": 4.624433664,
439
+ "loss": 0.6978,
440
+ "grad_norm": 0.3271001875400543,
441
+ "learning_rate": 0.00029618223283454893
442
+ },
443
+ {
444
+ "step": 50,
445
+ "epoch": 0.3389830508474576,
446
+ "cpu_mem": 1.508573184,
447
+ "gpu_mem": 4.624372224,
448
+ "loss": 0.6478,
449
+ "grad_norm": 1.044687032699585,
450
+ "learning_rate": 0.00029577173524853123
451
+ },
452
+ {
453
+ "step": 51,
454
+ "epoch": 0.34576271186440677,
455
+ "cpu_mem": 1.508769792,
456
+ "gpu_mem": 4.624376832,
457
+ "loss": 0.6598,
458
+ "grad_norm": 1.3430612087249756,
459
+ "learning_rate": 0.0002953405952672261
460
+ },
461
+ {
462
+ "step": 52,
463
+ "epoch": 0.3525423728813559,
464
+ "cpu_mem": 1.508769792,
465
+ "gpu_mem": 4.624456704,
466
+ "loss": 0.6723,
467
+ "grad_norm": 0.5483831763267517,
468
+ "learning_rate": 0.0002948888739433602
469
+ },
470
+ {
471
+ "step": 53,
472
+ "epoch": 0.3593220338983051,
473
+ "cpu_mem": 1.508769792,
474
+ "gpu_mem": 4.624479744,
475
+ "loss": 0.573,
476
+ "grad_norm": 0.3019401729106903,
477
+ "learning_rate": 0.0002944166352441363
478
+ },
479
+ {
480
+ "step": 54,
481
+ "epoch": 0.36610169491525424,
482
+ "cpu_mem": 1.5089664,
483
+ "gpu_mem": 4.624407552,
484
+ "loss": 0.7986,
485
+ "grad_norm": 1.5273728370666504,
486
+ "learning_rate": 0.0002939239460421746
487
+ },
488
+ {
489
+ "step": 55,
490
+ "epoch": 0.3728813559322034,
491
+ "cpu_mem": 1.5089664,
492
+ "gpu_mem": 4.624677888,
493
+ "loss": 0.6091,
494
+ "grad_norm": 0.4779277741909027,
495
+ "learning_rate": 0.00029341087610604337
496
+ },
497
+ {
498
+ "step": 56,
499
+ "epoch": 0.37966101694915255,
500
+ "cpu_mem": 1.509163008,
501
+ "gpu_mem": 4.624464384,
502
+ "loss": 0.6733,
503
+ "grad_norm": 1.1983107328414917,
504
+ "learning_rate": 0.00029287749809037904
505
+ },
506
+ {
507
+ "step": 57,
508
+ "epoch": 0.3864406779661017,
509
+ "cpu_mem": 1.509163008,
510
+ "gpu_mem": 4.62445824,
511
+ "loss": 0.6289,
512
+ "grad_norm": 1.399798035621643,
513
+ "learning_rate": 0.0002923238875255979
514
+ },
515
+ {
516
+ "step": 58,
517
+ "epoch": 0.39322033898305087,
518
+ "cpu_mem": 1.509163008,
519
+ "gpu_mem": 4.624353792,
520
+ "loss": 0.6225,
521
+ "grad_norm": 1.6279141902923584,
522
+ "learning_rate": 0.00029175012280720024
523
+ },
524
+ {
525
+ "step": 59,
526
+ "epoch": 0.4,
527
+ "cpu_mem": 1.509359616,
528
+ "gpu_mem": 4.624370688,
529
+ "loss": 0.6482,
530
+ "grad_norm": 0.6617344617843628,
531
+ "learning_rate": 0.000291156285184669
532
+ },
533
+ {
534
+ "step": 60,
535
+ "epoch": 0.4067796610169492,
536
+ "cpu_mem": 1.509556224,
537
+ "gpu_mem": 4.624464384,
538
+ "loss": 0.5742,
539
+ "grad_norm": 0.8502650856971741,
540
+ "learning_rate": 0.00029054245874996426
541
+ },
542
+ {
543
+ "step": 61,
544
+ "epoch": 0.4135593220338983,
545
+ "cpu_mem": 1.509556224,
546
+ "gpu_mem": 4.624475136,
547
+ "loss": 0.5745,
548
+ "grad_norm": 0.5171988010406494,
549
+ "learning_rate": 0.0002899087304256151
550
+ },
551
+ {
552
+ "step": 62,
553
+ "epoch": 0.42033898305084744,
554
+ "cpu_mem": 1.509556224,
555
+ "gpu_mem": 4.624462848,
556
+ "loss": 0.8279,
557
+ "grad_norm": 1.8439751863479614,
558
+ "learning_rate": 0.0002892551899524109
559
+ },
560
+ {
561
+ "step": 63,
562
+ "epoch": 0.4271186440677966,
563
+ "cpu_mem": 1.509556224,
564
+ "gpu_mem": 4.624455168,
565
+ "loss": 0.4921,
566
+ "grad_norm": 0.5544669032096863,
567
+ "learning_rate": 0.000288581929876693
568
+ },
569
+ {
570
+ "step": 64,
571
+ "epoch": 0.43389830508474575,
572
+ "cpu_mem": 1.509752832,
573
+ "gpu_mem": 4.624384512,
574
+ "loss": 0.5735,
575
+ "grad_norm": 0.6035580635070801,
576
+ "learning_rate": 0.0002878890455372498
577
+ },
578
+ {
579
+ "step": 65,
580
+ "epoch": 0.4406779661016949,
581
+ "cpu_mem": 1.509752832,
582
+ "gpu_mem": 4.624429056,
583
+ "loss": 0.5902,
584
+ "grad_norm": 0.5334798097610474,
585
+ "learning_rate": 0.0002871766350518159
586
+ },
587
+ {
588
+ "step": 66,
589
+ "epoch": 0.44745762711864406,
590
+ "cpu_mem": 1.50994944,
591
+ "gpu_mem": 4.624622592,
592
+ "loss": 0.6281,
593
+ "grad_norm": 1.107169270515442,
594
+ "learning_rate": 0.00028644479930317775
595
+ },
596
+ {
597
+ "step": 67,
598
+ "epoch": 0.4542372881355932,
599
+ "cpu_mem": 1.510146048,
600
+ "gpu_mem": 4.624332288,
601
+ "loss": 0.6952,
602
+ "grad_norm": 0.9980673789978027,
603
+ "learning_rate": 0.00028569364192488803
604
+ },
605
+ {
606
+ "step": 68,
607
+ "epoch": 0.4610169491525424,
608
+ "cpu_mem": 1.510146048,
609
+ "gpu_mem": 4.624300032,
610
+ "loss": 0.7016,
611
+ "grad_norm": 0.480561226606369,
612
+ "learning_rate": 0.00028492326928659045
613
+ },
614
+ {
615
+ "step": 69,
616
+ "epoch": 0.46779661016949153,
617
+ "cpu_mem": 1.510146048,
618
+ "gpu_mem": 4.62436608,
619
+ "loss": 0.5867,
620
+ "grad_norm": 0.3215237855911255,
621
+ "learning_rate": 0.00028413379047895665
622
+ },
623
+ {
624
+ "step": 70,
625
+ "epoch": 0.4745762711864407,
626
+ "cpu_mem": 1.510146048,
627
+ "gpu_mem": 4.624359936,
628
+ "loss": 0.5704,
629
+ "grad_norm": 0.5735366940498352,
630
+ "learning_rate": 0.0002833253172982385
631
+ },
632
+ {
633
+ "step": 71,
634
+ "epoch": 0.48135593220338985,
635
+ "cpu_mem": 1.510146048,
636
+ "gpu_mem": 4.6245888,
637
+ "loss": 0.5306,
638
+ "grad_norm": 0.38639822602272034,
639
+ "learning_rate": 0.0002824979642304366
640
+ },
641
+ {
642
+ "step": 72,
643
+ "epoch": 0.488135593220339,
644
+ "cpu_mem": 1.510146048,
645
+ "gpu_mem": 4.62458112,
646
+ "loss": 0.566,
647
+ "grad_norm": 0.44350624084472656,
648
+ "learning_rate": 0.0002816518484350883
649
+ },
650
+ {
651
+ "step": 73,
652
+ "epoch": 0.49491525423728816,
653
+ "cpu_mem": 1.510342656,
654
+ "gpu_mem": 4.624547328,
655
+ "loss": 0.6532,
656
+ "grad_norm": 1.009778380393982,
657
+ "learning_rate": 0.0002807870897286772
658
+ },
659
+ {
660
+ "step": 74,
661
+ "epoch": 0.5016949152542373,
662
+ "cpu_mem": 1.510342656,
663
+ "gpu_mem": 4.624407552,
664
+ "loss": 0.569,
665
+ "grad_norm": 0.530914306640625,
666
+ "learning_rate": 0.0002799038105676658
667
+ },
668
+ {
669
+ "step": 75,
670
+ "epoch": 0.5084745762711864,
671
+ "cpu_mem": 1.510342656,
672
+ "gpu_mem": 4.624332288,
673
+ "loss": 0.5485,
674
+ "grad_norm": 1.2175366878509521,
675
+ "learning_rate": 0.000279002136031155
676
+ },
677
+ {
678
+ "step": 76,
679
+ "epoch": 0.5152542372881356,
680
+ "cpu_mem": 1.510342656,
681
+ "gpu_mem": 4.624272384,
682
+ "loss": 0.6296,
683
+ "grad_norm": 1.2137932777404785,
684
+ "learning_rate": 0.00027808219380317216
685
+ },
686
+ {
687
+ "step": 77,
688
+ "epoch": 0.5220338983050847,
689
+ "cpu_mem": 1.510342656,
690
+ "gpu_mem": 4.624346112,
691
+ "loss": 0.5439,
692
+ "grad_norm": 1.0348937511444092,
693
+ "learning_rate": 0.0002771441141545895
694
+ },
695
+ {
696
+ "step": 78,
697
+ "epoch": 0.5288135593220339,
698
+ "cpu_mem": 1.510539264,
699
+ "gpu_mem": 4.624398336,
700
+ "loss": 0.7299,
701
+ "grad_norm": 2.1830639839172363,
702
+ "learning_rate": 0.0002761880299246772
703
+ },
704
+ {
705
+ "step": 79,
706
+ "epoch": 0.535593220338983,
707
+ "cpu_mem": 1.510539264,
708
+ "gpu_mem": 4.624530432,
709
+ "loss": 0.7099,
710
+ "grad_norm": 1.5999380350112915,
711
+ "learning_rate": 0.000275214076502292
712
+ },
713
+ {
714
+ "step": 80,
715
+ "epoch": 0.5423728813559322,
716
+ "cpu_mem": 1.510539264,
717
+ "gpu_mem": 4.624421376,
718
+ "loss": 0.6399,
719
+ "grad_norm": 1.355906367301941,
720
+ "learning_rate": 0.0002742223918067056
721
+ },
722
+ {
723
+ "step": 81,
724
+ "epoch": 0.5491525423728814,
725
+ "cpu_mem": 1.510539264,
726
+ "gpu_mem": 4.624301568,
727
+ "loss": 0.6151,
728
+ "grad_norm": 1.0865662097930908,
729
+ "learning_rate": 0.00027321311626807374
730
+ },
731
+ {
732
+ "step": 82,
733
+ "epoch": 0.5559322033898305,
734
+ "cpu_mem": 1.510539264,
735
+ "gpu_mem": 4.624370688,
736
+ "loss": 0.6127,
737
+ "grad_norm": 0.4206037223339081,
738
+ "learning_rate": 0.0002721863928075503
739
+ },
740
+ {
741
+ "step": 83,
742
+ "epoch": 0.5627118644067797,
743
+ "cpu_mem": 1.510539264,
744
+ "gpu_mem": 4.624470528,
745
+ "loss": 0.6713,
746
+ "grad_norm": 0.580093264579773,
747
+ "learning_rate": 0.000271142366817049
748
+ },
749
+ {
750
+ "step": 84,
751
+ "epoch": 0.5694915254237288,
752
+ "cpu_mem": 1.510539264,
753
+ "gpu_mem": 4.624433664,
754
+ "loss": 0.7334,
755
+ "grad_norm": 1.9020066261291504,
756
+ "learning_rate": 0.00027008118613865406
757
+ },
758
+ {
759
+ "step": 85,
760
+ "epoch": 0.576271186440678,
761
+ "cpu_mem": 1.510539264,
762
+ "gpu_mem": 4.62446592,
763
+ "loss": 0.7021,
764
+ "grad_norm": 1.7552995681762695,
765
+ "learning_rate": 0.00026900300104368524
766
+ },
767
+ {
768
+ "step": 86,
769
+ "epoch": 0.5830508474576271,
770
+ "cpu_mem": 1.510735872,
771
+ "gpu_mem": 4.624416768,
772
+ "loss": 0.6841,
773
+ "grad_norm": 1.2111093997955322,
774
+ "learning_rate": 0.00026790796421141813
775
+ },
776
+ {
777
+ "step": 87,
778
+ "epoch": 0.5898305084745763,
779
+ "cpu_mem": 1.510735872,
780
+ "gpu_mem": 4.624424448,
781
+ "loss": 0.5347,
782
+ "grad_norm": 0.44297388195991516,
783
+ "learning_rate": 0.00026679623070746325
784
+ },
785
+ {
786
+ "step": 88,
787
+ "epoch": 0.5966101694915255,
788
+ "cpu_mem": 1.51093248,
789
+ "gpu_mem": 4.624568832,
790
+ "loss": 0.5532,
791
+ "grad_norm": 0.36787739396095276,
792
+ "learning_rate": 0.0002656679579618081
793
+ },
794
+ {
795
+ "step": 89,
796
+ "epoch": 0.6033898305084746,
797
+ "cpu_mem": 1.51093248,
798
+ "gpu_mem": 4.62435072,
799
+ "loss": 0.6981,
800
+ "grad_norm": 1.192941665649414,
801
+ "learning_rate": 0.0002645233057465235
802
+ },
803
+ {
804
+ "step": 90,
805
+ "epoch": 0.6101694915254238,
806
+ "cpu_mem": 1.51093248,
807
+ "gpu_mem": 4.62440448,
808
+ "loss": 0.5992,
809
+ "grad_norm": 0.9635607004165649,
810
+ "learning_rate": 0.00026336243615313873
811
+ },
812
+ {
813
+ "step": 91,
814
+ "epoch": 0.6169491525423729,
815
+ "cpu_mem": 1.51093248,
816
+ "gpu_mem": 4.624372224,
817
+ "loss": 0.5141,
818
+ "grad_norm": 0.5678079128265381,
819
+ "learning_rate": 0.00026218551356968814
820
+ },
821
+ {
822
+ "step": 92,
823
+ "epoch": 0.6237288135593221,
824
+ "cpu_mem": 1.51093248,
825
+ "gpu_mem": 4.624453632,
826
+ "loss": 0.7089,
827
+ "grad_norm": 1.4068773984909058,
828
+ "learning_rate": 0.00026099270465743254
829
+ },
830
+ {
831
+ "step": 93,
832
+ "epoch": 0.6305084745762712,
833
+ "cpu_mem": 1.51093248,
834
+ "gpu_mem": 4.624257024,
835
+ "loss": 0.7013,
836
+ "grad_norm": 0.5640556216239929,
837
+ "learning_rate": 0.0002597841783272588
838
+ },
839
+ {
840
+ "step": 94,
841
+ "epoch": 0.6372881355932203,
842
+ "cpu_mem": 1.51093248,
843
+ "gpu_mem": 4.624370688,
844
+ "loss": 0.6064,
845
+ "grad_norm": 0.9994253516197205,
846
+ "learning_rate": 0.0002585601057157605
847
+ },
848
+ {
849
+ "step": 95,
850
+ "epoch": 0.6440677966101694,
851
+ "cpu_mem": 1.51093248,
852
+ "gpu_mem": 4.624390656,
853
+ "loss": 0.5994,
854
+ "grad_norm": 0.8690909743309021,
855
+ "learning_rate": 0.00025732066016100394
856
+ },
857
+ {
858
+ "step": 96,
859
+ "epoch": 0.6508474576271186,
860
+ "cpu_mem": 1.51093248,
861
+ "gpu_mem": 4.624429056,
862
+ "loss": 0.5682,
863
+ "grad_norm": 1.0052908658981323,
864
+ "learning_rate": 0.00025606601717798207
865
+ },
866
+ {
867
+ "step": 97,
868
+ "epoch": 0.6576271186440678,
869
+ "cpu_mem": 1.51093248,
870
+ "gpu_mem": 4.624413696,
871
+ "loss": 0.6538,
872
+ "grad_norm": 0.5900957584381104,
873
+ "learning_rate": 0.0002547963544337602
874
+ },
875
+ {
876
+ "step": 98,
877
+ "epoch": 0.6644067796610169,
878
+ "cpu_mem": 1.51093248,
879
+ "gpu_mem": 4.624326144,
880
+ "loss": 0.5676,
881
+ "grad_norm": 0.3594554662704468,
882
+ "learning_rate": 0.0002535118517223168
883
+ },
884
+ {
885
+ "step": 99,
886
+ "epoch": 0.6711864406779661,
887
+ "cpu_mem": 1.51093248,
888
+ "gpu_mem": 4.624275456,
889
+ "loss": 0.5491,
890
+ "grad_norm": 0.4727354347705841,
891
+ "learning_rate": 0.00025221269093908365
892
+ },
893
+ {
894
+ "step": 100,
895
+ "epoch": 0.6779661016949152,
896
+ "cpu_mem": 1.51093248,
897
+ "gpu_mem": 4.624392192,
898
+ "loss": 0.5544,
899
+ "grad_norm": 0.36728915572166443,
900
+ "learning_rate": 0.0002508990560551879
901
+ },
902
+ {
903
+ "step": 101,
904
+ "epoch": 0.6847457627118644,
905
+ "cpu_mem": 1.51093248,
906
+ "gpu_mem": 4.624424448,
907
+ "loss": 0.574,
908
+ "grad_norm": 0.6338084936141968,
909
+ "learning_rate": 0.0002495711330914001
910
+ },
911
+ {
912
+ "step": 102,
913
+ "epoch": 0.6915254237288135,
914
+ "cpu_mem": 1.51093248,
915
+ "gpu_mem": 4.62445824,
916
+ "loss": 0.609,
917
+ "grad_norm": 0.601031482219696,
918
+ "learning_rate": 0.00024822911009179276
919
+ },
920
+ {
921
+ "step": 103,
922
+ "epoch": 0.6983050847457627,
923
+ "cpu_mem": 1.51093248,
924
+ "gpu_mem": 4.624508928,
925
+ "loss": 0.6165,
926
+ "grad_norm": 0.6266305446624756,
927
+ "learning_rate": 0.0002468731770971113
928
+ },
929
+ {
930
+ "step": 104,
931
+ "epoch": 0.7050847457627119,
932
+ "cpu_mem": 1.51093248,
933
+ "gpu_mem": 4.624413696,
934
+ "loss": 0.5317,
935
+ "grad_norm": 0.5269937515258789,
936
+ "learning_rate": 0.0002455035261178632
937
+ },
938
+ {
939
+ "step": 105,
940
+ "epoch": 0.711864406779661,
941
+ "cpu_mem": 1.51093248,
942
+ "gpu_mem": 4.624515072,
943
+ "loss": 0.6078,
944
+ "grad_norm": 0.5082085132598877,
945
+ "learning_rate": 0.0002441203511071278
946
+ },
947
+ {
948
+ "step": 106,
949
+ "epoch": 0.7186440677966102,
950
+ "cpu_mem": 1.51093248,
951
+ "gpu_mem": 4.62446592,
952
+ "loss": 0.6005,
953
+ "grad_norm": 0.8072856664657593,
954
+ "learning_rate": 0.00024272384793309077
955
+ },
956
+ {
957
+ "step": 107,
958
+ "epoch": 0.7254237288135593,
959
+ "cpu_mem": 1.51093248,
960
+ "gpu_mem": 4.624353792,
961
+ "loss": 0.524,
962
+ "grad_norm": 0.751579225063324,
963
+ "learning_rate": 0.00024131421435130807
964
+ },
965
+ {
966
+ "step": 108,
967
+ "epoch": 0.7322033898305085,
968
+ "cpu_mem": 1.51093248,
969
+ "gpu_mem": 4.624538112,
970
+ "loss": 0.5979,
971
+ "grad_norm": 0.6362266540527344,
972
+ "learning_rate": 0.00023989164997670202
973
+ },
974
+ {
975
+ "step": 109,
976
+ "epoch": 0.7389830508474576,
977
+ "cpu_mem": 1.51093248,
978
+ "gpu_mem": 4.624392192,
979
+ "loss": 0.6221,
980
+ "grad_norm": 0.570347011089325,
981
+ "learning_rate": 0.0002384563562552943
982
+ },
983
+ {
984
+ "step": 110,
985
+ "epoch": 0.7457627118644068,
986
+ "cpu_mem": 1.51093248,
987
+ "gpu_mem": 4.624395264,
988
+ "loss": 0.5563,
989
+ "grad_norm": 0.6132338047027588,
990
+ "learning_rate": 0.0002370085364356797
991
+ },
992
+ {
993
+ "step": 111,
994
+ "epoch": 0.752542372881356,
995
+ "cpu_mem": 1.51093248,
996
+ "gpu_mem": 4.624364544,
997
+ "loss": 0.5369,
998
+ "grad_norm": 0.6124526858329773,
999
+ "learning_rate": 0.0002355483955402446
1000
+ },
1001
+ {
1002
+ "step": 112,
1003
+ "epoch": 0.7593220338983051,
1004
+ "cpu_mem": 1.511129088,
1005
+ "gpu_mem": 4.624410624,
1006
+ "loss": 0.4849,
1007
+ "grad_norm": 0.37720710039138794,
1008
+ "learning_rate": 0.00023407614033613407
1009
+ },
1010
+ {
1011
+ "step": 113,
1012
+ "epoch": 0.7661016949152543,
1013
+ "cpu_mem": 1.511129088,
1014
+ "gpu_mem": 4.624401408,
1015
+ "loss": 0.5588,
1016
+ "grad_norm": 0.5072793364524841,
1017
+ "learning_rate": 0.0002325919793059723
1018
+ },
1019
+ {
1020
+ "step": 114,
1021
+ "epoch": 0.7728813559322034,
1022
+ "cpu_mem": 1.511129088,
1023
+ "gpu_mem": 4.624382976,
1024
+ "loss": 0.5045,
1025
+ "grad_norm": 0.44536927342414856,
1026
+ "learning_rate": 0.00023109612261833963
1027
+ },
1028
+ {
1029
+ "step": 115,
1030
+ "epoch": 0.7796610169491526,
1031
+ "cpu_mem": 1.511129088,
1032
+ "gpu_mem": 4.62445824,
1033
+ "loss": 0.5212,
1034
+ "grad_norm": 0.38146018981933594,
1035
+ "learning_rate": 0.0002295887820980112
1036
+ },
1037
+ {
1038
+ "step": 116,
1039
+ "epoch": 0.7864406779661017,
1040
+ "cpu_mem": 1.511129088,
1041
+ "gpu_mem": 4.624378368,
1042
+ "loss": 0.5656,
1043
+ "grad_norm": 0.6866984367370605,
1044
+ "learning_rate": 0.0002280701711959608
1045
+ },
1046
+ {
1047
+ "step": 117,
1048
+ "epoch": 0.7932203389830509,
1049
+ "cpu_mem": 1.511129088,
1050
+ "gpu_mem": 4.624269312,
1051
+ "loss": 0.5171,
1052
+ "grad_norm": 0.45652973651885986,
1053
+ "learning_rate": 0.00022654050495913495
1054
+ },
1055
+ {
1056
+ "step": 118,
1057
+ "epoch": 0.8,
1058
+ "cpu_mem": 1.511129088,
1059
+ "gpu_mem": 4.624507392,
1060
+ "loss": 0.5348,
1061
+ "grad_norm": 0.5435035824775696,
1062
+ "learning_rate": 0.000225
1063
+ },
1064
+ {
1065
+ "step": 119,
1066
+ "epoch": 0.8067796610169492,
1067
+ "cpu_mem": 1.511129088,
1068
+ "gpu_mem": 4.624677888,
1069
+ "loss": 0.4785,
1070
+ "grad_norm": 0.4610842168331146,
1071
+ "learning_rate": 0.00022344887446586865
1072
+ },
1073
+ {
1074
+ "step": 120,
1075
+ "epoch": 0.8135593220338984,
1076
+ "cpu_mem": 1.511129088,
1077
+ "gpu_mem": 4.624410624,
1078
+ "loss": 0.5077,
1079
+ "grad_norm": 0.5721786618232727,
1080
+ "learning_rate": 0.00022188734800800852
1081
+ },
1082
+ {
1083
+ "step": 121,
1084
+ "epoch": 0.8203389830508474,
1085
+ "cpu_mem": 1.511129088,
1086
+ "gpu_mem": 4.624438272,
1087
+ "loss": 0.5366,
1088
+ "grad_norm": 0.4264529049396515,
1089
+ "learning_rate": 0.00022031564175053754
1090
+ },
1091
+ {
1092
+ "step": 122,
1093
+ "epoch": 0.8271186440677966,
1094
+ "cpu_mem": 1.511325696,
1095
+ "gpu_mem": 4.62448896,
1096
+ "loss": 0.4827,
1097
+ "grad_norm": 0.4972001910209656,
1098
+ "learning_rate": 0.00021873397825911153
1099
+ },
1100
+ {
1101
+ "step": 123,
1102
+ "epoch": 0.8338983050847457,
1103
+ "cpu_mem": 1.511325696,
1104
+ "gpu_mem": 4.624298496,
1105
+ "loss": 0.5555,
1106
+ "grad_norm": 1.0725321769714355,
1107
+ "learning_rate": 0.00021714258150940685
1108
+ },
1109
+ {
1110
+ "step": 124,
1111
+ "epoch": 0.8406779661016949,
1112
+ "cpu_mem": 1.511325696,
1113
+ "gpu_mem": 4.624740864,
1114
+ "loss": 0.476,
1115
+ "grad_norm": 0.7154480218887329,
1116
+ "learning_rate": 0.0002155416768554039
1117
+ },
1118
+ {
1119
+ "step": 125,
1120
+ "epoch": 0.847457627118644,
1121
+ "cpu_mem": 1.511325696,
1122
+ "gpu_mem": 4.624467456,
1123
+ "loss": 0.5311,
1124
+ "grad_norm": 0.47886914014816284,
1125
+ "learning_rate": 0.00021393149099747523
1126
+ },
1127
+ {
1128
+ "step": 126,
1129
+ "epoch": 0.8542372881355932,
1130
+ "cpu_mem": 1.511325696,
1131
+ "gpu_mem": 4.62435072,
1132
+ "loss": 0.5198,
1133
+ "grad_norm": 0.5867719650268555,
1134
+ "learning_rate": 0.00021231225195028297
1135
+ },
1136
+ {
1137
+ "step": 127,
1138
+ "epoch": 0.8610169491525423,
1139
+ "cpu_mem": 1.511325696,
1140
+ "gpu_mem": 4.624790016,
1141
+ "loss": 0.5319,
1142
+ "grad_norm": 0.6425331234931946,
1143
+ "learning_rate": 0.00021068418901049025
1144
+ },
1145
+ {
1146
+ "step": 128,
1147
+ "epoch": 0.8677966101694915,
1148
+ "cpu_mem": 1.511325696,
1149
+ "gpu_mem": 4.62456576,
1150
+ "loss": 0.4434,
1151
+ "grad_norm": 0.6586298942565918,
1152
+ "learning_rate": 0.0002090475327242912
1153
+ },
1154
+ {
1155
+ "step": 129,
1156
+ "epoch": 0.8745762711864407,
1157
+ "cpu_mem": 1.511325696,
1158
+ "gpu_mem": 4.624605696,
1159
+ "loss": 0.5039,
1160
+ "grad_norm": 0.5399751663208008,
1161
+ "learning_rate": 0.00020740251485476345
1162
+ },
1163
+ {
1164
+ "step": 130,
1165
+ "epoch": 0.8813559322033898,
1166
+ "cpu_mem": 1.511325696,
1167
+ "gpu_mem": 4.624387584,
1168
+ "loss": 0.6539,
1169
+ "grad_norm": 0.5986157655715942,
1170
+ "learning_rate": 0.0002057493683490491
1171
+ },
1172
+ {
1173
+ "step": 131,
1174
+ "epoch": 0.888135593220339,
1175
+ "cpu_mem": 1.511325696,
1176
+ "gpu_mem": 4.624516608,
1177
+ "loss": 0.4843,
1178
+ "grad_norm": 0.49613156914711,
1179
+ "learning_rate": 0.00020408832730536746
1180
+ },
1181
+ {
1182
+ "step": 132,
1183
+ "epoch": 0.8949152542372881,
1184
+ "cpu_mem": 1.511325696,
1185
+ "gpu_mem": 4.624598016,
1186
+ "loss": 0.4115,
1187
+ "grad_norm": 0.8342369794845581,
1188
+ "learning_rate": 0.00020241962693986476
1189
+ },
1190
+ {
1191
+ "step": 133,
1192
+ "epoch": 0.9016949152542373,
1193
+ "cpu_mem": 1.511325696,
1194
+ "gpu_mem": 4.62438144,
1195
+ "loss": 0.4729,
1196
+ "grad_norm": 0.633710503578186,
1197
+ "learning_rate": 0.0002007435035533061
1198
+ },
1199
+ {
1200
+ "step": 134,
1201
+ "epoch": 0.9084745762711864,
1202
+ "cpu_mem": 1.511325696,
1203
+ "gpu_mem": 4.624515072,
1204
+ "loss": 0.449,
1205
+ "grad_norm": 1.1560707092285156,
1206
+ "learning_rate": 0.00019906019449761325
1207
+ },
1208
+ {
1209
+ "step": 135,
1210
+ "epoch": 0.9152542372881356,
1211
+ "cpu_mem": 1.511325696,
1212
+ "gpu_mem": 4.624538112,
1213
+ "loss": 0.501,
1214
+ "grad_norm": 0.5545433163642883,
1215
+ "learning_rate": 0.00019736993814225374
1216
+ },
1217
+ {
1218
+ "step": 136,
1219
+ "epoch": 0.9220338983050848,
1220
+ "cpu_mem": 1.511325696,
1221
+ "gpu_mem": 4.624375296,
1222
+ "loss": 0.4472,
1223
+ "grad_norm": 0.6953291893005371,
1224
+ "learning_rate": 0.00019567297384048604
1225
+ },
1226
+ {
1227
+ "step": 137,
1228
+ "epoch": 0.9288135593220339,
1229
+ "cpu_mem": 1.511325696,
1230
+ "gpu_mem": 4.624255488,
1231
+ "loss": 0.5768,
1232
+ "grad_norm": 1.2485830783843994,
1233
+ "learning_rate": 0.0001939695418954653
1234
+ },
1235
+ {
1236
+ "step": 138,
1237
+ "epoch": 0.9355932203389831,
1238
+ "cpu_mem": 1.511325696,
1239
+ "gpu_mem": 4.624436736,
1240
+ "loss": 0.5167,
1241
+ "grad_norm": 1.273888349533081,
1242
+ "learning_rate": 0.00019225988352621445
1243
+ },
1244
+ {
1245
+ "step": 139,
1246
+ "epoch": 0.9423728813559322,
1247
+ "cpu_mem": 1.511325696,
1248
+ "gpu_mem": 4.62433536,
1249
+ "loss": 0.4855,
1250
+ "grad_norm": 1.0167266130447388,
1251
+ "learning_rate": 0.00019054424083346592
1252
+ },
1253
+ {
1254
+ "step": 140,
1255
+ "epoch": 0.9491525423728814,
1256
+ "cpu_mem": 1.511325696,
1257
+ "gpu_mem": 4.624387584,
1258
+ "loss": 0.4915,
1259
+ "grad_norm": 0.6369791626930237,
1260
+ "learning_rate": 0.0001888228567653781
1261
+ },
1262
+ {
1263
+ "step": 141,
1264
+ "epoch": 0.9559322033898305,
1265
+ "cpu_mem": 1.511325696,
1266
+ "gpu_mem": 4.62441984,
1267
+ "loss": 0.5621,
1268
+ "grad_norm": 0.7908885478973389,
1269
+ "learning_rate": 0.0001870959750831323
1270
+ },
1271
+ {
1272
+ "step": 142,
1273
+ "epoch": 0.9627118644067797,
1274
+ "cpu_mem": 1.511325696,
1275
+ "gpu_mem": 4.624559616,
1276
+ "loss": 0.5112,
1277
+ "grad_norm": 1.217617154121399,
1278
+ "learning_rate": 0.0001853638403264141
1279
+ },
1280
+ {
1281
+ "step": 143,
1282
+ "epoch": 0.9694915254237289,
1283
+ "cpu_mem": 1.511325696,
1284
+ "gpu_mem": 4.62454272,
1285
+ "loss": 0.6221,
1286
+ "grad_norm": 1.0842041969299316,
1287
+ "learning_rate": 0.00018362669777878453
1288
+ },
1289
+ {
1290
+ "step": 144,
1291
+ "epoch": 0.976271186440678,
1292
+ "cpu_mem": 1.511325696,
1293
+ "gpu_mem": 4.62473472,
1294
+ "loss": 0.504,
1295
+ "grad_norm": 0.8124033212661743,
1296
+ "learning_rate": 0.00018188479343294648
1297
+ },
1298
+ {
1299
+ "step": 145,
1300
+ "epoch": 0.9830508474576272,
1301
+ "cpu_mem": 1.511325696,
1302
+ "gpu_mem": 4.624445952,
1303
+ "loss": 0.512,
1304
+ "grad_norm": 1.1542987823486328,
1305
+ "learning_rate": 0.0001801383739559098
1306
+ },
1307
+ {
1308
+ "step": 146,
1309
+ "epoch": 0.9898305084745763,
1310
+ "cpu_mem": 1.511325696,
1311
+ "gpu_mem": 4.62448128,
1312
+ "loss": 0.4851,
1313
+ "grad_norm": 0.4650062024593353,
1314
+ "learning_rate": 0.0001783876866540615
1315
+ },
1316
+ {
1317
+ "step": 147,
1318
+ "epoch": 0.9966101694915255,
1319
+ "cpu_mem": 1.511325696,
1320
+ "gpu_mem": 4.624379904,
1321
+ "loss": 0.5265,
1322
+ "grad_norm": 0.7359176278114319,
1323
+ "learning_rate": 0.00017663297943814552
1324
+ },
1325
+ {
1326
+ "step": 148,
1327
+ "epoch": 1.0033898305084745,
1328
+ "cpu_mem": 1.511325696,
1329
+ "gpu_mem": 4.676583936,
1330
+ "loss": 0.7821,
1331
+ "grad_norm": 1.846174716949463,
1332
+ "learning_rate": 0.0001748745007881561
1333
+ },
1334
+ {
1335
+ "step": 149,
1336
+ "epoch": 1.0101694915254238,
1337
+ "cpu_mem": 1.511325696,
1338
+ "gpu_mem": 4.676519424,
1339
+ "loss": 0.4444,
1340
+ "grad_norm": 0.9513587951660156,
1341
+ "learning_rate": 0.00017311249971815185
1342
+ },
1343
+ {
1344
+ "step": 150,
1345
+ "epoch": 1.0169491525423728,
1346
+ "cpu_mem": 1.511325696,
1347
+ "gpu_mem": 4.676356608,
1348
+ "loss": 0.4795,
1349
+ "grad_norm": 0.9790411591529846,
1350
+ "learning_rate": 0.00017134722574099276
1351
+ },
1352
+ {
1353
+ "step": 151,
1354
+ "epoch": 1.023728813559322,
1355
+ "cpu_mem": 1.511325696,
1356
+ "gpu_mem": 4.6764288,
1357
+ "loss": 0.6015,
1358
+ "grad_norm": 1.2339156866073608,
1359
+ "learning_rate": 0.00016957892883300775
1360
+ },
1361
+ {
1362
+ "step": 152,
1363
+ "epoch": 1.0305084745762711,
1364
+ "cpu_mem": 1.511325696,
1365
+ "gpu_mem": 4.676464128,
1366
+ "loss": 0.4506,
1367
+ "grad_norm": 0.6777993440628052,
1368
+ "learning_rate": 0.00016780785939859576
1369
+ },
1370
+ {
1371
+ "step": 153,
1372
+ "epoch": 1.0372881355932204,
1373
+ "cpu_mem": 1.511325696,
1374
+ "gpu_mem": 4.676488704,
1375
+ "loss": 0.5975,
1376
+ "grad_norm": 0.7841456532478333,
1377
+ "learning_rate": 0.00016603426823476693
1378
+ },
1379
+ {
1380
+ "step": 154,
1381
+ "epoch": 1.0440677966101695,
1382
+ "cpu_mem": 1.511325696,
1383
+ "gpu_mem": 4.676450304,
1384
+ "loss": 0.4571,
1385
+ "grad_norm": 0.8193861842155457,
1386
+ "learning_rate": 0.00016425840649562736
1387
+ },
1388
+ {
1389
+ "step": 155,
1390
+ "epoch": 1.0508474576271187,
1391
+ "cpu_mem": 1.511325696,
1392
+ "gpu_mem": 4.676671488,
1393
+ "loss": 0.4829,
1394
+ "grad_norm": 0.938800573348999,
1395
+ "learning_rate": 0.00016248052565681436
1396
+ },
1397
+ {
1398
+ "step": 156,
1399
+ "epoch": 1.0576271186440678,
1400
+ "cpu_mem": 1.511325696,
1401
+ "gpu_mem": 4.676579328,
1402
+ "loss": 0.5072,
1403
+ "grad_norm": 1.5463695526123047,
1404
+ "learning_rate": 0.00016070087747988482
1405
+ },
1406
+ {
1407
+ "step": 157,
1408
+ "epoch": 1.064406779661017,
1409
+ "cpu_mem": 1.511325696,
1410
+ "gpu_mem": 4.676485632,
1411
+ "loss": 0.4785,
1412
+ "grad_norm": 0.6637791395187378,
1413
+ "learning_rate": 0.00015891971397666464
1414
+ },
1415
+ {
1416
+ "step": 158,
1417
+ "epoch": 1.071186440677966,
1418
+ "cpu_mem": 1.511325696,
1419
+ "gpu_mem": 4.676411904,
1420
+ "loss": 0.4213,
1421
+ "grad_norm": 0.8457335829734802,
1422
+ "learning_rate": 0.00015713728737356137
1423
+ },
1424
+ {
1425
+ "step": 159,
1426
+ "epoch": 1.0779661016949154,
1427
+ "cpu_mem": 1.511325696,
1428
+ "gpu_mem": 4.676760576,
1429
+ "loss": 0.3644,
1430
+ "grad_norm": 0.5662264823913574,
1431
+ "learning_rate": 0.00015535385007584706
1432
+ },
1433
+ {
1434
+ "step": 160,
1435
+ "epoch": 1.0847457627118644,
1436
+ "cpu_mem": 1.511325696,
1437
+ "gpu_mem": 4.676355072,
1438
+ "loss": 0.4047,
1439
+ "grad_norm": 0.6654481291770935,
1440
+ "learning_rate": 0.0001535696546319161
1441
+ },
1442
+ {
1443
+ "step": 161,
1444
+ "epoch": 1.0915254237288137,
1445
+ "cpu_mem": 1.511325696,
1446
+ "gpu_mem": 4.676301312,
1447
+ "loss": 0.4,
1448
+ "grad_norm": 0.5183922052383423,
1449
+ "learning_rate": 0.00015178495369752213
1450
+ },
1451
+ {
1452
+ "step": 162,
1453
+ "epoch": 1.0983050847457627,
1454
+ "cpu_mem": 1.511325696,
1455
+ "gpu_mem": 4.677076992,
1456
+ "loss": 0.4024,
1457
+ "grad_norm": 0.5879871845245361,
1458
+ "learning_rate": 0.00015
1459
+ },
1460
+ {
1461
+ "step": 163,
1462
+ "epoch": 1.1050847457627118,
1463
+ "cpu_mem": 1.511325696,
1464
+ "gpu_mem": 4.676553216,
1465
+ "loss": 0.537,
1466
+ "grad_norm": 1.4586920738220215,
1467
+ "learning_rate": 0.00014821504630247785
1468
+ },
1469
+ {
1470
+ "step": 164,
1471
+ "epoch": 1.111864406779661,
1472
+ "cpu_mem": 1.511325696,
1473
+ "gpu_mem": 4.676465664,
1474
+ "loss": 0.5474,
1475
+ "grad_norm": 1.072028398513794,
1476
+ "learning_rate": 0.00014643034536808387
1477
+ },
1478
+ {
1479
+ "step": 165,
1480
+ "epoch": 1.11864406779661,
1481
+ "cpu_mem": 1.511325696,
1482
+ "gpu_mem": 4.676414976,
1483
+ "loss": 0.4617,
1484
+ "grad_norm": 1.0854562520980835,
1485
+ "learning_rate": 0.00014464614992415294
1486
+ },
1487
+ {
1488
+ "step": 166,
1489
+ "epoch": 1.1254237288135593,
1490
+ "cpu_mem": 1.511325696,
1491
+ "gpu_mem": 4.676510208,
1492
+ "loss": 0.418,
1493
+ "grad_norm": 1.0139094591140747,
1494
+ "learning_rate": 0.00014286271262643866
1495
+ },
1496
+ {
1497
+ "step": 167,
1498
+ "epoch": 1.1322033898305084,
1499
+ "cpu_mem": 1.511325696,
1500
+ "gpu_mem": 4.676427264,
1501
+ "loss": 0.4613,
1502
+ "grad_norm": 0.7832831144332886,
1503
+ "learning_rate": 0.00014108028602333536
1504
+ },
1505
+ {
1506
+ "step": 168,
1507
+ "epoch": 1.1389830508474577,
1508
+ "cpu_mem": 1.511325696,
1509
+ "gpu_mem": 4.676445696,
1510
+ "loss": 0.4621,
1511
+ "grad_norm": 0.6947904825210571,
1512
+ "learning_rate": 0.00013929912252011516
1513
+ },
1514
+ {
1515
+ "step": 169,
1516
+ "epoch": 1.1457627118644067,
1517
+ "cpu_mem": 1.511325696,
1518
+ "gpu_mem": 4.676533248,
1519
+ "loss": 0.4963,
1520
+ "grad_norm": 1.5868031978607178,
1521
+ "learning_rate": 0.00013751947434318564
1522
+ },
1523
+ {
1524
+ "step": 170,
1525
+ "epoch": 1.152542372881356,
1526
+ "cpu_mem": 1.511325696,
1527
+ "gpu_mem": 4.676418048,
1528
+ "loss": 0.5307,
1529
+ "grad_norm": 1.127265214920044,
1530
+ "learning_rate": 0.00013574159350437261
1531
+ },
1532
+ {
1533
+ "step": 171,
1534
+ "epoch": 1.159322033898305,
1535
+ "cpu_mem": 1.511325696,
1536
+ "gpu_mem": 4.676481024,
1537
+ "loss": 0.5001,
1538
+ "grad_norm": 0.7554064989089966,
1539
+ "learning_rate": 0.0001339657317652331
1540
+ },
1541
+ {
1542
+ "step": 172,
1543
+ "epoch": 1.1661016949152543,
1544
+ "cpu_mem": 1.511325696,
1545
+ "gpu_mem": 4.676388864,
1546
+ "loss": 0.3924,
1547
+ "grad_norm": 0.740827202796936,
1548
+ "learning_rate": 0.00013219214060140424
1549
+ },
1550
+ {
1551
+ "step": 173,
1552
+ "epoch": 1.1728813559322033,
1553
+ "cpu_mem": 1.511325696,
1554
+ "gpu_mem": 4.676688384,
1555
+ "loss": 0.4615,
1556
+ "grad_norm": 0.6514274477958679,
1557
+ "learning_rate": 0.00013042107116699228
1558
+ },
1559
+ {
1560
+ "step": 174,
1561
+ "epoch": 1.1796610169491526,
1562
+ "cpu_mem": 1.511325696,
1563
+ "gpu_mem": 4.676411904,
1564
+ "loss": 0.4797,
1565
+ "grad_norm": 0.7258147597312927,
1566
+ "learning_rate": 0.00012865277425900724
1567
+ },
1568
+ {
1569
+ "step": 175,
1570
+ "epoch": 1.1864406779661016,
1571
+ "cpu_mem": 1.511325696,
1572
+ "gpu_mem": 4.676378112,
1573
+ "loss": 0.4992,
1574
+ "grad_norm": 0.7582775950431824,
1575
+ "learning_rate": 0.00012688750028184818
1576
+ },
1577
+ {
1578
+ "step": 176,
1579
+ "epoch": 1.193220338983051,
1580
+ "cpu_mem": 1.511325696,
1581
+ "gpu_mem": 4.676516352,
1582
+ "loss": 0.3651,
1583
+ "grad_norm": 0.7316360473632812,
1584
+ "learning_rate": 0.0001251254992118439
1585
+ },
1586
+ {
1587
+ "step": 177,
1588
+ "epoch": 1.2,
1589
+ "cpu_mem": 1.511325696,
1590
+ "gpu_mem": 4.676614656,
1591
+ "loss": 0.4881,
1592
+ "grad_norm": 0.7436537146568298,
1593
+ "learning_rate": 0.00012336702056185453
1594
+ },
1595
+ {
1596
+ "step": 178,
1597
+ "epoch": 1.2067796610169492,
1598
+ "cpu_mem": 1.511325696,
1599
+ "gpu_mem": 4.676361216,
1600
+ "loss": 0.4471,
1601
+ "grad_norm": 0.9715722799301147,
1602
+ "learning_rate": 0.00012161231334593851
1603
+ },
1604
+ {
1605
+ "step": 179,
1606
+ "epoch": 1.2135593220338983,
1607
+ "cpu_mem": 1.511325696,
1608
+ "gpu_mem": 4.676461056,
1609
+ "loss": 0.4749,
1610
+ "grad_norm": 0.6974865794181824,
1611
+ "learning_rate": 0.00011986162604409015
1612
+ },
1613
+ {
1614
+ "step": 180,
1615
+ "epoch": 1.2203389830508475,
1616
+ "cpu_mem": 1.511325696,
1617
+ "gpu_mem": 4.676433408,
1618
+ "loss": 0.481,
1619
+ "grad_norm": 1.586224913597107,
1620
+ "learning_rate": 0.00011811520656705348
1621
+ },
1622
+ {
1623
+ "step": 181,
1624
+ "epoch": 1.2271186440677966,
1625
+ "cpu_mem": 1.511325696,
1626
+ "gpu_mem": 4.676370432,
1627
+ "loss": 0.3847,
1628
+ "grad_norm": 0.6357300877571106,
1629
+ "learning_rate": 0.00011637330222121543
1630
+ },
1631
+ {
1632
+ "step": 182,
1633
+ "epoch": 1.2338983050847459,
1634
+ "cpu_mem": 1.511325696,
1635
+ "gpu_mem": 4.676588544,
1636
+ "loss": 0.4714,
1637
+ "grad_norm": 1.1362266540527344,
1638
+ "learning_rate": 0.00011463615967358588
1639
+ },
1640
+ {
1641
+ "step": 183,
1642
+ "epoch": 1.240677966101695,
1643
+ "cpu_mem": 1.511325696,
1644
+ "gpu_mem": 4.676485632,
1645
+ "loss": 0.4662,
1646
+ "grad_norm": 0.8635218143463135,
1647
+ "learning_rate": 0.00011290402491686766
1648
+ },
1649
+ {
1650
+ "step": 184,
1651
+ "epoch": 1.2474576271186442,
1652
+ "cpu_mem": 1.511325696,
1653
+ "gpu_mem": 4.676433408,
1654
+ "loss": 0.4476,
1655
+ "grad_norm": 0.8836385607719421,
1656
+ "learning_rate": 0.00011117714323462186
1657
+ },
1658
+ {
1659
+ "step": 185,
1660
+ "epoch": 1.2542372881355932,
1661
+ "cpu_mem": 1.511325696,
1662
+ "gpu_mem": 4.676411904,
1663
+ "loss": 0.4304,
1664
+ "grad_norm": 0.7257511615753174,
1665
+ "learning_rate": 0.00010945575916653407
1666
+ },
1667
+ {
1668
+ "step": 186,
1669
+ "epoch": 1.2610169491525425,
1670
+ "cpu_mem": 1.511325696,
1671
+ "gpu_mem": 4.67642112,
1672
+ "loss": 0.3666,
1673
+ "grad_norm": 0.6456829905509949,
1674
+ "learning_rate": 0.00010774011647378553
1675
+ },
1676
+ {
1677
+ "step": 187,
1678
+ "epoch": 1.2677966101694915,
1679
+ "cpu_mem": 1.511325696,
1680
+ "gpu_mem": 4.676353536,
1681
+ "loss": 0.5261,
1682
+ "grad_norm": 0.7594860792160034,
1683
+ "learning_rate": 0.00010603045810453468
1684
+ },
1685
+ {
1686
+ "step": 188,
1687
+ "epoch": 1.2745762711864406,
1688
+ "cpu_mem": 1.511325696,
1689
+ "gpu_mem": 4.676516352,
1690
+ "loss": 0.3928,
1691
+ "grad_norm": 0.6978077292442322,
1692
+ "learning_rate": 0.00010432702615951396
1693
+ },
1694
+ {
1695
+ "step": 189,
1696
+ "epoch": 1.2813559322033898,
1697
+ "cpu_mem": 1.511325696,
1698
+ "gpu_mem": 4.676385792,
1699
+ "loss": 0.5247,
1700
+ "grad_norm": 0.7300601601600647,
1701
+ "learning_rate": 0.00010263006185774627
1702
+ },
1703
+ {
1704
+ "step": 190,
1705
+ "epoch": 1.288135593220339,
1706
+ "cpu_mem": 1.511325696,
1707
+ "gpu_mem": 4.6765056,
1708
+ "loss": 0.4131,
1709
+ "grad_norm": 0.7177391648292542,
1710
+ "learning_rate": 0.00010093980550238675
1711
+ },
1712
+ {
1713
+ "step": 191,
1714
+ "epoch": 1.2949152542372881,
1715
+ "cpu_mem": 1.511325696,
1716
+ "gpu_mem": 4.676324352,
1717
+ "loss": 0.3503,
1718
+ "grad_norm": 0.8216970562934875,
1719
+ "learning_rate": 9.925649644669391e-05
1720
+ },
1721
+ {
1722
+ "step": 192,
1723
+ "epoch": 1.3016949152542372,
1724
+ "cpu_mem": 1.511325696,
1725
+ "gpu_mem": 4.676456448,
1726
+ "loss": 0.3357,
1727
+ "grad_norm": 0.9710990786552429,
1728
+ "learning_rate": 9.758037306013526e-05
1729
+ },
1730
+ {
1731
+ "step": 193,
1732
+ "epoch": 1.3084745762711865,
1733
+ "cpu_mem": 1.511325696,
1734
+ "gpu_mem": 4.676430336,
1735
+ "loss": 0.4152,
1736
+ "grad_norm": 0.6677502393722534,
1737
+ "learning_rate": 9.591167269463255e-05
1738
+ },
1739
+ {
1740
+ "step": 194,
1741
+ "epoch": 1.3152542372881357,
1742
+ "cpu_mem": 1.511325696,
1743
+ "gpu_mem": 4.676396544,
1744
+ "loss": 0.3694,
1745
+ "grad_norm": 0.6760179400444031,
1746
+ "learning_rate": 9.425063165095088e-05
1747
+ },
1748
+ {
1749
+ "step": 195,
1750
+ "epoch": 1.3220338983050848,
1751
+ "cpu_mem": 1.511325696,
1752
+ "gpu_mem": 4.676500992,
1753
+ "loss": 0.3101,
1754
+ "grad_norm": 0.7037206888198853,
1755
+ "learning_rate": 9.259748514523653e-05
1756
+ },
1757
+ {
1758
+ "step": 196,
1759
+ "epoch": 1.3288135593220338,
1760
+ "cpu_mem": 1.511325696,
1761
+ "gpu_mem": 4.676496384,
1762
+ "loss": 0.4392,
1763
+ "grad_norm": 0.7807204723358154,
1764
+ "learning_rate": 9.095246727570879e-05
1765
+ },
1766
+ {
1767
+ "step": 197,
1768
+ "epoch": 1.335593220338983,
1769
+ "cpu_mem": 1.511325696,
1770
+ "gpu_mem": 4.676355072,
1771
+ "loss": 0.4013,
1772
+ "grad_norm": 1.1068989038467407,
1773
+ "learning_rate": 8.931581098950973e-05
1774
+ },
1775
+ {
1776
+ "step": 198,
1777
+ "epoch": 1.3423728813559321,
1778
+ "cpu_mem": 1.511325696,
1779
+ "gpu_mem": 4.676547072,
1780
+ "loss": 0.4122,
1781
+ "grad_norm": 0.7010363340377808,
1782
+ "learning_rate": 8.768774804971705e-05
1783
+ },
1784
+ {
1785
+ "step": 199,
1786
+ "epoch": 1.3491525423728814,
1787
+ "cpu_mem": 1.511325696,
1788
+ "gpu_mem": 4.67639808,
1789
+ "loss": 0.4062,
1790
+ "grad_norm": 0.7628048658370972,
1791
+ "learning_rate": 8.606850900252478e-05
1792
+ },
1793
+ {
1794
+ "step": 200,
1795
+ "epoch": 1.3559322033898304,
1796
+ "cpu_mem": 1.511325696,
1797
+ "gpu_mem": 4.676500992,
1798
+ "loss": 0.3826,
1799
+ "grad_norm": 0.933638334274292,
1800
+ "learning_rate": 8.445832314459608e-05
1801
+ },
1802
+ {
1803
+ "step": 201,
1804
+ "epoch": 1.3627118644067797,
1805
+ "cpu_mem": 1.511325696,
1806
+ "gpu_mem": 4.676703744,
1807
+ "loss": 0.3256,
1808
+ "grad_norm": 0.9295195937156677,
1809
+ "learning_rate": 8.285741849059311e-05
1810
+ },
1811
+ {
1812
+ "step": 202,
1813
+ "epoch": 1.3694915254237288,
1814
+ "cpu_mem": 1.511325696,
1815
+ "gpu_mem": 4.6765056,
1816
+ "loss": 0.4441,
1817
+ "grad_norm": 0.8123700022697449,
1818
+ "learning_rate": 8.126602174088843e-05
1819
+ },
1820
+ {
1821
+ "step": 203,
1822
+ "epoch": 1.376271186440678,
1823
+ "cpu_mem": 1.511325696,
1824
+ "gpu_mem": 4.676391936,
1825
+ "loss": 0.4034,
1826
+ "grad_norm": 0.8742663860321045,
1827
+ "learning_rate": 7.968435824946242e-05
1828
+ },
1829
+ {
1830
+ "step": 204,
1831
+ "epoch": 1.383050847457627,
1832
+ "cpu_mem": 1.511325696,
1833
+ "gpu_mem": 4.67640576,
1834
+ "loss": 0.3757,
1835
+ "grad_norm": 0.953644335269928,
1836
+ "learning_rate": 7.811265199199152e-05
1837
+ },
1838
+ {
1839
+ "step": 205,
1840
+ "epoch": 1.3898305084745763,
1841
+ "cpu_mem": 1.511325696,
1842
+ "gpu_mem": 4.676450304,
1843
+ "loss": 0.4845,
1844
+ "grad_norm": 1.084182620048523,
1845
+ "learning_rate": 7.655112553413135e-05
1846
+ },
1847
+ {
1848
+ "step": 206,
1849
+ "epoch": 1.3966101694915254,
1850
+ "cpu_mem": 1.511325696,
1851
+ "gpu_mem": 4.676391936,
1852
+ "loss": 0.342,
1853
+ "grad_norm": 0.9180456399917603,
1854
+ "learning_rate": 7.500000000000002e-05
1855
+ },
1856
+ {
1857
+ "step": 207,
1858
+ "epoch": 1.4033898305084747,
1859
+ "cpu_mem": 1.511325696,
1860
+ "gpu_mem": 4.676625408,
1861
+ "loss": 0.4528,
1862
+ "grad_norm": 1.341009497642517,
1863
+ "learning_rate": 7.345949504086507e-05
1864
+ },
1865
+ {
1866
+ "step": 208,
1867
+ "epoch": 1.4101694915254237,
1868
+ "cpu_mem": 1.511325696,
1869
+ "gpu_mem": 4.676656128,
1870
+ "loss": 0.4201,
1871
+ "grad_norm": 1.7378958463668823,
1872
+ "learning_rate": 7.192982880403917e-05
1873
+ },
1874
+ {
1875
+ "step": 209,
1876
+ "epoch": 1.4169491525423727,
1877
+ "cpu_mem": 1.511325696,
1878
+ "gpu_mem": 4.6765824,
1879
+ "loss": 0.4907,
1880
+ "grad_norm": 0.8371671438217163,
1881
+ "learning_rate": 7.041121790198881e-05
1882
+ },
1883
+ {
1884
+ "step": 210,
1885
+ "epoch": 1.423728813559322,
1886
+ "cpu_mem": 1.511325696,
1887
+ "gpu_mem": 4.676470272,
1888
+ "loss": 0.4671,
1889
+ "grad_norm": 1.0010132789611816,
1890
+ "learning_rate": 6.890387738166041e-05
1891
+ },
1892
+ {
1893
+ "step": 211,
1894
+ "epoch": 1.4305084745762713,
1895
+ "cpu_mem": 1.511325696,
1896
+ "gpu_mem": 4.676419584,
1897
+ "loss": 0.3792,
1898
+ "grad_norm": 1.223456621170044,
1899
+ "learning_rate": 6.740802069402771e-05
1900
+ },
1901
+ {
1902
+ "step": 212,
1903
+ "epoch": 1.4372881355932203,
1904
+ "cpu_mem": 1.511325696,
1905
+ "gpu_mem": 4.676388864,
1906
+ "loss": 0.4226,
1907
+ "grad_norm": 0.8292936086654663,
1908
+ "learning_rate": 6.592385966386588e-05
1909
+ },
1910
+ {
1911
+ "step": 213,
1912
+ "epoch": 1.4440677966101694,
1913
+ "cpu_mem": 1.511325696,
1914
+ "gpu_mem": 4.676411904,
1915
+ "loss": 0.57,
1916
+ "grad_norm": 1.3903371095657349,
1917
+ "learning_rate": 6.445160445975536e-05
1918
+ },
1919
+ {
1920
+ "step": 214,
1921
+ "epoch": 1.4508474576271186,
1922
+ "cpu_mem": 1.511325696,
1923
+ "gpu_mem": 4.676494848,
1924
+ "loss": 0.3324,
1925
+ "grad_norm": 0.8603353500366211,
1926
+ "learning_rate": 6.299146356432029e-05
1927
+ },
1928
+ {
1929
+ "step": 215,
1930
+ "epoch": 1.457627118644068,
1931
+ "cpu_mem": 1.511325696,
1932
+ "gpu_mem": 4.676422656,
1933
+ "loss": 0.5364,
1934
+ "grad_norm": 1.7915467023849487,
1935
+ "learning_rate": 6.154364374470568e-05
1936
+ },
1937
+ {
1938
+ "step": 216,
1939
+ "epoch": 1.464406779661017,
1940
+ "cpu_mem": 1.511325696,
1941
+ "gpu_mem": 4.676588544,
1942
+ "loss": 0.4089,
1943
+ "grad_norm": 0.8353825807571411,
1944
+ "learning_rate": 6.010835002329795e-05
1945
+ },
1946
+ {
1947
+ "step": 217,
1948
+ "epoch": 1.471186440677966,
1949
+ "cpu_mem": 1.511325696,
1950
+ "gpu_mem": 4.676430336,
1951
+ "loss": 0.4557,
1952
+ "grad_norm": 1.2738498449325562,
1953
+ "learning_rate": 5.8685785648691894e-05
1954
+ },
1955
+ {
1956
+ "step": 218,
1957
+ "epoch": 1.4779661016949153,
1958
+ "cpu_mem": 1.511325696,
1959
+ "gpu_mem": 4.676407296,
1960
+ "loss": 0.4089,
1961
+ "grad_norm": 0.9500507116317749,
1962
+ "learning_rate": 5.72761520669092e-05
1963
+ },
1964
+ {
1965
+ "step": 219,
1966
+ "epoch": 1.4847457627118645,
1967
+ "cpu_mem": 1.511325696,
1968
+ "gpu_mem": 4.676533248,
1969
+ "loss": 0.4562,
1970
+ "grad_norm": 0.8485879898071289,
1971
+ "learning_rate": 5.587964889287218e-05
1972
+ },
1973
+ {
1974
+ "step": 220,
1975
+ "epoch": 1.4915254237288136,
1976
+ "cpu_mem": 1.511325696,
1977
+ "gpu_mem": 4.67656704,
1978
+ "loss": 0.4296,
1979
+ "grad_norm": 0.8201168179512024,
1980
+ "learning_rate": 5.449647388213678e-05
1981
+ },
1982
+ {
1983
+ "step": 221,
1984
+ "epoch": 1.4983050847457626,
1985
+ "cpu_mem": 1.511325696,
1986
+ "gpu_mem": 4.676434944,
1987
+ "loss": 0.4364,
1988
+ "grad_norm": 0.8443863391876221,
1989
+ "learning_rate": 5.312682290288869e-05
1990
+ },
1991
+ {
1992
+ "step": 222,
1993
+ "epoch": 1.505084745762712,
1994
+ "cpu_mem": 1.511325696,
1995
+ "gpu_mem": 4.676571648,
1996
+ "loss": 0.4266,
1997
+ "grad_norm": 0.8384501338005066,
1998
+ "learning_rate": 5.1770889908207245e-05
1999
+ },
2000
+ {
2001
+ "step": 223,
2002
+ "epoch": 1.5118644067796612,
2003
+ "cpu_mem": 1.511325696,
2004
+ "gpu_mem": 4.676485632,
2005
+ "loss": 0.3531,
2006
+ "grad_norm": 1.1277318000793457,
2007
+ "learning_rate": 5.0428866908599864e-05
2008
+ },
2009
+ {
2010
+ "step": 224,
2011
+ "epoch": 1.5186440677966102,
2012
+ "cpu_mem": 1.511325696,
2013
+ "gpu_mem": 4.676450304,
2014
+ "loss": 0.3186,
2015
+ "grad_norm": 0.6930432319641113,
2016
+ "learning_rate": 4.9100943944812114e-05
2017
+ },
2018
+ {
2019
+ "step": 225,
2020
+ "epoch": 1.5254237288135593,
2021
+ "cpu_mem": 1.511325696,
2022
+ "gpu_mem": 4.676414976,
2023
+ "loss": 0.3449,
2024
+ "grad_norm": 0.739639401435852,
2025
+ "learning_rate": 4.778730906091632e-05
2026
+ },
2027
+ {
2028
+ "step": 226,
2029
+ "epoch": 1.5322033898305085,
2030
+ "cpu_mem": 1.511325696,
2031
+ "gpu_mem": 4.676563968,
2032
+ "loss": 0.4041,
2033
+ "grad_norm": 0.9897132515907288,
2034
+ "learning_rate": 4.648814827768322e-05
2035
+ },
2036
+ {
2037
+ "step": 227,
2038
+ "epoch": 1.5389830508474578,
2039
+ "cpu_mem": 1.511325696,
2040
+ "gpu_mem": 4.676453376,
2041
+ "loss": 0.4365,
2042
+ "grad_norm": 0.8921437859535217,
2043
+ "learning_rate": 4.5203645566239816e-05
2044
+ },
2045
+ {
2046
+ "step": 228,
2047
+ "epoch": 1.5457627118644068,
2048
+ "cpu_mem": 1.511325696,
2049
+ "gpu_mem": 4.67639808,
2050
+ "loss": 0.4615,
2051
+ "grad_norm": 0.6736249327659607,
2052
+ "learning_rate": 4.3933982822017876e-05
2053
+ },
2054
+ {
2055
+ "step": 229,
2056
+ "epoch": 1.5525423728813559,
2057
+ "cpu_mem": 1.511325696,
2058
+ "gpu_mem": 4.676339712,
2059
+ "loss": 0.3326,
2060
+ "grad_norm": 0.6789183020591736,
2061
+ "learning_rate": 4.267933983899601e-05
2062
+ },
2063
+ {
2064
+ "step": 230,
2065
+ "epoch": 1.559322033898305,
2066
+ "cpu_mem": 1.511325696,
2067
+ "gpu_mem": 4.676396544,
2068
+ "loss": 0.3678,
2069
+ "grad_norm": 0.7079548835754395,
2070
+ "learning_rate": 4.143989428423947e-05
2071
+ },
2072
+ {
2073
+ "step": 231,
2074
+ "epoch": 1.5661016949152542,
2075
+ "cpu_mem": 1.511325696,
2076
+ "gpu_mem": 4.67667456,
2077
+ "loss": 0.3894,
2078
+ "grad_norm": 0.9309094548225403,
2079
+ "learning_rate": 4.0215821672741213e-05
2080
+ },
2081
+ {
2082
+ "step": 232,
2083
+ "epoch": 1.5728813559322035,
2084
+ "cpu_mem": 1.511325696,
2085
+ "gpu_mem": 4.67639808,
2086
+ "loss": 0.4916,
2087
+ "grad_norm": 1.011915922164917,
2088
+ "learning_rate": 3.900729534256745e-05
2089
+ },
2090
+ {
2091
+ "step": 233,
2092
+ "epoch": 1.5796610169491525,
2093
+ "cpu_mem": 1.511325696,
2094
+ "gpu_mem": 4.676711424,
2095
+ "loss": 0.3778,
2096
+ "grad_norm": 0.9273942112922668,
2097
+ "learning_rate": 3.781448643031187e-05
2098
+ },
2099
+ {
2100
+ "step": 234,
2101
+ "epoch": 1.5864406779661016,
2102
+ "cpu_mem": 1.511325696,
2103
+ "gpu_mem": 4.676587008,
2104
+ "loss": 0.408,
2105
+ "grad_norm": 1.0228288173675537,
2106
+ "learning_rate": 3.663756384686127e-05
2107
+ },
2108
+ {
2109
+ "step": 235,
2110
+ "epoch": 1.5932203389830508,
2111
+ "cpu_mem": 1.511325696,
2112
+ "gpu_mem": 4.676342784,
2113
+ "loss": 0.3153,
2114
+ "grad_norm": 0.8042024970054626,
2115
+ "learning_rate": 3.547669425347647e-05
2116
+ },
2117
+ {
2118
+ "step": 236,
2119
+ "epoch": 1.6,
2120
+ "cpu_mem": 1.511325696,
2121
+ "gpu_mem": 4.676402688,
2122
+ "loss": 0.4871,
2123
+ "grad_norm": 1.011894702911377,
2124
+ "learning_rate": 3.433204203819185e-05
2125
+ },
2126
+ {
2127
+ "step": 237,
2128
+ "epoch": 1.6067796610169491,
2129
+ "cpu_mem": 1.511325696,
2130
+ "gpu_mem": 4.676464128,
2131
+ "loss": 0.4899,
2132
+ "grad_norm": 1.044426441192627,
2133
+ "learning_rate": 3.3203769292536764e-05
2134
+ },
2135
+ {
2136
+ "step": 238,
2137
+ "epoch": 1.6135593220338982,
2138
+ "cpu_mem": 1.511325696,
2139
+ "gpu_mem": 4.676465664,
2140
+ "loss": 0.4463,
2141
+ "grad_norm": 1.1736929416656494,
2142
+ "learning_rate": 3.209203578858191e-05
2143
+ },
2144
+ {
2145
+ "step": 239,
2146
+ "epoch": 1.6203389830508474,
2147
+ "cpu_mem": 1.511325696,
2148
+ "gpu_mem": 4.676719104,
2149
+ "loss": 0.4268,
2150
+ "grad_norm": 0.8223713040351868,
2151
+ "learning_rate": 3.099699895631474e-05
2152
+ },
2153
+ {
2154
+ "step": 240,
2155
+ "epoch": 1.6271186440677967,
2156
+ "cpu_mem": 1.511325696,
2157
+ "gpu_mem": 4.676368896,
2158
+ "loss": 0.5236,
2159
+ "grad_norm": 0.9299846887588501,
2160
+ "learning_rate": 2.9918813861345952e-05
2161
+ },
2162
+ {
2163
+ "step": 241,
2164
+ "epoch": 1.6338983050847458,
2165
+ "cpu_mem": 1.511325696,
2166
+ "gpu_mem": 4.676665344,
2167
+ "loss": 0.4318,
2168
+ "grad_norm": 0.9577910900115967,
2169
+ "learning_rate": 2.885763318295102e-05
2170
+ },
2171
+ {
2172
+ "step": 242,
2173
+ "epoch": 1.6406779661016948,
2174
+ "cpu_mem": 1.511325696,
2175
+ "gpu_mem": 4.676527104,
2176
+ "loss": 0.4158,
2177
+ "grad_norm": 0.8543640971183777,
2178
+ "learning_rate": 2.781360719244964e-05
2179
+ },
2180
+ {
2181
+ "step": 243,
2182
+ "epoch": 1.647457627118644,
2183
+ "cpu_mem": 1.511325696,
2184
+ "gpu_mem": 4.676379648,
2185
+ "loss": 0.567,
2186
+ "grad_norm": 1.039746642112732,
2187
+ "learning_rate": 2.6786883731926306e-05
2188
+ },
2189
+ {
2190
+ "step": 244,
2191
+ "epoch": 1.6542372881355933,
2192
+ "cpu_mem": 1.511325696,
2193
+ "gpu_mem": 4.676519424,
2194
+ "loss": 0.342,
2195
+ "grad_norm": 0.99267578125,
2196
+ "learning_rate": 2.5777608193294396e-05
2197
+ },
2198
+ {
2199
+ "step": 245,
2200
+ "epoch": 1.6610169491525424,
2201
+ "cpu_mem": 1.511325696,
2202
+ "gpu_mem": 4.67639808,
2203
+ "loss": 0.4102,
2204
+ "grad_norm": 0.6880216002464294,
2205
+ "learning_rate": 2.4785923497707956e-05
2206
+ },
2207
+ {
2208
+ "step": 246,
2209
+ "epoch": 1.6677966101694914,
2210
+ "cpu_mem": 1.511325696,
2211
+ "gpu_mem": 4.676491776,
2212
+ "loss": 0.4772,
2213
+ "grad_norm": 0.7552581429481506,
2214
+ "learning_rate": 2.38119700753228e-05
2215
+ },
2216
+ {
2217
+ "step": 247,
2218
+ "epoch": 1.6745762711864407,
2219
+ "cpu_mem": 1.511325696,
2220
+ "gpu_mem": 4.676510208,
2221
+ "loss": 0.3039,
2222
+ "grad_norm": 0.5652927756309509,
2223
+ "learning_rate": 2.285588584541047e-05
2224
+ },
2225
+ {
2226
+ "step": 248,
2227
+ "epoch": 1.68135593220339,
2228
+ "cpu_mem": 1.511325696,
2229
+ "gpu_mem": 4.676462592,
2230
+ "loss": 0.3855,
2231
+ "grad_norm": 0.6893491744995117,
2232
+ "learning_rate": 2.1917806196827792e-05
2233
+ },
2234
+ {
2235
+ "step": 249,
2236
+ "epoch": 1.688135593220339,
2237
+ "cpu_mem": 1.511325696,
2238
+ "gpu_mem": 4.676368896,
2239
+ "loss": 0.2917,
2240
+ "grad_norm": 0.860058069229126,
2241
+ "learning_rate": 2.0997863968844914e-05
2242
+ },
2243
+ {
2244
+ "step": 250,
2245
+ "epoch": 1.694915254237288,
2246
+ "cpu_mem": 1.511325696,
2247
+ "gpu_mem": 4.676461056,
2248
+ "loss": 0.4473,
2249
+ "grad_norm": 0.9282390475273132,
2250
+ "learning_rate": 2.009618943233419e-05
2251
+ },
2252
+ {
2253
+ "step": 251,
2254
+ "epoch": 1.7016949152542373,
2255
+ "cpu_mem": 1.511325696,
2256
+ "gpu_mem": 4.676373504,
2257
+ "loss": 0.2981,
2258
+ "grad_norm": 0.6758578419685364,
2259
+ "learning_rate": 1.921291027132278e-05
2260
+ },
2261
+ {
2262
+ "step": 252,
2263
+ "epoch": 1.7084745762711866,
2264
+ "cpu_mem": 1.511325696,
2265
+ "gpu_mem": 4.676416512,
2266
+ "loss": 0.45,
2267
+ "grad_norm": 0.8535842299461365,
2268
+ "learning_rate": 1.834815156491165e-05
2269
+ },
2270
+ {
2271
+ "step": 253,
2272
+ "epoch": 1.7152542372881356,
2273
+ "cpu_mem": 1.511325696,
2274
+ "gpu_mem": 4.676610048,
2275
+ "loss": 0.4595,
2276
+ "grad_norm": 0.8331772089004517,
2277
+ "learning_rate": 1.750203576956341e-05
2278
+ },
2279
+ {
2280
+ "step": 254,
2281
+ "epoch": 1.7220338983050847,
2282
+ "cpu_mem": 1.511325696,
2283
+ "gpu_mem": 4.67640576,
2284
+ "loss": 0.3657,
2285
+ "grad_norm": 0.8118672370910645,
2286
+ "learning_rate": 1.6674682701761493e-05
2287
+ },
2288
+ {
2289
+ "step": 255,
2290
+ "epoch": 1.7288135593220337,
2291
+ "cpu_mem": 1.511325696,
2292
+ "gpu_mem": 4.676562432,
2293
+ "loss": 0.5165,
2294
+ "grad_norm": 1.3498203754425049,
2295
+ "learning_rate": 1.5866209521043304e-05
2296
+ },
2297
+ {
2298
+ "step": 256,
2299
+ "epoch": 1.735593220338983,
2300
+ "cpu_mem": 1.511325696,
2301
+ "gpu_mem": 4.676388864,
2302
+ "loss": 0.3667,
2303
+ "grad_norm": 0.6675718426704407,
2304
+ "learning_rate": 1.5076730713409523e-05
2305
+ },
2306
+ {
2307
+ "step": 257,
2308
+ "epoch": 1.7423728813559323,
2309
+ "cpu_mem": 1.511325696,
2310
+ "gpu_mem": 4.676802048,
2311
+ "loss": 0.47,
2312
+ "grad_norm": 0.7657194137573242,
2313
+ "learning_rate": 1.4306358075111923e-05
2314
+ },
2315
+ {
2316
+ "step": 258,
2317
+ "epoch": 1.7491525423728813,
2318
+ "cpu_mem": 1.511325696,
2319
+ "gpu_mem": 4.676461056,
2320
+ "loss": 0.4638,
2321
+ "grad_norm": 1.069642186164856,
2322
+ "learning_rate": 1.3555200696822232e-05
2323
+ },
2324
+ {
2325
+ "step": 259,
2326
+ "epoch": 1.7559322033898304,
2327
+ "cpu_mem": 1.511325696,
2328
+ "gpu_mem": 4.676378112,
2329
+ "loss": 0.3957,
2330
+ "grad_norm": 1.1308850049972534,
2331
+ "learning_rate": 1.2823364948184095e-05
2332
+ },
2333
+ {
2334
+ "step": 260,
2335
+ "epoch": 1.7627118644067796,
2336
+ "cpu_mem": 1.511325696,
2337
+ "gpu_mem": 4.676494848,
2338
+ "loss": 0.3246,
2339
+ "grad_norm": 0.8047335147857666,
2340
+ "learning_rate": 1.2110954462750166e-05
2341
+ },
2342
+ {
2343
+ "step": 261,
2344
+ "epoch": 1.769491525423729,
2345
+ "cpu_mem": 1.511325696,
2346
+ "gpu_mem": 4.676450304,
2347
+ "loss": 0.2867,
2348
+ "grad_norm": 0.6239379644393921,
2349
+ "learning_rate": 1.1418070123306989e-05
2350
+ },
2351
+ {
2352
+ "step": 262,
2353
+ "epoch": 1.776271186440678,
2354
+ "cpu_mem": 1.511325696,
2355
+ "gpu_mem": 4.676407296,
2356
+ "loss": 0.3148,
2357
+ "grad_norm": 0.6769895553588867,
2358
+ "learning_rate": 1.0744810047589115e-05
2359
+ },
2360
+ {
2361
+ "step": 263,
2362
+ "epoch": 1.783050847457627,
2363
+ "cpu_mem": 1.511325696,
2364
+ "gpu_mem": 4.67644416,
2365
+ "loss": 0.3841,
2366
+ "grad_norm": 0.7769913077354431,
2367
+ "learning_rate": 1.0091269574384874e-05
2368
+ },
2369
+ {
2370
+ "step": 264,
2371
+ "epoch": 1.7898305084745763,
2372
+ "cpu_mem": 1.511325696,
2373
+ "gpu_mem": 4.676531712,
2374
+ "loss": 0.3774,
2375
+ "grad_norm": 0.8806053400039673,
2376
+ "learning_rate": 9.45754125003576e-06
2377
+ },
2378
+ {
2379
+ "step": 265,
2380
+ "epoch": 1.7966101694915255,
2381
+ "cpu_mem": 1.511325696,
2382
+ "gpu_mem": 4.676450304,
2383
+ "loss": 0.4358,
2384
+ "grad_norm": 0.8880800008773804,
2385
+ "learning_rate": 8.843714815330987e-06
2386
+ },
2387
+ {
2388
+ "step": 266,
2389
+ "epoch": 1.8033898305084746,
2390
+ "cpu_mem": 1.511325696,
2391
+ "gpu_mem": 4.676665344,
2392
+ "loss": 0.4605,
2393
+ "grad_norm": 0.7958164811134338,
2394
+ "learning_rate": 8.249877192799731e-06
2395
+ },
2396
+ {
2397
+ "step": 267,
2398
+ "epoch": 1.8101694915254236,
2399
+ "cpu_mem": 1.511325696,
2400
+ "gpu_mem": 4.676457984,
2401
+ "loss": 0.4532,
2402
+ "grad_norm": 1.444252371788025,
2403
+ "learning_rate": 7.676112474402068e-06
2404
+ },
2405
+ {
2406
+ "step": 268,
2407
+ "epoch": 1.8169491525423729,
2408
+ "cpu_mem": 1.511325696,
2409
+ "gpu_mem": 4.676462592,
2410
+ "loss": 0.3734,
2411
+ "grad_norm": 0.9235910773277283,
2412
+ "learning_rate": 7.122501909620926e-06
2413
+ },
2414
+ {
2415
+ "step": 269,
2416
+ "epoch": 1.8237288135593221,
2417
+ "cpu_mem": 1.511325696,
2418
+ "gpu_mem": 4.676473344,
2419
+ "loss": 0.4529,
2420
+ "grad_norm": 1.036499261856079,
2421
+ "learning_rate": 6.5891238939566275e-06
2422
+ },
2423
+ {
2424
+ "step": 270,
2425
+ "epoch": 1.8305084745762712,
2426
+ "cpu_mem": 1.511325696,
2427
+ "gpu_mem": 4.676511744,
2428
+ "loss": 0.4235,
2429
+ "grad_norm": 1.073111653327942,
2430
+ "learning_rate": 6.076053957825411e-06
2431
+ },
2432
+ {
2433
+ "step": 271,
2434
+ "epoch": 1.8372881355932202,
2435
+ "cpu_mem": 1.511325696,
2436
+ "gpu_mem": 4.676563968,
2437
+ "loss": 0.4308,
2438
+ "grad_norm": 1.0276501178741455,
2439
+ "learning_rate": 5.583364755863701e-06
2440
+ },
2441
+ {
2442
+ "step": 272,
2443
+ "epoch": 1.8440677966101695,
2444
+ "cpu_mem": 1.511325696,
2445
+ "gpu_mem": 4.676422656,
2446
+ "loss": 0.4068,
2447
+ "grad_norm": 0.6079270839691162,
2448
+ "learning_rate": 5.11112605663977e-06
2449
+ },
2450
+ {
2451
+ "step": 273,
2452
+ "epoch": 1.8508474576271188,
2453
+ "cpu_mem": 1.511325696,
2454
+ "gpu_mem": 4.676302848,
2455
+ "loss": 0.414,
2456
+ "grad_norm": 0.7414558529853821,
2457
+ "learning_rate": 4.659404732773908e-06
2458
+ },
2459
+ {
2460
+ "step": 274,
2461
+ "epoch": 1.8576271186440678,
2462
+ "cpu_mem": 1.511325696,
2463
+ "gpu_mem": 4.676530176,
2464
+ "loss": 0.3892,
2465
+ "grad_norm": 0.9337151050567627,
2466
+ "learning_rate": 4.228264751468752e-06
2467
+ },
2468
+ {
2469
+ "step": 275,
2470
+ "epoch": 1.8644067796610169,
2471
+ "cpu_mem": 1.511325696,
2472
+ "gpu_mem": 4.6767744,
2473
+ "loss": 0.4114,
2474
+ "grad_norm": 1.0385849475860596,
2475
+ "learning_rate": 3.817767165451041e-06
2476
+ },
2477
+ {
2478
+ "step": 276,
2479
+ "epoch": 1.8711864406779661,
2480
+ "cpu_mem": 1.511325696,
2481
+ "gpu_mem": 4.676434944,
2482
+ "loss": 0.42,
2483
+ "grad_norm": 0.8132132291793823,
2484
+ "learning_rate": 3.4279701043260886e-06
2485
+ },
2486
+ {
2487
+ "step": 277,
2488
+ "epoch": 1.8779661016949154,
2489
+ "cpu_mem": 1.511325696,
2490
+ "gpu_mem": 4.676381184,
2491
+ "loss": 0.4333,
2492
+ "grad_norm": 0.9133860468864441,
2493
+ "learning_rate": 3.0589287663461472e-06
2494
+ },
2495
+ {
2496
+ "step": 278,
2497
+ "epoch": 1.8847457627118644,
2498
+ "cpu_mem": 1.511325696,
2499
+ "gpu_mem": 4.676544,
2500
+ "loss": 0.5146,
2501
+ "grad_norm": 0.8138352036476135,
2502
+ "learning_rate": 2.710695410593994e-06
2503
+ },
2504
+ {
2505
+ "step": 279,
2506
+ "epoch": 1.8915254237288135,
2507
+ "cpu_mem": 1.511325696,
2508
+ "gpu_mem": 4.676484096,
2509
+ "loss": 0.4438,
2510
+ "grad_norm": 0.8574395775794983,
2511
+ "learning_rate": 2.3833193495825853e-06
2512
+ },
2513
+ {
2514
+ "step": 280,
2515
+ "epoch": 1.8983050847457628,
2516
+ "cpu_mem": 1.511325696,
2517
+ "gpu_mem": 4.676464128,
2518
+ "loss": 0.4087,
2519
+ "grad_norm": 0.8839820027351379,
2520
+ "learning_rate": 2.076846942272026e-06
2521
+ },
2522
+ {
2523
+ "step": 281,
2524
+ "epoch": 1.905084745762712,
2525
+ "cpu_mem": 1.511325696,
2526
+ "gpu_mem": 4.676399616,
2527
+ "loss": 0.4411,
2528
+ "grad_norm": 0.9226896166801453,
2529
+ "learning_rate": 1.791321587504768e-06
2530
+ },
2531
+ {
2532
+ "step": 282,
2533
+ "epoch": 1.911864406779661,
2534
+ "cpu_mem": 1.511325696,
2535
+ "gpu_mem": 4.67682816,
2536
+ "loss": 0.3647,
2537
+ "grad_norm": 1.2175434827804565,
2538
+ "learning_rate": 1.5267837178600972e-06
2539
+ },
2540
+ {
2541
+ "step": 283,
2542
+ "epoch": 1.9186440677966101,
2543
+ "cpu_mem": 1.511325696,
2544
+ "gpu_mem": 4.676534784,
2545
+ "loss": 0.3445,
2546
+ "grad_norm": 0.8091426491737366,
2547
+ "learning_rate": 1.2832707939284427e-06
2548
+ },
2549
+ {
2550
+ "step": 284,
2551
+ "epoch": 1.9254237288135592,
2552
+ "cpu_mem": 1.511325696,
2553
+ "gpu_mem": 4.6763904,
2554
+ "loss": 0.3911,
2555
+ "grad_norm": 0.6109596490859985,
2556
+ "learning_rate": 1.0608172990067553e-06
2557
+ },
2558
+ {
2559
+ "step": 285,
2560
+ "epoch": 1.9322033898305084,
2561
+ "cpu_mem": 1.511325696,
2562
+ "gpu_mem": 4.67644416,
2563
+ "loss": 0.4041,
2564
+ "grad_norm": 0.8254313468933105,
2565
+ "learning_rate": 8.594547342153979e-07
2566
+ },
2567
+ {
2568
+ "step": 286,
2569
+ "epoch": 1.9389830508474577,
2570
+ "cpu_mem": 1.511325696,
2571
+ "gpu_mem": 4.676861952,
2572
+ "loss": 0.3275,
2573
+ "grad_norm": 0.6968205571174622,
2574
+ "learning_rate": 6.792116140373116e-07
2575
+ },
2576
+ {
2577
+ "step": 287,
2578
+ "epoch": 1.9457627118644067,
2579
+ "cpu_mem": 1.511325696,
2580
+ "gpu_mem": 4.676631552,
2581
+ "loss": 0.3366,
2582
+ "grad_norm": 0.6965417861938477,
2583
+ "learning_rate": 5.201134622801473e-07
2584
+ },
2585
+ {
2586
+ "step": 288,
2587
+ "epoch": 1.9525423728813558,
2588
+ "cpu_mem": 1.511325696,
2589
+ "gpu_mem": 4.676416512,
2590
+ "loss": 0.5175,
2591
+ "grad_norm": 1.0540348291397095,
2592
+ "learning_rate": 3.821828084619727e-07
2593
+ },
2594
+ {
2595
+ "step": 289,
2596
+ "epoch": 1.959322033898305,
2597
+ "cpu_mem": 1.511325696,
2598
+ "gpu_mem": 4.676500992,
2599
+ "loss": 0.4007,
2600
+ "grad_norm": 0.7388603091239929,
2601
+ "learning_rate": 2.654391846207915e-07
2602
+ },
2603
+ {
2604
+ "step": 290,
2605
+ "epoch": 1.9661016949152543,
2606
+ "cpu_mem": 1.511325696,
2607
+ "gpu_mem": 4.676425728,
2608
+ "loss": 0.4716,
2609
+ "grad_norm": 1.206581473350525,
2610
+ "learning_rate": 1.6989912254880556e-07
2611
+ },
2612
+ {
2613
+ "step": 291,
2614
+ "epoch": 1.9728813559322034,
2615
+ "cpu_mem": 1.511325696,
2616
+ "gpu_mem": 4.676461056,
2617
+ "loss": 0.4941,
2618
+ "grad_norm": 0.9969897270202637,
2619
+ "learning_rate": 9.557615145123765e-08
2620
+ },
2621
+ {
2622
+ "step": 292,
2623
+ "epoch": 1.9796610169491524,
2624
+ "cpu_mem": 1.511325696,
2625
+ "gpu_mem": 4.676544,
2626
+ "loss": 0.4308,
2627
+ "grad_norm": 0.7124324440956116,
2628
+ "learning_rate": 4.248079603064724e-08
2629
+ },
2630
+ {
2631
+ "step": 293,
2632
+ "epoch": 1.9864406779661017,
2633
+ "cpu_mem": 1.511325696,
2634
+ "gpu_mem": 4.676461056,
2635
+ "loss": 0.4941,
2636
+ "grad_norm": 1.315065622329712,
2637
+ "learning_rate": 1.0620574996372811e-08
2638
+ },
2639
+ {
2640
+ "step": 294,
2641
+ "epoch": 1.993220338983051,
2642
+ "cpu_mem": 1.511325696,
2643
+ "gpu_mem": 4.676487168,
2644
+ "loss": 0.4635,
2645
+ "grad_norm": 0.8283355832099915,
2646
+ "learning_rate": 0.0
2647
+ },
2648
+ {
2649
+ "step": 294,
2650
+ "epoch": 1.993220338983051,
2651
+ "cpu_mem": 1.511325696,
2652
+ "gpu_mem": 4.676487168,
2653
+ "train_runtime": 4573.7736,
2654
+ "train_samples_per_second": 4.122,
2655
+ "train_steps_per_second": 0.064,
2656
+ "total_flos": 0.0,
2657
+ "train_loss": 0.7646797082456601
2658
+ }
2659
+ ]
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 8,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "BOOLQ",
5
+ "dataset_id": "google/boolq",
6
+ "preprocess_id": "boolq_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 3548160
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 2,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-boolq-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-boolq-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T16:07:01.748900"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r2-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 4,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 2,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r2-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "hellaswag",
3
+ "results": 0.7999402509460267
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r2-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "HELLASWAG",
5
+ "dataset_id": "Rowan/hellaswag",
6
+ "preprocess_id": "hellaswag_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 2,
11
+ "alpha": 4,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 1182720
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 1,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-hellaswag-r2-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r2-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T13:39:06.562417"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r2-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 8,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "hellaswag",
3
+ "results": 0.8323043218482374
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "HELLASWAG",
5
+ "dataset_id": "Rowan/hellaswag",
6
+ "preprocess_id": "hellaswag_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 3548160
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 1,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-hellaswag-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T20:40:36.678485"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-hellaswag-r8-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r32-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 64,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 32,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r32-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "logiqa",
3
+ "results": 0.29394753150175584
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r32-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "LOGIQA",
5
+ "dataset_id": "data/logiqa_train",
6
+ "preprocess_id": "logiqa_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 32,
11
+ "alpha": 64,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 13009920
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 3,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-logiqa-r32-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r32-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-09-01T00:29:31.648142"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r32-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff
 
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r8-a2/adapter_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha": 16,
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "TinyLlama/TinyLlama_v1.1",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": false,
8
+ "init_weight": "kaiming",
9
+ "layers_pattern": null,
10
+ "layers_to_transform": null,
11
+ "metric_tracking": false,
12
+ "modules_to_save": null,
13
+ "peft_type": "ABLATION",
14
+ "r": 8,
15
+ "revision": null,
16
+ "seed": 42,
17
+ "share_weights": false,
18
+ "target_modules": [
19
+ "up_proj",
20
+ "gate_proj",
21
+ "v_proj",
22
+ "k_proj",
23
+ "q_proj",
24
+ "down_proj",
25
+ "o_proj"
26
+ ],
27
+ "task_type": null,
28
+ "track_n": 100,
29
+ "variant": "B"
30
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r8-a2/eval_results.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "task": "logiqa",
3
+ "results": 0.3850444123115059
4
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r8-a2/training_configuration.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_id": "TinyLlama/TinyLlama_v1.1",
3
+ "dataset": {
4
+ "name": "LOGIQA",
5
+ "dataset_id": "data/logiqa_train",
6
+ "preprocess_id": "logiqa_train_deepeval"
7
+ },
8
+ "peft_config": {
9
+ "method": "abl_B",
10
+ "rank": 8,
11
+ "alpha": 16,
12
+ "dropout": 0.0,
13
+ "bias": "none",
14
+ "target_modules": [
15
+ "q_proj",
16
+ "k_proj",
17
+ "v_proj",
18
+ "o_proj",
19
+ "gate_proj",
20
+ "down_proj",
21
+ "up_proj"
22
+ ],
23
+ "trainable_parameter_count": 3548160
24
+ },
25
+ "training_config": {
26
+ "max_dataset_length": null,
27
+ "batch_size": 64,
28
+ "per_device_batch_size": 32,
29
+ "gradient_accumulation_steps": 2,
30
+ "learning_rate": 0.0003,
31
+ "num_epochs": 3,
32
+ "warmup_ratio": 0.1
33
+ },
34
+ "model_name": "TinyLlama_v1.1-abl_B-logiqa-r8-a2",
35
+ "output_dir": "./experiment_results/TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r8-a2",
36
+ "seed": 42,
37
+ "timestamp": "2025-08-31T17:26:03.159670"
38
+ }
TinyLlama_v1.1-abl_B/TinyLlama_v1.1-abl_B-logiqa-r8-a2/training_logs.json ADDED
The diff for this file is too large to render. See raw diff