Khamad committed on
Commit
338104f
·
verified ·
1 Parent(s): e042e2c

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +6 -0
  2. adapter_config.json +1 -1
  3. adapter_model.safetensors +1 -1
  4. amiya_training_config.json +30 -0
  5. checkpoint-1000/adapter_config.json +1 -1
  6. checkpoint-1000/adapter_model.safetensors +1 -1
  7. checkpoint-1000/optimizer.pt +1 -1
  8. checkpoint-1000/rng_state.pth +1 -1
  9. checkpoint-1000/scheduler.pt +1 -1
  10. checkpoint-1000/trainer_state.json +103 -103
  11. checkpoint-1000/training_args.bin +1 -1
  12. checkpoint-1500/adapter_config.json +1 -1
  13. checkpoint-1500/adapter_model.safetensors +1 -1
  14. checkpoint-1500/optimizer.pt +1 -1
  15. checkpoint-1500/rng_state.pth +1 -1
  16. checkpoint-1500/scheduler.pt +1 -1
  17. checkpoint-1500/trainer_state.json +153 -153
  18. checkpoint-1500/training_args.bin +1 -1
  19. checkpoint-2000/adapter_config.json +1 -1
  20. checkpoint-2000/adapter_model.safetensors +1 -1
  21. checkpoint-2000/optimizer.pt +1 -1
  22. checkpoint-2000/rng_state.pth +1 -1
  23. checkpoint-2000/scheduler.pt +1 -1
  24. checkpoint-2000/trainer_state.json +203 -203
  25. checkpoint-2000/training_args.bin +1 -1
  26. checkpoint-2500/adapter_config.json +1 -1
  27. checkpoint-2500/adapter_model.safetensors +1 -1
  28. checkpoint-2500/optimizer.pt +1 -1
  29. checkpoint-2500/rng_state.pth +1 -1
  30. checkpoint-2500/scaler.pt +1 -1
  31. checkpoint-2500/scheduler.pt +1 -1
  32. checkpoint-2500/trainer_state.json +253 -253
  33. checkpoint-2500/training_args.bin +1 -1
  34. checkpoint-3000/adapter_config.json +1 -1
  35. checkpoint-3000/adapter_model.safetensors +1 -1
  36. checkpoint-3000/optimizer.pt +1 -1
  37. checkpoint-3000/rng_state.pth +1 -1
  38. checkpoint-3000/scaler.pt +1 -1
  39. checkpoint-3000/scheduler.pt +1 -1
  40. checkpoint-3000/trainer_state.json +303 -303
  41. checkpoint-3000/training_args.bin +1 -1
  42. checkpoint-3500/adapter_config.json +1 -1
  43. checkpoint-3500/adapter_model.safetensors +1 -1
  44. checkpoint-3500/optimizer.pt +1 -1
  45. checkpoint-3500/rng_state.pth +1 -1
  46. checkpoint-3500/scaler.pt +1 -1
  47. checkpoint-3500/scheduler.pt +1 -1
  48. checkpoint-3500/trainer_state.json +354 -354
  49. checkpoint-3500/training_args.bin +1 -1
  50. checkpoint-4000/adapter_config.json +1 -1
.gitattributes CHANGED
@@ -43,3 +43,9 @@ checkpoint-4000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
43
  checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
  checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
43
  checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
44
  checkpoint-500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
45
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
46
+ checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
47
+ checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
48
+ checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
49
+ checkpoint-6500/tokenizer.json filter=lfs diff=lfs merge=lfs -text
50
+ checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
51
+ checkpoint-7242/tokenizer.json filter=lfs diff=lfs merge=lfs -text
adapter_config.json CHANGED
@@ -29,9 +29,9 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "o_proj",
33
  "v_proj",
34
- "q_proj",
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "q_proj",
33
  "o_proj",
34
  "v_proj",
 
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8737189ec50534340f940487b7bbcfbb3c0341cdc991f458aa11988b0dcf614e
3
  size 54560368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ed806adeae688d7c41407f6645cccc7ce2b13d73c5c283a964e550db5cccdfd
3
  size 54560368
amiya_training_config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "task": "AMIYA - Palestinian Dialect Generation & Translation",
3
+ "base_model_id": "meta-llama/Meta-Llama-3.1-8B-Instruct",
4
+ "model_name": "llama3.1-8b-amiya-palestinian",
5
+ "lora_config": {
6
+ "r": 16,
7
+ "alpha": 32,
8
+ "dropout": 0.1,
9
+ "target_modules": [
10
+ "q_proj",
11
+ "k_proj",
12
+ "v_proj",
13
+ "o_proj"
14
+ ]
15
+ },
16
+ "training_config": {
17
+ "learning_rate": 0.0002,
18
+ "batch_size": 4,
19
+ "gradient_accumulation_steps": 4,
20
+ "num_epochs": 3,
21
+ "max_seq_length": 512
22
+ },
23
+ "data_info": {
24
+ "train_examples": 38610,
25
+ "val_examples": 4826,
26
+ "task_distribution": {
27
+ "generation": 38610
28
+ }
29
+ }
30
+ }
checkpoint-1000/adapter_config.json CHANGED
@@ -29,9 +29,9 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "o_proj",
33
  "v_proj",
34
- "q_proj",
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "q_proj",
33
  "o_proj",
34
  "v_proj",
 
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
checkpoint-1000/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a337408970398c2e9a24e688bf7ae27f447fa36418d8f264d28e4a21f2f49314
3
  size 54560368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0e9afa212c03f271afa6d36c899544f890afc05a2c223d980daf1a6e15ef57c
3
  size 54560368
checkpoint-1000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:836a20d29ed19417deb4b6ed2fc4b4569861de82c4c55fabfe49fbe87f5fb08d
3
  size 109267450
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d5f323c02c1896e68421c3a31c11a7088016245c869c50a4426821b3cd7a3b19
3
  size 109267450
checkpoint-1000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fd8c7fc2d07824f068e75323719839356ff5fdee8fb7889a50120d59de9dba54
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0848c22229788451a8855f4ad6b26100cfddc951d37153298ef3edaa793e835b
3
  size 14244
checkpoint-1000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d1e1088243bd7a7c628a47a6bf4ac054b65997c9a7e139b848ea5fe3e7d04eb2
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1136245b007779a968f37d1aeab3ab161c76720f4fca73eee284d9fc931f26e
3
  size 1064
checkpoint-1000/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_global_step": 1000,
3
- "best_metric": 0.5663638710975647,
4
- "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-1000",
5
- "epoch": 0.6666666666666666,
6
  "eval_steps": 250,
7
  "global_step": 1000,
8
  "is_hyper_param_search": false,
@@ -10,180 +10,180 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.03333333333333333,
14
- "grad_norm": 0.5346225500106812,
15
  "learning_rate": 9.8e-05,
16
- "loss": 2.4955,
17
  "step": 50
18
  },
19
  {
20
- "epoch": 0.06666666666666667,
21
- "grad_norm": 0.719093918800354,
22
  "learning_rate": 0.00019800000000000002,
23
- "loss": 0.71,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.1,
28
- "grad_norm": 0.4840560853481293,
29
- "learning_rate": 0.0001977727272727273,
30
- "loss": 0.6405,
31
  "step": 150
32
  },
33
  {
34
- "epoch": 0.13333333333333333,
35
- "grad_norm": 0.3332301676273346,
36
- "learning_rate": 0.0001955,
37
- "loss": 0.6287,
38
  "step": 200
39
  },
40
  {
41
- "epoch": 0.16666666666666666,
42
- "grad_norm": 0.40639588236808777,
43
- "learning_rate": 0.00019322727272727276,
44
- "loss": 0.5572,
45
  "step": 250
46
  },
47
  {
48
- "epoch": 0.16666666666666666,
49
- "eval_loss": 0.5975945591926575,
50
- "eval_runtime": 80.8004,
51
- "eval_samples_per_second": 37.129,
52
- "eval_steps_per_second": 9.282,
53
  "step": 250
54
  },
55
  {
56
- "epoch": 0.2,
57
- "grad_norm": 0.3970712423324585,
58
- "learning_rate": 0.00019095454545454545,
59
- "loss": 0.6165,
60
  "step": 300
61
  },
62
  {
63
- "epoch": 0.23333333333333334,
64
- "grad_norm": 0.38409528136253357,
65
- "learning_rate": 0.00018868181818181817,
66
- "loss": 0.639,
67
  "step": 350
68
  },
69
  {
70
- "epoch": 0.26666666666666666,
71
- "grad_norm": 0.44628769159317017,
72
- "learning_rate": 0.00018640909090909092,
73
- "loss": 0.636,
74
  "step": 400
75
  },
76
  {
77
- "epoch": 0.3,
78
- "grad_norm": 0.3697021007537842,
79
- "learning_rate": 0.00018413636363636364,
80
- "loss": 0.6192,
81
  "step": 450
82
  },
83
  {
84
- "epoch": 0.3333333333333333,
85
- "grad_norm": 0.36338189244270325,
86
- "learning_rate": 0.00018186363636363636,
87
- "loss": 0.6134,
88
  "step": 500
89
  },
90
  {
91
- "epoch": 0.3333333333333333,
92
- "eval_loss": 0.5813060998916626,
93
- "eval_runtime": 80.7819,
94
- "eval_samples_per_second": 37.137,
95
- "eval_steps_per_second": 9.284,
96
  "step": 500
97
  },
98
  {
99
- "epoch": 0.36666666666666664,
100
- "grad_norm": 0.35211533308029175,
101
- "learning_rate": 0.0001795909090909091,
102
- "loss": 0.6128,
103
  "step": 550
104
  },
105
  {
106
- "epoch": 0.4,
107
- "grad_norm": 0.36327463388442993,
108
- "learning_rate": 0.00017731818181818183,
109
- "loss": 0.5915,
110
  "step": 600
111
  },
112
  {
113
- "epoch": 0.43333333333333335,
114
- "grad_norm": 0.40672942996025085,
115
- "learning_rate": 0.00017504545454545455,
116
- "loss": 0.5807,
117
  "step": 650
118
  },
119
  {
120
- "epoch": 0.4666666666666667,
121
- "grad_norm": 0.4689007103443146,
122
- "learning_rate": 0.00017277272727272728,
123
- "loss": 0.602,
124
  "step": 700
125
  },
126
  {
127
- "epoch": 0.5,
128
- "grad_norm": 0.3979697823524475,
129
- "learning_rate": 0.00017050000000000002,
130
- "loss": 0.5703,
131
  "step": 750
132
  },
133
  {
134
- "epoch": 0.5,
135
- "eval_loss": 0.5740106701850891,
136
- "eval_runtime": 80.8209,
137
- "eval_samples_per_second": 37.119,
138
- "eval_steps_per_second": 9.28,
139
  "step": 750
140
  },
141
  {
142
- "epoch": 0.5333333333333333,
143
- "grad_norm": 0.3071135878562927,
144
- "learning_rate": 0.00016822727272727275,
145
- "loss": 0.5746,
146
  "step": 800
147
  },
148
  {
149
- "epoch": 0.5666666666666667,
150
- "grad_norm": 0.318085253238678,
151
- "learning_rate": 0.00016595454545454544,
152
- "loss": 0.5873,
153
  "step": 850
154
  },
155
  {
156
- "epoch": 0.6,
157
- "grad_norm": 0.35915374755859375,
158
- "learning_rate": 0.0001636818181818182,
159
- "loss": 0.6283,
160
  "step": 900
161
  },
162
  {
163
- "epoch": 0.6333333333333333,
164
- "grad_norm": 0.3174057602882385,
165
- "learning_rate": 0.0001614090909090909,
166
- "loss": 0.5912,
167
  "step": 950
168
  },
169
  {
170
- "epoch": 0.6666666666666666,
171
- "grad_norm": 0.416111022233963,
172
- "learning_rate": 0.00015913636363636363,
173
- "loss": 0.5647,
174
  "step": 1000
175
  },
176
  {
177
- "epoch": 0.6666666666666666,
178
- "eval_loss": 0.5663638710975647,
179
- "eval_runtime": 80.8183,
180
- "eval_samples_per_second": 37.12,
181
- "eval_steps_per_second": 9.28,
182
  "step": 1000
183
  }
184
  ],
185
  "logging_steps": 50,
186
- "max_steps": 4500,
187
  "num_input_tokens_seen": 0,
188
  "num_train_epochs": 3,
189
  "save_steps": 500,
@@ -199,7 +199,7 @@
199
  "attributes": {}
200
  }
201
  },
202
- "total_flos": 1.2237370421673984e+17,
203
  "train_batch_size": 4,
204
  "trial_name": null,
205
  "trial_params": null
 
1
  {
2
  "best_global_step": 1000,
3
+ "best_metric": 0.7030432820320129,
4
+ "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-1000",
5
+ "epoch": 0.4143789495493629,
6
  "eval_steps": 250,
7
  "global_step": 1000,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.020718947477468146,
14
+ "grad_norm": 1.0589393377304077,
15
  "learning_rate": 9.8e-05,
16
+ "loss": 2.6567,
17
  "step": 50
18
  },
19
  {
20
+ "epoch": 0.04143789495493629,
21
+ "grad_norm": 0.9738045334815979,
22
  "learning_rate": 0.00019800000000000002,
23
+ "loss": 0.9502,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.062156842432404436,
28
+ "grad_norm": 0.8801347017288208,
29
+ "learning_rate": 0.00019862783534024082,
30
+ "loss": 0.8496,
31
  "step": 150
32
  },
33
  {
34
+ "epoch": 0.08287578990987259,
35
+ "grad_norm": 0.7272312045097351,
36
+ "learning_rate": 0.00019722766732007841,
37
+ "loss": 0.8184,
38
  "step": 200
39
  },
40
  {
41
+ "epoch": 0.10359473738734072,
42
+ "grad_norm": 0.7850629091262817,
43
+ "learning_rate": 0.000195827499299916,
44
+ "loss": 0.8392,
45
  "step": 250
46
  },
47
  {
48
+ "epoch": 0.10359473738734072,
49
+ "eval_loss": 0.7402811050415039,
50
+ "eval_runtime": 85.9367,
51
+ "eval_samples_per_second": 56.158,
52
+ "eval_steps_per_second": 14.045,
53
  "step": 250
54
  },
55
  {
56
+ "epoch": 0.12431368486480887,
57
+ "grad_norm": 0.40629276633262634,
58
+ "learning_rate": 0.00019442733127975358,
59
+ "loss": 0.8108,
60
  "step": 300
61
  },
62
  {
63
+ "epoch": 0.145032632342277,
64
+ "grad_norm": 0.5258236527442932,
65
+ "learning_rate": 0.00019302716325959117,
66
+ "loss": 0.8116,
67
  "step": 350
68
  },
69
  {
70
+ "epoch": 0.16575157981974517,
71
+ "grad_norm": 0.6879925727844238,
72
+ "learning_rate": 0.00019162699523942874,
73
+ "loss": 0.9089,
74
  "step": 400
75
  },
76
  {
77
+ "epoch": 0.1864705272972133,
78
+ "grad_norm": 0.7583937048912048,
79
+ "learning_rate": 0.00019022682721926633,
80
+ "loss": 0.874,
81
  "step": 450
82
  },
83
  {
84
+ "epoch": 0.20718947477468144,
85
+ "grad_norm": 0.6399120688438416,
86
+ "learning_rate": 0.0001888266591991039,
87
+ "loss": 0.8366,
88
  "step": 500
89
  },
90
  {
91
+ "epoch": 0.20718947477468144,
92
+ "eval_loss": 0.7194066047668457,
93
+ "eval_runtime": 86.2811,
94
+ "eval_samples_per_second": 55.933,
95
+ "eval_steps_per_second": 13.989,
96
  "step": 500
97
  },
98
  {
99
+ "epoch": 0.22790842225214958,
100
+ "grad_norm": 0.7763131856918335,
101
+ "learning_rate": 0.0001874264911789415,
102
+ "loss": 0.7912,
103
  "step": 550
104
  },
105
  {
106
+ "epoch": 0.24862736972961774,
107
+ "grad_norm": 0.6845299601554871,
108
+ "learning_rate": 0.00018602632315877906,
109
+ "loss": 0.8506,
110
  "step": 600
111
  },
112
  {
113
+ "epoch": 0.2693463172070859,
114
+ "grad_norm": 0.8045451045036316,
115
+ "learning_rate": 0.00018462615513861665,
116
+ "loss": 0.763,
117
  "step": 650
118
  },
119
  {
120
+ "epoch": 0.290065264684554,
121
+ "grad_norm": 0.7035927176475525,
122
+ "learning_rate": 0.00018322598711845422,
123
+ "loss": 0.7769,
124
  "step": 700
125
  },
126
  {
127
+ "epoch": 0.3107842121620222,
128
+ "grad_norm": 0.465000718832016,
129
+ "learning_rate": 0.00018182581909829179,
130
+ "loss": 0.7705,
131
  "step": 750
132
  },
133
  {
134
+ "epoch": 0.3107842121620222,
135
+ "eval_loss": 0.7103215456008911,
136
+ "eval_runtime": 86.1101,
137
+ "eval_samples_per_second": 56.045,
138
+ "eval_steps_per_second": 14.017,
139
  "step": 750
140
  },
141
  {
142
+ "epoch": 0.33150315963949034,
143
+ "grad_norm": 0.4990151524543762,
144
+ "learning_rate": 0.00018042565107812938,
145
+ "loss": 0.8438,
146
  "step": 800
147
  },
148
  {
149
+ "epoch": 0.35222210711695845,
150
+ "grad_norm": 0.7391067147254944,
151
+ "learning_rate": 0.00017902548305796695,
152
+ "loss": 0.7688,
153
  "step": 850
154
  },
155
  {
156
+ "epoch": 0.3729410545944266,
157
+ "grad_norm": 0.8036171197891235,
158
+ "learning_rate": 0.00017762531503780454,
159
+ "loss": 0.753,
160
  "step": 900
161
  },
162
  {
163
+ "epoch": 0.3936600020718947,
164
+ "grad_norm": 0.44744470715522766,
165
+ "learning_rate": 0.00017622514701764213,
166
+ "loss": 0.7793,
167
  "step": 950
168
  },
169
  {
170
+ "epoch": 0.4143789495493629,
171
+ "grad_norm": 0.630820631980896,
172
+ "learning_rate": 0.00017482497899747973,
173
+ "loss": 0.7555,
174
  "step": 1000
175
  },
176
  {
177
+ "epoch": 0.4143789495493629,
178
+ "eval_loss": 0.7030432820320129,
179
+ "eval_runtime": 86.2543,
180
+ "eval_samples_per_second": 55.951,
181
+ "eval_steps_per_second": 13.994,
182
  "step": 1000
183
  }
184
  ],
185
  "logging_steps": 50,
186
+ "max_steps": 7242,
187
  "num_input_tokens_seen": 0,
188
  "num_train_epochs": 3,
189
  "save_steps": 500,
 
199
  "attributes": {}
200
  }
201
  },
202
+ "total_flos": 6.287293343858688e+16,
203
  "train_batch_size": 4,
204
  "trial_name": null,
205
  "trial_params": null
checkpoint-1000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d
3
  size 5432
checkpoint-1500/adapter_config.json CHANGED
@@ -29,9 +29,9 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "o_proj",
33
  "v_proj",
34
- "q_proj",
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "q_proj",
33
  "o_proj",
34
  "v_proj",
 
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
checkpoint-1500/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fffc39fa6d134c8e1c7fb75eb4b8bba5ec8ab6c346da3a5eb4d76438cd39ae0c
3
  size 54560368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ea46f8890e14af515a490916d64d71a2431e0fc2dcd93524c7fc01129a8a616
3
  size 54560368
checkpoint-1500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:717a1347063e278eeee0f830ff534ecc2eac4a766bbc7c3f20365db80f97a61c
3
  size 109267450
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:354f1db55f58b61b25e0b64b403e6dde75e6311037b61694806cdbc4c95a72ff
3
  size 109267450
checkpoint-1500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:81714e5e2c84586d42b5d5f07880ce07b947cdccdb018347e47dd6d73d8228e1
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dc994964dd77b4b17f41bf873360fc1a0838df4b2f5359ed8062b49a57ca0441
3
  size 14244
checkpoint-1500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00b75657512bf4d369b2b5bae16105c8cc283d42aacd01df4e2a83091d439a73
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e226332b6c4b4510f2c3b1022f832e7e6d32594e02d1e8b882e79ae8cbda6044
3
  size 1064
checkpoint-1500/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_global_step": 1500,
3
- "best_metric": 0.5581239461898804,
4
- "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-1500",
5
- "epoch": 1.0,
6
  "eval_steps": 250,
7
  "global_step": 1500,
8
  "is_hyper_param_search": false,
@@ -10,266 +10,266 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.03333333333333333,
14
- "grad_norm": 0.5346225500106812,
15
  "learning_rate": 9.8e-05,
16
- "loss": 2.4955,
17
  "step": 50
18
  },
19
  {
20
- "epoch": 0.06666666666666667,
21
- "grad_norm": 0.719093918800354,
22
  "learning_rate": 0.00019800000000000002,
23
- "loss": 0.71,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.1,
28
- "grad_norm": 0.4840560853481293,
29
- "learning_rate": 0.0001977727272727273,
30
- "loss": 0.6405,
31
  "step": 150
32
  },
33
  {
34
- "epoch": 0.13333333333333333,
35
- "grad_norm": 0.3332301676273346,
36
- "learning_rate": 0.0001955,
37
- "loss": 0.6287,
38
  "step": 200
39
  },
40
  {
41
- "epoch": 0.16666666666666666,
42
- "grad_norm": 0.40639588236808777,
43
- "learning_rate": 0.00019322727272727276,
44
- "loss": 0.5572,
45
  "step": 250
46
  },
47
  {
48
- "epoch": 0.16666666666666666,
49
- "eval_loss": 0.5975945591926575,
50
- "eval_runtime": 80.8004,
51
- "eval_samples_per_second": 37.129,
52
- "eval_steps_per_second": 9.282,
53
  "step": 250
54
  },
55
  {
56
- "epoch": 0.2,
57
- "grad_norm": 0.3970712423324585,
58
- "learning_rate": 0.00019095454545454545,
59
- "loss": 0.6165,
60
  "step": 300
61
  },
62
  {
63
- "epoch": 0.23333333333333334,
64
- "grad_norm": 0.38409528136253357,
65
- "learning_rate": 0.00018868181818181817,
66
- "loss": 0.639,
67
  "step": 350
68
  },
69
  {
70
- "epoch": 0.26666666666666666,
71
- "grad_norm": 0.44628769159317017,
72
- "learning_rate": 0.00018640909090909092,
73
- "loss": 0.636,
74
  "step": 400
75
  },
76
  {
77
- "epoch": 0.3,
78
- "grad_norm": 0.3697021007537842,
79
- "learning_rate": 0.00018413636363636364,
80
- "loss": 0.6192,
81
  "step": 450
82
  },
83
  {
84
- "epoch": 0.3333333333333333,
85
- "grad_norm": 0.36338189244270325,
86
- "learning_rate": 0.00018186363636363636,
87
- "loss": 0.6134,
88
  "step": 500
89
  },
90
  {
91
- "epoch": 0.3333333333333333,
92
- "eval_loss": 0.5813060998916626,
93
- "eval_runtime": 80.7819,
94
- "eval_samples_per_second": 37.137,
95
- "eval_steps_per_second": 9.284,
96
  "step": 500
97
  },
98
  {
99
- "epoch": 0.36666666666666664,
100
- "grad_norm": 0.35211533308029175,
101
- "learning_rate": 0.0001795909090909091,
102
- "loss": 0.6128,
103
  "step": 550
104
  },
105
  {
106
- "epoch": 0.4,
107
- "grad_norm": 0.36327463388442993,
108
- "learning_rate": 0.00017731818181818183,
109
- "loss": 0.5915,
110
  "step": 600
111
  },
112
  {
113
- "epoch": 0.43333333333333335,
114
- "grad_norm": 0.40672942996025085,
115
- "learning_rate": 0.00017504545454545455,
116
- "loss": 0.5807,
117
  "step": 650
118
  },
119
  {
120
- "epoch": 0.4666666666666667,
121
- "grad_norm": 0.4689007103443146,
122
- "learning_rate": 0.00017277272727272728,
123
- "loss": 0.602,
124
  "step": 700
125
  },
126
  {
127
- "epoch": 0.5,
128
- "grad_norm": 0.3979697823524475,
129
- "learning_rate": 0.00017050000000000002,
130
- "loss": 0.5703,
131
  "step": 750
132
  },
133
  {
134
- "epoch": 0.5,
135
- "eval_loss": 0.5740106701850891,
136
- "eval_runtime": 80.8209,
137
- "eval_samples_per_second": 37.119,
138
- "eval_steps_per_second": 9.28,
139
  "step": 750
140
  },
141
  {
142
- "epoch": 0.5333333333333333,
143
- "grad_norm": 0.3071135878562927,
144
- "learning_rate": 0.00016822727272727275,
145
- "loss": 0.5746,
146
  "step": 800
147
  },
148
  {
149
- "epoch": 0.5666666666666667,
150
- "grad_norm": 0.318085253238678,
151
- "learning_rate": 0.00016595454545454544,
152
- "loss": 0.5873,
153
  "step": 850
154
  },
155
  {
156
- "epoch": 0.6,
157
- "grad_norm": 0.35915374755859375,
158
- "learning_rate": 0.0001636818181818182,
159
- "loss": 0.6283,
160
  "step": 900
161
  },
162
  {
163
- "epoch": 0.6333333333333333,
164
- "grad_norm": 0.3174057602882385,
165
- "learning_rate": 0.0001614090909090909,
166
- "loss": 0.5912,
167
  "step": 950
168
  },
169
  {
170
- "epoch": 0.6666666666666666,
171
- "grad_norm": 0.416111022233963,
172
- "learning_rate": 0.00015913636363636363,
173
- "loss": 0.5647,
174
  "step": 1000
175
  },
176
  {
177
- "epoch": 0.6666666666666666,
178
- "eval_loss": 0.5663638710975647,
179
- "eval_runtime": 80.8183,
180
- "eval_samples_per_second": 37.12,
181
- "eval_steps_per_second": 9.28,
182
  "step": 1000
183
  },
184
  {
185
- "epoch": 0.7,
186
- "grad_norm": 0.41202324628829956,
187
- "learning_rate": 0.00015686363636363638,
188
- "loss": 0.6118,
189
  "step": 1050
190
  },
191
  {
192
- "epoch": 0.7333333333333333,
193
- "grad_norm": 0.3883333206176758,
194
- "learning_rate": 0.0001545909090909091,
195
- "loss": 0.5392,
196
  "step": 1100
197
  },
198
  {
199
- "epoch": 0.7666666666666667,
200
- "grad_norm": 0.31973451375961304,
201
- "learning_rate": 0.00015231818181818182,
202
- "loss": 0.5602,
203
  "step": 1150
204
  },
205
  {
206
- "epoch": 0.8,
207
- "grad_norm": 0.31378698348999023,
208
- "learning_rate": 0.00015004545454545454,
209
- "loss": 0.5642,
210
  "step": 1200
211
  },
212
  {
213
- "epoch": 0.8333333333333334,
214
- "grad_norm": 0.3346308171749115,
215
- "learning_rate": 0.0001477727272727273,
216
- "loss": 0.5925,
217
  "step": 1250
218
  },
219
  {
220
- "epoch": 0.8333333333333334,
221
- "eval_loss": 0.5619704723358154,
222
- "eval_runtime": 80.824,
223
- "eval_samples_per_second": 37.118,
224
- "eval_steps_per_second": 9.279,
225
  "step": 1250
226
  },
227
  {
228
- "epoch": 0.8666666666666667,
229
- "grad_norm": 0.5573959946632385,
230
- "learning_rate": 0.0001455,
231
- "loss": 0.5829,
232
  "step": 1300
233
  },
234
  {
235
- "epoch": 0.9,
236
- "grad_norm": 0.36054643988609314,
237
- "learning_rate": 0.00014322727272727273,
238
- "loss": 0.5923,
239
  "step": 1350
240
  },
241
  {
242
- "epoch": 0.9333333333333333,
243
- "grad_norm": 0.36059027910232544,
244
- "learning_rate": 0.00014095454545454546,
245
- "loss": 0.5808,
246
  "step": 1400
247
  },
248
  {
249
- "epoch": 0.9666666666666667,
250
- "grad_norm": 0.3942534327507019,
251
- "learning_rate": 0.00013868181818181818,
252
- "loss": 0.5597,
253
  "step": 1450
254
  },
255
  {
256
- "epoch": 1.0,
257
- "grad_norm": 0.3995835483074188,
258
- "learning_rate": 0.0001364090909090909,
259
- "loss": 0.5554,
260
  "step": 1500
261
  },
262
  {
263
- "epoch": 1.0,
264
- "eval_loss": 0.5581239461898804,
265
- "eval_runtime": 80.8326,
266
- "eval_samples_per_second": 37.114,
267
- "eval_steps_per_second": 9.278,
268
  "step": 1500
269
  }
270
  ],
271
  "logging_steps": 50,
272
- "max_steps": 4500,
273
  "num_input_tokens_seen": 0,
274
  "num_train_epochs": 3,
275
  "save_steps": 500,
@@ -285,7 +285,7 @@
285
  "attributes": {}
286
  }
287
  },
288
- "total_flos": 1.834623940558848e+17,
289
  "train_batch_size": 4,
290
  "trial_name": null,
291
  "trial_params": null
 
1
  {
2
  "best_global_step": 1500,
3
+ "best_metric": 0.6915447115898132,
4
+ "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-1500",
5
+ "epoch": 0.6215684243240444,
6
  "eval_steps": 250,
7
  "global_step": 1500,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.020718947477468146,
14
+ "grad_norm": 1.0589393377304077,
15
  "learning_rate": 9.8e-05,
16
+ "loss": 2.6567,
17
  "step": 50
18
  },
19
  {
20
+ "epoch": 0.04143789495493629,
21
+ "grad_norm": 0.9738045334815979,
22
  "learning_rate": 0.00019800000000000002,
23
+ "loss": 0.9502,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.062156842432404436,
28
+ "grad_norm": 0.8801347017288208,
29
+ "learning_rate": 0.00019862783534024082,
30
+ "loss": 0.8496,
31
  "step": 150
32
  },
33
  {
34
+ "epoch": 0.08287578990987259,
35
+ "grad_norm": 0.7272312045097351,
36
+ "learning_rate": 0.00019722766732007841,
37
+ "loss": 0.8184,
38
  "step": 200
39
  },
40
  {
41
+ "epoch": 0.10359473738734072,
42
+ "grad_norm": 0.7850629091262817,
43
+ "learning_rate": 0.000195827499299916,
44
+ "loss": 0.8392,
45
  "step": 250
46
  },
47
  {
48
+ "epoch": 0.10359473738734072,
49
+ "eval_loss": 0.7402811050415039,
50
+ "eval_runtime": 85.9367,
51
+ "eval_samples_per_second": 56.158,
52
+ "eval_steps_per_second": 14.045,
53
  "step": 250
54
  },
55
  {
56
+ "epoch": 0.12431368486480887,
57
+ "grad_norm": 0.40629276633262634,
58
+ "learning_rate": 0.00019442733127975358,
59
+ "loss": 0.8108,
60
  "step": 300
61
  },
62
  {
63
+ "epoch": 0.145032632342277,
64
+ "grad_norm": 0.5258236527442932,
65
+ "learning_rate": 0.00019302716325959117,
66
+ "loss": 0.8116,
67
  "step": 350
68
  },
69
  {
70
+ "epoch": 0.16575157981974517,
71
+ "grad_norm": 0.6879925727844238,
72
+ "learning_rate": 0.00019162699523942874,
73
+ "loss": 0.9089,
74
  "step": 400
75
  },
76
  {
77
+ "epoch": 0.1864705272972133,
78
+ "grad_norm": 0.7583937048912048,
79
+ "learning_rate": 0.00019022682721926633,
80
+ "loss": 0.874,
81
  "step": 450
82
  },
83
  {
84
+ "epoch": 0.20718947477468144,
85
+ "grad_norm": 0.6399120688438416,
86
+ "learning_rate": 0.0001888266591991039,
87
+ "loss": 0.8366,
88
  "step": 500
89
  },
90
  {
91
+ "epoch": 0.20718947477468144,
92
+ "eval_loss": 0.7194066047668457,
93
+ "eval_runtime": 86.2811,
94
+ "eval_samples_per_second": 55.933,
95
+ "eval_steps_per_second": 13.989,
96
  "step": 500
97
  },
98
  {
99
+ "epoch": 0.22790842225214958,
100
+ "grad_norm": 0.7763131856918335,
101
+ "learning_rate": 0.0001874264911789415,
102
+ "loss": 0.7912,
103
  "step": 550
104
  },
105
  {
106
+ "epoch": 0.24862736972961774,
107
+ "grad_norm": 0.6845299601554871,
108
+ "learning_rate": 0.00018602632315877906,
109
+ "loss": 0.8506,
110
  "step": 600
111
  },
112
  {
113
+ "epoch": 0.2693463172070859,
114
+ "grad_norm": 0.8045451045036316,
115
+ "learning_rate": 0.00018462615513861665,
116
+ "loss": 0.763,
117
  "step": 650
118
  },
119
  {
120
+ "epoch": 0.290065264684554,
121
+ "grad_norm": 0.7035927176475525,
122
+ "learning_rate": 0.00018322598711845422,
123
+ "loss": 0.7769,
124
  "step": 700
125
  },
126
  {
127
+ "epoch": 0.3107842121620222,
128
+ "grad_norm": 0.465000718832016,
129
+ "learning_rate": 0.00018182581909829179,
130
+ "loss": 0.7705,
131
  "step": 750
132
  },
133
  {
134
+ "epoch": 0.3107842121620222,
135
+ "eval_loss": 0.7103215456008911,
136
+ "eval_runtime": 86.1101,
137
+ "eval_samples_per_second": 56.045,
138
+ "eval_steps_per_second": 14.017,
139
  "step": 750
140
  },
141
  {
142
+ "epoch": 0.33150315963949034,
143
+ "grad_norm": 0.4990151524543762,
144
+ "learning_rate": 0.00018042565107812938,
145
+ "loss": 0.8438,
146
  "step": 800
147
  },
148
  {
149
+ "epoch": 0.35222210711695845,
150
+ "grad_norm": 0.7391067147254944,
151
+ "learning_rate": 0.00017902548305796695,
152
+ "loss": 0.7688,
153
  "step": 850
154
  },
155
  {
156
+ "epoch": 0.3729410545944266,
157
+ "grad_norm": 0.8036171197891235,
158
+ "learning_rate": 0.00017762531503780454,
159
+ "loss": 0.753,
160
  "step": 900
161
  },
162
  {
163
+ "epoch": 0.3936600020718947,
164
+ "grad_norm": 0.44744470715522766,
165
+ "learning_rate": 0.00017622514701764213,
166
+ "loss": 0.7793,
167
  "step": 950
168
  },
169
  {
170
+ "epoch": 0.4143789495493629,
171
+ "grad_norm": 0.630820631980896,
172
+ "learning_rate": 0.00017482497899747973,
173
+ "loss": 0.7555,
174
  "step": 1000
175
  },
176
  {
177
+ "epoch": 0.4143789495493629,
178
+ "eval_loss": 0.7030432820320129,
179
+ "eval_runtime": 86.2543,
180
+ "eval_samples_per_second": 55.951,
181
+ "eval_steps_per_second": 13.994,
182
  "step": 1000
183
  },
184
  {
185
+ "epoch": 0.43509789702683105,
186
+ "grad_norm": 0.45690879225730896,
187
+ "learning_rate": 0.0001734248109773173,
188
+ "loss": 0.793,
189
  "step": 1050
190
  },
191
  {
192
+ "epoch": 0.45581684450429916,
193
+ "grad_norm": 0.5000227093696594,
194
+ "learning_rate": 0.00017202464295715486,
195
+ "loss": 0.8342,
196
  "step": 1100
197
  },
198
  {
199
+ "epoch": 0.4765357919817673,
200
+ "grad_norm": 0.47182488441467285,
201
+ "learning_rate": 0.00017062447493699246,
202
+ "loss": 0.7997,
203
  "step": 1150
204
  },
205
  {
206
+ "epoch": 0.4972547394592355,
207
+ "grad_norm": 0.7060516476631165,
208
+ "learning_rate": 0.00016922430691683002,
209
+ "loss": 0.7788,
210
  "step": 1200
211
  },
212
  {
213
+ "epoch": 0.5179736869367036,
214
+ "grad_norm": 0.46701857447624207,
215
+ "learning_rate": 0.00016782413889666762,
216
+ "loss": 0.7518,
217
  "step": 1250
218
  },
219
  {
220
+ "epoch": 0.5179736869367036,
221
+ "eval_loss": 0.7023425698280334,
222
+ "eval_runtime": 86.3015,
223
+ "eval_samples_per_second": 55.92,
224
+ "eval_steps_per_second": 13.986,
225
  "step": 1250
226
  },
227
  {
228
+ "epoch": 0.5386926344141718,
229
+ "grad_norm": 0.668192446231842,
230
+ "learning_rate": 0.00016642397087650518,
231
+ "loss": 0.7682,
232
  "step": 1300
233
  },
234
  {
235
+ "epoch": 0.5594115818916399,
236
+ "grad_norm": 0.47292283177375793,
237
+ "learning_rate": 0.00016502380285634278,
238
+ "loss": 0.7985,
239
  "step": 1350
240
  },
241
  {
242
+ "epoch": 0.580130529369108,
243
+ "grad_norm": 0.7327275276184082,
244
+ "learning_rate": 0.00016362363483618034,
245
+ "loss": 0.8378,
246
  "step": 1400
247
  },
248
  {
249
+ "epoch": 0.6008494768465762,
250
+ "grad_norm": 0.8417996764183044,
251
+ "learning_rate": 0.0001622234668160179,
252
+ "loss": 0.7962,
253
  "step": 1450
254
  },
255
  {
256
+ "epoch": 0.6215684243240444,
257
+ "grad_norm": 0.6189562678337097,
258
+ "learning_rate": 0.0001608232987958555,
259
+ "loss": 0.8028,
260
  "step": 1500
261
  },
262
  {
263
+ "epoch": 0.6215684243240444,
264
+ "eval_loss": 0.6915447115898132,
265
+ "eval_runtime": 86.2147,
266
+ "eval_samples_per_second": 55.977,
267
+ "eval_steps_per_second": 14.0,
268
  "step": 1500
269
  }
270
  ],
271
  "logging_steps": 50,
272
+ "max_steps": 7242,
273
  "num_input_tokens_seen": 0,
274
  "num_train_epochs": 3,
275
  "save_steps": 500,
 
285
  "attributes": {}
286
  }
287
  },
288
+ "total_flos": 9.496080786358272e+16,
289
  "train_batch_size": 4,
290
  "trial_name": null,
291
  "trial_params": null
checkpoint-1500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d
3
  size 5432
checkpoint-2000/adapter_config.json CHANGED
@@ -29,9 +29,9 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "o_proj",
33
  "v_proj",
34
- "q_proj",
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "q_proj",
33
  "o_proj",
34
  "v_proj",
 
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
checkpoint-2000/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7c3a88ad5e47e99721d27a7cc47580f0dd445458c5b5d383d9746ac5150752b3
3
  size 54560368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bc2c38a4feb292b5953adbca7e889a9849d3aae23bebf65624fa6ef2a12e814
3
  size 54560368
checkpoint-2000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1aa882db7f1e97aecd9e23deab24ec52c1966c084ef84d480101777cb20b2b38
3
  size 109267450
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:02591c829ebc2e9c023c77020f2505b8055865e72743aebf570d809d20a5bf01
3
  size 109267450
checkpoint-2000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e86b19998400264e99e08275eb288ef36b233377938a1f173b2ecb9fa75ffacc
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b69ce190b07f928f0db402b171b4a32695620a6cc7680ee0294e1d3ca9955e84
3
  size 14244
checkpoint-2000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de481359f7708f1c509bebbc539f8384d41101271a19491884a2ffc4b1dd3c44
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aff236ec96fe456a1d48a1c988fd9dbf62d3fbd22f57121ebf0b02e7d4ca2c27
3
  size 1064
checkpoint-2000/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_global_step": 2000,
3
- "best_metric": 0.5535863637924194,
4
- "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-2000",
5
- "epoch": 1.3333333333333333,
6
  "eval_steps": 250,
7
  "global_step": 2000,
8
  "is_hyper_param_search": false,
@@ -10,352 +10,352 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.03333333333333333,
14
- "grad_norm": 0.5346225500106812,
15
  "learning_rate": 9.8e-05,
16
- "loss": 2.4955,
17
  "step": 50
18
  },
19
  {
20
- "epoch": 0.06666666666666667,
21
- "grad_norm": 0.719093918800354,
22
  "learning_rate": 0.00019800000000000002,
23
- "loss": 0.71,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.1,
28
- "grad_norm": 0.4840560853481293,
29
- "learning_rate": 0.0001977727272727273,
30
- "loss": 0.6405,
31
  "step": 150
32
  },
33
  {
34
- "epoch": 0.13333333333333333,
35
- "grad_norm": 0.3332301676273346,
36
- "learning_rate": 0.0001955,
37
- "loss": 0.6287,
38
  "step": 200
39
  },
40
  {
41
- "epoch": 0.16666666666666666,
42
- "grad_norm": 0.40639588236808777,
43
- "learning_rate": 0.00019322727272727276,
44
- "loss": 0.5572,
45
  "step": 250
46
  },
47
  {
48
- "epoch": 0.16666666666666666,
49
- "eval_loss": 0.5975945591926575,
50
- "eval_runtime": 80.8004,
51
- "eval_samples_per_second": 37.129,
52
- "eval_steps_per_second": 9.282,
53
  "step": 250
54
  },
55
  {
56
- "epoch": 0.2,
57
- "grad_norm": 0.3970712423324585,
58
- "learning_rate": 0.00019095454545454545,
59
- "loss": 0.6165,
60
  "step": 300
61
  },
62
  {
63
- "epoch": 0.23333333333333334,
64
- "grad_norm": 0.38409528136253357,
65
- "learning_rate": 0.00018868181818181817,
66
- "loss": 0.639,
67
  "step": 350
68
  },
69
  {
70
- "epoch": 0.26666666666666666,
71
- "grad_norm": 0.44628769159317017,
72
- "learning_rate": 0.00018640909090909092,
73
- "loss": 0.636,
74
  "step": 400
75
  },
76
  {
77
- "epoch": 0.3,
78
- "grad_norm": 0.3697021007537842,
79
- "learning_rate": 0.00018413636363636364,
80
- "loss": 0.6192,
81
  "step": 450
82
  },
83
  {
84
- "epoch": 0.3333333333333333,
85
- "grad_norm": 0.36338189244270325,
86
- "learning_rate": 0.00018186363636363636,
87
- "loss": 0.6134,
88
  "step": 500
89
  },
90
  {
91
- "epoch": 0.3333333333333333,
92
- "eval_loss": 0.5813060998916626,
93
- "eval_runtime": 80.7819,
94
- "eval_samples_per_second": 37.137,
95
- "eval_steps_per_second": 9.284,
96
  "step": 500
97
  },
98
  {
99
- "epoch": 0.36666666666666664,
100
- "grad_norm": 0.35211533308029175,
101
- "learning_rate": 0.0001795909090909091,
102
- "loss": 0.6128,
103
  "step": 550
104
  },
105
  {
106
- "epoch": 0.4,
107
- "grad_norm": 0.36327463388442993,
108
- "learning_rate": 0.00017731818181818183,
109
- "loss": 0.5915,
110
  "step": 600
111
  },
112
  {
113
- "epoch": 0.43333333333333335,
114
- "grad_norm": 0.40672942996025085,
115
- "learning_rate": 0.00017504545454545455,
116
- "loss": 0.5807,
117
  "step": 650
118
  },
119
  {
120
- "epoch": 0.4666666666666667,
121
- "grad_norm": 0.4689007103443146,
122
- "learning_rate": 0.00017277272727272728,
123
- "loss": 0.602,
124
  "step": 700
125
  },
126
  {
127
- "epoch": 0.5,
128
- "grad_norm": 0.3979697823524475,
129
- "learning_rate": 0.00017050000000000002,
130
- "loss": 0.5703,
131
  "step": 750
132
  },
133
  {
134
- "epoch": 0.5,
135
- "eval_loss": 0.5740106701850891,
136
- "eval_runtime": 80.8209,
137
- "eval_samples_per_second": 37.119,
138
- "eval_steps_per_second": 9.28,
139
  "step": 750
140
  },
141
  {
142
- "epoch": 0.5333333333333333,
143
- "grad_norm": 0.3071135878562927,
144
- "learning_rate": 0.00016822727272727275,
145
- "loss": 0.5746,
146
  "step": 800
147
  },
148
  {
149
- "epoch": 0.5666666666666667,
150
- "grad_norm": 0.318085253238678,
151
- "learning_rate": 0.00016595454545454544,
152
- "loss": 0.5873,
153
  "step": 850
154
  },
155
  {
156
- "epoch": 0.6,
157
- "grad_norm": 0.35915374755859375,
158
- "learning_rate": 0.0001636818181818182,
159
- "loss": 0.6283,
160
  "step": 900
161
  },
162
  {
163
- "epoch": 0.6333333333333333,
164
- "grad_norm": 0.3174057602882385,
165
- "learning_rate": 0.0001614090909090909,
166
- "loss": 0.5912,
167
  "step": 950
168
  },
169
  {
170
- "epoch": 0.6666666666666666,
171
- "grad_norm": 0.416111022233963,
172
- "learning_rate": 0.00015913636363636363,
173
- "loss": 0.5647,
174
  "step": 1000
175
  },
176
  {
177
- "epoch": 0.6666666666666666,
178
- "eval_loss": 0.5663638710975647,
179
- "eval_runtime": 80.8183,
180
- "eval_samples_per_second": 37.12,
181
- "eval_steps_per_second": 9.28,
182
  "step": 1000
183
  },
184
  {
185
- "epoch": 0.7,
186
- "grad_norm": 0.41202324628829956,
187
- "learning_rate": 0.00015686363636363638,
188
- "loss": 0.6118,
189
  "step": 1050
190
  },
191
  {
192
- "epoch": 0.7333333333333333,
193
- "grad_norm": 0.3883333206176758,
194
- "learning_rate": 0.0001545909090909091,
195
- "loss": 0.5392,
196
  "step": 1100
197
  },
198
  {
199
- "epoch": 0.7666666666666667,
200
- "grad_norm": 0.31973451375961304,
201
- "learning_rate": 0.00015231818181818182,
202
- "loss": 0.5602,
203
  "step": 1150
204
  },
205
  {
206
- "epoch": 0.8,
207
- "grad_norm": 0.31378698348999023,
208
- "learning_rate": 0.00015004545454545454,
209
- "loss": 0.5642,
210
  "step": 1200
211
  },
212
  {
213
- "epoch": 0.8333333333333334,
214
- "grad_norm": 0.3346308171749115,
215
- "learning_rate": 0.0001477727272727273,
216
- "loss": 0.5925,
217
  "step": 1250
218
  },
219
  {
220
- "epoch": 0.8333333333333334,
221
- "eval_loss": 0.5619704723358154,
222
- "eval_runtime": 80.824,
223
- "eval_samples_per_second": 37.118,
224
- "eval_steps_per_second": 9.279,
225
  "step": 1250
226
  },
227
  {
228
- "epoch": 0.8666666666666667,
229
- "grad_norm": 0.5573959946632385,
230
- "learning_rate": 0.0001455,
231
- "loss": 0.5829,
232
  "step": 1300
233
  },
234
  {
235
- "epoch": 0.9,
236
- "grad_norm": 0.36054643988609314,
237
- "learning_rate": 0.00014322727272727273,
238
- "loss": 0.5923,
239
  "step": 1350
240
  },
241
  {
242
- "epoch": 0.9333333333333333,
243
- "grad_norm": 0.36059027910232544,
244
- "learning_rate": 0.00014095454545454546,
245
- "loss": 0.5808,
246
  "step": 1400
247
  },
248
  {
249
- "epoch": 0.9666666666666667,
250
- "grad_norm": 0.3942534327507019,
251
- "learning_rate": 0.00013868181818181818,
252
- "loss": 0.5597,
253
  "step": 1450
254
  },
255
  {
256
- "epoch": 1.0,
257
- "grad_norm": 0.3995835483074188,
258
- "learning_rate": 0.0001364090909090909,
259
- "loss": 0.5554,
260
  "step": 1500
261
  },
262
  {
263
- "epoch": 1.0,
264
- "eval_loss": 0.5581239461898804,
265
- "eval_runtime": 80.8326,
266
- "eval_samples_per_second": 37.114,
267
- "eval_steps_per_second": 9.278,
268
  "step": 1500
269
  },
270
  {
271
- "epoch": 1.0333333333333334,
272
- "grad_norm": 0.3405410051345825,
273
- "learning_rate": 0.00013413636363636365,
274
- "loss": 0.5571,
275
  "step": 1550
276
  },
277
  {
278
- "epoch": 1.0666666666666667,
279
- "grad_norm": 0.4485073983669281,
280
- "learning_rate": 0.00013186363636363637,
281
- "loss": 0.5674,
282
  "step": 1600
283
  },
284
  {
285
- "epoch": 1.1,
286
- "grad_norm": 0.34938374161720276,
287
- "learning_rate": 0.0001295909090909091,
288
- "loss": 0.5354,
289
  "step": 1650
290
  },
291
  {
292
- "epoch": 1.1333333333333333,
293
- "grad_norm": 0.33084195852279663,
294
- "learning_rate": 0.00012731818181818184,
295
- "loss": 0.5765,
296
  "step": 1700
297
  },
298
  {
299
- "epoch": 1.1666666666666667,
300
- "grad_norm": 0.3667336404323578,
301
- "learning_rate": 0.00012504545454545456,
302
- "loss": 0.5486,
303
  "step": 1750
304
  },
305
  {
306
- "epoch": 1.1666666666666667,
307
- "eval_loss": 0.5557209253311157,
308
- "eval_runtime": 80.8386,
309
- "eval_samples_per_second": 37.111,
310
- "eval_steps_per_second": 9.278,
311
  "step": 1750
312
  },
313
  {
314
- "epoch": 1.2,
315
- "grad_norm": 0.33248019218444824,
316
- "learning_rate": 0.00012277272727272728,
317
- "loss": 0.5617,
318
  "step": 1800
319
  },
320
  {
321
- "epoch": 1.2333333333333334,
322
- "grad_norm": 0.4447474479675293,
323
- "learning_rate": 0.00012050000000000002,
324
- "loss": 0.567,
325
  "step": 1850
326
  },
327
  {
328
- "epoch": 1.2666666666666666,
329
- "grad_norm": 0.42134660482406616,
330
- "learning_rate": 0.00011822727272727274,
331
- "loss": 0.5319,
332
  "step": 1900
333
  },
334
  {
335
- "epoch": 1.3,
336
- "grad_norm": 0.3942984640598297,
337
- "learning_rate": 0.00011595454545454544,
338
- "loss": 0.5325,
339
  "step": 1950
340
  },
341
  {
342
- "epoch": 1.3333333333333333,
343
- "grad_norm": 0.4929428696632385,
344
- "learning_rate": 0.00011368181818181818,
345
- "loss": 0.5565,
346
  "step": 2000
347
  },
348
  {
349
- "epoch": 1.3333333333333333,
350
- "eval_loss": 0.5535863637924194,
351
- "eval_runtime": 80.8279,
352
- "eval_samples_per_second": 37.116,
353
- "eval_steps_per_second": 9.279,
354
  "step": 2000
355
  }
356
  ],
357
  "logging_steps": 50,
358
- "max_steps": 4500,
359
  "num_input_tokens_seen": 0,
360
  "num_train_epochs": 3,
361
  "save_steps": 500,
@@ -371,7 +371,7 @@
371
  "attributes": {}
372
  }
373
  },
374
- "total_flos": 2.4518949953568768e+17,
375
  "train_batch_size": 4,
376
  "trial_name": null,
377
  "trial_params": null
 
1
  {
2
  "best_global_step": 2000,
3
+ "best_metric": 0.6823315024375916,
4
+ "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-2000",
5
+ "epoch": 0.8287578990987258,
6
  "eval_steps": 250,
7
  "global_step": 2000,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.020718947477468146,
14
+ "grad_norm": 1.0589393377304077,
15
  "learning_rate": 9.8e-05,
16
+ "loss": 2.6567,
17
  "step": 50
18
  },
19
  {
20
+ "epoch": 0.04143789495493629,
21
+ "grad_norm": 0.9738045334815979,
22
  "learning_rate": 0.00019800000000000002,
23
+ "loss": 0.9502,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.062156842432404436,
28
+ "grad_norm": 0.8801347017288208,
29
+ "learning_rate": 0.00019862783534024082,
30
+ "loss": 0.8496,
31
  "step": 150
32
  },
33
  {
34
+ "epoch": 0.08287578990987259,
35
+ "grad_norm": 0.7272312045097351,
36
+ "learning_rate": 0.00019722766732007841,
37
+ "loss": 0.8184,
38
  "step": 200
39
  },
40
  {
41
+ "epoch": 0.10359473738734072,
42
+ "grad_norm": 0.7850629091262817,
43
+ "learning_rate": 0.000195827499299916,
44
+ "loss": 0.8392,
45
  "step": 250
46
  },
47
  {
48
+ "epoch": 0.10359473738734072,
49
+ "eval_loss": 0.7402811050415039,
50
+ "eval_runtime": 85.9367,
51
+ "eval_samples_per_second": 56.158,
52
+ "eval_steps_per_second": 14.045,
53
  "step": 250
54
  },
55
  {
56
+ "epoch": 0.12431368486480887,
57
+ "grad_norm": 0.40629276633262634,
58
+ "learning_rate": 0.00019442733127975358,
59
+ "loss": 0.8108,
60
  "step": 300
61
  },
62
  {
63
+ "epoch": 0.145032632342277,
64
+ "grad_norm": 0.5258236527442932,
65
+ "learning_rate": 0.00019302716325959117,
66
+ "loss": 0.8116,
67
  "step": 350
68
  },
69
  {
70
+ "epoch": 0.16575157981974517,
71
+ "grad_norm": 0.6879925727844238,
72
+ "learning_rate": 0.00019162699523942874,
73
+ "loss": 0.9089,
74
  "step": 400
75
  },
76
  {
77
+ "epoch": 0.1864705272972133,
78
+ "grad_norm": 0.7583937048912048,
79
+ "learning_rate": 0.00019022682721926633,
80
+ "loss": 0.874,
81
  "step": 450
82
  },
83
  {
84
+ "epoch": 0.20718947477468144,
85
+ "grad_norm": 0.6399120688438416,
86
+ "learning_rate": 0.0001888266591991039,
87
+ "loss": 0.8366,
88
  "step": 500
89
  },
90
  {
91
+ "epoch": 0.20718947477468144,
92
+ "eval_loss": 0.7194066047668457,
93
+ "eval_runtime": 86.2811,
94
+ "eval_samples_per_second": 55.933,
95
+ "eval_steps_per_second": 13.989,
96
  "step": 500
97
  },
98
  {
99
+ "epoch": 0.22790842225214958,
100
+ "grad_norm": 0.7763131856918335,
101
+ "learning_rate": 0.0001874264911789415,
102
+ "loss": 0.7912,
103
  "step": 550
104
  },
105
  {
106
+ "epoch": 0.24862736972961774,
107
+ "grad_norm": 0.6845299601554871,
108
+ "learning_rate": 0.00018602632315877906,
109
+ "loss": 0.8506,
110
  "step": 600
111
  },
112
  {
113
+ "epoch": 0.2693463172070859,
114
+ "grad_norm": 0.8045451045036316,
115
+ "learning_rate": 0.00018462615513861665,
116
+ "loss": 0.763,
117
  "step": 650
118
  },
119
  {
120
+ "epoch": 0.290065264684554,
121
+ "grad_norm": 0.7035927176475525,
122
+ "learning_rate": 0.00018322598711845422,
123
+ "loss": 0.7769,
124
  "step": 700
125
  },
126
  {
127
+ "epoch": 0.3107842121620222,
128
+ "grad_norm": 0.465000718832016,
129
+ "learning_rate": 0.00018182581909829179,
130
+ "loss": 0.7705,
131
  "step": 750
132
  },
133
  {
134
+ "epoch": 0.3107842121620222,
135
+ "eval_loss": 0.7103215456008911,
136
+ "eval_runtime": 86.1101,
137
+ "eval_samples_per_second": 56.045,
138
+ "eval_steps_per_second": 14.017,
139
  "step": 750
140
  },
141
  {
142
+ "epoch": 0.33150315963949034,
143
+ "grad_norm": 0.4990151524543762,
144
+ "learning_rate": 0.00018042565107812938,
145
+ "loss": 0.8438,
146
  "step": 800
147
  },
148
  {
149
+ "epoch": 0.35222210711695845,
150
+ "grad_norm": 0.7391067147254944,
151
+ "learning_rate": 0.00017902548305796695,
152
+ "loss": 0.7688,
153
  "step": 850
154
  },
155
  {
156
+ "epoch": 0.3729410545944266,
157
+ "grad_norm": 0.8036171197891235,
158
+ "learning_rate": 0.00017762531503780454,
159
+ "loss": 0.753,
160
  "step": 900
161
  },
162
  {
163
+ "epoch": 0.3936600020718947,
164
+ "grad_norm": 0.44744470715522766,
165
+ "learning_rate": 0.00017622514701764213,
166
+ "loss": 0.7793,
167
  "step": 950
168
  },
169
  {
170
+ "epoch": 0.4143789495493629,
171
+ "grad_norm": 0.630820631980896,
172
+ "learning_rate": 0.00017482497899747973,
173
+ "loss": 0.7555,
174
  "step": 1000
175
  },
176
  {
177
+ "epoch": 0.4143789495493629,
178
+ "eval_loss": 0.7030432820320129,
179
+ "eval_runtime": 86.2543,
180
+ "eval_samples_per_second": 55.951,
181
+ "eval_steps_per_second": 13.994,
182
  "step": 1000
183
  },
184
  {
185
+ "epoch": 0.43509789702683105,
186
+ "grad_norm": 0.45690879225730896,
187
+ "learning_rate": 0.0001734248109773173,
188
+ "loss": 0.793,
189
  "step": 1050
190
  },
191
  {
192
+ "epoch": 0.45581684450429916,
193
+ "grad_norm": 0.5000227093696594,
194
+ "learning_rate": 0.00017202464295715486,
195
+ "loss": 0.8342,
196
  "step": 1100
197
  },
198
  {
199
+ "epoch": 0.4765357919817673,
200
+ "grad_norm": 0.47182488441467285,
201
+ "learning_rate": 0.00017062447493699246,
202
+ "loss": 0.7997,
203
  "step": 1150
204
  },
205
  {
206
+ "epoch": 0.4972547394592355,
207
+ "grad_norm": 0.7060516476631165,
208
+ "learning_rate": 0.00016922430691683002,
209
+ "loss": 0.7788,
210
  "step": 1200
211
  },
212
  {
213
+ "epoch": 0.5179736869367036,
214
+ "grad_norm": 0.46701857447624207,
215
+ "learning_rate": 0.00016782413889666762,
216
+ "loss": 0.7518,
217
  "step": 1250
218
  },
219
  {
220
+ "epoch": 0.5179736869367036,
221
+ "eval_loss": 0.7023425698280334,
222
+ "eval_runtime": 86.3015,
223
+ "eval_samples_per_second": 55.92,
224
+ "eval_steps_per_second": 13.986,
225
  "step": 1250
226
  },
227
  {
228
+ "epoch": 0.5386926344141718,
229
+ "grad_norm": 0.668192446231842,
230
+ "learning_rate": 0.00016642397087650518,
231
+ "loss": 0.7682,
232
  "step": 1300
233
  },
234
  {
235
+ "epoch": 0.5594115818916399,
236
+ "grad_norm": 0.47292283177375793,
237
+ "learning_rate": 0.00016502380285634278,
238
+ "loss": 0.7985,
239
  "step": 1350
240
  },
241
  {
242
+ "epoch": 0.580130529369108,
243
+ "grad_norm": 0.7327275276184082,
244
+ "learning_rate": 0.00016362363483618034,
245
+ "loss": 0.8378,
246
  "step": 1400
247
  },
248
  {
249
+ "epoch": 0.6008494768465762,
250
+ "grad_norm": 0.8417996764183044,
251
+ "learning_rate": 0.0001622234668160179,
252
+ "loss": 0.7962,
253
  "step": 1450
254
  },
255
  {
256
+ "epoch": 0.6215684243240444,
257
+ "grad_norm": 0.6189562678337097,
258
+ "learning_rate": 0.0001608232987958555,
259
+ "loss": 0.8028,
260
  "step": 1500
261
  },
262
  {
263
+ "epoch": 0.6215684243240444,
264
+ "eval_loss": 0.6915447115898132,
265
+ "eval_runtime": 86.2147,
266
+ "eval_samples_per_second": 55.977,
267
+ "eval_steps_per_second": 14.0,
268
  "step": 1500
269
  },
270
  {
271
+ "epoch": 0.6422873718015125,
272
+ "grad_norm": 0.7345826625823975,
273
+ "learning_rate": 0.0001594231307756931,
274
+ "loss": 0.7978,
275
  "step": 1550
276
  },
277
  {
278
+ "epoch": 0.6630063192789807,
279
+ "grad_norm": 0.6538310050964355,
280
+ "learning_rate": 0.0001580229627555307,
281
+ "loss": 0.7672,
282
  "step": 1600
283
  },
284
  {
285
+ "epoch": 0.6837252667564487,
286
+ "grad_norm": 0.661582350730896,
287
+ "learning_rate": 0.00015662279473536826,
288
+ "loss": 0.7378,
289
  "step": 1650
290
  },
291
  {
292
+ "epoch": 0.7044442142339169,
293
+ "grad_norm": 0.3603042960166931,
294
+ "learning_rate": 0.00015522262671520583,
295
+ "loss": 0.6741,
296
  "step": 1700
297
  },
298
  {
299
+ "epoch": 0.7251631617113851,
300
+ "grad_norm": 0.8882561326026917,
301
+ "learning_rate": 0.00015382245869504342,
302
+ "loss": 0.7695,
303
  "step": 1750
304
  },
305
  {
306
+ "epoch": 0.7251631617113851,
307
+ "eval_loss": 0.6858941316604614,
308
+ "eval_runtime": 86.6358,
309
+ "eval_samples_per_second": 55.704,
310
+ "eval_steps_per_second": 13.932,
311
  "step": 1750
312
  },
313
  {
314
+ "epoch": 0.7458821091888532,
315
+ "grad_norm": 0.5933266282081604,
316
+ "learning_rate": 0.000152422290674881,
317
+ "loss": 0.7548,
318
  "step": 1800
319
  },
320
  {
321
+ "epoch": 0.7666010566663214,
322
+ "grad_norm": 0.8178608417510986,
323
+ "learning_rate": 0.00015102212265471858,
324
+ "loss": 0.7639,
325
  "step": 1850
326
  },
327
  {
328
+ "epoch": 0.7873200041437894,
329
+ "grad_norm": 0.4378993511199951,
330
+ "learning_rate": 0.00014962195463455615,
331
+ "loss": 0.7985,
332
  "step": 1900
333
  },
334
  {
335
+ "epoch": 0.8080389516212576,
336
+ "grad_norm": 0.3732803463935852,
337
+ "learning_rate": 0.00014822178661439374,
338
+ "loss": 0.8481,
339
  "step": 1950
340
  },
341
  {
342
+ "epoch": 0.8287578990987258,
343
+ "grad_norm": 0.7421035170555115,
344
+ "learning_rate": 0.0001468216185942313,
345
+ "loss": 0.7223,
346
  "step": 2000
347
  },
348
  {
349
+ "epoch": 0.8287578990987258,
350
+ "eval_loss": 0.6823315024375916,
351
+ "eval_runtime": 86.5575,
352
+ "eval_samples_per_second": 55.755,
353
+ "eval_steps_per_second": 13.944,
354
  "step": 2000
355
  }
356
  ],
357
  "logging_steps": 50,
358
+ "max_steps": 7242,
359
  "num_input_tokens_seen": 0,
360
  "num_train_epochs": 3,
361
  "save_steps": 500,
 
371
  "attributes": {}
372
  }
373
  },
374
+ "total_flos": 1.262946011799552e+17,
375
  "train_batch_size": 4,
376
  "trial_name": null,
377
  "trial_params": null
checkpoint-2000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d
3
  size 5432
checkpoint-2500/adapter_config.json CHANGED
@@ -29,9 +29,9 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "o_proj",
33
  "v_proj",
34
- "q_proj",
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "q_proj",
33
  "o_proj",
34
  "v_proj",
 
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
checkpoint-2500/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a5207bd2a71d6e74489cb5103f4173305575b69a2798a53c970da2f8e42cfd1b
3
  size 54560368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49efc21966de98bf9994a157e2e4dabb68153133adf2745eefe182249b3f3197
3
  size 54560368
checkpoint-2500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:848bae18dd111819cc86ee93c9822b1baf9b23656a8810d3bbb4140c26fa04d8
3
  size 109267450
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fcd2efd5a1e4f00990e049fca5bee5c43d977c21ae7fd093d6c7fdff6fe068b8
3
  size 109267450
checkpoint-2500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:02297c389f0848a1a674f64fd3230c94f24e9dbabcb192a80189b95e9b26ab11
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:24368a441ca3fc9abe92a436629bb258dee7296cd6e160cb97d6948bbd91695b
3
  size 14244
checkpoint-2500/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48e2d97f563bb838328076a1666504681962151a3975a2f064be3a03e6500740
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ab3b49628f2ae2ec7cdbb0bc103569c008e8a11af2787309237ce369c80d7b9
3
  size 988
checkpoint-2500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:23892cead62882c0c408b409776e78c7487ed4ce0dfaca891fbc6687acaa712e
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:abac4e3ce09d884de31337ea91e7059472492d528353bcdadc9c18f2f41cdb86
3
  size 1064
checkpoint-2500/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_global_step": 2500,
3
- "best_metric": 0.5476261377334595,
4
- "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-2500",
5
- "epoch": 1.6666666666666665,
6
  "eval_steps": 250,
7
  "global_step": 2500,
8
  "is_hyper_param_search": false,
@@ -10,438 +10,438 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.03333333333333333,
14
- "grad_norm": 0.5346225500106812,
15
  "learning_rate": 9.8e-05,
16
- "loss": 2.4955,
17
  "step": 50
18
  },
19
  {
20
- "epoch": 0.06666666666666667,
21
- "grad_norm": 0.719093918800354,
22
  "learning_rate": 0.00019800000000000002,
23
- "loss": 0.71,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.1,
28
- "grad_norm": 0.4840560853481293,
29
- "learning_rate": 0.0001977727272727273,
30
- "loss": 0.6405,
31
  "step": 150
32
  },
33
  {
34
- "epoch": 0.13333333333333333,
35
- "grad_norm": 0.3332301676273346,
36
- "learning_rate": 0.0001955,
37
- "loss": 0.6287,
38
  "step": 200
39
  },
40
  {
41
- "epoch": 0.16666666666666666,
42
- "grad_norm": 0.40639588236808777,
43
- "learning_rate": 0.00019322727272727276,
44
- "loss": 0.5572,
45
  "step": 250
46
  },
47
  {
48
- "epoch": 0.16666666666666666,
49
- "eval_loss": 0.5975945591926575,
50
- "eval_runtime": 80.8004,
51
- "eval_samples_per_second": 37.129,
52
- "eval_steps_per_second": 9.282,
53
  "step": 250
54
  },
55
  {
56
- "epoch": 0.2,
57
- "grad_norm": 0.3970712423324585,
58
- "learning_rate": 0.00019095454545454545,
59
- "loss": 0.6165,
60
  "step": 300
61
  },
62
  {
63
- "epoch": 0.23333333333333334,
64
- "grad_norm": 0.38409528136253357,
65
- "learning_rate": 0.00018868181818181817,
66
- "loss": 0.639,
67
  "step": 350
68
  },
69
  {
70
- "epoch": 0.26666666666666666,
71
- "grad_norm": 0.44628769159317017,
72
- "learning_rate": 0.00018640909090909092,
73
- "loss": 0.636,
74
  "step": 400
75
  },
76
  {
77
- "epoch": 0.3,
78
- "grad_norm": 0.3697021007537842,
79
- "learning_rate": 0.00018413636363636364,
80
- "loss": 0.6192,
81
  "step": 450
82
  },
83
  {
84
- "epoch": 0.3333333333333333,
85
- "grad_norm": 0.36338189244270325,
86
- "learning_rate": 0.00018186363636363636,
87
- "loss": 0.6134,
88
  "step": 500
89
  },
90
  {
91
- "epoch": 0.3333333333333333,
92
- "eval_loss": 0.5813060998916626,
93
- "eval_runtime": 80.7819,
94
- "eval_samples_per_second": 37.137,
95
- "eval_steps_per_second": 9.284,
96
  "step": 500
97
  },
98
  {
99
- "epoch": 0.36666666666666664,
100
- "grad_norm": 0.35211533308029175,
101
- "learning_rate": 0.0001795909090909091,
102
- "loss": 0.6128,
103
  "step": 550
104
  },
105
  {
106
- "epoch": 0.4,
107
- "grad_norm": 0.36327463388442993,
108
- "learning_rate": 0.00017731818181818183,
109
- "loss": 0.5915,
110
  "step": 600
111
  },
112
  {
113
- "epoch": 0.43333333333333335,
114
- "grad_norm": 0.40672942996025085,
115
- "learning_rate": 0.00017504545454545455,
116
- "loss": 0.5807,
117
  "step": 650
118
  },
119
  {
120
- "epoch": 0.4666666666666667,
121
- "grad_norm": 0.4689007103443146,
122
- "learning_rate": 0.00017277272727272728,
123
- "loss": 0.602,
124
  "step": 700
125
  },
126
  {
127
- "epoch": 0.5,
128
- "grad_norm": 0.3979697823524475,
129
- "learning_rate": 0.00017050000000000002,
130
- "loss": 0.5703,
131
  "step": 750
132
  },
133
  {
134
- "epoch": 0.5,
135
- "eval_loss": 0.5740106701850891,
136
- "eval_runtime": 80.8209,
137
- "eval_samples_per_second": 37.119,
138
- "eval_steps_per_second": 9.28,
139
  "step": 750
140
  },
141
  {
142
- "epoch": 0.5333333333333333,
143
- "grad_norm": 0.3071135878562927,
144
- "learning_rate": 0.00016822727272727275,
145
- "loss": 0.5746,
146
  "step": 800
147
  },
148
  {
149
- "epoch": 0.5666666666666667,
150
- "grad_norm": 0.318085253238678,
151
- "learning_rate": 0.00016595454545454544,
152
- "loss": 0.5873,
153
  "step": 850
154
  },
155
  {
156
- "epoch": 0.6,
157
- "grad_norm": 0.35915374755859375,
158
- "learning_rate": 0.0001636818181818182,
159
- "loss": 0.6283,
160
  "step": 900
161
  },
162
  {
163
- "epoch": 0.6333333333333333,
164
- "grad_norm": 0.3174057602882385,
165
- "learning_rate": 0.0001614090909090909,
166
- "loss": 0.5912,
167
  "step": 950
168
  },
169
  {
170
- "epoch": 0.6666666666666666,
171
- "grad_norm": 0.416111022233963,
172
- "learning_rate": 0.00015913636363636363,
173
- "loss": 0.5647,
174
  "step": 1000
175
  },
176
  {
177
- "epoch": 0.6666666666666666,
178
- "eval_loss": 0.5663638710975647,
179
- "eval_runtime": 80.8183,
180
- "eval_samples_per_second": 37.12,
181
- "eval_steps_per_second": 9.28,
182
  "step": 1000
183
  },
184
  {
185
- "epoch": 0.7,
186
- "grad_norm": 0.41202324628829956,
187
- "learning_rate": 0.00015686363636363638,
188
- "loss": 0.6118,
189
  "step": 1050
190
  },
191
  {
192
- "epoch": 0.7333333333333333,
193
- "grad_norm": 0.3883333206176758,
194
- "learning_rate": 0.0001545909090909091,
195
- "loss": 0.5392,
196
  "step": 1100
197
  },
198
  {
199
- "epoch": 0.7666666666666667,
200
- "grad_norm": 0.31973451375961304,
201
- "learning_rate": 0.00015231818181818182,
202
- "loss": 0.5602,
203
  "step": 1150
204
  },
205
  {
206
- "epoch": 0.8,
207
- "grad_norm": 0.31378698348999023,
208
- "learning_rate": 0.00015004545454545454,
209
- "loss": 0.5642,
210
  "step": 1200
211
  },
212
  {
213
- "epoch": 0.8333333333333334,
214
- "grad_norm": 0.3346308171749115,
215
- "learning_rate": 0.0001477727272727273,
216
- "loss": 0.5925,
217
  "step": 1250
218
  },
219
  {
220
- "epoch": 0.8333333333333334,
221
- "eval_loss": 0.5619704723358154,
222
- "eval_runtime": 80.824,
223
- "eval_samples_per_second": 37.118,
224
- "eval_steps_per_second": 9.279,
225
  "step": 1250
226
  },
227
  {
228
- "epoch": 0.8666666666666667,
229
- "grad_norm": 0.5573959946632385,
230
- "learning_rate": 0.0001455,
231
- "loss": 0.5829,
232
  "step": 1300
233
  },
234
  {
235
- "epoch": 0.9,
236
- "grad_norm": 0.36054643988609314,
237
- "learning_rate": 0.00014322727272727273,
238
- "loss": 0.5923,
239
  "step": 1350
240
  },
241
  {
242
- "epoch": 0.9333333333333333,
243
- "grad_norm": 0.36059027910232544,
244
- "learning_rate": 0.00014095454545454546,
245
- "loss": 0.5808,
246
  "step": 1400
247
  },
248
  {
249
- "epoch": 0.9666666666666667,
250
- "grad_norm": 0.3942534327507019,
251
- "learning_rate": 0.00013868181818181818,
252
- "loss": 0.5597,
253
  "step": 1450
254
  },
255
  {
256
- "epoch": 1.0,
257
- "grad_norm": 0.3995835483074188,
258
- "learning_rate": 0.0001364090909090909,
259
- "loss": 0.5554,
260
  "step": 1500
261
  },
262
  {
263
- "epoch": 1.0,
264
- "eval_loss": 0.5581239461898804,
265
- "eval_runtime": 80.8326,
266
- "eval_samples_per_second": 37.114,
267
- "eval_steps_per_second": 9.278,
268
  "step": 1500
269
  },
270
  {
271
- "epoch": 1.0333333333333334,
272
- "grad_norm": 0.3405410051345825,
273
- "learning_rate": 0.00013413636363636365,
274
- "loss": 0.5571,
275
  "step": 1550
276
  },
277
  {
278
- "epoch": 1.0666666666666667,
279
- "grad_norm": 0.4485073983669281,
280
- "learning_rate": 0.00013186363636363637,
281
- "loss": 0.5674,
282
  "step": 1600
283
  },
284
  {
285
- "epoch": 1.1,
286
- "grad_norm": 0.34938374161720276,
287
- "learning_rate": 0.0001295909090909091,
288
- "loss": 0.5354,
289
  "step": 1650
290
  },
291
  {
292
- "epoch": 1.1333333333333333,
293
- "grad_norm": 0.33084195852279663,
294
- "learning_rate": 0.00012731818181818184,
295
- "loss": 0.5765,
296
  "step": 1700
297
  },
298
  {
299
- "epoch": 1.1666666666666667,
300
- "grad_norm": 0.3667336404323578,
301
- "learning_rate": 0.00012504545454545456,
302
- "loss": 0.5486,
303
  "step": 1750
304
  },
305
  {
306
- "epoch": 1.1666666666666667,
307
- "eval_loss": 0.5557209253311157,
308
- "eval_runtime": 80.8386,
309
- "eval_samples_per_second": 37.111,
310
- "eval_steps_per_second": 9.278,
311
  "step": 1750
312
  },
313
  {
314
- "epoch": 1.2,
315
- "grad_norm": 0.33248019218444824,
316
- "learning_rate": 0.00012277272727272728,
317
- "loss": 0.5617,
318
  "step": 1800
319
  },
320
  {
321
- "epoch": 1.2333333333333334,
322
- "grad_norm": 0.4447474479675293,
323
- "learning_rate": 0.00012050000000000002,
324
- "loss": 0.567,
325
  "step": 1850
326
  },
327
  {
328
- "epoch": 1.2666666666666666,
329
- "grad_norm": 0.42134660482406616,
330
- "learning_rate": 0.00011822727272727274,
331
- "loss": 0.5319,
332
  "step": 1900
333
  },
334
  {
335
- "epoch": 1.3,
336
- "grad_norm": 0.3942984640598297,
337
- "learning_rate": 0.00011595454545454544,
338
- "loss": 0.5325,
339
  "step": 1950
340
  },
341
  {
342
- "epoch": 1.3333333333333333,
343
- "grad_norm": 0.4929428696632385,
344
- "learning_rate": 0.00011368181818181818,
345
- "loss": 0.5565,
346
  "step": 2000
347
  },
348
  {
349
- "epoch": 1.3333333333333333,
350
- "eval_loss": 0.5535863637924194,
351
- "eval_runtime": 80.8279,
352
- "eval_samples_per_second": 37.116,
353
- "eval_steps_per_second": 9.279,
354
  "step": 2000
355
  },
356
  {
357
- "epoch": 1.3666666666666667,
358
- "grad_norm": 0.4141586720943451,
359
- "learning_rate": 0.00011140909090909091,
360
- "loss": 0.5801,
361
  "step": 2050
362
  },
363
  {
364
- "epoch": 1.4,
365
- "grad_norm": 0.45937269926071167,
366
- "learning_rate": 0.00010913636363636364,
367
- "loss": 0.5439,
368
  "step": 2100
369
  },
370
  {
371
- "epoch": 1.4333333333333333,
372
- "grad_norm": 0.47830042243003845,
373
- "learning_rate": 0.00010686363636363637,
374
- "loss": 0.547,
375
  "step": 2150
376
  },
377
  {
378
- "epoch": 1.4666666666666668,
379
- "grad_norm": 0.40260276198387146,
380
- "learning_rate": 0.00010459090909090909,
381
- "loss": 0.5229,
382
  "step": 2200
383
  },
384
  {
385
- "epoch": 1.5,
386
- "grad_norm": 0.5281402468681335,
387
- "learning_rate": 0.00010231818181818183,
388
- "loss": 0.5475,
389
  "step": 2250
390
  },
391
  {
392
- "epoch": 1.5,
393
- "eval_loss": 0.5505018830299377,
394
- "eval_runtime": 80.8409,
395
- "eval_samples_per_second": 37.11,
396
- "eval_steps_per_second": 9.277,
397
  "step": 2250
398
  },
399
  {
400
- "epoch": 1.5333333333333332,
401
- "grad_norm": 0.3721947968006134,
402
- "learning_rate": 0.00010004545454545455,
403
- "loss": 0.5466,
404
  "step": 2300
405
  },
406
  {
407
- "epoch": 1.5666666666666667,
408
- "grad_norm": 0.3462945818901062,
409
- "learning_rate": 9.777272727272728e-05,
410
- "loss": 0.5209,
411
  "step": 2350
412
  },
413
  {
414
- "epoch": 1.6,
415
- "grad_norm": 0.4027090072631836,
416
- "learning_rate": 9.55e-05,
417
- "loss": 0.5307,
418
  "step": 2400
419
  },
420
  {
421
- "epoch": 1.6333333333333333,
422
- "grad_norm": 0.3684265613555908,
423
- "learning_rate": 9.322727272727273e-05,
424
- "loss": 0.5118,
425
  "step": 2450
426
  },
427
  {
428
- "epoch": 1.6666666666666665,
429
- "grad_norm": 0.4819887578487396,
430
- "learning_rate": 9.095454545454546e-05,
431
- "loss": 0.561,
432
  "step": 2500
433
  },
434
  {
435
- "epoch": 1.6666666666666665,
436
- "eval_loss": 0.5476261377334595,
437
- "eval_runtime": 80.8288,
438
- "eval_samples_per_second": 37.115,
439
- "eval_steps_per_second": 9.279,
440
  "step": 2500
441
  }
442
  ],
443
  "logging_steps": 50,
444
- "max_steps": 4500,
445
  "num_input_tokens_seen": 0,
446
  "num_train_epochs": 3,
447
  "save_steps": 500,
@@ -457,7 +457,7 @@
457
  "attributes": {}
458
  }
459
  },
460
- "total_flos": 3.061836360125645e+17,
461
  "train_batch_size": 4,
462
  "trial_name": null,
463
  "trial_params": null
 
1
  {
2
  "best_global_step": 2500,
3
+ "best_metric": 0.675748348236084,
4
+ "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-2500",
5
+ "epoch": 1.0356365896612452,
6
  "eval_steps": 250,
7
  "global_step": 2500,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.020718947477468146,
14
+ "grad_norm": 1.0589393377304077,
15
  "learning_rate": 9.8e-05,
16
+ "loss": 2.6567,
17
  "step": 50
18
  },
19
  {
20
+ "epoch": 0.04143789495493629,
21
+ "grad_norm": 0.9738045334815979,
22
  "learning_rate": 0.00019800000000000002,
23
+ "loss": 0.9502,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.062156842432404436,
28
+ "grad_norm": 0.8801347017288208,
29
+ "learning_rate": 0.00019862783534024082,
30
+ "loss": 0.8496,
31
  "step": 150
32
  },
33
  {
34
+ "epoch": 0.08287578990987259,
35
+ "grad_norm": 0.7272312045097351,
36
+ "learning_rate": 0.00019722766732007841,
37
+ "loss": 0.8184,
38
  "step": 200
39
  },
40
  {
41
+ "epoch": 0.10359473738734072,
42
+ "grad_norm": 0.7850629091262817,
43
+ "learning_rate": 0.000195827499299916,
44
+ "loss": 0.8392,
45
  "step": 250
46
  },
47
  {
48
+ "epoch": 0.10359473738734072,
49
+ "eval_loss": 0.7402811050415039,
50
+ "eval_runtime": 85.9367,
51
+ "eval_samples_per_second": 56.158,
52
+ "eval_steps_per_second": 14.045,
53
  "step": 250
54
  },
55
  {
56
+ "epoch": 0.12431368486480887,
57
+ "grad_norm": 0.40629276633262634,
58
+ "learning_rate": 0.00019442733127975358,
59
+ "loss": 0.8108,
60
  "step": 300
61
  },
62
  {
63
+ "epoch": 0.145032632342277,
64
+ "grad_norm": 0.5258236527442932,
65
+ "learning_rate": 0.00019302716325959117,
66
+ "loss": 0.8116,
67
  "step": 350
68
  },
69
  {
70
+ "epoch": 0.16575157981974517,
71
+ "grad_norm": 0.6879925727844238,
72
+ "learning_rate": 0.00019162699523942874,
73
+ "loss": 0.9089,
74
  "step": 400
75
  },
76
  {
77
+ "epoch": 0.1864705272972133,
78
+ "grad_norm": 0.7583937048912048,
79
+ "learning_rate": 0.00019022682721926633,
80
+ "loss": 0.874,
81
  "step": 450
82
  },
83
  {
84
+ "epoch": 0.20718947477468144,
85
+ "grad_norm": 0.6399120688438416,
86
+ "learning_rate": 0.0001888266591991039,
87
+ "loss": 0.8366,
88
  "step": 500
89
  },
90
  {
91
+ "epoch": 0.20718947477468144,
92
+ "eval_loss": 0.7194066047668457,
93
+ "eval_runtime": 86.2811,
94
+ "eval_samples_per_second": 55.933,
95
+ "eval_steps_per_second": 13.989,
96
  "step": 500
97
  },
98
  {
99
+ "epoch": 0.22790842225214958,
100
+ "grad_norm": 0.7763131856918335,
101
+ "learning_rate": 0.0001874264911789415,
102
+ "loss": 0.7912,
103
  "step": 550
104
  },
105
  {
106
+ "epoch": 0.24862736972961774,
107
+ "grad_norm": 0.6845299601554871,
108
+ "learning_rate": 0.00018602632315877906,
109
+ "loss": 0.8506,
110
  "step": 600
111
  },
112
  {
113
+ "epoch": 0.2693463172070859,
114
+ "grad_norm": 0.8045451045036316,
115
+ "learning_rate": 0.00018462615513861665,
116
+ "loss": 0.763,
117
  "step": 650
118
  },
119
  {
120
+ "epoch": 0.290065264684554,
121
+ "grad_norm": 0.7035927176475525,
122
+ "learning_rate": 0.00018322598711845422,
123
+ "loss": 0.7769,
124
  "step": 700
125
  },
126
  {
127
+ "epoch": 0.3107842121620222,
128
+ "grad_norm": 0.465000718832016,
129
+ "learning_rate": 0.00018182581909829179,
130
+ "loss": 0.7705,
131
  "step": 750
132
  },
133
  {
134
+ "epoch": 0.3107842121620222,
135
+ "eval_loss": 0.7103215456008911,
136
+ "eval_runtime": 86.1101,
137
+ "eval_samples_per_second": 56.045,
138
+ "eval_steps_per_second": 14.017,
139
  "step": 750
140
  },
141
  {
142
+ "epoch": 0.33150315963949034,
143
+ "grad_norm": 0.4990151524543762,
144
+ "learning_rate": 0.00018042565107812938,
145
+ "loss": 0.8438,
146
  "step": 800
147
  },
148
  {
149
+ "epoch": 0.35222210711695845,
150
+ "grad_norm": 0.7391067147254944,
151
+ "learning_rate": 0.00017902548305796695,
152
+ "loss": 0.7688,
153
  "step": 850
154
  },
155
  {
156
+ "epoch": 0.3729410545944266,
157
+ "grad_norm": 0.8036171197891235,
158
+ "learning_rate": 0.00017762531503780454,
159
+ "loss": 0.753,
160
  "step": 900
161
  },
162
  {
163
+ "epoch": 0.3936600020718947,
164
+ "grad_norm": 0.44744470715522766,
165
+ "learning_rate": 0.00017622514701764213,
166
+ "loss": 0.7793,
167
  "step": 950
168
  },
169
  {
170
+ "epoch": 0.4143789495493629,
171
+ "grad_norm": 0.630820631980896,
172
+ "learning_rate": 0.00017482497899747973,
173
+ "loss": 0.7555,
174
  "step": 1000
175
  },
176
  {
177
+ "epoch": 0.4143789495493629,
178
+ "eval_loss": 0.7030432820320129,
179
+ "eval_runtime": 86.2543,
180
+ "eval_samples_per_second": 55.951,
181
+ "eval_steps_per_second": 13.994,
182
  "step": 1000
183
  },
184
  {
185
+ "epoch": 0.43509789702683105,
186
+ "grad_norm": 0.45690879225730896,
187
+ "learning_rate": 0.0001734248109773173,
188
+ "loss": 0.793,
189
  "step": 1050
190
  },
191
  {
192
+ "epoch": 0.45581684450429916,
193
+ "grad_norm": 0.5000227093696594,
194
+ "learning_rate": 0.00017202464295715486,
195
+ "loss": 0.8342,
196
  "step": 1100
197
  },
198
  {
199
+ "epoch": 0.4765357919817673,
200
+ "grad_norm": 0.47182488441467285,
201
+ "learning_rate": 0.00017062447493699246,
202
+ "loss": 0.7997,
203
  "step": 1150
204
  },
205
  {
206
+ "epoch": 0.4972547394592355,
207
+ "grad_norm": 0.7060516476631165,
208
+ "learning_rate": 0.00016922430691683002,
209
+ "loss": 0.7788,
210
  "step": 1200
211
  },
212
  {
213
+ "epoch": 0.5179736869367036,
214
+ "grad_norm": 0.46701857447624207,
215
+ "learning_rate": 0.00016782413889666762,
216
+ "loss": 0.7518,
217
  "step": 1250
218
  },
219
  {
220
+ "epoch": 0.5179736869367036,
221
+ "eval_loss": 0.7023425698280334,
222
+ "eval_runtime": 86.3015,
223
+ "eval_samples_per_second": 55.92,
224
+ "eval_steps_per_second": 13.986,
225
  "step": 1250
226
  },
227
  {
228
+ "epoch": 0.5386926344141718,
229
+ "grad_norm": 0.668192446231842,
230
+ "learning_rate": 0.00016642397087650518,
231
+ "loss": 0.7682,
232
  "step": 1300
233
  },
234
  {
235
+ "epoch": 0.5594115818916399,
236
+ "grad_norm": 0.47292283177375793,
237
+ "learning_rate": 0.00016502380285634278,
238
+ "loss": 0.7985,
239
  "step": 1350
240
  },
241
  {
242
+ "epoch": 0.580130529369108,
243
+ "grad_norm": 0.7327275276184082,
244
+ "learning_rate": 0.00016362363483618034,
245
+ "loss": 0.8378,
246
  "step": 1400
247
  },
248
  {
249
+ "epoch": 0.6008494768465762,
250
+ "grad_norm": 0.8417996764183044,
251
+ "learning_rate": 0.0001622234668160179,
252
+ "loss": 0.7962,
253
  "step": 1450
254
  },
255
  {
256
+ "epoch": 0.6215684243240444,
257
+ "grad_norm": 0.6189562678337097,
258
+ "learning_rate": 0.0001608232987958555,
259
+ "loss": 0.8028,
260
  "step": 1500
261
  },
262
  {
263
+ "epoch": 0.6215684243240444,
264
+ "eval_loss": 0.6915447115898132,
265
+ "eval_runtime": 86.2147,
266
+ "eval_samples_per_second": 55.977,
267
+ "eval_steps_per_second": 14.0,
268
  "step": 1500
269
  },
270
  {
271
+ "epoch": 0.6422873718015125,
272
+ "grad_norm": 0.7345826625823975,
273
+ "learning_rate": 0.0001594231307756931,
274
+ "loss": 0.7978,
275
  "step": 1550
276
  },
277
  {
278
+ "epoch": 0.6630063192789807,
279
+ "grad_norm": 0.6538310050964355,
280
+ "learning_rate": 0.0001580229627555307,
281
+ "loss": 0.7672,
282
  "step": 1600
283
  },
284
  {
285
+ "epoch": 0.6837252667564487,
286
+ "grad_norm": 0.661582350730896,
287
+ "learning_rate": 0.00015662279473536826,
288
+ "loss": 0.7378,
289
  "step": 1650
290
  },
291
  {
292
+ "epoch": 0.7044442142339169,
293
+ "grad_norm": 0.3603042960166931,
294
+ "learning_rate": 0.00015522262671520583,
295
+ "loss": 0.6741,
296
  "step": 1700
297
  },
298
  {
299
+ "epoch": 0.7251631617113851,
300
+ "grad_norm": 0.8882561326026917,
301
+ "learning_rate": 0.00015382245869504342,
302
+ "loss": 0.7695,
303
  "step": 1750
304
  },
305
  {
306
+ "epoch": 0.7251631617113851,
307
+ "eval_loss": 0.6858941316604614,
308
+ "eval_runtime": 86.6358,
309
+ "eval_samples_per_second": 55.704,
310
+ "eval_steps_per_second": 13.932,
311
  "step": 1750
312
  },
313
  {
314
+ "epoch": 0.7458821091888532,
315
+ "grad_norm": 0.5933266282081604,
316
+ "learning_rate": 0.000152422290674881,
317
+ "loss": 0.7548,
318
  "step": 1800
319
  },
320
  {
321
+ "epoch": 0.7666010566663214,
322
+ "grad_norm": 0.8178608417510986,
323
+ "learning_rate": 0.00015102212265471858,
324
+ "loss": 0.7639,
325
  "step": 1850
326
  },
327
  {
328
+ "epoch": 0.7873200041437894,
329
+ "grad_norm": 0.4378993511199951,
330
+ "learning_rate": 0.00014962195463455615,
331
+ "loss": 0.7985,
332
  "step": 1900
333
  },
334
  {
335
+ "epoch": 0.8080389516212576,
336
+ "grad_norm": 0.3732803463935852,
337
+ "learning_rate": 0.00014822178661439374,
338
+ "loss": 0.8481,
339
  "step": 1950
340
  },
341
  {
342
+ "epoch": 0.8287578990987258,
343
+ "grad_norm": 0.7421035170555115,
344
+ "learning_rate": 0.0001468216185942313,
345
+ "loss": 0.7223,
346
  "step": 2000
347
  },
348
  {
349
+ "epoch": 0.8287578990987258,
350
+ "eval_loss": 0.6823315024375916,
351
+ "eval_runtime": 86.5575,
352
+ "eval_samples_per_second": 55.755,
353
+ "eval_steps_per_second": 13.944,
354
  "step": 2000
355
  },
356
  {
357
+ "epoch": 0.8494768465761939,
358
+ "grad_norm": 0.5109913349151611,
359
+ "learning_rate": 0.00014542145057406888,
360
+ "loss": 0.7895,
361
  "step": 2050
362
  },
363
  {
364
+ "epoch": 0.8701957940536621,
365
+ "grad_norm": 0.47988179326057434,
366
+ "learning_rate": 0.00014402128255390647,
367
+ "loss": 0.7385,
368
  "step": 2100
369
  },
370
  {
371
+ "epoch": 0.8909147415311303,
372
+ "grad_norm": 0.7593080997467041,
373
+ "learning_rate": 0.00014262111453374404,
374
+ "loss": 0.7744,
375
  "step": 2150
376
  },
377
  {
378
+ "epoch": 0.9116336890085983,
379
+ "grad_norm": 0.5866154432296753,
380
+ "learning_rate": 0.00014122094651358163,
381
+ "loss": 0.7062,
382
  "step": 2200
383
  },
384
  {
385
+ "epoch": 0.9323526364860665,
386
+ "grad_norm": 0.47364088892936707,
387
+ "learning_rate": 0.00013982077849341922,
388
+ "loss": 0.7792,
389
  "step": 2250
390
  },
391
  {
392
+ "epoch": 0.9323526364860665,
393
+ "eval_loss": 0.6785813570022583,
394
+ "eval_runtime": 86.3444,
395
+ "eval_samples_per_second": 55.892,
396
+ "eval_steps_per_second": 13.979,
397
  "step": 2250
398
  },
399
  {
400
+ "epoch": 0.9530715839635346,
401
+ "grad_norm": 0.7610514760017395,
402
+ "learning_rate": 0.00013842061047325682,
403
+ "loss": 0.7804,
404
  "step": 2300
405
  },
406
  {
407
+ "epoch": 0.9737905314410028,
408
+ "grad_norm": 0.7689616084098816,
409
+ "learning_rate": 0.00013702044245309438,
410
+ "loss": 0.7497,
411
  "step": 2350
412
  },
413
  {
414
+ "epoch": 0.994509478918471,
415
+ "grad_norm": 0.542168378829956,
416
+ "learning_rate": 0.00013562027443293195,
417
+ "loss": 0.7333,
418
  "step": 2400
419
  },
420
  {
421
+ "epoch": 1.0149176421837771,
422
+ "grad_norm": 0.33903324604034424,
423
+ "learning_rate": 0.0001342481097731728,
424
+ "loss": 0.6952,
425
  "step": 2450
426
  },
427
  {
428
+ "epoch": 1.0356365896612452,
429
+ "grad_norm": 0.8183636665344238,
430
+ "learning_rate": 0.00013284794175301036,
431
+ "loss": 0.7386,
432
  "step": 2500
433
  },
434
  {
435
+ "epoch": 1.0356365896612452,
436
+ "eval_loss": 0.675748348236084,
437
+ "eval_runtime": 86.2887,
438
+ "eval_samples_per_second": 55.929,
439
+ "eval_steps_per_second": 13.988,
440
  "step": 2500
441
  }
442
  ],
443
  "logging_steps": 50,
444
+ "max_steps": 7242,
445
  "num_input_tokens_seen": 0,
446
  "num_train_epochs": 3,
447
  "save_steps": 500,
 
457
  "attributes": {}
458
  }
459
  },
460
+ "total_flos": 1.5737739501748224e+17,
461
  "train_batch_size": 4,
462
  "trial_name": null,
463
  "trial_params": null
checkpoint-2500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d
3
  size 5432
checkpoint-3000/adapter_config.json CHANGED
@@ -29,9 +29,9 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "o_proj",
33
  "v_proj",
34
- "q_proj",
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "q_proj",
33
  "o_proj",
34
  "v_proj",
 
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
checkpoint-3000/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8737189ec50534340f940487b7bbcfbb3c0341cdc991f458aa11988b0dcf614e
3
  size 54560368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6cea713a82dfa53e4225af27dadf62a79d6d173e0e322110ef4080d4150c823b
3
  size 54560368
checkpoint-3000/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:39f0f99c70b766de881416221876b07b78545e8c0e5a126b92f0fa687a983694
3
  size 109267450
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3acc6fa6243a9ec00f9b0e375b237bf0f64023ebb64d44703bbfd65f25a2f895
3
  size 109267450
checkpoint-3000/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:08b64b36dce5f25b027b7d960504594585ac14a5c1168ea02281c808e279d651
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fa346dc61aa799e0160013066342f483bcb52c5551441757ad69edfbabf48bb0
3
  size 14244
checkpoint-3000/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:21aba8ed0f38ed1c04994c10a9ca7e9925e55ef2ed51283c43ff8e2cce78585f
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fab881b6261b7765de00aaece9d42aeb004a99a034f6ff76b068724f6121a7ec
3
  size 988
checkpoint-3000/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d6b22153a6004ee7569e1ad90f415ae5727df20ad97a541ace0b82f7edb0c83a
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06a8d96703998223bf2cf655698a26277cad9e4925693c4c21a22c01308a5a11
3
  size 1064
checkpoint-3000/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
  "best_global_step": 3000,
3
- "best_metric": 0.5436099171638489,
4
- "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-3000",
5
- "epoch": 2.0,
6
  "eval_steps": 250,
7
  "global_step": 3000,
8
  "is_hyper_param_search": false,
@@ -10,524 +10,524 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.03333333333333333,
14
- "grad_norm": 0.5346225500106812,
15
  "learning_rate": 9.8e-05,
16
- "loss": 2.4955,
17
  "step": 50
18
  },
19
  {
20
- "epoch": 0.06666666666666667,
21
- "grad_norm": 0.719093918800354,
22
  "learning_rate": 0.00019800000000000002,
23
- "loss": 0.71,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.1,
28
- "grad_norm": 0.4840560853481293,
29
- "learning_rate": 0.0001977727272727273,
30
- "loss": 0.6405,
31
  "step": 150
32
  },
33
  {
34
- "epoch": 0.13333333333333333,
35
- "grad_norm": 0.3332301676273346,
36
- "learning_rate": 0.0001955,
37
- "loss": 0.6287,
38
  "step": 200
39
  },
40
  {
41
- "epoch": 0.16666666666666666,
42
- "grad_norm": 0.40639588236808777,
43
- "learning_rate": 0.00019322727272727276,
44
- "loss": 0.5572,
45
  "step": 250
46
  },
47
  {
48
- "epoch": 0.16666666666666666,
49
- "eval_loss": 0.5975945591926575,
50
- "eval_runtime": 80.8004,
51
- "eval_samples_per_second": 37.129,
52
- "eval_steps_per_second": 9.282,
53
  "step": 250
54
  },
55
  {
56
- "epoch": 0.2,
57
- "grad_norm": 0.3970712423324585,
58
- "learning_rate": 0.00019095454545454545,
59
- "loss": 0.6165,
60
  "step": 300
61
  },
62
  {
63
- "epoch": 0.23333333333333334,
64
- "grad_norm": 0.38409528136253357,
65
- "learning_rate": 0.00018868181818181817,
66
- "loss": 0.639,
67
  "step": 350
68
  },
69
  {
70
- "epoch": 0.26666666666666666,
71
- "grad_norm": 0.44628769159317017,
72
- "learning_rate": 0.00018640909090909092,
73
- "loss": 0.636,
74
  "step": 400
75
  },
76
  {
77
- "epoch": 0.3,
78
- "grad_norm": 0.3697021007537842,
79
- "learning_rate": 0.00018413636363636364,
80
- "loss": 0.6192,
81
  "step": 450
82
  },
83
  {
84
- "epoch": 0.3333333333333333,
85
- "grad_norm": 0.36338189244270325,
86
- "learning_rate": 0.00018186363636363636,
87
- "loss": 0.6134,
88
  "step": 500
89
  },
90
  {
91
- "epoch": 0.3333333333333333,
92
- "eval_loss": 0.5813060998916626,
93
- "eval_runtime": 80.7819,
94
- "eval_samples_per_second": 37.137,
95
- "eval_steps_per_second": 9.284,
96
  "step": 500
97
  },
98
  {
99
- "epoch": 0.36666666666666664,
100
- "grad_norm": 0.35211533308029175,
101
- "learning_rate": 0.0001795909090909091,
102
- "loss": 0.6128,
103
  "step": 550
104
  },
105
  {
106
- "epoch": 0.4,
107
- "grad_norm": 0.36327463388442993,
108
- "learning_rate": 0.00017731818181818183,
109
- "loss": 0.5915,
110
  "step": 600
111
  },
112
  {
113
- "epoch": 0.43333333333333335,
114
- "grad_norm": 0.40672942996025085,
115
- "learning_rate": 0.00017504545454545455,
116
- "loss": 0.5807,
117
  "step": 650
118
  },
119
  {
120
- "epoch": 0.4666666666666667,
121
- "grad_norm": 0.4689007103443146,
122
- "learning_rate": 0.00017277272727272728,
123
- "loss": 0.602,
124
  "step": 700
125
  },
126
  {
127
- "epoch": 0.5,
128
- "grad_norm": 0.3979697823524475,
129
- "learning_rate": 0.00017050000000000002,
130
- "loss": 0.5703,
131
  "step": 750
132
  },
133
  {
134
- "epoch": 0.5,
135
- "eval_loss": 0.5740106701850891,
136
- "eval_runtime": 80.8209,
137
- "eval_samples_per_second": 37.119,
138
- "eval_steps_per_second": 9.28,
139
  "step": 750
140
  },
141
  {
142
- "epoch": 0.5333333333333333,
143
- "grad_norm": 0.3071135878562927,
144
- "learning_rate": 0.00016822727272727275,
145
- "loss": 0.5746,
146
  "step": 800
147
  },
148
  {
149
- "epoch": 0.5666666666666667,
150
- "grad_norm": 0.318085253238678,
151
- "learning_rate": 0.00016595454545454544,
152
- "loss": 0.5873,
153
  "step": 850
154
  },
155
  {
156
- "epoch": 0.6,
157
- "grad_norm": 0.35915374755859375,
158
- "learning_rate": 0.0001636818181818182,
159
- "loss": 0.6283,
160
  "step": 900
161
  },
162
  {
163
- "epoch": 0.6333333333333333,
164
- "grad_norm": 0.3174057602882385,
165
- "learning_rate": 0.0001614090909090909,
166
- "loss": 0.5912,
167
  "step": 950
168
  },
169
  {
170
- "epoch": 0.6666666666666666,
171
- "grad_norm": 0.416111022233963,
172
- "learning_rate": 0.00015913636363636363,
173
- "loss": 0.5647,
174
  "step": 1000
175
  },
176
  {
177
- "epoch": 0.6666666666666666,
178
- "eval_loss": 0.5663638710975647,
179
- "eval_runtime": 80.8183,
180
- "eval_samples_per_second": 37.12,
181
- "eval_steps_per_second": 9.28,
182
  "step": 1000
183
  },
184
  {
185
- "epoch": 0.7,
186
- "grad_norm": 0.41202324628829956,
187
- "learning_rate": 0.00015686363636363638,
188
- "loss": 0.6118,
189
  "step": 1050
190
  },
191
  {
192
- "epoch": 0.7333333333333333,
193
- "grad_norm": 0.3883333206176758,
194
- "learning_rate": 0.0001545909090909091,
195
- "loss": 0.5392,
196
  "step": 1100
197
  },
198
  {
199
- "epoch": 0.7666666666666667,
200
- "grad_norm": 0.31973451375961304,
201
- "learning_rate": 0.00015231818181818182,
202
- "loss": 0.5602,
203
  "step": 1150
204
  },
205
  {
206
- "epoch": 0.8,
207
- "grad_norm": 0.31378698348999023,
208
- "learning_rate": 0.00015004545454545454,
209
- "loss": 0.5642,
210
  "step": 1200
211
  },
212
  {
213
- "epoch": 0.8333333333333334,
214
- "grad_norm": 0.3346308171749115,
215
- "learning_rate": 0.0001477727272727273,
216
- "loss": 0.5925,
217
  "step": 1250
218
  },
219
  {
220
- "epoch": 0.8333333333333334,
221
- "eval_loss": 0.5619704723358154,
222
- "eval_runtime": 80.824,
223
- "eval_samples_per_second": 37.118,
224
- "eval_steps_per_second": 9.279,
225
  "step": 1250
226
  },
227
  {
228
- "epoch": 0.8666666666666667,
229
- "grad_norm": 0.5573959946632385,
230
- "learning_rate": 0.0001455,
231
- "loss": 0.5829,
232
  "step": 1300
233
  },
234
  {
235
- "epoch": 0.9,
236
- "grad_norm": 0.36054643988609314,
237
- "learning_rate": 0.00014322727272727273,
238
- "loss": 0.5923,
239
  "step": 1350
240
  },
241
  {
242
- "epoch": 0.9333333333333333,
243
- "grad_norm": 0.36059027910232544,
244
- "learning_rate": 0.00014095454545454546,
245
- "loss": 0.5808,
246
  "step": 1400
247
  },
248
  {
249
- "epoch": 0.9666666666666667,
250
- "grad_norm": 0.3942534327507019,
251
- "learning_rate": 0.00013868181818181818,
252
- "loss": 0.5597,
253
  "step": 1450
254
  },
255
  {
256
- "epoch": 1.0,
257
- "grad_norm": 0.3995835483074188,
258
- "learning_rate": 0.0001364090909090909,
259
- "loss": 0.5554,
260
  "step": 1500
261
  },
262
  {
263
- "epoch": 1.0,
264
- "eval_loss": 0.5581239461898804,
265
- "eval_runtime": 80.8326,
266
- "eval_samples_per_second": 37.114,
267
- "eval_steps_per_second": 9.278,
268
  "step": 1500
269
  },
270
  {
271
- "epoch": 1.0333333333333334,
272
- "grad_norm": 0.3405410051345825,
273
- "learning_rate": 0.00013413636363636365,
274
- "loss": 0.5571,
275
  "step": 1550
276
  },
277
  {
278
- "epoch": 1.0666666666666667,
279
- "grad_norm": 0.4485073983669281,
280
- "learning_rate": 0.00013186363636363637,
281
- "loss": 0.5674,
282
  "step": 1600
283
  },
284
  {
285
- "epoch": 1.1,
286
- "grad_norm": 0.34938374161720276,
287
- "learning_rate": 0.0001295909090909091,
288
- "loss": 0.5354,
289
  "step": 1650
290
  },
291
  {
292
- "epoch": 1.1333333333333333,
293
- "grad_norm": 0.33084195852279663,
294
- "learning_rate": 0.00012731818181818184,
295
- "loss": 0.5765,
296
  "step": 1700
297
  },
298
  {
299
- "epoch": 1.1666666666666667,
300
- "grad_norm": 0.3667336404323578,
301
- "learning_rate": 0.00012504545454545456,
302
- "loss": 0.5486,
303
  "step": 1750
304
  },
305
  {
306
- "epoch": 1.1666666666666667,
307
- "eval_loss": 0.5557209253311157,
308
- "eval_runtime": 80.8386,
309
- "eval_samples_per_second": 37.111,
310
- "eval_steps_per_second": 9.278,
311
  "step": 1750
312
  },
313
  {
314
- "epoch": 1.2,
315
- "grad_norm": 0.33248019218444824,
316
- "learning_rate": 0.00012277272727272728,
317
- "loss": 0.5617,
318
  "step": 1800
319
  },
320
  {
321
- "epoch": 1.2333333333333334,
322
- "grad_norm": 0.4447474479675293,
323
- "learning_rate": 0.00012050000000000002,
324
- "loss": 0.567,
325
  "step": 1850
326
  },
327
  {
328
- "epoch": 1.2666666666666666,
329
- "grad_norm": 0.42134660482406616,
330
- "learning_rate": 0.00011822727272727274,
331
- "loss": 0.5319,
332
  "step": 1900
333
  },
334
  {
335
- "epoch": 1.3,
336
- "grad_norm": 0.3942984640598297,
337
- "learning_rate": 0.00011595454545454544,
338
- "loss": 0.5325,
339
  "step": 1950
340
  },
341
  {
342
- "epoch": 1.3333333333333333,
343
- "grad_norm": 0.4929428696632385,
344
- "learning_rate": 0.00011368181818181818,
345
- "loss": 0.5565,
346
  "step": 2000
347
  },
348
  {
349
- "epoch": 1.3333333333333333,
350
- "eval_loss": 0.5535863637924194,
351
- "eval_runtime": 80.8279,
352
- "eval_samples_per_second": 37.116,
353
- "eval_steps_per_second": 9.279,
354
  "step": 2000
355
  },
356
  {
357
- "epoch": 1.3666666666666667,
358
- "grad_norm": 0.4141586720943451,
359
- "learning_rate": 0.00011140909090909091,
360
- "loss": 0.5801,
361
  "step": 2050
362
  },
363
  {
364
- "epoch": 1.4,
365
- "grad_norm": 0.45937269926071167,
366
- "learning_rate": 0.00010913636363636364,
367
- "loss": 0.5439,
368
  "step": 2100
369
  },
370
  {
371
- "epoch": 1.4333333333333333,
372
- "grad_norm": 0.47830042243003845,
373
- "learning_rate": 0.00010686363636363637,
374
- "loss": 0.547,
375
  "step": 2150
376
  },
377
  {
378
- "epoch": 1.4666666666666668,
379
- "grad_norm": 0.40260276198387146,
380
- "learning_rate": 0.00010459090909090909,
381
- "loss": 0.5229,
382
  "step": 2200
383
  },
384
  {
385
- "epoch": 1.5,
386
- "grad_norm": 0.5281402468681335,
387
- "learning_rate": 0.00010231818181818183,
388
- "loss": 0.5475,
389
  "step": 2250
390
  },
391
  {
392
- "epoch": 1.5,
393
- "eval_loss": 0.5505018830299377,
394
- "eval_runtime": 80.8409,
395
- "eval_samples_per_second": 37.11,
396
- "eval_steps_per_second": 9.277,
397
  "step": 2250
398
  },
399
  {
400
- "epoch": 1.5333333333333332,
401
- "grad_norm": 0.3721947968006134,
402
- "learning_rate": 0.00010004545454545455,
403
- "loss": 0.5466,
404
  "step": 2300
405
  },
406
  {
407
- "epoch": 1.5666666666666667,
408
- "grad_norm": 0.3462945818901062,
409
- "learning_rate": 9.777272727272728e-05,
410
- "loss": 0.5209,
411
  "step": 2350
412
  },
413
  {
414
- "epoch": 1.6,
415
- "grad_norm": 0.4027090072631836,
416
- "learning_rate": 9.55e-05,
417
- "loss": 0.5307,
418
  "step": 2400
419
  },
420
  {
421
- "epoch": 1.6333333333333333,
422
- "grad_norm": 0.3684265613555908,
423
- "learning_rate": 9.322727272727273e-05,
424
- "loss": 0.5118,
425
  "step": 2450
426
  },
427
  {
428
- "epoch": 1.6666666666666665,
429
- "grad_norm": 0.4819887578487396,
430
- "learning_rate": 9.095454545454546e-05,
431
- "loss": 0.561,
432
  "step": 2500
433
  },
434
  {
435
- "epoch": 1.6666666666666665,
436
- "eval_loss": 0.5476261377334595,
437
- "eval_runtime": 80.8288,
438
- "eval_samples_per_second": 37.115,
439
- "eval_steps_per_second": 9.279,
440
  "step": 2500
441
  },
442
  {
443
- "epoch": 1.7,
444
- "grad_norm": 0.3161783218383789,
445
- "learning_rate": 8.86818181818182e-05,
446
- "loss": 0.5413,
447
  "step": 2550
448
  },
449
  {
450
- "epoch": 1.7333333333333334,
451
- "grad_norm": 0.34697386622428894,
452
- "learning_rate": 8.640909090909092e-05,
453
- "loss": 0.5366,
454
  "step": 2600
455
  },
456
  {
457
- "epoch": 1.7666666666666666,
458
- "grad_norm": 0.4084527790546417,
459
- "learning_rate": 8.413636363636364e-05,
460
- "loss": 0.5426,
461
  "step": 2650
462
  },
463
  {
464
- "epoch": 1.8,
465
- "grad_norm": 0.4053308963775635,
466
- "learning_rate": 8.186363636363636e-05,
467
- "loss": 0.532,
468
  "step": 2700
469
  },
470
  {
471
- "epoch": 1.8333333333333335,
472
- "grad_norm": 0.3551884591579437,
473
- "learning_rate": 7.95909090909091e-05,
474
- "loss": 0.5399,
475
  "step": 2750
476
  },
477
  {
478
- "epoch": 1.8333333333333335,
479
- "eval_loss": 0.546008288860321,
480
- "eval_runtime": 80.8186,
481
- "eval_samples_per_second": 37.12,
482
- "eval_steps_per_second": 9.28,
483
  "step": 2750
484
  },
485
  {
486
- "epoch": 1.8666666666666667,
487
- "grad_norm": 0.40072572231292725,
488
- "learning_rate": 7.731818181818183e-05,
489
- "loss": 0.5332,
490
  "step": 2800
491
  },
492
  {
493
- "epoch": 1.9,
494
- "grad_norm": 0.3773200213909149,
495
- "learning_rate": 7.504545454545455e-05,
496
- "loss": 0.5296,
497
  "step": 2850
498
  },
499
  {
500
- "epoch": 1.9333333333333333,
501
- "grad_norm": 0.45379436016082764,
502
- "learning_rate": 7.277272727272728e-05,
503
- "loss": 0.5356,
504
  "step": 2900
505
  },
506
  {
507
- "epoch": 1.9666666666666668,
508
- "grad_norm": 0.36246028542518616,
509
- "learning_rate": 7.05e-05,
510
- "loss": 0.5112,
511
  "step": 2950
512
  },
513
  {
514
- "epoch": 2.0,
515
- "grad_norm": 0.40895622968673706,
516
- "learning_rate": 6.822727272727273e-05,
517
- "loss": 0.5358,
518
  "step": 3000
519
  },
520
  {
521
- "epoch": 2.0,
522
- "eval_loss": 0.5436099171638489,
523
- "eval_runtime": 80.8207,
524
- "eval_samples_per_second": 37.119,
525
- "eval_steps_per_second": 9.28,
526
  "step": 3000
527
  }
528
  ],
529
  "logging_steps": 50,
530
- "max_steps": 4500,
531
  "num_input_tokens_seen": 0,
532
  "num_train_epochs": 3,
533
  "save_steps": 500,
@@ -543,7 +543,7 @@
543
  "attributes": {}
544
  }
545
  },
546
- "total_flos": 3.6691738985250816e+17,
547
  "train_batch_size": 4,
548
  "trial_name": null,
549
  "trial_params": null
 
1
  {
2
  "best_global_step": 3000,
3
+ "best_metric": 0.6727278828620911,
4
+ "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-3000",
5
+ "epoch": 1.2428260644359266,
6
  "eval_steps": 250,
7
  "global_step": 3000,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.020718947477468146,
14
+ "grad_norm": 1.0589393377304077,
15
  "learning_rate": 9.8e-05,
16
+ "loss": 2.6567,
17
  "step": 50
18
  },
19
  {
20
+ "epoch": 0.04143789495493629,
21
+ "grad_norm": 0.9738045334815979,
22
  "learning_rate": 0.00019800000000000002,
23
+ "loss": 0.9502,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.062156842432404436,
28
+ "grad_norm": 0.8801347017288208,
29
+ "learning_rate": 0.00019862783534024082,
30
+ "loss": 0.8496,
31
  "step": 150
32
  },
33
  {
34
+ "epoch": 0.08287578990987259,
35
+ "grad_norm": 0.7272312045097351,
36
+ "learning_rate": 0.00019722766732007841,
37
+ "loss": 0.8184,
38
  "step": 200
39
  },
40
  {
41
+ "epoch": 0.10359473738734072,
42
+ "grad_norm": 0.7850629091262817,
43
+ "learning_rate": 0.000195827499299916,
44
+ "loss": 0.8392,
45
  "step": 250
46
  },
47
  {
48
+ "epoch": 0.10359473738734072,
49
+ "eval_loss": 0.7402811050415039,
50
+ "eval_runtime": 85.9367,
51
+ "eval_samples_per_second": 56.158,
52
+ "eval_steps_per_second": 14.045,
53
  "step": 250
54
  },
55
  {
56
+ "epoch": 0.12431368486480887,
57
+ "grad_norm": 0.40629276633262634,
58
+ "learning_rate": 0.00019442733127975358,
59
+ "loss": 0.8108,
60
  "step": 300
61
  },
62
  {
63
+ "epoch": 0.145032632342277,
64
+ "grad_norm": 0.5258236527442932,
65
+ "learning_rate": 0.00019302716325959117,
66
+ "loss": 0.8116,
67
  "step": 350
68
  },
69
  {
70
+ "epoch": 0.16575157981974517,
71
+ "grad_norm": 0.6879925727844238,
72
+ "learning_rate": 0.00019162699523942874,
73
+ "loss": 0.9089,
74
  "step": 400
75
  },
76
  {
77
+ "epoch": 0.1864705272972133,
78
+ "grad_norm": 0.7583937048912048,
79
+ "learning_rate": 0.00019022682721926633,
80
+ "loss": 0.874,
81
  "step": 450
82
  },
83
  {
84
+ "epoch": 0.20718947477468144,
85
+ "grad_norm": 0.6399120688438416,
86
+ "learning_rate": 0.0001888266591991039,
87
+ "loss": 0.8366,
88
  "step": 500
89
  },
90
  {
91
+ "epoch": 0.20718947477468144,
92
+ "eval_loss": 0.7194066047668457,
93
+ "eval_runtime": 86.2811,
94
+ "eval_samples_per_second": 55.933,
95
+ "eval_steps_per_second": 13.989,
96
  "step": 500
97
  },
98
  {
99
+ "epoch": 0.22790842225214958,
100
+ "grad_norm": 0.7763131856918335,
101
+ "learning_rate": 0.0001874264911789415,
102
+ "loss": 0.7912,
103
  "step": 550
104
  },
105
  {
106
+ "epoch": 0.24862736972961774,
107
+ "grad_norm": 0.6845299601554871,
108
+ "learning_rate": 0.00018602632315877906,
109
+ "loss": 0.8506,
110
  "step": 600
111
  },
112
  {
113
+ "epoch": 0.2693463172070859,
114
+ "grad_norm": 0.8045451045036316,
115
+ "learning_rate": 0.00018462615513861665,
116
+ "loss": 0.763,
117
  "step": 650
118
  },
119
  {
120
+ "epoch": 0.290065264684554,
121
+ "grad_norm": 0.7035927176475525,
122
+ "learning_rate": 0.00018322598711845422,
123
+ "loss": 0.7769,
124
  "step": 700
125
  },
126
  {
127
+ "epoch": 0.3107842121620222,
128
+ "grad_norm": 0.465000718832016,
129
+ "learning_rate": 0.00018182581909829179,
130
+ "loss": 0.7705,
131
  "step": 750
132
  },
133
  {
134
+ "epoch": 0.3107842121620222,
135
+ "eval_loss": 0.7103215456008911,
136
+ "eval_runtime": 86.1101,
137
+ "eval_samples_per_second": 56.045,
138
+ "eval_steps_per_second": 14.017,
139
  "step": 750
140
  },
141
  {
142
+ "epoch": 0.33150315963949034,
143
+ "grad_norm": 0.4990151524543762,
144
+ "learning_rate": 0.00018042565107812938,
145
+ "loss": 0.8438,
146
  "step": 800
147
  },
148
  {
149
+ "epoch": 0.35222210711695845,
150
+ "grad_norm": 0.7391067147254944,
151
+ "learning_rate": 0.00017902548305796695,
152
+ "loss": 0.7688,
153
  "step": 850
154
  },
155
  {
156
+ "epoch": 0.3729410545944266,
157
+ "grad_norm": 0.8036171197891235,
158
+ "learning_rate": 0.00017762531503780454,
159
+ "loss": 0.753,
160
  "step": 900
161
  },
162
  {
163
+ "epoch": 0.3936600020718947,
164
+ "grad_norm": 0.44744470715522766,
165
+ "learning_rate": 0.00017622514701764213,
166
+ "loss": 0.7793,
167
  "step": 950
168
  },
169
  {
170
+ "epoch": 0.4143789495493629,
171
+ "grad_norm": 0.630820631980896,
172
+ "learning_rate": 0.00017482497899747973,
173
+ "loss": 0.7555,
174
  "step": 1000
175
  },
176
  {
177
+ "epoch": 0.4143789495493629,
178
+ "eval_loss": 0.7030432820320129,
179
+ "eval_runtime": 86.2543,
180
+ "eval_samples_per_second": 55.951,
181
+ "eval_steps_per_second": 13.994,
182
  "step": 1000
183
  },
184
  {
185
+ "epoch": 0.43509789702683105,
186
+ "grad_norm": 0.45690879225730896,
187
+ "learning_rate": 0.0001734248109773173,
188
+ "loss": 0.793,
189
  "step": 1050
190
  },
191
  {
192
+ "epoch": 0.45581684450429916,
193
+ "grad_norm": 0.5000227093696594,
194
+ "learning_rate": 0.00017202464295715486,
195
+ "loss": 0.8342,
196
  "step": 1100
197
  },
198
  {
199
+ "epoch": 0.4765357919817673,
200
+ "grad_norm": 0.47182488441467285,
201
+ "learning_rate": 0.00017062447493699246,
202
+ "loss": 0.7997,
203
  "step": 1150
204
  },
205
  {
206
+ "epoch": 0.4972547394592355,
207
+ "grad_norm": 0.7060516476631165,
208
+ "learning_rate": 0.00016922430691683002,
209
+ "loss": 0.7788,
210
  "step": 1200
211
  },
212
  {
213
+ "epoch": 0.5179736869367036,
214
+ "grad_norm": 0.46701857447624207,
215
+ "learning_rate": 0.00016782413889666762,
216
+ "loss": 0.7518,
217
  "step": 1250
218
  },
219
  {
220
+ "epoch": 0.5179736869367036,
221
+ "eval_loss": 0.7023425698280334,
222
+ "eval_runtime": 86.3015,
223
+ "eval_samples_per_second": 55.92,
224
+ "eval_steps_per_second": 13.986,
225
  "step": 1250
226
  },
227
  {
228
+ "epoch": 0.5386926344141718,
229
+ "grad_norm": 0.668192446231842,
230
+ "learning_rate": 0.00016642397087650518,
231
+ "loss": 0.7682,
232
  "step": 1300
233
  },
234
  {
235
+ "epoch": 0.5594115818916399,
236
+ "grad_norm": 0.47292283177375793,
237
+ "learning_rate": 0.00016502380285634278,
238
+ "loss": 0.7985,
239
  "step": 1350
240
  },
241
  {
242
+ "epoch": 0.580130529369108,
243
+ "grad_norm": 0.7327275276184082,
244
+ "learning_rate": 0.00016362363483618034,
245
+ "loss": 0.8378,
246
  "step": 1400
247
  },
248
  {
249
+ "epoch": 0.6008494768465762,
250
+ "grad_norm": 0.8417996764183044,
251
+ "learning_rate": 0.0001622234668160179,
252
+ "loss": 0.7962,
253
  "step": 1450
254
  },
255
  {
256
+ "epoch": 0.6215684243240444,
257
+ "grad_norm": 0.6189562678337097,
258
+ "learning_rate": 0.0001608232987958555,
259
+ "loss": 0.8028,
260
  "step": 1500
261
  },
262
  {
263
+ "epoch": 0.6215684243240444,
264
+ "eval_loss": 0.6915447115898132,
265
+ "eval_runtime": 86.2147,
266
+ "eval_samples_per_second": 55.977,
267
+ "eval_steps_per_second": 14.0,
268
  "step": 1500
269
  },
270
  {
271
+ "epoch": 0.6422873718015125,
272
+ "grad_norm": 0.7345826625823975,
273
+ "learning_rate": 0.0001594231307756931,
274
+ "loss": 0.7978,
275
  "step": 1550
276
  },
277
  {
278
+ "epoch": 0.6630063192789807,
279
+ "grad_norm": 0.6538310050964355,
280
+ "learning_rate": 0.0001580229627555307,
281
+ "loss": 0.7672,
282
  "step": 1600
283
  },
284
  {
285
+ "epoch": 0.6837252667564487,
286
+ "grad_norm": 0.661582350730896,
287
+ "learning_rate": 0.00015662279473536826,
288
+ "loss": 0.7378,
289
  "step": 1650
290
  },
291
  {
292
+ "epoch": 0.7044442142339169,
293
+ "grad_norm": 0.3603042960166931,
294
+ "learning_rate": 0.00015522262671520583,
295
+ "loss": 0.6741,
296
  "step": 1700
297
  },
298
  {
299
+ "epoch": 0.7251631617113851,
300
+ "grad_norm": 0.8882561326026917,
301
+ "learning_rate": 0.00015382245869504342,
302
+ "loss": 0.7695,
303
  "step": 1750
304
  },
305
  {
306
+ "epoch": 0.7251631617113851,
307
+ "eval_loss": 0.6858941316604614,
308
+ "eval_runtime": 86.6358,
309
+ "eval_samples_per_second": 55.704,
310
+ "eval_steps_per_second": 13.932,
311
  "step": 1750
312
  },
313
  {
314
+ "epoch": 0.7458821091888532,
315
+ "grad_norm": 0.5933266282081604,
316
+ "learning_rate": 0.000152422290674881,
317
+ "loss": 0.7548,
318
  "step": 1800
319
  },
320
  {
321
+ "epoch": 0.7666010566663214,
322
+ "grad_norm": 0.8178608417510986,
323
+ "learning_rate": 0.00015102212265471858,
324
+ "loss": 0.7639,
325
  "step": 1850
326
  },
327
  {
328
+ "epoch": 0.7873200041437894,
329
+ "grad_norm": 0.4378993511199951,
330
+ "learning_rate": 0.00014962195463455615,
331
+ "loss": 0.7985,
332
  "step": 1900
333
  },
334
  {
335
+ "epoch": 0.8080389516212576,
336
+ "grad_norm": 0.3732803463935852,
337
+ "learning_rate": 0.00014822178661439374,
338
+ "loss": 0.8481,
339
  "step": 1950
340
  },
341
  {
342
+ "epoch": 0.8287578990987258,
343
+ "grad_norm": 0.7421035170555115,
344
+ "learning_rate": 0.0001468216185942313,
345
+ "loss": 0.7223,
346
  "step": 2000
347
  },
348
  {
349
+ "epoch": 0.8287578990987258,
350
+ "eval_loss": 0.6823315024375916,
351
+ "eval_runtime": 86.5575,
352
+ "eval_samples_per_second": 55.755,
353
+ "eval_steps_per_second": 13.944,
354
  "step": 2000
355
  },
356
  {
357
+ "epoch": 0.8494768465761939,
358
+ "grad_norm": 0.5109913349151611,
359
+ "learning_rate": 0.00014542145057406888,
360
+ "loss": 0.7895,
361
  "step": 2050
362
  },
363
  {
364
+ "epoch": 0.8701957940536621,
365
+ "grad_norm": 0.47988179326057434,
366
+ "learning_rate": 0.00014402128255390647,
367
+ "loss": 0.7385,
368
  "step": 2100
369
  },
370
  {
371
+ "epoch": 0.8909147415311303,
372
+ "grad_norm": 0.7593080997467041,
373
+ "learning_rate": 0.00014262111453374404,
374
+ "loss": 0.7744,
375
  "step": 2150
376
  },
377
  {
378
+ "epoch": 0.9116336890085983,
379
+ "grad_norm": 0.5866154432296753,
380
+ "learning_rate": 0.00014122094651358163,
381
+ "loss": 0.7062,
382
  "step": 2200
383
  },
384
  {
385
+ "epoch": 0.9323526364860665,
386
+ "grad_norm": 0.47364088892936707,
387
+ "learning_rate": 0.00013982077849341922,
388
+ "loss": 0.7792,
389
  "step": 2250
390
  },
391
  {
392
+ "epoch": 0.9323526364860665,
393
+ "eval_loss": 0.6785813570022583,
394
+ "eval_runtime": 86.3444,
395
+ "eval_samples_per_second": 55.892,
396
+ "eval_steps_per_second": 13.979,
397
  "step": 2250
398
  },
399
  {
400
+ "epoch": 0.9530715839635346,
401
+ "grad_norm": 0.7610514760017395,
402
+ "learning_rate": 0.00013842061047325682,
403
+ "loss": 0.7804,
404
  "step": 2300
405
  },
406
  {
407
+ "epoch": 0.9737905314410028,
408
+ "grad_norm": 0.7689616084098816,
409
+ "learning_rate": 0.00013702044245309438,
410
+ "loss": 0.7497,
411
  "step": 2350
412
  },
413
  {
414
+ "epoch": 0.994509478918471,
415
+ "grad_norm": 0.542168378829956,
416
+ "learning_rate": 0.00013562027443293195,
417
+ "loss": 0.7333,
418
  "step": 2400
419
  },
420
  {
421
+ "epoch": 1.0149176421837771,
422
+ "grad_norm": 0.33903324604034424,
423
+ "learning_rate": 0.0001342481097731728,
424
+ "loss": 0.6952,
425
  "step": 2450
426
  },
427
  {
428
+ "epoch": 1.0356365896612452,
429
+ "grad_norm": 0.8183636665344238,
430
+ "learning_rate": 0.00013284794175301036,
431
+ "loss": 0.7386,
432
  "step": 2500
433
  },
434
  {
435
+ "epoch": 1.0356365896612452,
436
+ "eval_loss": 0.675748348236084,
437
+ "eval_runtime": 86.2887,
438
+ "eval_samples_per_second": 55.929,
439
+ "eval_steps_per_second": 13.988,
440
  "step": 2500
441
  },
442
  {
443
+ "epoch": 1.0563555371387134,
444
+ "grad_norm": 0.6831589937210083,
445
+ "learning_rate": 0.00013144777373284795,
446
+ "loss": 0.72,
447
  "step": 2550
448
  },
449
  {
450
+ "epoch": 1.0770744846161815,
451
+ "grad_norm": 0.6346258521080017,
452
+ "learning_rate": 0.00013004760571268552,
453
+ "loss": 0.7026,
454
  "step": 2600
455
  },
456
  {
457
+ "epoch": 1.0977934320936495,
458
+ "grad_norm": 0.5658385753631592,
459
+ "learning_rate": 0.0001286474376925231,
460
+ "loss": 0.7162,
461
  "step": 2650
462
  },
463
  {
464
+ "epoch": 1.1185123795711178,
465
+ "grad_norm": 0.4242883026599884,
466
+ "learning_rate": 0.00012724726967236068,
467
+ "loss": 0.7325,
468
  "step": 2700
469
  },
470
  {
471
+ "epoch": 1.1392313270485859,
472
+ "grad_norm": 0.5489133596420288,
473
+ "learning_rate": 0.00012584710165219827,
474
+ "loss": 0.7138,
475
  "step": 2750
476
  },
477
  {
478
+ "epoch": 1.1392313270485859,
479
+ "eval_loss": 0.6747092604637146,
480
+ "eval_runtime": 86.4239,
481
+ "eval_samples_per_second": 55.841,
482
+ "eval_steps_per_second": 13.966,
483
  "step": 2750
484
  },
485
  {
486
+ "epoch": 1.1599502745260541,
487
+ "grad_norm": 0.6514728665351868,
488
+ "learning_rate": 0.00012444693363203587,
489
+ "loss": 0.7105,
490
  "step": 2800
491
  },
492
  {
493
+ "epoch": 1.1806692220035222,
494
+ "grad_norm": 0.48897412419319153,
495
+ "learning_rate": 0.00012304676561187343,
496
+ "loss": 0.7271,
497
  "step": 2850
498
  },
499
  {
500
+ "epoch": 1.2013881694809903,
501
+ "grad_norm": 0.7159713506698608,
502
+ "learning_rate": 0.00012164659759171101,
503
+ "loss": 0.7454,
504
  "step": 2900
505
  },
506
  {
507
+ "epoch": 1.2221071169584585,
508
+ "grad_norm": 0.7044214010238647,
509
+ "learning_rate": 0.0001202464295715486,
510
+ "loss": 0.6918,
511
  "step": 2950
512
  },
513
  {
514
+ "epoch": 1.2428260644359266,
515
+ "grad_norm": 0.7934305667877197,
516
+ "learning_rate": 0.00011884626155138616,
517
+ "loss": 0.7018,
518
  "step": 3000
519
  },
520
  {
521
+ "epoch": 1.2428260644359266,
522
+ "eval_loss": 0.6727278828620911,
523
+ "eval_runtime": 86.1985,
524
+ "eval_samples_per_second": 55.987,
525
+ "eval_steps_per_second": 14.003,
526
  "step": 3000
527
  }
528
  ],
529
  "logging_steps": 50,
530
+ "max_steps": 7242,
531
  "num_input_tokens_seen": 0,
532
  "num_train_epochs": 3,
533
  "save_steps": 500,
 
543
  "attributes": {}
544
  }
545
  },
546
+ "total_flos": 1.8877939667533824e+17,
547
  "train_batch_size": 4,
548
  "trial_name": null,
549
  "trial_params": null
checkpoint-3000/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d
3
  size 5432
checkpoint-3500/adapter_config.json CHANGED
@@ -29,9 +29,9 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "o_proj",
33
  "v_proj",
34
- "q_proj",
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "q_proj",
33
  "o_proj",
34
  "v_proj",
 
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
checkpoint-3500/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:12f0b3a531cc4b85c51c3d19fd29fa9f2ff0a0aaebca23605d724770413d49dd
3
  size 54560368
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efd6ac84524dc525109e0cf3984e4fb4afaa59e8f7de0dc6109c2b12c586afc5
3
  size 54560368
checkpoint-3500/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4a4eef6562f7b0c26ac35c5f7d087ce2a7559b2e6cdf2884cceaa3c0ee6e1b36
3
  size 109267450
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98d7c59285360613eaeb682746b1d5e816d8c270b8349c722a70600a8d9d6ddb
3
  size 109267450
checkpoint-3500/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8b57187731297d3d34a8d707e0d59c7b35e51c65106b068986ec8c8627963b5d
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ed5252c8fed9a2f3c650896ede719a729d89d5457a6b7b888d47da3cf1064c08
3
  size 14244
checkpoint-3500/scaler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9d25c9e5c384ba91142c829ef5432ebc4ae7d8c71f3de723046dd3aa202e08a2
3
  size 988
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0ba488383c7d42e68fdfa7344fb6e0324b381de27f6504d975f79101124ff3a
3
  size 988
checkpoint-3500/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:19eea76bd539d1067fbb6c0af0bc3feabf4a4fcc75b4afa719255b0d413e8ced
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71c9cdb357928829533126660dc9acec503bb0b54ce6ea94dffebd2dc851fd2c
3
  size 1064
checkpoint-3500/trainer_state.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "best_global_step": 3000,
3
- "best_metric": 0.5436099171638489,
4
- "best_model_checkpoint": "finetuned_models/llama3.1-8b-lora/checkpoint-3000",
5
- "epoch": 2.3333333333333335,
6
  "eval_steps": 250,
7
  "global_step": 3500,
8
  "is_hyper_param_search": false,
@@ -10,610 +10,610 @@
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
- "epoch": 0.03333333333333333,
14
- "grad_norm": 0.5346225500106812,
15
  "learning_rate": 9.8e-05,
16
- "loss": 2.4955,
17
  "step": 50
18
  },
19
  {
20
- "epoch": 0.06666666666666667,
21
- "grad_norm": 0.719093918800354,
22
  "learning_rate": 0.00019800000000000002,
23
- "loss": 0.71,
24
  "step": 100
25
  },
26
  {
27
- "epoch": 0.1,
28
- "grad_norm": 0.4840560853481293,
29
- "learning_rate": 0.0001977727272727273,
30
- "loss": 0.6405,
31
  "step": 150
32
  },
33
  {
34
- "epoch": 0.13333333333333333,
35
- "grad_norm": 0.3332301676273346,
36
- "learning_rate": 0.0001955,
37
- "loss": 0.6287,
38
  "step": 200
39
  },
40
  {
41
- "epoch": 0.16666666666666666,
42
- "grad_norm": 0.40639588236808777,
43
- "learning_rate": 0.00019322727272727276,
44
- "loss": 0.5572,
45
  "step": 250
46
  },
47
  {
48
- "epoch": 0.16666666666666666,
49
- "eval_loss": 0.5975945591926575,
50
- "eval_runtime": 80.8004,
51
- "eval_samples_per_second": 37.129,
52
- "eval_steps_per_second": 9.282,
53
  "step": 250
54
  },
55
  {
56
- "epoch": 0.2,
57
- "grad_norm": 0.3970712423324585,
58
- "learning_rate": 0.00019095454545454545,
59
- "loss": 0.6165,
60
  "step": 300
61
  },
62
  {
63
- "epoch": 0.23333333333333334,
64
- "grad_norm": 0.38409528136253357,
65
- "learning_rate": 0.00018868181818181817,
66
- "loss": 0.639,
67
  "step": 350
68
  },
69
  {
70
- "epoch": 0.26666666666666666,
71
- "grad_norm": 0.44628769159317017,
72
- "learning_rate": 0.00018640909090909092,
73
- "loss": 0.636,
74
  "step": 400
75
  },
76
  {
77
- "epoch": 0.3,
78
- "grad_norm": 0.3697021007537842,
79
- "learning_rate": 0.00018413636363636364,
80
- "loss": 0.6192,
81
  "step": 450
82
  },
83
  {
84
- "epoch": 0.3333333333333333,
85
- "grad_norm": 0.36338189244270325,
86
- "learning_rate": 0.00018186363636363636,
87
- "loss": 0.6134,
88
  "step": 500
89
  },
90
  {
91
- "epoch": 0.3333333333333333,
92
- "eval_loss": 0.5813060998916626,
93
- "eval_runtime": 80.7819,
94
- "eval_samples_per_second": 37.137,
95
- "eval_steps_per_second": 9.284,
96
  "step": 500
97
  },
98
  {
99
- "epoch": 0.36666666666666664,
100
- "grad_norm": 0.35211533308029175,
101
- "learning_rate": 0.0001795909090909091,
102
- "loss": 0.6128,
103
  "step": 550
104
  },
105
  {
106
- "epoch": 0.4,
107
- "grad_norm": 0.36327463388442993,
108
- "learning_rate": 0.00017731818181818183,
109
- "loss": 0.5915,
110
  "step": 600
111
  },
112
  {
113
- "epoch": 0.43333333333333335,
114
- "grad_norm": 0.40672942996025085,
115
- "learning_rate": 0.00017504545454545455,
116
- "loss": 0.5807,
117
  "step": 650
118
  },
119
  {
120
- "epoch": 0.4666666666666667,
121
- "grad_norm": 0.4689007103443146,
122
- "learning_rate": 0.00017277272727272728,
123
- "loss": 0.602,
124
  "step": 700
125
  },
126
  {
127
- "epoch": 0.5,
128
- "grad_norm": 0.3979697823524475,
129
- "learning_rate": 0.00017050000000000002,
130
- "loss": 0.5703,
131
  "step": 750
132
  },
133
  {
134
- "epoch": 0.5,
135
- "eval_loss": 0.5740106701850891,
136
- "eval_runtime": 80.8209,
137
- "eval_samples_per_second": 37.119,
138
- "eval_steps_per_second": 9.28,
139
  "step": 750
140
  },
141
  {
142
- "epoch": 0.5333333333333333,
143
- "grad_norm": 0.3071135878562927,
144
- "learning_rate": 0.00016822727272727275,
145
- "loss": 0.5746,
146
  "step": 800
147
  },
148
  {
149
- "epoch": 0.5666666666666667,
150
- "grad_norm": 0.318085253238678,
151
- "learning_rate": 0.00016595454545454544,
152
- "loss": 0.5873,
153
  "step": 850
154
  },
155
  {
156
- "epoch": 0.6,
157
- "grad_norm": 0.35915374755859375,
158
- "learning_rate": 0.0001636818181818182,
159
- "loss": 0.6283,
160
  "step": 900
161
  },
162
  {
163
- "epoch": 0.6333333333333333,
164
- "grad_norm": 0.3174057602882385,
165
- "learning_rate": 0.0001614090909090909,
166
- "loss": 0.5912,
167
  "step": 950
168
  },
169
  {
170
- "epoch": 0.6666666666666666,
171
- "grad_norm": 0.416111022233963,
172
- "learning_rate": 0.00015913636363636363,
173
- "loss": 0.5647,
174
  "step": 1000
175
  },
176
  {
177
- "epoch": 0.6666666666666666,
178
- "eval_loss": 0.5663638710975647,
179
- "eval_runtime": 80.8183,
180
- "eval_samples_per_second": 37.12,
181
- "eval_steps_per_second": 9.28,
182
  "step": 1000
183
  },
184
  {
185
- "epoch": 0.7,
186
- "grad_norm": 0.41202324628829956,
187
- "learning_rate": 0.00015686363636363638,
188
- "loss": 0.6118,
189
  "step": 1050
190
  },
191
  {
192
- "epoch": 0.7333333333333333,
193
- "grad_norm": 0.3883333206176758,
194
- "learning_rate": 0.0001545909090909091,
195
- "loss": 0.5392,
196
  "step": 1100
197
  },
198
  {
199
- "epoch": 0.7666666666666667,
200
- "grad_norm": 0.31973451375961304,
201
- "learning_rate": 0.00015231818181818182,
202
- "loss": 0.5602,
203
  "step": 1150
204
  },
205
  {
206
- "epoch": 0.8,
207
- "grad_norm": 0.31378698348999023,
208
- "learning_rate": 0.00015004545454545454,
209
- "loss": 0.5642,
210
  "step": 1200
211
  },
212
  {
213
- "epoch": 0.8333333333333334,
214
- "grad_norm": 0.3346308171749115,
215
- "learning_rate": 0.0001477727272727273,
216
- "loss": 0.5925,
217
  "step": 1250
218
  },
219
  {
220
- "epoch": 0.8333333333333334,
221
- "eval_loss": 0.5619704723358154,
222
- "eval_runtime": 80.824,
223
- "eval_samples_per_second": 37.118,
224
- "eval_steps_per_second": 9.279,
225
  "step": 1250
226
  },
227
  {
228
- "epoch": 0.8666666666666667,
229
- "grad_norm": 0.5573959946632385,
230
- "learning_rate": 0.0001455,
231
- "loss": 0.5829,
232
  "step": 1300
233
  },
234
  {
235
- "epoch": 0.9,
236
- "grad_norm": 0.36054643988609314,
237
- "learning_rate": 0.00014322727272727273,
238
- "loss": 0.5923,
239
  "step": 1350
240
  },
241
  {
242
- "epoch": 0.9333333333333333,
243
- "grad_norm": 0.36059027910232544,
244
- "learning_rate": 0.00014095454545454546,
245
- "loss": 0.5808,
246
  "step": 1400
247
  },
248
  {
249
- "epoch": 0.9666666666666667,
250
- "grad_norm": 0.3942534327507019,
251
- "learning_rate": 0.00013868181818181818,
252
- "loss": 0.5597,
253
  "step": 1450
254
  },
255
  {
256
- "epoch": 1.0,
257
- "grad_norm": 0.3995835483074188,
258
- "learning_rate": 0.0001364090909090909,
259
- "loss": 0.5554,
260
  "step": 1500
261
  },
262
  {
263
- "epoch": 1.0,
264
- "eval_loss": 0.5581239461898804,
265
- "eval_runtime": 80.8326,
266
- "eval_samples_per_second": 37.114,
267
- "eval_steps_per_second": 9.278,
268
  "step": 1500
269
  },
270
  {
271
- "epoch": 1.0333333333333334,
272
- "grad_norm": 0.3405410051345825,
273
- "learning_rate": 0.00013413636363636365,
274
- "loss": 0.5571,
275
  "step": 1550
276
  },
277
  {
278
- "epoch": 1.0666666666666667,
279
- "grad_norm": 0.4485073983669281,
280
- "learning_rate": 0.00013186363636363637,
281
- "loss": 0.5674,
282
  "step": 1600
283
  },
284
  {
285
- "epoch": 1.1,
286
- "grad_norm": 0.34938374161720276,
287
- "learning_rate": 0.0001295909090909091,
288
- "loss": 0.5354,
289
  "step": 1650
290
  },
291
  {
292
- "epoch": 1.1333333333333333,
293
- "grad_norm": 0.33084195852279663,
294
- "learning_rate": 0.00012731818181818184,
295
- "loss": 0.5765,
296
  "step": 1700
297
  },
298
  {
299
- "epoch": 1.1666666666666667,
300
- "grad_norm": 0.3667336404323578,
301
- "learning_rate": 0.00012504545454545456,
302
- "loss": 0.5486,
303
  "step": 1750
304
  },
305
  {
306
- "epoch": 1.1666666666666667,
307
- "eval_loss": 0.5557209253311157,
308
- "eval_runtime": 80.8386,
309
- "eval_samples_per_second": 37.111,
310
- "eval_steps_per_second": 9.278,
311
  "step": 1750
312
  },
313
  {
314
- "epoch": 1.2,
315
- "grad_norm": 0.33248019218444824,
316
- "learning_rate": 0.00012277272727272728,
317
- "loss": 0.5617,
318
  "step": 1800
319
  },
320
  {
321
- "epoch": 1.2333333333333334,
322
- "grad_norm": 0.4447474479675293,
323
- "learning_rate": 0.00012050000000000002,
324
- "loss": 0.567,
325
  "step": 1850
326
  },
327
  {
328
- "epoch": 1.2666666666666666,
329
- "grad_norm": 0.42134660482406616,
330
- "learning_rate": 0.00011822727272727274,
331
- "loss": 0.5319,
332
  "step": 1900
333
  },
334
  {
335
- "epoch": 1.3,
336
- "grad_norm": 0.3942984640598297,
337
- "learning_rate": 0.00011595454545454544,
338
- "loss": 0.5325,
339
  "step": 1950
340
  },
341
  {
342
- "epoch": 1.3333333333333333,
343
- "grad_norm": 0.4929428696632385,
344
- "learning_rate": 0.00011368181818181818,
345
- "loss": 0.5565,
346
  "step": 2000
347
  },
348
  {
349
- "epoch": 1.3333333333333333,
350
- "eval_loss": 0.5535863637924194,
351
- "eval_runtime": 80.8279,
352
- "eval_samples_per_second": 37.116,
353
- "eval_steps_per_second": 9.279,
354
  "step": 2000
355
  },
356
  {
357
- "epoch": 1.3666666666666667,
358
- "grad_norm": 0.4141586720943451,
359
- "learning_rate": 0.00011140909090909091,
360
- "loss": 0.5801,
361
  "step": 2050
362
  },
363
  {
364
- "epoch": 1.4,
365
- "grad_norm": 0.45937269926071167,
366
- "learning_rate": 0.00010913636363636364,
367
- "loss": 0.5439,
368
  "step": 2100
369
  },
370
  {
371
- "epoch": 1.4333333333333333,
372
- "grad_norm": 0.47830042243003845,
373
- "learning_rate": 0.00010686363636363637,
374
- "loss": 0.547,
375
  "step": 2150
376
  },
377
  {
378
- "epoch": 1.4666666666666668,
379
- "grad_norm": 0.40260276198387146,
380
- "learning_rate": 0.00010459090909090909,
381
- "loss": 0.5229,
382
  "step": 2200
383
  },
384
  {
385
- "epoch": 1.5,
386
- "grad_norm": 0.5281402468681335,
387
- "learning_rate": 0.00010231818181818183,
388
- "loss": 0.5475,
389
  "step": 2250
390
  },
391
  {
392
- "epoch": 1.5,
393
- "eval_loss": 0.5505018830299377,
394
- "eval_runtime": 80.8409,
395
- "eval_samples_per_second": 37.11,
396
- "eval_steps_per_second": 9.277,
397
  "step": 2250
398
  },
399
  {
400
- "epoch": 1.5333333333333332,
401
- "grad_norm": 0.3721947968006134,
402
- "learning_rate": 0.00010004545454545455,
403
- "loss": 0.5466,
404
  "step": 2300
405
  },
406
  {
407
- "epoch": 1.5666666666666667,
408
- "grad_norm": 0.3462945818901062,
409
- "learning_rate": 9.777272727272728e-05,
410
- "loss": 0.5209,
411
  "step": 2350
412
  },
413
  {
414
- "epoch": 1.6,
415
- "grad_norm": 0.4027090072631836,
416
- "learning_rate": 9.55e-05,
417
- "loss": 0.5307,
418
  "step": 2400
419
  },
420
  {
421
- "epoch": 1.6333333333333333,
422
- "grad_norm": 0.3684265613555908,
423
- "learning_rate": 9.322727272727273e-05,
424
- "loss": 0.5118,
425
  "step": 2450
426
  },
427
  {
428
- "epoch": 1.6666666666666665,
429
- "grad_norm": 0.4819887578487396,
430
- "learning_rate": 9.095454545454546e-05,
431
- "loss": 0.561,
432
  "step": 2500
433
  },
434
  {
435
- "epoch": 1.6666666666666665,
436
- "eval_loss": 0.5476261377334595,
437
- "eval_runtime": 80.8288,
438
- "eval_samples_per_second": 37.115,
439
- "eval_steps_per_second": 9.279,
440
  "step": 2500
441
  },
442
  {
443
- "epoch": 1.7,
444
- "grad_norm": 0.3161783218383789,
445
- "learning_rate": 8.86818181818182e-05,
446
- "loss": 0.5413,
447
  "step": 2550
448
  },
449
  {
450
- "epoch": 1.7333333333333334,
451
- "grad_norm": 0.34697386622428894,
452
- "learning_rate": 8.640909090909092e-05,
453
- "loss": 0.5366,
454
  "step": 2600
455
  },
456
  {
457
- "epoch": 1.7666666666666666,
458
- "grad_norm": 0.4084527790546417,
459
- "learning_rate": 8.413636363636364e-05,
460
- "loss": 0.5426,
461
  "step": 2650
462
  },
463
  {
464
- "epoch": 1.8,
465
- "grad_norm": 0.4053308963775635,
466
- "learning_rate": 8.186363636363636e-05,
467
- "loss": 0.532,
468
  "step": 2700
469
  },
470
  {
471
- "epoch": 1.8333333333333335,
472
- "grad_norm": 0.3551884591579437,
473
- "learning_rate": 7.95909090909091e-05,
474
- "loss": 0.5399,
475
  "step": 2750
476
  },
477
  {
478
- "epoch": 1.8333333333333335,
479
- "eval_loss": 0.546008288860321,
480
- "eval_runtime": 80.8186,
481
- "eval_samples_per_second": 37.12,
482
- "eval_steps_per_second": 9.28,
483
  "step": 2750
484
  },
485
  {
486
- "epoch": 1.8666666666666667,
487
- "grad_norm": 0.40072572231292725,
488
- "learning_rate": 7.731818181818183e-05,
489
- "loss": 0.5332,
490
  "step": 2800
491
  },
492
  {
493
- "epoch": 1.9,
494
- "grad_norm": 0.3773200213909149,
495
- "learning_rate": 7.504545454545455e-05,
496
- "loss": 0.5296,
497
  "step": 2850
498
  },
499
  {
500
- "epoch": 1.9333333333333333,
501
- "grad_norm": 0.45379436016082764,
502
- "learning_rate": 7.277272727272728e-05,
503
- "loss": 0.5356,
504
  "step": 2900
505
  },
506
  {
507
- "epoch": 1.9666666666666668,
508
- "grad_norm": 0.36246028542518616,
509
- "learning_rate": 7.05e-05,
510
- "loss": 0.5112,
511
  "step": 2950
512
  },
513
  {
514
- "epoch": 2.0,
515
- "grad_norm": 0.40895622968673706,
516
- "learning_rate": 6.822727272727273e-05,
517
- "loss": 0.5358,
518
  "step": 3000
519
  },
520
  {
521
- "epoch": 2.0,
522
- "eval_loss": 0.5436099171638489,
523
- "eval_runtime": 80.8207,
524
- "eval_samples_per_second": 37.119,
525
- "eval_steps_per_second": 9.28,
526
  "step": 3000
527
  },
528
  {
529
- "epoch": 2.033333333333333,
530
- "grad_norm": 0.4935952425003052,
531
- "learning_rate": 6.595454545454546e-05,
532
- "loss": 0.5074,
533
  "step": 3050
534
  },
535
  {
536
- "epoch": 2.066666666666667,
537
- "grad_norm": 0.505511999130249,
538
- "learning_rate": 6.368181818181818e-05,
539
- "loss": 0.4716,
540
  "step": 3100
541
  },
542
  {
543
- "epoch": 2.1,
544
- "grad_norm": 0.47748756408691406,
545
- "learning_rate": 6.140909090909092e-05,
546
- "loss": 0.4909,
547
  "step": 3150
548
  },
549
  {
550
- "epoch": 2.1333333333333333,
551
- "grad_norm": 0.3205774426460266,
552
- "learning_rate": 5.913636363636363e-05,
553
- "loss": 0.5009,
554
  "step": 3200
555
  },
556
  {
557
- "epoch": 2.1666666666666665,
558
- "grad_norm": 0.437486469745636,
559
- "learning_rate": 5.686363636363636e-05,
560
- "loss": 0.5224,
561
  "step": 3250
562
  },
563
  {
564
- "epoch": 2.1666666666666665,
565
- "eval_loss": 0.5484762787818909,
566
- "eval_runtime": 80.8314,
567
- "eval_samples_per_second": 37.114,
568
- "eval_steps_per_second": 9.279,
569
  "step": 3250
570
  },
571
  {
572
- "epoch": 2.2,
573
- "grad_norm": 0.49795669317245483,
574
- "learning_rate": 5.4590909090909096e-05,
575
- "loss": 0.516,
576
  "step": 3300
577
  },
578
  {
579
- "epoch": 2.2333333333333334,
580
- "grad_norm": 0.40953299403190613,
581
- "learning_rate": 5.2318181818181824e-05,
582
- "loss": 0.5025,
583
  "step": 3350
584
  },
585
  {
586
- "epoch": 2.2666666666666666,
587
- "grad_norm": 0.5090060830116272,
588
- "learning_rate": 5.004545454545455e-05,
589
- "loss": 0.5064,
590
  "step": 3400
591
  },
592
  {
593
- "epoch": 2.3,
594
- "grad_norm": 0.4385254979133606,
595
- "learning_rate": 4.777272727272727e-05,
596
- "loss": 0.497,
597
  "step": 3450
598
  },
599
  {
600
- "epoch": 2.3333333333333335,
601
- "grad_norm": 0.4746367037296295,
602
- "learning_rate": 4.55e-05,
603
- "loss": 0.4696,
604
  "step": 3500
605
  },
606
  {
607
- "epoch": 2.3333333333333335,
608
- "eval_loss": 0.5463398098945618,
609
- "eval_runtime": 80.8383,
610
- "eval_samples_per_second": 37.111,
611
- "eval_steps_per_second": 9.278,
612
  "step": 3500
613
  }
614
  ],
615
  "logging_steps": 50,
616
- "max_steps": 4500,
617
  "num_input_tokens_seen": 0,
618
  "num_train_epochs": 3,
619
  "save_steps": 500,
@@ -629,7 +629,7 @@
629
  "attributes": {}
630
  }
631
  },
632
- "total_flos": 4.2819969754988544e+17,
633
  "train_batch_size": 4,
634
  "trial_name": null,
635
  "trial_params": null
 
1
  {
2
+ "best_global_step": 3500,
3
+ "best_metric": 0.6663665175437927,
4
+ "best_model_checkpoint": "amiya_outputs/llama3.1-8b-amiya-palestinian/checkpoint-3500",
5
+ "epoch": 1.4500155392106082,
6
  "eval_steps": 250,
7
  "global_step": 3500,
8
  "is_hyper_param_search": false,
 
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
+ "epoch": 0.020718947477468146,
14
+ "grad_norm": 1.0589393377304077,
15
  "learning_rate": 9.8e-05,
16
+ "loss": 2.6567,
17
  "step": 50
18
  },
19
  {
20
+ "epoch": 0.04143789495493629,
21
+ "grad_norm": 0.9738045334815979,
22
  "learning_rate": 0.00019800000000000002,
23
+ "loss": 0.9502,
24
  "step": 100
25
  },
26
  {
27
+ "epoch": 0.062156842432404436,
28
+ "grad_norm": 0.8801347017288208,
29
+ "learning_rate": 0.00019862783534024082,
30
+ "loss": 0.8496,
31
  "step": 150
32
  },
33
  {
34
+ "epoch": 0.08287578990987259,
35
+ "grad_norm": 0.7272312045097351,
36
+ "learning_rate": 0.00019722766732007841,
37
+ "loss": 0.8184,
38
  "step": 200
39
  },
40
  {
41
+ "epoch": 0.10359473738734072,
42
+ "grad_norm": 0.7850629091262817,
43
+ "learning_rate": 0.000195827499299916,
44
+ "loss": 0.8392,
45
  "step": 250
46
  },
47
  {
48
+ "epoch": 0.10359473738734072,
49
+ "eval_loss": 0.7402811050415039,
50
+ "eval_runtime": 85.9367,
51
+ "eval_samples_per_second": 56.158,
52
+ "eval_steps_per_second": 14.045,
53
  "step": 250
54
  },
55
  {
56
+ "epoch": 0.12431368486480887,
57
+ "grad_norm": 0.40629276633262634,
58
+ "learning_rate": 0.00019442733127975358,
59
+ "loss": 0.8108,
60
  "step": 300
61
  },
62
  {
63
+ "epoch": 0.145032632342277,
64
+ "grad_norm": 0.5258236527442932,
65
+ "learning_rate": 0.00019302716325959117,
66
+ "loss": 0.8116,
67
  "step": 350
68
  },
69
  {
70
+ "epoch": 0.16575157981974517,
71
+ "grad_norm": 0.6879925727844238,
72
+ "learning_rate": 0.00019162699523942874,
73
+ "loss": 0.9089,
74
  "step": 400
75
  },
76
  {
77
+ "epoch": 0.1864705272972133,
78
+ "grad_norm": 0.7583937048912048,
79
+ "learning_rate": 0.00019022682721926633,
80
+ "loss": 0.874,
81
  "step": 450
82
  },
83
  {
84
+ "epoch": 0.20718947477468144,
85
+ "grad_norm": 0.6399120688438416,
86
+ "learning_rate": 0.0001888266591991039,
87
+ "loss": 0.8366,
88
  "step": 500
89
  },
90
  {
91
+ "epoch": 0.20718947477468144,
92
+ "eval_loss": 0.7194066047668457,
93
+ "eval_runtime": 86.2811,
94
+ "eval_samples_per_second": 55.933,
95
+ "eval_steps_per_second": 13.989,
96
  "step": 500
97
  },
98
  {
99
+ "epoch": 0.22790842225214958,
100
+ "grad_norm": 0.7763131856918335,
101
+ "learning_rate": 0.0001874264911789415,
102
+ "loss": 0.7912,
103
  "step": 550
104
  },
105
  {
106
+ "epoch": 0.24862736972961774,
107
+ "grad_norm": 0.6845299601554871,
108
+ "learning_rate": 0.00018602632315877906,
109
+ "loss": 0.8506,
110
  "step": 600
111
  },
112
  {
113
+ "epoch": 0.2693463172070859,
114
+ "grad_norm": 0.8045451045036316,
115
+ "learning_rate": 0.00018462615513861665,
116
+ "loss": 0.763,
117
  "step": 650
118
  },
119
  {
120
+ "epoch": 0.290065264684554,
121
+ "grad_norm": 0.7035927176475525,
122
+ "learning_rate": 0.00018322598711845422,
123
+ "loss": 0.7769,
124
  "step": 700
125
  },
126
  {
127
+ "epoch": 0.3107842121620222,
128
+ "grad_norm": 0.465000718832016,
129
+ "learning_rate": 0.00018182581909829179,
130
+ "loss": 0.7705,
131
  "step": 750
132
  },
133
  {
134
+ "epoch": 0.3107842121620222,
135
+ "eval_loss": 0.7103215456008911,
136
+ "eval_runtime": 86.1101,
137
+ "eval_samples_per_second": 56.045,
138
+ "eval_steps_per_second": 14.017,
139
  "step": 750
140
  },
141
  {
142
+ "epoch": 0.33150315963949034,
143
+ "grad_norm": 0.4990151524543762,
144
+ "learning_rate": 0.00018042565107812938,
145
+ "loss": 0.8438,
146
  "step": 800
147
  },
148
  {
149
+ "epoch": 0.35222210711695845,
150
+ "grad_norm": 0.7391067147254944,
151
+ "learning_rate": 0.00017902548305796695,
152
+ "loss": 0.7688,
153
  "step": 850
154
  },
155
  {
156
+ "epoch": 0.3729410545944266,
157
+ "grad_norm": 0.8036171197891235,
158
+ "learning_rate": 0.00017762531503780454,
159
+ "loss": 0.753,
160
  "step": 900
161
  },
162
  {
163
+ "epoch": 0.3936600020718947,
164
+ "grad_norm": 0.44744470715522766,
165
+ "learning_rate": 0.00017622514701764213,
166
+ "loss": 0.7793,
167
  "step": 950
168
  },
169
  {
170
+ "epoch": 0.4143789495493629,
171
+ "grad_norm": 0.630820631980896,
172
+ "learning_rate": 0.00017482497899747973,
173
+ "loss": 0.7555,
174
  "step": 1000
175
  },
176
  {
177
+ "epoch": 0.4143789495493629,
178
+ "eval_loss": 0.7030432820320129,
179
+ "eval_runtime": 86.2543,
180
+ "eval_samples_per_second": 55.951,
181
+ "eval_steps_per_second": 13.994,
182
  "step": 1000
183
  },
184
  {
185
+ "epoch": 0.43509789702683105,
186
+ "grad_norm": 0.45690879225730896,
187
+ "learning_rate": 0.0001734248109773173,
188
+ "loss": 0.793,
189
  "step": 1050
190
  },
191
  {
192
+ "epoch": 0.45581684450429916,
193
+ "grad_norm": 0.5000227093696594,
194
+ "learning_rate": 0.00017202464295715486,
195
+ "loss": 0.8342,
196
  "step": 1100
197
  },
198
  {
199
+ "epoch": 0.4765357919817673,
200
+ "grad_norm": 0.47182488441467285,
201
+ "learning_rate": 0.00017062447493699246,
202
+ "loss": 0.7997,
203
  "step": 1150
204
  },
205
  {
206
+ "epoch": 0.4972547394592355,
207
+ "grad_norm": 0.7060516476631165,
208
+ "learning_rate": 0.00016922430691683002,
209
+ "loss": 0.7788,
210
  "step": 1200
211
  },
212
  {
213
+ "epoch": 0.5179736869367036,
214
+ "grad_norm": 0.46701857447624207,
215
+ "learning_rate": 0.00016782413889666762,
216
+ "loss": 0.7518,
217
  "step": 1250
218
  },
219
  {
220
+ "epoch": 0.5179736869367036,
221
+ "eval_loss": 0.7023425698280334,
222
+ "eval_runtime": 86.3015,
223
+ "eval_samples_per_second": 55.92,
224
+ "eval_steps_per_second": 13.986,
225
  "step": 1250
226
  },
227
  {
228
+ "epoch": 0.5386926344141718,
229
+ "grad_norm": 0.668192446231842,
230
+ "learning_rate": 0.00016642397087650518,
231
+ "loss": 0.7682,
232
  "step": 1300
233
  },
234
  {
235
+ "epoch": 0.5594115818916399,
236
+ "grad_norm": 0.47292283177375793,
237
+ "learning_rate": 0.00016502380285634278,
238
+ "loss": 0.7985,
239
  "step": 1350
240
  },
241
  {
242
+ "epoch": 0.580130529369108,
243
+ "grad_norm": 0.7327275276184082,
244
+ "learning_rate": 0.00016362363483618034,
245
+ "loss": 0.8378,
246
  "step": 1400
247
  },
248
  {
249
+ "epoch": 0.6008494768465762,
250
+ "grad_norm": 0.8417996764183044,
251
+ "learning_rate": 0.0001622234668160179,
252
+ "loss": 0.7962,
253
  "step": 1450
254
  },
255
  {
256
+ "epoch": 0.6215684243240444,
257
+ "grad_norm": 0.6189562678337097,
258
+ "learning_rate": 0.0001608232987958555,
259
+ "loss": 0.8028,
260
  "step": 1500
261
  },
262
  {
263
+ "epoch": 0.6215684243240444,
264
+ "eval_loss": 0.6915447115898132,
265
+ "eval_runtime": 86.2147,
266
+ "eval_samples_per_second": 55.977,
267
+ "eval_steps_per_second": 14.0,
268
  "step": 1500
269
  },
270
  {
271
+ "epoch": 0.6422873718015125,
272
+ "grad_norm": 0.7345826625823975,
273
+ "learning_rate": 0.0001594231307756931,
274
+ "loss": 0.7978,
275
  "step": 1550
276
  },
277
  {
278
+ "epoch": 0.6630063192789807,
279
+ "grad_norm": 0.6538310050964355,
280
+ "learning_rate": 0.0001580229627555307,
281
+ "loss": 0.7672,
282
  "step": 1600
283
  },
284
  {
285
+ "epoch": 0.6837252667564487,
286
+ "grad_norm": 0.661582350730896,
287
+ "learning_rate": 0.00015662279473536826,
288
+ "loss": 0.7378,
289
  "step": 1650
290
  },
291
  {
292
+ "epoch": 0.7044442142339169,
293
+ "grad_norm": 0.3603042960166931,
294
+ "learning_rate": 0.00015522262671520583,
295
+ "loss": 0.6741,
296
  "step": 1700
297
  },
298
  {
299
+ "epoch": 0.7251631617113851,
300
+ "grad_norm": 0.8882561326026917,
301
+ "learning_rate": 0.00015382245869504342,
302
+ "loss": 0.7695,
303
  "step": 1750
304
  },
305
  {
306
+ "epoch": 0.7251631617113851,
307
+ "eval_loss": 0.6858941316604614,
308
+ "eval_runtime": 86.6358,
309
+ "eval_samples_per_second": 55.704,
310
+ "eval_steps_per_second": 13.932,
311
  "step": 1750
312
  },
313
  {
314
+ "epoch": 0.7458821091888532,
315
+ "grad_norm": 0.5933266282081604,
316
+ "learning_rate": 0.000152422290674881,
317
+ "loss": 0.7548,
318
  "step": 1800
319
  },
320
  {
321
+ "epoch": 0.7666010566663214,
322
+ "grad_norm": 0.8178608417510986,
323
+ "learning_rate": 0.00015102212265471858,
324
+ "loss": 0.7639,
325
  "step": 1850
326
  },
327
  {
328
+ "epoch": 0.7873200041437894,
329
+ "grad_norm": 0.4378993511199951,
330
+ "learning_rate": 0.00014962195463455615,
331
+ "loss": 0.7985,
332
  "step": 1900
333
  },
334
  {
335
+ "epoch": 0.8080389516212576,
336
+ "grad_norm": 0.3732803463935852,
337
+ "learning_rate": 0.00014822178661439374,
338
+ "loss": 0.8481,
339
  "step": 1950
340
  },
341
  {
342
+ "epoch": 0.8287578990987258,
343
+ "grad_norm": 0.7421035170555115,
344
+ "learning_rate": 0.0001468216185942313,
345
+ "loss": 0.7223,
346
  "step": 2000
347
  },
348
  {
349
+ "epoch": 0.8287578990987258,
350
+ "eval_loss": 0.6823315024375916,
351
+ "eval_runtime": 86.5575,
352
+ "eval_samples_per_second": 55.755,
353
+ "eval_steps_per_second": 13.944,
354
  "step": 2000
355
  },
356
  {
357
+ "epoch": 0.8494768465761939,
358
+ "grad_norm": 0.5109913349151611,
359
+ "learning_rate": 0.00014542145057406888,
360
+ "loss": 0.7895,
361
  "step": 2050
362
  },
363
  {
364
+ "epoch": 0.8701957940536621,
365
+ "grad_norm": 0.47988179326057434,
366
+ "learning_rate": 0.00014402128255390647,
367
+ "loss": 0.7385,
368
  "step": 2100
369
  },
370
  {
371
+ "epoch": 0.8909147415311303,
372
+ "grad_norm": 0.7593080997467041,
373
+ "learning_rate": 0.00014262111453374404,
374
+ "loss": 0.7744,
375
  "step": 2150
376
  },
377
  {
378
+ "epoch": 0.9116336890085983,
379
+ "grad_norm": 0.5866154432296753,
380
+ "learning_rate": 0.00014122094651358163,
381
+ "loss": 0.7062,
382
  "step": 2200
383
  },
384
  {
385
+ "epoch": 0.9323526364860665,
386
+ "grad_norm": 0.47364088892936707,
387
+ "learning_rate": 0.00013982077849341922,
388
+ "loss": 0.7792,
389
  "step": 2250
390
  },
391
  {
392
+ "epoch": 0.9323526364860665,
393
+ "eval_loss": 0.6785813570022583,
394
+ "eval_runtime": 86.3444,
395
+ "eval_samples_per_second": 55.892,
396
+ "eval_steps_per_second": 13.979,
397
  "step": 2250
398
  },
399
  {
400
+ "epoch": 0.9530715839635346,
401
+ "grad_norm": 0.7610514760017395,
402
+ "learning_rate": 0.00013842061047325682,
403
+ "loss": 0.7804,
404
  "step": 2300
405
  },
406
  {
407
+ "epoch": 0.9737905314410028,
408
+ "grad_norm": 0.7689616084098816,
409
+ "learning_rate": 0.00013702044245309438,
410
+ "loss": 0.7497,
411
  "step": 2350
412
  },
413
  {
414
+ "epoch": 0.994509478918471,
415
+ "grad_norm": 0.542168378829956,
416
+ "learning_rate": 0.00013562027443293195,
417
+ "loss": 0.7333,
418
  "step": 2400
419
  },
420
  {
421
+ "epoch": 1.0149176421837771,
422
+ "grad_norm": 0.33903324604034424,
423
+ "learning_rate": 0.0001342481097731728,
424
+ "loss": 0.6952,
425
  "step": 2450
426
  },
427
  {
428
+ "epoch": 1.0356365896612452,
429
+ "grad_norm": 0.8183636665344238,
430
+ "learning_rate": 0.00013284794175301036,
431
+ "loss": 0.7386,
432
  "step": 2500
433
  },
434
  {
435
+ "epoch": 1.0356365896612452,
436
+ "eval_loss": 0.675748348236084,
437
+ "eval_runtime": 86.2887,
438
+ "eval_samples_per_second": 55.929,
439
+ "eval_steps_per_second": 13.988,
440
  "step": 2500
441
  },
442
  {
443
+ "epoch": 1.0563555371387134,
444
+ "grad_norm": 0.6831589937210083,
445
+ "learning_rate": 0.00013144777373284795,
446
+ "loss": 0.72,
447
  "step": 2550
448
  },
449
  {
450
+ "epoch": 1.0770744846161815,
451
+ "grad_norm": 0.6346258521080017,
452
+ "learning_rate": 0.00013004760571268552,
453
+ "loss": 0.7026,
454
  "step": 2600
455
  },
456
  {
457
+ "epoch": 1.0977934320936495,
458
+ "grad_norm": 0.5658385753631592,
459
+ "learning_rate": 0.0001286474376925231,
460
+ "loss": 0.7162,
461
  "step": 2650
462
  },
463
  {
464
+ "epoch": 1.1185123795711178,
465
+ "grad_norm": 0.4242883026599884,
466
+ "learning_rate": 0.00012724726967236068,
467
+ "loss": 0.7325,
468
  "step": 2700
469
  },
470
  {
471
+ "epoch": 1.1392313270485859,
472
+ "grad_norm": 0.5489133596420288,
473
+ "learning_rate": 0.00012584710165219827,
474
+ "loss": 0.7138,
475
  "step": 2750
476
  },
477
  {
478
+ "epoch": 1.1392313270485859,
479
+ "eval_loss": 0.6747092604637146,
480
+ "eval_runtime": 86.4239,
481
+ "eval_samples_per_second": 55.841,
482
+ "eval_steps_per_second": 13.966,
483
  "step": 2750
484
  },
485
  {
486
+ "epoch": 1.1599502745260541,
487
+ "grad_norm": 0.6514728665351868,
488
+ "learning_rate": 0.00012444693363203587,
489
+ "loss": 0.7105,
490
  "step": 2800
491
  },
492
  {
493
+ "epoch": 1.1806692220035222,
494
+ "grad_norm": 0.48897412419319153,
495
+ "learning_rate": 0.00012304676561187343,
496
+ "loss": 0.7271,
497
  "step": 2850
498
  },
499
  {
500
+ "epoch": 1.2013881694809903,
501
+ "grad_norm": 0.7159713506698608,
502
+ "learning_rate": 0.00012164659759171101,
503
+ "loss": 0.7454,
504
  "step": 2900
505
  },
506
  {
507
+ "epoch": 1.2221071169584585,
508
+ "grad_norm": 0.7044214010238647,
509
+ "learning_rate": 0.0001202464295715486,
510
+ "loss": 0.6918,
511
  "step": 2950
512
  },
513
  {
514
+ "epoch": 1.2428260644359266,
515
+ "grad_norm": 0.7934305667877197,
516
+ "learning_rate": 0.00011884626155138616,
517
+ "loss": 0.7018,
518
  "step": 3000
519
  },
520
  {
521
+ "epoch": 1.2428260644359266,
522
+ "eval_loss": 0.6727278828620911,
523
+ "eval_runtime": 86.1985,
524
+ "eval_samples_per_second": 55.987,
525
+ "eval_steps_per_second": 14.003,
526
  "step": 3000
527
  },
528
  {
529
+ "epoch": 1.2635450119133949,
530
+ "grad_norm": 0.8456618785858154,
531
+ "learning_rate": 0.00011744609353122375,
532
+ "loss": 0.763,
533
  "step": 3050
534
  },
535
  {
536
+ "epoch": 1.284263959390863,
537
+ "grad_norm": 0.5733729600906372,
538
+ "learning_rate": 0.00011604592551106132,
539
+ "loss": 0.7034,
540
  "step": 3100
541
  },
542
  {
543
+ "epoch": 1.304982906868331,
544
+ "grad_norm": 0.4783104658126831,
545
+ "learning_rate": 0.00011464575749089892,
546
+ "loss": 0.762,
547
  "step": 3150
548
  },
549
  {
550
+ "epoch": 1.3257018543457992,
551
+ "grad_norm": 0.7016689777374268,
552
+ "learning_rate": 0.0001132455894707365,
553
+ "loss": 0.7049,
554
  "step": 3200
555
  },
556
  {
557
+ "epoch": 1.3464208018232675,
558
+ "grad_norm": 0.6739513278007507,
559
+ "learning_rate": 0.00011184542145057409,
560
+ "loss": 0.7137,
561
  "step": 3250
562
  },
563
  {
564
+ "epoch": 1.3464208018232675,
565
+ "eval_loss": 0.6689812541007996,
566
+ "eval_runtime": 86.4895,
567
+ "eval_samples_per_second": 55.799,
568
+ "eval_steps_per_second": 13.955,
569
  "step": 3250
570
  },
571
  {
572
+ "epoch": 1.3671397493007356,
573
+ "grad_norm": 0.8907766938209534,
574
+ "learning_rate": 0.00011044525343041166,
575
+ "loss": 0.7476,
576
  "step": 3300
577
  },
578
  {
579
+ "epoch": 1.3878586967782036,
580
+ "grad_norm": 0.8889743089675903,
581
+ "learning_rate": 0.00010904508541024922,
582
+ "loss": 0.7059,
583
  "step": 3350
584
  },
585
  {
586
+ "epoch": 1.408577644255672,
587
+ "grad_norm": 0.5788094401359558,
588
+ "learning_rate": 0.00010764491739008682,
589
+ "loss": 0.7018,
590
  "step": 3400
591
  },
592
  {
593
+ "epoch": 1.42929659173314,
594
+ "grad_norm": 0.7107548713684082,
595
+ "learning_rate": 0.00010624474936992438,
596
+ "loss": 0.6796,
597
  "step": 3450
598
  },
599
  {
600
+ "epoch": 1.4500155392106082,
601
+ "grad_norm": 0.6979348063468933,
602
+ "learning_rate": 0.00010484458134976198,
603
+ "loss": 0.7212,
604
  "step": 3500
605
  },
606
  {
607
+ "epoch": 1.4500155392106082,
608
+ "eval_loss": 0.6663665175437927,
609
+ "eval_runtime": 86.5532,
610
+ "eval_samples_per_second": 55.758,
611
+ "eval_steps_per_second": 13.945,
612
  "step": 3500
613
  }
614
  ],
615
  "logging_steps": 50,
616
+ "max_steps": 7242,
617
  "num_input_tokens_seen": 0,
618
  "num_train_epochs": 3,
619
  "save_steps": 500,
 
629
  "attributes": {}
630
  }
631
  },
632
+ "total_flos": 2.2030969497550848e+17,
633
  "train_batch_size": 4,
634
  "trial_name": null,
635
  "trial_params": null
checkpoint-3500/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:eb76e3e4d2123d52e529262f1ff37bfc600a160bb369e2338a54f71b47c17108
3
  size 5432
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a783872a61d64f4d1b4e002fa0fa67fa54dd9c59c4f7fbad58c794a77134069d
3
  size 5432
checkpoint-4000/adapter_config.json CHANGED
@@ -29,9 +29,9 @@
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
 
32
  "o_proj",
33
  "v_proj",
34
- "q_proj",
35
  "k_proj"
36
  ],
37
  "target_parameters": null,
 
29
  "rank_pattern": {},
30
  "revision": null,
31
  "target_modules": [
32
+ "q_proj",
33
  "o_proj",
34
  "v_proj",
 
35
  "k_proj"
36
  ],
37
  "target_parameters": null,