TinyPixel committed on
Commit
a8ab680
·
1 Parent(s): cd25384

Upload folder using huggingface_hub

Browse files
adapter_config.json CHANGED
@@ -1,21 +1,22 @@
1
  {
2
  "auto_mapping": null,
3
- "base_model_name_or_path": "PY007/TinyLlama-1.1B-step-50K-105b",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
7
  "init_lora_weights": true,
8
  "layers_pattern": null,
9
  "layers_to_transform": null,
10
- "lora_alpha": 32,
11
- "lora_dropout": 0.05,
12
  "modules_to_save": null,
13
  "peft_type": "LORA",
14
- "r": 16,
15
  "revision": null,
16
  "target_modules": [
17
  "q_proj",
18
- "v_proj"
 
19
  ],
20
  "task_type": "CAUSAL_LM"
21
  }
 
1
  {
2
  "auto_mapping": null,
3
+ "base_model_name_or_path": "TinyPixel/Llama-2-7B-bf16-sharded",
4
  "bias": "none",
5
  "fan_in_fan_out": false,
6
  "inference_mode": true,
7
  "init_lora_weights": true,
8
  "layers_pattern": null,
9
  "layers_to_transform": null,
10
+ "lora_alpha": 16,
11
+ "lora_dropout": 0.1,
12
  "modules_to_save": null,
13
  "peft_type": "LORA",
14
+ "r": 64,
15
  "revision": null,
16
  "target_modules": [
17
  "q_proj",
18
+ "k_proj",
19
+ "o_projv_proj"
20
  ],
21
  "task_type": "CAUSAL_LM"
22
  }
adapter_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49b1db649f62608a292c65882e498e09844919514cdfc53b68de7ff96d371500
3
- size 9042553
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8a7b95fabdd309ba6fc001b1942bc705736076ade7f83927a6dd0b3d6b69ec7b
3
+ size 134263757
optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3534ebf92322594ad9ded0e430d0d660efc80b124d5d5679647303ec7ad7847
3
- size 18096133
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a69463ff320de46aac4dc98556d1d77e0742915c146eb65bcb3a76fe05c5a65a
3
+ size 268514437
rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:709998aff45d3d23e134a514167229d5823265273a2c5779b5162f5aee126bcb
3
  size 14575
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:355ab504af59eeb21227012a9502c0bb3006b3bd131acc80713acf4b2bb148ae
3
  size 14575
scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cfb3cda746669cc2759fad95155fafc2b9f77c1d115ae34e38b020c310aa9572
3
  size 627
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0bd7b80ec246d44d8b81124b8ba36f2bf12bf8f35c030e50fad44a064bd194e8
3
  size 627
special_tokens_map.json CHANGED
@@ -2,14 +2,14 @@
2
  "bos_token": {
3
  "content": "<s>",
4
  "lstrip": false,
5
- "normalized": false,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
  "content": "</s>",
11
  "lstrip": false,
12
- "normalized": false,
13
  "rstrip": false,
14
  "single_word": false
15
  },
@@ -17,7 +17,7 @@
17
  "unk_token": {
18
  "content": "<unk>",
19
  "lstrip": false,
20
- "normalized": false,
21
  "rstrip": false,
22
  "single_word": false
23
  }
 
2
  "bos_token": {
3
  "content": "<s>",
4
  "lstrip": false,
5
+ "normalized": true,
6
  "rstrip": false,
7
  "single_word": false
8
  },
9
  "eos_token": {
10
  "content": "</s>",
11
  "lstrip": false,
12
+ "normalized": true,
13
  "rstrip": false,
14
  "single_word": false
15
  },
 
17
  "unk_token": {
18
  "content": "<unk>",
19
  "lstrip": false,
20
+ "normalized": true,
21
  "rstrip": false,
22
  "single_word": false
23
  }
tokenizer.json CHANGED
@@ -14,7 +14,7 @@
14
  "single_word": false,
15
  "lstrip": false,
16
  "rstrip": false,
17
- "normalized": false,
18
  "special": true
19
  },
20
  {
@@ -23,7 +23,7 @@
23
  "single_word": false,
24
  "lstrip": false,
25
  "rstrip": false,
26
- "normalized": false,
27
  "special": true
28
  },
29
  {
@@ -32,7 +32,7 @@
32
  "single_word": false,
33
  "lstrip": false,
34
  "rstrip": false,
35
- "normalized": false,
36
  "special": true
37
  }
38
  ],
@@ -93271,126 +93271,126 @@
93271
  "▁livre s",
93272
  "lu b",
93273
  "l ub",
93274
- "▁ ▁",
93275
  "▁▁ ▁▁",
93276
- "▁▁ ▁",
93277
- "▁ ▁▁▁",
 
93278
  "▁▁ ▁▁▁▁▁▁",
93279
- "▁▁▁▁ ▁▁▁▁",
93280
- "▁▁▁▁▁ ▁▁▁",
93281
- "▁▁▁▁▁▁ ▁▁",
93282
- "▁▁▁ ▁▁▁▁▁",
93283
- "▁▁▁▁▁▁▁ ▁",
93284
- "▁ ▁▁▁▁▁▁▁",
93285
- "▁▁ ▁▁▁",
93286
- "▁▁▁▁ ▁",
93287
- "▁▁▁ ▁▁",
93288
- "▁ ▁▁▁▁",
93289
  "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
 
 
 
93290
  "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
 
93291
  "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93292
  "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
93293
  "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
93294
  "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
 
 
 
93295
  "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
 
 
 
 
93296
  "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
 
 
93297
  "▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
 
 
 
 
 
 
 
93298
  "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93299
- "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93300
- "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93301
- "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93302
- "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93303
- "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
93304
- "▁▁ ▁▁▁▁",
93305
- "▁▁▁▁ ▁▁",
93306
- "▁▁▁▁▁ ▁",
93307
  "▁▁▁ ▁▁▁",
93308
- "▁ ▁▁▁▁▁",
93309
- "▁▁ ▁▁▁▁▁▁▁▁▁▁",
93310
- "▁▁▁▁ ▁▁▁▁▁▁▁▁",
93311
- "▁▁▁▁▁▁▁▁ ▁▁▁▁",
93312
- "▁▁▁▁▁ ▁▁▁▁▁▁▁",
93313
- "▁▁▁▁▁▁ ▁▁▁▁▁▁",
93314
- "▁▁▁▁▁▁▁▁▁▁ ▁▁",
93315
  "▁▁▁ ▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
93316
  "▁▁▁▁▁▁▁▁▁ ▁▁▁",
 
 
 
 
 
93317
  "▁▁▁▁▁▁▁ ▁▁▁▁▁",
93318
- "▁▁▁▁▁▁▁▁▁▁▁ ▁",
93319
- "▁ ▁▁▁▁▁▁▁▁▁▁▁",
93320
- "▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93321
- "▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93322
- "▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93323
- "▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93324
- "▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93325
- "▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93326
- "▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93327
- "▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93328
- "▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93329
  "▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93330
- "▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93331
- "▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93332
- "▁▁ ▁▁▁▁▁▁▁▁",
93333
- "▁▁▁▁ ▁▁▁▁▁▁",
93334
- "▁▁▁▁▁▁▁▁ ▁▁",
93335
- "▁▁▁▁▁ ▁▁▁▁▁",
93336
- "▁▁▁▁▁▁ ▁▁▁▁",
93337
- "▁▁▁ ▁▁▁▁▁▁▁",
93338
- "▁▁▁▁▁▁▁▁▁ ▁",
93339
  "▁▁▁▁▁▁▁ ▁▁▁",
93340
- "▁ ▁▁▁▁▁▁▁▁▁",
93341
- "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93342
- "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93343
- "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93344
- "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93345
- "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93346
- "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93347
- "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93348
- "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93349
- "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93350
- "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93351
  "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
 
 
 
 
93352
  "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93353
- "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93354
- "▁▁ ▁",
93355
  "▁ ▁▁",
93356
- "▁ ▁▁▁▁▁▁▁",
93357
- "▁▁▁▁ ▁▁▁▁▁",
93358
- "▁▁▁▁▁▁▁▁ ▁",
93359
- "▁▁▁▁▁ ▁▁▁▁",
93360
- "▁▁▁▁▁▁ ▁▁▁",
93361
- "▁▁▁ ▁▁▁▁▁▁",
93362
- "▁▁▁▁▁▁▁ ▁▁",
93363
  "▁ ▁▁▁▁▁▁▁▁",
93364
- "▁ ▁▁▁▁▁",
93365
- "▁▁▁▁ ▁▁▁",
93366
- "▁▁▁▁▁ ▁▁",
93367
- "▁▁▁▁▁▁ ▁",
93368
- "▁▁▁ ▁▁▁▁",
93369
  "▁ ▁▁▁▁▁▁",
93370
- "▁ ▁▁▁▁▁▁▁▁▁",
93371
- "▁▁▁▁ ▁▁▁▁▁▁▁",
93372
- "▁▁▁▁▁▁▁▁ ▁▁▁",
93373
- "▁▁▁▁▁ ▁▁▁▁▁▁",
93374
- "▁▁▁▁▁▁ ▁▁▁▁▁",
93375
- "▁▁▁▁▁▁▁▁▁▁ ▁",
93376
- "▁▁▁ ▁▁▁▁▁▁▁▁",
93377
- "▁▁▁▁▁▁▁▁▁ ▁▁",
93378
- "▁▁▁▁▁▁▁ ▁▁▁▁",
93379
  "▁ ▁▁▁▁▁▁▁▁▁▁",
93380
- "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93381
- "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93382
- "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93383
- "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93384
- "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93385
- "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93386
- "▁▁▁▁▁▁▁▁▁▁▁▁▁",
93387
- "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93388
- "▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93389
- "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93390
- "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93391
- "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93392
- "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93393
- "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁"
93394
  ]
93395
  }
93396
  }
 
14
  "single_word": false,
15
  "lstrip": false,
16
  "rstrip": false,
17
+ "normalized": true,
18
  "special": true
19
  },
20
  {
 
23
  "single_word": false,
24
  "lstrip": false,
25
  "rstrip": false,
26
+ "normalized": true,
27
  "special": true
28
  },
29
  {
 
32
  "single_word": false,
33
  "lstrip": false,
34
  "rstrip": false,
35
+ "normalized": true,
36
  "special": true
37
  }
38
  ],
 
93271
  "▁livre s",
93272
  "lu b",
93273
  "l ub",
 
93274
  "▁▁ ▁▁",
93275
+ "▁▁ ▁▁▁▁",
93276
+ "▁ ▁▁▁▁▁▁▁▁",
93277
+ "▁▁ ▁▁▁▁▁",
93278
  "▁▁ ▁▁▁▁▁▁",
93279
+ "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93280
+ "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93281
+ "▁▁ ▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
93282
  "▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
93283
+ "▁▁ ▁▁▁",
93284
+ "▁▁ ▁▁▁▁▁▁▁▁▁",
93285
+ "▁▁ ▁▁▁▁▁▁▁",
93286
+ "▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93287
+ "▁▁ ▁",
93288
+ "▁▁▁▁ ▁▁",
93289
+ "▁▁▁▁ ▁▁▁▁",
93290
+ "▁▁▁▁ ▁▁▁▁▁▁▁▁",
93291
+ "▁▁▁▁ ▁▁▁▁▁",
93292
+ "▁▁▁▁ ▁▁▁▁▁▁",
93293
  "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93294
+ "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93295
+ "▁▁▁▁ ▁▁▁",
93296
+ "▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93297
+ "▁▁▁▁ ▁▁▁▁▁▁▁",
93298
+ "▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93299
+ "▁▁▁▁ ▁",
93300
+ "▁▁▁▁▁▁▁▁ ▁▁",
93301
+ "▁▁▁▁▁▁▁▁ ▁▁▁▁",
93302
  "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93303
+ "▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93304
+ "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93305
+ "▁▁▁▁▁▁▁▁ ▁▁▁",
93306
+ "▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93307
+ "▁▁▁▁▁▁▁▁ ▁",
93308
+ "▁▁▁▁▁ ▁▁",
93309
+ "▁▁▁▁▁ ▁▁▁▁",
93310
+ "▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93311
+ "▁▁▁▁▁ ▁▁▁▁▁",
93312
+ "▁▁▁▁▁ ▁▁▁▁▁▁",
93313
+ "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93314
+ "▁▁▁▁▁ ▁▁▁",
93315
+ "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93316
+ "▁▁▁▁▁ ▁▁▁▁▁▁▁",
93317
  "▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93318
+ "▁▁▁▁▁ ▁",
93319
+ "▁▁▁▁▁▁ ▁▁",
93320
+ "▁▁▁▁▁▁ ▁▁▁▁",
93321
+ "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93322
+ "▁▁▁▁▁▁ ▁▁▁▁▁",
93323
+ "▁▁▁▁▁▁ ▁▁▁▁▁▁",
93324
  "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
93325
+ "▁▁▁▁▁▁ ▁▁▁",
93326
+ "▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
93327
+ "▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93328
+ "▁▁▁▁▁▁ ▁",
93329
+ "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93330
  "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93331
+ "▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93332
+ "▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93333
+ "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93334
  "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93335
+ "▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93336
+ "▁▁▁▁▁▁▁▁▁▁ ▁▁",
93337
+ "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93338
+ "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93339
  "▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93340
+ "▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93341
+ "▁▁▁▁▁▁▁▁▁▁ ▁",
93342
  "▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93343
+ "▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93344
+ "▁▁▁ ▁▁",
93345
+ "▁▁▁ ▁▁▁▁",
93346
+ "▁▁▁ ▁▁▁▁▁▁▁▁",
93347
+ "▁▁▁ ▁▁▁▁▁",
93348
+ "▁▁▁ ▁▁▁▁▁▁",
93349
+ "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93350
  "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93351
+ "▁▁▁ ▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
93352
  "▁▁▁ ▁▁▁",
 
 
 
 
 
 
 
93353
  "▁▁▁ ▁▁▁▁▁▁▁▁▁",
93354
+ "▁▁▁ ▁▁▁▁▁▁▁",
93355
+ "▁▁▁ ▁▁▁▁▁▁▁▁▁▁▁",
93356
+ "▁▁▁ ▁",
93357
+ "▁▁▁▁▁▁▁▁▁ ▁▁",
93358
+ "▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93359
+ "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93360
+ "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
93361
  "▁▁▁▁▁▁▁▁▁ ▁▁▁",
93362
+ "▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93363
+ "▁▁▁▁▁▁▁▁▁ ▁",
93364
+ "▁▁▁▁▁▁▁ ▁▁",
93365
+ "▁▁▁▁▁▁▁ ▁▁▁▁",
93366
+ "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁",
93367
  "▁▁▁▁▁▁▁ ▁▁▁▁▁",
 
 
 
 
 
 
 
 
 
 
 
93368
  "▁▁▁▁▁▁▁ ▁▁▁▁▁▁",
 
 
 
 
 
 
 
 
 
93369
  "▁▁▁▁▁▁▁ ▁▁▁",
93370
+ "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
 
 
 
93371
  "▁▁▁▁▁▁▁ ▁▁▁▁▁▁▁",
93372
+ "▁▁▁▁▁▁▁ ▁",
93373
+ "▁▁▁▁▁▁▁▁▁▁▁ ▁▁",
93374
+ "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁",
93375
+ "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁▁▁",
93376
  "▁▁▁▁▁▁▁▁▁▁▁ ▁▁▁",
93377
+ "▁▁▁▁▁▁▁▁▁▁▁ ▁",
93378
+ "▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁ ▁",
93379
  "▁ ▁▁",
93380
+ "▁ ▁▁▁▁",
 
 
 
 
 
 
93381
  "▁ ▁▁▁▁▁▁▁▁",
93382
+ "▁ ▁▁▁▁▁",
 
 
 
 
93383
  "▁ ▁▁▁▁▁▁",
93384
+ "▁ ▁▁▁▁▁▁▁▁▁▁▁▁",
93385
+ "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
 
 
 
 
 
 
 
93386
  "▁ ▁▁▁▁▁▁▁▁▁▁",
93387
+ "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁",
93388
+ "▁ ▁▁▁",
93389
+ "▁ ▁▁▁▁▁▁▁▁▁",
93390
+ "▁ ▁▁▁▁▁▁▁",
93391
+ "▁ ▁▁▁▁▁▁▁▁▁▁▁",
93392
+ "▁ ▁▁▁▁▁▁▁▁▁▁▁▁▁▁",
93393
+ "▁ ▁"
 
 
 
 
 
 
 
93394
  ]
93395
  }
93396
  }
tokenizer_config.json CHANGED
@@ -3,7 +3,7 @@
3
  "__type": "AddedToken",
4
  "content": "<s>",
5
  "lstrip": false,
6
- "normalized": false,
7
  "rstrip": false,
8
  "single_word": false
9
  },
@@ -12,21 +12,19 @@
12
  "__type": "AddedToken",
13
  "content": "</s>",
14
  "lstrip": false,
15
- "normalized": false,
16
  "rstrip": false,
17
  "single_word": false
18
  },
19
- "legacy": false,
20
  "model_max_length": 1000000000000000019884624838656,
21
  "pad_token": null,
22
- "padding_side": "right",
23
  "sp_model_kwargs": {},
24
  "tokenizer_class": "LlamaTokenizer",
25
  "unk_token": {
26
  "__type": "AddedToken",
27
  "content": "<unk>",
28
  "lstrip": false,
29
- "normalized": false,
30
  "rstrip": false,
31
  "single_word": false
32
  },
 
3
  "__type": "AddedToken",
4
  "content": "<s>",
5
  "lstrip": false,
6
+ "normalized": true,
7
  "rstrip": false,
8
  "single_word": false
9
  },
 
12
  "__type": "AddedToken",
13
  "content": "</s>",
14
  "lstrip": false,
15
+ "normalized": true,
16
  "rstrip": false,
17
  "single_word": false
18
  },
 
19
  "model_max_length": 1000000000000000019884624838656,
20
  "pad_token": null,
 
21
  "sp_model_kwargs": {},
22
  "tokenizer_class": "LlamaTokenizer",
23
  "unk_token": {
24
  "__type": "AddedToken",
25
  "content": "<unk>",
26
  "lstrip": false,
27
+ "normalized": true,
28
  "rstrip": false,
29
  "single_word": false
30
  },
trainer_state.json CHANGED
@@ -1,3099 +1,595 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 0.8,
5
- "eval_steps": 50,
6
- "global_step": 500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0,
13
- "learning_rate": 0.0001,
14
- "loss": 0.9116,
15
- "step": 1
16
- },
17
- {
18
- "epoch": 0.0,
19
- "learning_rate": 0.0001,
20
- "loss": 1.3723,
21
  "step": 2
22
  },
23
  {
24
- "epoch": 0.0,
25
- "learning_rate": 0.0001,
26
- "loss": 1.1627,
27
- "step": 3
28
- },
29
- {
30
- "epoch": 0.01,
31
- "learning_rate": 0.0001,
32
- "loss": 1.0757,
33
  "step": 4
34
  },
35
  {
36
- "epoch": 0.01,
37
- "learning_rate": 0.0001,
38
- "loss": 1.2296,
39
- "step": 5
40
- },
41
- {
42
- "epoch": 0.01,
43
- "learning_rate": 0.0001,
44
- "loss": 1.077,
45
  "step": 6
46
  },
47
  {
48
- "epoch": 0.01,
49
- "learning_rate": 0.0001,
50
- "loss": 0.9249,
51
- "step": 7
52
- },
53
- {
54
- "epoch": 0.01,
55
- "learning_rate": 0.0001,
56
- "loss": 0.759,
57
  "step": 8
58
  },
59
  {
60
- "epoch": 0.01,
61
- "learning_rate": 0.0001,
62
- "loss": 1.0388,
63
- "step": 9
64
- },
65
- {
66
- "epoch": 0.02,
67
- "learning_rate": 0.0001,
68
- "loss": 1.261,
69
  "step": 10
70
  },
71
  {
72
- "epoch": 0.02,
73
- "learning_rate": 0.0001,
74
- "loss": 1.3653,
75
- "step": 11
76
- },
77
- {
78
- "epoch": 0.02,
79
- "learning_rate": 0.0001,
80
- "loss": 0.8655,
81
  "step": 12
82
  },
83
  {
84
- "epoch": 0.02,
85
- "learning_rate": 0.0001,
86
- "loss": 0.9888,
87
- "step": 13
88
- },
89
- {
90
- "epoch": 0.02,
91
- "learning_rate": 0.0001,
92
- "loss": 1.1763,
93
  "step": 14
94
  },
95
  {
96
- "epoch": 0.02,
97
- "learning_rate": 0.0001,
98
- "loss": 1.0124,
99
- "step": 15
100
- },
101
- {
102
- "epoch": 0.03,
103
- "learning_rate": 0.0001,
104
- "loss": 0.9616,
105
  "step": 16
106
  },
107
  {
108
- "epoch": 0.03,
109
- "learning_rate": 0.0001,
110
- "loss": 0.8868,
111
- "step": 17
112
- },
113
- {
114
- "epoch": 0.03,
115
- "learning_rate": 0.0001,
116
- "loss": 0.7583,
117
  "step": 18
118
  },
119
  {
120
- "epoch": 0.03,
121
- "learning_rate": 0.0001,
122
- "loss": 0.9342,
123
- "step": 19
124
- },
125
- {
126
- "epoch": 0.03,
127
- "learning_rate": 0.0001,
128
- "loss": 1.0648,
129
  "step": 20
130
  },
131
  {
132
- "epoch": 0.03,
133
- "learning_rate": 0.0001,
134
- "loss": 0.8388,
135
- "step": 21
136
- },
137
- {
138
- "epoch": 0.04,
139
- "learning_rate": 0.0001,
140
- "loss": 0.7302,
141
  "step": 22
142
  },
143
  {
144
- "epoch": 0.04,
145
- "learning_rate": 0.0001,
146
- "loss": 0.8321,
147
- "step": 23
148
- },
149
- {
150
- "epoch": 0.04,
151
- "learning_rate": 0.0001,
152
- "loss": 0.7517,
153
  "step": 24
154
  },
155
  {
156
- "epoch": 0.04,
157
- "learning_rate": 0.0001,
158
- "loss": 0.8373,
159
- "step": 25
160
- },
161
- {
162
- "epoch": 0.04,
163
- "learning_rate": 0.0001,
164
- "loss": 1.1871,
165
  "step": 26
166
  },
167
  {
168
- "epoch": 0.04,
169
- "learning_rate": 0.0001,
170
- "loss": 0.9553,
171
- "step": 27
172
- },
173
- {
174
- "epoch": 0.04,
175
- "learning_rate": 0.0001,
176
- "loss": 0.6482,
177
  "step": 28
178
  },
179
  {
180
- "epoch": 0.05,
181
- "learning_rate": 0.0001,
182
- "loss": 1.1545,
183
- "step": 29
184
- },
185
- {
186
- "epoch": 0.05,
187
- "learning_rate": 0.0001,
188
- "loss": 0.8847,
189
  "step": 30
190
  },
191
  {
192
- "epoch": 0.05,
193
- "learning_rate": 0.0001,
194
- "loss": 0.7373,
195
- "step": 31
196
- },
197
- {
198
- "epoch": 0.05,
199
- "learning_rate": 0.0001,
200
- "loss": 0.7648,
201
  "step": 32
202
  },
203
  {
204
- "epoch": 0.05,
205
- "learning_rate": 0.0001,
206
- "loss": 1.0672,
207
- "step": 33
208
- },
209
- {
210
- "epoch": 0.05,
211
- "learning_rate": 0.0001,
212
- "loss": 0.8169,
213
  "step": 34
214
  },
215
  {
216
- "epoch": 0.06,
217
- "learning_rate": 0.0001,
218
- "loss": 0.6257,
219
- "step": 35
220
- },
221
- {
222
- "epoch": 0.06,
223
- "learning_rate": 0.0001,
224
- "loss": 1.1182,
225
  "step": 36
226
  },
227
  {
228
- "epoch": 0.06,
229
- "learning_rate": 0.0001,
230
- "loss": 0.7479,
231
- "step": 37
232
- },
233
- {
234
- "epoch": 0.06,
235
- "learning_rate": 0.0001,
236
- "loss": 0.6269,
237
  "step": 38
238
  },
239
  {
240
- "epoch": 0.06,
241
- "learning_rate": 0.0001,
242
- "loss": 0.7677,
243
- "step": 39
244
- },
245
- {
246
- "epoch": 0.06,
247
- "learning_rate": 0.0001,
248
- "loss": 0.6104,
249
  "step": 40
250
  },
251
  {
252
- "epoch": 0.07,
253
- "learning_rate": 0.0001,
254
- "loss": 0.7166,
255
- "step": 41
256
- },
257
- {
258
- "epoch": 0.07,
259
- "learning_rate": 0.0001,
260
- "loss": 0.8078,
261
  "step": 42
262
  },
263
  {
264
- "epoch": 0.07,
265
- "learning_rate": 0.0001,
266
- "loss": 1.2132,
267
- "step": 43
268
- },
269
- {
270
- "epoch": 0.07,
271
- "learning_rate": 0.0001,
272
- "loss": 0.7491,
273
  "step": 44
274
  },
275
  {
276
- "epoch": 0.07,
277
- "learning_rate": 0.0001,
278
- "loss": 0.8092,
279
- "step": 45
280
- },
281
- {
282
- "epoch": 0.07,
283
- "learning_rate": 0.0001,
284
- "loss": 0.8556,
285
  "step": 46
286
  },
287
  {
288
- "epoch": 0.08,
289
- "learning_rate": 0.0001,
290
- "loss": 0.9427,
291
- "step": 47
292
- },
293
- {
294
- "epoch": 0.08,
295
- "learning_rate": 0.0001,
296
- "loss": 1.1345,
297
  "step": 48
298
  },
299
  {
300
- "epoch": 0.08,
301
- "learning_rate": 0.0001,
302
- "loss": 0.709,
303
- "step": 49
304
- },
305
- {
306
- "epoch": 0.08,
307
- "learning_rate": 0.0001,
308
- "loss": 0.6406,
309
- "step": 50
310
- },
311
- {
312
- "epoch": 0.08,
313
- "eval_loss": 0.868549644947052,
314
- "eval_runtime": 23.3187,
315
- "eval_samples_per_second": 8.577,
316
- "eval_steps_per_second": 2.144,
317
  "step": 50
318
  },
319
  {
320
- "epoch": 0.08,
321
- "learning_rate": 0.0001,
322
- "loss": 0.865,
323
- "step": 51
324
- },
325
- {
326
- "epoch": 0.08,
327
- "learning_rate": 0.0001,
328
- "loss": 0.6379,
329
  "step": 52
330
  },
331
  {
332
- "epoch": 0.08,
333
- "learning_rate": 0.0001,
334
- "loss": 0.6535,
335
- "step": 53
336
- },
337
- {
338
- "epoch": 0.09,
339
- "learning_rate": 0.0001,
340
- "loss": 0.9263,
341
  "step": 54
342
  },
343
  {
344
- "epoch": 0.09,
345
- "learning_rate": 0.0001,
346
- "loss": 1.0689,
347
- "step": 55
348
- },
349
- {
350
- "epoch": 0.09,
351
- "learning_rate": 0.0001,
352
- "loss": 0.9504,
353
  "step": 56
354
  },
355
  {
356
- "epoch": 0.09,
357
- "learning_rate": 0.0001,
358
- "loss": 0.7806,
359
- "step": 57
360
- },
361
- {
362
- "epoch": 0.09,
363
- "learning_rate": 0.0001,
364
- "loss": 0.8006,
365
  "step": 58
366
  },
367
  {
368
- "epoch": 0.09,
369
- "learning_rate": 0.0001,
370
- "loss": 0.6191,
371
- "step": 59
372
- },
373
- {
374
- "epoch": 0.1,
375
- "learning_rate": 0.0001,
376
- "loss": 0.872,
377
  "step": 60
378
  },
379
  {
380
- "epoch": 0.1,
381
- "learning_rate": 0.0001,
382
- "loss": 0.7302,
383
- "step": 61
384
- },
385
- {
386
- "epoch": 0.1,
387
- "learning_rate": 0.0001,
388
- "loss": 0.7796,
389
  "step": 62
390
  },
391
  {
392
- "epoch": 0.1,
393
- "learning_rate": 0.0001,
394
- "loss": 0.8935,
395
- "step": 63
396
- },
397
- {
398
- "epoch": 0.1,
399
- "learning_rate": 0.0001,
400
- "loss": 0.733,
401
  "step": 64
402
  },
403
  {
404
- "epoch": 0.1,
405
- "learning_rate": 0.0001,
406
- "loss": 1.1531,
407
- "step": 65
408
- },
409
- {
410
- "epoch": 0.11,
411
- "learning_rate": 0.0001,
412
- "loss": 0.8504,
413
  "step": 66
414
  },
415
  {
416
- "epoch": 0.11,
417
- "learning_rate": 0.0001,
418
- "loss": 0.6734,
419
- "step": 67
420
- },
421
- {
422
- "epoch": 0.11,
423
- "learning_rate": 0.0001,
424
- "loss": 1.2797,
425
  "step": 68
426
  },
427
  {
428
- "epoch": 0.11,
429
- "learning_rate": 0.0001,
430
- "loss": 0.8812,
431
- "step": 69
432
- },
433
- {
434
- "epoch": 0.11,
435
- "learning_rate": 0.0001,
436
- "loss": 0.6963,
437
  "step": 70
438
  },
439
  {
440
- "epoch": 0.11,
441
- "learning_rate": 0.0001,
442
- "loss": 1.1081,
443
- "step": 71
444
- },
445
- {
446
- "epoch": 0.12,
447
- "learning_rate": 0.0001,
448
- "loss": 0.7996,
449
  "step": 72
450
  },
451
  {
452
- "epoch": 0.12,
453
- "learning_rate": 0.0001,
454
- "loss": 0.7439,
455
- "step": 73
456
- },
457
- {
458
- "epoch": 0.12,
459
- "learning_rate": 0.0001,
460
- "loss": 0.5573,
461
  "step": 74
462
  },
463
  {
464
- "epoch": 0.12,
465
- "learning_rate": 0.0001,
466
- "loss": 0.6985,
467
- "step": 75
468
- },
469
- {
470
- "epoch": 0.12,
471
- "learning_rate": 0.0001,
472
- "loss": 1.2053,
473
  "step": 76
474
  },
475
  {
476
- "epoch": 0.12,
477
- "learning_rate": 0.0001,
478
- "loss": 1.0202,
479
- "step": 77
480
- },
481
- {
482
- "epoch": 0.12,
483
- "learning_rate": 0.0001,
484
- "loss": 0.7611,
485
  "step": 78
486
  },
487
  {
488
- "epoch": 0.13,
489
- "learning_rate": 0.0001,
490
- "loss": 0.8152,
491
- "step": 79
492
- },
493
- {
494
- "epoch": 0.13,
495
- "learning_rate": 0.0001,
496
- "loss": 1.0737,
497
  "step": 80
498
  },
499
  {
500
- "epoch": 0.13,
501
- "learning_rate": 0.0001,
502
- "loss": 0.6072,
503
- "step": 81
504
- },
505
- {
506
- "epoch": 0.13,
507
- "learning_rate": 0.0001,
508
- "loss": 0.7347,
509
  "step": 82
510
  },
511
  {
512
- "epoch": 0.13,
513
- "learning_rate": 0.0001,
514
- "loss": 0.7593,
515
- "step": 83
516
- },
517
- {
518
- "epoch": 0.13,
519
- "learning_rate": 0.0001,
520
- "loss": 0.7908,
521
  "step": 84
522
  },
523
  {
524
- "epoch": 0.14,
525
- "learning_rate": 0.0001,
526
- "loss": 1.1343,
527
- "step": 85
528
- },
529
- {
530
- "epoch": 0.14,
531
- "learning_rate": 0.0001,
532
- "loss": 0.6875,
533
  "step": 86
534
  },
535
  {
536
- "epoch": 0.14,
537
- "learning_rate": 0.0001,
538
- "loss": 0.5912,
539
- "step": 87
540
- },
541
- {
542
- "epoch": 0.14,
543
- "learning_rate": 0.0001,
544
- "loss": 0.7937,
545
  "step": 88
546
  },
547
  {
548
- "epoch": 0.14,
549
- "learning_rate": 0.0001,
550
- "loss": 0.8358,
551
- "step": 89
552
- },
553
- {
554
- "epoch": 0.14,
555
- "learning_rate": 0.0001,
556
- "loss": 0.6399,
557
  "step": 90
558
  },
559
  {
560
- "epoch": 0.15,
561
- "learning_rate": 0.0001,
562
- "loss": 0.9131,
563
- "step": 91
564
- },
565
- {
566
- "epoch": 0.15,
567
- "learning_rate": 0.0001,
568
- "loss": 0.7699,
569
  "step": 92
570
  },
571
  {
572
- "epoch": 0.15,
573
- "learning_rate": 0.0001,
574
- "loss": 0.5413,
575
- "step": 93
576
- },
577
- {
578
- "epoch": 0.15,
579
- "learning_rate": 0.0001,
580
- "loss": 1.0011,
581
  "step": 94
582
  },
583
  {
584
- "epoch": 0.15,
585
- "learning_rate": 0.0001,
586
- "loss": 0.6131,
587
- "step": 95
588
- },
589
- {
590
- "epoch": 0.15,
591
- "learning_rate": 0.0001,
592
- "loss": 0.8474,
593
  "step": 96
594
  },
595
  {
596
- "epoch": 0.16,
597
- "learning_rate": 0.0001,
598
- "loss": 1.0663,
599
- "step": 97
600
- },
601
- {
602
- "epoch": 0.16,
603
- "learning_rate": 0.0001,
604
- "loss": 0.6811,
605
  "step": 98
606
  },
607
  {
608
- "epoch": 0.16,
609
- "learning_rate": 0.0001,
610
- "loss": 0.9187,
611
- "step": 99
612
- },
613
- {
614
- "epoch": 0.16,
615
- "learning_rate": 0.0001,
616
- "loss": 0.737,
617
- "step": 100
618
- },
619
- {
620
- "epoch": 0.16,
621
- "eval_loss": 0.8485437035560608,
622
- "eval_runtime": 24.65,
623
- "eval_samples_per_second": 8.114,
624
- "eval_steps_per_second": 2.028,
625
  "step": 100
626
  },
627
  {
628
- "epoch": 0.16,
629
- "learning_rate": 0.0001,
630
- "loss": 0.8255,
631
- "step": 101
632
- },
633
- {
634
- "epoch": 0.16,
635
- "learning_rate": 0.0001,
636
- "loss": 0.9627,
637
  "step": 102
638
  },
639
  {
640
- "epoch": 0.16,
641
- "learning_rate": 0.0001,
642
- "loss": 0.7725,
643
- "step": 103
644
- },
645
- {
646
- "epoch": 0.17,
647
- "learning_rate": 0.0001,
648
- "loss": 0.8507,
649
  "step": 104
650
  },
651
  {
652
- "epoch": 0.17,
653
- "learning_rate": 0.0001,
654
- "loss": 0.8143,
655
- "step": 105
656
- },
657
- {
658
- "epoch": 0.17,
659
- "learning_rate": 0.0001,
660
- "loss": 0.6684,
661
  "step": 106
662
  },
663
  {
664
- "epoch": 0.17,
665
- "learning_rate": 0.0001,
666
- "loss": 0.7614,
667
- "step": 107
668
- },
669
- {
670
- "epoch": 0.17,
671
- "learning_rate": 0.0001,
672
- "loss": 0.7802,
673
  "step": 108
674
  },
675
  {
676
- "epoch": 0.17,
677
- "learning_rate": 0.0001,
678
- "loss": 1.0361,
679
- "step": 109
680
- },
681
- {
682
- "epoch": 0.18,
683
- "learning_rate": 0.0001,
684
- "loss": 0.987,
685
  "step": 110
686
  },
687
  {
688
- "epoch": 0.18,
689
- "learning_rate": 0.0001,
690
- "loss": 0.8892,
691
- "step": 111
692
- },
693
- {
694
- "epoch": 0.18,
695
- "learning_rate": 0.0001,
696
- "loss": 0.8485,
697
  "step": 112
698
  },
699
  {
700
- "epoch": 0.18,
701
- "learning_rate": 0.0001,
702
- "loss": 0.8788,
703
- "step": 113
704
- },
705
- {
706
- "epoch": 0.18,
707
- "learning_rate": 0.0001,
708
- "loss": 0.6153,
709
  "step": 114
710
  },
711
  {
712
- "epoch": 0.18,
713
- "learning_rate": 0.0001,
714
- "loss": 0.7439,
715
- "step": 115
716
- },
717
- {
718
- "epoch": 0.19,
719
- "learning_rate": 0.0001,
720
- "loss": 0.7335,
721
  "step": 116
722
  },
723
  {
724
- "epoch": 0.19,
725
- "learning_rate": 0.0001,
726
- "loss": 0.8796,
727
- "step": 117
728
- },
729
- {
730
- "epoch": 0.19,
731
- "learning_rate": 0.0001,
732
- "loss": 0.6855,
733
  "step": 118
734
  },
735
  {
736
- "epoch": 0.19,
737
- "learning_rate": 0.0001,
738
- "loss": 0.8044,
739
- "step": 119
740
- },
741
- {
742
- "epoch": 0.19,
743
- "learning_rate": 0.0001,
744
- "loss": 1.0749,
745
  "step": 120
746
  },
747
  {
748
- "epoch": 0.19,
749
- "learning_rate": 0.0001,
750
- "loss": 0.9934,
751
- "step": 121
752
- },
753
- {
754
- "epoch": 0.2,
755
- "learning_rate": 0.0001,
756
- "loss": 0.7191,
757
  "step": 122
758
  },
759
  {
760
- "epoch": 0.2,
761
- "learning_rate": 0.0001,
762
- "loss": 0.6728,
763
- "step": 123
764
- },
765
- {
766
- "epoch": 0.2,
767
- "learning_rate": 0.0001,
768
- "loss": 0.7966,
769
  "step": 124
770
  },
771
  {
772
- "epoch": 0.2,
773
- "learning_rate": 0.0001,
774
- "loss": 0.6851,
775
- "step": 125
776
- },
777
- {
778
- "epoch": 0.2,
779
- "learning_rate": 0.0001,
780
- "loss": 0.7798,
781
  "step": 126
782
  },
783
  {
784
- "epoch": 0.2,
785
- "learning_rate": 0.0001,
786
- "loss": 0.7101,
787
- "step": 127
788
- },
789
- {
790
- "epoch": 0.2,
791
- "learning_rate": 0.0001,
792
- "loss": 0.8464,
793
  "step": 128
794
  },
795
  {
796
- "epoch": 0.21,
797
- "learning_rate": 0.0001,
798
- "loss": 0.6968,
799
- "step": 129
800
- },
801
- {
802
- "epoch": 0.21,
803
- "learning_rate": 0.0001,
804
- "loss": 0.9243,
805
  "step": 130
806
  },
807
  {
808
- "epoch": 0.21,
809
- "learning_rate": 0.0001,
810
- "loss": 0.9137,
811
- "step": 131
812
- },
813
- {
814
- "epoch": 0.21,
815
- "learning_rate": 0.0001,
816
- "loss": 1.0197,
817
  "step": 132
818
  },
819
  {
820
- "epoch": 0.21,
821
- "learning_rate": 0.0001,
822
- "loss": 0.7454,
823
- "step": 133
824
- },
825
- {
826
- "epoch": 0.21,
827
- "learning_rate": 0.0001,
828
- "loss": 0.6409,
829
  "step": 134
830
  },
831
  {
832
- "epoch": 0.22,
833
- "learning_rate": 0.0001,
834
- "loss": 0.9743,
835
- "step": 135
836
- },
837
- {
838
- "epoch": 0.22,
839
- "learning_rate": 0.0001,
840
- "loss": 0.7787,
841
  "step": 136
842
  },
843
  {
844
- "epoch": 0.22,
845
- "learning_rate": 0.0001,
846
- "loss": 1.2262,
847
- "step": 137
848
- },
849
- {
850
- "epoch": 0.22,
851
- "learning_rate": 0.0001,
852
- "loss": 0.7161,
853
  "step": 138
854
  },
855
  {
856
- "epoch": 0.22,
857
- "learning_rate": 0.0001,
858
- "loss": 0.9221,
859
- "step": 139
860
- },
861
- {
862
- "epoch": 0.22,
863
- "learning_rate": 0.0001,
864
- "loss": 0.5967,
865
  "step": 140
866
  },
867
  {
868
- "epoch": 0.23,
869
- "learning_rate": 0.0001,
870
- "loss": 0.8278,
871
- "step": 141
872
- },
873
- {
874
- "epoch": 0.23,
875
- "learning_rate": 0.0001,
876
- "loss": 0.7456,
877
  "step": 142
878
  },
879
  {
880
- "epoch": 0.23,
881
- "learning_rate": 0.0001,
882
- "loss": 1.074,
883
- "step": 143
884
- },
885
- {
886
- "epoch": 0.23,
887
- "learning_rate": 0.0001,
888
- "loss": 0.579,
889
  "step": 144
890
  },
891
  {
892
- "epoch": 0.23,
893
- "learning_rate": 0.0001,
894
- "loss": 0.7077,
895
- "step": 145
896
- },
897
- {
898
- "epoch": 0.23,
899
- "learning_rate": 0.0001,
900
- "loss": 0.7434,
901
  "step": 146
902
  },
903
  {
904
- "epoch": 0.24,
905
- "learning_rate": 0.0001,
906
- "loss": 0.7941,
907
- "step": 147
908
- },
909
- {
910
- "epoch": 0.24,
911
- "learning_rate": 0.0001,
912
- "loss": 0.8617,
913
  "step": 148
914
  },
915
  {
916
- "epoch": 0.24,
917
- "learning_rate": 0.0001,
918
- "loss": 1.1551,
919
- "step": 149
920
- },
921
- {
922
- "epoch": 0.24,
923
- "learning_rate": 0.0001,
924
- "loss": 0.6728,
925
- "step": 150
926
- },
927
- {
928
- "epoch": 0.24,
929
- "eval_loss": 0.834892749786377,
930
- "eval_runtime": 24.8389,
931
- "eval_samples_per_second": 8.052,
932
- "eval_steps_per_second": 2.013,
933
  "step": 150
934
  },
935
  {
936
- "epoch": 0.24,
937
- "learning_rate": 0.0001,
938
- "loss": 0.6695,
939
- "step": 151
940
- },
941
- {
942
- "epoch": 0.24,
943
- "learning_rate": 0.0001,
944
- "loss": 1.2884,
945
  "step": 152
946
  },
947
  {
948
- "epoch": 0.24,
949
- "learning_rate": 0.0001,
950
- "loss": 0.7167,
951
- "step": 153
952
- },
953
- {
954
- "epoch": 0.25,
955
- "learning_rate": 0.0001,
956
- "loss": 0.6741,
957
  "step": 154
958
  },
959
  {
960
- "epoch": 0.25,
961
- "learning_rate": 0.0001,
962
- "loss": 1.188,
963
- "step": 155
964
- },
965
- {
966
- "epoch": 0.25,
967
- "learning_rate": 0.0001,
968
- "loss": 0.9072,
969
  "step": 156
970
  },
971
  {
972
- "epoch": 0.25,
973
- "learning_rate": 0.0001,
974
- "loss": 1.1358,
975
- "step": 157
976
- },
977
- {
978
- "epoch": 0.25,
979
- "learning_rate": 0.0001,
980
- "loss": 1.1017,
981
  "step": 158
982
  },
983
  {
984
- "epoch": 0.25,
985
- "learning_rate": 0.0001,
986
- "loss": 1.1505,
987
- "step": 159
988
- },
989
- {
990
- "epoch": 0.26,
991
- "learning_rate": 0.0001,
992
- "loss": 0.6388,
993
  "step": 160
994
  },
995
  {
996
- "epoch": 0.26,
997
- "learning_rate": 0.0001,
998
- "loss": 0.7861,
999
- "step": 161
1000
- },
1001
- {
1002
- "epoch": 0.26,
1003
- "learning_rate": 0.0001,
1004
- "loss": 0.8031,
1005
  "step": 162
1006
  },
1007
  {
1008
- "epoch": 0.26,
1009
- "learning_rate": 0.0001,
1010
- "loss": 1.0333,
1011
- "step": 163
1012
- },
1013
- {
1014
- "epoch": 0.26,
1015
- "learning_rate": 0.0001,
1016
- "loss": 0.7221,
1017
  "step": 164
1018
  },
1019
  {
1020
- "epoch": 0.26,
1021
- "learning_rate": 0.0001,
1022
- "loss": 0.9964,
1023
- "step": 165
1024
- },
1025
- {
1026
- "epoch": 0.27,
1027
- "learning_rate": 0.0001,
1028
- "loss": 1.0135,
1029
  "step": 166
1030
  },
1031
  {
1032
- "epoch": 0.27,
1033
- "learning_rate": 0.0001,
1034
- "loss": 0.8494,
1035
- "step": 167
1036
- },
1037
- {
1038
- "epoch": 0.27,
1039
- "learning_rate": 0.0001,
1040
- "loss": 0.9859,
1041
  "step": 168
1042
  },
1043
  {
1044
- "epoch": 0.27,
1045
- "learning_rate": 0.0001,
1046
- "loss": 0.7997,
1047
- "step": 169
1048
- },
1049
- {
1050
- "epoch": 0.27,
1051
- "learning_rate": 0.0001,
1052
- "loss": 0.9404,
1053
  "step": 170
1054
  },
1055
  {
1056
- "epoch": 0.27,
1057
- "learning_rate": 0.0001,
1058
- "loss": 0.6323,
1059
- "step": 171
1060
  },
1061
  {
1062
- "epoch": 0.28,
1063
- "learning_rate": 0.0001,
1064
- "loss": 0.8403,
1065
- "step": 172
1066
- },
1067
- {
1068
- "epoch": 0.28,
1069
- "learning_rate": 0.0001,
1070
- "loss": 0.582,
1071
- "step": 173
1072
- },
1073
- {
1074
- "epoch": 0.28,
1075
- "learning_rate": 0.0001,
1076
- "loss": 0.9193,
1077
  "step": 174
1078
  },
1079
  {
1080
- "epoch": 0.28,
1081
- "learning_rate": 0.0001,
1082
- "loss": 0.8621,
1083
- "step": 175
1084
- },
1085
- {
1086
- "epoch": 0.28,
1087
- "learning_rate": 0.0001,
1088
- "loss": 0.5766,
1089
  "step": 176
1090
  },
1091
  {
1092
- "epoch": 0.28,
1093
- "learning_rate": 0.0001,
1094
- "loss": 0.7207,
1095
- "step": 177
1096
- },
1097
- {
1098
- "epoch": 0.28,
1099
- "learning_rate": 0.0001,
1100
- "loss": 0.5348,
1101
  "step": 178
1102
  },
1103
  {
1104
- "epoch": 0.29,
1105
- "learning_rate": 0.0001,
1106
- "loss": 0.6951,
1107
- "step": 179
1108
- },
1109
- {
1110
- "epoch": 0.29,
1111
- "learning_rate": 0.0001,
1112
- "loss": 0.9066,
1113
  "step": 180
1114
  },
1115
  {
1116
- "epoch": 0.29,
1117
- "learning_rate": 0.0001,
1118
- "loss": 0.6585,
1119
- "step": 181
1120
- },
1121
- {
1122
- "epoch": 0.29,
1123
- "learning_rate": 0.0001,
1124
- "loss": 0.7614,
1125
  "step": 182
1126
  },
1127
  {
1128
- "epoch": 0.29,
1129
- "learning_rate": 0.0001,
1130
- "loss": 0.8271,
1131
- "step": 183
1132
- },
1133
- {
1134
- "epoch": 0.29,
1135
- "learning_rate": 0.0001,
1136
- "loss": 0.8223,
1137
  "step": 184
1138
  },
1139
  {
1140
- "epoch": 0.3,
1141
- "learning_rate": 0.0001,
1142
- "loss": 0.5425,
1143
- "step": 185
1144
- },
1145
- {
1146
- "epoch": 0.3,
1147
- "learning_rate": 0.0001,
1148
- "loss": 0.9519,
1149
  "step": 186
1150
  },
1151
  {
1152
- "epoch": 0.3,
1153
- "learning_rate": 0.0001,
1154
- "loss": 1.1696,
1155
- "step": 187
1156
- },
1157
- {
1158
- "epoch": 0.3,
1159
- "learning_rate": 0.0001,
1160
- "loss": 1.0861,
1161
  "step": 188
1162
  },
1163
  {
1164
- "epoch": 0.3,
1165
- "learning_rate": 0.0001,
1166
- "loss": 0.6459,
1167
- "step": 189
1168
- },
1169
- {
1170
- "epoch": 0.3,
1171
- "learning_rate": 0.0001,
1172
- "loss": 1.0339,
1173
  "step": 190
1174
  },
1175
  {
1176
- "epoch": 0.31,
1177
- "learning_rate": 0.0001,
1178
- "loss": 0.6049,
1179
- "step": 191
1180
- },
1181
- {
1182
- "epoch": 0.31,
1183
- "learning_rate": 0.0001,
1184
- "loss": 0.4248,
1185
  "step": 192
1186
- },
1187
- {
1188
- "epoch": 0.31,
1189
- "learning_rate": 0.0001,
1190
- "loss": 0.6728,
1191
- "step": 193
1192
- },
1193
- {
1194
- "epoch": 0.31,
1195
- "learning_rate": 0.0001,
1196
- "loss": 0.844,
1197
- "step": 194
1198
- },
1199
- {
1200
- "epoch": 0.31,
1201
- "learning_rate": 0.0001,
1202
- "loss": 0.9513,
1203
- "step": 195
1204
- },
1205
- {
1206
- "epoch": 0.31,
1207
- "learning_rate": 0.0001,
1208
- "loss": 0.9057,
1209
- "step": 196
1210
- },
1211
- {
1212
- "epoch": 0.32,
1213
- "learning_rate": 0.0001,
1214
- "loss": 0.8658,
1215
- "step": 197
1216
- },
1217
- {
1218
- "epoch": 0.32,
1219
- "learning_rate": 0.0001,
1220
- "loss": 1.1457,
1221
- "step": 198
1222
- },
1223
- {
1224
- "epoch": 0.32,
1225
- "learning_rate": 0.0001,
1226
- "loss": 0.6222,
1227
- "step": 199
1228
- },
1229
- {
1230
- "epoch": 0.32,
1231
- "learning_rate": 0.0001,
1232
- "loss": 0.8502,
1233
- "step": 200
1234
- },
1235
- {
1236
- "epoch": 0.32,
1237
- "eval_loss": 0.8277159929275513,
1238
- "eval_runtime": 24.752,
1239
- "eval_samples_per_second": 8.08,
1240
- "eval_steps_per_second": 2.02,
1241
- "step": 200
1242
- },
1243
- {
1244
- "epoch": 0.32,
1245
- "learning_rate": 0.0001,
1246
- "loss": 0.8184,
1247
- "step": 201
1248
- },
1249
- {
1250
- "epoch": 0.32,
1251
- "learning_rate": 0.0001,
1252
- "loss": 0.7073,
1253
- "step": 202
1254
- },
1255
- {
1256
- "epoch": 0.32,
1257
- "learning_rate": 0.0001,
1258
- "loss": 0.7264,
1259
- "step": 203
1260
- },
1261
- {
1262
- "epoch": 0.33,
1263
- "learning_rate": 0.0001,
1264
- "loss": 0.7576,
1265
- "step": 204
1266
- },
1267
- {
1268
- "epoch": 0.33,
1269
- "learning_rate": 0.0001,
1270
- "loss": 0.9689,
1271
- "step": 205
1272
- },
1273
- {
1274
- "epoch": 0.33,
1275
- "learning_rate": 0.0001,
1276
- "loss": 1.0524,
1277
- "step": 206
1278
- },
1279
- {
1280
- "epoch": 0.33,
1281
- "learning_rate": 0.0001,
1282
- "loss": 1.0149,
1283
- "step": 207
1284
- },
1285
- {
1286
- "epoch": 0.33,
1287
- "learning_rate": 0.0001,
1288
- "loss": 0.8202,
1289
- "step": 208
1290
- },
1291
- {
1292
- "epoch": 0.33,
1293
- "learning_rate": 0.0001,
1294
- "loss": 0.5502,
1295
- "step": 209
1296
- },
1297
- {
1298
- "epoch": 0.34,
1299
- "learning_rate": 0.0001,
1300
- "loss": 0.9816,
1301
- "step": 210
1302
- },
1303
- {
1304
- "epoch": 0.34,
1305
- "learning_rate": 0.0001,
1306
- "loss": 0.6147,
1307
- "step": 211
1308
- },
1309
- {
1310
- "epoch": 0.34,
1311
- "learning_rate": 0.0001,
1312
- "loss": 0.5323,
1313
- "step": 212
1314
- },
1315
- {
1316
- "epoch": 0.34,
1317
- "learning_rate": 0.0001,
1318
- "loss": 0.8119,
1319
- "step": 213
1320
- },
1321
- {
1322
- "epoch": 0.34,
1323
- "learning_rate": 0.0001,
1324
- "loss": 0.686,
1325
- "step": 214
1326
- },
1327
- {
1328
- "epoch": 0.34,
1329
- "learning_rate": 0.0001,
1330
- "loss": 0.6172,
1331
- "step": 215
1332
- },
1333
- {
1334
- "epoch": 0.35,
1335
- "learning_rate": 0.0001,
1336
- "loss": 0.667,
1337
- "step": 216
1338
- },
1339
- {
1340
- "epoch": 0.35,
1341
- "learning_rate": 0.0001,
1342
- "loss": 0.5587,
1343
- "step": 217
1344
- },
1345
- {
1346
- "epoch": 0.35,
1347
- "learning_rate": 0.0001,
1348
- "loss": 0.7948,
1349
- "step": 218
1350
- },
1351
- {
1352
- "epoch": 0.35,
1353
- "learning_rate": 0.0001,
1354
- "loss": 0.6955,
1355
- "step": 219
1356
- },
1357
- {
1358
- "epoch": 0.35,
1359
- "learning_rate": 0.0001,
1360
- "loss": 1.0785,
1361
- "step": 220
1362
- },
1363
- {
1364
- "epoch": 0.35,
1365
- "learning_rate": 0.0001,
1366
- "loss": 1.0274,
1367
- "step": 221
1368
- },
1369
- {
1370
- "epoch": 0.36,
1371
- "learning_rate": 0.0001,
1372
- "loss": 1.0043,
1373
- "step": 222
1374
- },
1375
- {
1376
- "epoch": 0.36,
1377
- "learning_rate": 0.0001,
1378
- "loss": 0.7549,
1379
- "step": 223
1380
- },
1381
- {
1382
- "epoch": 0.36,
1383
- "learning_rate": 0.0001,
1384
- "loss": 0.7411,
1385
- "step": 224
1386
- },
1387
- {
1388
- "epoch": 0.36,
1389
- "learning_rate": 0.0001,
1390
- "loss": 0.6823,
1391
- "step": 225
1392
- },
1393
- {
1394
- "epoch": 0.36,
1395
- "learning_rate": 0.0001,
1396
- "loss": 0.579,
1397
- "step": 226
1398
- },
1399
- {
1400
- "epoch": 0.36,
1401
- "learning_rate": 0.0001,
1402
- "loss": 0.8334,
1403
- "step": 227
1404
- },
1405
- {
1406
- "epoch": 0.36,
1407
- "learning_rate": 0.0001,
1408
- "loss": 0.8739,
1409
- "step": 228
1410
- },
1411
- {
1412
- "epoch": 0.37,
1413
- "learning_rate": 0.0001,
1414
- "loss": 0.7777,
1415
- "step": 229
1416
- },
1417
- {
1418
- "epoch": 0.37,
1419
- "learning_rate": 0.0001,
1420
- "loss": 1.0111,
1421
- "step": 230
1422
- },
1423
- {
1424
- "epoch": 0.37,
1425
- "learning_rate": 0.0001,
1426
- "loss": 0.551,
1427
- "step": 231
1428
- },
1429
- {
1430
- "epoch": 0.37,
1431
- "learning_rate": 0.0001,
1432
- "loss": 0.9072,
1433
- "step": 232
1434
- },
1435
- {
1436
- "epoch": 0.37,
1437
- "learning_rate": 0.0001,
1438
- "loss": 0.6715,
1439
- "step": 233
1440
- },
1441
- {
1442
- "epoch": 0.37,
1443
- "learning_rate": 0.0001,
1444
- "loss": 0.9077,
1445
- "step": 234
1446
- },
1447
- {
1448
- "epoch": 0.38,
1449
- "learning_rate": 0.0001,
1450
- "loss": 0.6588,
1451
- "step": 235
1452
- },
1453
- {
1454
- "epoch": 0.38,
1455
- "learning_rate": 0.0001,
1456
- "loss": 0.6529,
1457
- "step": 236
1458
- },
1459
- {
1460
- "epoch": 0.38,
1461
- "learning_rate": 0.0001,
1462
- "loss": 0.4969,
1463
- "step": 237
1464
- },
1465
- {
1466
- "epoch": 0.38,
1467
- "learning_rate": 0.0001,
1468
- "loss": 0.9918,
1469
- "step": 238
1470
- },
1471
- {
1472
- "epoch": 0.38,
1473
- "learning_rate": 0.0001,
1474
- "loss": 1.0968,
1475
- "step": 239
1476
- },
1477
- {
1478
- "epoch": 0.38,
1479
- "learning_rate": 0.0001,
1480
- "loss": 0.8712,
1481
- "step": 240
1482
- },
1483
- {
1484
- "epoch": 0.39,
1485
- "learning_rate": 0.0001,
1486
- "loss": 0.7713,
1487
- "step": 241
1488
- },
1489
- {
1490
- "epoch": 0.39,
1491
- "learning_rate": 0.0001,
1492
- "loss": 0.7751,
1493
- "step": 242
1494
- },
1495
- {
1496
- "epoch": 0.39,
1497
- "learning_rate": 0.0001,
1498
- "loss": 0.7431,
1499
- "step": 243
1500
- },
1501
- {
1502
- "epoch": 0.39,
1503
- "learning_rate": 0.0001,
1504
- "loss": 0.6143,
1505
- "step": 244
1506
- },
1507
- {
1508
- "epoch": 0.39,
1509
- "learning_rate": 0.0001,
1510
- "loss": 0.7544,
1511
- "step": 245
1512
- },
1513
- {
1514
- "epoch": 0.39,
1515
- "learning_rate": 0.0001,
1516
- "loss": 0.7271,
1517
- "step": 246
1518
- },
1519
- {
1520
- "epoch": 0.4,
1521
- "learning_rate": 0.0001,
1522
- "loss": 0.8474,
1523
- "step": 247
1524
- },
1525
- {
1526
- "epoch": 0.4,
1527
- "learning_rate": 0.0001,
1528
- "loss": 0.8025,
1529
- "step": 248
1530
- },
1531
- {
1532
- "epoch": 0.4,
1533
- "learning_rate": 0.0001,
1534
- "loss": 0.6609,
1535
- "step": 249
1536
- },
1537
- {
1538
- "epoch": 0.4,
1539
- "learning_rate": 0.0001,
1540
- "loss": 0.5675,
1541
- "step": 250
1542
- },
1543
- {
1544
- "epoch": 0.4,
1545
- "eval_loss": 0.8194305300712585,
1546
- "eval_runtime": 24.841,
1547
- "eval_samples_per_second": 8.051,
1548
- "eval_steps_per_second": 2.013,
1549
- "step": 250
1550
- },
1551
- {
1552
- "epoch": 0.4,
1553
- "learning_rate": 0.0001,
1554
- "loss": 0.8914,
1555
- "step": 251
1556
- },
1557
- {
1558
- "epoch": 0.4,
1559
- "learning_rate": 0.0001,
1560
- "loss": 1.1987,
1561
- "step": 252
1562
- },
1563
- {
1564
- "epoch": 0.4,
1565
- "learning_rate": 0.0001,
1566
- "loss": 0.8126,
1567
- "step": 253
1568
- },
1569
- {
1570
- "epoch": 0.41,
1571
- "learning_rate": 0.0001,
1572
- "loss": 0.6353,
1573
- "step": 254
1574
- },
1575
- {
1576
- "epoch": 0.41,
1577
- "learning_rate": 0.0001,
1578
- "loss": 0.6317,
1579
- "step": 255
1580
- },
1581
- {
1582
- "epoch": 0.41,
1583
- "learning_rate": 0.0001,
1584
- "loss": 1.4137,
1585
- "step": 256
1586
- },
1587
- {
1588
- "epoch": 0.41,
1589
- "learning_rate": 0.0001,
1590
- "loss": 0.8684,
1591
- "step": 257
1592
- },
1593
- {
1594
- "epoch": 0.41,
1595
- "learning_rate": 0.0001,
1596
- "loss": 0.8108,
1597
- "step": 258
1598
- },
1599
- {
1600
- "epoch": 0.41,
1601
- "learning_rate": 0.0001,
1602
- "loss": 0.8498,
1603
- "step": 259
1604
- },
1605
- {
1606
- "epoch": 0.42,
1607
- "learning_rate": 0.0001,
1608
- "loss": 0.9877,
1609
- "step": 260
1610
- },
1611
- {
1612
- "epoch": 0.42,
1613
- "learning_rate": 0.0001,
1614
- "loss": 0.4313,
1615
- "step": 261
1616
- },
1617
- {
1618
- "epoch": 0.42,
1619
- "learning_rate": 0.0001,
1620
- "loss": 1.2938,
1621
- "step": 262
1622
- },
1623
- {
1624
- "epoch": 0.42,
1625
- "learning_rate": 0.0001,
1626
- "loss": 0.7729,
1627
- "step": 263
1628
- },
1629
- {
1630
- "epoch": 0.42,
1631
- "learning_rate": 0.0001,
1632
- "loss": 0.7268,
1633
- "step": 264
1634
- },
1635
- {
1636
- "epoch": 0.42,
1637
- "learning_rate": 0.0001,
1638
- "loss": 0.689,
1639
- "step": 265
1640
- },
1641
- {
1642
- "epoch": 0.43,
1643
- "learning_rate": 0.0001,
1644
- "loss": 0.8234,
1645
- "step": 266
1646
- },
1647
- {
1648
- "epoch": 0.43,
1649
- "learning_rate": 0.0001,
1650
- "loss": 0.795,
1651
- "step": 267
1652
- },
1653
- {
1654
- "epoch": 0.43,
1655
- "learning_rate": 0.0001,
1656
- "loss": 1.3363,
1657
- "step": 268
1658
- },
1659
- {
1660
- "epoch": 0.43,
1661
- "learning_rate": 0.0001,
1662
- "loss": 0.8648,
1663
- "step": 269
1664
- },
1665
- {
1666
- "epoch": 0.43,
1667
- "learning_rate": 0.0001,
1668
- "loss": 0.6062,
1669
- "step": 270
1670
- },
1671
- {
1672
- "epoch": 0.43,
1673
- "learning_rate": 0.0001,
1674
- "loss": 0.9011,
1675
- "step": 271
1676
- },
1677
- {
1678
- "epoch": 0.44,
1679
- "learning_rate": 0.0001,
1680
- "loss": 0.9681,
1681
- "step": 272
1682
- },
1683
- {
1684
- "epoch": 0.44,
1685
- "learning_rate": 0.0001,
1686
- "loss": 0.7686,
1687
- "step": 273
1688
- },
1689
- {
1690
- "epoch": 0.44,
1691
- "learning_rate": 0.0001,
1692
- "loss": 0.6336,
1693
- "step": 274
1694
- },
1695
- {
1696
- "epoch": 0.44,
1697
- "learning_rate": 0.0001,
1698
- "loss": 0.656,
1699
- "step": 275
1700
- },
1701
- {
1702
- "epoch": 0.44,
1703
- "learning_rate": 0.0001,
1704
- "loss": 0.6881,
1705
- "step": 276
1706
- },
1707
- {
1708
- "epoch": 0.44,
1709
- "learning_rate": 0.0001,
1710
- "loss": 1.237,
1711
- "step": 277
1712
- },
1713
- {
1714
- "epoch": 0.44,
1715
- "learning_rate": 0.0001,
1716
- "loss": 0.7276,
1717
- "step": 278
1718
- },
1719
- {
1720
- "epoch": 0.45,
1721
- "learning_rate": 0.0001,
1722
- "loss": 0.8812,
1723
- "step": 279
1724
- },
1725
- {
1726
- "epoch": 0.45,
1727
- "learning_rate": 0.0001,
1728
- "loss": 0.62,
1729
- "step": 280
1730
- },
1731
- {
1732
- "epoch": 0.45,
1733
- "learning_rate": 0.0001,
1734
- "loss": 1.0725,
1735
- "step": 281
1736
- },
1737
- {
1738
- "epoch": 0.45,
1739
- "learning_rate": 0.0001,
1740
- "loss": 0.7557,
1741
- "step": 282
1742
- },
1743
- {
1744
- "epoch": 0.45,
1745
- "learning_rate": 0.0001,
1746
- "loss": 0.7496,
1747
- "step": 283
1748
- },
1749
- {
1750
- "epoch": 0.45,
1751
- "learning_rate": 0.0001,
1752
- "loss": 0.6644,
1753
- "step": 284
1754
- },
1755
- {
1756
- "epoch": 0.46,
1757
- "learning_rate": 0.0001,
1758
- "loss": 0.8527,
1759
- "step": 285
1760
- },
1761
- {
1762
- "epoch": 0.46,
1763
- "learning_rate": 0.0001,
1764
- "loss": 0.7585,
1765
- "step": 286
1766
- },
1767
- {
1768
- "epoch": 0.46,
1769
- "learning_rate": 0.0001,
1770
- "loss": 1.3147,
1771
- "step": 287
1772
- },
1773
- {
1774
- "epoch": 0.46,
1775
- "learning_rate": 0.0001,
1776
- "loss": 0.5564,
1777
- "step": 288
1778
- },
1779
- {
1780
- "epoch": 0.46,
1781
- "learning_rate": 0.0001,
1782
- "loss": 0.7872,
1783
- "step": 289
1784
- },
1785
- {
1786
- "epoch": 0.46,
1787
- "learning_rate": 0.0001,
1788
- "loss": 0.6818,
1789
- "step": 290
1790
- },
1791
- {
1792
- "epoch": 0.47,
1793
- "learning_rate": 0.0001,
1794
- "loss": 0.6323,
1795
- "step": 291
1796
- },
1797
- {
1798
- "epoch": 0.47,
1799
- "learning_rate": 0.0001,
1800
- "loss": 0.8843,
1801
- "step": 292
1802
- },
1803
- {
1804
- "epoch": 0.47,
1805
- "learning_rate": 0.0001,
1806
- "loss": 0.8496,
1807
- "step": 293
1808
- },
1809
- {
1810
- "epoch": 0.47,
1811
- "learning_rate": 0.0001,
1812
- "loss": 0.8922,
1813
- "step": 294
1814
- },
1815
- {
1816
- "epoch": 0.47,
1817
- "learning_rate": 0.0001,
1818
- "loss": 0.7424,
1819
- "step": 295
1820
- },
1821
- {
1822
- "epoch": 0.47,
1823
- "learning_rate": 0.0001,
1824
- "loss": 0.8656,
1825
- "step": 296
1826
- },
1827
- {
1828
- "epoch": 0.48,
1829
- "learning_rate": 0.0001,
1830
- "loss": 0.7949,
1831
- "step": 297
1832
- },
1833
- {
1834
- "epoch": 0.48,
1835
- "learning_rate": 0.0001,
1836
- "loss": 1.1672,
1837
- "step": 298
1838
- },
1839
- {
1840
- "epoch": 0.48,
1841
- "learning_rate": 0.0001,
1842
- "loss": 1.0858,
1843
- "step": 299
1844
- },
1845
- {
1846
- "epoch": 0.48,
1847
- "learning_rate": 0.0001,
1848
- "loss": 0.6229,
1849
- "step": 300
1850
- },
1851
- {
1852
- "epoch": 0.48,
1853
- "eval_loss": 0.8142353892326355,
1854
- "eval_runtime": 24.7801,
1855
- "eval_samples_per_second": 8.071,
1856
- "eval_steps_per_second": 2.018,
1857
- "step": 300
1858
- },
1859
- {
1860
- "epoch": 0.48,
1861
- "learning_rate": 0.0001,
1862
- "loss": 0.6634,
1863
- "step": 301
1864
- },
1865
- {
1866
- "epoch": 0.48,
1867
- "learning_rate": 0.0001,
1868
- "loss": 0.7846,
1869
- "step": 302
1870
- },
1871
- {
1872
- "epoch": 0.48,
1873
- "learning_rate": 0.0001,
1874
- "loss": 0.6297,
1875
- "step": 303
1876
- },
1877
- {
1878
- "epoch": 0.49,
1879
- "learning_rate": 0.0001,
1880
- "loss": 1.1403,
1881
- "step": 304
1882
- },
1883
- {
1884
- "epoch": 0.49,
1885
- "learning_rate": 0.0001,
1886
- "loss": 0.908,
1887
- "step": 305
1888
- },
1889
- {
1890
- "epoch": 0.49,
1891
- "learning_rate": 0.0001,
1892
- "loss": 0.9101,
1893
- "step": 306
1894
- },
1895
- {
1896
- "epoch": 0.49,
1897
- "learning_rate": 0.0001,
1898
- "loss": 0.8763,
1899
- "step": 307
1900
- },
1901
- {
1902
- "epoch": 0.49,
1903
- "learning_rate": 0.0001,
1904
- "loss": 0.5648,
1905
- "step": 308
1906
- },
1907
- {
1908
- "epoch": 0.49,
1909
- "learning_rate": 0.0001,
1910
- "loss": 0.5962,
1911
- "step": 309
1912
- },
1913
- {
1914
- "epoch": 0.5,
1915
- "learning_rate": 0.0001,
1916
- "loss": 0.8039,
1917
- "step": 310
1918
- },
1919
- {
1920
- "epoch": 0.5,
1921
- "learning_rate": 0.0001,
1922
- "loss": 0.6596,
1923
- "step": 311
1924
- },
1925
- {
1926
- "epoch": 0.5,
1927
- "learning_rate": 0.0001,
1928
- "loss": 0.663,
1929
- "step": 312
1930
- },
1931
- {
1932
- "epoch": 0.5,
1933
- "learning_rate": 0.0001,
1934
- "loss": 0.8796,
1935
- "step": 313
1936
- },
1937
- {
1938
- "epoch": 0.5,
1939
- "learning_rate": 0.0001,
1940
- "loss": 0.8528,
1941
- "step": 314
1942
- },
1943
- {
1944
- "epoch": 0.5,
1945
- "learning_rate": 0.0001,
1946
- "loss": 0.6546,
1947
- "step": 315
1948
- },
1949
- {
1950
- "epoch": 0.51,
1951
- "learning_rate": 0.0001,
1952
- "loss": 0.7442,
1953
- "step": 316
1954
- },
1955
- {
1956
- "epoch": 0.51,
1957
- "learning_rate": 0.0001,
1958
- "loss": 0.8597,
1959
- "step": 317
1960
- },
1961
- {
1962
- "epoch": 0.51,
1963
- "learning_rate": 0.0001,
1964
- "loss": 0.4604,
1965
- "step": 318
1966
- },
1967
- {
1968
- "epoch": 0.51,
1969
- "learning_rate": 0.0001,
1970
- "loss": 0.9421,
1971
- "step": 319
1972
- },
1973
- {
1974
- "epoch": 0.51,
1975
- "learning_rate": 0.0001,
1976
- "loss": 0.7374,
1977
- "step": 320
1978
- },
1979
- {
1980
- "epoch": 0.51,
1981
- "learning_rate": 0.0001,
1982
- "loss": 0.831,
1983
- "step": 321
1984
- },
1985
- {
1986
- "epoch": 0.52,
1987
- "learning_rate": 0.0001,
1988
- "loss": 0.7816,
1989
- "step": 322
1990
- },
1991
- {
1992
- "epoch": 0.52,
1993
- "learning_rate": 0.0001,
1994
- "loss": 1.0519,
1995
- "step": 323
1996
- },
1997
- {
1998
- "epoch": 0.52,
1999
- "learning_rate": 0.0001,
2000
- "loss": 0.8184,
2001
- "step": 324
2002
- },
2003
- {
2004
- "epoch": 0.52,
2005
- "learning_rate": 0.0001,
2006
- "loss": 0.6287,
2007
- "step": 325
2008
- },
2009
- {
2010
- "epoch": 0.52,
2011
- "learning_rate": 0.0001,
2012
- "loss": 0.7022,
2013
- "step": 326
2014
- },
2015
- {
2016
- "epoch": 0.52,
2017
- "learning_rate": 0.0001,
2018
- "loss": 0.5178,
2019
- "step": 327
2020
- },
2021
- {
2022
- "epoch": 0.52,
2023
- "learning_rate": 0.0001,
2024
- "loss": 0.6746,
2025
- "step": 328
2026
- },
2027
- {
2028
- "epoch": 0.53,
2029
- "learning_rate": 0.0001,
2030
- "loss": 0.9392,
2031
- "step": 329
2032
- },
2033
- {
2034
- "epoch": 0.53,
2035
- "learning_rate": 0.0001,
2036
- "loss": 0.7255,
2037
- "step": 330
2038
- },
2039
- {
2040
- "epoch": 0.53,
2041
- "learning_rate": 0.0001,
2042
- "loss": 1.0495,
2043
- "step": 331
2044
- },
2045
- {
2046
- "epoch": 0.53,
2047
- "learning_rate": 0.0001,
2048
- "loss": 0.6588,
2049
- "step": 332
2050
- },
2051
- {
2052
- "epoch": 0.53,
2053
- "learning_rate": 0.0001,
2054
- "loss": 1.4366,
2055
- "step": 333
2056
- },
2057
- {
2058
- "epoch": 0.53,
2059
- "learning_rate": 0.0001,
2060
- "loss": 0.6771,
2061
- "step": 334
2062
- },
2063
- {
2064
- "epoch": 0.54,
2065
- "learning_rate": 0.0001,
2066
- "loss": 0.6922,
2067
- "step": 335
2068
- },
2069
- {
2070
- "epoch": 0.54,
2071
- "learning_rate": 0.0001,
2072
- "loss": 0.7222,
2073
- "step": 336
2074
- },
2075
- {
2076
- "epoch": 0.54,
2077
- "learning_rate": 0.0001,
2078
- "loss": 0.6601,
2079
- "step": 337
2080
- },
2081
- {
2082
- "epoch": 0.54,
2083
- "learning_rate": 0.0001,
2084
- "loss": 0.6565,
2085
- "step": 338
2086
- },
2087
- {
2088
- "epoch": 0.54,
2089
- "learning_rate": 0.0001,
2090
- "loss": 0.6923,
2091
- "step": 339
2092
- },
2093
- {
2094
- "epoch": 0.54,
2095
- "learning_rate": 0.0001,
2096
- "loss": 1.3793,
2097
- "step": 340
2098
- },
2099
- {
2100
- "epoch": 0.55,
2101
- "learning_rate": 0.0001,
2102
- "loss": 0.6519,
2103
- "step": 341
2104
- },
2105
- {
2106
- "epoch": 0.55,
2107
- "learning_rate": 0.0001,
2108
- "loss": 0.8818,
2109
- "step": 342
2110
- },
2111
- {
2112
- "epoch": 0.55,
2113
- "learning_rate": 0.0001,
2114
- "loss": 0.6468,
2115
- "step": 343
2116
- },
2117
- {
2118
- "epoch": 0.55,
2119
- "learning_rate": 0.0001,
2120
- "loss": 0.6078,
2121
- "step": 344
2122
- },
2123
- {
2124
- "epoch": 0.55,
2125
- "learning_rate": 0.0001,
2126
- "loss": 0.9392,
2127
- "step": 345
2128
- },
2129
- {
2130
- "epoch": 0.55,
2131
- "learning_rate": 0.0001,
2132
- "loss": 0.4536,
2133
- "step": 346
2134
- },
2135
- {
2136
- "epoch": 0.56,
2137
- "learning_rate": 0.0001,
2138
- "loss": 0.949,
2139
- "step": 347
2140
- },
2141
- {
2142
- "epoch": 0.56,
2143
- "learning_rate": 0.0001,
2144
- "loss": 0.8729,
2145
- "step": 348
2146
- },
2147
- {
2148
- "epoch": 0.56,
2149
- "learning_rate": 0.0001,
2150
- "loss": 0.9161,
2151
- "step": 349
2152
- },
2153
- {
2154
- "epoch": 0.56,
2155
- "learning_rate": 0.0001,
2156
- "loss": 0.949,
2157
- "step": 350
2158
- },
2159
- {
2160
- "epoch": 0.56,
2161
- "eval_loss": 0.8095277547836304,
2162
- "eval_runtime": 24.7174,
2163
- "eval_samples_per_second": 8.091,
2164
- "eval_steps_per_second": 2.023,
2165
- "step": 350
2166
- },
2167
- {
2168
- "epoch": 0.56,
2169
- "learning_rate": 0.0001,
2170
- "loss": 1.0743,
2171
- "step": 351
2172
- },
2173
- {
2174
- "epoch": 0.56,
2175
- "learning_rate": 0.0001,
2176
- "loss": 0.8093,
2177
- "step": 352
2178
- },
2179
- {
2180
- "epoch": 0.56,
2181
- "learning_rate": 0.0001,
2182
- "loss": 0.67,
2183
- "step": 353
2184
- },
2185
- {
2186
- "epoch": 0.57,
2187
- "learning_rate": 0.0001,
2188
- "loss": 0.4371,
2189
- "step": 354
2190
- },
2191
- {
2192
- "epoch": 0.57,
2193
- "learning_rate": 0.0001,
2194
- "loss": 0.8948,
2195
- "step": 355
2196
- },
2197
- {
2198
- "epoch": 0.57,
2199
- "learning_rate": 0.0001,
2200
- "loss": 1.0447,
2201
- "step": 356
2202
- },
2203
- {
2204
- "epoch": 0.57,
2205
- "learning_rate": 0.0001,
2206
- "loss": 0.9963,
2207
- "step": 357
2208
- },
2209
- {
2210
- "epoch": 0.57,
2211
- "learning_rate": 0.0001,
2212
- "loss": 0.4708,
2213
- "step": 358
2214
- },
2215
- {
2216
- "epoch": 0.57,
2217
- "learning_rate": 0.0001,
2218
- "loss": 0.9607,
2219
- "step": 359
2220
- },
2221
- {
2222
- "epoch": 0.58,
2223
- "learning_rate": 0.0001,
2224
- "loss": 0.5614,
2225
- "step": 360
2226
- },
2227
- {
2228
- "epoch": 0.58,
2229
- "learning_rate": 0.0001,
2230
- "loss": 0.6489,
2231
- "step": 361
2232
- },
2233
- {
2234
- "epoch": 0.58,
2235
- "learning_rate": 0.0001,
2236
- "loss": 0.8976,
2237
- "step": 362
2238
- },
2239
- {
2240
- "epoch": 0.58,
2241
- "learning_rate": 0.0001,
2242
- "loss": 0.9451,
2243
- "step": 363
2244
- },
2245
- {
2246
- "epoch": 0.58,
2247
- "learning_rate": 0.0001,
2248
- "loss": 0.7571,
2249
- "step": 364
2250
- },
2251
- {
2252
- "epoch": 0.58,
2253
- "learning_rate": 0.0001,
2254
- "loss": 0.9322,
2255
- "step": 365
2256
- },
2257
- {
2258
- "epoch": 0.59,
2259
- "learning_rate": 0.0001,
2260
- "loss": 0.8059,
2261
- "step": 366
2262
- },
2263
- {
2264
- "epoch": 0.59,
2265
- "learning_rate": 0.0001,
2266
- "loss": 0.9788,
2267
- "step": 367
2268
- },
2269
- {
2270
- "epoch": 0.59,
2271
- "learning_rate": 0.0001,
2272
- "loss": 0.6501,
2273
- "step": 368
2274
- },
2275
- {
2276
- "epoch": 0.59,
2277
- "learning_rate": 0.0001,
2278
- "loss": 0.4786,
2279
- "step": 369
2280
- },
2281
- {
2282
- "epoch": 0.59,
2283
- "learning_rate": 0.0001,
2284
- "loss": 0.6794,
2285
- "step": 370
2286
- },
2287
- {
2288
- "epoch": 0.59,
2289
- "learning_rate": 0.0001,
2290
- "loss": 0.9943,
2291
- "step": 371
2292
- },
2293
- {
2294
- "epoch": 0.6,
2295
- "learning_rate": 0.0001,
2296
- "loss": 0.7991,
2297
- "step": 372
2298
- },
2299
- {
2300
- "epoch": 0.6,
2301
- "learning_rate": 0.0001,
2302
- "loss": 0.452,
2303
- "step": 373
2304
- },
2305
- {
2306
- "epoch": 0.6,
2307
- "learning_rate": 0.0001,
2308
- "loss": 0.9038,
2309
- "step": 374
2310
- },
2311
- {
2312
- "epoch": 0.6,
2313
- "learning_rate": 0.0001,
2314
- "loss": 0.8583,
2315
- "step": 375
2316
- },
2317
- {
2318
- "epoch": 0.6,
2319
- "learning_rate": 0.0001,
2320
- "loss": 0.6856,
2321
- "step": 376
2322
- },
2323
- {
2324
- "epoch": 0.6,
2325
- "learning_rate": 0.0001,
2326
- "loss": 0.8296,
2327
- "step": 377
2328
- },
2329
- {
2330
- "epoch": 0.6,
2331
- "learning_rate": 0.0001,
2332
- "loss": 0.89,
2333
- "step": 378
2334
- },
2335
- {
2336
- "epoch": 0.61,
2337
- "learning_rate": 0.0001,
2338
- "loss": 0.8538,
2339
- "step": 379
2340
- },
2341
- {
2342
- "epoch": 0.61,
2343
- "learning_rate": 0.0001,
2344
- "loss": 0.5888,
2345
- "step": 380
2346
- },
2347
- {
2348
- "epoch": 0.61,
2349
- "learning_rate": 0.0001,
2350
- "loss": 0.5261,
2351
- "step": 381
2352
- },
2353
- {
2354
- "epoch": 0.61,
2355
- "learning_rate": 0.0001,
2356
- "loss": 0.6218,
2357
- "step": 382
2358
- },
2359
- {
2360
- "epoch": 0.61,
2361
- "learning_rate": 0.0001,
2362
- "loss": 0.8169,
2363
- "step": 383
2364
- },
2365
- {
2366
- "epoch": 0.61,
2367
- "learning_rate": 0.0001,
2368
- "loss": 0.6245,
2369
- "step": 384
2370
- },
2371
- {
2372
- "epoch": 0.62,
2373
- "learning_rate": 0.0001,
2374
- "loss": 0.6245,
2375
- "step": 385
2376
- },
2377
- {
2378
- "epoch": 0.62,
2379
- "learning_rate": 0.0001,
2380
- "loss": 0.8975,
2381
- "step": 386
2382
- },
2383
- {
2384
- "epoch": 0.62,
2385
- "learning_rate": 0.0001,
2386
- "loss": 0.8271,
2387
- "step": 387
2388
- },
2389
- {
2390
- "epoch": 0.62,
2391
- "learning_rate": 0.0001,
2392
- "loss": 1.1388,
2393
- "step": 388
2394
- },
2395
- {
2396
- "epoch": 0.62,
2397
- "learning_rate": 0.0001,
2398
- "loss": 0.8321,
2399
- "step": 389
2400
- },
2401
- {
2402
- "epoch": 0.62,
2403
- "learning_rate": 0.0001,
2404
- "loss": 0.5424,
2405
- "step": 390
2406
- },
2407
- {
2408
- "epoch": 0.63,
2409
- "learning_rate": 0.0001,
2410
- "loss": 0.9909,
2411
- "step": 391
2412
- },
2413
- {
2414
- "epoch": 0.63,
2415
- "learning_rate": 0.0001,
2416
- "loss": 0.9046,
2417
- "step": 392
2418
- },
2419
- {
2420
- "epoch": 0.63,
2421
- "learning_rate": 0.0001,
2422
- "loss": 0.9526,
2423
- "step": 393
2424
- },
2425
- {
2426
- "epoch": 0.63,
2427
- "learning_rate": 0.0001,
2428
- "loss": 0.8797,
2429
- "step": 394
2430
- },
2431
- {
2432
- "epoch": 0.63,
2433
- "learning_rate": 0.0001,
2434
- "loss": 0.7379,
2435
- "step": 395
2436
- },
2437
- {
2438
- "epoch": 0.63,
2439
- "learning_rate": 0.0001,
2440
- "loss": 0.6621,
2441
- "step": 396
2442
- },
2443
- {
2444
- "epoch": 0.64,
2445
- "learning_rate": 0.0001,
2446
- "loss": 0.776,
2447
- "step": 397
2448
- },
2449
- {
2450
- "epoch": 0.64,
2451
- "learning_rate": 0.0001,
2452
- "loss": 0.9563,
2453
- "step": 398
2454
- },
2455
- {
2456
- "epoch": 0.64,
2457
- "learning_rate": 0.0001,
2458
- "loss": 0.6347,
2459
- "step": 399
2460
- },
2461
- {
2462
- "epoch": 0.64,
2463
- "learning_rate": 0.0001,
2464
- "loss": 0.6491,
2465
- "step": 400
2466
- },
2467
- {
2468
- "epoch": 0.64,
2469
- "eval_loss": 0.8057796359062195,
2470
- "eval_runtime": 24.887,
2471
- "eval_samples_per_second": 8.036,
2472
- "eval_steps_per_second": 2.009,
2473
- "step": 400
2474
- },
2475
- {
2476
- "epoch": 0.64,
2477
- "learning_rate": 0.0001,
2478
- "loss": 0.7607,
2479
- "step": 401
2480
- },
2481
- {
2482
- "epoch": 0.64,
2483
- "learning_rate": 0.0001,
2484
- "loss": 1.0243,
2485
- "step": 402
2486
- },
2487
- {
2488
- "epoch": 0.64,
2489
- "learning_rate": 0.0001,
2490
- "loss": 0.5438,
2491
- "step": 403
2492
- },
2493
- {
2494
- "epoch": 0.65,
2495
- "learning_rate": 0.0001,
2496
- "loss": 0.6113,
2497
- "step": 404
2498
- },
2499
- {
2500
- "epoch": 0.65,
2501
- "learning_rate": 0.0001,
2502
- "loss": 0.6026,
2503
- "step": 405
2504
- },
2505
- {
2506
- "epoch": 0.65,
2507
- "learning_rate": 0.0001,
2508
- "loss": 0.7509,
2509
- "step": 406
2510
- },
2511
- {
2512
- "epoch": 0.65,
2513
- "learning_rate": 0.0001,
2514
- "loss": 0.8664,
2515
- "step": 407
2516
- },
2517
- {
2518
- "epoch": 0.65,
2519
- "learning_rate": 0.0001,
2520
- "loss": 0.6859,
2521
- "step": 408
2522
- },
2523
- {
2524
- "epoch": 0.65,
2525
- "learning_rate": 0.0001,
2526
- "loss": 0.8515,
2527
- "step": 409
2528
- },
2529
- {
2530
- "epoch": 0.66,
2531
- "learning_rate": 0.0001,
2532
- "loss": 0.7311,
2533
- "step": 410
2534
- },
2535
- {
2536
- "epoch": 0.66,
2537
- "learning_rate": 0.0001,
2538
- "loss": 0.5227,
2539
- "step": 411
2540
- },
2541
- {
2542
- "epoch": 0.66,
2543
- "learning_rate": 0.0001,
2544
- "loss": 0.6609,
2545
- "step": 412
2546
- },
2547
- {
2548
- "epoch": 0.66,
2549
- "learning_rate": 0.0001,
2550
- "loss": 0.8416,
2551
- "step": 413
2552
- },
2553
- {
2554
- "epoch": 0.66,
2555
- "learning_rate": 0.0001,
2556
- "loss": 0.9931,
2557
- "step": 414
2558
- },
2559
- {
2560
- "epoch": 0.66,
2561
- "learning_rate": 0.0001,
2562
- "loss": 0.676,
2563
- "step": 415
2564
- },
2565
- {
2566
- "epoch": 0.67,
2567
- "learning_rate": 0.0001,
2568
- "loss": 0.7191,
2569
- "step": 416
2570
- },
2571
- {
2572
- "epoch": 0.67,
2573
- "learning_rate": 0.0001,
2574
- "loss": 0.7078,
2575
- "step": 417
2576
- },
2577
- {
2578
- "epoch": 0.67,
2579
- "learning_rate": 0.0001,
2580
- "loss": 0.6609,
2581
- "step": 418
2582
- },
2583
- {
2584
- "epoch": 0.67,
2585
- "learning_rate": 0.0001,
2586
- "loss": 0.7186,
2587
- "step": 419
2588
- },
2589
- {
2590
- "epoch": 0.67,
2591
- "learning_rate": 0.0001,
2592
- "loss": 0.5858,
2593
- "step": 420
2594
- },
2595
- {
2596
- "epoch": 0.67,
2597
- "learning_rate": 0.0001,
2598
- "loss": 0.7404,
2599
- "step": 421
2600
- },
2601
- {
2602
- "epoch": 0.68,
2603
- "learning_rate": 0.0001,
2604
- "loss": 0.8101,
2605
- "step": 422
2606
- },
2607
- {
2608
- "epoch": 0.68,
2609
- "learning_rate": 0.0001,
2610
- "loss": 0.6452,
2611
- "step": 423
2612
- },
2613
- {
2614
- "epoch": 0.68,
2615
- "learning_rate": 0.0001,
2616
- "loss": 0.6771,
2617
- "step": 424
2618
- },
2619
- {
2620
- "epoch": 0.68,
2621
- "learning_rate": 0.0001,
2622
- "loss": 1.0507,
2623
- "step": 425
2624
- },
2625
- {
2626
- "epoch": 0.68,
2627
- "learning_rate": 0.0001,
2628
- "loss": 0.88,
2629
- "step": 426
2630
- },
2631
- {
2632
- "epoch": 0.68,
2633
- "learning_rate": 0.0001,
2634
- "loss": 0.908,
2635
- "step": 427
2636
- },
2637
- {
2638
- "epoch": 0.68,
2639
- "learning_rate": 0.0001,
2640
- "loss": 0.6201,
2641
- "step": 428
2642
- },
2643
- {
2644
- "epoch": 0.69,
2645
- "learning_rate": 0.0001,
2646
- "loss": 1.0989,
2647
- "step": 429
2648
- },
2649
- {
2650
- "epoch": 0.69,
2651
- "learning_rate": 0.0001,
2652
- "loss": 0.9573,
2653
- "step": 430
2654
- },
2655
- {
2656
- "epoch": 0.69,
2657
- "learning_rate": 0.0001,
2658
- "loss": 0.7364,
2659
- "step": 431
2660
- },
2661
- {
2662
- "epoch": 0.69,
2663
- "learning_rate": 0.0001,
2664
- "loss": 1.3403,
2665
- "step": 432
2666
- },
2667
- {
2668
- "epoch": 0.69,
2669
- "learning_rate": 0.0001,
2670
- "loss": 0.5869,
2671
- "step": 433
2672
- },
2673
- {
2674
- "epoch": 0.69,
2675
- "learning_rate": 0.0001,
2676
- "loss": 0.6953,
2677
- "step": 434
2678
- },
2679
- {
2680
- "epoch": 0.7,
2681
- "learning_rate": 0.0001,
2682
- "loss": 0.8893,
2683
- "step": 435
2684
- },
2685
- {
2686
- "epoch": 0.7,
2687
- "learning_rate": 0.0001,
2688
- "loss": 0.6034,
2689
- "step": 436
2690
- },
2691
- {
2692
- "epoch": 0.7,
2693
- "learning_rate": 0.0001,
2694
- "loss": 0.9679,
2695
- "step": 437
2696
- },
2697
- {
2698
- "epoch": 0.7,
2699
- "learning_rate": 0.0001,
2700
- "loss": 0.8783,
2701
- "step": 438
2702
- },
2703
- {
2704
- "epoch": 0.7,
2705
- "learning_rate": 0.0001,
2706
- "loss": 1.1804,
2707
- "step": 439
2708
- },
2709
- {
2710
- "epoch": 0.7,
2711
- "learning_rate": 0.0001,
2712
- "loss": 0.7172,
2713
- "step": 440
2714
- },
2715
- {
2716
- "epoch": 0.71,
2717
- "learning_rate": 0.0001,
2718
- "loss": 0.4911,
2719
- "step": 441
2720
- },
2721
- {
2722
- "epoch": 0.71,
2723
- "learning_rate": 0.0001,
2724
- "loss": 0.6776,
2725
- "step": 442
2726
- },
2727
- {
2728
- "epoch": 0.71,
2729
- "learning_rate": 0.0001,
2730
- "loss": 0.7893,
2731
- "step": 443
2732
- },
2733
- {
2734
- "epoch": 0.71,
2735
- "learning_rate": 0.0001,
2736
- "loss": 0.6594,
2737
- "step": 444
2738
- },
2739
- {
2740
- "epoch": 0.71,
2741
- "learning_rate": 0.0001,
2742
- "loss": 0.7675,
2743
- "step": 445
2744
- },
2745
- {
2746
- "epoch": 0.71,
2747
- "learning_rate": 0.0001,
2748
- "loss": 0.8901,
2749
- "step": 446
2750
- },
2751
- {
2752
- "epoch": 0.72,
2753
- "learning_rate": 0.0001,
2754
- "loss": 0.9592,
2755
- "step": 447
2756
- },
2757
- {
2758
- "epoch": 0.72,
2759
- "learning_rate": 0.0001,
2760
- "loss": 0.7212,
2761
- "step": 448
2762
- },
2763
- {
2764
- "epoch": 0.72,
2765
- "learning_rate": 0.0001,
2766
- "loss": 0.7154,
2767
- "step": 449
2768
- },
2769
- {
2770
- "epoch": 0.72,
2771
- "learning_rate": 0.0001,
2772
- "loss": 0.5141,
2773
- "step": 450
2774
- },
2775
- {
2776
- "epoch": 0.72,
2777
- "eval_loss": 0.8031967282295227,
2778
- "eval_runtime": 24.7448,
2779
- "eval_samples_per_second": 8.082,
2780
- "eval_steps_per_second": 2.021,
2781
- "step": 450
2782
- },
2783
- {
2784
- "epoch": 0.72,
2785
- "learning_rate": 0.0001,
2786
- "loss": 0.6868,
2787
- "step": 451
2788
- },
2789
- {
2790
- "epoch": 0.72,
2791
- "learning_rate": 0.0001,
2792
- "loss": 0.909,
2793
- "step": 452
2794
- },
2795
- {
2796
- "epoch": 0.72,
2797
- "learning_rate": 0.0001,
2798
- "loss": 0.7715,
2799
- "step": 453
2800
- },
2801
- {
2802
- "epoch": 0.73,
2803
- "learning_rate": 0.0001,
2804
- "loss": 0.7504,
2805
- "step": 454
2806
- },
2807
- {
2808
- "epoch": 0.73,
2809
- "learning_rate": 0.0001,
2810
- "loss": 0.9652,
2811
- "step": 455
2812
- },
2813
- {
2814
- "epoch": 0.73,
2815
- "learning_rate": 0.0001,
2816
- "loss": 1.128,
2817
- "step": 456
2818
- },
2819
- {
2820
- "epoch": 0.73,
2821
- "learning_rate": 0.0001,
2822
- "loss": 0.6338,
2823
- "step": 457
2824
- },
2825
- {
2826
- "epoch": 0.73,
2827
- "learning_rate": 0.0001,
2828
- "loss": 0.6946,
2829
- "step": 458
2830
- },
2831
- {
2832
- "epoch": 0.73,
2833
- "learning_rate": 0.0001,
2834
- "loss": 0.7774,
2835
- "step": 459
2836
- },
2837
- {
2838
- "epoch": 0.74,
2839
- "learning_rate": 0.0001,
2840
- "loss": 1.0357,
2841
- "step": 460
2842
- },
2843
- {
2844
- "epoch": 0.74,
2845
- "learning_rate": 0.0001,
2846
- "loss": 0.632,
2847
- "step": 461
2848
- },
2849
- {
2850
- "epoch": 0.74,
2851
- "learning_rate": 0.0001,
2852
- "loss": 0.6207,
2853
- "step": 462
2854
- },
2855
- {
2856
- "epoch": 0.74,
2857
- "learning_rate": 0.0001,
2858
- "loss": 0.4877,
2859
- "step": 463
2860
- },
2861
- {
2862
- "epoch": 0.74,
2863
- "learning_rate": 0.0001,
2864
- "loss": 0.9296,
2865
- "step": 464
2866
- },
2867
- {
2868
- "epoch": 0.74,
2869
- "learning_rate": 0.0001,
2870
- "loss": 0.6321,
2871
- "step": 465
2872
- },
2873
- {
2874
- "epoch": 0.75,
2875
- "learning_rate": 0.0001,
2876
- "loss": 0.4997,
2877
- "step": 466
2878
- },
2879
- {
2880
- "epoch": 0.75,
2881
- "learning_rate": 0.0001,
2882
- "loss": 0.7938,
2883
- "step": 467
2884
- },
2885
- {
2886
- "epoch": 0.75,
2887
- "learning_rate": 0.0001,
2888
- "loss": 0.6134,
2889
- "step": 468
2890
- },
2891
- {
2892
- "epoch": 0.75,
2893
- "learning_rate": 0.0001,
2894
- "loss": 1.1448,
2895
- "step": 469
2896
- },
2897
- {
2898
- "epoch": 0.75,
2899
- "learning_rate": 0.0001,
2900
- "loss": 0.4663,
2901
- "step": 470
2902
- },
2903
- {
2904
- "epoch": 0.75,
2905
- "learning_rate": 0.0001,
2906
- "loss": 0.6637,
2907
- "step": 471
2908
- },
2909
- {
2910
- "epoch": 0.76,
2911
- "learning_rate": 0.0001,
2912
- "loss": 0.682,
2913
- "step": 472
2914
- },
2915
- {
2916
- "epoch": 0.76,
2917
- "learning_rate": 0.0001,
2918
- "loss": 0.5853,
2919
- "step": 473
2920
- },
2921
- {
2922
- "epoch": 0.76,
2923
- "learning_rate": 0.0001,
2924
- "loss": 0.7186,
2925
- "step": 474
2926
- },
2927
- {
2928
- "epoch": 0.76,
2929
- "learning_rate": 0.0001,
2930
- "loss": 0.7872,
2931
- "step": 475
2932
- },
2933
- {
2934
- "epoch": 0.76,
2935
- "learning_rate": 0.0001,
2936
- "loss": 0.6337,
2937
- "step": 476
2938
- },
2939
- {
2940
- "epoch": 0.76,
2941
- "learning_rate": 0.0001,
2942
- "loss": 0.6891,
2943
- "step": 477
2944
- },
2945
- {
2946
- "epoch": 0.76,
2947
- "learning_rate": 0.0001,
2948
- "loss": 0.974,
2949
- "step": 478
2950
- },
2951
- {
2952
- "epoch": 0.77,
2953
- "learning_rate": 0.0001,
2954
- "loss": 0.5751,
2955
- "step": 479
2956
- },
2957
- {
2958
- "epoch": 0.77,
2959
- "learning_rate": 0.0001,
2960
- "loss": 0.9913,
2961
- "step": 480
2962
- },
2963
- {
2964
- "epoch": 0.77,
2965
- "learning_rate": 0.0001,
2966
- "loss": 0.4584,
2967
- "step": 481
2968
- },
2969
- {
2970
- "epoch": 0.77,
2971
- "learning_rate": 0.0001,
2972
- "loss": 0.7533,
2973
- "step": 482
2974
- },
2975
- {
2976
- "epoch": 0.77,
2977
- "learning_rate": 0.0001,
2978
- "loss": 0.7567,
2979
- "step": 483
2980
- },
2981
- {
2982
- "epoch": 0.77,
2983
- "learning_rate": 0.0001,
2984
- "loss": 0.6906,
2985
- "step": 484
2986
- },
2987
- {
2988
- "epoch": 0.78,
2989
- "learning_rate": 0.0001,
2990
- "loss": 0.9371,
2991
- "step": 485
2992
- },
2993
- {
2994
- "epoch": 0.78,
2995
- "learning_rate": 0.0001,
2996
- "loss": 0.6613,
2997
- "step": 486
2998
- },
2999
- {
3000
- "epoch": 0.78,
3001
- "learning_rate": 0.0001,
3002
- "loss": 0.6724,
3003
- "step": 487
3004
- },
3005
- {
3006
- "epoch": 0.78,
3007
- "learning_rate": 0.0001,
3008
- "loss": 1.0523,
3009
- "step": 488
3010
- },
3011
- {
3012
- "epoch": 0.78,
3013
- "learning_rate": 0.0001,
3014
- "loss": 0.5206,
3015
- "step": 489
3016
- },
3017
- {
3018
- "epoch": 0.78,
3019
- "learning_rate": 0.0001,
3020
- "loss": 0.6908,
3021
- "step": 490
3022
- },
3023
- {
3024
- "epoch": 0.79,
3025
- "learning_rate": 0.0001,
3026
- "loss": 0.8953,
3027
- "step": 491
3028
- },
3029
- {
3030
- "epoch": 0.79,
3031
- "learning_rate": 0.0001,
3032
- "loss": 0.5296,
3033
- "step": 492
3034
- },
3035
- {
3036
- "epoch": 0.79,
3037
- "learning_rate": 0.0001,
3038
- "loss": 0.8856,
3039
- "step": 493
3040
- },
3041
- {
3042
- "epoch": 0.79,
3043
- "learning_rate": 0.0001,
3044
- "loss": 0.7541,
3045
- "step": 494
3046
- },
3047
- {
3048
- "epoch": 0.79,
3049
- "learning_rate": 0.0001,
3050
- "loss": 0.9077,
3051
- "step": 495
3052
- },
3053
- {
3054
- "epoch": 0.79,
3055
- "learning_rate": 0.0001,
3056
- "loss": 0.5498,
3057
- "step": 496
3058
- },
3059
- {
3060
- "epoch": 0.8,
3061
- "learning_rate": 0.0001,
3062
- "loss": 0.5698,
3063
- "step": 497
3064
- },
3065
- {
3066
- "epoch": 0.8,
3067
- "learning_rate": 0.0001,
3068
- "loss": 0.7017,
3069
- "step": 498
3070
- },
3071
- {
3072
- "epoch": 0.8,
3073
- "learning_rate": 0.0001,
3074
- "loss": 0.7255,
3075
- "step": 499
3076
- },
3077
- {
3078
- "epoch": 0.8,
3079
- "learning_rate": 0.0001,
3080
- "loss": 0.9133,
3081
- "step": 500
3082
- },
3083
- {
3084
- "epoch": 0.8,
3085
- "eval_loss": 0.8015583753585815,
3086
- "eval_runtime": 24.8711,
3087
- "eval_samples_per_second": 8.041,
3088
- "eval_steps_per_second": 2.01,
3089
- "step": 500
3090
  }
3091
  ],
3092
- "logging_steps": 1,
3093
- "max_steps": 1875,
3094
  "num_train_epochs": 3,
3095
  "save_steps": 500,
3096
- "total_flos": 2223003155939328.0,
3097
  "trial_name": null,
3098
  "trial_params": null
3099
  }
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 2.9825242718446603,
5
+ "eval_steps": 500,
6
+ "global_step": 192,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.03,
13
+ "learning_rate": 2e-05,
14
+ "loss": 1.6655,
 
 
 
 
 
 
15
  "step": 2
16
  },
17
  {
18
+ "epoch": 0.06,
19
+ "learning_rate": 2e-05,
20
+ "loss": 1.8229,
 
 
 
 
 
 
21
  "step": 4
22
  },
23
  {
24
+ "epoch": 0.09,
25
+ "learning_rate": 2e-05,
26
+ "loss": 1.871,
 
 
 
 
 
 
27
  "step": 6
28
  },
29
  {
30
+ "epoch": 0.12,
31
+ "learning_rate": 2e-05,
32
+ "loss": 1.9893,
 
 
 
 
 
 
33
  "step": 8
34
  },
35
  {
36
+ "epoch": 0.16,
37
+ "learning_rate": 2e-05,
38
+ "loss": 2.0291,
 
 
 
 
 
 
39
  "step": 10
40
  },
41
  {
42
+ "epoch": 0.19,
43
+ "learning_rate": 2e-05,
44
+ "loss": 1.983,
 
 
 
 
 
 
45
  "step": 12
46
  },
47
  {
48
+ "epoch": 0.22,
49
+ "learning_rate": 2e-05,
50
+ "loss": 2.0649,
 
 
 
 
 
 
51
  "step": 14
52
  },
53
  {
54
+ "epoch": 0.25,
55
+ "learning_rate": 2e-05,
56
+ "loss": 2.3775,
 
 
 
 
 
 
57
  "step": 16
58
  },
59
  {
60
+ "epoch": 0.28,
61
+ "learning_rate": 2e-05,
62
+ "loss": 1.6722,
 
 
 
 
 
 
63
  "step": 18
64
  },
65
  {
66
+ "epoch": 0.31,
67
+ "learning_rate": 2e-05,
68
+ "loss": 1.7044,
 
 
 
 
 
 
69
  "step": 20
70
  },
71
  {
72
+ "epoch": 0.34,
73
+ "learning_rate": 2e-05,
74
+ "loss": 1.77,
 
 
 
 
 
 
75
  "step": 22
76
  },
77
  {
78
+ "epoch": 0.37,
79
+ "learning_rate": 2e-05,
80
+ "loss": 2.0366,
 
 
 
 
 
 
81
  "step": 24
82
  },
83
  {
84
+ "epoch": 0.4,
85
+ "learning_rate": 2e-05,
86
+ "loss": 2.0772,
 
 
 
 
 
 
87
  "step": 26
88
  },
89
  {
90
+ "epoch": 0.43,
91
+ "learning_rate": 2e-05,
92
+ "loss": 2.1213,
 
 
 
 
 
 
93
  "step": 28
94
  },
95
  {
96
+ "epoch": 0.47,
97
+ "learning_rate": 2e-05,
98
+ "loss": 2.1209,
 
 
 
 
 
 
99
  "step": 30
100
  },
101
  {
102
+ "epoch": 0.5,
103
+ "learning_rate": 2e-05,
104
+ "loss": 2.567,
 
 
 
 
 
 
105
  "step": 32
106
  },
107
  {
108
+ "epoch": 0.53,
109
+ "learning_rate": 2e-05,
110
+ "loss": 1.6556,
 
 
 
 
 
 
111
  "step": 34
112
  },
113
  {
114
+ "epoch": 0.56,
115
+ "learning_rate": 2e-05,
116
+ "loss": 1.8708,
 
 
 
 
 
 
117
  "step": 36
118
  },
119
  {
120
+ "epoch": 0.59,
121
+ "learning_rate": 2e-05,
122
+ "loss": 2.0389,
 
 
 
 
 
 
123
  "step": 38
124
  },
125
  {
126
+ "epoch": 0.62,
127
+ "learning_rate": 2e-05,
128
+ "loss": 1.9722,
 
 
 
 
 
 
129
  "step": 40
130
  },
131
  {
132
+ "epoch": 0.65,
133
+ "learning_rate": 2e-05,
134
+ "loss": 2.075,
 
 
 
 
 
 
135
  "step": 42
136
  },
137
  {
138
+ "epoch": 0.68,
139
+ "learning_rate": 2e-05,
140
+ "loss": 2.0757,
 
 
 
 
 
 
141
  "step": 44
142
  },
143
  {
144
+ "epoch": 0.71,
145
+ "learning_rate": 2e-05,
146
+ "loss": 2.0656,
 
 
 
 
 
 
147
  "step": 46
148
  },
149
  {
150
+ "epoch": 0.75,
151
+ "learning_rate": 2e-05,
152
+ "loss": 2.4261,
 
 
 
 
 
 
153
  "step": 48
154
  },
155
  {
156
+ "epoch": 0.78,
157
+ "learning_rate": 2e-05,
158
+ "loss": 1.7365,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
159
  "step": 50
160
  },
161
  {
162
+ "epoch": 0.81,
163
+ "learning_rate": 2e-05,
164
+ "loss": 1.8049,
 
 
 
 
 
 
165
  "step": 52
166
  },
167
  {
168
+ "epoch": 0.84,
169
+ "learning_rate": 2e-05,
170
+ "loss": 1.9021,
 
 
 
 
 
 
171
  "step": 54
172
  },
173
  {
174
+ "epoch": 0.87,
175
+ "learning_rate": 2e-05,
176
+ "loss": 1.9696,
 
 
 
 
 
 
177
  "step": 56
178
  },
179
  {
180
+ "epoch": 0.9,
181
+ "learning_rate": 2e-05,
182
+ "loss": 2.1454,
 
 
 
 
 
 
183
  "step": 58
184
  },
185
  {
186
+ "epoch": 0.93,
187
+ "learning_rate": 2e-05,
188
+ "loss": 1.9359,
 
 
 
 
 
 
189
  "step": 60
190
  },
191
  {
192
+ "epoch": 0.96,
193
+ "learning_rate": 2e-05,
194
+ "loss": 2.2369,
 
 
 
 
 
 
195
  "step": 62
196
  },
197
  {
198
+ "epoch": 0.99,
199
+ "learning_rate": 2e-05,
200
+ "loss": 2.309,
 
 
 
 
 
 
201
  "step": 64
202
  },
203
  {
204
+ "epoch": 1.03,
205
+ "learning_rate": 2e-05,
206
+ "loss": 1.7485,
 
 
 
 
 
 
207
  "step": 66
208
  },
209
  {
210
+ "epoch": 1.06,
211
+ "learning_rate": 2e-05,
212
+ "loss": 1.805,
 
 
 
 
 
 
213
  "step": 68
214
  },
215
  {
216
+ "epoch": 1.09,
217
+ "learning_rate": 2e-05,
218
+ "loss": 1.777,
 
 
 
 
 
 
219
  "step": 70
220
  },
221
  {
222
+ "epoch": 1.12,
223
+ "learning_rate": 2e-05,
224
+ "loss": 1.9951,
 
 
 
 
 
 
225
  "step": 72
226
  },
227
  {
228
+ "epoch": 1.15,
229
+ "learning_rate": 2e-05,
230
+ "loss": 2.1456,
 
 
 
 
 
 
231
  "step": 74
232
  },
233
  {
234
+ "epoch": 1.18,
235
+ "learning_rate": 2e-05,
236
+ "loss": 2.0073,
 
 
 
 
 
 
237
  "step": 76
238
  },
239
  {
240
+ "epoch": 1.21,
241
+ "learning_rate": 2e-05,
242
+ "loss": 2.1608,
 
 
 
 
 
 
243
  "step": 78
244
  },
245
  {
246
+ "epoch": 1.24,
247
+ "learning_rate": 2e-05,
248
+ "loss": 2.2249,
 
 
 
 
 
 
249
  "step": 80
250
  },
251
  {
252
+ "epoch": 1.27,
253
+ "learning_rate": 2e-05,
254
+ "loss": 1.7468,
 
 
 
 
 
 
255
  "step": 82
256
  },
257
  {
258
+ "epoch": 1.3,
259
+ "learning_rate": 2e-05,
260
+ "loss": 1.7292,
 
 
 
 
 
 
261
  "step": 84
262
  },
263
  {
264
+ "epoch": 1.34,
265
+ "learning_rate": 2e-05,
266
+ "loss": 1.8926,
 
 
 
 
 
 
267
  "step": 86
268
  },
269
  {
270
+ "epoch": 1.37,
271
+ "learning_rate": 2e-05,
272
+ "loss": 1.9109,
 
 
 
 
 
 
273
  "step": 88
274
  },
275
  {
276
+ "epoch": 1.4,
277
+ "learning_rate": 2e-05,
278
+ "loss": 2.0223,
 
 
 
 
 
 
279
  "step": 90
280
  },
281
  {
282
+ "epoch": 1.43,
283
+ "learning_rate": 2e-05,
284
+ "loss": 2.0283,
 
 
 
 
 
 
285
  "step": 92
286
  },
287
  {
288
+ "epoch": 1.46,
289
+ "learning_rate": 2e-05,
290
+ "loss": 1.9571,
 
 
 
 
 
 
291
  "step": 94
292
  },
293
  {
294
+ "epoch": 1.49,
295
+ "learning_rate": 2e-05,
296
+ "loss": 2.4003,
 
 
 
 
 
 
297
  "step": 96
298
  },
299
  {
300
+ "epoch": 1.52,
301
+ "learning_rate": 2e-05,
302
+ "loss": 1.9499,
 
 
 
 
 
 
303
  "step": 98
304
  },
305
  {
306
+ "epoch": 1.55,
307
+ "learning_rate": 2e-05,
308
+ "loss": 1.7059,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  "step": 100
310
  },
311
  {
312
+ "epoch": 1.58,
313
+ "learning_rate": 2e-05,
314
+ "loss": 1.7516,
 
 
 
 
 
 
315
  "step": 102
316
  },
317
  {
318
+ "epoch": 1.62,
319
+ "learning_rate": 2e-05,
320
+ "loss": 1.9586,
 
 
 
 
 
 
321
  "step": 104
322
  },
323
  {
324
+ "epoch": 1.65,
325
+ "learning_rate": 2e-05,
326
+ "loss": 2.0152,
 
 
 
 
 
 
327
  "step": 106
328
  },
329
  {
330
+ "epoch": 1.68,
331
+ "learning_rate": 2e-05,
332
+ "loss": 2.1286,
 
 
 
 
 
 
333
  "step": 108
334
  },
335
  {
336
+ "epoch": 1.71,
337
+ "learning_rate": 2e-05,
338
+ "loss": 2.1614,
 
 
 
 
 
 
339
  "step": 110
340
  },
341
  {
342
+ "epoch": 1.74,
343
+ "learning_rate": 2e-05,
344
+ "loss": 2.168,
 
 
 
 
 
 
345
  "step": 112
346
  },
347
  {
348
+ "epoch": 1.77,
349
+ "learning_rate": 2e-05,
350
+ "loss": 1.8767,
 
 
 
 
 
 
351
  "step": 114
352
  },
353
  {
354
+ "epoch": 1.8,
355
+ "learning_rate": 2e-05,
356
+ "loss": 1.8243,
 
 
 
 
 
 
357
  "step": 116
358
  },
359
  {
360
+ "epoch": 1.83,
361
+ "learning_rate": 2e-05,
362
+ "loss": 1.9965,
 
 
 
 
 
 
363
  "step": 118
364
  },
365
  {
366
+ "epoch": 1.86,
367
+ "learning_rate": 2e-05,
368
+ "loss": 1.9171,
 
 
 
 
 
 
369
  "step": 120
370
  },
371
  {
372
+ "epoch": 1.9,
373
+ "learning_rate": 2e-05,
374
+ "loss": 1.9598,
 
 
 
 
 
 
375
  "step": 122
376
  },
377
  {
378
+ "epoch": 1.93,
379
+ "learning_rate": 2e-05,
380
+ "loss": 1.8569,
 
 
 
 
 
 
381
  "step": 124
382
  },
383
  {
384
+ "epoch": 1.96,
385
+ "learning_rate": 2e-05,
386
+ "loss": 2.0991,
 
 
 
 
 
 
387
  "step": 126
388
  },
389
  {
390
+ "epoch": 1.99,
391
+ "learning_rate": 2e-05,
392
+ "loss": 2.1616,
 
 
 
 
 
 
393
  "step": 128
394
  },
395
  {
396
+ "epoch": 2.02,
397
+ "learning_rate": 2e-05,
398
+ "loss": 1.7902,
 
 
 
 
 
 
399
  "step": 130
400
  },
401
  {
402
+ "epoch": 2.05,
403
+ "learning_rate": 2e-05,
404
+ "loss": 1.7404,
 
 
 
 
 
 
405
  "step": 132
406
  },
407
  {
408
+ "epoch": 2.08,
409
+ "learning_rate": 2e-05,
410
+ "loss": 1.9132,
 
 
 
 
 
 
411
  "step": 134
412
  },
413
  {
414
+ "epoch": 2.11,
415
+ "learning_rate": 2e-05,
416
+ "loss": 1.9342,
 
 
 
 
 
 
417
  "step": 136
418
  },
419
  {
420
+ "epoch": 2.14,
421
+ "learning_rate": 2e-05,
422
+ "loss": 2.0537,
 
 
 
 
 
 
423
  "step": 138
424
  },
425
  {
426
+ "epoch": 2.17,
427
+ "learning_rate": 2e-05,
428
+ "loss": 2.0116,
 
 
 
 
 
 
429
  "step": 140
430
  },
431
  {
432
+ "epoch": 2.21,
433
+ "learning_rate": 2e-05,
434
+ "loss": 2.0901,
 
 
 
 
 
 
435
  "step": 142
436
  },
437
  {
438
+ "epoch": 2.24,
439
+ "learning_rate": 2e-05,
440
+ "loss": 2.1829,
 
 
 
 
 
 
441
  "step": 144
442
  },
443
  {
444
+ "epoch": 2.27,
445
+ "learning_rate": 2e-05,
446
+ "loss": 1.8606,
 
 
 
 
 
 
447
  "step": 146
448
  },
449
  {
450
+ "epoch": 2.3,
451
+ "learning_rate": 2e-05,
452
+ "loss": 1.6824,
 
 
 
 
 
 
453
  "step": 148
454
  },
455
  {
456
+ "epoch": 2.33,
457
+ "learning_rate": 2e-05,
458
+ "loss": 1.8978,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  "step": 150
460
  },
461
  {
462
+ "epoch": 2.36,
463
+ "learning_rate": 2e-05,
464
+ "loss": 1.8608,
 
 
 
 
 
 
465
  "step": 152
466
  },
467
  {
468
+ "epoch": 2.39,
469
+ "learning_rate": 2e-05,
470
+ "loss": 1.9369,
 
 
 
 
 
 
471
  "step": 154
472
  },
473
  {
474
+ "epoch": 2.42,
475
+ "learning_rate": 2e-05,
476
+ "loss": 1.8742,
 
 
 
 
 
 
477
  "step": 156
478
  },
479
  {
480
+ "epoch": 2.45,
481
+ "learning_rate": 2e-05,
482
+ "loss": 2.0519,
 
 
 
 
 
 
483
  "step": 158
484
  },
485
  {
486
+ "epoch": 2.49,
487
+ "learning_rate": 2e-05,
488
+ "loss": 2.1078,
 
 
 
 
 
 
489
  "step": 160
490
  },
491
  {
492
+ "epoch": 2.52,
493
+ "learning_rate": 2e-05,
494
+ "loss": 1.8818,
 
 
 
 
 
 
495
  "step": 162
496
  },
497
  {
498
+ "epoch": 2.55,
499
+ "learning_rate": 2e-05,
500
+ "loss": 1.7438,
 
 
 
 
 
 
501
  "step": 164
502
  },
503
  {
504
+ "epoch": 2.58,
505
+ "learning_rate": 2e-05,
506
+ "loss": 1.8832,
 
 
 
 
 
 
507
  "step": 166
508
  },
509
  {
510
+ "epoch": 2.61,
511
+ "learning_rate": 2e-05,
512
+ "loss": 1.7988,
 
 
 
 
 
 
513
  "step": 168
514
  },
515
  {
516
+ "epoch": 2.64,
517
+ "learning_rate": 2e-05,
518
+ "loss": 2.0218,
 
 
 
 
 
 
519
  "step": 170
520
  },
521
  {
522
+ "epoch": 2.67,
523
+ "learning_rate": 2e-05,
524
+ "loss": 1.949,
525
+ "step": 172
526
  },
527
  {
528
+ "epoch": 2.7,
529
+ "learning_rate": 2e-05,
530
+ "loss": 1.9499,
 
 
 
 
 
 
 
 
 
 
 
 
531
  "step": 174
532
  },
533
  {
534
+ "epoch": 2.73,
535
+ "learning_rate": 2e-05,
536
+ "loss": 2.1105,
 
 
 
 
 
 
537
  "step": 176
538
  },
539
  {
540
+ "epoch": 2.77,
541
+ "learning_rate": 2e-05,
542
+ "loss": 1.8723,
 
 
 
 
 
 
543
  "step": 178
544
  },
545
  {
546
+ "epoch": 2.8,
547
+ "learning_rate": 2e-05,
548
+ "loss": 1.696,
 
 
 
 
 
 
549
  "step": 180
550
  },
551
  {
552
+ "epoch": 2.83,
553
+ "learning_rate": 2e-05,
554
+ "loss": 1.7281,
 
 
 
 
 
 
555
  "step": 182
556
  },
557
  {
558
+ "epoch": 2.86,
559
+ "learning_rate": 2e-05,
560
+ "loss": 1.8753,
 
 
 
 
 
 
561
  "step": 184
562
  },
563
  {
564
+ "epoch": 2.89,
565
+ "learning_rate": 2e-05,
566
+ "loss": 2.0551,
 
 
 
 
 
 
567
  "step": 186
568
  },
569
  {
570
+ "epoch": 2.92,
571
+ "learning_rate": 2e-05,
572
+ "loss": 1.8918,
 
 
 
 
 
 
573
  "step": 188
574
  },
575
  {
576
+ "epoch": 2.95,
577
+ "learning_rate": 2e-05,
578
+ "loss": 1.883,
 
 
 
 
 
 
579
  "step": 190
580
  },
581
  {
582
+ "epoch": 2.98,
583
+ "learning_rate": 2e-05,
584
+ "loss": 2.0562,
 
 
 
 
 
 
585
  "step": 192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
586
  }
587
  ],
588
+ "logging_steps": 2,
589
+ "max_steps": 192,
590
  "num_train_epochs": 3,
591
  "save_steps": 500,
592
+ "total_flos": 3.523807718510592e+16,
593
  "trial_name": null,
594
  "trial_params": null
595
  }
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0d0be16a4ef746ff680c142f86514f7353aba1cd96c55e757a8a29103a6b214f
3
- size 3963
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c7a972a5b17f6099efa570235cd9950cd0e25f150cf2581773bba7342d849701
3
+ size 4027