shulijia commited on
Commit
9967cac
·
verified ·
1 Parent(s): f2131eb

Training in progress, step 4500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:29b32b64478cbdeda9875145bb265427fdfdcfc88b4dfa3e973d6646373e0a0f
3
  size 2384234968
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c4e63b26372d3f3e62b6db0399391a0574424daa0751e0e7769783c179152263
3
  size 2384234968
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cc9bb74074cb17bdb9685f131305e9e7ae2f53f82516e862de8e7a638f79006
3
  size 4768663315
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d141ff3463e96322620a710d65203085ab4c7bfad20d6edead9263d77dfc77d1
3
  size 4768663315
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a1ccf15ab626b17b6464b860472b5e0620f2d570991113393ae691c84ea2b523
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:40cd56bbbdb5d88ccc71e65c68f8f624220ae821195d3d32ae2f6eb8a22d3ebe
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 2.0313429587172602,
6
  "eval_steps": 100,
7
- "global_step": 3500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3158,6 +3158,906 @@
3158
  "mean_token_accuracy": 0.7707681007683277,
3159
  "num_tokens": 28668928.0,
3160
  "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3161
  }
3162
  ],
3163
  "logging_steps": 10,
@@ -3177,7 +4077,7 @@
3177
  "attributes": {}
3178
  }
3179
  },
3180
- "total_flos": 7.5766378694443e+16,
3181
  "train_batch_size": 2,
3182
  "trial_name": null,
3183
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 2.6117681201480085,
6
  "eval_steps": 100,
7
+ "global_step": 4500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3158
  "mean_token_accuracy": 0.7707681007683277,
3159
  "num_tokens": 28668928.0,
3160
  "step": 3500
3161
+ },
3162
+ {
3163
+ "epoch": 2.0371472103315678,
3164
+ "grad_norm": 1.1667520999908447,
3165
+ "learning_rate": 3.5683576956147896e-06,
3166
+ "loss": 0.0654,
3167
+ "mean_token_accuracy": 0.7756482396274805,
3168
+ "num_tokens": 28750848.0,
3169
+ "step": 3510
3170
+ },
3171
+ {
3172
+ "epoch": 2.0429514619458753,
3173
+ "grad_norm": 1.4603782892227173,
3174
+ "learning_rate": 3.546861564918315e-06,
3175
+ "loss": 0.0777,
3176
+ "mean_token_accuracy": 0.7753424648195505,
3177
+ "num_tokens": 28832768.0,
3178
+ "step": 3520
3179
+ },
3180
+ {
3181
+ "epoch": 2.048755713560183,
3182
+ "grad_norm": 1.4233088493347168,
3183
+ "learning_rate": 3.5253654342218404e-06,
3184
+ "loss": 0.0801,
3185
+ "mean_token_accuracy": 0.7762353233993053,
3186
+ "num_tokens": 28914688.0,
3187
+ "step": 3530
3188
+ },
3189
+ {
3190
+ "epoch": 2.0545599651744904,
3191
+ "grad_norm": 1.341167688369751,
3192
+ "learning_rate": 3.5038693035253656e-06,
3193
+ "loss": 0.0827,
3194
+ "mean_token_accuracy": 0.7780577309429646,
3195
+ "num_tokens": 28996608.0,
3196
+ "step": 3540
3197
+ },
3198
+ {
3199
+ "epoch": 2.060364216788798,
3200
+ "grad_norm": 1.2166831493377686,
3201
+ "learning_rate": 3.482373172828891e-06,
3202
+ "loss": 0.0819,
3203
+ "mean_token_accuracy": 0.7716487277299166,
3204
+ "num_tokens": 29078528.0,
3205
+ "step": 3550
3206
+ },
3207
+ {
3208
+ "epoch": 2.0661684684031054,
3209
+ "grad_norm": 1.952674388885498,
3210
+ "learning_rate": 3.4608770421324168e-06,
3211
+ "loss": 0.0789,
3212
+ "mean_token_accuracy": 0.790459880977869,
3213
+ "num_tokens": 29160448.0,
3214
+ "step": 3560
3215
+ },
3216
+ {
3217
+ "epoch": 2.071972720017413,
3218
+ "grad_norm": 1.5593677759170532,
3219
+ "learning_rate": 3.4393809114359415e-06,
3220
+ "loss": 0.071,
3221
+ "mean_token_accuracy": 0.7739603701978922,
3222
+ "num_tokens": 29242368.0,
3223
+ "step": 3570
3224
+ },
3225
+ {
3226
+ "epoch": 2.07777697163172,
3227
+ "grad_norm": 1.3000702857971191,
3228
+ "learning_rate": 3.417884780739467e-06,
3229
+ "loss": 0.0746,
3230
+ "mean_token_accuracy": 0.7763698630034923,
3231
+ "num_tokens": 29324288.0,
3232
+ "step": 3580
3233
+ },
3234
+ {
3235
+ "epoch": 2.0835812232460276,
3236
+ "grad_norm": 1.699432373046875,
3237
+ "learning_rate": 3.3963886500429928e-06,
3238
+ "loss": 0.0786,
3239
+ "mean_token_accuracy": 0.7786692760884761,
3240
+ "num_tokens": 29406208.0,
3241
+ "step": 3590
3242
+ },
3243
+ {
3244
+ "epoch": 2.089385474860335,
3245
+ "grad_norm": 1.335424780845642,
3246
+ "learning_rate": 3.3748925193465175e-06,
3247
+ "loss": 0.0757,
3248
+ "mean_token_accuracy": 0.7785591997206212,
3249
+ "num_tokens": 29488128.0,
3250
+ "step": 3600
3251
+ },
3252
+ {
3253
+ "epoch": 2.0951897264746426,
3254
+ "grad_norm": 1.6017467975616455,
3255
+ "learning_rate": 3.353396388650043e-06,
3256
+ "loss": 0.0639,
3257
+ "mean_token_accuracy": 0.7843321934342384,
3258
+ "num_tokens": 29570048.0,
3259
+ "step": 3610
3260
+ },
3261
+ {
3262
+ "epoch": 2.10099397808895,
3263
+ "grad_norm": 1.5050615072250366,
3264
+ "learning_rate": 3.3319002579535687e-06,
3265
+ "loss": 0.0657,
3266
+ "mean_token_accuracy": 0.7802959885448217,
3267
+ "num_tokens": 29651968.0,
3268
+ "step": 3620
3269
+ },
3270
+ {
3271
+ "epoch": 2.1067982297032577,
3272
+ "grad_norm": 1.6678760051727295,
3273
+ "learning_rate": 3.3104041272570943e-06,
3274
+ "loss": 0.0872,
3275
+ "mean_token_accuracy": 0.7761497080326081,
3276
+ "num_tokens": 29733888.0,
3277
+ "step": 3630
3278
+ },
3279
+ {
3280
+ "epoch": 2.1126024813175652,
3281
+ "grad_norm": 1.618610143661499,
3282
+ "learning_rate": 3.288907996560619e-06,
3283
+ "loss": 0.0704,
3284
+ "mean_token_accuracy": 0.7856042109429836,
3285
+ "num_tokens": 29815808.0,
3286
+ "step": 3640
3287
+ },
3288
+ {
3289
+ "epoch": 2.1184067329318728,
3290
+ "grad_norm": 1.520416498184204,
3291
+ "learning_rate": 3.2674118658641447e-06,
3292
+ "loss": 0.0802,
3293
+ "mean_token_accuracy": 0.780247063934803,
3294
+ "num_tokens": 29897728.0,
3295
+ "step": 3650
3296
+ },
3297
+ {
3298
+ "epoch": 2.1242109845461803,
3299
+ "grad_norm": 1.4392660856246948,
3300
+ "learning_rate": 3.2459157351676703e-06,
3301
+ "loss": 0.0878,
3302
+ "mean_token_accuracy": 0.7686399213969708,
3303
+ "num_tokens": 29979648.0,
3304
+ "step": 3660
3305
+ },
3306
+ {
3307
+ "epoch": 2.1300152361604874,
3308
+ "grad_norm": 1.7562257051467896,
3309
+ "learning_rate": 3.2244196044711955e-06,
3310
+ "loss": 0.0924,
3311
+ "mean_token_accuracy": 0.7676492169499397,
3312
+ "num_tokens": 30061568.0,
3313
+ "step": 3670
3314
+ },
3315
+ {
3316
+ "epoch": 2.135819487774795,
3317
+ "grad_norm": 1.808322548866272,
3318
+ "learning_rate": 3.2029234737747207e-06,
3319
+ "loss": 0.0768,
3320
+ "mean_token_accuracy": 0.7800391376018524,
3321
+ "num_tokens": 30143488.0,
3322
+ "step": 3680
3323
+ },
3324
+ {
3325
+ "epoch": 2.1416237393891024,
3326
+ "grad_norm": 1.0957632064819336,
3327
+ "learning_rate": 3.1814273430782463e-06,
3328
+ "loss": 0.0692,
3329
+ "mean_token_accuracy": 0.7738013703376054,
3330
+ "num_tokens": 30225408.0,
3331
+ "step": 3690
3332
+ },
3333
+ {
3334
+ "epoch": 2.14742799100341,
3335
+ "grad_norm": 1.1658979654312134,
3336
+ "learning_rate": 3.1599312123817715e-06,
3337
+ "loss": 0.0642,
3338
+ "mean_token_accuracy": 0.7983977481722832,
3339
+ "num_tokens": 30307328.0,
3340
+ "step": 3700
3341
+ },
3342
+ {
3343
+ "epoch": 2.1532322426177175,
3344
+ "grad_norm": 1.1856775283813477,
3345
+ "learning_rate": 3.138435081685297e-06,
3346
+ "loss": 0.0626,
3347
+ "mean_token_accuracy": 0.7825954023748636,
3348
+ "num_tokens": 30389248.0,
3349
+ "step": 3710
3350
+ },
3351
+ {
3352
+ "epoch": 2.159036494232025,
3353
+ "grad_norm": 1.3690383434295654,
3354
+ "learning_rate": 3.1169389509888223e-06,
3355
+ "loss": 0.0835,
3356
+ "mean_token_accuracy": 0.7620841465890408,
3357
+ "num_tokens": 30471168.0,
3358
+ "step": 3720
3359
+ },
3360
+ {
3361
+ "epoch": 2.1648407458463326,
3362
+ "grad_norm": 1.2411593198776245,
3363
+ "learning_rate": 3.0954428202923474e-06,
3364
+ "loss": 0.0671,
3365
+ "mean_token_accuracy": 0.7918542079627514,
3366
+ "num_tokens": 30553088.0,
3367
+ "step": 3730
3368
+ },
3369
+ {
3370
+ "epoch": 2.17064499746064,
3371
+ "grad_norm": 1.424753189086914,
3372
+ "learning_rate": 3.073946689595873e-06,
3373
+ "loss": 0.075,
3374
+ "mean_token_accuracy": 0.78350048661232,
3375
+ "num_tokens": 30635008.0,
3376
+ "step": 3740
3377
+ },
3378
+ {
3379
+ "epoch": 2.176449249074947,
3380
+ "grad_norm": 1.6102385520935059,
3381
+ "learning_rate": 3.0524505588993986e-06,
3382
+ "loss": 0.0731,
3383
+ "mean_token_accuracy": 0.7807729918509722,
3384
+ "num_tokens": 30716928.0,
3385
+ "step": 3750
3386
+ },
3387
+ {
3388
+ "epoch": 2.1822535006892547,
3389
+ "grad_norm": 1.415201187133789,
3390
+ "learning_rate": 3.0309544282029234e-06,
3391
+ "loss": 0.0659,
3392
+ "mean_token_accuracy": 0.7775929540395736,
3393
+ "num_tokens": 30798848.0,
3394
+ "step": 3760
3395
+ },
3396
+ {
3397
+ "epoch": 2.1880577523035623,
3398
+ "grad_norm": 1.448217749595642,
3399
+ "learning_rate": 3.009458297506449e-06,
3400
+ "loss": 0.0964,
3401
+ "mean_token_accuracy": 0.7664138942956924,
3402
+ "num_tokens": 30880768.0,
3403
+ "step": 3770
3404
+ },
3405
+ {
3406
+ "epoch": 2.19386200391787,
3407
+ "grad_norm": 1.3995651006698608,
3408
+ "learning_rate": 2.9879621668099746e-06,
3409
+ "loss": 0.0853,
3410
+ "mean_token_accuracy": 0.7758072406053543,
3411
+ "num_tokens": 30962688.0,
3412
+ "step": 3780
3413
+ },
3414
+ {
3415
+ "epoch": 2.1996662555321773,
3416
+ "grad_norm": 1.0274962186813354,
3417
+ "learning_rate": 2.9664660361135002e-06,
3418
+ "loss": 0.0733,
3419
+ "mean_token_accuracy": 0.7836594924330711,
3420
+ "num_tokens": 31044608.0,
3421
+ "step": 3790
3422
+ },
3423
+ {
3424
+ "epoch": 2.205470507146485,
3425
+ "grad_norm": 1.4366650581359863,
3426
+ "learning_rate": 2.944969905417025e-06,
3427
+ "loss": 0.072,
3428
+ "mean_token_accuracy": 0.7864726021885872,
3429
+ "num_tokens": 31126528.0,
3430
+ "step": 3800
3431
+ },
3432
+ {
3433
+ "epoch": 2.2112747587607924,
3434
+ "grad_norm": 2.3021836280822754,
3435
+ "learning_rate": 2.9234737747205506e-06,
3436
+ "loss": 0.0739,
3437
+ "mean_token_accuracy": 0.7708781782537699,
3438
+ "num_tokens": 31208448.0,
3439
+ "step": 3810
3440
+ },
3441
+ {
3442
+ "epoch": 2.2170790103751,
3443
+ "grad_norm": 1.155388355255127,
3444
+ "learning_rate": 2.901977644024076e-06,
3445
+ "loss": 0.0719,
3446
+ "mean_token_accuracy": 0.7701443199068307,
3447
+ "num_tokens": 31290368.0,
3448
+ "step": 3820
3449
+ },
3450
+ {
3451
+ "epoch": 2.2228832619894074,
3452
+ "grad_norm": 1.2492424249649048,
3453
+ "learning_rate": 2.8804815133276014e-06,
3454
+ "loss": 0.0813,
3455
+ "mean_token_accuracy": 0.7763820935040713,
3456
+ "num_tokens": 31372288.0,
3457
+ "step": 3830
3458
+ },
3459
+ {
3460
+ "epoch": 2.2286875136037145,
3461
+ "grad_norm": 1.6110029220581055,
3462
+ "learning_rate": 2.8589853826311266e-06,
3463
+ "loss": 0.0735,
3464
+ "mean_token_accuracy": 0.7763698607683182,
3465
+ "num_tokens": 31454208.0,
3466
+ "step": 3840
3467
+ },
3468
+ {
3469
+ "epoch": 2.234491765218022,
3470
+ "grad_norm": 1.6716264486312866,
3471
+ "learning_rate": 2.837489251934652e-06,
3472
+ "loss": 0.0732,
3473
+ "mean_token_accuracy": 0.7921722121536732,
3474
+ "num_tokens": 31536128.0,
3475
+ "step": 3850
3476
+ },
3477
+ {
3478
+ "epoch": 2.2402960168323296,
3479
+ "grad_norm": 1.6248875856399536,
3480
+ "learning_rate": 2.8159931212381774e-06,
3481
+ "loss": 0.0677,
3482
+ "mean_token_accuracy": 0.7913038194179535,
3483
+ "num_tokens": 31618048.0,
3484
+ "step": 3860
3485
+ },
3486
+ {
3487
+ "epoch": 2.246100268446637,
3488
+ "grad_norm": 1.2621709108352661,
3489
+ "learning_rate": 2.794496990541703e-06,
3490
+ "loss": 0.0722,
3491
+ "mean_token_accuracy": 0.7781678088009357,
3492
+ "num_tokens": 31699968.0,
3493
+ "step": 3870
3494
+ },
3495
+ {
3496
+ "epoch": 2.2519045200609447,
3497
+ "grad_norm": 1.600437879562378,
3498
+ "learning_rate": 2.773000859845228e-06,
3499
+ "loss": 0.0794,
3500
+ "mean_token_accuracy": 0.77471868917346,
3501
+ "num_tokens": 31781888.0,
3502
+ "step": 3880
3503
+ },
3504
+ {
3505
+ "epoch": 2.257708771675252,
3506
+ "grad_norm": 1.2767956256866455,
3507
+ "learning_rate": 2.7515047291487533e-06,
3508
+ "loss": 0.077,
3509
+ "mean_token_accuracy": 0.7761986322700978,
3510
+ "num_tokens": 31863808.0,
3511
+ "step": 3890
3512
+ },
3513
+ {
3514
+ "epoch": 2.2635130232895597,
3515
+ "grad_norm": 1.3727670907974243,
3516
+ "learning_rate": 2.730008598452279e-06,
3517
+ "loss": 0.0621,
3518
+ "mean_token_accuracy": 0.7837817996740342,
3519
+ "num_tokens": 31945728.0,
3520
+ "step": 3900
3521
+ },
3522
+ {
3523
+ "epoch": 2.2693172749038673,
3524
+ "grad_norm": 1.73198664188385,
3525
+ "learning_rate": 2.7085124677558045e-06,
3526
+ "loss": 0.0733,
3527
+ "mean_token_accuracy": 0.7854207444936037,
3528
+ "num_tokens": 32027648.0,
3529
+ "step": 3910
3530
+ },
3531
+ {
3532
+ "epoch": 2.2751215265181743,
3533
+ "grad_norm": 1.376431941986084,
3534
+ "learning_rate": 2.6870163370593293e-06,
3535
+ "loss": 0.0765,
3536
+ "mean_token_accuracy": 0.7875489227473735,
3537
+ "num_tokens": 32109568.0,
3538
+ "step": 3920
3539
+ },
3540
+ {
3541
+ "epoch": 2.280925778132482,
3542
+ "grad_norm": 1.4519610404968262,
3543
+ "learning_rate": 2.665520206362855e-06,
3544
+ "loss": 0.074,
3545
+ "mean_token_accuracy": 0.7698507837951183,
3546
+ "num_tokens": 32191488.0,
3547
+ "step": 3930
3548
+ },
3549
+ {
3550
+ "epoch": 2.2867300297467894,
3551
+ "grad_norm": 1.6419020891189575,
3552
+ "learning_rate": 2.6440240756663805e-06,
3553
+ "loss": 0.0715,
3554
+ "mean_token_accuracy": 0.7834393359720707,
3555
+ "num_tokens": 32273408.0,
3556
+ "step": 3940
3557
+ },
3558
+ {
3559
+ "epoch": 2.292534281361097,
3560
+ "grad_norm": 1.8466465473175049,
3561
+ "learning_rate": 2.6225279449699053e-06,
3562
+ "loss": 0.0804,
3563
+ "mean_token_accuracy": 0.7836350306868554,
3564
+ "num_tokens": 32355328.0,
3565
+ "step": 3950
3566
+ },
3567
+ {
3568
+ "epoch": 2.2983385329754045,
3569
+ "grad_norm": 2.484616756439209,
3570
+ "learning_rate": 2.601031814273431e-06,
3571
+ "loss": 0.0727,
3572
+ "mean_token_accuracy": 0.7774584133177995,
3573
+ "num_tokens": 32437248.0,
3574
+ "step": 3960
3575
+ },
3576
+ {
3577
+ "epoch": 2.304142784589712,
3578
+ "grad_norm": 1.4137665033340454,
3579
+ "learning_rate": 2.5795356835769565e-06,
3580
+ "loss": 0.0639,
3581
+ "mean_token_accuracy": 0.7795132108032703,
3582
+ "num_tokens": 32519168.0,
3583
+ "step": 3970
3584
+ },
3585
+ {
3586
+ "epoch": 2.3099470362040195,
3587
+ "grad_norm": 1.2358074188232422,
3588
+ "learning_rate": 2.558039552880482e-06,
3589
+ "loss": 0.0711,
3590
+ "mean_token_accuracy": 0.7876223064959049,
3591
+ "num_tokens": 32601088.0,
3592
+ "step": 3980
3593
+ },
3594
+ {
3595
+ "epoch": 2.315751287818327,
3596
+ "grad_norm": 2.1651759147644043,
3597
+ "learning_rate": 2.536543422184007e-06,
3598
+ "loss": 0.0813,
3599
+ "mean_token_accuracy": 0.7723091997206211,
3600
+ "num_tokens": 32683008.0,
3601
+ "step": 3990
3602
+ },
3603
+ {
3604
+ "epoch": 2.3215555394326346,
3605
+ "grad_norm": 1.3276866674423218,
3606
+ "learning_rate": 2.5150472914875324e-06,
3607
+ "loss": 0.0727,
3608
+ "mean_token_accuracy": 0.7805161453783512,
3609
+ "num_tokens": 32764928.0,
3610
+ "step": 4000
3611
+ },
3612
+ {
3613
+ "epoch": 2.327359791046942,
3614
+ "grad_norm": 1.3086682558059692,
3615
+ "learning_rate": 2.493551160791058e-06,
3616
+ "loss": 0.0827,
3617
+ "mean_token_accuracy": 0.7633317016065121,
3618
+ "num_tokens": 32846848.0,
3619
+ "step": 4010
3620
+ },
3621
+ {
3622
+ "epoch": 2.333164042661249,
3623
+ "grad_norm": 1.2182139158248901,
3624
+ "learning_rate": 2.4720550300945832e-06,
3625
+ "loss": 0.0722,
3626
+ "mean_token_accuracy": 0.7765410944819451,
3627
+ "num_tokens": 32928768.0,
3628
+ "step": 4020
3629
+ },
3630
+ {
3631
+ "epoch": 2.3389682942755567,
3632
+ "grad_norm": 1.6519408226013184,
3633
+ "learning_rate": 2.4505588993981084e-06,
3634
+ "loss": 0.0878,
3635
+ "mean_token_accuracy": 0.7735567502677441,
3636
+ "num_tokens": 33010688.0,
3637
+ "step": 4030
3638
+ },
3639
+ {
3640
+ "epoch": 2.3447725458898643,
3641
+ "grad_norm": 1.2966673374176025,
3642
+ "learning_rate": 2.429062768701634e-06,
3643
+ "loss": 0.0647,
3644
+ "mean_token_accuracy": 0.7849682010710239,
3645
+ "num_tokens": 33092608.0,
3646
+ "step": 4040
3647
+ },
3648
+ {
3649
+ "epoch": 2.350576797504172,
3650
+ "grad_norm": 1.672311782836914,
3651
+ "learning_rate": 2.407566638005159e-06,
3652
+ "loss": 0.0724,
3653
+ "mean_token_accuracy": 0.782937865331769,
3654
+ "num_tokens": 33174528.0,
3655
+ "step": 4050
3656
+ },
3657
+ {
3658
+ "epoch": 2.3563810491184793,
3659
+ "grad_norm": 1.4624446630477905,
3660
+ "learning_rate": 2.3860705073086844e-06,
3661
+ "loss": 0.0787,
3662
+ "mean_token_accuracy": 0.7726761233061552,
3663
+ "num_tokens": 33256448.0,
3664
+ "step": 4060
3665
+ },
3666
+ {
3667
+ "epoch": 2.362185300732787,
3668
+ "grad_norm": 1.3207608461380005,
3669
+ "learning_rate": 2.36457437661221e-06,
3670
+ "loss": 0.074,
3671
+ "mean_token_accuracy": 0.776687865704298,
3672
+ "num_tokens": 33338368.0,
3673
+ "step": 4070
3674
+ },
3675
+ {
3676
+ "epoch": 2.3679895523470944,
3677
+ "grad_norm": 1.5817980766296387,
3678
+ "learning_rate": 2.343078245915735e-06,
3679
+ "loss": 0.0715,
3680
+ "mean_token_accuracy": 0.7849192768335342,
3681
+ "num_tokens": 33420288.0,
3682
+ "step": 4080
3683
+ },
3684
+ {
3685
+ "epoch": 2.3737938039614015,
3686
+ "grad_norm": 1.4322782754898071,
3687
+ "learning_rate": 2.3215821152192608e-06,
3688
+ "loss": 0.07,
3689
+ "mean_token_accuracy": 0.7940680023282767,
3690
+ "num_tokens": 33502208.0,
3691
+ "step": 4090
3692
+ },
3693
+ {
3694
+ "epoch": 2.379598055575709,
3695
+ "grad_norm": 1.8286523818969727,
3696
+ "learning_rate": 2.300085984522786e-06,
3697
+ "loss": 0.0701,
3698
+ "mean_token_accuracy": 0.7780577309429646,
3699
+ "num_tokens": 33584128.0,
3700
+ "step": 4100
3701
+ },
3702
+ {
3703
+ "epoch": 2.3854023071900166,
3704
+ "grad_norm": 1.4655170440673828,
3705
+ "learning_rate": 2.2785898538263116e-06,
3706
+ "loss": 0.0714,
3707
+ "mean_token_accuracy": 0.769752936065197,
3708
+ "num_tokens": 33666048.0,
3709
+ "step": 4110
3710
+ },
3711
+ {
3712
+ "epoch": 2.391206558804324,
3713
+ "grad_norm": 1.5401175022125244,
3714
+ "learning_rate": 2.2570937231298368e-06,
3715
+ "loss": 0.0653,
3716
+ "mean_token_accuracy": 0.7836839508265256,
3717
+ "num_tokens": 33747968.0,
3718
+ "step": 4120
3719
+ },
3720
+ {
3721
+ "epoch": 2.3970108104186316,
3722
+ "grad_norm": 1.9929181337356567,
3723
+ "learning_rate": 2.2355975924333624e-06,
3724
+ "loss": 0.0735,
3725
+ "mean_token_accuracy": 0.7790484316647053,
3726
+ "num_tokens": 33829888.0,
3727
+ "step": 4130
3728
+ },
3729
+ {
3730
+ "epoch": 2.402815062032939,
3731
+ "grad_norm": 1.3667681217193604,
3732
+ "learning_rate": 2.2141014617368875e-06,
3733
+ "loss": 0.0848,
3734
+ "mean_token_accuracy": 0.7788405057042838,
3735
+ "num_tokens": 33911808.0,
3736
+ "step": 4140
3737
+ },
3738
+ {
3739
+ "epoch": 2.4086193136472467,
3740
+ "grad_norm": 1.5205732583999634,
3741
+ "learning_rate": 2.192605331040413e-06,
3742
+ "loss": 0.0734,
3743
+ "mean_token_accuracy": 0.7823263231664896,
3744
+ "num_tokens": 33993728.0,
3745
+ "step": 4150
3746
+ },
3747
+ {
3748
+ "epoch": 2.414423565261554,
3749
+ "grad_norm": 1.2659153938293457,
3750
+ "learning_rate": 2.1711092003439383e-06,
3751
+ "loss": 0.069,
3752
+ "mean_token_accuracy": 0.7853962779045105,
3753
+ "num_tokens": 34075648.0,
3754
+ "step": 4160
3755
+ },
3756
+ {
3757
+ "epoch": 2.4202278168758617,
3758
+ "grad_norm": 1.5595417022705078,
3759
+ "learning_rate": 2.149613069647464e-06,
3760
+ "loss": 0.0745,
3761
+ "mean_token_accuracy": 0.7868517592549324,
3762
+ "num_tokens": 34157568.0,
3763
+ "step": 4170
3764
+ },
3765
+ {
3766
+ "epoch": 2.4260320684901693,
3767
+ "grad_norm": 1.0648770332336426,
3768
+ "learning_rate": 2.128116938950989e-06,
3769
+ "loss": 0.0752,
3770
+ "mean_token_accuracy": 0.7812010742723942,
3771
+ "num_tokens": 34239488.0,
3772
+ "step": 4180
3773
+ },
3774
+ {
3775
+ "epoch": 2.4318363201044764,
3776
+ "grad_norm": 1.3832935094833374,
3777
+ "learning_rate": 2.1066208082545143e-06,
3778
+ "loss": 0.074,
3779
+ "mean_token_accuracy": 0.7747920755296945,
3780
+ "num_tokens": 34321408.0,
3781
+ "step": 4190
3782
+ },
3783
+ {
3784
+ "epoch": 2.437640571718784,
3785
+ "grad_norm": 1.4626703262329102,
3786
+ "learning_rate": 2.08512467755804e-06,
3787
+ "loss": 0.0735,
3788
+ "mean_token_accuracy": 0.7776663392782212,
3789
+ "num_tokens": 34403328.0,
3790
+ "step": 4200
3791
+ },
3792
+ {
3793
+ "epoch": 2.4434448233330914,
3794
+ "grad_norm": 1.7622649669647217,
3795
+ "learning_rate": 2.063628546861565e-06,
3796
+ "loss": 0.0789,
3797
+ "mean_token_accuracy": 0.7715998023748398,
3798
+ "num_tokens": 34485248.0,
3799
+ "step": 4210
3800
+ },
3801
+ {
3802
+ "epoch": 2.449249074947399,
3803
+ "grad_norm": 1.1333751678466797,
3804
+ "learning_rate": 2.0421324161650903e-06,
3805
+ "loss": 0.0647,
3806
+ "mean_token_accuracy": 0.789371332526207,
3807
+ "num_tokens": 34567168.0,
3808
+ "step": 4220
3809
+ },
3810
+ {
3811
+ "epoch": 2.4550533265617065,
3812
+ "grad_norm": 1.1712217330932617,
3813
+ "learning_rate": 2.020636285468616e-06,
3814
+ "loss": 0.06,
3815
+ "mean_token_accuracy": 0.789444712549448,
3816
+ "num_tokens": 34649088.0,
3817
+ "step": 4230
3818
+ },
3819
+ {
3820
+ "epoch": 2.460857578176014,
3821
+ "grad_norm": 1.4060677289962769,
3822
+ "learning_rate": 1.999140154772141e-06,
3823
+ "loss": 0.0674,
3824
+ "mean_token_accuracy": 0.7787793520838022,
3825
+ "num_tokens": 34731008.0,
3826
+ "step": 4240
3827
+ },
3828
+ {
3829
+ "epoch": 2.4666618297903216,
3830
+ "grad_norm": 1.385300874710083,
3831
+ "learning_rate": 1.9776440240756663e-06,
3832
+ "loss": 0.0715,
3833
+ "mean_token_accuracy": 0.7902886494994164,
3834
+ "num_tokens": 34812928.0,
3835
+ "step": 4250
3836
+ },
3837
+ {
3838
+ "epoch": 2.4724660814046286,
3839
+ "grad_norm": 2.0180656909942627,
3840
+ "learning_rate": 1.956147893379192e-06,
3841
+ "loss": 0.0809,
3842
+ "mean_token_accuracy": 0.7750244610011577,
3843
+ "num_tokens": 34894848.0,
3844
+ "step": 4260
3845
+ },
3846
+ {
3847
+ "epoch": 2.478270333018936,
3848
+ "grad_norm": 1.4561444520950317,
3849
+ "learning_rate": 1.934651762682717e-06,
3850
+ "loss": 0.07,
3851
+ "mean_token_accuracy": 0.7866927605122328,
3852
+ "num_tokens": 34976768.0,
3853
+ "step": 4270
3854
+ },
3855
+ {
3856
+ "epoch": 2.4840745846332437,
3857
+ "grad_norm": 1.5207933187484741,
3858
+ "learning_rate": 1.9131556319862426e-06,
3859
+ "loss": 0.0673,
3860
+ "mean_token_accuracy": 0.7777641881257296,
3861
+ "num_tokens": 35058688.0,
3862
+ "step": 4280
3863
+ },
3864
+ {
3865
+ "epoch": 2.4898788362475512,
3866
+ "grad_norm": 1.2230969667434692,
3867
+ "learning_rate": 1.8916595012897678e-06,
3868
+ "loss": 0.0642,
3869
+ "mean_token_accuracy": 0.7840631097555161,
3870
+ "num_tokens": 35140608.0,
3871
+ "step": 4290
3872
+ },
3873
+ {
3874
+ "epoch": 2.4956830878618588,
3875
+ "grad_norm": 1.5149006843566895,
3876
+ "learning_rate": 1.8701633705932934e-06,
3877
+ "loss": 0.069,
3878
+ "mean_token_accuracy": 0.7750856131315231,
3879
+ "num_tokens": 35222528.0,
3880
+ "step": 4300
3881
+ },
3882
+ {
3883
+ "epoch": 2.5014873394761663,
3884
+ "grad_norm": 1.388324499130249,
3885
+ "learning_rate": 1.8486672398968186e-06,
3886
+ "loss": 0.0624,
3887
+ "mean_token_accuracy": 0.7744740705937148,
3888
+ "num_tokens": 35304448.0,
3889
+ "step": 4310
3890
+ },
3891
+ {
3892
+ "epoch": 2.507291591090474,
3893
+ "grad_norm": 1.2073156833648682,
3894
+ "learning_rate": 1.8271711092003442e-06,
3895
+ "loss": 0.0746,
3896
+ "mean_token_accuracy": 0.7800391394644975,
3897
+ "num_tokens": 35386368.0,
3898
+ "step": 4320
3899
+ },
3900
+ {
3901
+ "epoch": 2.5130958427047814,
3902
+ "grad_norm": 1.2883899211883545,
3903
+ "learning_rate": 1.8056749785038694e-06,
3904
+ "loss": 0.0683,
3905
+ "mean_token_accuracy": 0.7817636966705322,
3906
+ "num_tokens": 35468288.0,
3907
+ "step": 4330
3908
+ },
3909
+ {
3910
+ "epoch": 2.518900094319089,
3911
+ "grad_norm": 1.3818330764770508,
3912
+ "learning_rate": 1.7841788478073948e-06,
3913
+ "loss": 0.0755,
3914
+ "mean_token_accuracy": 0.7780821919441223,
3915
+ "num_tokens": 35550208.0,
3916
+ "step": 4340
3917
+ },
3918
+ {
3919
+ "epoch": 2.5247043459333964,
3920
+ "grad_norm": 1.5316420793533325,
3921
+ "learning_rate": 1.7626827171109202e-06,
3922
+ "loss": 0.0751,
3923
+ "mean_token_accuracy": 0.7811643831431866,
3924
+ "num_tokens": 35632128.0,
3925
+ "step": 4350
3926
+ },
3927
+ {
3928
+ "epoch": 2.530508597547704,
3929
+ "grad_norm": 1.7762279510498047,
3930
+ "learning_rate": 1.7411865864144456e-06,
3931
+ "loss": 0.0777,
3932
+ "mean_token_accuracy": 0.7726394325494766,
3933
+ "num_tokens": 35714048.0,
3934
+ "step": 4360
3935
+ },
3936
+ {
3937
+ "epoch": 2.536312849162011,
3938
+ "grad_norm": 2.5594027042388916,
3939
+ "learning_rate": 1.7196904557179708e-06,
3940
+ "loss": 0.0743,
3941
+ "mean_token_accuracy": 0.7793908979743719,
3942
+ "num_tokens": 35795968.0,
3943
+ "step": 4370
3944
+ },
3945
+ {
3946
+ "epoch": 2.5421171007763186,
3947
+ "grad_norm": 1.6017844676971436,
3948
+ "learning_rate": 1.6981943250214964e-06,
3949
+ "loss": 0.0868,
3950
+ "mean_token_accuracy": 0.7675513669848442,
3951
+ "num_tokens": 35877888.0,
3952
+ "step": 4380
3953
+ },
3954
+ {
3955
+ "epoch": 2.547921352390626,
3956
+ "grad_norm": 1.0661394596099854,
3957
+ "learning_rate": 1.6766981943250216e-06,
3958
+ "loss": 0.0751,
3959
+ "mean_token_accuracy": 0.7853351254016161,
3960
+ "num_tokens": 35959808.0,
3961
+ "step": 4390
3962
+ },
3963
+ {
3964
+ "epoch": 2.5537256040049336,
3965
+ "grad_norm": 1.982590675354004,
3966
+ "learning_rate": 1.6552020636285472e-06,
3967
+ "loss": 0.0763,
3968
+ "mean_token_accuracy": 0.7791952028870582,
3969
+ "num_tokens": 36041728.0,
3970
+ "step": 4400
3971
+ },
3972
+ {
3973
+ "epoch": 2.559529855619241,
3974
+ "grad_norm": 1.2957309484481812,
3975
+ "learning_rate": 1.6337059329320724e-06,
3976
+ "loss": 0.0778,
3977
+ "mean_token_accuracy": 0.7759540095925331,
3978
+ "num_tokens": 36123648.0,
3979
+ "step": 4410
3980
+ },
3981
+ {
3982
+ "epoch": 2.5653341072335487,
3983
+ "grad_norm": 1.2153947353363037,
3984
+ "learning_rate": 1.6122098022355977e-06,
3985
+ "loss": 0.0742,
3986
+ "mean_token_accuracy": 0.7756237789988518,
3987
+ "num_tokens": 36205568.0,
3988
+ "step": 4420
3989
+ },
3990
+ {
3991
+ "epoch": 2.571138358847856,
3992
+ "grad_norm": 0.9859249591827393,
3993
+ "learning_rate": 1.5907136715391231e-06,
3994
+ "loss": 0.0704,
3995
+ "mean_token_accuracy": 0.7828155551105738,
3996
+ "num_tokens": 36287488.0,
3997
+ "step": 4430
3998
+ },
3999
+ {
4000
+ "epoch": 2.5769426104621633,
4001
+ "grad_norm": 1.356974720954895,
4002
+ "learning_rate": 1.5692175408426485e-06,
4003
+ "loss": 0.0744,
4004
+ "mean_token_accuracy": 0.7800024446099997,
4005
+ "num_tokens": 36369408.0,
4006
+ "step": 4440
4007
+ },
4008
+ {
4009
+ "epoch": 2.582746862076471,
4010
+ "grad_norm": 1.439613699913025,
4011
+ "learning_rate": 1.5477214101461737e-06,
4012
+ "loss": 0.0632,
4013
+ "mean_token_accuracy": 0.7881604686379433,
4014
+ "num_tokens": 36451328.0,
4015
+ "step": 4450
4016
+ },
4017
+ {
4018
+ "epoch": 2.5885511136907784,
4019
+ "grad_norm": 1.880275011062622,
4020
+ "learning_rate": 1.5262252794496993e-06,
4021
+ "loss": 0.0738,
4022
+ "mean_token_accuracy": 0.7977005876600742,
4023
+ "num_tokens": 36533248.0,
4024
+ "step": 4460
4025
+ },
4026
+ {
4027
+ "epoch": 2.594355365305086,
4028
+ "grad_norm": 1.3106391429901123,
4029
+ "learning_rate": 1.5047291487532245e-06,
4030
+ "loss": 0.0746,
4031
+ "mean_token_accuracy": 0.7735078293830157,
4032
+ "num_tokens": 36615168.0,
4033
+ "step": 4470
4034
+ },
4035
+ {
4036
+ "epoch": 2.6001596169193935,
4037
+ "grad_norm": 1.400251865386963,
4038
+ "learning_rate": 1.4832330180567501e-06,
4039
+ "loss": 0.0718,
4040
+ "mean_token_accuracy": 0.7904721148312092,
4041
+ "num_tokens": 36697088.0,
4042
+ "step": 4480
4043
+ },
4044
+ {
4045
+ "epoch": 2.605963868533701,
4046
+ "grad_norm": 1.189841628074646,
4047
+ "learning_rate": 1.4617368873602753e-06,
4048
+ "loss": 0.0694,
4049
+ "mean_token_accuracy": 0.7853473566472531,
4050
+ "num_tokens": 36779008.0,
4051
+ "step": 4490
4052
+ },
4053
+ {
4054
+ "epoch": 2.6117681201480085,
4055
+ "grad_norm": 2.03908109664917,
4056
+ "learning_rate": 1.4402407566638007e-06,
4057
+ "loss": 0.0611,
4058
+ "mean_token_accuracy": 0.7851027369499206,
4059
+ "num_tokens": 36860928.0,
4060
+ "step": 4500
4061
  }
4062
  ],
4063
  "logging_steps": 10,
 
4077
  "attributes": {}
4078
  }
4079
  },
4080
+ "total_flos": 9.7416235091755e+16,
4081
  "train_batch_size": 2,
4082
  "trial_name": null,
4083
  "trial_params": null