AiAF commited on
Commit
ef695d7
·
verified ·
1 Parent(s): 1f2e159

Training in progress, step 750, checkpoint

Browse files
last-checkpoint/adapter_model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:59badcbc1d668a371853f284e39f9ca33e2fe2af68b773148163044bb0f70bdd
3
  size 102264160
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f145aad3e393aacb1ea6687fe5c794bd1505c6b68c50e5038c6eac34efa7e4d6
3
  size 102264160
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:49665f71e34f2a3db3bbae94d41e9706d6e4267d7bf49d604935f42728af0512
3
  size 52162827
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:140bdab4eebed8c5ba2417db0ed65f56201fa6307a32fb787ad292b97ae34b13
3
  size 52162827
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2376b84b97294d583dff60749feb13d6533baf27b96b9a245af922803baac53
3
  size 14645
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4295d68f9590a1ee84490e5a76cd2d12d84f3c4e7c7542a7915be508cf875fe0
3
  size 14645
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1fccf8d05f51ee90d9abfa90ec4fa092bb34ce369846454436f6371151204846
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6af5f150dbd15fa79794ceabe67cfe7018c07d61742eb73c3c6b041388c26d7c
3
  size 1465
last-checkpoint/tokens_state.json CHANGED
@@ -1 +1 @@
1
- {"total": 9769472, "trainable": 4042644}
 
1
+ {"total": 10467328, "trainable": 4329291}
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.3414217778319717,
6
  "eval_steps": 50,
7
- "global_step": 700,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9988,6 +9988,718 @@
9988
  "memory/max_active (GiB)": 11.76,
9989
  "memory/max_allocated (GiB)": 11.76,
9990
  "step": 700
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9991
  }
9992
  ],
9993
  "logging_steps": 1,
@@ -10007,7 +10719,7 @@
10007
  "attributes": {}
10008
  }
10009
  },
10010
- "total_flos": 1.201690148756521e+17,
10011
  "train_batch_size": 2,
10012
  "trial_name": null,
10013
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.36580904767711253,
6
  "eval_steps": 50,
7
+ "global_step": 750,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9988
  "memory/max_active (GiB)": 11.76,
9989
  "memory/max_allocated (GiB)": 11.76,
9990
  "step": 700
9991
+ },
9992
+ {
9993
+ "epoch": 0.3419095232288745,
9994
+ "grad_norm": 0.15700815618038177,
9995
+ "learning_rate": 4.360429701490934e-05,
9996
+ "loss": 2.678558111190796,
9997
+ "memory/device_reserved (GiB)": 37.36,
9998
+ "memory/max_active (GiB)": 16.07,
9999
+ "memory/max_allocated (GiB)": 16.07,
10000
+ "ppl": 14.56408,
10001
+ "step": 701,
10002
+ "tokens/total": 9783296,
10003
+ "tokens/train_per_sec_per_gpu": 731.12,
10004
+ "tokens/trainable": 4048483
10005
+ },
10006
+ {
10007
+ "epoch": 0.34239726862577735,
10008
+ "grad_norm": 0.15936368703842163,
10009
+ "learning_rate": 4.333713551181852e-05,
10010
+ "loss": 2.4016025066375732,
10011
+ "memory/device_reserved (GiB)": 37.36,
10012
+ "memory/max_active (GiB)": 16.07,
10013
+ "memory/max_allocated (GiB)": 16.07,
10014
+ "ppl": 11.04086,
10015
+ "step": 702,
10016
+ "tokens/total": 9796608,
10017
+ "tokens/train_per_sec_per_gpu": 1781.96,
10018
+ "tokens/trainable": 4053789
10019
+ },
10020
+ {
10021
+ "epoch": 0.3428850140226802,
10022
+ "grad_norm": 0.15781526267528534,
10023
+ "learning_rate": 4.307056837536373e-05,
10024
+ "loss": 2.310777187347412,
10025
+ "memory/device_reserved (GiB)": 49.08,
10026
+ "memory/max_active (GiB)": 16.51,
10027
+ "memory/max_allocated (GiB)": 16.51,
10028
+ "ppl": 10.08226,
10029
+ "step": 703,
10030
+ "tokens/total": 9812224,
10031
+ "tokens/train_per_sec_per_gpu": 2462.76,
10032
+ "tokens/trainable": 4058992
10033
+ },
10034
+ {
10035
+ "epoch": 0.34337275941958295,
10036
+ "grad_norm": 0.15782800316810608,
10037
+ "learning_rate": 4.2804598401708175e-05,
10038
+ "loss": 2.5483644008636475,
10039
+ "memory/device_reserved (GiB)": 49.08,
10040
+ "memory/max_active (GiB)": 16.07,
10041
+ "memory/max_allocated (GiB)": 16.07,
10042
+ "ppl": 12.78617,
10043
+ "step": 704,
10044
+ "tokens/total": 9825920,
10045
+ "tokens/train_per_sec_per_gpu": 725.7,
10046
+ "tokens/trainable": 4064335
10047
+ },
10048
+ {
10049
+ "epoch": 0.3438605048164858,
10050
+ "grad_norm": 0.13850180804729462,
10051
+ "learning_rate": 4.253922838075095e-05,
10052
+ "loss": 2.7391016483306885,
10053
+ "memory/device_reserved (GiB)": 49.08,
10054
+ "memory/max_active (GiB)": 16.51,
10055
+ "memory/max_allocated (GiB)": 16.51,
10056
+ "ppl": 15.47308,
10057
+ "step": 705,
10058
+ "tokens/total": 9840896,
10059
+ "tokens/train_per_sec_per_gpu": 2419.77,
10060
+ "tokens/trainable": 4072424
10061
+ },
10062
+ {
10063
+ "epoch": 0.3443482502133886,
10064
+ "grad_norm": 0.16455164551734924,
10065
+ "learning_rate": 4.227446109609809e-05,
10066
+ "loss": 2.409106969833374,
10067
+ "memory/device_reserved (GiB)": 49.08,
10068
+ "memory/max_active (GiB)": 16.07,
10069
+ "memory/max_allocated (GiB)": 16.07,
10070
+ "ppl": 11.12402,
10071
+ "step": 706,
10072
+ "tokens/total": 9855104,
10073
+ "tokens/train_per_sec_per_gpu": 1066.86,
10074
+ "tokens/trainable": 4077224
10075
+ },
10076
+ {
10077
+ "epoch": 0.34483599561029143,
10078
+ "grad_norm": 0.1787019520998001,
10079
+ "learning_rate": 4.2010299325033034e-05,
10080
+ "loss": 2.276559352874756,
10081
+ "memory/device_reserved (GiB)": 49.08,
10082
+ "memory/max_active (GiB)": 15.63,
10083
+ "memory/max_allocated (GiB)": 15.63,
10084
+ "ppl": 9.7431,
10085
+ "step": 707,
10086
+ "tokens/total": 9867776,
10087
+ "tokens/train_per_sec_per_gpu": 799.15,
10088
+ "tokens/trainable": 4081093
10089
+ },
10090
+ {
10091
+ "epoch": 0.34532374100719426,
10092
+ "grad_norm": 0.14423713088035583,
10093
+ "learning_rate": 4.17467458384878e-05,
10094
+ "loss": 2.411412239074707,
10095
+ "memory/device_reserved (GiB)": 49.08,
10096
+ "memory/max_active (GiB)": 16.51,
10097
+ "memory/max_allocated (GiB)": 16.51,
10098
+ "ppl": 11.1497,
10099
+ "step": 708,
10100
+ "tokens/total": 9881856,
10101
+ "tokens/train_per_sec_per_gpu": 3455.55,
10102
+ "tokens/trainable": 4087806
10103
+ },
10104
+ {
10105
+ "epoch": 0.34581148640409703,
10106
+ "grad_norm": 0.1406305432319641,
10107
+ "learning_rate": 4.1483803401013796e-05,
10108
+ "loss": 2.494349241256714,
10109
+ "memory/device_reserved (GiB)": 49.08,
10110
+ "memory/max_active (GiB)": 16.42,
10111
+ "memory/max_allocated (GiB)": 16.42,
10112
+ "ppl": 12.11385,
10113
+ "step": 709,
10114
+ "tokens/total": 9896192,
10115
+ "tokens/train_per_sec_per_gpu": 1836.23,
10116
+ "tokens/trainable": 4094473
10117
+ },
10118
+ {
10119
+ "epoch": 0.34629923180099986,
10120
+ "grad_norm": 0.1451570838689804,
10121
+ "learning_rate": 4.12214747707527e-05,
10122
+ "loss": 2.4983654022216797,
10123
+ "memory/device_reserved (GiB)": 49.08,
10124
+ "memory/max_active (GiB)": 14.74,
10125
+ "memory/max_allocated (GiB)": 14.74,
10126
+ "ppl": 12.1626,
10127
+ "step": 710,
10128
+ "tokens/total": 9909120,
10129
+ "tokens/train_per_sec_per_gpu": 2190.99,
10130
+ "tokens/trainable": 4100682
10131
+ },
10132
+ {
10133
+ "epoch": 0.3467869771979027,
10134
+ "grad_norm": 0.18409603834152222,
10135
+ "learning_rate": 4.0959762699407766e-05,
10136
+ "loss": 2.590090751647949,
10137
+ "memory/device_reserved (GiB)": 49.08,
10138
+ "memory/max_active (GiB)": 15.18,
10139
+ "memory/max_allocated (GiB)": 15.18,
10140
+ "ppl": 13.33098,
10141
+ "step": 711,
10142
+ "tokens/total": 9922304,
10143
+ "tokens/train_per_sec_per_gpu": 2223.08,
10144
+ "tokens/trainable": 4104953
10145
+ },
10146
+ {
10147
+ "epoch": 0.3472747225948055,
10148
+ "grad_norm": 0.14688999950885773,
10149
+ "learning_rate": 4.0698669932214727e-05,
10150
+ "loss": 2.700690507888794,
10151
+ "memory/device_reserved (GiB)": 49.08,
10152
+ "memory/max_active (GiB)": 16.51,
10153
+ "memory/max_allocated (GiB)": 16.51,
10154
+ "ppl": 14.89001,
10155
+ "step": 712,
10156
+ "tokens/total": 9936000,
10157
+ "tokens/train_per_sec_per_gpu": 2682.83,
10158
+ "tokens/trainable": 4111587
10159
+ },
10160
+ {
10161
+ "epoch": 0.34776246799170835,
10162
+ "grad_norm": 0.13892816007137299,
10163
+ "learning_rate": 4.043819920791322e-05,
10164
+ "loss": 2.448453426361084,
10165
+ "memory/device_reserved (GiB)": 49.08,
10166
+ "memory/max_active (GiB)": 16.07,
10167
+ "memory/max_allocated (GiB)": 16.07,
10168
+ "ppl": 11.57044,
10169
+ "step": 713,
10170
+ "tokens/total": 9951360,
10171
+ "tokens/train_per_sec_per_gpu": 3127.06,
10172
+ "tokens/trainable": 4118516
10173
+ },
10174
+ {
10175
+ "epoch": 0.3482502133886111,
10176
+ "grad_norm": 0.1610805243253708,
10177
+ "learning_rate": 4.0178353258717804e-05,
10178
+ "loss": 2.5162341594696045,
10179
+ "memory/device_reserved (GiB)": 49.08,
10180
+ "memory/max_active (GiB)": 16.07,
10181
+ "memory/max_allocated (GiB)": 16.07,
10182
+ "ppl": 12.38188,
10183
+ "step": 714,
10184
+ "tokens/total": 9966336,
10185
+ "tokens/train_per_sec_per_gpu": 1221.89,
10186
+ "tokens/trainable": 4124094
10187
+ },
10188
+ {
10189
+ "epoch": 0.34873795878551395,
10190
+ "grad_norm": 0.14121432602405548,
10191
+ "learning_rate": 3.991913481028965e-05,
10192
+ "loss": 2.4478676319122314,
10193
+ "memory/device_reserved (GiB)": 49.08,
10194
+ "memory/max_active (GiB)": 16.07,
10195
+ "memory/max_allocated (GiB)": 16.07,
10196
+ "ppl": 11.56366,
10197
+ "step": 715,
10198
+ "tokens/total": 9980544,
10199
+ "tokens/train_per_sec_per_gpu": 2758.01,
10200
+ "tokens/trainable": 4130649
10201
+ },
10202
+ {
10203
+ "epoch": 0.3492257041824168,
10204
+ "grad_norm": 0.14568425714969635,
10205
+ "learning_rate": 3.966054658170754e-05,
10206
+ "loss": 2.542024612426758,
10207
+ "memory/device_reserved (GiB)": 49.08,
10208
+ "memory/max_active (GiB)": 16.51,
10209
+ "memory/max_allocated (GiB)": 16.51,
10210
+ "ppl": 12.70537,
10211
+ "step": 716,
10212
+ "tokens/total": 9995008,
10213
+ "tokens/train_per_sec_per_gpu": 1350.01,
10214
+ "tokens/trainable": 4137414
10215
+ },
10216
+ {
10217
+ "epoch": 0.3497134495793196,
10218
+ "grad_norm": 0.14108124375343323,
10219
+ "learning_rate": 3.940259128543967e-05,
10220
+ "loss": 2.3504650592803955,
10221
+ "memory/device_reserved (GiB)": 49.08,
10222
+ "memory/max_active (GiB)": 15.63,
10223
+ "memory/max_allocated (GiB)": 15.63,
10224
+ "ppl": 10.49045,
10225
+ "step": 717,
10226
+ "tokens/total": 10008960,
10227
+ "tokens/train_per_sec_per_gpu": 793.29,
10228
+ "tokens/trainable": 4143920
10229
+ },
10230
+ {
10231
+ "epoch": 0.35020119497622243,
10232
+ "grad_norm": 0.16065354645252228,
10233
+ "learning_rate": 3.9145271627314986e-05,
10234
+ "loss": 2.477329969406128,
10235
+ "memory/device_reserved (GiB)": 49.08,
10236
+ "memory/max_active (GiB)": 13.41,
10237
+ "memory/max_allocated (GiB)": 13.41,
10238
+ "ppl": 11.90942,
10239
+ "step": 718,
10240
+ "tokens/total": 10021120,
10241
+ "tokens/train_per_sec_per_gpu": 2102.68,
10242
+ "tokens/trainable": 4148865
10243
+ },
10244
+ {
10245
+ "epoch": 0.3506889403731252,
10246
+ "grad_norm": 0.145247220993042,
10247
+ "learning_rate": 3.8888590306494974e-05,
10248
+ "loss": 2.375197410583496,
10249
+ "memory/device_reserved (GiB)": 49.08,
10250
+ "memory/max_active (GiB)": 16.42,
10251
+ "memory/max_allocated (GiB)": 16.42,
10252
+ "ppl": 10.75314,
10253
+ "step": 719,
10254
+ "tokens/total": 10034944,
10255
+ "tokens/train_per_sec_per_gpu": 1469.38,
10256
+ "tokens/trainable": 4154874
10257
+ },
10258
+ {
10259
+ "epoch": 0.35117668577002803,
10260
+ "grad_norm": 0.13928496837615967,
10261
+ "learning_rate": 3.8632550015445256e-05,
10262
+ "loss": 2.509256601333618,
10263
+ "memory/device_reserved (GiB)": 49.08,
10264
+ "memory/max_active (GiB)": 15.63,
10265
+ "memory/max_allocated (GiB)": 15.63,
10266
+ "ppl": 12.29579,
10267
+ "step": 720,
10268
+ "tokens/total": 10048000,
10269
+ "tokens/train_per_sec_per_gpu": 1983.62,
10270
+ "tokens/trainable": 4162436
10271
+ },
10272
+ {
10273
+ "epoch": 0.35166443116693086,
10274
+ "grad_norm": 0.1396213173866272,
10275
+ "learning_rate": 3.8377153439907266e-05,
10276
+ "loss": 2.288262367248535,
10277
+ "memory/device_reserved (GiB)": 49.08,
10278
+ "memory/max_active (GiB)": 16.51,
10279
+ "memory/max_allocated (GiB)": 16.51,
10280
+ "ppl": 9.85779,
10281
+ "step": 721,
10282
+ "tokens/total": 10062080,
10283
+ "tokens/train_per_sec_per_gpu": 1594.6,
10284
+ "tokens/trainable": 4168778
10285
+ },
10286
+ {
10287
+ "epoch": 0.3521521765638337,
10288
+ "grad_norm": 0.17178580164909363,
10289
+ "learning_rate": 3.81224032588703e-05,
10290
+ "loss": 2.3965115547180176,
10291
+ "memory/device_reserved (GiB)": 49.08,
10292
+ "memory/max_active (GiB)": 16.07,
10293
+ "memory/max_allocated (GiB)": 16.07,
10294
+ "ppl": 10.98479,
10295
+ "step": 722,
10296
+ "tokens/total": 10075392,
10297
+ "tokens/train_per_sec_per_gpu": 916.45,
10298
+ "tokens/trainable": 4172899
10299
+ },
10300
+ {
10301
+ "epoch": 0.3526399219607365,
10302
+ "grad_norm": 0.19447971880435944,
10303
+ "learning_rate": 3.786830214454315e-05,
10304
+ "loss": 2.4896106719970703,
10305
+ "memory/device_reserved (GiB)": 49.08,
10306
+ "memory/max_active (GiB)": 16.51,
10307
+ "memory/max_allocated (GiB)": 16.51,
10308
+ "ppl": 12.05658,
10309
+ "step": 723,
10310
+ "tokens/total": 10090112,
10311
+ "tokens/train_per_sec_per_gpu": 988.46,
10312
+ "tokens/trainable": 4176781
10313
+ },
10314
+ {
10315
+ "epoch": 0.3531276673576393,
10316
+ "grad_norm": 0.1555708646774292,
10317
+ "learning_rate": 3.7614852762326305e-05,
10318
+ "loss": 2.4370362758636475,
10319
+ "memory/device_reserved (GiB)": 49.08,
10320
+ "memory/max_active (GiB)": 16.51,
10321
+ "memory/max_allocated (GiB)": 16.51,
10322
+ "ppl": 11.43909,
10323
+ "step": 724,
10324
+ "tokens/total": 10104832,
10325
+ "tokens/train_per_sec_per_gpu": 2862.85,
10326
+ "tokens/trainable": 4182322
10327
+ },
10328
+ {
10329
+ "epoch": 0.3536154127545421,
10330
+ "grad_norm": 0.15927904844284058,
10331
+ "learning_rate": 3.736205777078381e-05,
10332
+ "loss": 2.3857152462005615,
10333
+ "memory/device_reserved (GiB)": 49.08,
10334
+ "memory/max_active (GiB)": 15.63,
10335
+ "memory/max_allocated (GiB)": 15.63,
10336
+ "ppl": 10.86683,
10337
+ "step": 725,
10338
+ "tokens/total": 10118144,
10339
+ "tokens/train_per_sec_per_gpu": 1523.11,
10340
+ "tokens/trainable": 4187426
10341
+ },
10342
+ {
10343
+ "epoch": 0.35410315815144494,
10344
+ "grad_norm": 0.16110184788703918,
10345
+ "learning_rate": 3.710991982161555e-05,
10346
+ "loss": 2.508744716644287,
10347
+ "memory/device_reserved (GiB)": 49.08,
10348
+ "memory/max_active (GiB)": 13.41,
10349
+ "memory/max_allocated (GiB)": 13.41,
10350
+ "ppl": 12.28949,
10351
+ "step": 726,
10352
+ "tokens/total": 10129152,
10353
+ "tokens/train_per_sec_per_gpu": 3551.65,
10354
+ "tokens/trainable": 4192287
10355
+ },
10356
+ {
10357
+ "epoch": 0.35459090354834777,
10358
+ "grad_norm": 0.17562873661518097,
10359
+ "learning_rate": 3.6858441559629306e-05,
10360
+ "loss": 2.5916664600372314,
10361
+ "memory/device_reserved (GiB)": 49.08,
10362
+ "memory/max_active (GiB)": 14.74,
10363
+ "memory/max_allocated (GiB)": 14.74,
10364
+ "ppl": 13.352,
10365
+ "step": 727,
10366
+ "tokens/total": 10142080,
10367
+ "tokens/train_per_sec_per_gpu": 833.09,
10368
+ "tokens/trainable": 4196474
10369
+ },
10370
+ {
10371
+ "epoch": 0.3550786489452506,
10372
+ "grad_norm": 0.16362859308719635,
10373
+ "learning_rate": 3.6607625622713e-05,
10374
+ "loss": 2.5120694637298584,
10375
+ "memory/device_reserved (GiB)": 49.08,
10376
+ "memory/max_active (GiB)": 16.51,
10377
+ "memory/max_allocated (GiB)": 16.51,
10378
+ "ppl": 12.33042,
10379
+ "step": 728,
10380
+ "tokens/total": 10156160,
10381
+ "tokens/train_per_sec_per_gpu": 1357.25,
10382
+ "tokens/trainable": 4201530
10383
+ },
10384
+ {
10385
+ "epoch": 0.35556639434215337,
10386
+ "grad_norm": 0.18990835547447205,
10387
+ "learning_rate": 3.63574746418072e-05,
10388
+ "loss": 2.468756914138794,
10389
+ "memory/device_reserved (GiB)": 49.08,
10390
+ "memory/max_active (GiB)": 16.51,
10391
+ "memory/max_allocated (GiB)": 16.51,
10392
+ "ppl": 11.80776,
10393
+ "step": 729,
10394
+ "tokens/total": 10169728,
10395
+ "tokens/train_per_sec_per_gpu": 2009.19,
10396
+ "tokens/trainable": 4205098
10397
+ },
10398
+ {
10399
+ "epoch": 0.3560541397390562,
10400
+ "grad_norm": 0.13236725330352783,
10401
+ "learning_rate": 3.610799124087725e-05,
10402
+ "loss": 2.67596435546875,
10403
+ "memory/device_reserved (GiB)": 49.08,
10404
+ "memory/max_active (GiB)": 16.51,
10405
+ "memory/max_allocated (GiB)": 16.51,
10406
+ "ppl": 14.52635,
10407
+ "step": 730,
10408
+ "tokens/total": 10185344,
10409
+ "tokens/train_per_sec_per_gpu": 3279.33,
10410
+ "tokens/trainable": 4214000
10411
+ },
10412
+ {
10413
+ "epoch": 0.356541885135959,
10414
+ "grad_norm": 0.1518988162279129,
10415
+ "learning_rate": 3.585917803688603e-05,
10416
+ "loss": 2.535468101501465,
10417
+ "memory/device_reserved (GiB)": 49.08,
10418
+ "memory/max_active (GiB)": 15.98,
10419
+ "memory/max_allocated (GiB)": 15.98,
10420
+ "ppl": 12.62234,
10421
+ "step": 731,
10422
+ "tokens/total": 10199296,
10423
+ "tokens/train_per_sec_per_gpu": 646.5,
10424
+ "tokens/trainable": 4219880
10425
+ },
10426
+ {
10427
+ "epoch": 0.35702963053286185,
10428
+ "grad_norm": 0.18351151049137115,
10429
+ "learning_rate": 3.5611037639766265e-05,
10430
+ "loss": 2.455716371536255,
10431
+ "memory/device_reserved (GiB)": 49.08,
10432
+ "memory/max_active (GiB)": 16.42,
10433
+ "memory/max_allocated (GiB)": 16.42,
10434
+ "ppl": 11.65478,
10435
+ "step": 732,
10436
+ "tokens/total": 10214272,
10437
+ "tokens/train_per_sec_per_gpu": 2191.87,
10438
+ "tokens/trainable": 4224167
10439
+ },
10440
+ {
10441
+ "epoch": 0.3575173759297647,
10442
+ "grad_norm": 0.1563291698694229,
10443
+ "learning_rate": 3.5363572652393326e-05,
10444
+ "loss": 2.5146679878234863,
10445
+ "memory/device_reserved (GiB)": 49.08,
10446
+ "memory/max_active (GiB)": 15.63,
10447
+ "memory/max_allocated (GiB)": 15.63,
10448
+ "ppl": 12.3625,
10449
+ "step": 733,
10450
+ "tokens/total": 10227712,
10451
+ "tokens/train_per_sec_per_gpu": 2745.69,
10452
+ "tokens/trainable": 4230596
10453
+ },
10454
+ {
10455
+ "epoch": 0.35800512132666745,
10456
+ "grad_norm": 0.15779973566532135,
10457
+ "learning_rate": 3.511678567055786e-05,
10458
+ "loss": 2.7565038204193115,
10459
+ "memory/device_reserved (GiB)": 49.08,
10460
+ "memory/max_active (GiB)": 16.51,
10461
+ "memory/max_allocated (GiB)": 16.51,
10462
+ "ppl": 15.7447,
10463
+ "step": 734,
10464
+ "tokens/total": 10241152,
10465
+ "tokens/train_per_sec_per_gpu": 3290.48,
10466
+ "tokens/trainable": 4236327
10467
+ },
10468
+ {
10469
+ "epoch": 0.3584928667235703,
10470
+ "grad_norm": 0.14905805885791779,
10471
+ "learning_rate": 3.487067928293848e-05,
10472
+ "loss": 2.6842727661132812,
10473
+ "memory/device_reserved (GiB)": 49.08,
10474
+ "memory/max_active (GiB)": 16.07,
10475
+ "memory/max_allocated (GiB)": 16.07,
10476
+ "ppl": 14.64755,
10477
+ "step": 735,
10478
+ "tokens/total": 10255232,
10479
+ "tokens/train_per_sec_per_gpu": 3533.74,
10480
+ "tokens/trainable": 4242429
10481
+ },
10482
+ {
10483
+ "epoch": 0.3589806121204731,
10484
+ "grad_norm": 0.1586298942565918,
10485
+ "learning_rate": 3.4625256071074773e-05,
10486
+ "loss": 2.7677407264709473,
10487
+ "memory/device_reserved (GiB)": 49.08,
10488
+ "memory/max_active (GiB)": 16.07,
10489
+ "memory/max_allocated (GiB)": 16.07,
10490
+ "ppl": 15.92262,
10491
+ "step": 736,
10492
+ "tokens/total": 10269056,
10493
+ "tokens/train_per_sec_per_gpu": 2718.39,
10494
+ "tokens/trainable": 4248250
10495
+ },
10496
+ {
10497
+ "epoch": 0.35946835751737594,
10498
+ "grad_norm": 0.14619523286819458,
10499
+ "learning_rate": 3.4380518609340076e-05,
10500
+ "loss": 2.4541380405426025,
10501
+ "memory/device_reserved (GiB)": 49.08,
10502
+ "memory/max_active (GiB)": 16.51,
10503
+ "memory/max_allocated (GiB)": 16.51,
10504
+ "ppl": 11.6364,
10505
+ "step": 737,
10506
+ "tokens/total": 10283520,
10507
+ "tokens/train_per_sec_per_gpu": 1616.46,
10508
+ "tokens/trainable": 4254038
10509
+ },
10510
+ {
10511
+ "epoch": 0.35995610291427876,
10512
+ "grad_norm": 0.1348477602005005,
10513
+ "learning_rate": 3.4136469464914575e-05,
10514
+ "loss": 2.392076015472412,
10515
+ "memory/device_reserved (GiB)": 49.08,
10516
+ "memory/max_active (GiB)": 15.63,
10517
+ "memory/max_allocated (GiB)": 15.63,
10518
+ "ppl": 10.93617,
10519
+ "step": 738,
10520
+ "tokens/total": 10297216,
10521
+ "tokens/train_per_sec_per_gpu": 2858.04,
10522
+ "tokens/trainable": 4260608
10523
+ },
10524
+ {
10525
+ "epoch": 0.36044384831118154,
10526
+ "grad_norm": 0.16576313972473145,
10527
+ "learning_rate": 3.389311119775828e-05,
10528
+ "loss": 2.443544864654541,
10529
+ "memory/device_reserved (GiB)": 49.08,
10530
+ "memory/max_active (GiB)": 16.51,
10531
+ "memory/max_allocated (GiB)": 16.51,
10532
+ "ppl": 11.51378,
10533
+ "step": 739,
10534
+ "tokens/total": 10312576,
10535
+ "tokens/train_per_sec_per_gpu": 2672.63,
10536
+ "tokens/trainable": 4265873
10537
+ },
10538
+ {
10539
+ "epoch": 0.36093159370808436,
10540
+ "grad_norm": 0.1353883296251297,
10541
+ "learning_rate": 3.3650446360584275e-05,
10542
+ "loss": 2.4599642753601074,
10543
+ "memory/device_reserved (GiB)": 49.08,
10544
+ "memory/max_active (GiB)": 16.42,
10545
+ "memory/max_allocated (GiB)": 16.42,
10546
+ "ppl": 11.70439,
10547
+ "step": 740,
10548
+ "tokens/total": 10327552,
10549
+ "tokens/train_per_sec_per_gpu": 2203.07,
10550
+ "tokens/trainable": 4273202
10551
+ },
10552
+ {
10553
+ "epoch": 0.3614193391049872,
10554
+ "grad_norm": 0.12930633127689362,
10555
+ "learning_rate": 3.340847749883191e-05,
10556
+ "loss": 2.54553484916687,
10557
+ "memory/device_reserved (GiB)": 49.08,
10558
+ "memory/max_active (GiB)": 16.51,
10559
+ "memory/max_allocated (GiB)": 16.51,
10560
+ "ppl": 12.75005,
10561
+ "step": 741,
10562
+ "tokens/total": 10342656,
10563
+ "tokens/train_per_sec_per_gpu": 1942.69,
10564
+ "tokens/trainable": 4280837
10565
+ },
10566
+ {
10567
+ "epoch": 0.36190708450189,
10568
+ "grad_norm": 0.15681229531764984,
10569
+ "learning_rate": 3.316720715064e-05,
10570
+ "loss": 2.5012075901031494,
10571
+ "memory/device_reserved (GiB)": 49.08,
10572
+ "memory/max_active (GiB)": 15.98,
10573
+ "memory/max_allocated (GiB)": 15.98,
10574
+ "ppl": 12.19721,
10575
+ "step": 742,
10576
+ "tokens/total": 10355200,
10577
+ "tokens/train_per_sec_per_gpu": 1917.69,
10578
+ "tokens/trainable": 4285994
10579
+ },
10580
+ {
10581
+ "epoch": 0.36239482989879285,
10582
+ "grad_norm": 0.1448935866355896,
10583
+ "learning_rate": 3.292663784682036e-05,
10584
+ "loss": 2.5044198036193848,
10585
+ "memory/device_reserved (GiB)": 49.08,
10586
+ "memory/max_active (GiB)": 16.51,
10587
+ "memory/max_allocated (GiB)": 16.51,
10588
+ "ppl": 12.23646,
10589
+ "step": 743,
10590
+ "tokens/total": 10369280,
10591
+ "tokens/train_per_sec_per_gpu": 1901.75,
10592
+ "tokens/trainable": 4292179
10593
+ },
10594
+ {
10595
+ "epoch": 0.3628825752956956,
10596
+ "grad_norm": 0.15751656889915466,
10597
+ "learning_rate": 3.268677211083109e-05,
10598
+ "loss": 2.60463547706604,
10599
+ "memory/device_reserved (GiB)": 49.08,
10600
+ "memory/max_active (GiB)": 14.74,
10601
+ "memory/max_allocated (GiB)": 14.74,
10602
+ "ppl": 13.52629,
10603
+ "step": 744,
10604
+ "tokens/total": 10382336,
10605
+ "tokens/train_per_sec_per_gpu": 3480.31,
10606
+ "tokens/trainable": 4297990
10607
+ },
10608
+ {
10609
+ "epoch": 0.36337032069259845,
10610
+ "grad_norm": 0.12639038264751434,
10611
+ "learning_rate": 3.2447612458750365e-05,
10612
+ "loss": 2.4025323390960693,
10613
+ "memory/device_reserved (GiB)": 49.08,
10614
+ "memory/max_active (GiB)": 15.53,
10615
+ "memory/max_allocated (GiB)": 15.53,
10616
+ "ppl": 11.05113,
10617
+ "step": 745,
10618
+ "tokens/total": 10396416,
10619
+ "tokens/train_per_sec_per_gpu": 1472.7,
10620
+ "tokens/trainable": 4305977
10621
+ },
10622
+ {
10623
+ "epoch": 0.3638580660895013,
10624
+ "grad_norm": 0.1874016672372818,
10625
+ "learning_rate": 3.2209161399249674e-05,
10626
+ "loss": 2.3843958377838135,
10627
+ "memory/device_reserved (GiB)": 49.08,
10628
+ "memory/max_active (GiB)": 16.51,
10629
+ "memory/max_allocated (GiB)": 16.51,
10630
+ "ppl": 10.8525,
10631
+ "step": 746,
10632
+ "tokens/total": 10410112,
10633
+ "tokens/train_per_sec_per_gpu": 2110.82,
10634
+ "tokens/trainable": 4309531
10635
+ },
10636
+ {
10637
+ "epoch": 0.3643458114864041,
10638
+ "grad_norm": 0.15403391420841217,
10639
+ "learning_rate": 3.197142143356787e-05,
10640
+ "loss": 2.4408397674560547,
10641
+ "memory/device_reserved (GiB)": 49.08,
10642
+ "memory/max_active (GiB)": 15.18,
10643
+ "memory/max_allocated (GiB)": 15.18,
10644
+ "ppl": 11.48268,
10645
+ "step": 747,
10646
+ "tokens/total": 10423936,
10647
+ "tokens/train_per_sec_per_gpu": 891.14,
10648
+ "tokens/trainable": 4314760
10649
+ },
10650
+ {
10651
+ "epoch": 0.36483355688330693,
10652
+ "grad_norm": 0.14246027171611786,
10653
+ "learning_rate": 3.173439505548462e-05,
10654
+ "loss": 2.531158447265625,
10655
+ "memory/device_reserved (GiB)": 49.08,
10656
+ "memory/max_active (GiB)": 15.98,
10657
+ "memory/max_allocated (GiB)": 15.98,
10658
+ "ppl": 12.56806,
10659
+ "step": 748,
10660
+ "tokens/total": 10438016,
10661
+ "tokens/train_per_sec_per_gpu": 2604.33,
10662
+ "tokens/trainable": 4321155
10663
+ },
10664
+ {
10665
+ "epoch": 0.3653213022802097,
10666
+ "grad_norm": 0.18709643185138702,
10667
+ "learning_rate": 3.149808475129452e-05,
10668
+ "loss": 2.3285350799560547,
10669
+ "memory/device_reserved (GiB)": 49.08,
10670
+ "memory/max_active (GiB)": 16.51,
10671
+ "memory/max_allocated (GiB)": 16.51,
10672
+ "ppl": 10.2629,
10673
+ "step": 749,
10674
+ "tokens/total": 10453760,
10675
+ "tokens/train_per_sec_per_gpu": 703.15,
10676
+ "tokens/trainable": 4325157
10677
+ },
10678
+ {
10679
+ "epoch": 0.36580904767711253,
10680
+ "grad_norm": 0.1710476279258728,
10681
+ "learning_rate": 3.126249299978086e-05,
10682
+ "loss": 2.375027656555176,
10683
+ "memory/device_reserved (GiB)": 49.08,
10684
+ "memory/max_active (GiB)": 16.51,
10685
+ "memory/max_allocated (GiB)": 16.51,
10686
+ "ppl": 10.75131,
10687
+ "step": 750,
10688
+ "tokens/total": 10467328,
10689
+ "tokens/train_per_sec_per_gpu": 2769.28,
10690
+ "tokens/trainable": 4329291
10691
+ },
10692
+ {
10693
+ "epoch": 0.36580904767711253,
10694
+ "eval_loss": 2.4998860359191895,
10695
+ "eval_ppl": 12.18111,
10696
+ "eval_runtime": 6.0497,
10697
+ "eval_samples_per_second": 33.06,
10698
+ "eval_steps_per_second": 16.53,
10699
+ "memory/device_reserved (GiB)": 49.08,
10700
+ "memory/max_active (GiB)": 11.76,
10701
+ "memory/max_allocated (GiB)": 11.76,
10702
+ "step": 750
10703
  }
10704
  ],
10705
  "logging_steps": 1,
 
10719
  "attributes": {}
10720
  }
10721
  },
10722
+ "total_flos": 1.287529657836503e+17,
10723
  "train_batch_size": 2,
10724
  "trial_name": null,
10725
  "trial_params": null