mohammadmahdinouri commited on
Commit
3fd64ef
·
verified ·
1 Parent(s): ad296cf

Training in progress, step 58000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:972aa91ec388a1f2f04b57475bbe0ef1d7a488751339adb89aa78c0871d0f22b
3
  size 304481530
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55010ca37211cc6b640c88e9f40807107bec277ebcc5b0b118f1cea15eed44f5
3
  size 304481530
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1f588ba0d0b39a0c0daf2cb6afacca8a7aef1f4bc72fe4409ce0b2281d2e356a
3
  size 402029570
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc364080893bb423d47b8bfaac6a84d534e79aa0580cf54e20a609c7ac276c5b
3
  size 402029570
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:93c5029373839975c8e2ce486239c3c93c8bcc84856a9726f25e6b39e80d4bdb
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c37c5923f3d68f847ed300ddb34aea7ac5e2328c7df69f2be7f755bc9e45036
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ba86940b99fa7512a6bd263e7bdaf7ba94fc8e695324bdfda4c03882f64aa78d
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b439560e2350d72b3dd331a1a8b64962c6b47e1a1078857e970f6226f8e52122
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cf8627c515e0a9fd4095a16f3cf6f960eebbddd06bd5667ffafe332a0150e802
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d72e4bdd3428ede798be981d61f831c294e4c1f306f292cd3880bbf3dd42566d
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:55937ecae83bb1b9ebb2721682f64ea1aca1aefba9e61d245b7d516977f878f9
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:13a06ce0cb98db3e26ada8fd779ab287dc006dddd3604ca6c762fd20a85c4365
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:039c09879ba9a48ef7918776fd751a67234de8e6a37518ae707982e7427ed8c9
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:703b87170b0696f3b2a83c775117cb6a49f63bf8b6fd7a85d19b5f6decf028d6
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.08443493769590386,
6
  "eval_steps": 500,
7
- "global_step": 57000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -19958,6 +19958,356 @@
19958
  "learning_rate": 0.00048604775555037517,
19959
  "loss": 16.9712,
19960
  "step": 57000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19961
  }
19962
  ],
19963
  "logging_steps": 20,
@@ -19977,7 +20327,7 @@
19977
  "attributes": {}
19978
  }
19979
  },
19980
- "total_flos": 4.190665024641329e+19,
19981
  "train_batch_size": 48,
19982
  "trial_name": null,
19983
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.08591625239232323,
6
  "eval_steps": 500,
7
+ "global_step": 58000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
19958
  "learning_rate": 0.00048604775555037517,
19959
  "loss": 16.9712,
19960
  "step": 57000
19961
+ },
19962
+ {
19963
+ "epoch": 0.08446456398983225,
19964
+ "grad_norm": 7.34375,
19965
+ "learning_rate": 0.00048604281661536756,
19966
+ "loss": 17.056,
19967
+ "step": 57020
19968
+ },
19969
+ {
19970
+ "epoch": 0.08449419028376064,
19971
+ "grad_norm": 7.4375,
19972
+ "learning_rate": 0.00048603787768036007,
19973
+ "loss": 17.0091,
19974
+ "step": 57040
19975
+ },
19976
+ {
19977
+ "epoch": 0.08452381657768904,
19978
+ "grad_norm": 6.78125,
19979
+ "learning_rate": 0.00048603293874535246,
19980
+ "loss": 16.9849,
19981
+ "step": 57060
19982
+ },
19983
+ {
19984
+ "epoch": 0.08455344287161742,
19985
+ "grad_norm": 7.40625,
19986
+ "learning_rate": 0.0004860279998103449,
19987
+ "loss": 17.0407,
19988
+ "step": 57080
19989
+ },
19990
+ {
19991
+ "epoch": 0.08458306916554581,
19992
+ "grad_norm": 7.21875,
19993
+ "learning_rate": 0.0004860230608753373,
19994
+ "loss": 17.0114,
19995
+ "step": 57100
19996
+ },
19997
+ {
19998
+ "epoch": 0.0846126954594742,
19999
+ "grad_norm": 7.0,
20000
+ "learning_rate": 0.0004860181219403298,
20001
+ "loss": 17.0502,
20002
+ "step": 57120
20003
+ },
20004
+ {
20005
+ "epoch": 0.08464232175340258,
20006
+ "grad_norm": 7.4375,
20007
+ "learning_rate": 0.0004860131830053222,
20008
+ "loss": 17.044,
20009
+ "step": 57140
20010
+ },
20011
+ {
20012
+ "epoch": 0.08467194804733097,
20013
+ "grad_norm": 6.5625,
20014
+ "learning_rate": 0.00048600824407031464,
20015
+ "loss": 17.0582,
20016
+ "step": 57160
20017
+ },
20018
+ {
20019
+ "epoch": 0.08470157434125936,
20020
+ "grad_norm": 7.59375,
20021
+ "learning_rate": 0.00048600330513530704,
20022
+ "loss": 17.0316,
20023
+ "step": 57180
20024
+ },
20025
+ {
20026
+ "epoch": 0.08473120063518774,
20027
+ "grad_norm": 7.25,
20028
+ "learning_rate": 0.00048599836620029954,
20029
+ "loss": 17.0576,
20030
+ "step": 57200
20031
+ },
20032
+ {
20033
+ "epoch": 0.08476082692911613,
20034
+ "grad_norm": 7.6875,
20035
+ "learning_rate": 0.00048599342726529193,
20036
+ "loss": 17.0589,
20037
+ "step": 57220
20038
+ },
20039
+ {
20040
+ "epoch": 0.08479045322304452,
20041
+ "grad_norm": 6.46875,
20042
+ "learning_rate": 0.0004859884883302844,
20043
+ "loss": 16.9831,
20044
+ "step": 57240
20045
+ },
20046
+ {
20047
+ "epoch": 0.0848200795169729,
20048
+ "grad_norm": 6.125,
20049
+ "learning_rate": 0.0004859835493952768,
20050
+ "loss": 16.9896,
20051
+ "step": 57260
20052
+ },
20053
+ {
20054
+ "epoch": 0.08484970581090129,
20055
+ "grad_norm": 6.6875,
20056
+ "learning_rate": 0.0004859786104602693,
20057
+ "loss": 16.9824,
20058
+ "step": 57280
20059
+ },
20060
+ {
20061
+ "epoch": 0.08487933210482967,
20062
+ "grad_norm": 8.25,
20063
+ "learning_rate": 0.00048597367152526167,
20064
+ "loss": 16.9623,
20065
+ "step": 57300
20066
+ },
20067
+ {
20068
+ "epoch": 0.08490895839875806,
20069
+ "grad_norm": 7.03125,
20070
+ "learning_rate": 0.0004859687325902541,
20071
+ "loss": 16.9834,
20072
+ "step": 57320
20073
+ },
20074
+ {
20075
+ "epoch": 0.08493858469268645,
20076
+ "grad_norm": 6.53125,
20077
+ "learning_rate": 0.00048596379365524657,
20078
+ "loss": 16.9729,
20079
+ "step": 57340
20080
+ },
20081
+ {
20082
+ "epoch": 0.08496821098661483,
20083
+ "grad_norm": 6.21875,
20084
+ "learning_rate": 0.00048595885472023896,
20085
+ "loss": 17.0528,
20086
+ "step": 57360
20087
+ },
20088
+ {
20089
+ "epoch": 0.08499783728054323,
20090
+ "grad_norm": 7.15625,
20091
+ "learning_rate": 0.0004859539157852314,
20092
+ "loss": 17.0257,
20093
+ "step": 57380
20094
+ },
20095
+ {
20096
+ "epoch": 0.08502746357447162,
20097
+ "grad_norm": 7.09375,
20098
+ "learning_rate": 0.0004859489768502238,
20099
+ "loss": 16.8887,
20100
+ "step": 57400
20101
+ },
20102
+ {
20103
+ "epoch": 0.08505708986840001,
20104
+ "grad_norm": 7.4375,
20105
+ "learning_rate": 0.0004859440379152163,
20106
+ "loss": 17.0345,
20107
+ "step": 57420
20108
+ },
20109
+ {
20110
+ "epoch": 0.0850867161623284,
20111
+ "grad_norm": 6.84375,
20112
+ "learning_rate": 0.0004859390989802087,
20113
+ "loss": 16.9734,
20114
+ "step": 57440
20115
+ },
20116
+ {
20117
+ "epoch": 0.08511634245625678,
20118
+ "grad_norm": 6.875,
20119
+ "learning_rate": 0.00048593416004520114,
20120
+ "loss": 17.0113,
20121
+ "step": 57460
20122
+ },
20123
+ {
20124
+ "epoch": 0.08514596875018517,
20125
+ "grad_norm": 7.21875,
20126
+ "learning_rate": 0.00048592922111019354,
20127
+ "loss": 16.9691,
20128
+ "step": 57480
20129
+ },
20130
+ {
20131
+ "epoch": 0.08517559504411355,
20132
+ "grad_norm": 7.0625,
20133
+ "learning_rate": 0.00048592428217518604,
20134
+ "loss": 17.0288,
20135
+ "step": 57500
20136
+ },
20137
+ {
20138
+ "epoch": 0.08520522133804194,
20139
+ "grad_norm": 6.78125,
20140
+ "learning_rate": 0.00048591934324017843,
20141
+ "loss": 17.0317,
20142
+ "step": 57520
20143
+ },
20144
+ {
20145
+ "epoch": 0.08523484763197033,
20146
+ "grad_norm": 7.0625,
20147
+ "learning_rate": 0.0004859144043051709,
20148
+ "loss": 17.0653,
20149
+ "step": 57540
20150
+ },
20151
+ {
20152
+ "epoch": 0.08526447392589871,
20153
+ "grad_norm": 6.375,
20154
+ "learning_rate": 0.0004859094653701633,
20155
+ "loss": 17.0473,
20156
+ "step": 57560
20157
+ },
20158
+ {
20159
+ "epoch": 0.0852941002198271,
20160
+ "grad_norm": 7.375,
20161
+ "learning_rate": 0.0004859045264351558,
20162
+ "loss": 16.9975,
20163
+ "step": 57580
20164
+ },
20165
+ {
20166
+ "epoch": 0.08532372651375549,
20167
+ "grad_norm": 7.15625,
20168
+ "learning_rate": 0.00048589958750014817,
20169
+ "loss": 16.9854,
20170
+ "step": 57600
20171
+ },
20172
+ {
20173
+ "epoch": 0.08535335280768387,
20174
+ "grad_norm": 6.4375,
20175
+ "learning_rate": 0.0004858946485651406,
20176
+ "loss": 16.9975,
20177
+ "step": 57620
20178
+ },
20179
+ {
20180
+ "epoch": 0.08538297910161226,
20181
+ "grad_norm": 6.625,
20182
+ "learning_rate": 0.00048588970963013307,
20183
+ "loss": 17.0178,
20184
+ "step": 57640
20185
+ },
20186
+ {
20187
+ "epoch": 0.08541260539554064,
20188
+ "grad_norm": 7.3125,
20189
+ "learning_rate": 0.0004858847706951255,
20190
+ "loss": 16.9128,
20191
+ "step": 57660
20192
+ },
20193
+ {
20194
+ "epoch": 0.08544223168946903,
20195
+ "grad_norm": 7.5,
20196
+ "learning_rate": 0.0004858798317601179,
20197
+ "loss": 17.0329,
20198
+ "step": 57680
20199
+ },
20200
+ {
20201
+ "epoch": 0.08547185798339743,
20202
+ "grad_norm": 7.15625,
20203
+ "learning_rate": 0.0004858748928251103,
20204
+ "loss": 17.0416,
20205
+ "step": 57700
20206
+ },
20207
+ {
20208
+ "epoch": 0.08550148427732582,
20209
+ "grad_norm": 7.34375,
20210
+ "learning_rate": 0.0004858699538901028,
20211
+ "loss": 17.0423,
20212
+ "step": 57720
20213
+ },
20214
+ {
20215
+ "epoch": 0.0855311105712542,
20216
+ "grad_norm": 6.21875,
20217
+ "learning_rate": 0.0004858650149550952,
20218
+ "loss": 16.9665,
20219
+ "step": 57740
20220
+ },
20221
+ {
20222
+ "epoch": 0.08556073686518259,
20223
+ "grad_norm": 6.9375,
20224
+ "learning_rate": 0.00048586007602008765,
20225
+ "loss": 16.9616,
20226
+ "step": 57760
20227
+ },
20228
+ {
20229
+ "epoch": 0.08559036315911098,
20230
+ "grad_norm": 7.3125,
20231
+ "learning_rate": 0.00048585513708508004,
20232
+ "loss": 16.9745,
20233
+ "step": 57780
20234
+ },
20235
+ {
20236
+ "epoch": 0.08561998945303936,
20237
+ "grad_norm": 6.90625,
20238
+ "learning_rate": 0.00048585019815007254,
20239
+ "loss": 16.9302,
20240
+ "step": 57800
20241
+ },
20242
+ {
20243
+ "epoch": 0.08564961574696775,
20244
+ "grad_norm": 7.125,
20245
+ "learning_rate": 0.00048584525921506493,
20246
+ "loss": 16.9713,
20247
+ "step": 57820
20248
+ },
20249
+ {
20250
+ "epoch": 0.08567924204089614,
20251
+ "grad_norm": 6.59375,
20252
+ "learning_rate": 0.0004858403202800574,
20253
+ "loss": 16.9939,
20254
+ "step": 57840
20255
+ },
20256
+ {
20257
+ "epoch": 0.08570886833482452,
20258
+ "grad_norm": 6.78125,
20259
+ "learning_rate": 0.0004858353813450498,
20260
+ "loss": 16.9307,
20261
+ "step": 57860
20262
+ },
20263
+ {
20264
+ "epoch": 0.08573849462875291,
20265
+ "grad_norm": 7.125,
20266
+ "learning_rate": 0.0004858304424100423,
20267
+ "loss": 17.044,
20268
+ "step": 57880
20269
+ },
20270
+ {
20271
+ "epoch": 0.0857681209226813,
20272
+ "grad_norm": 7.25,
20273
+ "learning_rate": 0.00048582550347503467,
20274
+ "loss": 17.0308,
20275
+ "step": 57900
20276
+ },
20277
+ {
20278
+ "epoch": 0.08579774721660968,
20279
+ "grad_norm": 6.96875,
20280
+ "learning_rate": 0.0004858205645400271,
20281
+ "loss": 17.0287,
20282
+ "step": 57920
20283
+ },
20284
+ {
20285
+ "epoch": 0.08582737351053807,
20286
+ "grad_norm": 6.9375,
20287
+ "learning_rate": 0.00048581562560501957,
20288
+ "loss": 17.018,
20289
+ "step": 57940
20290
+ },
20291
+ {
20292
+ "epoch": 0.08585699980446646,
20293
+ "grad_norm": 7.28125,
20294
+ "learning_rate": 0.000485810686670012,
20295
+ "loss": 16.9714,
20296
+ "step": 57960
20297
+ },
20298
+ {
20299
+ "epoch": 0.08588662609839484,
20300
+ "grad_norm": 6.875,
20301
+ "learning_rate": 0.0004858057477350044,
20302
+ "loss": 16.9836,
20303
+ "step": 57980
20304
+ },
20305
+ {
20306
+ "epoch": 0.08591625239232323,
20307
+ "grad_norm": 8.125,
20308
+ "learning_rate": 0.00048580080879999686,
20309
+ "loss": 16.9677,
20310
+ "step": 58000
20311
  }
20312
  ],
20313
  "logging_steps": 20,
 
20327
  "attributes": {}
20328
  }
20329
  },
20330
+ "total_flos": 4.264201071083966e+19,
20331
  "train_batch_size": 48,
20332
  "trial_name": null,
20333
  "trial_params": null