CocoRoF commited on
Commit
f286da2
·
verified ·
1 Parent(s): 0ef8f64

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c03dfe3ff98720b641d5b3253f189443475f90c5848bfce1ee42b4e25e9a06d9
3
  size 737580392
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:92c7c2466e9547634a505ccaf2590f9e4d9d15d2f31d94aa4c0cfe5f155dc10b
3
  size 737580392
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:781ca001e4eef0894d5dc0a043ec1d7414e5f687b44a3bb27578a66df794e142
3
  size 1475248442
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:05b07ba25a847922c59fe9c0ee222039fd2b55eb27e7164ec80572760094d906
3
  size 1475248442
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cd02421b7ec256714ec03c37d51589e92544068eeda4bae107d407e8dfd0cb9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b8a0379bdd10765d4926325b17779ba084884beedfbdf271680e1d1bd136b43
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8785b8509dc9a197581e45af973f623b343ec6de3eb0eeab89b29a64ed0e10d5
3
  size 1000
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea8c56402fe28ab6610db127ee707a0d7bbb7e8371ebb7f77b59566a41c7f5ef
3
  size 1000
last-checkpoint/trainer_state.json CHANGED
@@ -1,9 +1,9 @@
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
- "epoch": 1.640112464854733,
5
  "eval_steps": 100,
6
- "global_step": 3500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
@@ -3017,6 +3017,436 @@
3017
  "eval_spearman_manhattan": 0.8222685344671697,
3018
  "eval_steps_per_second": 15.727,
3019
  "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3020
  }
3021
  ],
3022
  "logging_steps": 10,
 
1
  {
2
  "best_metric": null,
3
  "best_model_checkpoint": null,
4
+ "epoch": 1.8744142455482662,
5
  "eval_steps": 100,
6
+ "global_step": 4000,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
 
3017
  "eval_spearman_manhattan": 0.8222685344671697,
3018
  "eval_steps_per_second": 15.727,
3019
  "step": 3500
3020
+ },
3021
+ {
3022
+ "epoch": 1.6447985004686037,
3023
+ "grad_norm": 0.9714635610580444,
3024
+ "learning_rate": 4.486000468603562e-05,
3025
+ "loss": 0.1841,
3026
+ "step": 3510
3027
+ },
3028
+ {
3029
+ "epoch": 1.6494845360824741,
3030
+ "grad_norm": 1.4874918460845947,
3031
+ "learning_rate": 4.484536082474227e-05,
3032
+ "loss": 0.1833,
3033
+ "step": 3520
3034
+ },
3035
+ {
3036
+ "epoch": 1.6541705716963448,
3037
+ "grad_norm": 1.3226478099822998,
3038
+ "learning_rate": 4.4830716963448926e-05,
3039
+ "loss": 0.1946,
3040
+ "step": 3530
3041
+ },
3042
+ {
3043
+ "epoch": 1.6588566073102156,
3044
+ "grad_norm": 1.6590946912765503,
3045
+ "learning_rate": 4.481607310215558e-05,
3046
+ "loss": 0.192,
3047
+ "step": 3540
3048
+ },
3049
+ {
3050
+ "epoch": 1.6635426429240863,
3051
+ "grad_norm": 1.8779162168502808,
3052
+ "learning_rate": 4.480142924086223e-05,
3053
+ "loss": 0.1841,
3054
+ "step": 3550
3055
+ },
3056
+ {
3057
+ "epoch": 1.668228678537957,
3058
+ "grad_norm": 1.6211146116256714,
3059
+ "learning_rate": 4.478678537956888e-05,
3060
+ "loss": 0.1943,
3061
+ "step": 3560
3062
+ },
3063
+ {
3064
+ "epoch": 1.6729147141518275,
3065
+ "grad_norm": 1.4703700542449951,
3066
+ "learning_rate": 4.477214151827554e-05,
3067
+ "loss": 0.1728,
3068
+ "step": 3570
3069
+ },
3070
+ {
3071
+ "epoch": 1.6776007497656982,
3072
+ "grad_norm": 1.689462661743164,
3073
+ "learning_rate": 4.47574976569822e-05,
3074
+ "loss": 0.1857,
3075
+ "step": 3580
3076
+ },
3077
+ {
3078
+ "epoch": 1.6822867853795689,
3079
+ "grad_norm": 1.7197692394256592,
3080
+ "learning_rate": 4.474285379568885e-05,
3081
+ "loss": 0.2084,
3082
+ "step": 3590
3083
+ },
3084
+ {
3085
+ "epoch": 1.6869728209934396,
3086
+ "grad_norm": 1.2906155586242676,
3087
+ "learning_rate": 4.4728209934395506e-05,
3088
+ "loss": 0.1982,
3089
+ "step": 3600
3090
+ },
3091
+ {
3092
+ "epoch": 1.6869728209934396,
3093
+ "eval_loss": 0.03799282759428024,
3094
+ "eval_pearson_cosine": 0.8220264449208883,
3095
+ "eval_pearson_dot": 0.7551673016916052,
3096
+ "eval_pearson_euclidean": 0.818234030509938,
3097
+ "eval_pearson_manhattan": 0.8196075002869208,
3098
+ "eval_runtime": 5.9912,
3099
+ "eval_samples_per_second": 250.368,
3100
+ "eval_spearman_cosine": 0.8233208131275441,
3101
+ "eval_spearman_dot": 0.7535137341757772,
3102
+ "eval_spearman_euclidean": 0.8248668260426036,
3103
+ "eval_spearman_manhattan": 0.825693429330301,
3104
+ "eval_steps_per_second": 15.69,
3105
+ "step": 3600
3106
+ },
3107
+ {
3108
+ "epoch": 1.6916588566073103,
3109
+ "grad_norm": 1.595618724822998,
3110
+ "learning_rate": 4.471356607310216e-05,
3111
+ "loss": 0.1828,
3112
+ "step": 3610
3113
+ },
3114
+ {
3115
+ "epoch": 1.6963448922211808,
3116
+ "grad_norm": 1.280032753944397,
3117
+ "learning_rate": 4.4698922211808815e-05,
3118
+ "loss": 0.1748,
3119
+ "step": 3620
3120
+ },
3121
+ {
3122
+ "epoch": 1.7010309278350515,
3123
+ "grad_norm": 1.2117244005203247,
3124
+ "learning_rate": 4.468427835051547e-05,
3125
+ "loss": 0.1758,
3126
+ "step": 3630
3127
+ },
3128
+ {
3129
+ "epoch": 1.7057169634489222,
3130
+ "grad_norm": 1.7422757148742676,
3131
+ "learning_rate": 4.466963448922212e-05,
3132
+ "loss": 0.17,
3133
+ "step": 3640
3134
+ },
3135
+ {
3136
+ "epoch": 1.710402999062793,
3137
+ "grad_norm": 1.6089109182357788,
3138
+ "learning_rate": 4.465499062792877e-05,
3139
+ "loss": 0.1745,
3140
+ "step": 3650
3141
+ },
3142
+ {
3143
+ "epoch": 1.7150890346766636,
3144
+ "grad_norm": 1.664401650428772,
3145
+ "learning_rate": 4.4640346766635425e-05,
3146
+ "loss": 0.153,
3147
+ "step": 3660
3148
+ },
3149
+ {
3150
+ "epoch": 1.7197750702905341,
3151
+ "grad_norm": 1.5395989418029785,
3152
+ "learning_rate": 4.462570290534208e-05,
3153
+ "loss": 0.2049,
3154
+ "step": 3670
3155
+ },
3156
+ {
3157
+ "epoch": 1.7244611059044048,
3158
+ "grad_norm": 1.8465495109558105,
3159
+ "learning_rate": 4.4611059044048734e-05,
3160
+ "loss": 0.208,
3161
+ "step": 3680
3162
+ },
3163
+ {
3164
+ "epoch": 1.7291471415182755,
3165
+ "grad_norm": 1.6127640008926392,
3166
+ "learning_rate": 4.459641518275539e-05,
3167
+ "loss": 0.1732,
3168
+ "step": 3690
3169
+ },
3170
+ {
3171
+ "epoch": 1.7338331771321462,
3172
+ "grad_norm": 1.2604060173034668,
3173
+ "learning_rate": 4.458177132146205e-05,
3174
+ "loss": 0.1824,
3175
+ "step": 3700
3176
+ },
3177
+ {
3178
+ "epoch": 1.7338331771321462,
3179
+ "eval_loss": 0.03523562103509903,
3180
+ "eval_pearson_cosine": 0.8246188621544412,
3181
+ "eval_pearson_dot": 0.7566728482844667,
3182
+ "eval_pearson_euclidean": 0.816607137727027,
3183
+ "eval_pearson_manhattan": 0.8181353811719987,
3184
+ "eval_runtime": 6.4257,
3185
+ "eval_samples_per_second": 233.438,
3186
+ "eval_spearman_cosine": 0.8251725549021353,
3187
+ "eval_spearman_dot": 0.7553705266499796,
3188
+ "eval_spearman_euclidean": 0.8232824851511874,
3189
+ "eval_spearman_manhattan": 0.8242087634144919,
3190
+ "eval_steps_per_second": 14.629,
3191
+ "step": 3700
3192
+ },
3193
+ {
3194
+ "epoch": 1.738519212746017,
3195
+ "grad_norm": 1.4244945049285889,
3196
+ "learning_rate": 4.45671274601687e-05,
3197
+ "loss": 0.1704,
3198
+ "step": 3710
3199
+ },
3200
+ {
3201
+ "epoch": 1.7432052483598874,
3202
+ "grad_norm": 1.5206220149993896,
3203
+ "learning_rate": 4.455248359887536e-05,
3204
+ "loss": 0.1921,
3205
+ "step": 3720
3206
+ },
3207
+ {
3208
+ "epoch": 1.7478912839737581,
3209
+ "grad_norm": 1.596449375152588,
3210
+ "learning_rate": 4.453783973758201e-05,
3211
+ "loss": 0.1714,
3212
+ "step": 3730
3213
+ },
3214
+ {
3215
+ "epoch": 1.7525773195876289,
3216
+ "grad_norm": 1.6774275302886963,
3217
+ "learning_rate": 4.452319587628866e-05,
3218
+ "loss": 0.1889,
3219
+ "step": 3740
3220
+ },
3221
+ {
3222
+ "epoch": 1.7572633552014996,
3223
+ "grad_norm": 1.4467116594314575,
3224
+ "learning_rate": 4.4508552014995314e-05,
3225
+ "loss": 0.2031,
3226
+ "step": 3750
3227
+ },
3228
+ {
3229
+ "epoch": 1.7619493908153703,
3230
+ "grad_norm": 1.722493290901184,
3231
+ "learning_rate": 4.449390815370197e-05,
3232
+ "loss": 0.1931,
3233
+ "step": 3760
3234
+ },
3235
+ {
3236
+ "epoch": 1.7666354264292408,
3237
+ "grad_norm": 1.8555185794830322,
3238
+ "learning_rate": 4.447926429240862e-05,
3239
+ "loss": 0.211,
3240
+ "step": 3770
3241
+ },
3242
+ {
3243
+ "epoch": 1.7713214620431117,
3244
+ "grad_norm": 1.5005462169647217,
3245
+ "learning_rate": 4.4464620431115277e-05,
3246
+ "loss": 0.1939,
3247
+ "step": 3780
3248
+ },
3249
+ {
3250
+ "epoch": 1.7760074976569822,
3251
+ "grad_norm": 1.88517165184021,
3252
+ "learning_rate": 4.444997656982193e-05,
3253
+ "loss": 0.1868,
3254
+ "step": 3790
3255
+ },
3256
+ {
3257
+ "epoch": 1.780693533270853,
3258
+ "grad_norm": 1.7735782861709595,
3259
+ "learning_rate": 4.4435332708528585e-05,
3260
+ "loss": 0.2009,
3261
+ "step": 3800
3262
+ },
3263
+ {
3264
+ "epoch": 1.780693533270853,
3265
+ "eval_loss": 0.03577423095703125,
3266
+ "eval_pearson_cosine": 0.8269622117688868,
3267
+ "eval_pearson_dot": 0.7668511969090961,
3268
+ "eval_pearson_euclidean": 0.8089875954267027,
3269
+ "eval_pearson_manhattan": 0.8104624503351374,
3270
+ "eval_runtime": 6.3923,
3271
+ "eval_samples_per_second": 234.659,
3272
+ "eval_spearman_cosine": 0.8277880382919593,
3273
+ "eval_spearman_dot": 0.7654651690867051,
3274
+ "eval_spearman_euclidean": 0.8164441074213089,
3275
+ "eval_spearman_manhattan": 0.8181458418663312,
3276
+ "eval_steps_per_second": 14.705,
3277
+ "step": 3800
3278
+ },
3279
+ {
3280
+ "epoch": 1.7853795688847236,
3281
+ "grad_norm": 1.697729229927063,
3282
+ "learning_rate": 4.442068884723524e-05,
3283
+ "loss": 0.1902,
3284
+ "step": 3810
3285
+ },
3286
+ {
3287
+ "epoch": 1.790065604498594,
3288
+ "grad_norm": 1.3950953483581543,
3289
+ "learning_rate": 4.44060449859419e-05,
3290
+ "loss": 0.1937,
3291
+ "step": 3820
3292
+ },
3293
+ {
3294
+ "epoch": 1.794751640112465,
3295
+ "grad_norm": 2.385718584060669,
3296
+ "learning_rate": 4.4391401124648555e-05,
3297
+ "loss": 0.2259,
3298
+ "step": 3830
3299
+ },
3300
+ {
3301
+ "epoch": 1.7994376757263355,
3302
+ "grad_norm": 1.0628514289855957,
3303
+ "learning_rate": 4.43767572633552e-05,
3304
+ "loss": 0.1611,
3305
+ "step": 3840
3306
+ },
3307
+ {
3308
+ "epoch": 1.8041237113402062,
3309
+ "grad_norm": 1.6918362379074097,
3310
+ "learning_rate": 4.4362113402061856e-05,
3311
+ "loss": 0.1767,
3312
+ "step": 3850
3313
+ },
3314
+ {
3315
+ "epoch": 1.808809746954077,
3316
+ "grad_norm": 1.2459661960601807,
3317
+ "learning_rate": 4.434746954076851e-05,
3318
+ "loss": 0.1477,
3319
+ "step": 3860
3320
+ },
3321
+ {
3322
+ "epoch": 1.8134957825679474,
3323
+ "grad_norm": 1.2925353050231934,
3324
+ "learning_rate": 4.4332825679475165e-05,
3325
+ "loss": 0.1848,
3326
+ "step": 3870
3327
+ },
3328
+ {
3329
+ "epoch": 1.8181818181818183,
3330
+ "grad_norm": 1.3787219524383545,
3331
+ "learning_rate": 4.431818181818182e-05,
3332
+ "loss": 0.2216,
3333
+ "step": 3880
3334
+ },
3335
+ {
3336
+ "epoch": 1.8228678537956888,
3337
+ "grad_norm": 1.531141996383667,
3338
+ "learning_rate": 4.4303537956888473e-05,
3339
+ "loss": 0.2087,
3340
+ "step": 3890
3341
+ },
3342
+ {
3343
+ "epoch": 1.8275538894095595,
3344
+ "grad_norm": 1.4053128957748413,
3345
+ "learning_rate": 4.428889409559513e-05,
3346
+ "loss": 0.1899,
3347
+ "step": 3900
3348
+ },
3349
+ {
3350
+ "epoch": 1.8275538894095595,
3351
+ "eval_loss": 0.03851017728447914,
3352
+ "eval_pearson_cosine": 0.8239765523882259,
3353
+ "eval_pearson_dot": 0.7418402003946696,
3354
+ "eval_pearson_euclidean": 0.8110650670007509,
3355
+ "eval_pearson_manhattan": 0.8133011255650899,
3356
+ "eval_runtime": 6.2641,
3357
+ "eval_samples_per_second": 239.46,
3358
+ "eval_spearman_cosine": 0.8252211874177356,
3359
+ "eval_spearman_dot": 0.7383247864885149,
3360
+ "eval_spearman_euclidean": 0.8180097764566381,
3361
+ "eval_spearman_manhattan": 0.8202301926079525,
3362
+ "eval_steps_per_second": 15.006,
3363
+ "step": 3900
3364
+ },
3365
+ {
3366
+ "epoch": 1.8322399250234302,
3367
+ "grad_norm": 1.564635157585144,
3368
+ "learning_rate": 4.427425023430178e-05,
3369
+ "loss": 0.1912,
3370
+ "step": 3910
3371
+ },
3372
+ {
3373
+ "epoch": 1.8369259606373007,
3374
+ "grad_norm": 1.525820255279541,
3375
+ "learning_rate": 4.4259606373008436e-05,
3376
+ "loss": 0.1725,
3377
+ "step": 3920
3378
+ },
3379
+ {
3380
+ "epoch": 1.8416119962511717,
3381
+ "grad_norm": 1.6183199882507324,
3382
+ "learning_rate": 4.424496251171509e-05,
3383
+ "loss": 0.1771,
3384
+ "step": 3930
3385
+ },
3386
+ {
3387
+ "epoch": 1.8462980318650422,
3388
+ "grad_norm": 1.614140510559082,
3389
+ "learning_rate": 4.4230318650421745e-05,
3390
+ "loss": 0.2085,
3391
+ "step": 3940
3392
+ },
3393
+ {
3394
+ "epoch": 1.8509840674789129,
3395
+ "grad_norm": 1.6187098026275635,
3396
+ "learning_rate": 4.42156747891284e-05,
3397
+ "loss": 0.1841,
3398
+ "step": 3950
3399
+ },
3400
+ {
3401
+ "epoch": 1.8556701030927836,
3402
+ "grad_norm": 1.9337131977081299,
3403
+ "learning_rate": 4.4201030927835053e-05,
3404
+ "loss": 0.1814,
3405
+ "step": 3960
3406
+ },
3407
+ {
3408
+ "epoch": 1.860356138706654,
3409
+ "grad_norm": 1.4200247526168823,
3410
+ "learning_rate": 4.418638706654171e-05,
3411
+ "loss": 0.2138,
3412
+ "step": 3970
3413
+ },
3414
+ {
3415
+ "epoch": 1.865042174320525,
3416
+ "grad_norm": 1.2319873571395874,
3417
+ "learning_rate": 4.417174320524836e-05,
3418
+ "loss": 0.1811,
3419
+ "step": 3980
3420
+ },
3421
+ {
3422
+ "epoch": 1.8697282099343955,
3423
+ "grad_norm": 1.5090093612670898,
3424
+ "learning_rate": 4.4157099343955016e-05,
3425
+ "loss": 0.2013,
3426
+ "step": 3990
3427
+ },
3428
+ {
3429
+ "epoch": 1.8744142455482662,
3430
+ "grad_norm": 1.4747837781906128,
3431
+ "learning_rate": 4.414245548266167e-05,
3432
+ "loss": 0.1858,
3433
+ "step": 4000
3434
+ },
3435
+ {
3436
+ "epoch": 1.8744142455482662,
3437
+ "eval_loss": 0.03368546813726425,
3438
+ "eval_pearson_cosine": 0.8281146927252365,
3439
+ "eval_pearson_dot": 0.761976509126896,
3440
+ "eval_pearson_euclidean": 0.810243447652832,
3441
+ "eval_pearson_manhattan": 0.8121517436135477,
3442
+ "eval_runtime": 7.0014,
3443
+ "eval_samples_per_second": 214.242,
3444
+ "eval_spearman_cosine": 0.8273786901092443,
3445
+ "eval_spearman_dot": 0.7590219133940205,
3446
+ "eval_spearman_euclidean": 0.8180153411946929,
3447
+ "eval_spearman_manhattan": 0.8198041702608989,
3448
+ "eval_steps_per_second": 13.426,
3449
+ "step": 4000
3450
  }
3451
  ],
3452
  "logging_steps": 10,