mohammadmahdinouri commited on
Commit
8a814fd
·
verified ·
1 Parent(s): f07e655

Training in progress, step 12000, checkpoint

Browse files
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0be33c01dee181e33e061344f496bf9f0f8254d9c952504e3f2eff2fd927a507
3
  size 319352826
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fe4d66c1c45da3d7c1f12fac8784e3a9ed3f9c80aaaeb6d876b0d9ad695ad6ac
3
  size 319352826
last-checkpoint/pytorch_model.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:57767e1e2df2301a0767b5a50453dc690b5bf04bbb49968d143ddd3df541b903
3
  size 900372486
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:037cbabeb904fe4bc6ddc6347802807d0fa4aee83b97d95773c6ab9e0710b4c5
3
  size 900372486
last-checkpoint/rng_state_0.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2a391d6914515f000a01afb4a1a2c5b3509b792b87df9eab51940ca46c8bfa01
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f66d2cdb2727898aa57d1805f0959aa2d5b4d00101c612d89b884ee5f99403d
3
  size 14960
last-checkpoint/rng_state_1.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:809828103d06d4ed101dc133013fb67d73e4aacc0a915f197e310cb2de50091a
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1c45c8627a787157525e0441f74e851dbeed0484ecee31ac2a6d27081cbd4784
3
  size 14960
last-checkpoint/rng_state_2.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3996102cde4d1f082ab988babe46130ecc500251a1ac64fab95d9f2050cfec9
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:668b78f9e33a1b391d4ae9971e15c64123216cc17e0701f0c88ed161b79d182b
3
  size 14960
last-checkpoint/rng_state_3.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4427db14e889501b71e194951d2916ae6e318332ca869cabe18801e4985a8472
3
  size 14960
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a22c853237523fba49f3eb78490cfffdd83e225ddbf7f5581f622db7e7b1ed5
3
  size 14960
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e52eba74f68ff2286d13be680b4b69d2294410dac3c4f995d7813f0299e7993b
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:75ee9351d0b9f6c92b7d23f6bec3e5476b4b37acddcbc9d54675b0b4f4ee65b1
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.016294461660613026,
6
  "eval_steps": 500,
7
- "global_step": 11000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3858,6 +3858,356 @@
3858
  "learning_rate": 4.97407306067778e-05,
3859
  "loss": 25.0612,
3860
  "step": 11000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3861
  }
3862
  ],
3863
  "logging_steps": 20,
@@ -3877,7 +4227,7 @@
3877
  "attributes": {}
3878
  }
3879
  },
3880
- "total_flos": 2.0352843215280275e+19,
3881
  "train_batch_size": 48,
3882
  "trial_name": null,
3883
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.017775776357032393,
6
  "eval_steps": 500,
7
+ "global_step": 12000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3858
  "learning_rate": 4.97407306067778e-05,
3859
  "loss": 25.0612,
3860
  "step": 11000
3861
+ },
3862
+ {
3863
+ "epoch": 0.016324087954541416,
3864
+ "grad_norm": 8.875,
3865
+ "learning_rate": 4.9740236713277047e-05,
3866
+ "loss": 24.9356,
3867
+ "step": 11020
3868
+ },
3869
+ {
3870
+ "epoch": 0.016353714248469802,
3871
+ "grad_norm": 8.9375,
3872
+ "learning_rate": 4.973974281977628e-05,
3873
+ "loss": 24.9984,
3874
+ "step": 11040
3875
+ },
3876
+ {
3877
+ "epoch": 0.01638334054239819,
3878
+ "grad_norm": 7.46875,
3879
+ "learning_rate": 4.9739248926275533e-05,
3880
+ "loss": 24.9063,
3881
+ "step": 11060
3882
+ },
3883
+ {
3884
+ "epoch": 0.016412966836326575,
3885
+ "grad_norm": 7.125,
3886
+ "learning_rate": 4.973875503277478e-05,
3887
+ "loss": 25.0004,
3888
+ "step": 11080
3889
+ },
3890
+ {
3891
+ "epoch": 0.016442593130254965,
3892
+ "grad_norm": 8.6875,
3893
+ "learning_rate": 4.973826113927402e-05,
3894
+ "loss": 24.8795,
3895
+ "step": 11100
3896
+ },
3897
+ {
3898
+ "epoch": 0.01647221942418335,
3899
+ "grad_norm": 7.28125,
3900
+ "learning_rate": 4.973776724577326e-05,
3901
+ "loss": 24.9027,
3902
+ "step": 11120
3903
+ },
3904
+ {
3905
+ "epoch": 0.016501845718111738,
3906
+ "grad_norm": 7.53125,
3907
+ "learning_rate": 4.973727335227251e-05,
3908
+ "loss": 24.885,
3909
+ "step": 11140
3910
+ },
3911
+ {
3912
+ "epoch": 0.016531472012040124,
3913
+ "grad_norm": 7.8125,
3914
+ "learning_rate": 4.973677945877175e-05,
3915
+ "loss": 24.9615,
3916
+ "step": 11160
3917
+ },
3918
+ {
3919
+ "epoch": 0.016561098305968514,
3920
+ "grad_norm": 9.125,
3921
+ "learning_rate": 4.973628556527099e-05,
3922
+ "loss": 25.0393,
3923
+ "step": 11180
3924
+ },
3925
+ {
3926
+ "epoch": 0.0165907245998969,
3927
+ "grad_norm": 9.5625,
3928
+ "learning_rate": 4.973579167177024e-05,
3929
+ "loss": 25.0047,
3930
+ "step": 11200
3931
+ },
3932
+ {
3933
+ "epoch": 0.016620350893825287,
3934
+ "grad_norm": 7.6875,
3935
+ "learning_rate": 4.973529777826948e-05,
3936
+ "loss": 24.8962,
3937
+ "step": 11220
3938
+ },
3939
+ {
3940
+ "epoch": 0.016649977187753674,
3941
+ "grad_norm": 8.5,
3942
+ "learning_rate": 4.9734803884768724e-05,
3943
+ "loss": 24.9105,
3944
+ "step": 11240
3945
+ },
3946
+ {
3947
+ "epoch": 0.016679603481682063,
3948
+ "grad_norm": 8.375,
3949
+ "learning_rate": 4.973430999126796e-05,
3950
+ "loss": 24.9264,
3951
+ "step": 11260
3952
+ },
3953
+ {
3954
+ "epoch": 0.01670922977561045,
3955
+ "grad_norm": 11.625,
3956
+ "learning_rate": 4.973381609776721e-05,
3957
+ "loss": 24.9545,
3958
+ "step": 11280
3959
+ },
3960
+ {
3961
+ "epoch": 0.016738856069538836,
3962
+ "grad_norm": 6.875,
3963
+ "learning_rate": 4.9733322204266455e-05,
3964
+ "loss": 24.8835,
3965
+ "step": 11300
3966
+ },
3967
+ {
3968
+ "epoch": 0.016768482363467223,
3969
+ "grad_norm": 7.625,
3970
+ "learning_rate": 4.973282831076569e-05,
3971
+ "loss": 24.8933,
3972
+ "step": 11320
3973
+ },
3974
+ {
3975
+ "epoch": 0.016798108657395613,
3976
+ "grad_norm": 8.375,
3977
+ "learning_rate": 4.9732334417264935e-05,
3978
+ "loss": 24.9687,
3979
+ "step": 11340
3980
+ },
3981
+ {
3982
+ "epoch": 0.016827734951324,
3983
+ "grad_norm": 7.65625,
3984
+ "learning_rate": 4.9731840523764185e-05,
3985
+ "loss": 24.9447,
3986
+ "step": 11360
3987
+ },
3988
+ {
3989
+ "epoch": 0.016857361245252386,
3990
+ "grad_norm": 7.46875,
3991
+ "learning_rate": 4.973134663026343e-05,
3992
+ "loss": 24.8808,
3993
+ "step": 11380
3994
+ },
3995
+ {
3996
+ "epoch": 0.016886987539180772,
3997
+ "grad_norm": 9.125,
3998
+ "learning_rate": 4.9730852736762665e-05,
3999
+ "loss": 24.931,
4000
+ "step": 11400
4001
+ },
4002
+ {
4003
+ "epoch": 0.016916613833109162,
4004
+ "grad_norm": 9.5625,
4005
+ "learning_rate": 4.973035884326191e-05,
4006
+ "loss": 24.9442,
4007
+ "step": 11420
4008
+ },
4009
+ {
4010
+ "epoch": 0.01694624012703755,
4011
+ "grad_norm": 7.09375,
4012
+ "learning_rate": 4.972986494976116e-05,
4013
+ "loss": 24.8634,
4014
+ "step": 11440
4015
+ },
4016
+ {
4017
+ "epoch": 0.016975866420965935,
4018
+ "grad_norm": 7.71875,
4019
+ "learning_rate": 4.97293710562604e-05,
4020
+ "loss": 24.9538,
4021
+ "step": 11460
4022
+ },
4023
+ {
4024
+ "epoch": 0.01700549271489432,
4025
+ "grad_norm": 8.3125,
4026
+ "learning_rate": 4.972887716275964e-05,
4027
+ "loss": 24.8944,
4028
+ "step": 11480
4029
+ },
4030
+ {
4031
+ "epoch": 0.01703511900882271,
4032
+ "grad_norm": 11.25,
4033
+ "learning_rate": 4.972838326925889e-05,
4034
+ "loss": 24.8674,
4035
+ "step": 11500
4036
+ },
4037
+ {
4038
+ "epoch": 0.017064745302751098,
4039
+ "grad_norm": 7.21875,
4040
+ "learning_rate": 4.972788937575813e-05,
4041
+ "loss": 24.9174,
4042
+ "step": 11520
4043
+ },
4044
+ {
4045
+ "epoch": 0.017094371596679484,
4046
+ "grad_norm": 8.1875,
4047
+ "learning_rate": 4.972739548225737e-05,
4048
+ "loss": 24.8759,
4049
+ "step": 11540
4050
+ },
4051
+ {
4052
+ "epoch": 0.01712399789060787,
4053
+ "grad_norm": 10.0625,
4054
+ "learning_rate": 4.972690158875661e-05,
4055
+ "loss": 24.8022,
4056
+ "step": 11560
4057
+ },
4058
+ {
4059
+ "epoch": 0.01715362418453626,
4060
+ "grad_norm": 9.4375,
4061
+ "learning_rate": 4.972640769525586e-05,
4062
+ "loss": 24.8826,
4063
+ "step": 11580
4064
+ },
4065
+ {
4066
+ "epoch": 0.017183250478464647,
4067
+ "grad_norm": 9.0625,
4068
+ "learning_rate": 4.9725913801755106e-05,
4069
+ "loss": 24.8848,
4070
+ "step": 11600
4071
+ },
4072
+ {
4073
+ "epoch": 0.017212876772393033,
4074
+ "grad_norm": 9.4375,
4075
+ "learning_rate": 4.972541990825434e-05,
4076
+ "loss": 24.7912,
4077
+ "step": 11620
4078
+ },
4079
+ {
4080
+ "epoch": 0.01724250306632142,
4081
+ "grad_norm": 8.875,
4082
+ "learning_rate": 4.9724926014753586e-05,
4083
+ "loss": 24.9131,
4084
+ "step": 11640
4085
+ },
4086
+ {
4087
+ "epoch": 0.01727212936024981,
4088
+ "grad_norm": 9.3125,
4089
+ "learning_rate": 4.9724432121252836e-05,
4090
+ "loss": 24.9074,
4091
+ "step": 11660
4092
+ },
4093
+ {
4094
+ "epoch": 0.017301755654178196,
4095
+ "grad_norm": 7.375,
4096
+ "learning_rate": 4.972393822775207e-05,
4097
+ "loss": 24.8813,
4098
+ "step": 11680
4099
+ },
4100
+ {
4101
+ "epoch": 0.017331381948106583,
4102
+ "grad_norm": 7.40625,
4103
+ "learning_rate": 4.9723444334251316e-05,
4104
+ "loss": 24.7468,
4105
+ "step": 11700
4106
+ },
4107
+ {
4108
+ "epoch": 0.01736100824203497,
4109
+ "grad_norm": 7.65625,
4110
+ "learning_rate": 4.972295044075056e-05,
4111
+ "loss": 24.791,
4112
+ "step": 11720
4113
+ },
4114
+ {
4115
+ "epoch": 0.01739063453596336,
4116
+ "grad_norm": 7.3125,
4117
+ "learning_rate": 4.972245654724981e-05,
4118
+ "loss": 24.8677,
4119
+ "step": 11740
4120
+ },
4121
+ {
4122
+ "epoch": 0.017420260829891746,
4123
+ "grad_norm": 11.625,
4124
+ "learning_rate": 4.9721962653749047e-05,
4125
+ "loss": 24.8118,
4126
+ "step": 11760
4127
+ },
4128
+ {
4129
+ "epoch": 0.017449887123820132,
4130
+ "grad_norm": 7.15625,
4131
+ "learning_rate": 4.972146876024829e-05,
4132
+ "loss": 24.8483,
4133
+ "step": 11780
4134
+ },
4135
+ {
4136
+ "epoch": 0.017479513417748522,
4137
+ "grad_norm": 9.8125,
4138
+ "learning_rate": 4.972097486674754e-05,
4139
+ "loss": 24.8652,
4140
+ "step": 11800
4141
+ },
4142
+ {
4143
+ "epoch": 0.01750913971167691,
4144
+ "grad_norm": 7.4375,
4145
+ "learning_rate": 4.972048097324678e-05,
4146
+ "loss": 24.7769,
4147
+ "step": 11820
4148
+ },
4149
+ {
4150
+ "epoch": 0.017538766005605295,
4151
+ "grad_norm": 8.1875,
4152
+ "learning_rate": 4.971998707974602e-05,
4153
+ "loss": 24.8606,
4154
+ "step": 11840
4155
+ },
4156
+ {
4157
+ "epoch": 0.01756839229953368,
4158
+ "grad_norm": 8.25,
4159
+ "learning_rate": 4.9719493186245264e-05,
4160
+ "loss": 24.7523,
4161
+ "step": 11860
4162
+ },
4163
+ {
4164
+ "epoch": 0.01759801859346207,
4165
+ "grad_norm": 9.9375,
4166
+ "learning_rate": 4.9718999292744514e-05,
4167
+ "loss": 24.7848,
4168
+ "step": 11880
4169
+ },
4170
+ {
4171
+ "epoch": 0.017627644887390458,
4172
+ "grad_norm": 7.6875,
4173
+ "learning_rate": 4.971850539924375e-05,
4174
+ "loss": 24.7667,
4175
+ "step": 11900
4176
+ },
4177
+ {
4178
+ "epoch": 0.017657271181318844,
4179
+ "grad_norm": 9.0,
4180
+ "learning_rate": 4.9718011505742994e-05,
4181
+ "loss": 24.7369,
4182
+ "step": 11920
4183
+ },
4184
+ {
4185
+ "epoch": 0.01768689747524723,
4186
+ "grad_norm": 7.28125,
4187
+ "learning_rate": 4.971751761224224e-05,
4188
+ "loss": 24.7954,
4189
+ "step": 11940
4190
+ },
4191
+ {
4192
+ "epoch": 0.01771652376917562,
4193
+ "grad_norm": 8.4375,
4194
+ "learning_rate": 4.971702371874149e-05,
4195
+ "loss": 24.7734,
4196
+ "step": 11960
4197
+ },
4198
+ {
4199
+ "epoch": 0.017746150063104007,
4200
+ "grad_norm": 8.0,
4201
+ "learning_rate": 4.9716529825240724e-05,
4202
+ "loss": 24.8222,
4203
+ "step": 11980
4204
+ },
4205
+ {
4206
+ "epoch": 0.017775776357032393,
4207
+ "grad_norm": 10.625,
4208
+ "learning_rate": 4.971603593173997e-05,
4209
+ "loss": 24.7799,
4210
+ "step": 12000
4211
  }
4212
  ],
4213
  "logging_steps": 20,
 
4227
  "attributes": {}
4228
  }
4229
  },
4230
+ "total_flos": 2.22031198508611e+19,
4231
  "train_batch_size": 48,
4232
  "trial_name": null,
4233
  "trial_params": null