Azrail commited on
Commit
d720b9d
·
verified ·
1 Parent(s): b00cafe

Training in progress, step 23000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b81985a8a1ebad5f960997d908e43f8d285835abd4645a1ad5e8d86d7a91e976
3
  size 517931840
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b09b714fd1dab9349e276a2d1d64f33fa5004406d993d6a429a5d969766a4711
3
  size 517931840
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8e989ff28ea16b1edbb8530fe11c0f4057d65c6350ad0a17cbf0a4960b2cb6ea
3
  size 1035661434
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0c41cb06b8dbe81630708c3aba06b43b83a86488a65bdb7921cf7d53205761b4
3
  size 1035661434
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9ec14043d0cb9b7579fdf9075af0f9a31a2393b8ca68497f0e5375a4fe1a3cf9
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e51d5acf7af721ddb096287cdb18fed327f0e9b8f18a038e8c92ad3d7c982961
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ce6ab6f335eafbbff78f85b703b750b35d8b96e0da89ad49a445d3e07ab4df8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d95e34f92e07ac3fca3b42c6f704d96b3191058ad871fb9d07f6b2779013efbe
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.48325295134384133,
6
  "eval_steps": 500,
7
- "global_step": 22000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3924,11 +3924,189 @@
3924
  "eval_steps_per_second": 19.24,
3925
  "num_input_tokens_seen": 23068672000,
3926
  "step": 22000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3927
  }
3928
  ],
3929
  "logging_steps": 50,
3930
  "max_steps": 200000,
3931
- "num_input_tokens_seen": 23068672000,
3932
  "num_train_epochs": 5,
3933
  "save_steps": 1000,
3934
  "stateful_callbacks": {
@@ -3943,7 +4121,7 @@
3943
  "attributes": {}
3944
  }
3945
  },
3946
- "total_flos": 1.3137777751228416e+19,
3947
  "train_batch_size": 64,
3948
  "trial_name": null,
3949
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.5052189945867432,
6
  "eval_steps": 500,
7
+ "global_step": 23000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3924
  "eval_steps_per_second": 19.24,
3925
  "num_input_tokens_seen": 23068672000,
3926
  "step": 22000
3927
+ },
3928
+ {
3929
+ "epoch": 0.48435125350598646,
3930
+ "grad_norm": 0.15728288888931274,
3931
+ "learning_rate": 0.001,
3932
+ "loss": 2.703,
3933
+ "num_input_tokens_seen": 23121100800,
3934
+ "step": 22050
3935
+ },
3936
+ {
3937
+ "epoch": 0.4854495556681315,
3938
+ "grad_norm": 0.13599443435668945,
3939
+ "learning_rate": 0.001,
3940
+ "loss": 2.6984,
3941
+ "num_input_tokens_seen": 23173529600,
3942
+ "step": 22100
3943
+ },
3944
+ {
3945
+ "epoch": 0.4865478578302766,
3946
+ "grad_norm": 0.25702551007270813,
3947
+ "learning_rate": 0.001,
3948
+ "loss": 2.9388,
3949
+ "num_input_tokens_seen": 23225958400,
3950
+ "step": 22150
3951
+ },
3952
+ {
3953
+ "epoch": 0.4876461599924217,
3954
+ "grad_norm": 0.12942279875278473,
3955
+ "learning_rate": 0.001,
3956
+ "loss": 2.7568,
3957
+ "num_input_tokens_seen": 23278383360,
3958
+ "step": 22200
3959
+ },
3960
+ {
3961
+ "epoch": 0.4887444621545668,
3962
+ "grad_norm": 0.12908817827701569,
3963
+ "learning_rate": 0.001,
3964
+ "loss": 2.7195,
3965
+ "num_input_tokens_seen": 23330812160,
3966
+ "step": 22250
3967
+ },
3968
+ {
3969
+ "epoch": 0.4898427643167119,
3970
+ "grad_norm": 0.1351587176322937,
3971
+ "learning_rate": 0.001,
3972
+ "loss": 2.7155,
3973
+ "num_input_tokens_seen": 23383240960,
3974
+ "step": 22300
3975
+ },
3976
+ {
3977
+ "epoch": 0.490941066478857,
3978
+ "grad_norm": 0.1245250552892685,
3979
+ "learning_rate": 0.001,
3980
+ "loss": 2.7074,
3981
+ "num_input_tokens_seen": 23435669760,
3982
+ "step": 22350
3983
+ },
3984
+ {
3985
+ "epoch": 0.4920393686410021,
3986
+ "grad_norm": 0.13818837702274323,
3987
+ "learning_rate": 0.001,
3988
+ "loss": 2.7064,
3989
+ "num_input_tokens_seen": 23488098560,
3990
+ "step": 22400
3991
+ },
3992
+ {
3993
+ "epoch": 0.4931376708031472,
3994
+ "grad_norm": 0.15505041182041168,
3995
+ "learning_rate": 0.001,
3996
+ "loss": 2.7044,
3997
+ "num_input_tokens_seen": 23540527360,
3998
+ "step": 22450
3999
+ },
4000
+ {
4001
+ "epoch": 0.49423597296529226,
4002
+ "grad_norm": 0.14414137601852417,
4003
+ "learning_rate": 0.001,
4004
+ "loss": 2.7046,
4005
+ "num_input_tokens_seen": 23592956160,
4006
+ "step": 22500
4007
+ },
4008
+ {
4009
+ "epoch": 0.49423597296529226,
4010
+ "eval_loss": 2.60188627243042,
4011
+ "eval_runtime": 67.3268,
4012
+ "eval_samples_per_second": 74.265,
4013
+ "eval_steps_per_second": 18.566,
4014
+ "num_input_tokens_seen": 23592956160,
4015
+ "step": 22500
4016
+ },
4017
+ {
4018
+ "epoch": 0.4953342751274374,
4019
+ "grad_norm": 0.14763414859771729,
4020
+ "learning_rate": 0.001,
4021
+ "loss": 2.695,
4022
+ "num_input_tokens_seen": 23645384960,
4023
+ "step": 22550
4024
+ },
4025
+ {
4026
+ "epoch": 0.49643257728958246,
4027
+ "grad_norm": 0.14800110459327698,
4028
+ "learning_rate": 0.001,
4029
+ "loss": 2.6939,
4030
+ "num_input_tokens_seen": 23697813760,
4031
+ "step": 22600
4032
+ },
4033
+ {
4034
+ "epoch": 0.4975308794517276,
4035
+ "grad_norm": 0.13590902090072632,
4036
+ "learning_rate": 0.001,
4037
+ "loss": 2.6967,
4038
+ "num_input_tokens_seen": 23750242560,
4039
+ "step": 22650
4040
+ },
4041
+ {
4042
+ "epoch": 0.49862918161387265,
4043
+ "grad_norm": 0.1315733939409256,
4044
+ "learning_rate": 0.001,
4045
+ "loss": 2.6909,
4046
+ "num_input_tokens_seen": 23802671360,
4047
+ "step": 22700
4048
+ },
4049
+ {
4050
+ "epoch": 0.4997274837760177,
4051
+ "grad_norm": 0.13714700937271118,
4052
+ "learning_rate": 0.001,
4053
+ "loss": 2.6957,
4054
+ "num_input_tokens_seen": 23855100160,
4055
+ "step": 22750
4056
+ },
4057
+ {
4058
+ "epoch": 0.5008257859381628,
4059
+ "grad_norm": 0.1412438154220581,
4060
+ "learning_rate": 0.001,
4061
+ "loss": 2.6977,
4062
+ "num_input_tokens_seen": 23907528960,
4063
+ "step": 22800
4064
+ },
4065
+ {
4066
+ "epoch": 0.501924088100308,
4067
+ "grad_norm": 0.15368172526359558,
4068
+ "learning_rate": 0.001,
4069
+ "loss": 2.6977,
4070
+ "num_input_tokens_seen": 23959957760,
4071
+ "step": 22850
4072
+ },
4073
+ {
4074
+ "epoch": 0.503022390262453,
4075
+ "grad_norm": 0.14018824696540833,
4076
+ "learning_rate": 0.001,
4077
+ "loss": 2.6992,
4078
+ "num_input_tokens_seen": 24012386560,
4079
+ "step": 22900
4080
+ },
4081
+ {
4082
+ "epoch": 0.5041206924245981,
4083
+ "grad_norm": 0.1284814178943634,
4084
+ "learning_rate": 0.001,
4085
+ "loss": 2.6962,
4086
+ "num_input_tokens_seen": 24064815360,
4087
+ "step": 22950
4088
+ },
4089
+ {
4090
+ "epoch": 0.5052189945867432,
4091
+ "grad_norm": 0.15145835280418396,
4092
+ "learning_rate": 0.001,
4093
+ "loss": 2.692,
4094
+ "num_input_tokens_seen": 24117244160,
4095
+ "step": 23000
4096
+ },
4097
+ {
4098
+ "epoch": 0.5052189945867432,
4099
+ "eval_loss": 2.5970778465270996,
4100
+ "eval_runtime": 66.1666,
4101
+ "eval_samples_per_second": 75.567,
4102
+ "eval_steps_per_second": 18.892,
4103
+ "num_input_tokens_seen": 24117244160,
4104
+ "step": 23000
4105
  }
4106
  ],
4107
  "logging_steps": 50,
4108
  "max_steps": 200000,
4109
+ "num_input_tokens_seen": 24117244160,
4110
  "num_train_epochs": 5,
4111
  "save_steps": 1000,
4112
  "stateful_callbacks": {
 
4121
  "attributes": {}
4122
  }
4123
  },
4124
+ "total_flos": 1.3734947280285204e+19,
4125
  "train_batch_size": 64,
4126
  "trial_name": null,
4127
  "trial_params": null