Azrail committed on
Commit
e435817
·
verified ·
1 Parent(s): 72ae520

Training in progress, step 57000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:75a54732bc39e58afccb21a46f57190dd49c2ae00c7fd73b4d8434827934d2aa
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c12b0497c316584eab0a6471e97deaea6b6c97411924d2517f029fde79d3b1c2
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:da1643d7c66b6de7210d626427e81524686db0e0650499f03aeaee61e640ca95
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e51e859ffdf4b3059a027d7764e0788d882ec9bf060bed69c183a774f7373cd
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f1d7953b9adf97d81c8d5df7c90f2cd3786e196584c751d3c25ee459604bb2b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b80a94302b027aba469e721f259f7cea336e0f08145beaf0eef00eec23f3459c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56dc1edb3d2e4264095d54347eab2555bc17fb9d10875074bfbbaaa6e5eeeb69
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25aca1947c52853a475b5e869ec5722620ca13248105b9ec208f0e66ff7cf239
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.2671214090654328,
6
  "eval_steps": 500,
7
- "global_step": 56000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9976,11 +9976,189 @@
9976
  "eval_steps_per_second": 23.416,
9977
  "num_input_tokens_seen": 14680059456,
9978
  "step": 56000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9979
  }
9980
  ],
9981
  "logging_steps": 50,
9982
  "max_steps": 70000,
9983
- "num_input_tokens_seen": 14680059456,
9984
  "num_train_epochs": 1,
9985
  "save_steps": 1000,
9986
  "stateful_callbacks": {
@@ -9995,7 +10173,7 @@
9995
  "attributes": {}
9996
  }
9997
  },
9998
- "total_flos": 3.9270591818602906e+18,
9999
  "train_batch_size": 64,
10000
  "trial_name": null,
10001
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.27189143422731554,
6
  "eval_steps": 500,
7
+ "global_step": 57000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9976
  "eval_steps_per_second": 23.416,
9977
  "num_input_tokens_seen": 14680059456,
9978
  "step": 56000
9979
+ },
9980
+ {
9981
+ "epoch": 0.26735991032352696,
9982
+ "grad_norm": 0.22615984082221985,
9983
+ "learning_rate": 0.0009999685283773503,
9984
+ "loss": 2.5961,
9985
+ "num_input_tokens_seen": 14693166656,
9986
+ "step": 56050
9987
+ },
9988
+ {
9989
+ "epoch": 0.2675984115816211,
9990
+ "grad_norm": 0.2738794982433319,
9991
+ "learning_rate": 0.0009998741174712534,
9992
+ "loss": 2.612,
9993
+ "num_input_tokens_seen": 14706273856,
9994
+ "step": 56100
9995
+ },
9996
+ {
9997
+ "epoch": 0.2678369128397152,
9998
+ "grad_norm": 0.23470066487789154,
9999
+ "learning_rate": 0.0009997167791667668,
10000
+ "loss": 2.6071,
10001
+ "num_input_tokens_seen": 14719381056,
10002
+ "step": 56150
10003
+ },
10004
+ {
10005
+ "epoch": 0.2680754140978094,
10006
+ "grad_norm": 0.23558543622493744,
10007
+ "learning_rate": 0.0009994965332706573,
10008
+ "loss": 2.5956,
10009
+ "num_input_tokens_seen": 14732488256,
10010
+ "step": 56200
10011
+ },
10012
+ {
10013
+ "epoch": 0.2683139153559035,
10014
+ "grad_norm": 0.2274416983127594,
10015
+ "learning_rate": 0.0009992134075089082,
10016
+ "loss": 2.5873,
10017
+ "num_input_tokens_seen": 14745595456,
10018
+ "step": 56250
10019
+ },
10020
+ {
10021
+ "epoch": 0.26855241661399765,
10022
+ "grad_norm": 0.21609161794185638,
10023
+ "learning_rate": 0.000998867437523228,
10024
+ "loss": 2.6043,
10025
+ "num_input_tokens_seen": 14758702656,
10026
+ "step": 56300
10027
+ },
10028
+ {
10029
+ "epoch": 0.26879091787209175,
10030
+ "grad_norm": 0.2368565797805786,
10031
+ "learning_rate": 0.000998458666866564,
10032
+ "loss": 2.5952,
10033
+ "num_input_tokens_seen": 14771809856,
10034
+ "step": 56350
10035
+ },
10036
+ {
10037
+ "epoch": 0.2690294191301859,
10038
+ "grad_norm": 0.22180891036987305,
10039
+ "learning_rate": 0.0009979871469976197,
10040
+ "loss": 2.5934,
10041
+ "num_input_tokens_seen": 14784917056,
10042
+ "step": 56400
10043
+ },
10044
+ {
10045
+ "epoch": 0.26926792038828007,
10046
+ "grad_norm": 0.3060019910335541,
10047
+ "learning_rate": 0.0009974529372743762,
10048
+ "loss": 2.6224,
10049
+ "num_input_tokens_seen": 14798024256,
10050
+ "step": 56450
10051
+ },
10052
+ {
10053
+ "epoch": 0.2695064216463742,
10054
+ "grad_norm": 0.2387322634458542,
10055
+ "learning_rate": 0.0009968561049466214,
10056
+ "loss": 2.5905,
10057
+ "num_input_tokens_seen": 14811131456,
10058
+ "step": 56500
10059
+ },
10060
+ {
10061
+ "epoch": 0.2695064216463742,
10062
+ "eval_loss": 2.4835996627807617,
10063
+ "eval_runtime": 53.8478,
10064
+ "eval_samples_per_second": 92.854,
10065
+ "eval_steps_per_second": 23.214,
10066
+ "num_input_tokens_seen": 14811131456,
10067
+ "step": 56500
10068
+ },
10069
+ {
10070
+ "epoch": 0.26974492290446833,
10071
+ "grad_norm": 0.22091372311115265,
10072
+ "learning_rate": 0.0009961967251474822,
10073
+ "loss": 2.6139,
10074
+ "num_input_tokens_seen": 14824238656,
10075
+ "step": 56550
10076
+ },
10077
+ {
10078
+ "epoch": 0.26998342416256244,
10079
+ "grad_norm": 0.2304680198431015,
10080
+ "learning_rate": 0.0009954748808839674,
10081
+ "loss": 2.6167,
10082
+ "num_input_tokens_seen": 14837345856,
10083
+ "step": 56600
10084
+ },
10085
+ {
10086
+ "epoch": 0.2702219254206566,
10087
+ "grad_norm": 0.19777421653270721,
10088
+ "learning_rate": 0.0009946906630265184,
10089
+ "loss": 2.6082,
10090
+ "num_input_tokens_seen": 14850453056,
10091
+ "step": 56650
10092
+ },
10093
+ {
10094
+ "epoch": 0.27046042667875075,
10095
+ "grad_norm": 0.2113979458808899,
10096
+ "learning_rate": 0.0009938441702975688,
10097
+ "loss": 2.5981,
10098
+ "num_input_tokens_seen": 14863560256,
10099
+ "step": 56700
10100
+ },
10101
+ {
10102
+ "epoch": 0.27069892793684486,
10103
+ "grad_norm": 0.19911637902259827,
10104
+ "learning_rate": 0.0009929355092591179,
10105
+ "loss": 2.5904,
10106
+ "num_input_tokens_seen": 14876667456,
10107
+ "step": 56750
10108
+ },
10109
+ {
10110
+ "epoch": 0.270937429194939,
10111
+ "grad_norm": 0.20081694424152374,
10112
+ "learning_rate": 0.0009919647942993148,
10113
+ "loss": 2.6012,
10114
+ "num_input_tokens_seen": 14889774656,
10115
+ "step": 56800
10116
+ },
10117
+ {
10118
+ "epoch": 0.2711759304530331,
10119
+ "grad_norm": 0.22752800583839417,
10120
+ "learning_rate": 0.0009909321476180592,
10121
+ "loss": 2.6017,
10122
+ "num_input_tokens_seen": 14902881856,
10123
+ "step": 56850
10124
+ },
10125
+ {
10126
+ "epoch": 0.2714144317111273,
10127
+ "grad_norm": 0.23174402117729187,
10128
+ "learning_rate": 0.0009898376992116178,
10129
+ "loss": 2.6012,
10130
+ "num_input_tokens_seen": 14915989056,
10131
+ "step": 56900
10132
+ },
10133
+ {
10134
+ "epoch": 0.27165293296922144,
10135
+ "grad_norm": 0.22149533033370972,
10136
+ "learning_rate": 0.0009886815868562597,
10137
+ "loss": 2.5881,
10138
+ "num_input_tokens_seen": 14929096256,
10139
+ "step": 56950
10140
+ },
10141
+ {
10142
+ "epoch": 0.27189143422731554,
10143
+ "grad_norm": 0.22576771676540375,
10144
+ "learning_rate": 0.0009874639560909118,
10145
+ "loss": 2.6021,
10146
+ "num_input_tokens_seen": 14942203456,
10147
+ "step": 57000
10148
+ },
10149
+ {
10150
+ "epoch": 0.27189143422731554,
10151
+ "eval_loss": 2.482896566390991,
10152
+ "eval_runtime": 53.3773,
10153
+ "eval_samples_per_second": 93.673,
10154
+ "eval_steps_per_second": 23.418,
10155
+ "num_input_tokens_seen": 14942203456,
10156
+ "step": 57000
10157
  }
10158
  ],
10159
  "logging_steps": 50,
10160
  "max_steps": 70000,
10161
+ "num_input_tokens_seen": 14942203456,
10162
  "num_train_epochs": 1,
10163
  "save_steps": 1000,
10164
  "stateful_callbacks": {
 
10173
  "attributes": {}
10174
  }
10175
  },
10176
+ "total_flos": 3.9971852603857306e+18,
10177
  "train_batch_size": 64,
10178
  "trial_name": null,
10179
  "trial_params": null