Azrail committed on
Commit
9a1307e
·
verified ·
1 Parent(s): d6bdea7

Training in progress, step 57000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e3d360be7fe2543c78a1f7ac85877b8ebcc55a8fc7ce7ea8871241b28859be01
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16b35a6c5a2893347ac39200ce6524a1890f21615a98cf260909a1625f36f1c5
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c586225c37191bdb386336c5aa7eba4c313537c276b8b87dd7fefbcb4a3ca975
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b96c4f49154280d995e547e25a75aad825b4ac333aa881c2f7edaa3460a4415
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0f1d7953b9adf97d81c8d5df7c90f2cd3786e196584c751d3c25ee459604bb2b
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b80a94302b027aba469e721f259f7cea336e0f08145beaf0eef00eec23f3459c
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:56641b065a04f5f757422df636842a91ff2acd7d071b6672db512bd44af71813
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d03f04e05cd70ad1a826e9dcf44af396ac68835a057941493a30d6d09cfeca51
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.37668393695252606,
6
  "eval_steps": 500,
7
- "global_step": 56000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -9976,11 +9976,189 @@
9976
  "eval_steps_per_second": 23.522,
9977
  "num_input_tokens_seen": 14680064000,
9978
  "step": 56000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9979
  }
9980
  ],
9981
  "logging_steps": 50,
9982
  "max_steps": 60000,
9983
- "num_input_tokens_seen": 14680064000,
9984
  "num_train_epochs": 1,
9985
  "save_steps": 1000,
9986
  "stateful_callbacks": {
@@ -9995,7 +10173,7 @@
9995
  "attributes": {}
9996
  }
9997
  },
9998
- "total_flos": 3.92706039742464e+18,
9999
  "train_batch_size": 64,
10000
  "trial_name": null,
10001
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.38341043582667833,
6
  "eval_steps": 500,
7
+ "global_step": 57000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
9976
  "eval_steps_per_second": 23.522,
9977
  "num_input_tokens_seen": 14680064000,
9978
  "step": 56000
9979
+ },
9980
+ {
9981
+ "epoch": 0.37702026189623367,
9982
+ "grad_norm": 0.2067674696445465,
9983
+ "learning_rate": 0.0007385793801298042,
9984
+ "loss": 3.05,
9985
+ "num_input_tokens_seen": 14693171200,
9986
+ "step": 56050
9987
+ },
9988
+ {
9989
+ "epoch": 0.3773565868399413,
9990
+ "grad_norm": 0.20803235471248627,
9991
+ "learning_rate": 0.0007269952498697733,
9992
+ "loss": 3.0451,
9993
+ "num_input_tokens_seen": 14706278400,
9994
+ "step": 56100
9995
+ },
9996
+ {
9997
+ "epoch": 0.3776929117836489,
9998
+ "grad_norm": 0.2035783976316452,
9999
+ "learning_rate": 0.0007152555484041476,
10000
+ "loss": 3.0281,
10001
+ "num_input_tokens_seen": 14719385600,
10002
+ "step": 56150
10003
+ },
10004
+ {
10005
+ "epoch": 0.3780292367273565,
10006
+ "grad_norm": 0.21911849081516266,
10007
+ "learning_rate": 0.0007033683215379002,
10008
+ "loss": 3.0312,
10009
+ "num_input_tokens_seen": 14732492800,
10010
+ "step": 56200
10011
+ },
10012
+ {
10013
+ "epoch": 0.3783655616710641,
10014
+ "grad_norm": 0.2263978123664856,
10015
+ "learning_rate": 0.000691341716182545,
10016
+ "loss": 3.0237,
10017
+ "num_input_tokens_seen": 14745600000,
10018
+ "step": 56250
10019
+ },
10020
+ {
10021
+ "epoch": 0.3787018866147717,
10022
+ "grad_norm": 0.20394045114517212,
10023
+ "learning_rate": 0.0006791839747726501,
10024
+ "loss": 3.0271,
10025
+ "num_input_tokens_seen": 14758707200,
10026
+ "step": 56300
10027
+ },
10028
+ {
10029
+ "epoch": 0.37903821155847933,
10030
+ "grad_norm": 0.1954122930765152,
10031
+ "learning_rate": 0.0006669034296168854,
10032
+ "loss": 3.0368,
10033
+ "num_input_tokens_seen": 14771814400,
10034
+ "step": 56350
10035
+ },
10036
+ {
10037
+ "epoch": 0.37937453650218694,
10038
+ "grad_norm": 0.2434541881084442,
10039
+ "learning_rate": 0.0006545084971874737,
10040
+ "loss": 3.0268,
10041
+ "num_input_tokens_seen": 14784921600,
10042
+ "step": 56400
10043
+ },
10044
+ {
10045
+ "epoch": 0.37971086144589455,
10046
+ "grad_norm": 0.19820261001586914,
10047
+ "learning_rate": 0.0006420076723519614,
10048
+ "loss": 3.0193,
10049
+ "num_input_tokens_seen": 14798028800,
10050
+ "step": 56450
10051
+ },
10052
+ {
10053
+ "epoch": 0.38004718638960217,
10054
+ "grad_norm": 0.18117697536945343,
10055
+ "learning_rate": 0.0006294095225512603,
10056
+ "loss": 3.0241,
10057
+ "num_input_tokens_seen": 14811136000,
10058
+ "step": 56500
10059
+ },
10060
+ {
10061
+ "epoch": 0.38004718638960217,
10062
+ "eval_loss": 2.920185089111328,
10063
+ "eval_runtime": 53.8805,
10064
+ "eval_samples_per_second": 92.798,
10065
+ "eval_steps_per_second": 23.199,
10066
+ "num_input_tokens_seen": 14811136000,
10067
+ "step": 56500
10068
+ },
10069
+ {
10070
+ "epoch": 0.3803835113333098,
10071
+ "grad_norm": 0.20303522050380707,
10072
+ "learning_rate": 0.0006167226819279528,
10073
+ "loss": 3.0133,
10074
+ "num_input_tokens_seen": 14824243200,
10075
+ "step": 56550
10076
+ },
10077
+ {
10078
+ "epoch": 0.3807198362770174,
10079
+ "grad_norm": 0.19498929381370544,
10080
+ "learning_rate": 0.0006039558454088796,
10081
+ "loss": 3.0241,
10082
+ "num_input_tokens_seen": 14837350400,
10083
+ "step": 56600
10084
+ },
10085
+ {
10086
+ "epoch": 0.381056161220725,
10087
+ "grad_norm": 0.21773076057434082,
10088
+ "learning_rate": 0.0005911177627460738,
10089
+ "loss": 3.0235,
10090
+ "num_input_tokens_seen": 14850457600,
10091
+ "step": 56650
10092
+ },
10093
+ {
10094
+ "epoch": 0.3813924861644326,
10095
+ "grad_norm": 0.19796748459339142,
10096
+ "learning_rate": 0.0005782172325201155,
10097
+ "loss": 3.019,
10098
+ "num_input_tokens_seen": 14863564800,
10099
+ "step": 56700
10100
+ },
10101
+ {
10102
+ "epoch": 0.3817288111081402,
10103
+ "grad_norm": 0.18569409847259521,
10104
+ "learning_rate": 0.000565263096110026,
10105
+ "loss": 3.0189,
10106
+ "num_input_tokens_seen": 14876672000,
10107
+ "step": 56750
10108
+ },
10109
+ {
10110
+ "epoch": 0.38206513605184783,
10111
+ "grad_norm": 0.27358362078666687,
10112
+ "learning_rate": 0.0005522642316338268,
10113
+ "loss": 3.0107,
10114
+ "num_input_tokens_seen": 14889779200,
10115
+ "step": 56800
10116
+ },
10117
+ {
10118
+ "epoch": 0.38240146099555544,
10119
+ "grad_norm": 0.2143600583076477,
10120
+ "learning_rate": 0.0005392295478639225,
10121
+ "loss": 3.0139,
10122
+ "num_input_tokens_seen": 14902886400,
10123
+ "step": 56850
10124
+ },
10125
+ {
10126
+ "epoch": 0.38273778593926305,
10127
+ "grad_norm": 0.18786349892616272,
10128
+ "learning_rate": 0.000526167978121472,
10129
+ "loss": 3.0187,
10130
+ "num_input_tokens_seen": 14915993600,
10131
+ "step": 56900
10132
+ },
10133
+ {
10134
+ "epoch": 0.38307411088297066,
10135
+ "grad_norm": 0.1809261441230774,
10136
+ "learning_rate": 0.0005130884741539367,
10137
+ "loss": 3.0197,
10138
+ "num_input_tokens_seen": 14929100800,
10139
+ "step": 56950
10140
+ },
10141
+ {
10142
+ "epoch": 0.38341043582667833,
10143
+ "grad_norm": 0.1926116794347763,
10144
+ "learning_rate": 0.0005,
10145
+ "loss": 3.0101,
10146
+ "num_input_tokens_seen": 14942208000,
10147
+ "step": 57000
10148
+ },
10149
+ {
10150
+ "epoch": 0.38341043582667833,
10151
+ "eval_loss": 2.912503242492676,
10152
+ "eval_runtime": 52.7455,
10153
+ "eval_samples_per_second": 94.795,
10154
+ "eval_steps_per_second": 23.699,
10155
+ "num_input_tokens_seen": 14942208000,
10156
+ "step": 57000
10157
  }
10158
  ],
10159
  "logging_steps": 50,
10160
  "max_steps": 60000,
10161
+ "num_input_tokens_seen": 14942208000,
10162
  "num_train_epochs": 1,
10163
  "save_steps": 1000,
10164
  "stateful_callbacks": {
 
10173
  "attributes": {}
10174
  }
10175
  },
10176
+ "total_flos": 3.99718647595008e+18,
10177
  "train_batch_size": 64,
10178
  "trial_name": null,
10179
  "trial_params": null