Azrail commited on
Commit
f5e74ba
·
verified ·
1 Parent(s): f60955d

Training in progress, step 158000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f6fe11c67454b196001e4ed79995fab7a74672309c3dd2962d5f80ffb63bc57b
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:416215f0149ccdc8795d454a743c8eca2679e864ee093c2c92b9a7dc89d715bb
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b377b9ca632415e5cc259fc332790fb52abc8eed12ac83f3e71264ccc731be8f
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f007001464c0735c2a1e81f89ec3fb177e1babb5501510e4e9c8bbe36278a009
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7eb45c212b2ad29aa591c00be2416908d2afe25e585ed34f3beb9e136a92cef
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09a1e45eb7b9bd5bee8831d08f28097d6e76d93bacd9f185db082ca81501cddf
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:50e7a91a2d6de7899f3ef55596029edda575e3992fc80bcaed05a5c6cf935dee
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:30aae8d205cf2e798817f5244eb6202efaefc5672b52769c357c6911e1e312c2
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 3.4487226393866526,
6
  "eval_steps": 500,
7
- "global_step": 157000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -27954,11 +27954,189 @@
27954
  "eval_steps_per_second": 15.527,
27955
  "num_input_tokens_seen": 91213822304,
27956
  "step": 157000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27957
  }
27958
  ],
27959
  "logging_steps": 50,
27960
  "max_steps": 200000,
27961
- "num_input_tokens_seen": 91213822304,
27962
  "num_train_epochs": 5,
27963
  "save_steps": 1000,
27964
  "stateful_callbacks": {
@@ -27973,7 +28151,7 @@
27973
  "attributes": {}
27974
  }
27975
  },
27976
- "total_flos": 1.614318280768918e+20,
27977
  "train_batch_size": 32,
27978
  "trial_name": null,
27979
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.4706886977079106,
6
  "eval_steps": 500,
7
+ "global_step": 158000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
27954
  "eval_steps_per_second": 15.527,
27955
  "num_input_tokens_seen": 91213822304,
27956
  "step": 157000
27957
+ },
27958
+ {
27959
+ "epoch": 3.4498209423027157,
27960
+ "grad_norm": 0.09102839231491089,
27961
+ "learning_rate": 0.0001,
27962
+ "loss": 2.3407,
27963
+ "num_input_tokens_seen": 91266251104,
27964
+ "step": 157050
27965
+ },
27966
+ {
27967
+ "epoch": 3.4509192452187785,
27968
+ "grad_norm": 0.09813258051872253,
27969
+ "learning_rate": 0.0001,
27970
+ "loss": 2.3461,
27971
+ "num_input_tokens_seen": 91318679904,
27972
+ "step": 157100
27973
+ },
27974
+ {
27975
+ "epoch": 3.452017548134841,
27976
+ "grad_norm": 0.09532950073480606,
27977
+ "learning_rate": 0.0001,
27978
+ "loss": 2.3457,
27979
+ "num_input_tokens_seen": 91371108704,
27980
+ "step": 157150
27981
+ },
27982
+ {
27983
+ "epoch": 3.4531158510509044,
27984
+ "grad_norm": 0.10110923647880554,
27985
+ "learning_rate": 0.0001,
27986
+ "loss": 2.3426,
27987
+ "num_input_tokens_seen": 91423537504,
27988
+ "step": 157200
27989
+ },
27990
+ {
27991
+ "epoch": 3.454214153966967,
27992
+ "grad_norm": 0.09686494618654251,
27993
+ "learning_rate": 0.0001,
27994
+ "loss": 2.342,
27995
+ "num_input_tokens_seen": 91475959552,
27996
+ "step": 157250
27997
+ },
27998
+ {
27999
+ "epoch": 3.4553124568830302,
28000
+ "grad_norm": 0.09327523410320282,
28001
+ "learning_rate": 0.0001,
28002
+ "loss": 2.3477,
28003
+ "num_input_tokens_seen": 91528385536,
28004
+ "step": 157300
28005
+ },
28006
+ {
28007
+ "epoch": 3.456410759799093,
28008
+ "grad_norm": 0.10524465143680573,
28009
+ "learning_rate": 0.0001,
28010
+ "loss": 2.351,
28011
+ "num_input_tokens_seen": 91580812064,
28012
+ "step": 157350
28013
+ },
28014
+ {
28015
+ "epoch": 3.4575090627151557,
28016
+ "grad_norm": 0.08858100324869156,
28017
+ "learning_rate": 0.0001,
28018
+ "loss": 2.3443,
28019
+ "num_input_tokens_seen": 91633240864,
28020
+ "step": 157400
28021
+ },
28022
+ {
28023
+ "epoch": 3.458607365631219,
28024
+ "grad_norm": 0.0905861109495163,
28025
+ "learning_rate": 0.0001,
28026
+ "loss": 2.338,
28027
+ "num_input_tokens_seen": 91685669664,
28028
+ "step": 157450
28029
+ },
28030
+ {
28031
+ "epoch": 3.4597056685472816,
28032
+ "grad_norm": 0.0902877077460289,
28033
+ "learning_rate": 0.0001,
28034
+ "loss": 2.3447,
28035
+ "num_input_tokens_seen": 91738098464,
28036
+ "step": 157500
28037
+ },
28038
+ {
28039
+ "epoch": 3.4597056685472816,
28040
+ "eval_loss": 2.257868766784668,
28041
+ "eval_runtime": 80.702,
28042
+ "eval_samples_per_second": 61.956,
28043
+ "eval_steps_per_second": 15.489,
28044
+ "num_input_tokens_seen": 91738098464,
28045
+ "step": 157500
28046
+ },
28047
+ {
28048
+ "epoch": 3.4608039714633447,
28049
+ "grad_norm": 0.09488774091005325,
28050
+ "learning_rate": 0.0001,
28051
+ "loss": 2.348,
28052
+ "num_input_tokens_seen": 91790527264,
28053
+ "step": 157550
28054
+ },
28055
+ {
28056
+ "epoch": 3.4619022743794075,
28057
+ "grad_norm": 0.09437818825244904,
28058
+ "learning_rate": 0.0001,
28059
+ "loss": 2.3366,
28060
+ "num_input_tokens_seen": 91842955104,
28061
+ "step": 157600
28062
+ },
28063
+ {
28064
+ "epoch": 3.46300057729547,
28065
+ "grad_norm": 0.09216772764921188,
28066
+ "learning_rate": 0.0001,
28067
+ "loss": 2.3455,
28068
+ "num_input_tokens_seen": 91895383904,
28069
+ "step": 157650
28070
+ },
28071
+ {
28072
+ "epoch": 3.464098880211533,
28073
+ "grad_norm": 0.0893646627664566,
28074
+ "learning_rate": 0.0001,
28075
+ "loss": 2.3436,
28076
+ "num_input_tokens_seen": 91947812704,
28077
+ "step": 157700
28078
+ },
28079
+ {
28080
+ "epoch": 3.465197183127596,
28081
+ "grad_norm": 0.10555808991193771,
28082
+ "learning_rate": 0.0001,
28083
+ "loss": 2.3407,
28084
+ "num_input_tokens_seen": 92000241504,
28085
+ "step": 157750
28086
+ },
28087
+ {
28088
+ "epoch": 3.466295486043659,
28089
+ "grad_norm": 0.09263647347688675,
28090
+ "learning_rate": 0.0001,
28091
+ "loss": 2.3455,
28092
+ "num_input_tokens_seen": 92052667776,
28093
+ "step": 157800
28094
+ },
28095
+ {
28096
+ "epoch": 3.467393788959722,
28097
+ "grad_norm": 0.09790777415037155,
28098
+ "learning_rate": 0.0001,
28099
+ "loss": 2.3426,
28100
+ "num_input_tokens_seen": 92105094528,
28101
+ "step": 157850
28102
+ },
28103
+ {
28104
+ "epoch": 3.4684920918757847,
28105
+ "grad_norm": 0.0883532464504242,
28106
+ "learning_rate": 0.0001,
28107
+ "loss": 2.3453,
28108
+ "num_input_tokens_seen": 92157523328,
28109
+ "step": 157900
28110
+ },
28111
+ {
28112
+ "epoch": 3.4695903947918474,
28113
+ "grad_norm": 0.09379395842552185,
28114
+ "learning_rate": 0.0001,
28115
+ "loss": 2.3452,
28116
+ "num_input_tokens_seen": 92209949248,
28117
+ "step": 157950
28118
+ },
28119
+ {
28120
+ "epoch": 3.4706886977079106,
28121
+ "grad_norm": 0.09533659368753433,
28122
+ "learning_rate": 0.0001,
28123
+ "loss": 2.3454,
28124
+ "num_input_tokens_seen": 92262378048,
28125
+ "step": 158000
28126
+ },
28127
+ {
28128
+ "epoch": 3.4706886977079106,
28129
+ "eval_loss": 2.258249521255493,
28130
+ "eval_runtime": 80.6484,
28131
+ "eval_samples_per_second": 61.998,
28132
+ "eval_steps_per_second": 15.499,
28133
+ "num_input_tokens_seen": 92262378048,
28134
+ "step": 158000
28135
  }
28136
  ],
28137
  "logging_steps": 50,
28138
  "max_steps": 200000,
28139
+ "num_input_tokens_seen": 92262378048,
28140
  "num_train_epochs": 5,
28141
  "save_steps": 1000,
28142
  "stateful_callbacks": {
 
28151
  "attributes": {}
28152
  }
28153
  },
28154
+ "total_flos": 1.632875804872041e+20,
28155
  "train_batch_size": 32,
28156
  "trial_name": null,
28157
  "trial_params": null