Azrail committed on
Commit
fecf097
·
verified ·
1 Parent(s): 0728805

Training in progress, step 68000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4df562358f0b3d93fdb48e67f5210b057adeffd8b788222cd6d30c1e17d16a45
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b26db7188c89cde52f93cc8f561f4529a8702aaa52ce9c883892b96769dd603
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:095f32100e867e0fe913cd1c8e425177cd1f66e07c341665a191649c37a86bd3
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec885f087630fd98da5aea6a3b9af5bf67a1e0daf9ab5c57e09d7f1ac7385946
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2f0aa502d64898ee3e50486c039d0e2439e7552237090a80d559862b18540a7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb3d1fb9e8324a04c98053fb02a6fde8d1a865fd7ced6a674f76811c1bbb259f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:00a7e117096eaa1f05b475c020696dc81b37bf94c840c6a7b407a88337130d26
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4fb0106671a29e67305a03ecdd422ffd62f40cc2f3e19327fe3581d2d1603d90
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.31959168584614284,
6
  "eval_steps": 500,
7
- "global_step": 67000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11934,11 +11934,189 @@
11934
  "eval_steps_per_second": 23.351,
11935
  "num_input_tokens_seen": 17563643456,
11936
  "step": 67000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11937
  }
11938
  ],
11939
  "logging_steps": 50,
11940
  "max_steps": 70000,
11941
- "num_input_tokens_seen": 17563643456,
11942
  "num_train_epochs": 1,
11943
  "save_steps": 1000,
11944
  "stateful_callbacks": {
@@ -11953,7 +12131,7 @@
11953
  "attributes": {}
11954
  }
11955
  },
11956
- "total_flos": 4.698446045640131e+18,
11957
  "train_batch_size": 64,
11958
  "trial_name": null,
11959
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.3243617110080256,
6
  "eval_steps": 500,
7
+ "global_step": 68000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11934
  "eval_steps_per_second": 23.351,
11935
  "num_input_tokens_seen": 17563643456,
11936
  "step": 67000
11937
+ },
11938
+ {
11939
+ "epoch": 0.319830187104237,
11940
+ "grad_norm": 0.1690913438796997,
11941
+ "learning_rate": 0.00010561116804955451,
11942
+ "loss": 2.5364,
11943
+ "num_input_tokens_seen": 17576750656,
11944
+ "step": 67050
11945
+ },
11946
+ {
11947
+ "epoch": 0.3200686883623311,
11948
+ "grad_norm": 0.16436229646205902,
11949
+ "learning_rate": 0.00010218772555910954,
11950
+ "loss": 2.5298,
11951
+ "num_input_tokens_seen": 17589857856,
11952
+ "step": 67100
11953
+ },
11954
+ {
11955
+ "epoch": 0.32030718962042526,
11956
+ "grad_norm": 0.15499907732009888,
11957
+ "learning_rate": 9.881436225981105e-05,
11958
+ "loss": 2.5484,
11959
+ "num_input_tokens_seen": 17602965056,
11960
+ "step": 67150
11961
+ },
11962
+ {
11963
+ "epoch": 0.32054569087851936,
11964
+ "grad_norm": 0.16237874329090118,
11965
+ "learning_rate": 9.549150281252633e-05,
11966
+ "loss": 2.5271,
11967
+ "num_input_tokens_seen": 17616072256,
11968
+ "step": 67200
11969
+ },
11970
+ {
11971
+ "epoch": 0.3207841921366135,
11972
+ "grad_norm": 0.16813968122005463,
11973
+ "learning_rate": 9.221956552036992e-05,
11974
+ "loss": 2.5295,
11975
+ "num_input_tokens_seen": 17629179456,
11976
+ "step": 67250
11977
+ },
11978
+ {
11979
+ "epoch": 0.3210226933947077,
11980
+ "grad_norm": 0.15672080218791962,
11981
+ "learning_rate": 8.899896227604509e-05,
11982
+ "loss": 2.528,
11983
+ "num_input_tokens_seen": 17642286656,
11984
+ "step": 67300
11985
+ },
11986
+ {
11987
+ "epoch": 0.3212611946528018,
11988
+ "grad_norm": 0.16523708403110504,
11989
+ "learning_rate": 8.58300985099918e-05,
11990
+ "loss": 2.5288,
11991
+ "num_input_tokens_seen": 17655393856,
11992
+ "step": 67350
11993
+ },
11994
+ {
11995
+ "epoch": 0.32149969591089594,
11996
+ "grad_norm": 0.16759687662124634,
11997
+ "learning_rate": 8.271337313934868e-05,
11998
+ "loss": 2.5431,
11999
+ "num_input_tokens_seen": 17668501056,
12000
+ "step": 67400
12001
+ },
12002
+ {
12003
+ "epoch": 0.32173819716899005,
12004
+ "grad_norm": 0.15507538616657257,
12005
+ "learning_rate": 7.964917851773496e-05,
12006
+ "loss": 2.5342,
12007
+ "num_input_tokens_seen": 17681608256,
12008
+ "step": 67450
12009
+ },
12010
+ {
12011
+ "epoch": 0.3219766984270842,
12012
+ "grad_norm": 0.1556961089372635,
12013
+ "learning_rate": 7.663790038585794e-05,
12014
+ "loss": 2.5189,
12015
+ "num_input_tokens_seen": 17694715456,
12016
+ "step": 67500
12017
+ },
12018
+ {
12019
+ "epoch": 0.3219766984270842,
12020
+ "eval_loss": 2.415555000305176,
12021
+ "eval_runtime": 53.2935,
12022
+ "eval_samples_per_second": 93.82,
12023
+ "eval_steps_per_second": 23.455,
12024
+ "num_input_tokens_seen": 17694715456,
12025
+ "step": 67500
12026
+ },
12027
+ {
12028
+ "epoch": 0.32221519968517837,
12029
+ "grad_norm": 0.16804397106170654,
12030
+ "learning_rate": 7.367991782295391e-05,
12031
+ "loss": 2.5218,
12032
+ "num_input_tokens_seen": 17707822656,
12033
+ "step": 67550
12034
+ },
12035
+ {
12036
+ "epoch": 0.32245370094327247,
12037
+ "grad_norm": 0.15728074312210083,
12038
+ "learning_rate": 7.077560319906695e-05,
12039
+ "loss": 2.5261,
12040
+ "num_input_tokens_seen": 17720929856,
12041
+ "step": 67600
12042
+ },
12043
+ {
12044
+ "epoch": 0.32269220220136663,
12045
+ "grad_norm": 0.1641319841146469,
12046
+ "learning_rate": 6.792532212817271e-05,
12047
+ "loss": 2.5398,
12048
+ "num_input_tokens_seen": 17734037056,
12049
+ "step": 67650
12050
+ },
12051
+ {
12052
+ "epoch": 0.32293070345946073,
12053
+ "grad_norm": 0.1575596034526825,
12054
+ "learning_rate": 6.512943342215233e-05,
12055
+ "loss": 2.5211,
12056
+ "num_input_tokens_seen": 17747144256,
12057
+ "step": 67700
12058
+ },
12059
+ {
12060
+ "epoch": 0.3231692047175549,
12061
+ "grad_norm": 0.16352206468582153,
12062
+ "learning_rate": 6.238828904562316e-05,
12063
+ "loss": 2.5143,
12064
+ "num_input_tokens_seen": 17760251456,
12065
+ "step": 67750
12066
+ },
12067
+ {
12068
+ "epoch": 0.323407705975649,
12069
+ "grad_norm": 0.16303551197052002,
12070
+ "learning_rate": 5.9702234071631e-05,
12071
+ "loss": 2.5262,
12072
+ "num_input_tokens_seen": 17773358656,
12073
+ "step": 67800
12074
+ },
12075
+ {
12076
+ "epoch": 0.32364620723374316,
12077
+ "grad_norm": 0.15572308003902435,
12078
+ "learning_rate": 5.7071606638210094e-05,
12079
+ "loss": 2.5278,
12080
+ "num_input_tokens_seen": 17786465856,
12081
+ "step": 67850
12082
+ },
12083
+ {
12084
+ "epoch": 0.3238847084918373,
12085
+ "grad_norm": 0.15960544347763062,
12086
+ "learning_rate": 5.449673790581611e-05,
12087
+ "loss": 2.522,
12088
+ "num_input_tokens_seen": 17799573056,
12089
+ "step": 67900
12090
+ },
12091
+ {
12092
+ "epoch": 0.3241232097499314,
12093
+ "grad_norm": 0.15617695450782776,
12094
+ "learning_rate": 5.197795201563743e-05,
12095
+ "loss": 2.5151,
12096
+ "num_input_tokens_seen": 17812680256,
12097
+ "step": 67950
12098
+ },
12099
+ {
12100
+ "epoch": 0.3243617110080256,
12101
+ "grad_norm": 0.1527390033006668,
12102
+ "learning_rate": 4.9515566048790485e-05,
12103
+ "loss": 2.5213,
12104
+ "num_input_tokens_seen": 17825787456,
12105
+ "step": 68000
12106
+ },
12107
+ {
12108
+ "epoch": 0.3243617110080256,
12109
+ "eval_loss": 2.4139962196350098,
12110
+ "eval_runtime": 53.933,
12111
+ "eval_samples_per_second": 92.708,
12112
+ "eval_steps_per_second": 23.177,
12113
+ "num_input_tokens_seen": 17825787456,
12114
+ "step": 68000
12115
  }
12116
  ],
12117
  "logging_steps": 50,
12118
  "max_steps": 70000,
12119
+ "num_input_tokens_seen": 17825787456,
12120
  "num_train_epochs": 1,
12121
  "save_steps": 1000,
12122
  "stateful_callbacks": {
 
12131
  "attributes": {}
12132
  }
12133
  },
12134
+ "total_flos": 4.768572124165571e+18,
12135
  "train_batch_size": 64,
12136
  "trial_name": null,
12137
  "trial_params": null