Azrail commited on
Commit
f835372
·
verified ·
1 Parent(s): 2adb3c0

Training in progress, step 68000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2b6d95b7e811d1f68b64bc7cb8a6aa2be60af9ae27cf26bbdeedecc87fc96939
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88085ee37b0edacc225a0fb86ed3cfd9ddce1ecb2e83ddb9feeeb81a70bb80bd
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4764d7e2e901d9dd421188980b44c73e20159a2b530b5e58e042540dbd4ca383
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b2f336afb5813ccf452282223e763afdce040692a315590bb908f2063975a3f
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f2f0aa502d64898ee3e50486c039d0e2439e7552237090a80d559862b18540a7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eb3d1fb9e8324a04c98053fb02a6fde8d1a865fd7ced6a674f76811c1bbb259f
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c8f163bf0d684bb1f1d6d058d310158a309f623a594242fc874446ccea1105f8
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:215e906fb9e492afed15b6bbd2ab828199f0238620feca89e4e09f3e2ffc4109
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.4506754245682008,
6
  "eval_steps": 500,
7
- "global_step": 67000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -11934,11 +11934,189 @@
11934
  "eval_steps_per_second": 23.475,
11935
  "num_input_tokens_seen": 17563648000,
11936
  "step": 67000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11937
  }
11938
  ],
11939
  "logging_steps": 50,
11940
  "max_steps": 70000,
11941
- "num_input_tokens_seen": 17563648000,
11942
  "num_train_epochs": 1,
11943
  "save_steps": 1000,
11944
  "stateful_callbacks": {
@@ -11953,7 +12131,7 @@
11953
  "attributes": {}
11954
  }
11955
  },
11956
- "total_flos": 4.69844726120448e+18,
11957
  "train_batch_size": 64,
11958
  "trial_name": null,
11959
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.4574019234423531,
6
  "eval_steps": 500,
7
+ "global_step": 68000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
11934
  "eval_steps_per_second": 23.475,
11935
  "num_input_tokens_seen": 17563648000,
11936
  "step": 67000
11937
+ },
11938
+ {
11939
+ "epoch": 0.4510117495119084,
11940
+ "grad_norm": 0.14675357937812805,
11941
+ "learning_rate": 8.155812334579532e-05,
11942
+ "loss": 2.9682,
11943
+ "num_input_tokens_seen": 17576755200,
11944
+ "step": 67050
11945
+ },
11946
+ {
11947
+ "epoch": 0.45134807445561603,
11948
+ "grad_norm": 0.14341385662555695,
11949
+ "learning_rate": 7.889138314185678e-05,
11950
+ "loss": 2.9749,
11951
+ "num_input_tokens_seen": 17589862400,
11952
+ "step": 67100
11953
+ },
11954
+ {
11955
+ "epoch": 0.45168439939932364,
11956
+ "grad_norm": 0.1442009061574936,
11957
+ "learning_rate": 7.626523026288279e-05,
11958
+ "loss": 2.9637,
11959
+ "num_input_tokens_seen": 17602969600,
11960
+ "step": 67150
11961
+ },
11962
+ {
11963
+ "epoch": 0.45202072434303125,
11964
+ "grad_norm": 0.14580078423023224,
11965
+ "learning_rate": 7.367991782295391e-05,
11966
+ "loss": 2.9636,
11967
+ "num_input_tokens_seen": 17616076800,
11968
+ "step": 67200
11969
+ },
11970
+ {
11971
+ "epoch": 0.45235704928673887,
11972
+ "grad_norm": 0.13888555765151978,
11973
+ "learning_rate": 7.1135694999864e-05,
11974
+ "loss": 2.9737,
11975
+ "num_input_tokens_seen": 17629184000,
11976
+ "step": 67250
11977
+ },
11978
+ {
11979
+ "epoch": 0.4526933742304465,
11980
+ "grad_norm": 0.14820803701877594,
11981
+ "learning_rate": 6.863280701110408e-05,
11982
+ "loss": 2.9778,
11983
+ "num_input_tokens_seen": 17642291200,
11984
+ "step": 67300
11985
+ },
11986
+ {
11987
+ "epoch": 0.4530296991741541,
11988
+ "grad_norm": 0.14933691918849945,
11989
+ "learning_rate": 6.617149509022808e-05,
11990
+ "loss": 2.9667,
11991
+ "num_input_tokens_seen": 17655398400,
11992
+ "step": 67350
11993
+ },
11994
+ {
11995
+ "epoch": 0.4533660241178617,
11996
+ "grad_norm": 0.14829853177070618,
11997
+ "learning_rate": 6.375199646360142e-05,
11998
+ "loss": 2.9691,
11999
+ "num_input_tokens_seen": 17668505600,
12000
+ "step": 67400
12001
+ },
12002
+ {
12003
+ "epoch": 0.4537023490615693,
12004
+ "grad_norm": 0.14731477200984955,
12005
+ "learning_rate": 6.137454432753797e-05,
12006
+ "loss": 2.9731,
12007
+ "num_input_tokens_seen": 17681612800,
12008
+ "step": 67450
12009
+ },
12010
+ {
12011
+ "epoch": 0.4540386740052769,
12012
+ "grad_norm": 0.14357906579971313,
12013
+ "learning_rate": 5.903936782582253e-05,
12014
+ "loss": 2.9785,
12015
+ "num_input_tokens_seen": 17694720000,
12016
+ "step": 67500
12017
+ },
12018
+ {
12019
+ "epoch": 0.4540386740052769,
12020
+ "eval_loss": 2.867840528488159,
12021
+ "eval_runtime": 53.8197,
12022
+ "eval_samples_per_second": 92.903,
12023
+ "eval_steps_per_second": 23.226,
12024
+ "num_input_tokens_seen": 17694720000,
12025
+ "step": 67500
12026
+ },
12027
+ {
12028
+ "epoch": 0.45437499894898453,
12029
+ "grad_norm": 0.1438903659582138,
12030
+ "learning_rate": 5.6746692027626835e-05,
12031
+ "loss": 2.9733,
12032
+ "num_input_tokens_seen": 17707827200,
12033
+ "step": 67550
12034
+ },
12035
+ {
12036
+ "epoch": 0.45471132389269214,
12037
+ "grad_norm": 0.14171506464481354,
12038
+ "learning_rate": 5.449673790581611e-05,
12039
+ "loss": 2.9637,
12040
+ "num_input_tokens_seen": 17720934400,
12041
+ "step": 67600
12042
+ },
12043
+ {
12044
+ "epoch": 0.45504764883639975,
12045
+ "grad_norm": 0.1645549088716507,
12046
+ "learning_rate": 5.2289722315651546e-05,
12047
+ "loss": 2.9668,
12048
+ "num_input_tokens_seen": 17734041600,
12049
+ "step": 67650
12050
+ },
12051
+ {
12052
+ "epoch": 0.45538397378010737,
12053
+ "grad_norm": 0.1390199065208435,
12054
+ "learning_rate": 5.0125857973889355e-05,
12055
+ "loss": 2.9762,
12056
+ "num_input_tokens_seen": 17747148800,
12057
+ "step": 67700
12058
+ },
12059
+ {
12060
+ "epoch": 0.455720298723815,
12061
+ "grad_norm": 0.14667369425296783,
12062
+ "learning_rate": 4.800535343827833e-05,
12063
+ "loss": 2.9724,
12064
+ "num_input_tokens_seen": 17760256000,
12065
+ "step": 67750
12066
+ },
12067
+ {
12068
+ "epoch": 0.4560566236675226,
12069
+ "grad_norm": 0.14203302562236786,
12070
+ "learning_rate": 4.592841308745932e-05,
12071
+ "loss": 2.9679,
12072
+ "num_input_tokens_seen": 17773363200,
12073
+ "step": 67800
12074
+ },
12075
+ {
12076
+ "epoch": 0.45639294861123025,
12077
+ "grad_norm": 0.1517883837223053,
12078
+ "learning_rate": 4.389523710126619e-05,
12079
+ "loss": 2.9723,
12080
+ "num_input_tokens_seen": 17786470400,
12081
+ "step": 67850
12082
+ },
12083
+ {
12084
+ "epoch": 0.45672927355493786,
12085
+ "grad_norm": 0.1438019722700119,
12086
+ "learning_rate": 4.190602144143207e-05,
12087
+ "loss": 2.973,
12088
+ "num_input_tokens_seen": 17799577600,
12089
+ "step": 67900
12090
+ },
12091
+ {
12092
+ "epoch": 0.4570655984986455,
12093
+ "grad_norm": 0.14281606674194336,
12094
+ "learning_rate": 3.9960957832702595e-05,
12095
+ "loss": 2.9733,
12096
+ "num_input_tokens_seen": 17812684800,
12097
+ "step": 67950
12098
+ },
12099
+ {
12100
+ "epoch": 0.4574019234423531,
12101
+ "grad_norm": 0.14911025762557983,
12102
+ "learning_rate": 3.806023374435663e-05,
12103
+ "loss": 2.9724,
12104
+ "num_input_tokens_seen": 17825792000,
12105
+ "step": 68000
12106
+ },
12107
+ {
12108
+ "epoch": 0.4574019234423531,
12109
+ "eval_loss": 2.8663442134857178,
12110
+ "eval_runtime": 53.8853,
12111
+ "eval_samples_per_second": 92.79,
12112
+ "eval_steps_per_second": 23.197,
12113
+ "num_input_tokens_seen": 17825792000,
12114
+ "step": 68000
12115
  }
12116
  ],
12117
  "logging_steps": 50,
12118
  "max_steps": 70000,
12119
+ "num_input_tokens_seen": 17825792000,
12120
  "num_train_epochs": 1,
12121
  "save_steps": 1000,
12122
  "stateful_callbacks": {
 
12131
  "attributes": {}
12132
  }
12133
  },
12134
+ "total_flos": 4.76857333972992e+18,
12135
  "train_batch_size": 64,
12136
  "trial_name": null,
12137
  "trial_params": null