Azrail commited on
Commit
a5d2b4c
·
verified ·
1 Parent(s): 8d3fdee

Training in progress, step 118000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0ccf8d1ee3da4942ba95f7a3a54578d6c16809257e74ad1be0b26812641e3056
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cb226fadbf28661b9371114993dc12e49ac5975cdb3cc0b050988cda066eda63
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5f1a7487954ffb44d1bab57c681b14f7a5680ded0c52a6c8bb015865beff7ed1
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:619f8c200e9aaadfdae5aad82237b7f7ba5a625617b8275ba58c98a0a1cd45f8
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:48498b576bbabf1971bbdc1b63e18da5e5d6ff6ee2d2893d269ddf346414745c
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:eadabea5b840d3b07e42e9e423397807b167316e75ece7076e65c7e1fda35503
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:3f7ac3b8ebf1c0d4bfd4f038411c119a54a5a538a834ebe005f085cdf984be31
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8c77e72696edbb72e0b5c20319181466e8d1ea3a266d160a365b8e9afc9f97b0
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1161882728931396,
6
  "eval_steps": 500,
7
- "global_step": 117000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -20834,11 +20834,189 @@
20834
  "eval_steps_per_second": 15.104,
20835
  "num_input_tokens_seen": 61331831488,
20836
  "step": 117000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20837
  }
20838
  ],
20839
  "logging_steps": 50,
20840
  "max_steps": 140000,
20841
- "num_input_tokens_seen": 61331831488,
20842
  "num_train_epochs": 2,
20843
  "save_steps": 1000,
20844
  "stateful_callbacks": {
@@ -20853,7 +21031,7 @@
20853
  "attributes": {}
20854
  }
20855
  },
20856
- "total_flos": 1.0854615480769659e+20,
20857
  "train_batch_size": 32,
20858
  "trial_name": null,
20859
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.1257283232169049,
6
  "eval_steps": 500,
7
+ "global_step": 118000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
20834
  "eval_steps_per_second": 15.104,
20835
  "num_input_tokens_seen": 61331831488,
20836
  "step": 117000
20837
+ },
20838
+ {
20839
+ "epoch": 1.1166652754093278,
20840
+ "grad_norm": 0.13141483068466187,
20841
+ "learning_rate": 0.0009218630989585645,
20842
+ "loss": 2.0933,
20843
+ "num_input_tokens_seen": 61358045888,
20844
+ "step": 117050
20845
+ },
20846
+ {
20847
+ "epoch": 1.117142277925516,
20848
+ "grad_norm": 0.14495305716991425,
20849
+ "learning_rate": 0.0009203508214822651,
20850
+ "loss": 2.0864,
20851
+ "num_input_tokens_seen": 61384257568,
20852
+ "step": 117100
20853
+ },
20854
+ {
20855
+ "epoch": 1.1176192804417044,
20856
+ "grad_norm": 0.14642465114593506,
20857
+ "learning_rate": 0.0009188253147794443,
20858
+ "loss": 2.0918,
20859
+ "num_input_tokens_seen": 61410471968,
20860
+ "step": 117150
20861
+ },
20862
+ {
20863
+ "epoch": 1.1180962829578927,
20864
+ "grad_norm": 0.13314634561538696,
20865
+ "learning_rate": 0.0009172866268606513,
20866
+ "loss": 2.0896,
20867
+ "num_input_tokens_seen": 61436668768,
20868
+ "step": 117200
20869
+ },
20870
+ {
20871
+ "epoch": 1.1185732854740809,
20872
+ "grad_norm": 0.15387175977230072,
20873
+ "learning_rate": 0.0009157348061512727,
20874
+ "loss": 2.0771,
20875
+ "num_input_tokens_seen": 61462881056,
20876
+ "step": 117250
20877
+ },
20878
+ {
20879
+ "epoch": 1.119050287990269,
20880
+ "grad_norm": 0.13886821269989014,
20881
+ "learning_rate": 0.0009141699014900082,
20882
+ "loss": 2.0945,
20883
+ "num_input_tokens_seen": 61489085536,
20884
+ "step": 117300
20885
+ },
20886
+ {
20887
+ "epoch": 1.1195272905064575,
20888
+ "grad_norm": 0.13939301669597626,
20889
+ "learning_rate": 0.0009125919621273348,
20890
+ "loss": 2.0918,
20891
+ "num_input_tokens_seen": 61515286016,
20892
+ "step": 117350
20893
+ },
20894
+ {
20895
+ "epoch": 1.1200042930226457,
20896
+ "grad_norm": 0.1996990144252777,
20897
+ "learning_rate": 0.0009110010377239551,
20898
+ "loss": 2.0859,
20899
+ "num_input_tokens_seen": 61541500416,
20900
+ "step": 117400
20901
+ },
20902
+ {
20903
+ "epoch": 1.120481295538834,
20904
+ "grad_norm": 0.135545015335083,
20905
+ "learning_rate": 0.0009093971783492354,
20906
+ "loss": 2.089,
20907
+ "num_input_tokens_seen": 61567714816,
20908
+ "step": 117450
20909
+ },
20910
+ {
20911
+ "epoch": 1.1209582980550223,
20912
+ "grad_norm": 0.1394105702638626,
20913
+ "learning_rate": 0.0009077804344796301,
20914
+ "loss": 2.0759,
20915
+ "num_input_tokens_seen": 61593927520,
20916
+ "step": 117500
20917
+ },
20918
+ {
20919
+ "epoch": 1.1209582980550223,
20920
+ "eval_loss": 2.003880739212036,
20921
+ "eval_runtime": 83.0803,
20922
+ "eval_samples_per_second": 60.183,
20923
+ "eval_steps_per_second": 15.046,
20924
+ "num_input_tokens_seen": 61593927520,
20925
+ "step": 117500
20926
+ },
20927
+ {
20928
+ "epoch": 1.1214353005712105,
20929
+ "grad_norm": 0.1590648591518402,
20930
+ "learning_rate": 0.0009061508569970925,
20931
+ "loss": 2.0825,
20932
+ "num_input_tokens_seen": 61620139072,
20933
+ "step": 117550
20934
+ },
20935
+ {
20936
+ "epoch": 1.1219123030873988,
20937
+ "grad_norm": 0.13328000903129578,
20938
+ "learning_rate": 0.0009045084971874737,
20939
+ "loss": 2.0877,
20940
+ "num_input_tokens_seen": 61646353472,
20941
+ "step": 117600
20942
+ },
20943
+ {
20944
+ "epoch": 1.122389305603587,
20945
+ "grad_norm": 0.13834019005298615,
20946
+ "learning_rate": 0.0009028534067389086,
20947
+ "loss": 2.0871,
20948
+ "num_input_tokens_seen": 61672566336,
20949
+ "step": 117650
20950
+ },
20951
+ {
20952
+ "epoch": 1.1228663081197754,
20953
+ "grad_norm": 0.13156409561634064,
20954
+ "learning_rate": 0.000901185637740189,
20955
+ "loss": 2.0906,
20956
+ "num_input_tokens_seen": 61698777696,
20957
+ "step": 117700
20958
+ },
20959
+ {
20960
+ "epoch": 1.1233433106359636,
20961
+ "grad_norm": 0.1528773009777069,
20962
+ "learning_rate": 0.0008995052426791246,
20963
+ "loss": 2.0731,
20964
+ "num_input_tokens_seen": 61724974336,
20965
+ "step": 117750
20966
+ },
20967
+ {
20968
+ "epoch": 1.1238203131521518,
20969
+ "grad_norm": 0.14865480363368988,
20970
+ "learning_rate": 0.0008978122744408905,
20971
+ "loss": 2.082,
20972
+ "num_input_tokens_seen": 61751177792,
20973
+ "step": 117800
20974
+ },
20975
+ {
20976
+ "epoch": 1.1242973156683402,
20977
+ "grad_norm": 0.14318804442882538,
20978
+ "learning_rate": 0.0008961067863063638,
20979
+ "loss": 2.0891,
20980
+ "num_input_tokens_seen": 61777391648,
20981
+ "step": 117850
20982
+ },
20983
+ {
20984
+ "epoch": 1.1247743181845284,
20985
+ "grad_norm": 0.14581789076328278,
20986
+ "learning_rate": 0.0008943888319504456,
20987
+ "loss": 2.0908,
20988
+ "num_input_tokens_seen": 61803602176,
20989
+ "step": 117900
20990
+ },
20991
+ {
20992
+ "epoch": 1.1252513207007167,
20993
+ "grad_norm": 0.14142882823944092,
20994
+ "learning_rate": 0.0008926584654403724,
20995
+ "loss": 2.0791,
20996
+ "num_input_tokens_seen": 61829816576,
20997
+ "step": 117950
20998
+ },
20999
+ {
21000
+ "epoch": 1.1257283232169049,
21001
+ "grad_norm": 0.15033917129039764,
21002
+ "learning_rate": 0.000890915741234015,
21003
+ "loss": 2.0801,
21004
+ "num_input_tokens_seen": 61856020192,
21005
+ "step": 118000
21006
+ },
21007
+ {
21008
+ "epoch": 1.1257283232169049,
21009
+ "eval_loss": 2.0019845962524414,
21010
+ "eval_runtime": 82.7188,
21011
+ "eval_samples_per_second": 60.446,
21012
+ "eval_steps_per_second": 15.111,
21013
+ "num_input_tokens_seen": 61856020192,
21014
+ "step": 118000
21015
  }
21016
  ],
21017
  "logging_steps": 50,
21018
  "max_steps": 140000,
21019
+ "num_input_tokens_seen": 61856020192,
21020
  "num_train_epochs": 2,
21021
  "save_steps": 1000,
21022
  "stateful_callbacks": {
 
21031
  "attributes": {}
21032
  }
21033
  },
21034
+ "total_flos": 1.0947387320175698e+20,
21035
  "train_batch_size": 32,
21036
  "trial_name": null,
21037
  "trial_params": null