Azrail committed on
Commit
f483548
·
verified ·
1 Parent(s): 4bd63bd

Training in progress, step 45000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:9605858ca8b64eb89cb8c33fd56e7ec671551b1e5005f2598e074ca5b397cafd
3
  size 301235464
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c3ffbf5a816a6aa824466bdde4390b737dfef3183acb26f39844f7b4017bf30d
3
  size 301235464
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:e2584fadfe92de84830b6f68a11ff9f4508f42d733151a8e29faa8885164fa9e
3
  size 602335994
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c6b19fbbf1f84052b99affd1a4abf045aa0dc4dae5e3396c29093fc71d96182f
3
  size 602335994
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5c93fe38009a049e639e4ec9c47956d4822c559f5ecfd6d8454c217a91259ec7
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c2efdaece0c1a392cf0dde4c3fd595f174e50c13358c4a6e5301669f684c3b3b
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5356afb30d3aa5783dfb45e83d3ec8fbfdbc01397770efc134aa996a2dcb7311
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4373b3ab47408a8ab65ab61c7aee7bfdf3c940344f36a198973da2bfc9da86a8
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.20988110712284008,
6
  "eval_steps": 500,
7
- "global_step": 44000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7840,11 +7840,189 @@
7840
  "eval_steps_per_second": 24.302,
7841
  "num_input_tokens_seen": 11534331456,
7842
  "step": 44000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7843
  }
7844
  ],
7845
  "logging_steps": 50,
7846
  "max_steps": 70000,
7847
- "num_input_tokens_seen": 11534331456,
7848
  "num_train_epochs": 1,
7849
  "save_steps": 1000,
7850
  "stateful_callbacks": {
@@ -7859,7 +8037,7 @@
7859
  "attributes": {}
7860
  }
7861
  },
7862
- "total_flos": 3.0855462395550106e+18,
7863
  "train_batch_size": 64,
7864
  "trial_name": null,
7865
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.2146511322847228,
6
  "eval_steps": 500,
7
+ "global_step": 45000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7840
  "eval_steps_per_second": 24.302,
7841
  "num_input_tokens_seen": 11534331456,
7842
  "step": 44000
7843
+ },
7844
+ {
7845
+ "epoch": 0.21011960838093421,
7846
+ "grad_norm": 0.19737008213996887,
7847
+ "learning_rate": 0.001,
7848
+ "loss": 2.6272,
7849
+ "num_input_tokens_seen": 11547438656,
7850
+ "step": 44050
7851
+ },
7852
+ {
7853
+ "epoch": 0.21035810963902835,
7854
+ "grad_norm": 0.1984977424144745,
7855
+ "learning_rate": 0.001,
7856
+ "loss": 2.6417,
7857
+ "num_input_tokens_seen": 11560545856,
7858
+ "step": 44100
7859
+ },
7860
+ {
7861
+ "epoch": 0.21059661089712248,
7862
+ "grad_norm": 0.19575904309749603,
7863
+ "learning_rate": 0.001,
7864
+ "loss": 2.6277,
7865
+ "num_input_tokens_seen": 11573653056,
7866
+ "step": 44150
7867
+ },
7868
+ {
7869
+ "epoch": 0.2108351121552166,
7870
+ "grad_norm": 0.19875651597976685,
7871
+ "learning_rate": 0.001,
7872
+ "loss": 2.6362,
7873
+ "num_input_tokens_seen": 11586760256,
7874
+ "step": 44200
7875
+ },
7876
+ {
7877
+ "epoch": 0.21107361341331077,
7878
+ "grad_norm": 0.20936185121536255,
7879
+ "learning_rate": 0.001,
7880
+ "loss": 2.6217,
7881
+ "num_input_tokens_seen": 11599867456,
7882
+ "step": 44250
7883
+ },
7884
+ {
7885
+ "epoch": 0.2113121146714049,
7886
+ "grad_norm": 0.19474463164806366,
7887
+ "learning_rate": 0.001,
7888
+ "loss": 2.6235,
7889
+ "num_input_tokens_seen": 11612974656,
7890
+ "step": 44300
7891
+ },
7892
+ {
7893
+ "epoch": 0.21155061592949903,
7894
+ "grad_norm": 0.20833207666873932,
7895
+ "learning_rate": 0.001,
7896
+ "loss": 2.6,
7897
+ "num_input_tokens_seen": 11626081856,
7898
+ "step": 44350
7899
+ },
7900
+ {
7901
+ "epoch": 0.21178911718759316,
7902
+ "grad_norm": 0.19269512593746185,
7903
+ "learning_rate": 0.001,
7904
+ "loss": 2.6211,
7905
+ "num_input_tokens_seen": 11639189056,
7906
+ "step": 44400
7907
+ },
7908
+ {
7909
+ "epoch": 0.2120276184456873,
7910
+ "grad_norm": 0.21018226444721222,
7911
+ "learning_rate": 0.001,
7912
+ "loss": 2.6294,
7913
+ "num_input_tokens_seen": 11652296256,
7914
+ "step": 44450
7915
+ },
7916
+ {
7917
+ "epoch": 0.21226611970378143,
7918
+ "grad_norm": 0.19836543500423431,
7919
+ "learning_rate": 0.001,
7920
+ "loss": 2.6051,
7921
+ "num_input_tokens_seen": 11665403456,
7922
+ "step": 44500
7923
+ },
7924
+ {
7925
+ "epoch": 0.21226611970378143,
7926
+ "eval_loss": 2.499817132949829,
7927
+ "eval_runtime": 50.9003,
7928
+ "eval_samples_per_second": 98.231,
7929
+ "eval_steps_per_second": 24.558,
7930
+ "num_input_tokens_seen": 11665403456,
7931
+ "step": 44500
7932
+ },
7933
+ {
7934
+ "epoch": 0.21250462096187558,
7935
+ "grad_norm": 0.18411967158317566,
7936
+ "learning_rate": 0.001,
7937
+ "loss": 2.6228,
7938
+ "num_input_tokens_seen": 11678510656,
7939
+ "step": 44550
7940
+ },
7941
+ {
7942
+ "epoch": 0.21274312221996972,
7943
+ "grad_norm": 0.19387467205524445,
7944
+ "learning_rate": 0.001,
7945
+ "loss": 2.5902,
7946
+ "num_input_tokens_seen": 11691617856,
7947
+ "step": 44600
7948
+ },
7949
+ {
7950
+ "epoch": 0.21298162347806385,
7951
+ "grad_norm": 0.22076952457427979,
7952
+ "learning_rate": 0.001,
7953
+ "loss": 2.613,
7954
+ "num_input_tokens_seen": 11704725056,
7955
+ "step": 44650
7956
+ },
7957
+ {
7958
+ "epoch": 0.21322012473615798,
7959
+ "grad_norm": 0.33861082792282104,
7960
+ "learning_rate": 0.001,
7961
+ "loss": 2.6142,
7962
+ "num_input_tokens_seen": 11717832256,
7963
+ "step": 44700
7964
+ },
7965
+ {
7966
+ "epoch": 0.2134586259942521,
7967
+ "grad_norm": 0.20097902417182922,
7968
+ "learning_rate": 0.001,
7969
+ "loss": 2.6549,
7970
+ "num_input_tokens_seen": 11730939456,
7971
+ "step": 44750
7972
+ },
7973
+ {
7974
+ "epoch": 0.21369712725234627,
7975
+ "grad_norm": 0.24534635245800018,
7976
+ "learning_rate": 0.001,
7977
+ "loss": 2.6293,
7978
+ "num_input_tokens_seen": 11744046656,
7979
+ "step": 44800
7980
+ },
7981
+ {
7982
+ "epoch": 0.2139356285104404,
7983
+ "grad_norm": 0.2439020723104477,
7984
+ "learning_rate": 0.001,
7985
+ "loss": 2.635,
7986
+ "num_input_tokens_seen": 11757153856,
7987
+ "step": 44850
7988
+ },
7989
+ {
7990
+ "epoch": 0.21417412976853453,
7991
+ "grad_norm": 0.24259154498577118,
7992
+ "learning_rate": 0.001,
7993
+ "loss": 2.6232,
7994
+ "num_input_tokens_seen": 11770261056,
7995
+ "step": 44900
7996
+ },
7997
+ {
7998
+ "epoch": 0.21441263102662866,
7999
+ "grad_norm": 0.23554636538028717,
8000
+ "learning_rate": 0.001,
8001
+ "loss": 2.6061,
8002
+ "num_input_tokens_seen": 11783368256,
8003
+ "step": 44950
8004
+ },
8005
+ {
8006
+ "epoch": 0.2146511322847228,
8007
+ "grad_norm": 0.20377275347709656,
8008
+ "learning_rate": 0.001,
8009
+ "loss": 2.6156,
8010
+ "num_input_tokens_seen": 11796475456,
8011
+ "step": 45000
8012
+ },
8013
+ {
8014
+ "epoch": 0.2146511322847228,
8015
+ "eval_loss": 2.503781318664551,
8016
+ "eval_runtime": 51.1656,
8017
+ "eval_samples_per_second": 97.722,
8018
+ "eval_steps_per_second": 24.43,
8019
+ "num_input_tokens_seen": 11796475456,
8020
+ "step": 45000
8021
  }
8022
  ],
8023
  "logging_steps": 50,
8024
  "max_steps": 70000,
8025
+ "num_input_tokens_seen": 11796475456,
8026
  "num_train_epochs": 1,
8027
  "save_steps": 1000,
8028
  "stateful_callbacks": {
 
8037
  "attributes": {}
8038
  }
8039
  },
8040
+ "total_flos": 3.1556723180804506e+18,
8041
  "train_batch_size": 64,
8042
  "trial_name": null,
8043
  "trial_params": null