Azrail commited on
Commit
083ed97
·
verified ·
1 Parent(s): c49d5ba

Training in progress, step 141000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:b6e43382fe5ddb78fed06a23ba6c7b8489c50f8ee7949d8db86e49cd8910036e
3
  size 1410301944
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a27f87288d8d797a749da5bf4d352cdabd92413a2e35e052af216c7df1f69945
3
  size 1410301944
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1c18874d88aac76ea7c7006e997509fca95df88b10d2c13b5a6816de7643ed6e
3
  size 2820185786
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36756e75b3466f2e619ffcb01fde732bcbed6a8bb6e17f933bd8b701f263e4f2
3
  size 2820185786
last-checkpoint/rng_state.pth CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:82993dca9aea22266a253201514efb5478f36bf5a374573dc48fbab5e03c52d6
3
  size 14244
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:95aeb3e8ddbb19f44b8ac55566129494d59b1f0669d87d7f6b45254087f1767e
3
  size 14244
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bf74877c1fcc66d6df58cb7c2b28db5c3be81aec77034ec2a9ace3e30449eb22
3
  size 1064
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e7f2c2062cd4eab2105e1d3af30621ba0055a18128fac0ce700a512d64dfcfc4
3
  size 1064
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.335609430339745,
6
  "eval_steps": 500,
7
- "global_step": 140000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -24928,12 +24928,190 @@
24928
  "eval_steps_per_second": 15.131,
24929
  "num_input_tokens_seen": 73388446624,
24930
  "step": 140000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24931
  }
24932
  ],
24933
  "logging_steps": 50,
24934
- "max_steps": 140000,
24935
- "num_input_tokens_seen": 73388446624,
24936
- "num_train_epochs": 2,
24937
  "save_steps": 1000,
24938
  "stateful_callbacks": {
24939
  "TrainerControl": {
@@ -24942,12 +25120,12 @@
24942
  "should_evaluate": false,
24943
  "should_log": false,
24944
  "should_save": true,
24945
- "should_training_stop": true
24946
  },
24947
  "attributes": {}
24948
  }
24949
  },
24950
- "total_flos": 1.2988416447181578e+20,
24951
  "train_batch_size": 32,
24952
  "trial_name": null,
24953
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 3.097265706246529,
6
  "eval_steps": 500,
7
+ "global_step": 141000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
24928
  "eval_steps_per_second": 15.131,
24929
  "num_input_tokens_seen": 73388446624,
24930
  "step": 140000
24931
+ },
24932
+ {
24933
+ "epoch": 3.076397950841334,
24934
+ "grad_norm": 0.09252593666315079,
24935
+ "learning_rate": 0.0001,
24936
+ "loss": 2.3535,
24937
+ "num_input_tokens_seen": 73440875424,
24938
+ "step": 140050
24939
+ },
24940
+ {
24941
+ "epoch": 3.0774962537573973,
24942
+ "grad_norm": 0.08520153909921646,
24943
+ "learning_rate": 0.0001,
24944
+ "loss": 2.3529,
24945
+ "num_input_tokens_seen": 73493304224,
24946
+ "step": 140100
24947
+ },
24948
+ {
24949
+ "epoch": 3.07859455667346,
24950
+ "grad_norm": 0.09475487470626831,
24951
+ "learning_rate": 0.0001,
24952
+ "loss": 2.3539,
24953
+ "num_input_tokens_seen": 73545729952,
24954
+ "step": 140150
24955
+ },
24956
+ {
24957
+ "epoch": 3.079692859589523,
24958
+ "grad_norm": 0.08525670319795609,
24959
+ "learning_rate": 0.0001,
24960
+ "loss": 2.3603,
24961
+ "num_input_tokens_seen": 73598155232,
24962
+ "step": 140200
24963
+ },
24964
+ {
24965
+ "epoch": 3.080791162505586,
24966
+ "grad_norm": 0.09414695203304291,
24967
+ "learning_rate": 0.0001,
24968
+ "loss": 2.3596,
24969
+ "num_input_tokens_seen": 73650584032,
24970
+ "step": 140250
24971
+ },
24972
+ {
24973
+ "epoch": 3.0818894654216487,
24974
+ "grad_norm": 0.08829599618911743,
24975
+ "learning_rate": 0.0001,
24976
+ "loss": 2.3582,
24977
+ "num_input_tokens_seen": 73703009408,
24978
+ "step": 140300
24979
+ },
24980
+ {
24981
+ "epoch": 3.082987768337712,
24982
+ "grad_norm": 0.08346480131149292,
24983
+ "learning_rate": 0.0001,
24984
+ "loss": 2.3473,
24985
+ "num_input_tokens_seen": 73755435104,
24986
+ "step": 140350
24987
+ },
24988
+ {
24989
+ "epoch": 3.0840860712537745,
24990
+ "grad_norm": 0.09302923828363419,
24991
+ "learning_rate": 0.0001,
24992
+ "loss": 2.3555,
24993
+ "num_input_tokens_seen": 73807860000,
24994
+ "step": 140400
24995
+ },
24996
+ {
24997
+ "epoch": 3.0851843741698373,
24998
+ "grad_norm": 0.08695721626281738,
24999
+ "learning_rate": 0.0001,
25000
+ "loss": 2.3578,
25001
+ "num_input_tokens_seen": 73860288800,
25002
+ "step": 140450
25003
+ },
25004
+ {
25005
+ "epoch": 3.0862826770859004,
25006
+ "grad_norm": 0.09424284100532532,
25007
+ "learning_rate": 0.0001,
25008
+ "loss": 2.3523,
25009
+ "num_input_tokens_seen": 73912717600,
25010
+ "step": 140500
25011
+ },
25012
+ {
25013
+ "epoch": 3.0862826770859004,
25014
+ "eval_loss": 2.2698493003845215,
25015
+ "eval_runtime": 81.2331,
25016
+ "eval_samples_per_second": 61.551,
25017
+ "eval_steps_per_second": 15.388,
25018
+ "num_input_tokens_seen": 73912717600,
25019
+ "step": 140500
25020
+ },
25021
+ {
25022
+ "epoch": 3.087380980001963,
25023
+ "grad_norm": 0.08606674522161484,
25024
+ "learning_rate": 0.0001,
25025
+ "loss": 2.3589,
25026
+ "num_input_tokens_seen": 73965145984,
25027
+ "step": 140550
25028
+ },
25029
+ {
25030
+ "epoch": 3.0884792829180263,
25031
+ "grad_norm": 0.09220123291015625,
25032
+ "learning_rate": 0.0001,
25033
+ "loss": 2.3503,
25034
+ "num_input_tokens_seen": 74017574784,
25035
+ "step": 140600
25036
+ },
25037
+ {
25038
+ "epoch": 3.089577585834089,
25039
+ "grad_norm": 0.10021138191223145,
25040
+ "learning_rate": 0.0001,
25041
+ "loss": 2.3528,
25042
+ "num_input_tokens_seen": 74070003040,
25043
+ "step": 140650
25044
+ },
25045
+ {
25046
+ "epoch": 3.0906758887501518,
25047
+ "grad_norm": 0.08400563895702362,
25048
+ "learning_rate": 0.0001,
25049
+ "loss": 2.3575,
25050
+ "num_input_tokens_seen": 74122431840,
25051
+ "step": 140700
25052
+ },
25053
+ {
25054
+ "epoch": 3.091774191666215,
25055
+ "grad_norm": 0.08861430734395981,
25056
+ "learning_rate": 0.0001,
25057
+ "loss": 2.3552,
25058
+ "num_input_tokens_seen": 74174859680,
25059
+ "step": 140750
25060
+ },
25061
+ {
25062
+ "epoch": 3.0928724945822776,
25063
+ "grad_norm": 0.08466708660125732,
25064
+ "learning_rate": 0.0001,
25065
+ "loss": 2.3603,
25066
+ "num_input_tokens_seen": 74227284768,
25067
+ "step": 140800
25068
+ },
25069
+ {
25070
+ "epoch": 3.0939707974983404,
25071
+ "grad_norm": 0.08707701414823532,
25072
+ "learning_rate": 0.0001,
25073
+ "loss": 2.3595,
25074
+ "num_input_tokens_seen": 74279711840,
25075
+ "step": 140850
25076
+ },
25077
+ {
25078
+ "epoch": 3.0950691004144035,
25079
+ "grad_norm": 0.08657340705394745,
25080
+ "learning_rate": 0.0001,
25081
+ "loss": 2.3511,
25082
+ "num_input_tokens_seen": 74332140640,
25083
+ "step": 140900
25084
+ },
25085
+ {
25086
+ "epoch": 3.0961674033304663,
25087
+ "grad_norm": 0.08521311730146408,
25088
+ "learning_rate": 0.0001,
25089
+ "loss": 2.3569,
25090
+ "num_input_tokens_seen": 74384569440,
25091
+ "step": 140950
25092
+ },
25093
+ {
25094
+ "epoch": 3.097265706246529,
25095
+ "grad_norm": 0.08738870918750763,
25096
+ "learning_rate": 0.0001,
25097
+ "loss": 2.3587,
25098
+ "num_input_tokens_seen": 74436998240,
25099
+ "step": 141000
25100
+ },
25101
+ {
25102
+ "epoch": 3.097265706246529,
25103
+ "eval_loss": 2.269127607345581,
25104
+ "eval_runtime": 80.825,
25105
+ "eval_samples_per_second": 61.862,
25106
+ "eval_steps_per_second": 15.466,
25107
+ "num_input_tokens_seen": 74436998240,
25108
+ "step": 141000
25109
  }
25110
  ],
25111
  "logging_steps": 50,
25112
+ "max_steps": 200000,
25113
+ "num_input_tokens_seen": 74436998240,
25114
+ "num_train_epochs": 5,
25115
  "save_steps": 1000,
25116
  "stateful_callbacks": {
25117
  "TrainerControl": {
 
25120
  "should_evaluate": false,
25121
  "should_log": false,
25122
  "should_save": true,
25123
+ "should_training_stop": false
25124
  },
25125
  "attributes": {}
25126
  }
25127
  },
25128
+ "total_flos": 1.3173990957632102e+20,
25129
  "train_batch_size": 32,
25130
  "trial_name": null,
25131
  "trial_params": null
last-checkpoint/training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d0658393df628109da44bf73cd2c5b7b1aaaea9c34af48a5725df7cb08bc1427
3
  size 6008
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:98ea109117710c0c998ea268594e6a7d0e86331c406b4b50e21b67f4948ff266
3
  size 6008