Fill-Mask
Transformers
Safetensors
English
deberta-v2
political-nlp
domain-adaptation
argument-mining
sentiment-analysis
stance-detection
named-entity-recognition
political-debates
Instructions to use ddore14/DeRooseBERTa with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use ddore14/DeRooseBERTa with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("fill-mask", model="ddore14/DeRooseBERTa")

# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("ddore14/DeRooseBERTa")
model = AutoModelForMaskedLM.from_pretrained("ddore14/DeRooseBERTa")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 75.1879934828926, | |
| "eval_steps": 2000, | |
| "global_step": 150000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0, | |
| "eval_accuracy": 1.3660035857594126e-07, | |
| "eval_loss": 132.875, | |
| "eval_runtime": 254.898, | |
| "eval_samples_per_second": 6486.396, | |
| "eval_steps_per_second": 12.672, | |
| "step": 0 | |
| }, | |
| { | |
| "epoch": 0.13762730525736305, | |
| "grad_norm": 49.58098602294922, | |
| "learning_rate": 2.967e-05, | |
| "loss": 178.872046875, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 0.2752546105147261, | |
| "grad_norm": 40.552101135253906, | |
| "learning_rate": 5.966999999999999e-05, | |
| "loss": 82.2545625, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.2752546105147261, | |
| "eval_accuracy": 0.6086885594122716, | |
| "eval_loss": 16.890625, | |
| "eval_runtime": 245.8339, | |
| "eval_samples_per_second": 6725.555, | |
| "eval_steps_per_second": 13.139, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 0.41288191577208916, | |
| "grad_norm": 37.14718246459961, | |
| "learning_rate": 8.966999999999999e-05, | |
| "loss": 66.408390625, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 0.5505092210294522, | |
| "grad_norm": 32.7165641784668, | |
| "learning_rate": 0.00011960999999999999, | |
| "loss": 59.63419921875, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.5505092210294522, | |
| "eval_accuracy": 0.6626575571303744, | |
| "eval_loss": 13.6171875, | |
| "eval_runtime": 239.1422, | |
| "eval_samples_per_second": 6913.748, | |
| "eval_steps_per_second": 13.507, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 0.6881365262868153, | |
| "grad_norm": 26.651721954345703, | |
| "learning_rate": 0.00014960999999999997, | |
| "loss": 55.59596484375, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 0.8257638315441783, | |
| "grad_norm": 25.13609504699707, | |
| "learning_rate": 0.00017961, | |
| "loss": 52.9440390625, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.8257638315441783, | |
| "eval_accuracy": 0.6820435002667244, | |
| "eval_loss": 12.4453125, | |
| "eval_runtime": 241.0405, | |
| "eval_samples_per_second": 6859.298, | |
| "eval_steps_per_second": 13.4, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 0.9633911368015414, | |
| "grad_norm": 25.540454864501953, | |
| "learning_rate": 0.00020960999999999997, | |
| "loss": 51.0653359375, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 1.1010184420589044, | |
| "grad_norm": 22.598819732666016, | |
| "learning_rate": 0.00023960999999999996, | |
| "loss": 49.669171875, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.1010184420589044, | |
| "eval_accuracy": 0.6911625811516694, | |
| "eval_loss": 11.875, | |
| "eval_runtime": 238.8748, | |
| "eval_samples_per_second": 6921.488, | |
| "eval_steps_per_second": 13.522, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 1.2386457473162675, | |
| "grad_norm": 21.2167911529541, | |
| "learning_rate": 0.00026957999999999995, | |
| "loss": 48.713546875, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 1.3762730525736306, | |
| "grad_norm": 20.751371383666992, | |
| "learning_rate": 0.00029955, | |
| "loss": 47.99215625, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.3762730525736306, | |
| "eval_accuracy": 0.6964798957051618, | |
| "eval_loss": 11.5546875, | |
| "eval_runtime": 240.5455, | |
| "eval_samples_per_second": 6873.416, | |
| "eval_steps_per_second": 13.428, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 1.5139003578309937, | |
| "grad_norm": 21.686861038208008, | |
| "learning_rate": 0.0003, | |
| "loss": 47.29948046875, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 1.6515276630883569, | |
| "grad_norm": 18.800752639770508, | |
| "learning_rate": 0.0003, | |
| "loss": 46.53960546875, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.6515276630883569, | |
| "eval_accuracy": 0.701906575930677, | |
| "eval_loss": 11.2265625, | |
| "eval_runtime": 239.4873, | |
| "eval_samples_per_second": 6903.785, | |
| "eval_steps_per_second": 13.487, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 1.7891549683457197, | |
| "grad_norm": 19.42099952697754, | |
| "learning_rate": 0.0003, | |
| "loss": 45.9191953125, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 1.9267822736030829, | |
| "grad_norm": 19.15869140625, | |
| "learning_rate": 0.0003, | |
| "loss": 45.381796875, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 1.9267822736030829, | |
| "eval_accuracy": 0.7061886218301343, | |
| "eval_loss": 10.9765625, | |
| "eval_runtime": 239.2568, | |
| "eval_samples_per_second": 6910.438, | |
| "eval_steps_per_second": 13.5, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 2.0644095788604457, | |
| "grad_norm": 16.94078826904297, | |
| "learning_rate": 0.0003, | |
| "loss": 44.90976953125, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 2.202036884117809, | |
| "grad_norm": 17.655250549316406, | |
| "learning_rate": 0.0003, | |
| "loss": 44.45808203125, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.202036884117809, | |
| "eval_accuracy": 0.7093567074531988, | |
| "eval_loss": 10.8203125, | |
| "eval_runtime": 241.1911, | |
| "eval_samples_per_second": 6855.016, | |
| "eval_steps_per_second": 13.392, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 2.339664189375172, | |
| "grad_norm": 17.694721221923828, | |
| "learning_rate": 0.0003, | |
| "loss": 44.16196484375, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 2.477291494632535, | |
| "grad_norm": 17.49053955078125, | |
| "learning_rate": 0.0003, | |
| "loss": 43.84825, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.477291494632535, | |
| "eval_accuracy": 0.7115680703757034, | |
| "eval_loss": 10.6640625, | |
| "eval_runtime": 239.4688, | |
| "eval_samples_per_second": 6904.32, | |
| "eval_steps_per_second": 13.488, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 2.614918799889898, | |
| "grad_norm": 19.09914207458496, | |
| "learning_rate": 0.0003, | |
| "loss": 43.59271875, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 2.7525461051472613, | |
| "grad_norm": 16.3907527923584, | |
| "learning_rate": 0.0003, | |
| "loss": 43.352640625, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.7525461051472613, | |
| "eval_accuracy": 0.7139675040013439, | |
| "eval_loss": 10.5546875, | |
| "eval_runtime": 238.1647, | |
| "eval_samples_per_second": 6942.126, | |
| "eval_steps_per_second": 13.562, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 2.8901734104046244, | |
| "grad_norm": 15.896549224853516, | |
| "learning_rate": 0.0003, | |
| "loss": 43.17196875, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 3.0278007156619875, | |
| "grad_norm": 29.67310905456543, | |
| "learning_rate": 0.0003, | |
| "loss": 42.92155859375, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.0278007156619875, | |
| "eval_accuracy": 0.7146398068421496, | |
| "eval_loss": 10.484375, | |
| "eval_runtime": 238.4701, | |
| "eval_samples_per_second": 6933.233, | |
| "eval_steps_per_second": 13.545, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 3.1654280209193506, | |
| "grad_norm": 16.424579620361328, | |
| "learning_rate": 0.0003, | |
| "loss": 42.65390625, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 3.3030553261767133, | |
| "grad_norm": 16.19496726989746, | |
| "learning_rate": 0.0003, | |
| "loss": 42.48802734375, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 3.3030553261767133, | |
| "eval_accuracy": 0.7170818189501579, | |
| "eval_loss": 10.3671875, | |
| "eval_runtime": 240.7935, | |
| "eval_samples_per_second": 6866.337, | |
| "eval_steps_per_second": 13.414, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 3.4406826314340764, | |
| "grad_norm": 15.53753662109375, | |
| "learning_rate": 0.0003, | |
| "loss": 42.357984375, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 3.5783099366914395, | |
| "grad_norm": 16.701377868652344, | |
| "learning_rate": 0.0003, | |
| "loss": 42.1965703125, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 3.5783099366914395, | |
| "eval_accuracy": 0.7182790131411184, | |
| "eval_loss": 10.2890625, | |
| "eval_runtime": 241.2004, | |
| "eval_samples_per_second": 6854.753, | |
| "eval_steps_per_second": 13.391, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 3.7159372419488026, | |
| "grad_norm": 15.334391593933105, | |
| "learning_rate": 0.0003, | |
| "loss": 42.05885546875, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 3.8535645472061657, | |
| "grad_norm": 15.341226577758789, | |
| "learning_rate": 0.0003, | |
| "loss": 41.9392421875, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 3.8535645472061657, | |
| "eval_accuracy": 0.719322972712139, | |
| "eval_loss": 10.2421875, | |
| "eval_runtime": 241.186, | |
| "eval_samples_per_second": 6855.162, | |
| "eval_steps_per_second": 13.392, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 3.991191852463529, | |
| "grad_norm": 16.253334045410156, | |
| "learning_rate": 0.0003, | |
| "loss": 41.81163671875, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 4.1288191577208915, | |
| "grad_norm": 15.035149574279785, | |
| "learning_rate": 0.0003, | |
| "loss": 41.617953125, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 4.1288191577208915, | |
| "eval_accuracy": 0.720451396648655, | |
| "eval_loss": 10.171875, | |
| "eval_runtime": 240.2855, | |
| "eval_samples_per_second": 6880.853, | |
| "eval_steps_per_second": 13.442, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 4.266446462978255, | |
| "grad_norm": 14.762296676635742, | |
| "learning_rate": 0.0003, | |
| "loss": 41.5138203125, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 4.404073768235618, | |
| "grad_norm": 14.627701759338379, | |
| "learning_rate": 0.0003, | |
| "loss": 41.4306015625, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 4.404073768235618, | |
| "eval_accuracy": 0.7213651673804347, | |
| "eval_loss": 10.1328125, | |
| "eval_runtime": 242.2962, | |
| "eval_samples_per_second": 6823.752, | |
| "eval_steps_per_second": 13.331, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 4.541701073492981, | |
| "grad_norm": 14.57941722869873, | |
| "learning_rate": 0.0003, | |
| "loss": 41.3221171875, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 4.679328378750344, | |
| "grad_norm": 15.291731834411621, | |
| "learning_rate": 0.0003, | |
| "loss": 41.276203125, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 4.679328378750344, | |
| "eval_accuracy": 0.7223401458779132, | |
| "eval_loss": 10.0703125, | |
| "eval_runtime": 239.5218, | |
| "eval_samples_per_second": 6902.793, | |
| "eval_steps_per_second": 13.485, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 4.8169556840077075, | |
| "grad_norm": 15.057552337646484, | |
| "learning_rate": 0.0003, | |
| "loss": 41.19701171875, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 4.95458298926507, | |
| "grad_norm": 15.457907676696777, | |
| "learning_rate": 0.0003, | |
| "loss": 41.10438671875, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 4.95458298926507, | |
| "eval_accuracy": 0.7230995451445803, | |
| "eval_loss": 10.0546875, | |
| "eval_runtime": 240.0931, | |
| "eval_samples_per_second": 6886.367, | |
| "eval_steps_per_second": 13.453, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 5.092210294522434, | |
| "grad_norm": 15.539594650268555, | |
| "learning_rate": 0.0003, | |
| "loss": 40.93646875, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 5.229837599779796, | |
| "grad_norm": 14.915628433227539, | |
| "learning_rate": 0.0003, | |
| "loss": 40.8286875, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 5.229837599779796, | |
| "eval_accuracy": 0.7238966529952525, | |
| "eval_loss": 10.0, | |
| "eval_runtime": 239.9098, | |
| "eval_samples_per_second": 6891.629, | |
| "eval_steps_per_second": 13.463, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 5.367464905037159, | |
| "grad_norm": 14.271048545837402, | |
| "learning_rate": 0.0003, | |
| "loss": 40.80625, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 5.505092210294523, | |
| "grad_norm": 14.605119705200195, | |
| "learning_rate": 0.0003, | |
| "loss": 40.713796875, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 5.505092210294523, | |
| "eval_accuracy": 0.7245612679427045, | |
| "eval_loss": 9.9609375, | |
| "eval_runtime": 240.0103, | |
| "eval_samples_per_second": 6888.743, | |
| "eval_steps_per_second": 13.458, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 5.642719515551885, | |
| "grad_norm": 14.748287200927734, | |
| "learning_rate": 0.0003, | |
| "loss": 40.62338671875, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 5.780346820809249, | |
| "grad_norm": 15.422652244567871, | |
| "learning_rate": 0.0003, | |
| "loss": 40.56144140625, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 5.780346820809249, | |
| "eval_accuracy": 0.7251576961266964, | |
| "eval_loss": 9.9375, | |
| "eval_runtime": 240.638, | |
| "eval_samples_per_second": 6870.772, | |
| "eval_steps_per_second": 13.423, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 5.917974126066611, | |
| "grad_norm": 15.326558113098145, | |
| "learning_rate": 0.0003, | |
| "loss": 40.5059375, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 6.055601431323975, | |
| "grad_norm": 15.331598281860352, | |
| "learning_rate": 0.0003, | |
| "loss": 40.40818359375, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 6.055601431323975, | |
| "eval_accuracy": 0.7254487554600376, | |
| "eval_loss": 9.8984375, | |
| "eval_runtime": 240.1599, | |
| "eval_samples_per_second": 6884.449, | |
| "eval_steps_per_second": 13.449, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 6.193228736581338, | |
| "grad_norm": 14.527973175048828, | |
| "learning_rate": 0.0003, | |
| "loss": 40.3428828125, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 6.330856041838701, | |
| "grad_norm": 15.686996459960938, | |
| "learning_rate": 0.0003, | |
| "loss": 40.3244765625, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 6.330856041838701, | |
| "eval_accuracy": 0.7256079479674087, | |
| "eval_loss": 9.8984375, | |
| "eval_runtime": 239.085, | |
| "eval_samples_per_second": 6915.403, | |
| "eval_steps_per_second": 13.51, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 6.468483347096064, | |
| "grad_norm": 14.848986625671387, | |
| "learning_rate": 0.0003, | |
| "loss": 40.312796875, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 6.6061106523534265, | |
| "grad_norm": 14.275111198425293, | |
| "learning_rate": 0.0003, | |
| "loss": 40.28499609375, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 6.6061106523534265, | |
| "eval_accuracy": 0.7262142861047188, | |
| "eval_loss": 9.875, | |
| "eval_runtime": 240.5807, | |
| "eval_samples_per_second": 6872.409, | |
| "eval_steps_per_second": 13.426, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 6.74373795761079, | |
| "grad_norm": 14.665587425231934, | |
| "learning_rate": 0.0003, | |
| "loss": 40.18369921875, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 6.881365262868153, | |
| "grad_norm": 14.547246932983398, | |
| "learning_rate": 0.0003, | |
| "loss": 40.1498828125, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 6.881365262868153, | |
| "eval_accuracy": 0.7269716959581425, | |
| "eval_loss": 9.8515625, | |
| "eval_runtime": 241.4549, | |
| "eval_samples_per_second": 6847.527, | |
| "eval_steps_per_second": 13.377, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 7.018992568125516, | |
| "grad_norm": 14.525768280029297, | |
| "learning_rate": 0.0003, | |
| "loss": 40.1036328125, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 7.156619873382879, | |
| "grad_norm": 14.632113456726074, | |
| "learning_rate": 0.0003, | |
| "loss": 39.9834296875, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 7.156619873382879, | |
| "eval_accuracy": 0.7272316426626143, | |
| "eval_loss": 9.828125, | |
| "eval_runtime": 239.3181, | |
| "eval_samples_per_second": 6908.667, | |
| "eval_steps_per_second": 13.497, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 7.2942471786402425, | |
| "grad_norm": 14.982499122619629, | |
| "learning_rate": 0.0003, | |
| "loss": 39.9509375, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 7.431874483897605, | |
| "grad_norm": 16.801025390625, | |
| "learning_rate": 0.0003, | |
| "loss": 39.891859375, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 7.431874483897605, | |
| "eval_accuracy": 0.7271305788939304, | |
| "eval_loss": 9.828125, | |
| "eval_runtime": 240.159, | |
| "eval_samples_per_second": 6884.477, | |
| "eval_steps_per_second": 13.449, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 7.569501789154968, | |
| "grad_norm": 14.868009567260742, | |
| "learning_rate": 0.0003, | |
| "loss": 39.88668359375, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 7.707129094412331, | |
| "grad_norm": 14.595479011535645, | |
| "learning_rate": 0.0003, | |
| "loss": 39.821890625, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 7.707129094412331, | |
| "eval_accuracy": 0.7280901536840519, | |
| "eval_loss": 9.7734375, | |
| "eval_runtime": 238.9096, | |
| "eval_samples_per_second": 6920.478, | |
| "eval_steps_per_second": 13.52, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 7.844756399669695, | |
| "grad_norm": 13.92586612701416, | |
| "learning_rate": 0.0003, | |
| "loss": 39.78269921875, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 7.982383704927058, | |
| "grad_norm": 15.85058307647705, | |
| "learning_rate": 0.0003, | |
| "loss": 39.72277734375, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 7.982383704927058, | |
| "eval_accuracy": 0.7287356832938983, | |
| "eval_loss": 9.7578125, | |
| "eval_runtime": 239.7822, | |
| "eval_samples_per_second": 6895.295, | |
| "eval_steps_per_second": 13.471, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 8.12001101018442, | |
| "grad_norm": 15.202603340148926, | |
| "learning_rate": 0.0003, | |
| "loss": 39.6687421875, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 8.257638315441783, | |
| "grad_norm": 14.994338989257812, | |
| "learning_rate": 0.0003, | |
| "loss": 39.60739453125, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 8.257638315441783, | |
| "eval_accuracy": 0.7289930926403759, | |
| "eval_loss": 9.7265625, | |
| "eval_runtime": 241.1318, | |
| "eval_samples_per_second": 6856.702, | |
| "eval_steps_per_second": 13.395, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 8.395265620699147, | |
| "grad_norm": 15.15245532989502, | |
| "learning_rate": 0.0003, | |
| "loss": 39.57180859375, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 8.53289292595651, | |
| "grad_norm": 15.941924095153809, | |
| "learning_rate": 0.0003, | |
| "loss": 39.5704296875, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 8.53289292595651, | |
| "eval_accuracy": 0.7289831970926051, | |
| "eval_loss": 9.734375, | |
| "eval_runtime": 241.0009, | |
| "eval_samples_per_second": 6860.426, | |
| "eval_steps_per_second": 13.402, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 8.670520231213873, | |
| "grad_norm": 14.842296600341797, | |
| "learning_rate": 0.0003, | |
| "loss": 39.53778125, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 8.808147536471235, | |
| "grad_norm": 17.454763412475586, | |
| "learning_rate": 0.0003, | |
| "loss": 39.540921875, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 8.808147536471235, | |
| "eval_accuracy": 0.7290886771189041, | |
| "eval_loss": 9.7109375, | |
| "eval_runtime": 240.0306, | |
| "eval_samples_per_second": 6888.158, | |
| "eval_steps_per_second": 13.457, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 8.9457748417286, | |
| "grad_norm": 13.98570442199707, | |
| "learning_rate": 0.0003, | |
| "loss": 39.512796875, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 9.083402146985962, | |
| "grad_norm": 18.010318756103516, | |
| "learning_rate": 0.0003, | |
| "loss": 39.4786171875, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 9.083402146985962, | |
| "eval_accuracy": 0.7294749784251455, | |
| "eval_loss": 9.7109375, | |
| "eval_runtime": 240.1115, | |
| "eval_samples_per_second": 6885.838, | |
| "eval_steps_per_second": 13.452, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 9.221029452243325, | |
| "grad_norm": 15.137900352478027, | |
| "learning_rate": 0.0003, | |
| "loss": 39.4073359375, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 9.358656757500688, | |
| "grad_norm": 18.228130340576172, | |
| "learning_rate": 0.0003, | |
| "loss": 39.3549765625, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 9.358656757500688, | |
| "eval_accuracy": 0.7301181665976199, | |
| "eval_loss": 9.671875, | |
| "eval_runtime": 239.7862, | |
| "eval_samples_per_second": 6895.179, | |
| "eval_steps_per_second": 13.47, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 9.49628406275805, | |
| "grad_norm": 16.575559616088867, | |
| "learning_rate": 0.0003, | |
| "loss": 39.3098828125, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 9.633911368015415, | |
| "grad_norm": 14.635740280151367, | |
| "learning_rate": 0.0003, | |
| "loss": 39.35287890625, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 9.633911368015415, | |
| "eval_accuracy": 0.7295560826467988, | |
| "eval_loss": 9.6875, | |
| "eval_runtime": 241.3494, | |
| "eval_samples_per_second": 6850.521, | |
| "eval_steps_per_second": 13.383, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 9.771538673272778, | |
| "grad_norm": 14.436244010925293, | |
| "learning_rate": 0.0003, | |
| "loss": 39.29956640625, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 9.90916597853014, | |
| "grad_norm": 14.493698120117188, | |
| "learning_rate": 0.0003, | |
| "loss": 39.31009375, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 9.90916597853014, | |
| "eval_accuracy": 0.7304579509385183, | |
| "eval_loss": 9.6484375, | |
| "eval_runtime": 245.6684, | |
| "eval_samples_per_second": 6730.085, | |
| "eval_steps_per_second": 13.148, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 10.046793283787503, | |
| "grad_norm": 15.077356338500977, | |
| "learning_rate": 0.0003, | |
| "loss": 39.2335546875, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 10.184420589044867, | |
| "grad_norm": 13.661473274230957, | |
| "learning_rate": 0.0003, | |
| "loss": 39.09965625, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 10.184420589044867, | |
| "eval_accuracy": 0.7312455280822778, | |
| "eval_loss": 9.625, | |
| "eval_runtime": 239.6349, | |
| "eval_samples_per_second": 6899.534, | |
| "eval_steps_per_second": 13.479, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 10.32204789430223, | |
| "grad_norm": 15.429136276245117, | |
| "learning_rate": 0.0003, | |
| "loss": 39.147140625, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 10.459675199559593, | |
| "grad_norm": 15.229757308959961, | |
| "learning_rate": 0.0003, | |
| "loss": 39.1339453125, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 10.459675199559593, | |
| "eval_accuracy": 0.731277762807936, | |
| "eval_loss": 9.609375, | |
| "eval_runtime": 238.7439, | |
| "eval_samples_per_second": 6925.282, | |
| "eval_steps_per_second": 13.529, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 10.597302504816955, | |
| "grad_norm": 14.771382331848145, | |
| "learning_rate": 0.0003, | |
| "loss": 39.1441796875, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 10.734929810074318, | |
| "grad_norm": 13.703607559204102, | |
| "learning_rate": 0.0003, | |
| "loss": 39.141265625, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 10.734929810074318, | |
| "eval_accuracy": 0.7310708531463442, | |
| "eval_loss": 9.609375, | |
| "eval_runtime": 239.4162, | |
| "eval_samples_per_second": 6905.836, | |
| "eval_steps_per_second": 13.491, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 10.872557115331682, | |
| "grad_norm": 19.041141510009766, | |
| "learning_rate": 0.0003, | |
| "loss": 39.0934140625, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 11.010184420589045, | |
| "grad_norm": 17.401290893554688, | |
| "learning_rate": 0.0003, | |
| "loss": 39.113875, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 11.010184420589045, | |
| "eval_accuracy": 0.7312406631974454, | |
| "eval_loss": 9.6015625, | |
| "eval_runtime": 238.9522, | |
| "eval_samples_per_second": 6919.246, | |
| "eval_steps_per_second": 13.517, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 11.147811725846408, | |
| "grad_norm": 14.292427062988281, | |
| "learning_rate": 0.0003, | |
| "loss": 39.012484375, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 11.28543903110377, | |
| "grad_norm": 15.462931632995605, | |
| "learning_rate": 0.0003, | |
| "loss": 39.04391796875, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 11.28543903110377, | |
| "eval_accuracy": 0.7316027472794033, | |
| "eval_loss": 9.6015625, | |
| "eval_runtime": 240.4477, | |
| "eval_samples_per_second": 6876.21, | |
| "eval_steps_per_second": 13.433, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 11.423066336361135, | |
| "grad_norm": 17.796772003173828, | |
| "learning_rate": 0.0003, | |
| "loss": 38.957421875, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 11.560693641618498, | |
| "grad_norm": 17.314067840576172, | |
| "learning_rate": 0.0003, | |
| "loss": 38.9495234375, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 11.560693641618498, | |
| "eval_accuracy": 0.7321146855990825, | |
| "eval_loss": 9.578125, | |
| "eval_runtime": 239.2384, | |
| "eval_samples_per_second": 6910.967, | |
| "eval_steps_per_second": 13.501, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 11.69832094687586, | |
| "grad_norm": 16.145645141601562, | |
| "learning_rate": 0.0003, | |
| "loss": 38.91906640625, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 11.835948252133223, | |
| "grad_norm": 13.51314640045166, | |
| "learning_rate": 0.0003, | |
| "loss": 38.91014453125, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 11.835948252133223, | |
| "eval_accuracy": 0.732051599943418, | |
| "eval_loss": 9.5546875, | |
| "eval_runtime": 240.3949, | |
| "eval_samples_per_second": 6877.722, | |
| "eval_steps_per_second": 13.436, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 11.973575557390586, | |
| "grad_norm": 15.877927780151367, | |
| "learning_rate": 0.0003, | |
| "loss": 38.933609375, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 12.11120286264795, | |
| "grad_norm": 15.215489387512207, | |
| "learning_rate": 0.0003, | |
| "loss": 38.8452265625, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 12.11120286264795, | |
| "eval_accuracy": 0.7323534973774022, | |
| "eval_loss": 9.546875, | |
| "eval_runtime": 240.1188, | |
| "eval_samples_per_second": 6885.629, | |
| "eval_steps_per_second": 13.452, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 12.248830167905313, | |
| "grad_norm": 15.539190292358398, | |
| "learning_rate": 0.0003, | |
| "loss": 38.8104296875, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 12.386457473162675, | |
| "grad_norm": 15.577831268310547, | |
| "learning_rate": 0.0003, | |
| "loss": 38.80796875, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 12.386457473162675, | |
| "eval_accuracy": 0.7324531616408847, | |
| "eval_loss": 9.546875, | |
| "eval_runtime": 240.2465, | |
| "eval_samples_per_second": 6881.969, | |
| "eval_steps_per_second": 13.445, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 12.524084778420038, | |
| "grad_norm": 14.47063159942627, | |
| "learning_rate": 0.0003, | |
| "loss": 38.865859375, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 12.661712083677402, | |
| "grad_norm": 13.968493461608887, | |
| "learning_rate": 0.0003, | |
| "loss": 38.81719921875, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 12.661712083677402, | |
| "eval_accuracy": 0.7321305936040334, | |
| "eval_loss": 9.546875, | |
| "eval_runtime": 239.1976, | |
| "eval_samples_per_second": 6912.148, | |
| "eval_steps_per_second": 13.503, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 12.799339388934765, | |
| "grad_norm": 28.390636444091797, | |
| "learning_rate": 0.0003, | |
| "loss": 38.815578125, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 12.936966694192128, | |
| "grad_norm": 27.102386474609375, | |
| "learning_rate": 0.0003, | |
| "loss": 38.82604296875, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 12.936966694192128, | |
| "eval_accuracy": 0.732027704335829, | |
| "eval_loss": 9.546875, | |
| "eval_runtime": 240.3497, | |
| "eval_samples_per_second": 6879.014, | |
| "eval_steps_per_second": 13.439, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 13.07459399944949, | |
| "grad_norm": 14.193507194519043, | |
| "learning_rate": 0.0003, | |
| "loss": 38.72788671875, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 13.212221304706853, | |
| "grad_norm": 18.604595184326172, | |
| "learning_rate": 0.0003, | |
| "loss": 38.6876171875, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 13.212221304706853, | |
| "eval_accuracy": 0.7321750300843878, | |
| "eval_loss": 9.546875, | |
| "eval_runtime": 240.1101, | |
| "eval_samples_per_second": 6885.879, | |
| "eval_steps_per_second": 13.452, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 13.349848609964218, | |
| "grad_norm": 16.717756271362305, | |
| "learning_rate": 0.0003, | |
| "loss": 38.7415390625, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 13.48747591522158, | |
| "grad_norm": 13.74322509765625, | |
| "learning_rate": 0.0003, | |
| "loss": 38.704234375, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 13.48747591522158, | |
| "eval_accuracy": 0.7335116918906991, | |
| "eval_loss": 9.4921875, | |
| "eval_runtime": 240.4214, | |
| "eval_samples_per_second": 6876.962, | |
| "eval_steps_per_second": 13.435, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 13.625103220478943, | |
| "grad_norm": 17.836227416992188, | |
| "learning_rate": 0.0003, | |
| "loss": 38.6647890625, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 13.762730525736306, | |
| "grad_norm": 20.256298065185547, | |
| "learning_rate": 0.0003, | |
| "loss": 38.654390625, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 13.762730525736306, | |
| "eval_accuracy": 0.7328628073699861, | |
| "eval_loss": 9.5078125, | |
| "eval_runtime": 240.901, | |
| "eval_samples_per_second": 6863.272, | |
| "eval_steps_per_second": 13.408, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 13.762730525736306, | |
| "eval_accuracy": 0.7328718517886109, | |
| "eval_loss": 9.5078125, | |
| "eval_runtime": 257.3667, | |
| "eval_samples_per_second": 6424.176, | |
| "eval_steps_per_second": 12.55, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 13.90035783099367, | |
| "grad_norm": 17.175275802612305, | |
| "learning_rate": 0.0003, | |
| "loss": 38.7094453125, | |
| "step": 101000 | |
| }, | |
| { | |
| "epoch": 14.037985136251033, | |
| "grad_norm": 30.791107177734375, | |
| "learning_rate": 0.0003, | |
| "loss": 38.7431796875, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 14.037985136251033, | |
| "eval_accuracy": 0.732853775052298, | |
| "eval_loss": 9.53125, | |
| "eval_runtime": 244.7843, | |
| "eval_samples_per_second": 6754.392, | |
| "eval_steps_per_second": 13.195, | |
| "step": 102000 | |
| }, | |
| { | |
| "epoch": 14.175612441508395, | |
| "grad_norm": 15.07434368133545, | |
| "learning_rate": 0.0003, | |
| "loss": 38.5621875, | |
| "step": 103000 | |
| }, | |
| { | |
| "epoch": 14.313239746765758, | |
| "grad_norm": 16.333436965942383, | |
| "learning_rate": 0.0003, | |
| "loss": 38.6172734375, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 14.313239746765758, | |
| "eval_accuracy": 0.7328456338360237, | |
| "eval_loss": 9.515625, | |
| "eval_runtime": 243.7048, | |
| "eval_samples_per_second": 6784.311, | |
| "eval_steps_per_second": 13.254, | |
| "step": 104000 | |
| }, | |
| { | |
| "epoch": 14.45086705202312, | |
| "grad_norm": 14.872163772583008, | |
| "learning_rate": 0.0003, | |
| "loss": 38.61624609375, | |
| "step": 105000 | |
| }, | |
| { | |
| "epoch": 14.588494357280485, | |
| "grad_norm": 15.491616249084473, | |
| "learning_rate": 0.0003, | |
| "loss": 38.5978203125, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 14.588494357280485, | |
| "eval_accuracy": 0.7325266385860558, | |
| "eval_loss": 9.53125, | |
| "eval_runtime": 241.371, | |
| "eval_samples_per_second": 6849.906, | |
| "eval_steps_per_second": 13.382, | |
| "step": 106000 | |
| }, | |
| { | |
| "epoch": 14.726121662537848, | |
| "grad_norm": 14.945006370544434, | |
| "learning_rate": 0.0003, | |
| "loss": 38.621796875, | |
| "step": 107000 | |
| }, | |
| { | |
| "epoch": 14.86374896779521, | |
| "grad_norm": 14.714298248291016, | |
| "learning_rate": 0.0003, | |
| "loss": 38.5805546875, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 14.86374896779521, | |
| "eval_accuracy": 0.7336863429887471, | |
| "eval_loss": 9.484375, | |
| "eval_runtime": 243.3178, | |
| "eval_samples_per_second": 6795.1, | |
| "eval_steps_per_second": 13.275, | |
| "step": 108000 | |
| }, | |
| { | |
| "epoch": 15.001376273052573, | |
| "grad_norm": 17.513687133789062, | |
| "learning_rate": 0.0003, | |
| "loss": 38.5988359375, | |
| "step": 109000 | |
| }, | |
| { | |
| "epoch": 15.139003578309937, | |
| "grad_norm": 14.208888053894043, | |
| "learning_rate": 0.0003, | |
| "loss": 38.5494453125, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 15.139003578309937, | |
| "eval_accuracy": 0.7334265583450897, | |
| "eval_loss": 9.4921875, | |
| "eval_runtime": 245.3975, | |
| "eval_samples_per_second": 6737.512, | |
| "eval_steps_per_second": 13.162, | |
| "step": 110000 | |
| }, | |
| { | |
| "epoch": 15.2766308835673, | |
| "grad_norm": 20.13620376586914, | |
| "learning_rate": 0.0003, | |
| "loss": 38.51769140625, | |
| "step": 111000 | |
| }, | |
| { | |
| "epoch": 15.414258188824663, | |
| "grad_norm": 14.885974884033203, | |
| "learning_rate": 0.0003, | |
| "loss": 38.52906640625, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 15.414258188824663, | |
| "eval_accuracy": 0.7332003955432331, | |
| "eval_loss": 9.4921875, | |
| "eval_runtime": 246.8519, | |
| "eval_samples_per_second": 6697.818, | |
| "eval_steps_per_second": 13.085, | |
| "step": 112000 | |
| }, | |
| { | |
| "epoch": 15.551885494082025, | |
| "grad_norm": 14.931363105773926, | |
| "learning_rate": 0.0003, | |
| "loss": 38.534203125, | |
| "step": 113000 | |
| }, | |
| { | |
| "epoch": 15.689512799339388, | |
| "grad_norm": 15.144700050354004, | |
| "learning_rate": 0.0003, | |
| "loss": 38.5433125, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 15.689512799339388, | |
| "eval_accuracy": 0.7337025970829132, | |
| "eval_loss": 9.4765625, | |
| "eval_runtime": 244.85, | |
| "eval_samples_per_second": 6752.58, | |
| "eval_steps_per_second": 13.192, | |
| "step": 114000 | |
| }, | |
| { | |
| "epoch": 15.827140104596753, | |
| "grad_norm": 17.183073043823242, | |
| "learning_rate": 0.0003, | |
| "loss": 38.4901015625, | |
| "step": 115000 | |
| }, | |
| { | |
| "epoch": 15.964767409854115, | |
| "grad_norm": 14.985239028930664, | |
| "learning_rate": 0.0003, | |
| "loss": 38.51575390625, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 15.964767409854115, | |
| "eval_accuracy": 0.7338189696183159, | |
| "eval_loss": 9.484375, | |
| "eval_runtime": 245.1155, | |
| "eval_samples_per_second": 6745.266, | |
| "eval_steps_per_second": 13.177, | |
| "step": 116000 | |
| }, | |
| { | |
| "epoch": 16.10239471511148, | |
| "grad_norm": 19.971887588500977, | |
| "learning_rate": 0.0003, | |
| "loss": 38.4035234375, | |
| "step": 117000 | |
| }, | |
| { | |
| "epoch": 16.24002202036884, | |
| "grad_norm": 17.1956844329834, | |
| "learning_rate": 0.0003, | |
| "loss": 38.42918359375, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 16.24002202036884, | |
| "eval_accuracy": 0.733730730614503, | |
| "eval_loss": 9.46875, | |
| "eval_runtime": 243.3012, | |
| "eval_samples_per_second": 6795.566, | |
| "eval_steps_per_second": 13.276, | |
| "step": 118000 | |
| }, | |
| { | |
| "epoch": 16.377649325626205, | |
| "grad_norm": 15.118714332580566, | |
| "learning_rate": 0.0003, | |
| "loss": 38.507515625, | |
| "step": 119000 | |
| }, | |
| { | |
| "epoch": 16.515276630883566, | |
| "grad_norm": 14.03774642944336, | |
| "learning_rate": 0.0003, | |
| "loss": 38.526671875, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 16.515276630883566, | |
| "eval_accuracy": 0.733831136300071, | |
| "eval_loss": 9.484375, | |
| "eval_runtime": 241.9438, | |
| "eval_samples_per_second": 6833.691, | |
| "eval_steps_per_second": 13.35, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 16.515276630883566, | |
| "eval_accuracy": 0.7365382984533457, | |
| "eval_loss": 9.328125, | |
| "eval_runtime": 320.3822, | |
| "eval_samples_per_second": 1416.711, | |
| "eval_steps_per_second": 2.769, | |
| "step": 120000 | |
| }, | |
| { | |
| "epoch": 60.65171074069432, | |
| "grad_norm": 8.715871810913086, | |
| "learning_rate": 0.0003, | |
| "loss": 34.4526328125, | |
| "step": 121000 | |
| }, | |
| { | |
| "epoch": 61.15290136608598, | |
| "grad_norm": 16.51197052001953, | |
| "learning_rate": 0.0003, | |
| "loss": 33.80009375, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 61.15290136608598, | |
| "eval_accuracy": 0.7592972259433672, | |
| "eval_loss": 8.328125, | |
| "eval_runtime": 322.1474, | |
| "eval_samples_per_second": 1408.948, | |
| "eval_steps_per_second": 2.753, | |
| "step": 122000 | |
| }, | |
| { | |
| "epoch": 61.65421732046622, | |
| "grad_norm": 8.568217277526855, | |
| "learning_rate": 0.0003, | |
| "loss": 33.51157421875, | |
| "step": 123000 | |
| }, | |
| { | |
| "epoch": 62.155407945857874, | |
| "grad_norm": 13.904038429260254, | |
| "learning_rate": 0.0003, | |
| "loss": 33.3759140625, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 62.155407945857874, | |
| "eval_accuracy": 0.7606993464209245, | |
| "eval_loss": 8.2578125, | |
| "eval_runtime": 310.5126, | |
| "eval_samples_per_second": 1461.741, | |
| "eval_steps_per_second": 2.857, | |
| "step": 124000 | |
| }, | |
| { | |
| "epoch": 62.65672390023813, | |
| "grad_norm": 9.302454948425293, | |
| "learning_rate": 0.0003, | |
| "loss": 33.2303125, | |
| "step": 125000 | |
| }, | |
| { | |
| "epoch": 63.15791452562978, | |
| "grad_norm": 10.245097160339355, | |
| "learning_rate": 0.0003, | |
| "loss": 33.114984375, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 63.15791452562978, | |
| "eval_accuracy": 0.7620252803249203, | |
| "eval_loss": 8.1953125, | |
| "eval_runtime": 311.281, | |
| "eval_samples_per_second": 1458.133, | |
| "eval_steps_per_second": 2.85, | |
| "step": 126000 | |
| }, | |
| { | |
| "epoch": 63.659230480010024, | |
| "grad_norm": 9.459521293640137, | |
| "learning_rate": 0.0003, | |
| "loss": 33.0674765625, | |
| "step": 127000 | |
| }, | |
| { | |
| "epoch": 64.16042110540168, | |
| "grad_norm": 12.050172805786133, | |
| "learning_rate": 0.0003, | |
| "loss": 33.0123046875, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 64.16042110540168, | |
| "eval_accuracy": 0.7628614283635131, | |
| "eval_loss": 8.15625, | |
| "eval_runtime": 309.2479, | |
| "eval_samples_per_second": 1467.719, | |
| "eval_steps_per_second": 2.868, | |
| "step": 128000 | |
| }, | |
| { | |
| "epoch": 64.66173705978193, | |
| "grad_norm": 8.326544761657715, | |
| "learning_rate": 0.0003, | |
| "loss": 32.89726171875, | |
| "step": 129000 | |
| }, | |
| { | |
| "epoch": 65.16292768517359, | |
| "grad_norm": 9.267374038696289, | |
| "learning_rate": 0.0003, | |
| "loss": 32.78715625, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 65.16292768517359, | |
| "eval_accuracy": 0.7632605632607093, | |
| "eval_loss": 8.1484375, | |
| "eval_runtime": 313.0209, | |
| "eval_samples_per_second": 1450.028, | |
| "eval_steps_per_second": 2.834, | |
| "step": 130000 | |
| }, | |
| { | |
| "epoch": 65.66424363955383, | |
| "grad_norm": 9.583052635192871, | |
| "learning_rate": 0.0003, | |
| "loss": 32.747501953125, | |
| "step": 131000 | |
| }, | |
| { | |
| "epoch": 66.16543426494549, | |
| "grad_norm": 8.761311531066895, | |
| "learning_rate": 0.0003, | |
| "loss": 32.67369140625, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 66.16543426494549, | |
| "eval_accuracy": 0.7639422135833412, | |
| "eval_loss": 8.1015625, | |
| "eval_runtime": 311.6656, | |
| "eval_samples_per_second": 1456.333, | |
| "eval_steps_per_second": 2.846, | |
| "step": 132000 | |
| }, | |
| { | |
| "epoch": 66.66675021932573, | |
| "grad_norm": 8.83479118347168, | |
| "learning_rate": 0.0003, | |
| "loss": 32.617767578125, | |
| "step": 133000 | |
| }, | |
| { | |
| "epoch": 67.16794084471738, | |
| "grad_norm": 8.598926544189453, | |
| "learning_rate": 0.0003, | |
| "loss": 32.5695625, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 67.16794084471738, | |
| "eval_accuracy": 0.7644549296283725, | |
| "eval_loss": 8.078125, | |
| "eval_runtime": 308.4248, | |
| "eval_samples_per_second": 1471.636, | |
| "eval_steps_per_second": 2.876, | |
| "step": 134000 | |
| }, | |
| { | |
| "epoch": 67.66925679909762, | |
| "grad_norm": 10.846793174743652, | |
| "learning_rate": 0.0003, | |
| "loss": 32.53196484375, | |
| "step": 135000 | |
| }, | |
| { | |
| "epoch": 68.17044742448928, | |
| "grad_norm": 23.080833435058594, | |
| "learning_rate": 0.0003, | |
| "loss": 32.47344140625, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 68.17044742448928, | |
| "eval_accuracy": 0.7638109525627774, | |
| "eval_loss": 8.109375, | |
| "eval_runtime": 312.4841, | |
| "eval_samples_per_second": 1452.519, | |
| "eval_steps_per_second": 2.839, | |
| "step": 136000 | |
| }, | |
| { | |
| "epoch": 68.67176337886953, | |
| "grad_norm": 11.440296173095703, | |
| "learning_rate": 0.0003, | |
| "loss": 32.4546796875, | |
| "step": 137000 | |
| }, | |
| { | |
| "epoch": 69.17295400426119, | |
| "grad_norm": 9.561952590942383, | |
| "learning_rate": 0.0003, | |
| "loss": 32.3915703125, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 69.17295400426119, | |
| "eval_accuracy": 0.7654183586207545, | |
| "eval_loss": 8.0234375, | |
| "eval_runtime": 311.376, | |
| "eval_samples_per_second": 1457.688, | |
| "eval_steps_per_second": 2.849, | |
| "step": 138000 | |
| }, | |
| { | |
| "epoch": 69.67426995864143, | |
| "grad_norm": 10.652801513671875, | |
| "learning_rate": 0.0003, | |
| "loss": 32.3813203125, | |
| "step": 139000 | |
| }, | |
| { | |
| "epoch": 70.17546058403309, | |
| "grad_norm": 9.549755096435547, | |
| "learning_rate": 0.0003, | |
| "loss": 32.329857421875, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 70.17546058403309, | |
| "eval_accuracy": 0.765731013146163, | |
| "eval_loss": 8.015625, | |
| "eval_runtime": 311.5237, | |
| "eval_samples_per_second": 1456.997, | |
| "eval_steps_per_second": 2.847, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 70.17546058403309, | |
| "eval_accuracy": 0.7655576478890911, | |
| "eval_loss": 8.03125, | |
| "eval_runtime": 312.782, | |
| "eval_samples_per_second": 1451.135, | |
| "eval_steps_per_second": 2.836, | |
| "step": 140000 | |
| }, | |
| { | |
| "epoch": 70.67677653841334, | |
| "grad_norm": 8.273364067077637, | |
| "learning_rate": 0.0003, | |
| "loss": 32.32880859375, | |
| "step": 141000 | |
| }, | |
| { | |
| "epoch": 71.177967163805, | |
| "grad_norm": 11.310037612915039, | |
| "learning_rate": 0.0003, | |
| "loss": 32.2803671875, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 71.177967163805, | |
| "eval_accuracy": 0.7654140428452689, | |
| "eval_loss": 8.0234375, | |
| "eval_runtime": 302.7715, | |
| "eval_samples_per_second": 1499.114, | |
| "eval_steps_per_second": 2.93, | |
| "step": 142000 | |
| }, | |
| { | |
| "epoch": 71.67928311818524, | |
| "grad_norm": 9.46422004699707, | |
| "learning_rate": 0.0003, | |
| "loss": 32.241615234375, | |
| "step": 143000 | |
| }, | |
| { | |
| "epoch": 72.18047374357688, | |
| "grad_norm": 9.287914276123047, | |
| "learning_rate": 0.0003, | |
| "loss": 32.22880078125, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 72.18047374357688, | |
| "eval_accuracy": 0.7658155554395308, | |
| "eval_loss": 8.015625, | |
| "eval_runtime": 300.7976, | |
| "eval_samples_per_second": 1508.951, | |
| "eval_steps_per_second": 2.949, | |
| "step": 144000 | |
| }, | |
| { | |
| "epoch": 72.68178969795714, | |
| "grad_norm": 9.183584213256836, | |
| "learning_rate": 0.0003, | |
| "loss": 32.233244140625, | |
| "step": 145000 | |
| }, | |
| { | |
| "epoch": 73.18298032334879, | |
| "grad_norm": 9.008417129516602, | |
| "learning_rate": 0.0003, | |
| "loss": 32.181228515625, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 73.18298032334879, | |
| "eval_accuracy": 0.76619202647217, | |
| "eval_loss": 7.98828125, | |
| "eval_runtime": 302.0251, | |
| "eval_samples_per_second": 1502.819, | |
| "eval_steps_per_second": 2.937, | |
| "step": 146000 | |
| }, | |
| { | |
| "epoch": 73.68429627772903, | |
| "grad_norm": 8.19743537902832, | |
| "learning_rate": 0.0003, | |
| "loss": 32.162357421875, | |
| "step": 147000 | |
| }, | |
| { | |
| "epoch": 74.18548690312069, | |
| "grad_norm": 8.455910682678223, | |
| "learning_rate": 0.0003, | |
| "loss": 32.091048828125, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 74.18548690312069, | |
| "eval_accuracy": 0.7663843416476586, | |
| "eval_loss": 7.97265625, | |
| "eval_runtime": 301.7215, | |
| "eval_samples_per_second": 1504.331, | |
| "eval_steps_per_second": 2.94, | |
| "step": 148000 | |
| }, | |
| { | |
| "epoch": 74.68680285750094, | |
| "grad_norm": 8.09157943725586, | |
| "learning_rate": 0.0003, | |
| "loss": 32.071322265625, | |
| "step": 149000 | |
| }, | |
| { | |
| "epoch": 75.1879934828926, | |
| "grad_norm": 12.704072952270508, | |
| "learning_rate": 0.0003, | |
| "loss": 32.044611328125, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 75.1879934828926, | |
| "eval_accuracy": 0.7670952482486783, | |
| "eval_loss": 7.96484375, | |
| "eval_runtime": 301.7456, | |
| "eval_samples_per_second": 1504.211, | |
| "eval_steps_per_second": 2.94, | |
| "step": 150000 | |
| }, | |
| { | |
| "epoch": 75.1879934828926, | |
| "step": 150000, | |
| "total_flos": 3.23779983669461e+19, | |
| "train_loss": 2.1457560286458333, | |
| "train_runtime": 27498.2172, | |
| "train_samples_per_second": 11171.633, | |
| "train_steps_per_second": 5.455 | |
| } | |
| ], | |
| "logging_steps": 1000, | |
| "max_steps": 150000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 76, | |
| "save_steps": 10000, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.23779983669461e+19, | |
| "train_batch_size": 64, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |