{ "best_metric": 0.5330188679245284, "best_model_checkpoint": "./Validated_cracks_raw_dataset_266_outputs/checkpoint-1280", "epoch": 30.0, "eval_steps": 500, "global_step": 2400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.625, "grad_norm": 12.7417573928833, "learning_rate": 3.916666666666667e-06, "loss": 1.3904, "step": 50 }, { "epoch": 1.0, "eval_accuracy": 0.35377358490566035, "eval_loss": 1.347093939781189, "eval_runtime": 1.5501, "eval_samples_per_second": 136.765, "eval_steps_per_second": 17.418, "step": 80 }, { "epoch": 1.25, "grad_norm": 13.116891860961914, "learning_rate": 8.083333333333334e-06, "loss": 1.3764, "step": 100 }, { "epoch": 1.875, "grad_norm": Infinity, "learning_rate": 9.996791734463078e-06, "loss": 1.3264, "step": 150 }, { "epoch": 2.0, "eval_accuracy": 0.33962264150943394, "eval_loss": 1.2973852157592773, "eval_runtime": 1.5451, "eval_samples_per_second": 137.208, "eval_steps_per_second": 17.475, "step": 160 }, { "epoch": 2.5, "grad_norm": 11.807487487792969, "learning_rate": 9.972609476841368e-06, "loss": 1.2875, "step": 200 }, { "epoch": 3.0, "eval_accuracy": 0.3490566037735849, "eval_loss": 1.2867087125778198, "eval_runtime": 1.5667, "eval_samples_per_second": 135.313, "eval_steps_per_second": 17.233, "step": 240 }, { "epoch": 3.125, "grad_norm": 13.677336692810059, "learning_rate": 9.92483424862726e-06, "loss": 1.2754, "step": 250 }, { "epoch": 3.75, "grad_norm": 11.059372901916504, "learning_rate": 9.8536927234736e-06, "loss": 1.2415, "step": 300 }, { "epoch": 4.0, "eval_accuracy": 0.37735849056603776, "eval_loss": 1.257009744644165, "eval_runtime": 1.5386, "eval_samples_per_second": 137.789, "eval_steps_per_second": 17.549, "step": 320 }, { "epoch": 4.375, "grad_norm": 9.970629692077637, "learning_rate": 9.759522438425813e-06, "loss": 1.2347, "step": 350 }, { "epoch": 5.0, "grad_norm": 36.81943130493164, "learning_rate": 9.642770192448537e-06, "loss": 1.2209, "step": 400 }, { "epoch": 5.0, "eval_accuracy": 0.4339622641509434, "eval_loss": 1.214514136314392, "eval_runtime": 1.5466, "eval_samples_per_second": 137.075, "eval_steps_per_second": 17.458, "step": 400 }, { "epoch": 5.625, "grad_norm": 25.951169967651367, "learning_rate": 9.50398992654976e-06, "loss": 1.1699, "step": 450 }, { "epoch": 6.0, "eval_accuracy": 0.41509433962264153, "eval_loss": 1.2138594388961792, "eval_runtime": 1.5575, "eval_samples_per_second": 136.117, "eval_steps_per_second": 17.336, "step": 480 }, { "epoch": 6.25, "grad_norm": 33.16694259643555, "learning_rate": 9.343840095560373e-06, "loss": 1.1707, "step": 500 }, { "epoch": 6.875, "grad_norm": 15.973888397216797, "learning_rate": 9.163080544038953e-06, "loss": 1.1315, "step": 550 }, { "epoch": 7.0, "eval_accuracy": 0.46226415094339623, "eval_loss": 1.1762322187423706, "eval_runtime": 1.548, "eval_samples_per_second": 136.955, "eval_steps_per_second": 17.442, "step": 560 }, { "epoch": 7.5, "grad_norm": 16.8402042388916, "learning_rate": 8.962568901124326e-06, "loss": 1.1565, "step": 600 }, { "epoch": 8.0, "eval_accuracy": 0.45754716981132076, "eval_loss": 1.1620187759399414, "eval_runtime": 1.5613, "eval_samples_per_second": 135.788, "eval_steps_per_second": 17.294, "step": 640 }, { "epoch": 8.125, "grad_norm": 27.61142921447754, "learning_rate": 8.743256511440829e-06, "loss": 1.1218, "step": 650 }, { "epoch": 8.75, "grad_norm": 14.339193344116211, "learning_rate": 8.511092281712174e-06, "loss": 1.1111, "step": 700 }, { "epoch": 9.0, "eval_accuracy": 0.4811320754716981, "eval_loss": 1.1559137105941772, "eval_runtime": 1.5476, "eval_samples_per_second": 136.986, "eval_steps_per_second": 17.446, "step": 720 }, { "epoch": 9.375, "grad_norm": 22.12592124938965, "learning_rate": 8.257705467351144e-06, "loss": 1.0349, "step": 750 }, { "epoch": 10.0, "grad_norm": 30.31277847290039, "learning_rate": 7.988862191016204e-06, "loss": 1.117, "step": 800 }, { "epoch": 10.0, "eval_accuracy": 0.5, "eval_loss": 1.125497817993164, "eval_runtime": 1.5617, "eval_samples_per_second": 135.746, "eval_steps_per_second": 17.288, "step": 800 }, { "epoch": 10.625, "grad_norm": 53.451171875, "learning_rate": 7.705838002605665e-06, "loss": 1.0174, "step": 850 }, { "epoch": 11.0, "eval_accuracy": 0.5047169811320755, "eval_loss": 1.1186039447784424, "eval_runtime": 1.5425, "eval_samples_per_second": 137.44, "eval_steps_per_second": 17.504, "step": 880 }, { "epoch": 11.25, "grad_norm": 18.92786979675293, "learning_rate": 7.409975734566998e-06, "loss": 1.0209, "step": 900 }, { "epoch": 11.875, "grad_norm": 25.6451358795166, "learning_rate": 7.102679130713538e-06, "loss": 1.0569, "step": 950 }, { "epoch": 12.0, "eval_accuracy": 0.5, "eval_loss": 1.1092461347579956, "eval_runtime": 1.5482, "eval_samples_per_second": 136.93, "eval_steps_per_second": 17.439, "step": 960 }, { "epoch": 12.5, "grad_norm": 16.849384307861328, "learning_rate": 6.785406186042e-06, "loss": 1.0089, "step": 1000 }, { "epoch": 13.0, "eval_accuracy": 0.5, "eval_loss": 1.1156014204025269, "eval_runtime": 1.5482, "eval_samples_per_second": 136.935, "eval_steps_per_second": 17.44, "step": 1040 }, { "epoch": 13.125, "grad_norm": 29.936918258666992, "learning_rate": 6.45966222915063e-06, "loss": 1.0412, "step": 1050 }, { "epoch": 13.75, "grad_norm": 46.736331939697266, "learning_rate": 6.126992780079032e-06, "loss": 1.0413, "step": 1100 }, { "epoch": 14.0, "eval_accuracy": 0.49528301886792453, "eval_loss": 1.108467698097229, "eval_runtime": 1.5451, "eval_samples_per_second": 137.209, "eval_steps_per_second": 17.475, "step": 1120 }, { "epoch": 14.375, "grad_norm": 35.94117736816406, "learning_rate": 5.788976217456275e-06, "loss": 1.019, "step": 1150 }, { "epoch": 15.0, "grad_norm": 23.572219848632812, "learning_rate": 5.447216289748596e-06, "loss": 0.9958, "step": 1200 }, { "epoch": 15.0, "eval_accuracy": 0.5235849056603774, "eval_loss": 1.115731954574585, "eval_runtime": 1.5428, "eval_samples_per_second": 137.41, "eval_steps_per_second": 17.5, "step": 1200 }, { "epoch": 15.625, "grad_norm": 19.54017448425293, "learning_rate": 5.103334506137773e-06, "loss": 0.9969, "step": 1250 }, { "epoch": 16.0, "eval_accuracy": 0.5330188679245284, "eval_loss": 1.1048997640609741, "eval_runtime": 1.5622, "eval_samples_per_second": 135.705, "eval_steps_per_second": 17.283, "step": 1280 }, { "epoch": 16.25, "grad_norm": 33.59697341918945, "learning_rate": 4.758962443132227e-06, "loss": 0.9203, "step": 1300 }, { "epoch": 16.875, "grad_norm": 22.073335647583008, "learning_rate": 4.415734003412873e-06, "loss": 0.9918, "step": 1350 }, { "epoch": 17.0, "eval_accuracy": 0.5047169811320755, "eval_loss": 1.1045137643814087, "eval_runtime": 1.5338, "eval_samples_per_second": 138.214, "eval_steps_per_second": 17.603, "step": 1360 }, { "epoch": 17.5, "grad_norm": 39.07722473144531, "learning_rate": 4.075277663642208e-06, "loss": 0.9798, "step": 1400 }, { "epoch": 18.0, "eval_accuracy": 0.5141509433962265, "eval_loss": 1.0932115316390991, "eval_runtime": 1.5496, "eval_samples_per_second": 136.805, "eval_steps_per_second": 17.423, "step": 1440 }, { "epoch": 18.125, "grad_norm": 34.9410285949707, "learning_rate": 3.739208748017647e-06, "loss": 0.9567, "step": 1450 }, { "epoch": 18.75, "grad_norm": 13.545350074768066, "learning_rate": 3.409121764227809e-06, "loss": 0.9232, "step": 1500 }, { "epoch": 19.0, "eval_accuracy": 0.5047169811320755, "eval_loss": 1.0961326360702515, "eval_runtime": 1.5504, "eval_samples_per_second": 136.736, "eval_steps_per_second": 17.414, "step": 1520 }, { "epoch": 19.375, "grad_norm": 21.65803337097168, "learning_rate": 3.0865828381745515e-06, "loss": 0.9919, "step": 1550 }, { "epoch": 20.0, "grad_norm": 24.88987159729004, "learning_rate": 2.7731222833547842e-06, "loss": 0.8817, "step": 1600 }, { "epoch": 20.0, "eval_accuracy": 0.49056603773584906, "eval_loss": 1.1115567684173584, "eval_runtime": 1.544, "eval_samples_per_second": 137.306, "eval_steps_per_second": 17.487, "step": 1600 }, { "epoch": 20.625, "grad_norm": 33.98754119873047, "learning_rate": 2.470227340157316e-06, "loss": 0.9587, "step": 1650 }, { "epoch": 21.0, "eval_accuracy": 0.5235849056603774, "eval_loss": 1.1000138521194458, "eval_runtime": 1.5478, "eval_samples_per_second": 136.971, "eval_steps_per_second": 17.444, "step": 1680 }, { "epoch": 21.25, "grad_norm": 18.886362075805664, "learning_rate": 2.179335119523745e-06, "loss": 0.8872, "step": 1700 }, { "epoch": 21.875, "grad_norm": 45.27411651611328, "learning_rate": 1.901825784452777e-06, "loss": 0.964, "step": 1750 }, { "epoch": 22.0, "eval_accuracy": 0.5094339622641509, "eval_loss": 1.097833275794983, "eval_runtime": 1.5565, "eval_samples_per_second": 136.207, "eval_steps_per_second": 17.347, "step": 1760 }, { "epoch": 22.5, "grad_norm": 46.455753326416016, "learning_rate": 1.6390160016989487e-06, "loss": 0.8906, "step": 1800 }, { "epoch": 23.0, "eval_accuracy": 0.5235849056603774, "eval_loss": 1.0974699258804321, "eval_runtime": 1.5424, "eval_samples_per_second": 137.448, "eval_steps_per_second": 17.505, "step": 1840 }, { "epoch": 23.125, "grad_norm": 40.78911590576172, "learning_rate": 1.3921526947346902e-06, "loss": 0.945, "step": 1850 }, { "epoch": 23.75, "grad_norm": 23.577985763549805, "learning_rate": 1.162407127615357e-06, "loss": 0.896, "step": 1900 }, { "epoch": 24.0, "eval_accuracy": 0.5141509433962265, "eval_loss": 1.099574089050293, "eval_runtime": 1.549, "eval_samples_per_second": 136.865, "eval_steps_per_second": 17.431, "step": 1920 }, { "epoch": 24.375, "grad_norm": 33.863609313964844, "learning_rate": 9.508693478168346e-07, "loss": 0.891, "step": 1950 }, { "epoch": 25.0, "grad_norm": 27.020963668823242, "learning_rate": 7.585430144121319e-07, "loss": 0.9156, "step": 2000 }, { "epoch": 25.0, "eval_accuracy": 0.5235849056603774, "eval_loss": 1.0951563119888306, "eval_runtime": 1.5411, "eval_samples_per_second": 137.565, "eval_steps_per_second": 17.52, "step": 2000 }, { "epoch": 25.625, "grad_norm": 15.748483657836914, "learning_rate": 5.863406361251472e-07, "loss": 0.8797, "step": 2050 }, { "epoch": 26.0, "eval_accuracy": 0.5235849056603774, "eval_loss": 1.0959974527359009, "eval_runtime": 1.5455, "eval_samples_per_second": 137.175, "eval_steps_per_second": 17.47, "step": 2080 }, { "epoch": 26.25, "grad_norm": 25.159223556518555, "learning_rate": 4.350792418550509e-07, "loss": 0.9148, "step": 2100 }, { "epoch": 26.875, "grad_norm": 18.654874801635742, "learning_rate": 3.0547650421285216e-07, "loss": 0.8781, "step": 2150 }, { "epoch": 27.0, "eval_accuracy": 0.5235849056603774, "eval_loss": 1.0947679281234741, "eval_runtime": 1.5435, "eval_samples_per_second": 137.346, "eval_steps_per_second": 17.492, "step": 2160 }, { "epoch": 27.5, "grad_norm": 37.69408416748047, "learning_rate": 1.9814733446237356e-07, "loss": 0.8698, "step": 2200 }, { "epoch": 28.0, "eval_accuracy": 0.5235849056603774, "eval_loss": 1.0946481227874756, "eval_runtime": 1.5439, "eval_samples_per_second": 137.312, "eval_steps_per_second": 17.488, "step": 2240 }, { "epoch": 28.125, "grad_norm": 18.448745727539062, "learning_rate": 1.1360096502120387e-07, "loss": 0.915, "step": 2250 }, { "epoch": 28.75, "grad_norm": 25.56351661682129, "learning_rate": 5.223853336398632e-08, "loss": 0.9, "step": 2300 }, { "epoch": 29.0, "eval_accuracy": 0.5283018867924528, "eval_loss": 1.0949124097824097, "eval_runtime": 1.5384, "eval_samples_per_second": 137.806, "eval_steps_per_second": 17.551, "step": 2320 }, { "epoch": 29.375, "grad_norm": 20.619009017944336, "learning_rate": 1.4351178791384702e-08, "loss": 0.9145, "step": 2350 }, { "epoch": 30.0, "grad_norm": 16.861053466796875, "learning_rate": 1.1866109479674593e-10, "loss": 0.8853, "step": 2400 }, { "epoch": 30.0, "eval_accuracy": 0.5283018867924528, "eval_loss": 1.0948740243911743, "eval_runtime": 2.7457, "eval_samples_per_second": 77.212, "eval_steps_per_second": 9.834, "step": 2400 }, { "epoch": 30.0, "step": 2400, "total_flos": 1.5090549662800282e+18, "train_loss": 1.0357336870829263, "train_runtime": 587.3439, "train_samples_per_second": 32.485, "train_steps_per_second": 4.086 } ], "logging_steps": 50, "max_steps": 2400, "num_input_tokens_seen": 0, "num_train_epochs": 30, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5090549662800282e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }