{
  "best_metric": 1.7788236141204834,
  "best_model_checkpoint": "./Sustainability_model/checkpoint-2000",
  "epoch": 0.9765625,
  "eval_steps": 100,
  "global_step": 2000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01220703125,
      "grad_norm": 1.6068687438964844,
      "learning_rate": 2e-05,
      "loss": 2.3822,
      "step": 25
    },
    {
      "epoch": 0.0244140625,
      "grad_norm": 2.9462766647338867,
      "learning_rate": 2e-05,
      "loss": 2.2692,
      "step": 50
    },
    {
      "epoch": 0.03662109375,
      "grad_norm": 1.7852909564971924,
      "learning_rate": 2e-05,
      "loss": 2.1006,
      "step": 75
    },
    {
      "epoch": 0.048828125,
      "grad_norm": 7.235644817352295,
      "learning_rate": 2e-05,
      "loss": 2.0278,
      "step": 100
    },
    {
      "epoch": 0.048828125,
      "eval_loss": 1.9972734451293945,
      "eval_runtime": 558.7385,
      "eval_samples_per_second": 3.667,
      "eval_steps_per_second": 0.46,
      "step": 100
    },
    {
      "epoch": 0.06103515625,
      "grad_norm": 2.125588893890381,
      "learning_rate": 2e-05,
      "loss": 2.022,
      "step": 125
    },
    {
      "epoch": 0.0732421875,
      "grad_norm": 4.054937362670898,
      "learning_rate": 2e-05,
      "loss": 1.8849,
      "step": 150
    },
    {
      "epoch": 0.08544921875,
      "grad_norm": 2.199298858642578,
      "learning_rate": 2e-05,
      "loss": 1.9177,
      "step": 175
    },
    {
      "epoch": 0.09765625,
      "grad_norm": 3.9646759033203125,
      "learning_rate": 2e-05,
      "loss": 1.8583,
      "step": 200
    },
    {
      "epoch": 0.09765625,
      "eval_loss": 1.8928931951522827,
      "eval_runtime": 556.4849,
      "eval_samples_per_second": 3.682,
      "eval_steps_per_second": 0.462,
      "step": 200
    },
    {
      "epoch": 0.10986328125,
      "grad_norm": 2.1288766860961914,
      "learning_rate": 2e-05,
      "loss": 1.833,
      "step": 225
    },
    {
      "epoch": 0.1220703125,
      "grad_norm": 3.614868640899658,
      "learning_rate": 2e-05,
      "loss": 1.8706,
      "step": 250
    },
    {
      "epoch": 0.13427734375,
      "grad_norm": 1.9845377206802368,
      "learning_rate": 2e-05,
      "loss": 1.8807,
      "step": 275
    },
    {
      "epoch": 0.146484375,
      "grad_norm": 3.452430009841919,
      "learning_rate": 2e-05,
      "loss": 1.8816,
      "step": 300
    },
    {
      "epoch": 0.146484375,
      "eval_loss": 1.8663687705993652,
      "eval_runtime": 559.6385,
      "eval_samples_per_second": 3.661,
      "eval_steps_per_second": 0.459,
      "step": 300
    },
    {
      "epoch": 0.15869140625,
      "grad_norm": 2.014082908630371,
      "learning_rate": 2e-05,
      "loss": 1.9114,
      "step": 325
    },
    {
      "epoch": 0.1708984375,
      "grad_norm": 3.840735673904419,
      "learning_rate": 2e-05,
      "loss": 1.7894,
      "step": 350
    },
    {
      "epoch": 0.18310546875,
      "grad_norm": 1.8957724571228027,
      "learning_rate": 2e-05,
      "loss": 1.8793,
      "step": 375
    },
    {
      "epoch": 0.1953125,
      "grad_norm": 2.8154468536376953,
      "learning_rate": 2e-05,
      "loss": 1.825,
      "step": 400
    },
    {
      "epoch": 0.1953125,
      "eval_loss": 1.8505265712738037,
      "eval_runtime": 561.2097,
      "eval_samples_per_second": 3.651,
      "eval_steps_per_second": 0.458,
      "step": 400
    },
    {
      "epoch": 0.20751953125,
      "grad_norm": 1.919839859008789,
      "learning_rate": 2e-05,
      "loss": 1.859,
      "step": 425
    },
    {
      "epoch": 0.2197265625,
      "grad_norm": 4.224733829498291,
      "learning_rate": 2e-05,
      "loss": 1.7288,
      "step": 450
    },
    {
      "epoch": 0.23193359375,
      "grad_norm": 2.011308431625366,
      "learning_rate": 2e-05,
      "loss": 1.8153,
      "step": 475
    },
    {
      "epoch": 0.244140625,
      "grad_norm": 2.4386088848114014,
      "learning_rate": 2e-05,
      "loss": 1.7606,
      "step": 500
    },
    {
      "epoch": 0.244140625,
      "eval_loss": 1.8400288820266724,
      "eval_runtime": 556.9,
      "eval_samples_per_second": 3.679,
      "eval_steps_per_second": 0.461,
      "step": 500
    },
    {
      "epoch": 0.25634765625,
      "grad_norm": 1.661309003829956,
      "learning_rate": 2e-05,
      "loss": 1.9004,
      "step": 525
    },
    {
      "epoch": 0.2685546875,
      "grad_norm": 3.9414749145507812,
      "learning_rate": 2e-05,
      "loss": 1.8056,
      "step": 550
    },
    {
      "epoch": 0.28076171875,
      "grad_norm": 1.7469593286514282,
      "learning_rate": 2e-05,
      "loss": 1.7967,
      "step": 575
    },
    {
      "epoch": 0.29296875,
      "grad_norm": 2.9050376415252686,
      "learning_rate": 2e-05,
      "loss": 1.8635,
      "step": 600
    },
    {
      "epoch": 0.29296875,
      "eval_loss": 1.8328964710235596,
      "eval_runtime": 557.6609,
      "eval_samples_per_second": 3.674,
      "eval_steps_per_second": 0.461,
      "step": 600
    },
    {
      "epoch": 0.30517578125,
      "grad_norm": 2.3050591945648193,
      "learning_rate": 2e-05,
      "loss": 1.879,
      "step": 625
    },
    {
      "epoch": 0.3173828125,
      "grad_norm": 3.167156219482422,
      "learning_rate": 2e-05,
      "loss": 1.8708,
      "step": 650
    },
    {
      "epoch": 0.32958984375,
      "grad_norm": 1.6256564855575562,
      "learning_rate": 2e-05,
      "loss": 1.8206,
      "step": 675
    },
    {
      "epoch": 0.341796875,
      "grad_norm": 2.5053651332855225,
      "learning_rate": 2e-05,
      "loss": 1.773,
      "step": 700
    },
    {
      "epoch": 0.341796875,
      "eval_loss": 1.8295789957046509,
      "eval_runtime": 559.9778,
      "eval_samples_per_second": 3.659,
      "eval_steps_per_second": 0.459,
      "step": 700
    },
    {
      "epoch": 0.35400390625,
      "grad_norm": 1.832867980003357,
      "learning_rate": 2e-05,
      "loss": 1.8656,
      "step": 725
    },
    {
      "epoch": 0.3662109375,
      "grad_norm": 3.687462568283081,
      "learning_rate": 2e-05,
      "loss": 1.8182,
      "step": 750
    },
    {
      "epoch": 0.37841796875,
      "grad_norm": 1.7461413145065308,
      "learning_rate": 2e-05,
      "loss": 1.8164,
      "step": 775
    },
    {
      "epoch": 0.390625,
      "grad_norm": 3.033299684524536,
      "learning_rate": 2e-05,
      "loss": 1.7928,
      "step": 800
    },
    {
      "epoch": 0.390625,
      "eval_loss": 1.8239413499832153,
      "eval_runtime": 556.8543,
      "eval_samples_per_second": 3.68,
      "eval_steps_per_second": 0.462,
      "step": 800
    },
    {
      "epoch": 0.40283203125,
      "grad_norm": 1.7098289728164673,
      "learning_rate": 2e-05,
      "loss": 1.8807,
      "step": 825
    },
    {
      "epoch": 0.4150390625,
      "grad_norm": 2.885392665863037,
      "learning_rate": 2e-05,
      "loss": 1.7988,
      "step": 850
    },
    {
      "epoch": 0.42724609375,
      "grad_norm": 1.851592779159546,
      "learning_rate": 2e-05,
      "loss": 1.9545,
      "step": 875
    },
    {
      "epoch": 0.439453125,
      "grad_norm": 3.0471203327178955,
      "learning_rate": 2e-05,
      "loss": 1.7525,
      "step": 900
    },
    {
      "epoch": 0.439453125,
      "eval_loss": 1.8138561248779297,
      "eval_runtime": 555.766,
      "eval_samples_per_second": 3.687,
      "eval_steps_per_second": 0.462,
      "step": 900
    },
    {
      "epoch": 0.45166015625,
      "grad_norm": 1.6414488554000854,
      "learning_rate": 2e-05,
      "loss": 1.811,
      "step": 925
    },
    {
      "epoch": 0.4638671875,
      "grad_norm": 2.5925180912017822,
      "learning_rate": 2e-05,
      "loss": 1.7496,
      "step": 950
    },
    {
      "epoch": 0.47607421875,
      "grad_norm": 1.6050211191177368,
      "learning_rate": 2e-05,
      "loss": 1.7312,
      "step": 975
    },
    {
      "epoch": 0.48828125,
      "grad_norm": 2.859921932220459,
      "learning_rate": 2e-05,
      "loss": 1.8117,
      "step": 1000
    },
    {
      "epoch": 0.48828125,
      "eval_loss": 1.8138936758041382,
      "eval_runtime": 558.8348,
      "eval_samples_per_second": 3.667,
      "eval_steps_per_second": 0.46,
      "step": 1000
    },
    {
      "epoch": 0.50048828125,
      "grad_norm": 1.8070147037506104,
      "learning_rate": 2e-05,
      "loss": 1.784,
      "step": 1025
    },
    {
      "epoch": 0.5126953125,
      "grad_norm": 3.1732072830200195,
      "learning_rate": 2e-05,
      "loss": 1.8634,
      "step": 1050
    },
    {
      "epoch": 0.52490234375,
      "grad_norm": 1.788548231124878,
      "learning_rate": 2e-05,
      "loss": 1.8776,
      "step": 1075
    },
    {
      "epoch": 0.537109375,
      "grad_norm": 3.278043031692505,
      "learning_rate": 2e-05,
      "loss": 1.7439,
      "step": 1100
    },
    {
      "epoch": 0.537109375,
      "eval_loss": 1.8076274394989014,
      "eval_runtime": 558.6938,
      "eval_samples_per_second": 3.667,
      "eval_steps_per_second": 0.46,
      "step": 1100
    },
    {
      "epoch": 0.54931640625,
      "grad_norm": 1.5899131298065186,
      "learning_rate": 2e-05,
      "loss": 1.7778,
      "step": 1125
    },
    {
      "epoch": 0.5615234375,
      "grad_norm": 2.8870561122894287,
      "learning_rate": 2e-05,
      "loss": 1.7076,
      "step": 1150
    },
    {
      "epoch": 0.57373046875,
      "grad_norm": 1.869402527809143,
      "learning_rate": 2e-05,
      "loss": 1.8059,
      "step": 1175
    },
    {
      "epoch": 0.5859375,
      "grad_norm": 3.1489057540893555,
      "learning_rate": 2e-05,
      "loss": 1.7877,
      "step": 1200
    },
    {
      "epoch": 0.5859375,
      "eval_loss": 1.8044580221176147,
      "eval_runtime": 558.1822,
      "eval_samples_per_second": 3.671,
      "eval_steps_per_second": 0.46,
      "step": 1200
    },
    {
      "epoch": 0.59814453125,
      "grad_norm": 1.7333471775054932,
      "learning_rate": 2e-05,
      "loss": 1.8707,
      "step": 1225
    },
    {
      "epoch": 0.6103515625,
      "grad_norm": 2.4759361743927,
      "learning_rate": 2e-05,
      "loss": 1.8181,
      "step": 1250
    },
    {
      "epoch": 0.62255859375,
      "grad_norm": 1.5844708681106567,
      "learning_rate": 2e-05,
      "loss": 1.75,
      "step": 1275
    },
    {
      "epoch": 0.634765625,
      "grad_norm": 3.089168071746826,
      "learning_rate": 2e-05,
      "loss": 1.7684,
      "step": 1300
    },
    {
      "epoch": 0.634765625,
      "eval_loss": 1.799038052558899,
      "eval_runtime": 560.26,
      "eval_samples_per_second": 3.657,
      "eval_steps_per_second": 0.459,
      "step": 1300
    },
    {
      "epoch": 0.64697265625,
      "grad_norm": 1.7920058965682983,
      "learning_rate": 2e-05,
      "loss": 1.8733,
      "step": 1325
    },
    {
      "epoch": 0.6591796875,
      "grad_norm": 2.8308629989624023,
      "learning_rate": 2e-05,
      "loss": 1.796,
      "step": 1350
    },
    {
      "epoch": 0.67138671875,
      "grad_norm": 1.7894172668457031,
      "learning_rate": 2e-05,
      "loss": 1.8136,
      "step": 1375
    },
    {
      "epoch": 0.68359375,
      "grad_norm": 2.4572231769561768,
      "learning_rate": 2e-05,
      "loss": 1.7446,
      "step": 1400
    },
    {
      "epoch": 0.68359375,
      "eval_loss": 1.7971055507659912,
      "eval_runtime": 557.4223,
      "eval_samples_per_second": 3.676,
      "eval_steps_per_second": 0.461,
      "step": 1400
    },
    {
      "epoch": 0.69580078125,
      "grad_norm": 1.9371016025543213,
      "learning_rate": 2e-05,
      "loss": 1.8745,
      "step": 1425
    },
    {
      "epoch": 0.7080078125,
      "grad_norm": 2.6103124618530273,
      "learning_rate": 2e-05,
      "loss": 1.8161,
      "step": 1450
    },
    {
      "epoch": 0.72021484375,
      "grad_norm": 1.9414174556732178,
      "learning_rate": 2e-05,
      "loss": 1.7821,
      "step": 1475
    },
    {
      "epoch": 0.732421875,
      "grad_norm": 2.5814948081970215,
      "learning_rate": 2e-05,
      "loss": 1.7906,
      "step": 1500
    },
    {
      "epoch": 0.732421875,
      "eval_loss": 1.7981668710708618,
      "eval_runtime": 558.1699,
      "eval_samples_per_second": 3.671,
      "eval_steps_per_second": 0.46,
      "step": 1500
    },
    {
      "epoch": 0.74462890625,
      "grad_norm": 1.8409160375595093,
      "learning_rate": 2e-05,
      "loss": 1.8324,
      "step": 1525
    },
    {
      "epoch": 0.7568359375,
      "grad_norm": 3.2258872985839844,
      "learning_rate": 2e-05,
      "loss": 1.7808,
      "step": 1550
    },
    {
      "epoch": 0.76904296875,
      "grad_norm": 1.7108904123306274,
      "learning_rate": 2e-05,
      "loss": 1.8532,
      "step": 1575
    },
    {
      "epoch": 0.78125,
      "grad_norm": 2.734626531600952,
      "learning_rate": 2e-05,
      "loss": 1.7532,
      "step": 1600
    },
    {
      "epoch": 0.78125,
      "eval_loss": 1.7900142669677734,
      "eval_runtime": 556.0078,
      "eval_samples_per_second": 3.685,
      "eval_steps_per_second": 0.462,
      "step": 1600
    },
    {
      "epoch": 0.79345703125,
      "grad_norm": 1.509756326675415,
      "learning_rate": 2e-05,
      "loss": 1.8452,
      "step": 1625
    },
    {
      "epoch": 0.8056640625,
      "grad_norm": 2.925522804260254,
      "learning_rate": 2e-05,
      "loss": 1.7518,
      "step": 1650
    },
    {
      "epoch": 0.81787109375,
      "grad_norm": 1.6526622772216797,
      "learning_rate": 2e-05,
      "loss": 1.8258,
      "step": 1675
    },
    {
      "epoch": 0.830078125,
      "grad_norm": 2.6019341945648193,
      "learning_rate": 2e-05,
      "loss": 1.7076,
      "step": 1700
    },
    {
      "epoch": 0.830078125,
      "eval_loss": 1.788050889968872,
      "eval_runtime": 559.141,
      "eval_samples_per_second": 3.665,
      "eval_steps_per_second": 0.46,
      "step": 1700
    },
    {
      "epoch": 0.84228515625,
      "grad_norm": 1.7243990898132324,
      "learning_rate": 2e-05,
      "loss": 1.8014,
      "step": 1725
    },
    {
      "epoch": 0.8544921875,
      "grad_norm": 2.976571559906006,
      "learning_rate": 2e-05,
      "loss": 1.7137,
      "step": 1750
    },
    {
      "epoch": 0.86669921875,
      "grad_norm": 1.5606143474578857,
      "learning_rate": 2e-05,
      "loss": 1.8229,
      "step": 1775
    },
    {
      "epoch": 0.87890625,
      "grad_norm": 2.303173780441284,
      "learning_rate": 2e-05,
      "loss": 1.8016,
      "step": 1800
    },
    {
      "epoch": 0.87890625,
      "eval_loss": 1.784562110900879,
      "eval_runtime": 558.6515,
      "eval_samples_per_second": 3.668,
      "eval_steps_per_second": 0.46,
      "step": 1800
    },
    {
      "epoch": 0.89111328125,
      "grad_norm": 1.7488081455230713,
      "learning_rate": 2e-05,
      "loss": 1.7196,
      "step": 1825
    },
    {
      "epoch": 0.9033203125,
      "grad_norm": 2.4788448810577393,
      "learning_rate": 2e-05,
      "loss": 1.7941,
      "step": 1850
    },
    {
      "epoch": 0.91552734375,
      "grad_norm": 1.5106720924377441,
      "learning_rate": 2e-05,
      "loss": 1.8584,
      "step": 1875
    },
    {
      "epoch": 0.927734375,
      "grad_norm": 2.238421678543091,
      "learning_rate": 2e-05,
      "loss": 1.6856,
      "step": 1900
    },
    {
      "epoch": 0.927734375,
      "eval_loss": 1.7807437181472778,
      "eval_runtime": 557.7297,
      "eval_samples_per_second": 3.674,
      "eval_steps_per_second": 0.461,
      "step": 1900
    },
    {
      "epoch": 0.93994140625,
      "grad_norm": 1.5449031591415405,
      "learning_rate": 2e-05,
      "loss": 1.7737,
      "step": 1925
    },
    {
      "epoch": 0.9521484375,
      "grad_norm": 2.7735748291015625,
      "learning_rate": 2e-05,
      "loss": 1.7397,
      "step": 1950
    },
    {
      "epoch": 0.96435546875,
      "grad_norm": 1.6932679414749146,
      "learning_rate": 2e-05,
      "loss": 1.8728,
      "step": 1975
    },
    {
      "epoch": 0.9765625,
      "grad_norm": 2.5551226139068604,
      "learning_rate": 2e-05,
      "loss": 1.7274,
      "step": 2000
    },
    {
      "epoch": 0.9765625,
      "eval_loss": 1.7788236141204834,
      "eval_runtime": 559.018,
      "eval_samples_per_second": 3.665,
      "eval_steps_per_second": 0.46,
      "step": 2000
    }
  ],
  "logging_steps": 25,
  "max_steps": 2048,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 6,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 0
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 6.806352400986931e+16,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}