{
  "best_metric": 0.7487396597862244,
  "best_model_checkpoint": "mgh6/TCS_MLM/checkpoint-6500",
  "epoch": 9.370816599732262,
  "eval_steps": 100,
  "global_step": 7000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.13386880856760375,
      "grad_norm": 0.18324387073516846,
      "learning_rate": 0.0009866131191432397,
      "loss": 1.432,
      "step": 100
    },
    {
      "epoch": 0.13386880856760375,
      "eval_loss": 1.1609667539596558,
      "eval_runtime": 6.3619,
      "eval_samples_per_second": 894.697,
      "eval_steps_per_second": 3.615,
      "step": 100
    },
    {
      "epoch": 0.2677376171352075,
      "grad_norm": 0.24928592145442963,
      "learning_rate": 0.0009732262382864793,
      "loss": 1.195,
      "step": 200
    },
    {
      "epoch": 0.2677376171352075,
      "eval_loss": 1.0809524059295654,
      "eval_runtime": 6.3583,
      "eval_samples_per_second": 895.204,
      "eval_steps_per_second": 3.617,
      "step": 200
    },
    {
      "epoch": 0.40160642570281124,
      "grad_norm": 0.2209886759519577,
      "learning_rate": 0.0009598393574297188,
      "loss": 1.124,
      "step": 300
    },
    {
      "epoch": 0.40160642570281124,
      "eval_loss": 1.0328294038772583,
      "eval_runtime": 6.3831,
      "eval_samples_per_second": 891.734,
      "eval_steps_per_second": 3.603,
      "step": 300
    },
    {
      "epoch": 0.535475234270415,
      "grad_norm": 0.21808107197284698,
      "learning_rate": 0.0009464524765729585,
      "loss": 1.0729,
      "step": 400
    },
    {
      "epoch": 0.535475234270415,
      "eval_loss": 1.0003758668899536,
      "eval_runtime": 6.3823,
      "eval_samples_per_second": 891.836,
      "eval_steps_per_second": 3.604,
      "step": 400
    },
    {
      "epoch": 0.6693440428380187,
      "grad_norm": 0.19088135659694672,
      "learning_rate": 0.0009330655957161981,
      "loss": 1.0403,
      "step": 500
    },
    {
      "epoch": 0.6693440428380187,
      "eval_loss": 0.9832900762557983,
      "eval_runtime": 6.38,
      "eval_samples_per_second": 892.158,
      "eval_steps_per_second": 3.605,
      "step": 500
    },
    {
      "epoch": 0.8032128514056225,
      "grad_norm": 0.19500073790550232,
      "learning_rate": 0.0009196787148594378,
      "loss": 1.0145,
      "step": 600
    },
    {
      "epoch": 0.8032128514056225,
      "eval_loss": 0.9555763602256775,
      "eval_runtime": 6.3688,
      "eval_samples_per_second": 893.733,
      "eval_steps_per_second": 3.611,
      "step": 600
    },
    {
      "epoch": 0.9370816599732262,
      "grad_norm": 0.21632438898086548,
      "learning_rate": 0.0009062918340026773,
      "loss": 0.9922,
      "step": 700
    },
    {
      "epoch": 0.9370816599732262,
      "eval_loss": 0.9372277855873108,
      "eval_runtime": 6.3894,
      "eval_samples_per_second": 890.852,
      "eval_steps_per_second": 3.6,
      "step": 700
    },
    {
      "epoch": 1.07095046854083,
      "grad_norm": 0.19669267535209656,
      "learning_rate": 0.000892904953145917,
      "loss": 0.9746,
      "step": 800
    },
    {
      "epoch": 1.07095046854083,
      "eval_loss": 0.9347544312477112,
      "eval_runtime": 6.3997,
      "eval_samples_per_second": 889.412,
      "eval_steps_per_second": 3.594,
      "step": 800
    },
    {
      "epoch": 1.2048192771084336,
      "grad_norm": 0.19168391823768616,
      "learning_rate": 0.0008795180722891566,
      "loss": 0.956,
      "step": 900
    },
    {
      "epoch": 1.2048192771084336,
      "eval_loss": 0.9178469181060791,
      "eval_runtime": 6.3662,
      "eval_samples_per_second": 894.095,
      "eval_steps_per_second": 3.613,
      "step": 900
    },
    {
      "epoch": 1.3386880856760375,
      "grad_norm": 0.18636095523834229,
      "learning_rate": 0.0008661311914323963,
      "loss": 0.9389,
      "step": 1000
    },
    {
      "epoch": 1.3386880856760375,
      "eval_loss": 0.9004408717155457,
      "eval_runtime": 6.385,
      "eval_samples_per_second": 891.467,
      "eval_steps_per_second": 3.602,
      "step": 1000
    },
    {
      "epoch": 1.4725568942436413,
      "grad_norm": 0.2045995146036148,
      "learning_rate": 0.0008527443105756359,
      "loss": 0.9242,
      "step": 1100
    },
    {
      "epoch": 1.4725568942436413,
      "eval_loss": 0.8951759934425354,
      "eval_runtime": 6.39,
      "eval_samples_per_second": 890.768,
      "eval_steps_per_second": 3.599,
      "step": 1100
    },
    {
      "epoch": 1.606425702811245,
      "grad_norm": 0.18196602165699005,
      "learning_rate": 0.0008393574297188755,
      "loss": 0.9153,
      "step": 1200
    },
    {
      "epoch": 1.606425702811245,
      "eval_loss": 0.8943730592727661,
      "eval_runtime": 6.3848,
      "eval_samples_per_second": 891.496,
      "eval_steps_per_second": 3.602,
      "step": 1200
    },
    {
      "epoch": 1.7402945113788486,
      "grad_norm": 0.18478406965732574,
      "learning_rate": 0.0008259705488621151,
      "loss": 0.9034,
      "step": 1300
    },
    {
      "epoch": 1.7402945113788486,
      "eval_loss": 0.8838639259338379,
      "eval_runtime": 6.3769,
      "eval_samples_per_second": 892.596,
      "eval_steps_per_second": 3.607,
      "step": 1300
    },
    {
      "epoch": 1.8741633199464525,
      "grad_norm": 0.196046844124794,
      "learning_rate": 0.0008125836680053548,
      "loss": 0.8933,
      "step": 1400
    },
    {
      "epoch": 1.8741633199464525,
      "eval_loss": 0.8721866607666016,
      "eval_runtime": 6.3772,
      "eval_samples_per_second": 892.551,
      "eval_steps_per_second": 3.607,
      "step": 1400
    },
    {
      "epoch": 2.0080321285140563,
      "grad_norm": 0.19012120366096497,
      "learning_rate": 0.0007991967871485943,
      "loss": 0.885,
      "step": 1500
    },
    {
      "epoch": 2.0080321285140563,
      "eval_loss": 0.8718409538269043,
      "eval_runtime": 6.3647,
      "eval_samples_per_second": 894.312,
      "eval_steps_per_second": 3.614,
      "step": 1500
    },
    {
      "epoch": 2.14190093708166,
      "grad_norm": 0.21610258519649506,
      "learning_rate": 0.0007858099062918341,
      "loss": 0.8716,
      "step": 1600
    },
    {
      "epoch": 2.14190093708166,
      "eval_loss": 0.8591811060905457,
      "eval_runtime": 6.3979,
      "eval_samples_per_second": 889.671,
      "eval_steps_per_second": 3.595,
      "step": 1600
    },
    {
      "epoch": 2.2757697456492636,
      "grad_norm": 0.1839440017938614,
      "learning_rate": 0.0007724230254350736,
      "loss": 0.8628,
      "step": 1700
    },
    {
      "epoch": 2.2757697456492636,
      "eval_loss": 0.8537179231643677,
      "eval_runtime": 6.3793,
      "eval_samples_per_second": 892.265,
      "eval_steps_per_second": 3.605,
      "step": 1700
    },
    {
      "epoch": 2.4096385542168672,
      "grad_norm": 0.18840855360031128,
      "learning_rate": 0.0007590361445783132,
      "loss": 0.858,
      "step": 1800
    },
    {
      "epoch": 2.4096385542168672,
      "eval_loss": 0.8528442978858948,
      "eval_runtime": 6.3969,
      "eval_samples_per_second": 889.806,
      "eval_steps_per_second": 3.595,
      "step": 1800
    },
    {
      "epoch": 2.5435073627844713,
      "grad_norm": 0.18157944083213806,
      "learning_rate": 0.0007456492637215529,
      "loss": 0.8556,
      "step": 1900
    },
    {
      "epoch": 2.5435073627844713,
      "eval_loss": 0.8446890115737915,
      "eval_runtime": 6.3767,
      "eval_samples_per_second": 892.622,
      "eval_steps_per_second": 3.607,
      "step": 1900
    },
    {
      "epoch": 2.677376171352075,
      "grad_norm": 0.1894202083349228,
      "learning_rate": 0.0007322623828647925,
      "loss": 0.8463,
      "step": 2000
    },
    {
      "epoch": 2.677376171352075,
      "eval_loss": 0.8482021689414978,
      "eval_runtime": 6.3911,
      "eval_samples_per_second": 890.609,
      "eval_steps_per_second": 3.599,
      "step": 2000
    },
    {
      "epoch": 2.8112449799196786,
      "grad_norm": 0.2042614370584488,
      "learning_rate": 0.000718875502008032,
      "loss": 0.8372,
      "step": 2100
    },
    {
      "epoch": 2.8112449799196786,
      "eval_loss": 0.8368203639984131,
      "eval_runtime": 6.3674,
      "eval_samples_per_second": 893.931,
      "eval_steps_per_second": 3.612,
      "step": 2100
    },
    {
      "epoch": 2.9451137884872827,
      "grad_norm": 0.1776580661535263,
      "learning_rate": 0.0007054886211512718,
      "loss": 0.8327,
      "step": 2200
    },
    {
      "epoch": 2.9451137884872827,
      "eval_loss": 0.8410006165504456,
      "eval_runtime": 6.4205,
      "eval_samples_per_second": 886.529,
      "eval_steps_per_second": 3.582,
      "step": 2200
    },
    {
      "epoch": 3.0789825970548863,
      "grad_norm": 0.2126319259405136,
      "learning_rate": 0.0006921017402945113,
      "loss": 0.8298,
      "step": 2300
    },
    {
      "epoch": 3.0789825970548863,
      "eval_loss": 0.8286001682281494,
      "eval_runtime": 6.4466,
      "eval_samples_per_second": 882.942,
      "eval_steps_per_second": 3.568,
      "step": 2300
    },
    {
      "epoch": 3.21285140562249,
      "grad_norm": 0.1851571649312973,
      "learning_rate": 0.000678714859437751,
      "loss": 0.8181,
      "step": 2400
    },
    {
      "epoch": 3.21285140562249,
      "eval_loss": 0.8264986872673035,
      "eval_runtime": 6.3671,
      "eval_samples_per_second": 893.972,
      "eval_steps_per_second": 3.612,
      "step": 2400
    },
    {
      "epoch": 3.3467202141900936,
      "grad_norm": 0.21075735986232758,
      "learning_rate": 0.0006653279785809906,
      "loss": 0.814,
      "step": 2500
    },
    {
      "epoch": 3.3467202141900936,
      "eval_loss": 0.8332136869430542,
      "eval_runtime": 6.3889,
      "eval_samples_per_second": 890.924,
      "eval_steps_per_second": 3.6,
      "step": 2500
    },
    {
      "epoch": 3.480589022757697,
      "grad_norm": 0.1842898279428482,
      "learning_rate": 0.0006519410977242302,
      "loss": 0.8072,
      "step": 2600
    },
    {
      "epoch": 3.480589022757697,
      "eval_loss": 0.8202800750732422,
      "eval_runtime": 6.3504,
      "eval_samples_per_second": 896.32,
      "eval_steps_per_second": 3.622,
      "step": 2600
    },
    {
      "epoch": 3.6144578313253013,
      "grad_norm": 0.18545600771903992,
      "learning_rate": 0.0006385542168674699,
      "loss": 0.8071,
      "step": 2700
    },
    {
      "epoch": 3.6144578313253013,
      "eval_loss": 0.8289027214050293,
      "eval_runtime": 6.4542,
      "eval_samples_per_second": 881.901,
      "eval_steps_per_second": 3.564,
      "step": 2700
    },
    {
      "epoch": 3.748326639892905,
      "grad_norm": 0.19937384128570557,
      "learning_rate": 0.0006251673360107095,
      "loss": 0.8,
      "step": 2800
    },
    {
      "epoch": 3.748326639892905,
      "eval_loss": 0.8232221007347107,
      "eval_runtime": 6.3959,
      "eval_samples_per_second": 889.945,
      "eval_steps_per_second": 3.596,
      "step": 2800
    },
    {
      "epoch": 3.8821954484605086,
      "grad_norm": 0.22407300770282745,
      "learning_rate": 0.0006117804551539491,
      "loss": 0.7964,
      "step": 2900
    },
    {
      "epoch": 3.8821954484605086,
      "eval_loss": 0.8169597387313843,
      "eval_runtime": 6.4097,
      "eval_samples_per_second": 888.036,
      "eval_steps_per_second": 3.588,
      "step": 2900
    },
    {
      "epoch": 4.016064257028113,
      "grad_norm": 0.20041291415691376,
      "learning_rate": 0.0005983935742971888,
      "loss": 0.7909,
      "step": 3000
    },
    {
      "epoch": 4.016064257028113,
      "eval_loss": 0.819555401802063,
      "eval_runtime": 6.3716,
      "eval_samples_per_second": 893.337,
      "eval_steps_per_second": 3.61,
      "step": 3000
    },
    {
      "epoch": 4.149933065595716,
      "grad_norm": 0.19783490896224976,
      "learning_rate": 0.0005850066934404283,
      "loss": 0.7826,
      "step": 3100
    },
    {
      "epoch": 4.149933065595716,
      "eval_loss": 0.8133747577667236,
      "eval_runtime": 6.3555,
      "eval_samples_per_second": 895.607,
      "eval_steps_per_second": 3.619,
      "step": 3100
    },
    {
      "epoch": 4.28380187416332,
      "grad_norm": 0.19236040115356445,
      "learning_rate": 0.000571619812583668,
      "loss": 0.7805,
      "step": 3200
    },
    {
      "epoch": 4.28380187416332,
      "eval_loss": 0.805473268032074,
      "eval_runtime": 6.4107,
      "eval_samples_per_second": 887.894,
      "eval_steps_per_second": 3.588,
      "step": 3200
    },
    {
      "epoch": 4.417670682730924,
      "grad_norm": 0.20242071151733398,
      "learning_rate": 0.0005582329317269076,
      "loss": 0.775,
      "step": 3300
    },
    {
      "epoch": 4.417670682730924,
      "eval_loss": 0.8090841174125671,
      "eval_runtime": 6.3804,
      "eval_samples_per_second": 892.112,
      "eval_steps_per_second": 3.605,
      "step": 3300
    },
    {
      "epoch": 4.551539491298527,
      "grad_norm": 0.18258976936340332,
      "learning_rate": 0.0005448460508701473,
      "loss": 0.7677,
      "step": 3400
    },
    {
      "epoch": 4.551539491298527,
      "eval_loss": 0.8068288564682007,
      "eval_runtime": 6.3962,
      "eval_samples_per_second": 889.896,
      "eval_steps_per_second": 3.596,
      "step": 3400
    },
    {
      "epoch": 4.685408299866131,
      "grad_norm": 0.2070203423500061,
      "learning_rate": 0.0005314591700133868,
      "loss": 0.7658,
      "step": 3500
    },
    {
      "epoch": 4.685408299866131,
      "eval_loss": 0.7966070771217346,
      "eval_runtime": 6.3599,
      "eval_samples_per_second": 894.984,
      "eval_steps_per_second": 3.616,
      "step": 3500
    },
    {
      "epoch": 4.8192771084337345,
      "grad_norm": 0.19489823281764984,
      "learning_rate": 0.0005180722891566265,
      "loss": 0.768,
      "step": 3600
    },
    {
      "epoch": 4.8192771084337345,
      "eval_loss": 0.800128698348999,
      "eval_runtime": 6.3948,
      "eval_samples_per_second": 890.099,
      "eval_steps_per_second": 3.597,
      "step": 3600
    },
    {
      "epoch": 4.953145917001339,
      "grad_norm": 0.18897077441215515,
      "learning_rate": 0.0005046854082998661,
      "loss": 0.765,
      "step": 3700
    },
    {
      "epoch": 4.953145917001339,
      "eval_loss": 0.7916857600212097,
      "eval_runtime": 6.3763,
      "eval_samples_per_second": 892.674,
      "eval_steps_per_second": 3.607,
      "step": 3700
    },
    {
      "epoch": 5.087014725568943,
      "grad_norm": 0.19471462070941925,
      "learning_rate": 0.0004912985274431057,
      "loss": 0.7532,
      "step": 3800
    },
    {
      "epoch": 5.087014725568943,
      "eval_loss": 0.8013682961463928,
      "eval_runtime": 6.4045,
      "eval_samples_per_second": 888.745,
      "eval_steps_per_second": 3.591,
      "step": 3800
    },
    {
      "epoch": 5.220883534136546,
      "grad_norm": 0.19813202321529388,
      "learning_rate": 0.0004779116465863454,
      "loss": 0.75,
      "step": 3900
    },
    {
      "epoch": 5.220883534136546,
      "eval_loss": 0.7911626696586609,
      "eval_runtime": 6.3995,
      "eval_samples_per_second": 889.442,
      "eval_steps_per_second": 3.594,
      "step": 3900
    },
    {
      "epoch": 5.35475234270415,
      "grad_norm": 0.199341282248497,
      "learning_rate": 0.000464524765729585,
      "loss": 0.7462,
      "step": 4000
    },
    {
      "epoch": 5.35475234270415,
      "eval_loss": 0.7948747873306274,
      "eval_runtime": 6.3721,
      "eval_samples_per_second": 893.27,
      "eval_steps_per_second": 3.609,
      "step": 4000
    },
    {
      "epoch": 5.4886211512717535,
      "grad_norm": 0.22185169160366058,
      "learning_rate": 0.00045113788487282465,
      "loss": 0.7461,
      "step": 4100
    },
    {
      "epoch": 5.4886211512717535,
      "eval_loss": 0.7832607626914978,
      "eval_runtime": 6.3959,
      "eval_samples_per_second": 889.94,
      "eval_steps_per_second": 3.596,
      "step": 4100
    },
    {
      "epoch": 5.622489959839357,
      "grad_norm": 0.19276629388332367,
      "learning_rate": 0.0004377510040160643,
      "loss": 0.7411,
      "step": 4200
    },
    {
      "epoch": 5.622489959839357,
      "eval_loss": 0.78049236536026,
      "eval_runtime": 6.3726,
      "eval_samples_per_second": 893.205,
      "eval_steps_per_second": 3.609,
      "step": 4200
    },
    {
      "epoch": 5.756358768406961,
      "grad_norm": 0.19334332644939423,
      "learning_rate": 0.00042436412315930387,
      "loss": 0.7389,
      "step": 4300
    },
    {
      "epoch": 5.756358768406961,
      "eval_loss": 0.7910569906234741,
      "eval_runtime": 6.3418,
      "eval_samples_per_second": 897.535,
      "eval_steps_per_second": 3.627,
      "step": 4300
    },
    {
      "epoch": 5.890227576974565,
      "grad_norm": 0.19738435745239258,
      "learning_rate": 0.0004109772423025435,
      "loss": 0.7339,
      "step": 4400
    },
    {
      "epoch": 5.890227576974565,
      "eval_loss": 0.7912316918373108,
      "eval_runtime": 6.4227,
      "eval_samples_per_second": 886.234,
      "eval_steps_per_second": 3.581,
      "step": 4400
    },
    {
      "epoch": 6.024096385542169,
      "grad_norm": 0.19529978930950165,
      "learning_rate": 0.00039759036144578315,
      "loss": 0.7329,
      "step": 4500
    },
    {
      "epoch": 6.024096385542169,
      "eval_loss": 0.7827839851379395,
      "eval_runtime": 6.3652,
      "eval_samples_per_second": 894.234,
      "eval_steps_per_second": 3.613,
      "step": 4500
    },
    {
      "epoch": 6.157965194109773,
      "grad_norm": 0.18886443972587585,
      "learning_rate": 0.0003842034805890228,
      "loss": 0.7246,
      "step": 4600
    },
    {
      "epoch": 6.157965194109773,
      "eval_loss": 0.7793735861778259,
      "eval_runtime": 6.3661,
      "eval_samples_per_second": 894.112,
      "eval_steps_per_second": 3.613,
      "step": 4600
    },
    {
      "epoch": 6.291834002677376,
      "grad_norm": 0.20140951871871948,
      "learning_rate": 0.0003708165997322624,
      "loss": 0.7186,
      "step": 4700
    },
    {
      "epoch": 6.291834002677376,
      "eval_loss": 0.7824135422706604,
      "eval_runtime": 6.379,
      "eval_samples_per_second": 892.303,
      "eval_steps_per_second": 3.606,
      "step": 4700
    },
    {
      "epoch": 6.42570281124498,
      "grad_norm": 0.19508065283298492,
      "learning_rate": 0.000357429718875502,
      "loss": 0.7196,
      "step": 4800
    },
    {
      "epoch": 6.42570281124498,
      "eval_loss": 0.7769716382026672,
      "eval_runtime": 6.3587,
      "eval_samples_per_second": 895.148,
      "eval_steps_per_second": 3.617,
      "step": 4800
    },
    {
      "epoch": 6.5595716198125835,
      "grad_norm": 0.2040824443101883,
      "learning_rate": 0.00034404283801874166,
      "loss": 0.7194,
      "step": 4900
    },
    {
      "epoch": 6.5595716198125835,
      "eval_loss": 0.775974452495575,
      "eval_runtime": 6.415,
      "eval_samples_per_second": 887.297,
      "eval_steps_per_second": 3.585,
      "step": 4900
    },
    {
      "epoch": 6.693440428380187,
      "grad_norm": 0.21073400974273682,
      "learning_rate": 0.00033065595716198125,
      "loss": 0.7166,
      "step": 5000
    },
    {
      "epoch": 6.693440428380187,
      "eval_loss": 0.7732182145118713,
      "eval_runtime": 6.3552,
      "eval_samples_per_second": 895.647,
      "eval_steps_per_second": 3.619,
      "step": 5000
    },
    {
      "epoch": 6.827309236947791,
      "grad_norm": 0.19911488890647888,
      "learning_rate": 0.0003172690763052209,
      "loss": 0.7113,
      "step": 5100
    },
    {
      "epoch": 6.827309236947791,
      "eval_loss": 0.7706419825553894,
      "eval_runtime": 6.3916,
      "eval_samples_per_second": 890.547,
      "eval_steps_per_second": 3.598,
      "step": 5100
    },
    {
      "epoch": 6.961178045515394,
      "grad_norm": 0.1983019858598709,
      "learning_rate": 0.00030388219544846053,
      "loss": 0.7077,
      "step": 5200
    },
    {
      "epoch": 6.961178045515394,
      "eval_loss": 0.7824519276618958,
      "eval_runtime": 6.3567,
      "eval_samples_per_second": 895.429,
      "eval_steps_per_second": 3.618,
      "step": 5200
    },
    {
      "epoch": 7.095046854082999,
      "grad_norm": 0.19971829652786255,
      "learning_rate": 0.0002904953145917001,
      "loss": 0.6997,
      "step": 5300
    },
    {
      "epoch": 7.095046854082999,
      "eval_loss": 0.7725899815559387,
      "eval_runtime": 6.3729,
      "eval_samples_per_second": 893.153,
      "eval_steps_per_second": 3.609,
      "step": 5300
    },
    {
      "epoch": 7.228915662650603,
      "grad_norm": 0.2070448100566864,
      "learning_rate": 0.00027710843373493976,
      "loss": 0.6983,
      "step": 5400
    },
    {
      "epoch": 7.228915662650603,
      "eval_loss": 0.7650670409202576,
      "eval_runtime": 6.4167,
      "eval_samples_per_second": 887.065,
      "eval_steps_per_second": 3.584,
      "step": 5400
    },
    {
      "epoch": 7.362784471218206,
      "grad_norm": 0.19670027494430542,
      "learning_rate": 0.0002637215528781794,
      "loss": 0.6967,
      "step": 5500
    },
    {
      "epoch": 7.362784471218206,
      "eval_loss": 0.7688850164413452,
      "eval_runtime": 6.3788,
      "eval_samples_per_second": 892.334,
      "eval_steps_per_second": 3.606,
      "step": 5500
    },
    {
      "epoch": 7.49665327978581,
      "grad_norm": 0.22708941996097565,
      "learning_rate": 0.00025033467202141904,
      "loss": 0.6978,
      "step": 5600
    },
    {
      "epoch": 7.49665327978581,
      "eval_loss": 0.7693562507629395,
      "eval_runtime": 6.3747,
      "eval_samples_per_second": 892.903,
      "eval_steps_per_second": 3.608,
      "step": 5600
    },
    {
      "epoch": 7.6305220883534135,
      "grad_norm": 0.2055513709783554,
      "learning_rate": 0.00023694779116465866,
      "loss": 0.69,
      "step": 5700
    },
    {
      "epoch": 7.6305220883534135,
      "eval_loss": 0.7648805975914001,
      "eval_runtime": 6.3649,
      "eval_samples_per_second": 894.277,
      "eval_steps_per_second": 3.614,
      "step": 5700
    },
    {
      "epoch": 7.764390896921017,
      "grad_norm": 0.19769689440727234,
      "learning_rate": 0.00022356091030789827,
      "loss": 0.6921,
      "step": 5800
    },
    {
      "epoch": 7.764390896921017,
      "eval_loss": 0.7649876475334167,
      "eval_runtime": 6.3876,
      "eval_samples_per_second": 891.102,
      "eval_steps_per_second": 3.601,
      "step": 5800
    },
    {
      "epoch": 7.898259705488621,
      "grad_norm": 0.20442116260528564,
      "learning_rate": 0.00021017402945113788,
      "loss": 0.6876,
      "step": 5900
    },
    {
      "epoch": 7.898259705488621,
      "eval_loss": 0.7717822790145874,
      "eval_runtime": 6.3521,
      "eval_samples_per_second": 896.089,
      "eval_steps_per_second": 3.621,
      "step": 5900
    },
    {
      "epoch": 8.032128514056225,
      "grad_norm": 0.21449404954910278,
      "learning_rate": 0.00019678714859437752,
      "loss": 0.6838,
      "step": 6000
    },
    {
      "epoch": 8.032128514056225,
      "eval_loss": 0.7580859661102295,
      "eval_runtime": 6.4002,
      "eval_samples_per_second": 889.346,
      "eval_steps_per_second": 3.594,
      "step": 6000
    },
    {
      "epoch": 8.165997322623829,
      "grad_norm": 0.2097356915473938,
      "learning_rate": 0.00018340026773761714,
      "loss": 0.6789,
      "step": 6100
    },
    {
      "epoch": 8.165997322623829,
      "eval_loss": 0.7637941241264343,
      "eval_runtime": 6.4327,
      "eval_samples_per_second": 884.859,
      "eval_steps_per_second": 3.576,
      "step": 6100
    },
    {
      "epoch": 8.299866131191433,
      "grad_norm": 0.19806508719921112,
      "learning_rate": 0.00017001338688085678,
      "loss": 0.6774,
      "step": 6200
    },
    {
      "epoch": 8.299866131191433,
      "eval_loss": 0.757574200630188,
      "eval_runtime": 6.4053,
      "eval_samples_per_second": 888.642,
      "eval_steps_per_second": 3.591,
      "step": 6200
    },
    {
      "epoch": 8.433734939759036,
      "grad_norm": 0.1967461109161377,
      "learning_rate": 0.0001566265060240964,
      "loss": 0.672,
      "step": 6300
    },
    {
      "epoch": 8.433734939759036,
      "eval_loss": 0.7565015554428101,
      "eval_runtime": 6.3503,
      "eval_samples_per_second": 896.341,
      "eval_steps_per_second": 3.622,
      "step": 6300
    },
    {
      "epoch": 8.56760374832664,
      "grad_norm": 0.21538911759853363,
      "learning_rate": 0.000143239625167336,
      "loss": 0.6759,
      "step": 6400
    },
    {
      "epoch": 8.56760374832664,
      "eval_loss": 0.7605956792831421,
      "eval_runtime": 6.4074,
      "eval_samples_per_second": 888.342,
      "eval_steps_per_second": 3.59,
      "step": 6400
    },
    {
      "epoch": 8.701472556894243,
      "grad_norm": 0.20278280973434448,
      "learning_rate": 0.00012985274431057565,
      "loss": 0.6707,
      "step": 6500
    },
    {
      "epoch": 8.701472556894243,
      "eval_loss": 0.7487396597862244,
      "eval_runtime": 6.3846,
      "eval_samples_per_second": 891.523,
      "eval_steps_per_second": 3.602,
      "step": 6500
    },
    {
      "epoch": 8.835341365461847,
      "grad_norm": 0.20785841345787048,
      "learning_rate": 0.00011646586345381527,
      "loss": 0.6697,
      "step": 6600
    },
    {
      "epoch": 8.835341365461847,
      "eval_loss": 0.755291223526001,
      "eval_runtime": 6.3631,
      "eval_samples_per_second": 894.526,
      "eval_steps_per_second": 3.615,
      "step": 6600
    },
    {
      "epoch": 8.96921017402945,
      "grad_norm": 0.20792551338672638,
      "learning_rate": 0.00010307898259705489,
      "loss": 0.6629,
      "step": 6700
    },
    {
      "epoch": 8.96921017402945,
      "eval_loss": 0.7500482201576233,
      "eval_runtime": 6.3736,
      "eval_samples_per_second": 893.061,
      "eval_steps_per_second": 3.609,
      "step": 6700
    },
    {
      "epoch": 9.103078982597054,
      "grad_norm": 0.2049485594034195,
      "learning_rate": 8.969210174029451e-05,
      "loss": 0.6629,
      "step": 6800
    },
    {
      "epoch": 9.103078982597054,
      "eval_loss": 0.7550941705703735,
      "eval_runtime": 6.3709,
      "eval_samples_per_second": 893.433,
      "eval_steps_per_second": 3.61,
      "step": 6800
    },
    {
      "epoch": 9.236947791164658,
      "grad_norm": 0.21241113543510437,
      "learning_rate": 7.630522088353414e-05,
      "loss": 0.6629,
      "step": 6900
    },
    {
      "epoch": 9.236947791164658,
      "eval_loss": 0.7512398958206177,
      "eval_runtime": 6.3681,
      "eval_samples_per_second": 893.836,
      "eval_steps_per_second": 3.612,
      "step": 6900
    },
    {
      "epoch": 9.370816599732262,
      "grad_norm": 0.2047683447599411,
      "learning_rate": 6.291834002677377e-05,
      "loss": 0.6625,
      "step": 7000
    },
    {
      "epoch": 9.370816599732262,
      "eval_loss": 0.7502346038818359,
      "eval_runtime": 6.3912,
      "eval_samples_per_second": 890.597,
      "eval_steps_per_second": 3.599,
      "step": 7000
    }
  ],
  "logging_steps": 100,
  "max_steps": 7470,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 100,
  "stateful_callbacks": {
    "EarlyStoppingCallback": {
      "args": {
        "early_stopping_patience": 5,
        "early_stopping_threshold": 0.0
      },
      "attributes": {
        "early_stopping_patience_counter": 5
      }
    },
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 2.5418589536256e+17,
  "train_batch_size": 64,
  "trial_name": null,
  "trial_params": null
}