| { | |
| "best_metric": 0.035888671875, | |
| "best_model_checkpoint": "./results_morgangen/checkpoint-100000", | |
| "epoch": 0.001, | |
| "eval_steps": 20000, | |
| "global_step": 100000, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 1e-06, | |
| "grad_norm": 2.3610475063323975, | |
| "learning_rate": 4.9328196339992464e-06, | |
| "loss": 3.6864, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 2e-06, | |
| "grad_norm": 1.656525731086731, | |
| "learning_rate": 5.719504324825564e-06, | |
| "loss": 1.718, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 3e-06, | |
| "grad_norm": 1.8576802015304565, | |
| "learning_rate": 6.1708683260303926e-06, | |
| "loss": 1.4701, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 4e-06, | |
| "grad_norm": 1.383483648300171, | |
| "learning_rate": 6.488740554563935e-06, | |
| "loss": 1.3455, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 5e-06, | |
| "grad_norm": 1.4719737768173218, | |
| "learning_rate": 6.734317372309117e-06, | |
| "loss": 1.261, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 6e-06, | |
| "grad_norm": 1.5180269479751587, | |
| "learning_rate": 6.934466112452983e-06, | |
| "loss": 1.1993, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 7e-06, | |
| "grad_norm": 1.4714821577072144, | |
| "learning_rate": 7.103398676137137e-06, | |
| "loss": 1.1509, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 8e-06, | |
| "grad_norm": 1.4220333099365234, | |
| "learning_rate": 7.249551256067741e-06, | |
| "loss": 1.104, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 9e-06, | |
| "grad_norm": 2.056689977645874, | |
| "learning_rate": 7.378343796989793e-06, | |
| "loss": 1.0759, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1e-05, | |
| "grad_norm": 1.5754709243774414, | |
| "learning_rate": 7.493465960993282e-06, | |
| "loss": 1.0397, | |
| "step": 1000 | |
| }, | |
| { | |
| "epoch": 1.1e-05, | |
| "grad_norm": 1.4028830528259277, | |
| "learning_rate": 7.596550404874257e-06, | |
| "loss": 1.0055, | |
| "step": 1100 | |
| }, | |
| { | |
| "epoch": 1.2e-05, | |
| "grad_norm": 1.4989030361175537, | |
| "learning_rate": 7.691601109175854e-06, | |
| "loss": 0.9769, | |
| "step": 1200 | |
| }, | |
| { | |
| "epoch": 1.3e-05, | |
| "grad_norm": 1.9516693353652954, | |
| "learning_rate": 7.778996312200985e-06, | |
| "loss": 0.9499, | |
| "step": 1300 | |
| }, | |
| { | |
| "epoch": 1.4e-05, | |
| "grad_norm": 1.7118744850158691, | |
| "learning_rate": 7.859877791059908e-06, | |
| "loss": 0.9245, | |
| "step": 1400 | |
| }, | |
| { | |
| "epoch": 1.5e-05, | |
| "grad_norm": 1.9349831342697144, | |
| "learning_rate": 7.935149519312563e-06, | |
| "loss": 0.9002, | |
| "step": 1500 | |
| }, | |
| { | |
| "epoch": 1.6e-05, | |
| "grad_norm": 1.3496636152267456, | |
| "learning_rate": 8.005539439502828e-06, | |
| "loss": 0.881, | |
| "step": 1600 | |
| }, | |
| { | |
| "epoch": 1.7e-05, | |
| "grad_norm": 1.6026203632354736, | |
| "learning_rate": 8.071642395272339e-06, | |
| "loss": 0.8592, | |
| "step": 1700 | |
| }, | |
| { | |
| "epoch": 1.8e-05, | |
| "grad_norm": 1.9675606489181519, | |
| "learning_rate": 8.133950723905457e-06, | |
| "loss": 0.841, | |
| "step": 1800 | |
| }, | |
| { | |
| "epoch": 1.9e-05, | |
| "grad_norm": 2.057551145553589, | |
| "learning_rate": 8.19287653490949e-06, | |
| "loss": 0.8228, | |
| "step": 1900 | |
| }, | |
| { | |
| "epoch": 2e-05, | |
| "grad_norm": 1.8387638330459595, | |
| "learning_rate": 8.248223335219199e-06, | |
| "loss": 0.8037, | |
| "step": 2000 | |
| }, | |
| { | |
| "epoch": 2.1e-05, | |
| "grad_norm": 1.7323527336120605, | |
| "learning_rate": 8.30140420048809e-06, | |
| "loss": 0.7874, | |
| "step": 2100 | |
| }, | |
| { | |
| "epoch": 2.2e-05, | |
| "grad_norm": 2.6023099422454834, | |
| "learning_rate": 8.352101374530827e-06, | |
| "loss": 0.7677, | |
| "step": 2200 | |
| }, | |
| { | |
| "epoch": 2.3e-05, | |
| "grad_norm": 1.9935753345489502, | |
| "learning_rate": 8.400536533238381e-06, | |
| "loss": 0.7554, | |
| "step": 2300 | |
| }, | |
| { | |
| "epoch": 2.4e-05, | |
| "grad_norm": 1.632101058959961, | |
| "learning_rate": 8.446902938290931e-06, | |
| "loss": 0.7376, | |
| "step": 2400 | |
| }, | |
| { | |
| "epoch": 2.5e-05, | |
| "grad_norm": 2.2343554496765137, | |
| "learning_rate": 8.491370094967829e-06, | |
| "loss": 0.7202, | |
| "step": 2500 | |
| }, | |
| { | |
| "epoch": 2.6e-05, | |
| "grad_norm": 2.182626962661743, | |
| "learning_rate": 8.534087492996389e-06, | |
| "loss": 0.706, | |
| "step": 2600 | |
| }, | |
| { | |
| "epoch": 2.7e-05, | |
| "grad_norm": 1.9008549451828003, | |
| "learning_rate": 8.575187638879847e-06, | |
| "loss": 0.694, | |
| "step": 2700 | |
| }, | |
| { | |
| "epoch": 2.8e-05, | |
| "grad_norm": 1.8491170406341553, | |
| "learning_rate": 8.614788534877808e-06, | |
| "loss": 0.6765, | |
| "step": 2800 | |
| }, | |
| { | |
| "epoch": 2.9e-05, | |
| "grad_norm": 2.2471179962158203, | |
| "learning_rate": 8.652995721556234e-06, | |
| "loss": 0.6633, | |
| "step": 2900 | |
| }, | |
| { | |
| "epoch": 3e-05, | |
| "grad_norm": 1.9031002521514893, | |
| "learning_rate": 8.689903972981059e-06, | |
| "loss": 0.6455, | |
| "step": 3000 | |
| }, | |
| { | |
| "epoch": 3.1e-05, | |
| "grad_norm": 1.8448904752731323, | |
| "learning_rate": 8.725598713115716e-06, | |
| "loss": 0.6356, | |
| "step": 3100 | |
| }, | |
| { | |
| "epoch": 3.2e-05, | |
| "grad_norm": 1.7731763124465942, | |
| "learning_rate": 8.760157206696729e-06, | |
| "loss": 0.6215, | |
| "step": 3200 | |
| }, | |
| { | |
| "epoch": 3.3e-05, | |
| "grad_norm": 1.9336998462677002, | |
| "learning_rate": 8.79364956635058e-06, | |
| "loss": 0.6082, | |
| "step": 3300 | |
| }, | |
| { | |
| "epoch": 3.4e-05, | |
| "grad_norm": 1.5020197629928589, | |
| "learning_rate": 8.82613960896169e-06, | |
| "loss": 0.5918, | |
| "step": 3400 | |
| }, | |
| { | |
| "epoch": 3.5e-05, | |
| "grad_norm": 1.8765689134597778, | |
| "learning_rate": 8.85768558758383e-06, | |
| "loss": 0.5799, | |
| "step": 3500 | |
| }, | |
| { | |
| "epoch": 3.6e-05, | |
| "grad_norm": 1.8304609060287476, | |
| "learning_rate": 8.888340819988166e-06, | |
| "loss": 0.5658, | |
| "step": 3600 | |
| }, | |
| { | |
| "epoch": 3.7e-05, | |
| "grad_norm": 1.663233757019043, | |
| "learning_rate": 8.918154230884686e-06, | |
| "loss": 0.5574, | |
| "step": 3700 | |
| }, | |
| { | |
| "epoch": 3.8e-05, | |
| "grad_norm": 1.8172345161437988, | |
| "learning_rate": 8.947170821665072e-06, | |
| "loss": 0.5465, | |
| "step": 3800 | |
| }, | |
| { | |
| "epoch": 3.9e-05, | |
| "grad_norm": 1.7093075513839722, | |
| "learning_rate": 8.975432078990786e-06, | |
| "loss": 0.5315, | |
| "step": 3900 | |
| }, | |
| { | |
| "epoch": 4e-05, | |
| "grad_norm": 1.7914036512374878, | |
| "learning_rate": 9.002976331538332e-06, | |
| "loss": 0.5227, | |
| "step": 4000 | |
| }, | |
| { | |
| "epoch": 4.1e-05, | |
| "grad_norm": 1.441435694694519, | |
| "learning_rate": 9.029839062600307e-06, | |
| "loss": 0.5131, | |
| "step": 4100 | |
| }, | |
| { | |
| "epoch": 4.2e-05, | |
| "grad_norm": 1.3954449892044067, | |
| "learning_rate": 9.056053184939176e-06, | |
| "loss": 0.4998, | |
| "step": 4200 | |
| }, | |
| { | |
| "epoch": 4.3e-05, | |
| "grad_norm": 1.7888164520263672, | |
| "learning_rate": 9.081649283234784e-06, | |
| "loss": 0.4961, | |
| "step": 4300 | |
| }, | |
| { | |
| "epoch": 4.4e-05, | |
| "grad_norm": 1.7587261199951172, | |
| "learning_rate": 9.106655828605087e-06, | |
| "loss": 0.4875, | |
| "step": 4400 | |
| }, | |
| { | |
| "epoch": 4.5e-05, | |
| "grad_norm": 1.9113582372665405, | |
| "learning_rate": 9.13109936897355e-06, | |
| "loss": 0.4794, | |
| "step": 4500 | |
| }, | |
| { | |
| "epoch": 4.6e-05, | |
| "grad_norm": 1.6648356914520264, | |
| "learning_rate": 9.155004698474792e-06, | |
| "loss": 0.4697, | |
| "step": 4600 | |
| }, | |
| { | |
| "epoch": 4.7e-05, | |
| "grad_norm": 1.5259454250335693, | |
| "learning_rate": 9.17839500860873e-06, | |
| "loss": 0.4622, | |
| "step": 4700 | |
| }, | |
| { | |
| "epoch": 4.8e-05, | |
| "grad_norm": 1.8361080884933472, | |
| "learning_rate": 9.201292023453135e-06, | |
| "loss": 0.453, | |
| "step": 4800 | |
| }, | |
| { | |
| "epoch": 4.9e-05, | |
| "grad_norm": 1.6309137344360352, | |
| "learning_rate": 9.22371612091062e-06, | |
| "loss": 0.4429, | |
| "step": 4900 | |
| }, | |
| { | |
| "epoch": 5e-05, | |
| "grad_norm": 1.7207796573638916, | |
| "learning_rate": 9.245686441685918e-06, | |
| "loss": 0.4382, | |
| "step": 5000 | |
| }, | |
| { | |
| "epoch": 5.1e-05, | |
| "grad_norm": 1.552103042602539, | |
| "learning_rate": 9.267220987454044e-06, | |
| "loss": 0.4315, | |
| "step": 5100 | |
| }, | |
| { | |
| "epoch": 5.2e-05, | |
| "grad_norm": 1.6008425951004028, | |
| "learning_rate": 9.28833670948078e-06, | |
| "loss": 0.4244, | |
| "step": 5200 | |
| }, | |
| { | |
| "epoch": 5.3e-05, | |
| "grad_norm": 1.8220570087432861, | |
| "learning_rate": 9.309049588788657e-06, | |
| "loss": 0.4162, | |
| "step": 5300 | |
| }, | |
| { | |
| "epoch": 5.4e-05, | |
| "grad_norm": 1.5230612754821777, | |
| "learning_rate": 9.329374708818158e-06, | |
| "loss": 0.4112, | |
| "step": 5400 | |
| }, | |
| { | |
| "epoch": 5.5e-05, | |
| "grad_norm": 1.7809470891952515, | |
| "learning_rate": 9.349326321411793e-06, | |
| "loss": 0.4052, | |
| "step": 5500 | |
| }, | |
| { | |
| "epoch": 5.6e-05, | |
| "grad_norm": 1.5959115028381348, | |
| "learning_rate": 9.368917906844062e-06, | |
| "loss": 0.4009, | |
| "step": 5600 | |
| }, | |
| { | |
| "epoch": 5.7e-05, | |
| "grad_norm": 1.563692331314087, | |
| "learning_rate": 9.388162228530614e-06, | |
| "loss": 0.394, | |
| "step": 5700 | |
| }, | |
| { | |
| "epoch": 5.8e-05, | |
| "grad_norm": 1.4869149923324585, | |
| "learning_rate": 9.407071382972726e-06, | |
| "loss": 0.3879, | |
| "step": 5800 | |
| }, | |
| { | |
| "epoch": 5.9e-05, | |
| "grad_norm": 1.5701963901519775, | |
| "learning_rate": 9.425656845426483e-06, | |
| "loss": 0.3784, | |
| "step": 5900 | |
| }, | |
| { | |
| "epoch": 6e-05, | |
| "grad_norm": 1.496894359588623, | |
| "learning_rate": 9.443929511728523e-06, | |
| "loss": 0.3746, | |
| "step": 6000 | |
| }, | |
| { | |
| "epoch": 6.1e-05, | |
| "grad_norm": 1.4307634830474854, | |
| "learning_rate": 9.461899736660011e-06, | |
| "loss": 0.372, | |
| "step": 6100 | |
| }, | |
| { | |
| "epoch": 6.2e-05, | |
| "grad_norm": 1.4771479368209839, | |
| "learning_rate": 9.479577369187091e-06, | |
| "loss": 0.3661, | |
| "step": 6200 | |
| }, | |
| { | |
| "epoch": 6.3e-05, | |
| "grad_norm": 1.2904491424560547, | |
| "learning_rate": 9.496971784878123e-06, | |
| "loss": 0.3625, | |
| "step": 6300 | |
| }, | |
| { | |
| "epoch": 6.4e-05, | |
| "grad_norm": 1.5488417148590088, | |
| "learning_rate": 9.514091915764837e-06, | |
| "loss": 0.3547, | |
| "step": 6400 | |
| }, | |
| { | |
| "epoch": 6.5e-05, | |
| "grad_norm": 1.4266217947006226, | |
| "learning_rate": 9.530946277885485e-06, | |
| "loss": 0.3491, | |
| "step": 6500 | |
| }, | |
| { | |
| "epoch": 6.6e-05, | |
| "grad_norm": 1.5423930883407593, | |
| "learning_rate": 9.547542996722649e-06, | |
| "loss": 0.3442, | |
| "step": 6600 | |
| }, | |
| { | |
| "epoch": 6.7e-05, | |
| "grad_norm": 1.3324171304702759, | |
| "learning_rate": 9.563889830725893e-06, | |
| "loss": 0.3427, | |
| "step": 6700 | |
| }, | |
| { | |
| "epoch": 6.8e-05, | |
| "grad_norm": 1.3407986164093018, | |
| "learning_rate": 9.57999419308974e-06, | |
| "loss": 0.3376, | |
| "step": 6800 | |
| }, | |
| { | |
| "epoch": 6.9e-05, | |
| "grad_norm": 1.303074598312378, | |
| "learning_rate": 9.595863171939976e-06, | |
| "loss": 0.3346, | |
| "step": 6900 | |
| }, | |
| { | |
| "epoch": 7e-05, | |
| "grad_norm": 1.3955286741256714, | |
| "learning_rate": 9.611192939364202e-06, | |
| "loss": 0.3283, | |
| "step": 7000 | |
| }, | |
| { | |
| "epoch": 7.1e-05, | |
| "grad_norm": 1.3460999727249146, | |
| "learning_rate": 9.626462440880078e-06, | |
| "loss": 0.3269, | |
| "step": 7100 | |
| }, | |
| { | |
| "epoch": 7.2e-05, | |
| "grad_norm": 1.5643832683563232, | |
| "learning_rate": 9.641671209028838e-06, | |
| "loss": 0.3235, | |
| "step": 7200 | |
| }, | |
| { | |
| "epoch": 7.3e-05, | |
| "grad_norm": 1.503298044204712, | |
| "learning_rate": 9.65666987557147e-06, | |
| "loss": 0.3184, | |
| "step": 7300 | |
| }, | |
| { | |
| "epoch": 7.4e-05, | |
| "grad_norm": 1.4040926694869995, | |
| "learning_rate": 9.671464166396914e-06, | |
| "loss": 0.3173, | |
| "step": 7400 | |
| }, | |
| { | |
| "epoch": 7.5e-05, | |
| "grad_norm": 1.5793565511703491, | |
| "learning_rate": 9.686059576466255e-06, | |
| "loss": 0.3118, | |
| "step": 7500 | |
| }, | |
| { | |
| "epoch": 7.6e-05, | |
| "grad_norm": 1.2530806064605713, | |
| "learning_rate": 9.700461382066083e-06, | |
| "loss": 0.3073, | |
| "step": 7600 | |
| }, | |
| { | |
| "epoch": 7.7e-05, | |
| "grad_norm": 1.6009125709533691, | |
| "learning_rate": 9.714674652259765e-06, | |
| "loss": 0.3058, | |
| "step": 7700 | |
| }, | |
| { | |
| "epoch": 7.8e-05, | |
| "grad_norm": 1.52604341506958, | |
| "learning_rate": 9.7287042595988e-06, | |
| "loss": 0.299, | |
| "step": 7800 | |
| }, | |
| { | |
| "epoch": 7.9e-05, | |
| "grad_norm": 1.512654185295105, | |
| "learning_rate": 9.742554890150908e-06, | |
| "loss": 0.2997, | |
| "step": 7900 | |
| }, | |
| { | |
| "epoch": 8e-05, | |
| "grad_norm": 1.3372293710708618, | |
| "learning_rate": 9.75623105289651e-06, | |
| "loss": 0.2959, | |
| "step": 8000 | |
| }, | |
| { | |
| "epoch": 8.1e-05, | |
| "grad_norm": 1.3194124698638916, | |
| "learning_rate": 9.769737088540707e-06, | |
| "loss": 0.2915, | |
| "step": 8100 | |
| }, | |
| { | |
| "epoch": 8.2e-05, | |
| "grad_norm": 1.3267931938171387, | |
| "learning_rate": 9.783077177783901e-06, | |
| "loss": 0.2883, | |
| "step": 8200 | |
| }, | |
| { | |
| "epoch": 8.3e-05, | |
| "grad_norm": 1.4453672170639038, | |
| "learning_rate": 9.796255349090433e-06, | |
| "loss": 0.2857, | |
| "step": 8300 | |
| }, | |
| { | |
| "epoch": 8.4e-05, | |
| "grad_norm": 1.2656625509262085, | |
| "learning_rate": 9.809275485991406e-06, | |
| "loss": 0.2824, | |
| "step": 8400 | |
| }, | |
| { | |
| "epoch": 8.5e-05, | |
| "grad_norm": 1.347659707069397, | |
| "learning_rate": 9.822141333954775e-06, | |
| "loss": 0.2805, | |
| "step": 8500 | |
| }, | |
| { | |
| "epoch": 8.6e-05, | |
| "grad_norm": 1.3958872556686401, | |
| "learning_rate": 9.834856506853153e-06, | |
| "loss": 0.2777, | |
| "step": 8600 | |
| }, | |
| { | |
| "epoch": 8.7e-05, | |
| "grad_norm": 1.4277667999267578, | |
| "learning_rate": 9.847424493057225e-06, | |
| "loss": 0.2734, | |
| "step": 8700 | |
| }, | |
| { | |
| "epoch": 8.8e-05, | |
| "grad_norm": 1.233550786972046, | |
| "learning_rate": 9.85984866118054e-06, | |
| "loss": 0.2727, | |
| "step": 8800 | |
| }, | |
| { | |
| "epoch": 8.9e-05, | |
| "grad_norm": 1.499273657798767, | |
| "learning_rate": 9.872132265499283e-06, | |
| "loss": 0.2712, | |
| "step": 8900 | |
| }, | |
| { | |
| "epoch": 9e-05, | |
| "grad_norm": 1.4485379457473755, | |
| "learning_rate": 9.884278451068888e-06, | |
| "loss": 0.2669, | |
| "step": 9000 | |
| }, | |
| { | |
| "epoch": 9.1e-05, | |
| "grad_norm": 1.2728357315063477, | |
| "learning_rate": 9.896051320131294e-06, | |
| "loss": 0.2657, | |
| "step": 9100 | |
| }, | |
| { | |
| "epoch": 9.2e-05, | |
| "grad_norm": 1.2725111246109009, | |
| "learning_rate": 9.90793429093813e-06, | |
| "loss": 0.2635, | |
| "step": 9200 | |
| }, | |
| { | |
| "epoch": 9.3e-05, | |
| "grad_norm": 1.3907318115234375, | |
| "learning_rate": 9.919688613870083e-06, | |
| "loss": 0.2581, | |
| "step": 9300 | |
| }, | |
| { | |
| "epoch": 9.4e-05, | |
| "grad_norm": 1.3836479187011719, | |
| "learning_rate": 9.93131704466464e-06, | |
| "loss": 0.2588, | |
| "step": 9400 | |
| }, | |
| { | |
| "epoch": 9.5e-05, | |
| "grad_norm": 1.2773383855819702, | |
| "learning_rate": 9.942822251451706e-06, | |
| "loss": 0.2536, | |
| "step": 9500 | |
| }, | |
| { | |
| "epoch": 9.6e-05, | |
| "grad_norm": 1.2910404205322266, | |
| "learning_rate": 9.954206818428214e-06, | |
| "loss": 0.2513, | |
| "step": 9600 | |
| }, | |
| { | |
| "epoch": 9.7e-05, | |
| "grad_norm": 1.234729528427124, | |
| "learning_rate": 9.96547324934206e-06, | |
| "loss": 0.2476, | |
| "step": 9700 | |
| }, | |
| { | |
| "epoch": 9.8e-05, | |
| "grad_norm": 1.1756150722503662, | |
| "learning_rate": 9.976623970797134e-06, | |
| "loss": 0.2471, | |
| "step": 9800 | |
| }, | |
| { | |
| "epoch": 9.9e-05, | |
| "grad_norm": 1.261687159538269, | |
| "learning_rate": 9.987661335390354e-06, | |
| "loss": 0.2489, | |
| "step": 9900 | |
| }, | |
| { | |
| "epoch": 0.0001, | |
| "grad_norm": 1.3017668724060059, | |
| "learning_rate": 9.998587624690824e-06, | |
| "loss": 0.2435, | |
| "step": 10000 | |
| }, | |
| { | |
| "epoch": 0.000101, | |
| "grad_norm": 1.232535719871521, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2425, | |
| "step": 10100 | |
| }, | |
| { | |
| "epoch": 0.000102, | |
| "grad_norm": 1.306433081626892, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2406, | |
| "step": 10200 | |
| }, | |
| { | |
| "epoch": 0.000103, | |
| "grad_norm": 1.3659272193908691, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2389, | |
| "step": 10300 | |
| }, | |
| { | |
| "epoch": 0.000104, | |
| "grad_norm": 1.1521058082580566, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2338, | |
| "step": 10400 | |
| }, | |
| { | |
| "epoch": 0.000105, | |
| "grad_norm": 1.1397546529769897, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2342, | |
| "step": 10500 | |
| }, | |
| { | |
| "epoch": 0.000106, | |
| "grad_norm": 1.3130905628204346, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2313, | |
| "step": 10600 | |
| }, | |
| { | |
| "epoch": 0.000107, | |
| "grad_norm": 1.1320550441741943, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2288, | |
| "step": 10700 | |
| }, | |
| { | |
| "epoch": 0.000108, | |
| "grad_norm": 1.2157635688781738, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2296, | |
| "step": 10800 | |
| }, | |
| { | |
| "epoch": 0.000109, | |
| "grad_norm": 1.2038499116897583, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2249, | |
| "step": 10900 | |
| }, | |
| { | |
| "epoch": 0.00011, | |
| "grad_norm": 1.3213508129119873, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2243, | |
| "step": 11000 | |
| }, | |
| { | |
| "epoch": 0.000111, | |
| "grad_norm": 1.1428966522216797, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2213, | |
| "step": 11100 | |
| }, | |
| { | |
| "epoch": 0.000112, | |
| "grad_norm": 1.2259374856948853, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2202, | |
| "step": 11200 | |
| }, | |
| { | |
| "epoch": 0.000113, | |
| "grad_norm": 1.1567683219909668, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2175, | |
| "step": 11300 | |
| }, | |
| { | |
| "epoch": 0.000114, | |
| "grad_norm": 1.2655612230300903, | |
| "learning_rate": 1e-05, | |
| "loss": 0.216, | |
| "step": 11400 | |
| }, | |
| { | |
| "epoch": 0.000115, | |
| "grad_norm": 1.1602586507797241, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2146, | |
| "step": 11500 | |
| }, | |
| { | |
| "epoch": 0.000116, | |
| "grad_norm": 1.1369308233261108, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2126, | |
| "step": 11600 | |
| }, | |
| { | |
| "epoch": 0.000117, | |
| "grad_norm": 1.1988592147827148, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2121, | |
| "step": 11700 | |
| }, | |
| { | |
| "epoch": 0.000118, | |
| "grad_norm": 1.087939977645874, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2101, | |
| "step": 11800 | |
| }, | |
| { | |
| "epoch": 0.000119, | |
| "grad_norm": 1.2805454730987549, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2094, | |
| "step": 11900 | |
| }, | |
| { | |
| "epoch": 0.00012, | |
| "grad_norm": 1.4006527662277222, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2043, | |
| "step": 12000 | |
| }, | |
| { | |
| "epoch": 0.000121, | |
| "grad_norm": 1.2651677131652832, | |
| "learning_rate": 1e-05, | |
| "loss": 0.205, | |
| "step": 12100 | |
| }, | |
| { | |
| "epoch": 0.000122, | |
| "grad_norm": 1.3023113012313843, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2066, | |
| "step": 12200 | |
| }, | |
| { | |
| "epoch": 0.000123, | |
| "grad_norm": 1.0964651107788086, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2019, | |
| "step": 12300 | |
| }, | |
| { | |
| "epoch": 0.000124, | |
| "grad_norm": 1.1747757196426392, | |
| "learning_rate": 1e-05, | |
| "loss": 0.201, | |
| "step": 12400 | |
| }, | |
| { | |
| "epoch": 0.000125, | |
| "grad_norm": 1.0360560417175293, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1995, | |
| "step": 12500 | |
| }, | |
| { | |
| "epoch": 0.000126, | |
| "grad_norm": 1.0915257930755615, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1979, | |
| "step": 12600 | |
| }, | |
| { | |
| "epoch": 0.000127, | |
| "grad_norm": 1.1433717012405396, | |
| "learning_rate": 1e-05, | |
| "loss": 0.2003, | |
| "step": 12700 | |
| }, | |
| { | |
| "epoch": 0.000128, | |
| "grad_norm": 1.1049145460128784, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1956, | |
| "step": 12800 | |
| }, | |
| { | |
| "epoch": 0.000129, | |
| "grad_norm": 1.11701238155365, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1951, | |
| "step": 12900 | |
| }, | |
| { | |
| "epoch": 0.00013, | |
| "grad_norm": 1.1755869388580322, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1936, | |
| "step": 13000 | |
| }, | |
| { | |
| "epoch": 0.000131, | |
| "grad_norm": 1.0519227981567383, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1914, | |
| "step": 13100 | |
| }, | |
| { | |
| "epoch": 0.000132, | |
| "grad_norm": 1.1982672214508057, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1895, | |
| "step": 13200 | |
| }, | |
| { | |
| "epoch": 0.000133, | |
| "grad_norm": 1.135452389717102, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1899, | |
| "step": 13300 | |
| }, | |
| { | |
| "epoch": 0.000134, | |
| "grad_norm": 1.0130894184112549, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1858, | |
| "step": 13400 | |
| }, | |
| { | |
| "epoch": 0.000135, | |
| "grad_norm": 1.1471365690231323, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1872, | |
| "step": 13500 | |
| }, | |
| { | |
| "epoch": 0.000136, | |
| "grad_norm": 1.1107739210128784, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1864, | |
| "step": 13600 | |
| }, | |
| { | |
| "epoch": 0.000137, | |
| "grad_norm": 1.1473486423492432, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1854, | |
| "step": 13700 | |
| }, | |
| { | |
| "epoch": 0.000138, | |
| "grad_norm": 1.0697531700134277, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1813, | |
| "step": 13800 | |
| }, | |
| { | |
| "epoch": 0.000139, | |
| "grad_norm": 0.9683561325073242, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1801, | |
| "step": 13900 | |
| }, | |
| { | |
| "epoch": 0.00014, | |
| "grad_norm": 1.1696103811264038, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1802, | |
| "step": 14000 | |
| }, | |
| { | |
| "epoch": 0.000141, | |
| "grad_norm": 1.2879928350448608, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1808, | |
| "step": 14100 | |
| }, | |
| { | |
| "epoch": 0.000142, | |
| "grad_norm": 1.0318293571472168, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1792, | |
| "step": 14200 | |
| }, | |
| { | |
| "epoch": 0.000143, | |
| "grad_norm": 1.0072672367095947, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1784, | |
| "step": 14300 | |
| }, | |
| { | |
| "epoch": 0.000144, | |
| "grad_norm": 1.0204075574874878, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1756, | |
| "step": 14400 | |
| }, | |
| { | |
| "epoch": 0.000145, | |
| "grad_norm": 1.1072639226913452, | |
| "learning_rate": 1e-05, | |
| "loss": 0.174, | |
| "step": 14500 | |
| }, | |
| { | |
| "epoch": 0.000146, | |
| "grad_norm": 1.1650497913360596, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1699, | |
| "step": 14600 | |
| }, | |
| { | |
| "epoch": 0.000147, | |
| "grad_norm": 1.1133906841278076, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1712, | |
| "step": 14700 | |
| }, | |
| { | |
| "epoch": 0.000148, | |
| "grad_norm": 1.2355847358703613, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1712, | |
| "step": 14800 | |
| }, | |
| { | |
| "epoch": 0.000149, | |
| "grad_norm": 1.0743693113327026, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1701, | |
| "step": 14900 | |
| }, | |
| { | |
| "epoch": 0.00015, | |
| "grad_norm": 1.1882842779159546, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1703, | |
| "step": 15000 | |
| }, | |
| { | |
| "epoch": 0.000151, | |
| "grad_norm": 1.0762616395950317, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1692, | |
| "step": 15100 | |
| }, | |
| { | |
| "epoch": 0.000152, | |
| "grad_norm": 1.0435552597045898, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1675, | |
| "step": 15200 | |
| }, | |
| { | |
| "epoch": 0.000153, | |
| "grad_norm": 1.0835367441177368, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1668, | |
| "step": 15300 | |
| }, | |
| { | |
| "epoch": 0.000154, | |
| "grad_norm": 1.0594781637191772, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1638, | |
| "step": 15400 | |
| }, | |
| { | |
| "epoch": 0.000155, | |
| "grad_norm": 1.0666881799697876, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1636, | |
| "step": 15500 | |
| }, | |
| { | |
| "epoch": 0.000156, | |
| "grad_norm": 0.9173826575279236, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1632, | |
| "step": 15600 | |
| }, | |
| { | |
| "epoch": 0.000157, | |
| "grad_norm": 1.1107499599456787, | |
| "learning_rate": 1e-05, | |
| "loss": 0.163, | |
| "step": 15700 | |
| }, | |
| { | |
| "epoch": 0.000158, | |
| "grad_norm": 1.0352386236190796, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1602, | |
| "step": 15800 | |
| }, | |
| { | |
| "epoch": 0.000159, | |
| "grad_norm": 0.9977409839630127, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1623, | |
| "step": 15900 | |
| }, | |
| { | |
| "epoch": 0.00016, | |
| "grad_norm": 1.0943259000778198, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1603, | |
| "step": 16000 | |
| }, | |
| { | |
| "epoch": 0.000161, | |
| "grad_norm": 1.0809710025787354, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1582, | |
| "step": 16100 | |
| }, | |
| { | |
| "epoch": 0.000162, | |
| "grad_norm": 1.1283208131790161, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1583, | |
| "step": 16200 | |
| }, | |
| { | |
| "epoch": 0.000163, | |
| "grad_norm": 1.0325435400009155, | |
| "learning_rate": 1e-05, | |
| "loss": 0.158, | |
| "step": 16300 | |
| }, | |
| { | |
| "epoch": 0.000164, | |
| "grad_norm": 1.0305627584457397, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1573, | |
| "step": 16400 | |
| }, | |
| { | |
| "epoch": 0.000165, | |
| "grad_norm": 1.0640127658843994, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1551, | |
| "step": 16500 | |
| }, | |
| { | |
| "epoch": 0.000166, | |
| "grad_norm": 0.9327529668807983, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1562, | |
| "step": 16600 | |
| }, | |
| { | |
| "epoch": 0.000167, | |
| "grad_norm": 1.0069410800933838, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1533, | |
| "step": 16700 | |
| }, | |
| { | |
| "epoch": 0.000168, | |
| "grad_norm": 1.040076494216919, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1527, | |
| "step": 16800 | |
| }, | |
| { | |
| "epoch": 0.000169, | |
| "grad_norm": 1.008461356163025, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1525, | |
| "step": 16900 | |
| }, | |
| { | |
| "epoch": 0.00017, | |
| "grad_norm": 1.0036898851394653, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1517, | |
| "step": 17000 | |
| }, | |
| { | |
| "epoch": 0.000171, | |
| "grad_norm": 0.9357483386993408, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1511, | |
| "step": 17100 | |
| }, | |
| { | |
| "epoch": 0.000172, | |
| "grad_norm": 1.0033488273620605, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1468, | |
| "step": 17200 | |
| }, | |
| { | |
| "epoch": 0.000173, | |
| "grad_norm": 1.0451477766036987, | |
| "learning_rate": 1e-05, | |
| "loss": 0.15, | |
| "step": 17300 | |
| }, | |
| { | |
| "epoch": 0.000174, | |
| "grad_norm": 0.971612274646759, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1476, | |
| "step": 17400 | |
| }, | |
| { | |
| "epoch": 0.000175, | |
| "grad_norm": 1.079099416732788, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1479, | |
| "step": 17500 | |
| }, | |
| { | |
| "epoch": 0.000176, | |
| "grad_norm": 1.0661680698394775, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1476, | |
| "step": 17600 | |
| }, | |
| { | |
| "epoch": 0.000177, | |
| "grad_norm": 1.0154145956039429, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1467, | |
| "step": 17700 | |
| }, | |
| { | |
| "epoch": 0.000178, | |
| "grad_norm": 1.0474337339401245, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1441, | |
| "step": 17800 | |
| }, | |
| { | |
| "epoch": 0.000179, | |
| "grad_norm": 1.0646860599517822, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1459, | |
| "step": 17900 | |
| }, | |
| { | |
| "epoch": 0.00018, | |
| "grad_norm": 1.0854105949401855, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1437, | |
| "step": 18000 | |
| }, | |
| { | |
| "epoch": 0.000181, | |
| "grad_norm": 0.9846110939979553, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1425, | |
| "step": 18100 | |
| }, | |
| { | |
| "epoch": 0.000182, | |
| "grad_norm": 1.0286470651626587, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1432, | |
| "step": 18200 | |
| }, | |
| { | |
| "epoch": 0.000183, | |
| "grad_norm": 1.0388602018356323, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1403, | |
| "step": 18300 | |
| }, | |
| { | |
| "epoch": 0.000184, | |
| "grad_norm": 0.9657048583030701, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1417, | |
| "step": 18400 | |
| }, | |
| { | |
| "epoch": 0.000185, | |
| "grad_norm": 0.8501772880554199, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1424, | |
| "step": 18500 | |
| }, | |
| { | |
| "epoch": 0.000186, | |
| "grad_norm": 0.9153370261192322, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1376, | |
| "step": 18600 | |
| }, | |
| { | |
| "epoch": 0.000187, | |
| "grad_norm": 0.9047082662582397, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1413, | |
| "step": 18700 | |
| }, | |
| { | |
| "epoch": 0.000188, | |
| "grad_norm": 0.9566175937652588, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1387, | |
| "step": 18800 | |
| }, | |
| { | |
| "epoch": 0.000189, | |
| "grad_norm": 1.069942831993103, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1355, | |
| "step": 18900 | |
| }, | |
| { | |
| "epoch": 0.00019, | |
| "grad_norm": 1.019620656967163, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1357, | |
| "step": 19000 | |
| }, | |
| { | |
| "epoch": 0.000191, | |
| "grad_norm": 0.9842545390129089, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1366, | |
| "step": 19100 | |
| }, | |
| { | |
| "epoch": 0.000192, | |
| "grad_norm": 0.972135603427887, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1357, | |
| "step": 19200 | |
| }, | |
| { | |
| "epoch": 0.000193, | |
| "grad_norm": 0.9025226831436157, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1347, | |
| "step": 19300 | |
| }, | |
| { | |
| "epoch": 0.000194, | |
| "grad_norm": 0.9164988398551941, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1338, | |
| "step": 19400 | |
| }, | |
| { | |
| "epoch": 0.000195, | |
| "grad_norm": 0.8067638874053955, | |
| "learning_rate": 1e-05, | |
| "loss": 0.133, | |
| "step": 19500 | |
| }, | |
| { | |
| "epoch": 0.000196, | |
| "grad_norm": 0.8477145433425903, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1334, | |
| "step": 19600 | |
| }, | |
| { | |
| "epoch": 0.000197, | |
| "grad_norm": 0.860883891582489, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1327, | |
| "step": 19700 | |
| }, | |
| { | |
| "epoch": 0.000198, | |
| "grad_norm": 0.9660979509353638, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1332, | |
| "step": 19800 | |
| }, | |
| { | |
| "epoch": 0.000199, | |
| "grad_norm": 0.8979732394218445, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1317, | |
| "step": 19900 | |
| }, | |
| { | |
| "epoch": 0.0002, | |
| "grad_norm": 0.8831902146339417, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1313, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.0002, | |
| "eval_loss": 0.0992431640625, | |
| "eval_runtime": 152.8076, | |
| "eval_samples_per_second": 327.209, | |
| "eval_steps_per_second": 20.451, | |
| "step": 20000 | |
| }, | |
| { | |
| "epoch": 0.000201, | |
| "grad_norm": 0.9081249833106995, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1296, | |
| "step": 20100 | |
| }, | |
| { | |
| "epoch": 0.000202, | |
| "grad_norm": 1.0295116901397705, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1296, | |
| "step": 20200 | |
| }, | |
| { | |
| "epoch": 0.000203, | |
| "grad_norm": 0.8534417152404785, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1271, | |
| "step": 20300 | |
| }, | |
| { | |
| "epoch": 0.000204, | |
| "grad_norm": 0.8878474235534668, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1276, | |
| "step": 20400 | |
| }, | |
| { | |
| "epoch": 0.000205, | |
| "grad_norm": 0.9492274522781372, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1296, | |
| "step": 20500 | |
| }, | |
| { | |
| "epoch": 0.000206, | |
| "grad_norm": 0.9542170166969299, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1284, | |
| "step": 20600 | |
| }, | |
| { | |
| "epoch": 0.000207, | |
| "grad_norm": 0.8887580633163452, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1257, | |
| "step": 20700 | |
| }, | |
| { | |
| "epoch": 0.000208, | |
| "grad_norm": 0.8237319588661194, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1253, | |
| "step": 20800 | |
| }, | |
| { | |
| "epoch": 0.000209, | |
| "grad_norm": 0.8409337401390076, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1241, | |
| "step": 20900 | |
| }, | |
| { | |
| "epoch": 0.00021, | |
| "grad_norm": 0.8566481471061707, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1252, | |
| "step": 21000 | |
| }, | |
| { | |
| "epoch": 0.000211, | |
| "grad_norm": 0.8407108783721924, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1242, | |
| "step": 21100 | |
| }, | |
| { | |
| "epoch": 0.000212, | |
| "grad_norm": 0.853947639465332, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1246, | |
| "step": 21200 | |
| }, | |
| { | |
| "epoch": 0.000213, | |
| "grad_norm": 0.8899252414703369, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1237, | |
| "step": 21300 | |
| }, | |
| { | |
| "epoch": 0.000214, | |
| "grad_norm": 0.8689791560173035, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1225, | |
| "step": 21400 | |
| }, | |
| { | |
| "epoch": 0.000215, | |
| "grad_norm": 0.9782620668411255, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1229, | |
| "step": 21500 | |
| }, | |
| { | |
| "epoch": 0.000216, | |
| "grad_norm": 0.9015646576881409, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1241, | |
| "step": 21600 | |
| }, | |
| { | |
| "epoch": 0.000217, | |
| "grad_norm": 0.9284467697143555, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1216, | |
| "step": 21700 | |
| }, | |
| { | |
| "epoch": 0.000218, | |
| "grad_norm": 0.8393162488937378, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1219, | |
| "step": 21800 | |
| }, | |
| { | |
| "epoch": 0.000219, | |
| "grad_norm": 0.9249029159545898, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1222, | |
| "step": 21900 | |
| }, | |
| { | |
| "epoch": 0.00022, | |
| "grad_norm": 0.931483805179596, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1208, | |
| "step": 22000 | |
| }, | |
| { | |
| "epoch": 0.000221, | |
| "grad_norm": 0.9092661142349243, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1214, | |
| "step": 22100 | |
| }, | |
| { | |
| "epoch": 0.000222, | |
| "grad_norm": 0.9886374473571777, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1189, | |
| "step": 22200 | |
| }, | |
| { | |
| "epoch": 0.000223, | |
| "grad_norm": 0.8833937644958496, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1175, | |
| "step": 22300 | |
| }, | |
| { | |
| "epoch": 0.000224, | |
| "grad_norm": 0.9673048257827759, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1168, | |
| "step": 22400 | |
| }, | |
| { | |
| "epoch": 0.000225, | |
| "grad_norm": 0.872240424156189, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1177, | |
| "step": 22500 | |
| }, | |
| { | |
| "epoch": 0.000226, | |
| "grad_norm": 0.849644660949707, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1177, | |
| "step": 22600 | |
| }, | |
| { | |
| "epoch": 0.000227, | |
| "grad_norm": 0.9396729469299316, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1174, | |
| "step": 22700 | |
| }, | |
| { | |
| "epoch": 0.000228, | |
| "grad_norm": 0.9100921750068665, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1161, | |
| "step": 22800 | |
| }, | |
| { | |
| "epoch": 0.000229, | |
| "grad_norm": 0.8232945203781128, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1149, | |
| "step": 22900 | |
| }, | |
| { | |
| "epoch": 0.00023, | |
| "grad_norm": 0.8654581904411316, | |
| "learning_rate": 1e-05, | |
| "loss": 0.116, | |
| "step": 23000 | |
| }, | |
| { | |
| "epoch": 0.000231, | |
| "grad_norm": 0.8864552974700928, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1161, | |
| "step": 23100 | |
| }, | |
| { | |
| "epoch": 0.000232, | |
| "grad_norm": 0.9292982816696167, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1126, | |
| "step": 23200 | |
| }, | |
| { | |
| "epoch": 0.000233, | |
| "grad_norm": 0.8095874786376953, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1141, | |
| "step": 23300 | |
| }, | |
| { | |
| "epoch": 0.000234, | |
| "grad_norm": 1.1662276983261108, | |
| "learning_rate": 1e-05, | |
| "loss": 0.113, | |
| "step": 23400 | |
| }, | |
| { | |
| "epoch": 0.000235, | |
| "grad_norm": 0.8531011343002319, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1147, | |
| "step": 23500 | |
| }, | |
| { | |
| "epoch": 0.000236, | |
| "grad_norm": 0.895802915096283, | |
| "learning_rate": 1e-05, | |
| "loss": 0.114, | |
| "step": 23600 | |
| }, | |
| { | |
| "epoch": 0.000237, | |
| "grad_norm": 0.8489896655082703, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1142, | |
| "step": 23700 | |
| }, | |
| { | |
| "epoch": 0.000238, | |
| "grad_norm": 0.8372708559036255, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1123, | |
| "step": 23800 | |
| }, | |
| { | |
| "epoch": 0.000239, | |
| "grad_norm": 0.8919999003410339, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1134, | |
| "step": 23900 | |
| }, | |
| { | |
| "epoch": 0.00024, | |
| "grad_norm": 0.8561524152755737, | |
| "learning_rate": 1e-05, | |
| "loss": 0.112, | |
| "step": 24000 | |
| }, | |
| { | |
| "epoch": 0.000241, | |
| "grad_norm": 0.8549727201461792, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1123, | |
| "step": 24100 | |
| }, | |
| { | |
| "epoch": 0.000242, | |
| "grad_norm": 0.8339006900787354, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1116, | |
| "step": 24200 | |
| }, | |
| { | |
| "epoch": 0.000243, | |
| "grad_norm": 0.8727480173110962, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1113, | |
| "step": 24300 | |
| }, | |
| { | |
| "epoch": 0.000244, | |
| "grad_norm": 0.881377637386322, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1098, | |
| "step": 24400 | |
| }, | |
| { | |
| "epoch": 0.000245, | |
| "grad_norm": 0.8690173029899597, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1109, | |
| "step": 24500 | |
| }, | |
| { | |
| "epoch": 0.000246, | |
| "grad_norm": 0.833027720451355, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1094, | |
| "step": 24600 | |
| }, | |
| { | |
| "epoch": 0.000247, | |
| "grad_norm": 0.8230149745941162, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1094, | |
| "step": 24700 | |
| }, | |
| { | |
| "epoch": 0.000248, | |
| "grad_norm": 0.8857430219650269, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1077, | |
| "step": 24800 | |
| }, | |
| { | |
| "epoch": 0.000249, | |
| "grad_norm": 0.9106509685516357, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1081, | |
| "step": 24900 | |
| }, | |
| { | |
| "epoch": 0.00025, | |
| "grad_norm": 0.9534709453582764, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1084, | |
| "step": 25000 | |
| }, | |
| { | |
| "epoch": 0.000251, | |
| "grad_norm": 0.8446188569068909, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1069, | |
| "step": 25100 | |
| }, | |
| { | |
| "epoch": 0.000252, | |
| "grad_norm": 0.8347111344337463, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1077, | |
| "step": 25200 | |
| }, | |
| { | |
| "epoch": 0.000253, | |
| "grad_norm": 0.8703511357307434, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1069, | |
| "step": 25300 | |
| }, | |
| { | |
| "epoch": 0.000254, | |
| "grad_norm": 0.8182582259178162, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1058, | |
| "step": 25400 | |
| }, | |
| { | |
| "epoch": 0.000255, | |
| "grad_norm": 0.8704941868782043, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1063, | |
| "step": 25500 | |
| }, | |
| { | |
| "epoch": 0.000256, | |
| "grad_norm": 0.8137685656547546, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1041, | |
| "step": 25600 | |
| }, | |
| { | |
| "epoch": 0.000257, | |
| "grad_norm": 0.7531348466873169, | |
| "learning_rate": 1e-05, | |
| "loss": 0.106, | |
| "step": 25700 | |
| }, | |
| { | |
| "epoch": 0.000258, | |
| "grad_norm": 0.886814534664154, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1051, | |
| "step": 25800 | |
| }, | |
| { | |
| "epoch": 0.000259, | |
| "grad_norm": 0.8390068411827087, | |
| "learning_rate": 1e-05, | |
| "loss": 0.105, | |
| "step": 25900 | |
| }, | |
| { | |
| "epoch": 0.00026, | |
| "grad_norm": 0.7962291836738586, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1046, | |
| "step": 26000 | |
| }, | |
| { | |
| "epoch": 0.000261, | |
| "grad_norm": 0.9102724194526672, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1044, | |
| "step": 26100 | |
| }, | |
| { | |
| "epoch": 0.000262, | |
| "grad_norm": 0.8715778589248657, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1031, | |
| "step": 26200 | |
| }, | |
| { | |
| "epoch": 0.000263, | |
| "grad_norm": 0.8876039385795593, | |
| "learning_rate": 1e-05, | |
| "loss": 0.103, | |
| "step": 26300 | |
| }, | |
| { | |
| "epoch": 0.000264, | |
| "grad_norm": 0.7934551239013672, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1017, | |
| "step": 26400 | |
| }, | |
| { | |
| "epoch": 0.000265, | |
| "grad_norm": 0.9847850799560547, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1032, | |
| "step": 26500 | |
| }, | |
| { | |
| "epoch": 0.000266, | |
| "grad_norm": 0.8920612335205078, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1032, | |
| "step": 26600 | |
| }, | |
| { | |
| "epoch": 0.000267, | |
| "grad_norm": 0.9092204570770264, | |
| "learning_rate": 1e-05, | |
| "loss": 0.102, | |
| "step": 26700 | |
| }, | |
| { | |
| "epoch": 0.000268, | |
| "grad_norm": 0.7922365069389343, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1024, | |
| "step": 26800 | |
| }, | |
| { | |
| "epoch": 0.000269, | |
| "grad_norm": 0.8614472150802612, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1022, | |
| "step": 26900 | |
| }, | |
| { | |
| "epoch": 0.00027, | |
| "grad_norm": 0.7870116829872131, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1004, | |
| "step": 27000 | |
| }, | |
| { | |
| "epoch": 0.000271, | |
| "grad_norm": 0.6980022192001343, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1006, | |
| "step": 27100 | |
| }, | |
| { | |
| "epoch": 0.000272, | |
| "grad_norm": 0.7720369100570679, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1012, | |
| "step": 27200 | |
| }, | |
| { | |
| "epoch": 0.000273, | |
| "grad_norm": 0.8154132962226868, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1005, | |
| "step": 27300 | |
| }, | |
| { | |
| "epoch": 0.000274, | |
| "grad_norm": 0.8288457989692688, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0985, | |
| "step": 27400 | |
| }, | |
| { | |
| "epoch": 0.000275, | |
| "grad_norm": 0.8117573261260986, | |
| "learning_rate": 1e-05, | |
| "loss": 0.1008, | |
| "step": 27500 | |
| }, | |
| { | |
| "epoch": 0.000276, | |
| "grad_norm": 0.7800782918930054, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0988, | |
| "step": 27600 | |
| }, | |
| { | |
| "epoch": 0.000277, | |
| "grad_norm": 0.9139901399612427, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0994, | |
| "step": 27700 | |
| }, | |
| { | |
| "epoch": 0.000278, | |
| "grad_norm": 0.745152473449707, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0982, | |
| "step": 27800 | |
| }, | |
| { | |
| "epoch": 0.000279, | |
| "grad_norm": 0.7476614117622375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0965, | |
| "step": 27900 | |
| }, | |
| { | |
| "epoch": 0.00028, | |
| "grad_norm": 0.7490776777267456, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0972, | |
| "step": 28000 | |
| }, | |
| { | |
| "epoch": 0.000281, | |
| "grad_norm": 0.7730040550231934, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0976, | |
| "step": 28100 | |
| }, | |
| { | |
| "epoch": 0.000282, | |
| "grad_norm": 0.7657092213630676, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0982, | |
| "step": 28200 | |
| }, | |
| { | |
| "epoch": 0.000283, | |
| "grad_norm": 0.9147765040397644, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0978, | |
| "step": 28300 | |
| }, | |
| { | |
| "epoch": 0.000284, | |
| "grad_norm": 0.7426789999008179, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0968, | |
| "step": 28400 | |
| }, | |
| { | |
| "epoch": 0.000285, | |
| "grad_norm": 0.8652293086051941, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0981, | |
| "step": 28500 | |
| }, | |
| { | |
| "epoch": 0.000286, | |
| "grad_norm": 0.6864128112792969, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0963, | |
| "step": 28600 | |
| }, | |
| { | |
| "epoch": 0.000287, | |
| "grad_norm": 0.7807822227478027, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0962, | |
| "step": 28700 | |
| }, | |
| { | |
| "epoch": 0.000288, | |
| "grad_norm": 0.8013282418251038, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0964, | |
| "step": 28800 | |
| }, | |
| { | |
| "epoch": 0.000289, | |
| "grad_norm": 0.7287372350692749, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0966, | |
| "step": 28900 | |
| }, | |
| { | |
| "epoch": 0.00029, | |
| "grad_norm": 0.7577667832374573, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0958, | |
| "step": 29000 | |
| }, | |
| { | |
| "epoch": 0.000291, | |
| "grad_norm": 0.7510080933570862, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0947, | |
| "step": 29100 | |
| }, | |
| { | |
| "epoch": 0.000292, | |
| "grad_norm": 0.8355770707130432, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0946, | |
| "step": 29200 | |
| }, | |
| { | |
| "epoch": 0.000293, | |
| "grad_norm": 0.8899005651473999, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0948, | |
| "step": 29300 | |
| }, | |
| { | |
| "epoch": 0.000294, | |
| "grad_norm": 0.8526831865310669, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0947, | |
| "step": 29400 | |
| }, | |
| { | |
| "epoch": 0.000295, | |
| "grad_norm": 0.740943968296051, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0928, | |
| "step": 29500 | |
| }, | |
| { | |
| "epoch": 0.000296, | |
| "grad_norm": 0.8096754550933838, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0948, | |
| "step": 29600 | |
| }, | |
| { | |
| "epoch": 0.000297, | |
| "grad_norm": 0.8890173435211182, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0934, | |
| "step": 29700 | |
| }, | |
| { | |
| "epoch": 0.000298, | |
| "grad_norm": 0.8200284838676453, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0931, | |
| "step": 29800 | |
| }, | |
| { | |
| "epoch": 0.000299, | |
| "grad_norm": 0.70655757188797, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0946, | |
| "step": 29900 | |
| }, | |
| { | |
| "epoch": 0.0003, | |
| "grad_norm": 0.7843393087387085, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0924, | |
| "step": 30000 | |
| }, | |
| { | |
| "epoch": 0.000301, | |
| "grad_norm": 0.6674346923828125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0925, | |
| "step": 30100 | |
| }, | |
| { | |
| "epoch": 0.000302, | |
| "grad_norm": 0.7955383062362671, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0927, | |
| "step": 30200 | |
| }, | |
| { | |
| "epoch": 0.000303, | |
| "grad_norm": 0.7410333752632141, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0923, | |
| "step": 30300 | |
| }, | |
| { | |
| "epoch": 0.000304, | |
| "grad_norm": 0.716390073299408, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0924, | |
| "step": 30400 | |
| }, | |
| { | |
| "epoch": 0.000305, | |
| "grad_norm": 0.7392554879188538, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0921, | |
| "step": 30500 | |
| }, | |
| { | |
| "epoch": 0.000306, | |
| "grad_norm": 0.9256471991539001, | |
| "learning_rate": 1e-05, | |
| "loss": 0.091, | |
| "step": 30600 | |
| }, | |
| { | |
| "epoch": 0.000307, | |
| "grad_norm": 0.7692530751228333, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0928, | |
| "step": 30700 | |
| }, | |
| { | |
| "epoch": 0.000308, | |
| "grad_norm": 0.7785292863845825, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0906, | |
| "step": 30800 | |
| }, | |
| { | |
| "epoch": 0.000309, | |
| "grad_norm": 0.8413007259368896, | |
| "learning_rate": 1e-05, | |
| "loss": 0.09, | |
| "step": 30900 | |
| }, | |
| { | |
| "epoch": 0.00031, | |
| "grad_norm": 0.9082907438278198, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0896, | |
| "step": 31000 | |
| }, | |
| { | |
| "epoch": 0.000311, | |
| "grad_norm": 0.7937412261962891, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0892, | |
| "step": 31100 | |
| }, | |
| { | |
| "epoch": 0.000312, | |
| "grad_norm": 0.7778225541114807, | |
| "learning_rate": 1e-05, | |
| "loss": 0.088, | |
| "step": 31200 | |
| }, | |
| { | |
| "epoch": 0.000313, | |
| "grad_norm": 0.7651337385177612, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0897, | |
| "step": 31300 | |
| }, | |
| { | |
| "epoch": 0.000314, | |
| "grad_norm": 0.7604988217353821, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0901, | |
| "step": 31400 | |
| }, | |
| { | |
| "epoch": 0.000315, | |
| "grad_norm": 0.779761016368866, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0903, | |
| "step": 31500 | |
| }, | |
| { | |
| "epoch": 0.000316, | |
| "grad_norm": 0.7517678737640381, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0885, | |
| "step": 31600 | |
| }, | |
| { | |
| "epoch": 0.000317, | |
| "grad_norm": 0.8016210794448853, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0893, | |
| "step": 31700 | |
| }, | |
| { | |
| "epoch": 0.000318, | |
| "grad_norm": 0.678521990776062, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0886, | |
| "step": 31800 | |
| }, | |
| { | |
| "epoch": 0.000319, | |
| "grad_norm": 0.7407852411270142, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0899, | |
| "step": 31900 | |
| }, | |
| { | |
| "epoch": 0.00032, | |
| "grad_norm": 0.8720430135726929, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0876, | |
| "step": 32000 | |
| }, | |
| { | |
| "epoch": 0.000321, | |
| "grad_norm": 0.7622641324996948, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0881, | |
| "step": 32100 | |
| }, | |
| { | |
| "epoch": 0.000322, | |
| "grad_norm": 0.6715940237045288, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0867, | |
| "step": 32200 | |
| }, | |
| { | |
| "epoch": 0.000323, | |
| "grad_norm": 0.8118298053741455, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0887, | |
| "step": 32300 | |
| }, | |
| { | |
| "epoch": 0.000324, | |
| "grad_norm": 0.7427231073379517, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0878, | |
| "step": 32400 | |
| }, | |
| { | |
| "epoch": 0.000325, | |
| "grad_norm": 0.7627066969871521, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0879, | |
| "step": 32500 | |
| }, | |
| { | |
| "epoch": 0.000326, | |
| "grad_norm": 0.7354280948638916, | |
| "learning_rate": 1e-05, | |
| "loss": 0.088, | |
| "step": 32600 | |
| }, | |
| { | |
| "epoch": 0.000327, | |
| "grad_norm": 0.6953477263450623, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0867, | |
| "step": 32700 | |
| }, | |
| { | |
| "epoch": 0.000328, | |
| "grad_norm": 0.7861385345458984, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0858, | |
| "step": 32800 | |
| }, | |
| { | |
| "epoch": 0.000329, | |
| "grad_norm": 0.7112125158309937, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0859, | |
| "step": 32900 | |
| }, | |
| { | |
| "epoch": 0.00033, | |
| "grad_norm": 0.7531374096870422, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0862, | |
| "step": 33000 | |
| }, | |
| { | |
| "epoch": 0.000331, | |
| "grad_norm": 0.7147675156593323, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0851, | |
| "step": 33100 | |
| }, | |
| { | |
| "epoch": 0.000332, | |
| "grad_norm": 0.8516043424606323, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0858, | |
| "step": 33200 | |
| }, | |
| { | |
| "epoch": 0.000333, | |
| "grad_norm": 0.7007201313972473, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0856, | |
| "step": 33300 | |
| }, | |
| { | |
| "epoch": 0.000334, | |
| "grad_norm": 0.7700639963150024, | |
| "learning_rate": 1e-05, | |
| "loss": 0.085, | |
| "step": 33400 | |
| }, | |
| { | |
| "epoch": 0.000335, | |
| "grad_norm": 0.7579879760742188, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0844, | |
| "step": 33500 | |
| }, | |
| { | |
| "epoch": 0.000336, | |
| "grad_norm": 0.7982689738273621, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0849, | |
| "step": 33600 | |
| }, | |
| { | |
| "epoch": 0.000337, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0864, | |
| "step": 33700 | |
| }, | |
| { | |
| "epoch": 0.000338, | |
| "grad_norm": 0.723205029964447, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0858, | |
| "step": 33800 | |
| }, | |
| { | |
| "epoch": 0.000339, | |
| "grad_norm": 0.7827596664428711, | |
| "learning_rate": 1e-05, | |
| "loss": 0.084, | |
| "step": 33900 | |
| }, | |
| { | |
| "epoch": 0.00034, | |
| "grad_norm": 0.8219903111457825, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0852, | |
| "step": 34000 | |
| }, | |
| { | |
| "epoch": 0.000341, | |
| "grad_norm": 0.8129620552062988, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0848, | |
| "step": 34100 | |
| }, | |
| { | |
| "epoch": 0.000342, | |
| "grad_norm": 0.6510952115058899, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0827, | |
| "step": 34200 | |
| }, | |
| { | |
| "epoch": 0.000343, | |
| "grad_norm": 0.7110053896903992, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0836, | |
| "step": 34300 | |
| }, | |
| { | |
| "epoch": 0.000344, | |
| "grad_norm": 0.7686619162559509, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0835, | |
| "step": 34400 | |
| }, | |
| { | |
| "epoch": 0.000345, | |
| "grad_norm": 0.829767107963562, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0827, | |
| "step": 34500 | |
| }, | |
| { | |
| "epoch": 0.000346, | |
| "grad_norm": 0.7650629281997681, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0826, | |
| "step": 34600 | |
| }, | |
| { | |
| "epoch": 0.000347, | |
| "grad_norm": 0.6766960024833679, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0831, | |
| "step": 34700 | |
| }, | |
| { | |
| "epoch": 0.000348, | |
| "grad_norm": 0.7824012637138367, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0831, | |
| "step": 34800 | |
| }, | |
| { | |
| "epoch": 0.000349, | |
| "grad_norm": 0.697309136390686, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0826, | |
| "step": 34900 | |
| }, | |
| { | |
| "epoch": 0.00035, | |
| "grad_norm": 0.6359274387359619, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0821, | |
| "step": 35000 | |
| }, | |
| { | |
| "epoch": 0.000351, | |
| "grad_norm": 0.7838051915168762, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0828, | |
| "step": 35100 | |
| }, | |
| { | |
| "epoch": 0.000352, | |
| "grad_norm": 0.8149462938308716, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0819, | |
| "step": 35200 | |
| }, | |
| { | |
| "epoch": 0.000353, | |
| "grad_norm": 0.7315548062324524, | |
| "learning_rate": 1e-05, | |
| "loss": 0.081, | |
| "step": 35300 | |
| }, | |
| { | |
| "epoch": 0.000354, | |
| "grad_norm": 0.6927749514579773, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0802, | |
| "step": 35400 | |
| }, | |
| { | |
| "epoch": 0.000355, | |
| "grad_norm": 0.7449594736099243, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0822, | |
| "step": 35500 | |
| }, | |
| { | |
| "epoch": 0.000356, | |
| "grad_norm": 0.6572420597076416, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0809, | |
| "step": 35600 | |
| }, | |
| { | |
| "epoch": 0.000357, | |
| "grad_norm": 0.7096725702285767, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0805, | |
| "step": 35700 | |
| }, | |
| { | |
| "epoch": 0.000358, | |
| "grad_norm": 0.8065080046653748, | |
| "learning_rate": 1e-05, | |
| "loss": 0.08, | |
| "step": 35800 | |
| }, | |
| { | |
| "epoch": 0.000359, | |
| "grad_norm": 0.5750519633293152, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0796, | |
| "step": 35900 | |
| }, | |
| { | |
| "epoch": 0.00036, | |
| "grad_norm": 0.7987583875656128, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0795, | |
| "step": 36000 | |
| }, | |
| { | |
| "epoch": 0.000361, | |
| "grad_norm": 0.7741938233375549, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0795, | |
| "step": 36100 | |
| }, | |
| { | |
| "epoch": 0.000362, | |
| "grad_norm": 0.7459242343902588, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0804, | |
| "step": 36200 | |
| }, | |
| { | |
| "epoch": 0.000363, | |
| "grad_norm": 0.6847333312034607, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0809, | |
| "step": 36300 | |
| }, | |
| { | |
| "epoch": 0.000364, | |
| "grad_norm": 0.7405627369880676, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0782, | |
| "step": 36400 | |
| }, | |
| { | |
| "epoch": 0.000365, | |
| "grad_norm": 0.6119332909584045, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0806, | |
| "step": 36500 | |
| }, | |
| { | |
| "epoch": 0.000366, | |
| "grad_norm": 0.7295922636985779, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0791, | |
| "step": 36600 | |
| }, | |
| { | |
| "epoch": 0.000367, | |
| "grad_norm": 0.7362000346183777, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0793, | |
| "step": 36700 | |
| }, | |
| { | |
| "epoch": 0.000368, | |
| "grad_norm": 0.650321900844574, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0787, | |
| "step": 36800 | |
| }, | |
| { | |
| "epoch": 0.000369, | |
| "grad_norm": 0.6487528681755066, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0788, | |
| "step": 36900 | |
| }, | |
| { | |
| "epoch": 0.00037, | |
| "grad_norm": 0.6908884644508362, | |
| "learning_rate": 1e-05, | |
| "loss": 0.078, | |
| "step": 37000 | |
| }, | |
| { | |
| "epoch": 0.000371, | |
| "grad_norm": 0.7823421359062195, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0773, | |
| "step": 37100 | |
| }, | |
| { | |
| "epoch": 0.000372, | |
| "grad_norm": 0.7242419719696045, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0789, | |
| "step": 37200 | |
| }, | |
| { | |
| "epoch": 0.000373, | |
| "grad_norm": 0.7191994786262512, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0786, | |
| "step": 37300 | |
| }, | |
| { | |
| "epoch": 0.000374, | |
| "grad_norm": 0.6352174282073975, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0782, | |
| "step": 37400 | |
| }, | |
| { | |
| "epoch": 0.000375, | |
| "grad_norm": 0.6456391215324402, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0801, | |
| "step": 37500 | |
| }, | |
| { | |
| "epoch": 0.000376, | |
| "grad_norm": 0.7176135182380676, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0788, | |
| "step": 37600 | |
| }, | |
| { | |
| "epoch": 0.000377, | |
| "grad_norm": 0.7592889666557312, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0782, | |
| "step": 37700 | |
| }, | |
| { | |
| "epoch": 0.000378, | |
| "grad_norm": 0.7405545115470886, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0772, | |
| "step": 37800 | |
| }, | |
| { | |
| "epoch": 0.000379, | |
| "grad_norm": 0.6966970562934875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0761, | |
| "step": 37900 | |
| }, | |
| { | |
| "epoch": 0.00038, | |
| "grad_norm": 0.7346359491348267, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0775, | |
| "step": 38000 | |
| }, | |
| { | |
| "epoch": 0.000381, | |
| "grad_norm": 0.729246199131012, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0767, | |
| "step": 38100 | |
| }, | |
| { | |
| "epoch": 0.000382, | |
| "grad_norm": 0.8081512451171875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.078, | |
| "step": 38200 | |
| }, | |
| { | |
| "epoch": 0.000383, | |
| "grad_norm": 0.6851301193237305, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0757, | |
| "step": 38300 | |
| }, | |
| { | |
| "epoch": 0.000384, | |
| "grad_norm": 0.6699986457824707, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0767, | |
| "step": 38400 | |
| }, | |
| { | |
| "epoch": 0.000385, | |
| "grad_norm": 0.7026481032371521, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0776, | |
| "step": 38500 | |
| }, | |
| { | |
| "epoch": 0.000386, | |
| "grad_norm": 0.7267670035362244, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0761, | |
| "step": 38600 | |
| }, | |
| { | |
| "epoch": 0.000387, | |
| "grad_norm": 0.648714005947113, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0749, | |
| "step": 38700 | |
| }, | |
| { | |
| "epoch": 0.000388, | |
| "grad_norm": 0.7160006165504456, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0756, | |
| "step": 38800 | |
| }, | |
| { | |
| "epoch": 0.000389, | |
| "grad_norm": 0.7773024439811707, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0759, | |
| "step": 38900 | |
| }, | |
| { | |
| "epoch": 0.00039, | |
| "grad_norm": 0.7162371277809143, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0749, | |
| "step": 39000 | |
| }, | |
| { | |
| "epoch": 0.000391, | |
| "grad_norm": 0.7529783844947815, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0746, | |
| "step": 39100 | |
| }, | |
| { | |
| "epoch": 0.000392, | |
| "grad_norm": 0.866392195224762, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0755, | |
| "step": 39200 | |
| }, | |
| { | |
| "epoch": 0.000393, | |
| "grad_norm": 0.751728355884552, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0752, | |
| "step": 39300 | |
| }, | |
| { | |
| "epoch": 0.000394, | |
| "grad_norm": 0.6856648325920105, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0753, | |
| "step": 39400 | |
| }, | |
| { | |
| "epoch": 0.000395, | |
| "grad_norm": 0.683175265789032, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0739, | |
| "step": 39500 | |
| }, | |
| { | |
| "epoch": 0.000396, | |
| "grad_norm": 0.7458997368812561, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0752, | |
| "step": 39600 | |
| }, | |
| { | |
| "epoch": 0.000397, | |
| "grad_norm": 0.7095280885696411, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0753, | |
| "step": 39700 | |
| }, | |
| { | |
| "epoch": 0.000398, | |
| "grad_norm": 0.6352033019065857, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0737, | |
| "step": 39800 | |
| }, | |
| { | |
| "epoch": 0.000399, | |
| "grad_norm": 0.695184588432312, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0738, | |
| "step": 39900 | |
| }, | |
| { | |
| "epoch": 0.0004, | |
| "grad_norm": 0.6518137454986572, | |
| "learning_rate": 1e-05, | |
| "loss": 0.074, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.0004, | |
| "eval_loss": 0.057464599609375, | |
| "eval_runtime": 146.7084, | |
| "eval_samples_per_second": 340.812, | |
| "eval_steps_per_second": 21.301, | |
| "step": 40000 | |
| }, | |
| { | |
| "epoch": 0.000401, | |
| "grad_norm": 0.7782549858093262, | |
| "learning_rate": 1e-05, | |
| "loss": 0.074, | |
| "step": 40100 | |
| }, | |
| { | |
| "epoch": 0.000402, | |
| "grad_norm": 0.6919134855270386, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0739, | |
| "step": 40200 | |
| }, | |
| { | |
| "epoch": 0.000403, | |
| "grad_norm": 0.661824643611908, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0744, | |
| "step": 40300 | |
| }, | |
| { | |
| "epoch": 0.000404, | |
| "grad_norm": 0.6964775919914246, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0732, | |
| "step": 40400 | |
| }, | |
| { | |
| "epoch": 0.000405, | |
| "grad_norm": 0.860140860080719, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0736, | |
| "step": 40500 | |
| }, | |
| { | |
| "epoch": 0.000406, | |
| "grad_norm": 0.6227797865867615, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0734, | |
| "step": 40600 | |
| }, | |
| { | |
| "epoch": 0.000407, | |
| "grad_norm": 0.5687974095344543, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0734, | |
| "step": 40700 | |
| }, | |
| { | |
| "epoch": 0.000408, | |
| "grad_norm": 0.6930891871452332, | |
| "learning_rate": 1e-05, | |
| "loss": 0.074, | |
| "step": 40800 | |
| }, | |
| { | |
| "epoch": 0.000409, | |
| "grad_norm": 0.6303442716598511, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0728, | |
| "step": 40900 | |
| }, | |
| { | |
| "epoch": 0.00041, | |
| "grad_norm": 0.6731743812561035, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0742, | |
| "step": 41000 | |
| }, | |
| { | |
| "epoch": 0.000411, | |
| "grad_norm": 0.6712822318077087, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0737, | |
| "step": 41100 | |
| }, | |
| { | |
| "epoch": 0.000412, | |
| "grad_norm": 0.6134166717529297, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0728, | |
| "step": 41200 | |
| }, | |
| { | |
| "epoch": 0.000413, | |
| "grad_norm": 0.6910662651062012, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0726, | |
| "step": 41300 | |
| }, | |
| { | |
| "epoch": 0.000414, | |
| "grad_norm": 0.6266744136810303, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0719, | |
| "step": 41400 | |
| }, | |
| { | |
| "epoch": 0.000415, | |
| "grad_norm": 0.600907027721405, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0737, | |
| "step": 41500 | |
| }, | |
| { | |
| "epoch": 0.000416, | |
| "grad_norm": 0.6139588356018066, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0722, | |
| "step": 41600 | |
| }, | |
| { | |
| "epoch": 0.000417, | |
| "grad_norm": 0.6445550918579102, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0721, | |
| "step": 41700 | |
| }, | |
| { | |
| "epoch": 0.000418, | |
| "grad_norm": 0.7176617980003357, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0718, | |
| "step": 41800 | |
| }, | |
| { | |
| "epoch": 0.000419, | |
| "grad_norm": 0.7564845085144043, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0724, | |
| "step": 41900 | |
| }, | |
| { | |
| "epoch": 0.00042, | |
| "grad_norm": 0.7683578133583069, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0714, | |
| "step": 42000 | |
| }, | |
| { | |
| "epoch": 0.000421, | |
| "grad_norm": 0.731192946434021, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0707, | |
| "step": 42100 | |
| }, | |
| { | |
| "epoch": 0.000422, | |
| "grad_norm": 0.6390314102172852, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0706, | |
| "step": 42200 | |
| }, | |
| { | |
| "epoch": 0.000423, | |
| "grad_norm": 0.6024550199508667, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0714, | |
| "step": 42300 | |
| }, | |
| { | |
| "epoch": 0.000424, | |
| "grad_norm": 0.6974002718925476, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0712, | |
| "step": 42400 | |
| }, | |
| { | |
| "epoch": 0.000425, | |
| "grad_norm": 0.6231324672698975, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0719, | |
| "step": 42500 | |
| }, | |
| { | |
| "epoch": 0.000426, | |
| "grad_norm": 0.6329951882362366, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0708, | |
| "step": 42600 | |
| }, | |
| { | |
| "epoch": 0.000427, | |
| "grad_norm": 0.7328572869300842, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0713, | |
| "step": 42700 | |
| }, | |
| { | |
| "epoch": 0.000428, | |
| "grad_norm": 0.6142467856407166, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0721, | |
| "step": 42800 | |
| }, | |
| { | |
| "epoch": 0.000429, | |
| "grad_norm": 0.7287197113037109, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0706, | |
| "step": 42900 | |
| }, | |
| { | |
| "epoch": 0.00043, | |
| "grad_norm": 0.6606420278549194, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0699, | |
| "step": 43000 | |
| }, | |
| { | |
| "epoch": 0.000431, | |
| "grad_norm": 0.7667610049247742, | |
| "learning_rate": 1e-05, | |
| "loss": 0.07, | |
| "step": 43100 | |
| }, | |
| { | |
| "epoch": 0.000432, | |
| "grad_norm": 0.5734269618988037, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0699, | |
| "step": 43200 | |
| }, | |
| { | |
| "epoch": 0.000433, | |
| "grad_norm": 0.5326073169708252, | |
| "learning_rate": 1e-05, | |
| "loss": 0.07, | |
| "step": 43300 | |
| }, | |
| { | |
| "epoch": 0.000434, | |
| "grad_norm": 0.7028875946998596, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0696, | |
| "step": 43400 | |
| }, | |
| { | |
| "epoch": 0.000435, | |
| "grad_norm": 0.6137057542800903, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0691, | |
| "step": 43500 | |
| }, | |
| { | |
| "epoch": 0.000436, | |
| "grad_norm": 0.5539369583129883, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0688, | |
| "step": 43600 | |
| }, | |
| { | |
| "epoch": 0.000437, | |
| "grad_norm": 0.7035527229309082, | |
| "learning_rate": 1e-05, | |
| "loss": 0.071, | |
| "step": 43700 | |
| }, | |
| { | |
| "epoch": 0.000438, | |
| "grad_norm": 0.7055030465126038, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0699, | |
| "step": 43800 | |
| }, | |
| { | |
| "epoch": 0.000439, | |
| "grad_norm": 0.536948025226593, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0697, | |
| "step": 43900 | |
| }, | |
| { | |
| "epoch": 0.00044, | |
| "grad_norm": 0.6797453165054321, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0677, | |
| "step": 44000 | |
| }, | |
| { | |
| "epoch": 0.000441, | |
| "grad_norm": 0.6475409865379333, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0696, | |
| "step": 44100 | |
| }, | |
| { | |
| "epoch": 0.000442, | |
| "grad_norm": 0.5951113700866699, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0683, | |
| "step": 44200 | |
| }, | |
| { | |
| "epoch": 0.000443, | |
| "grad_norm": 0.7197650671005249, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0696, | |
| "step": 44300 | |
| }, | |
| { | |
| "epoch": 0.000444, | |
| "grad_norm": 0.6708860397338867, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0692, | |
| "step": 44400 | |
| }, | |
| { | |
| "epoch": 0.000445, | |
| "grad_norm": 0.6833498477935791, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0694, | |
| "step": 44500 | |
| }, | |
| { | |
| "epoch": 0.000446, | |
| "grad_norm": 0.6520599722862244, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0694, | |
| "step": 44600 | |
| }, | |
| { | |
| "epoch": 0.000447, | |
| "grad_norm": 0.7471343278884888, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0679, | |
| "step": 44700 | |
| }, | |
| { | |
| "epoch": 0.000448, | |
| "grad_norm": 0.6124304533004761, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0685, | |
| "step": 44800 | |
| }, | |
| { | |
| "epoch": 0.000449, | |
| "grad_norm": 0.6457110643386841, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0683, | |
| "step": 44900 | |
| }, | |
| { | |
| "epoch": 0.00045, | |
| "grad_norm": 0.8282802104949951, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0675, | |
| "step": 45000 | |
| }, | |
| { | |
| "epoch": 0.000451, | |
| "grad_norm": 0.7290102243423462, | |
| "learning_rate": 1e-05, | |
| "loss": 0.067, | |
| "step": 45100 | |
| }, | |
| { | |
| "epoch": 0.000452, | |
| "grad_norm": 0.6666006445884705, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0672, | |
| "step": 45200 | |
| }, | |
| { | |
| "epoch": 0.000453, | |
| "grad_norm": 0.5930759906768799, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0687, | |
| "step": 45300 | |
| }, | |
| { | |
| "epoch": 0.000454, | |
| "grad_norm": 0.7391034960746765, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0681, | |
| "step": 45400 | |
| }, | |
| { | |
| "epoch": 0.000455, | |
| "grad_norm": 0.6331747770309448, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0686, | |
| "step": 45500 | |
| }, | |
| { | |
| "epoch": 0.000456, | |
| "grad_norm": 0.7175407409667969, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0682, | |
| "step": 45600 | |
| }, | |
| { | |
| "epoch": 0.000457, | |
| "grad_norm": 0.6839337348937988, | |
| "learning_rate": 1e-05, | |
| "loss": 0.068, | |
| "step": 45700 | |
| }, | |
| { | |
| "epoch": 0.000458, | |
| "grad_norm": 0.7204523682594299, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0674, | |
| "step": 45800 | |
| }, | |
| { | |
| "epoch": 0.000459, | |
| "grad_norm": 0.6172782778739929, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0672, | |
| "step": 45900 | |
| }, | |
| { | |
| "epoch": 0.00046, | |
| "grad_norm": 0.6801437735557556, | |
| "learning_rate": 1e-05, | |
| "loss": 0.068, | |
| "step": 46000 | |
| }, | |
| { | |
| "epoch": 0.000461, | |
| "grad_norm": 0.6950106620788574, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0667, | |
| "step": 46100 | |
| }, | |
| { | |
| "epoch": 0.000462, | |
| "grad_norm": 0.7430393099784851, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0661, | |
| "step": 46200 | |
| }, | |
| { | |
| "epoch": 0.000463, | |
| "grad_norm": 0.7335778474807739, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0664, | |
| "step": 46300 | |
| }, | |
| { | |
| "epoch": 0.000464, | |
| "grad_norm": 0.6109582185745239, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0678, | |
| "step": 46400 | |
| }, | |
| { | |
| "epoch": 0.000465, | |
| "grad_norm": 0.747843325138092, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0666, | |
| "step": 46500 | |
| }, | |
| { | |
| "epoch": 0.000466, | |
| "grad_norm": 0.5541141033172607, | |
| "learning_rate": 1e-05, | |
| "loss": 0.066, | |
| "step": 46600 | |
| }, | |
| { | |
| "epoch": 0.000467, | |
| "grad_norm": 0.7821163535118103, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0663, | |
| "step": 46700 | |
| }, | |
| { | |
| "epoch": 0.000468, | |
| "grad_norm": 0.6927903294563293, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0668, | |
| "step": 46800 | |
| }, | |
| { | |
| "epoch": 0.000469, | |
| "grad_norm": 0.6270934343338013, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0674, | |
| "step": 46900 | |
| }, | |
| { | |
| "epoch": 0.00047, | |
| "grad_norm": 0.7509257197380066, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0661, | |
| "step": 47000 | |
| }, | |
| { | |
| "epoch": 0.000471, | |
| "grad_norm": 0.6083252429962158, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0655, | |
| "step": 47100 | |
| }, | |
| { | |
| "epoch": 0.000472, | |
| "grad_norm": 0.5622929334640503, | |
| "learning_rate": 1e-05, | |
| "loss": 0.065, | |
| "step": 47200 | |
| }, | |
| { | |
| "epoch": 0.000473, | |
| "grad_norm": 0.5768439173698425, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0663, | |
| "step": 47300 | |
| }, | |
| { | |
| "epoch": 0.000474, | |
| "grad_norm": 0.7420287728309631, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0647, | |
| "step": 47400 | |
| }, | |
| { | |
| "epoch": 0.000475, | |
| "grad_norm": 0.6630219221115112, | |
| "learning_rate": 1e-05, | |
| "loss": 0.066, | |
| "step": 47500 | |
| }, | |
| { | |
| "epoch": 0.000476, | |
| "grad_norm": 0.5590940713882446, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0662, | |
| "step": 47600 | |
| }, | |
| { | |
| "epoch": 0.000477, | |
| "grad_norm": 0.5448912382125854, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0648, | |
| "step": 47700 | |
| }, | |
| { | |
| "epoch": 0.000478, | |
| "grad_norm": 0.6090975999832153, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0653, | |
| "step": 47800 | |
| }, | |
| { | |
| "epoch": 0.000479, | |
| "grad_norm": 0.7398414611816406, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0653, | |
| "step": 47900 | |
| }, | |
| { | |
| "epoch": 0.00048, | |
| "grad_norm": 0.6005905270576477, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0654, | |
| "step": 48000 | |
| }, | |
| { | |
| "epoch": 0.000481, | |
| "grad_norm": 0.6361467838287354, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0653, | |
| "step": 48100 | |
| }, | |
| { | |
| "epoch": 0.000482, | |
| "grad_norm": 0.6767069101333618, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0652, | |
| "step": 48200 | |
| }, | |
| { | |
| "epoch": 0.000483, | |
| "grad_norm": 0.6184808015823364, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0654, | |
| "step": 48300 | |
| }, | |
| { | |
| "epoch": 0.000484, | |
| "grad_norm": 0.7021101117134094, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0637, | |
| "step": 48400 | |
| }, | |
| { | |
| "epoch": 0.000485, | |
| "grad_norm": 0.6103231310844421, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0653, | |
| "step": 48500 | |
| }, | |
| { | |
| "epoch": 0.000486, | |
| "grad_norm": 0.5976945161819458, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0647, | |
| "step": 48600 | |
| }, | |
| { | |
| "epoch": 0.000487, | |
| "grad_norm": 0.6222690343856812, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0647, | |
| "step": 48700 | |
| }, | |
| { | |
| "epoch": 0.000488, | |
| "grad_norm": 0.5408068299293518, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0641, | |
| "step": 48800 | |
| }, | |
| { | |
| "epoch": 0.000489, | |
| "grad_norm": 0.628935694694519, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0642, | |
| "step": 48900 | |
| }, | |
| { | |
| "epoch": 0.00049, | |
| "grad_norm": 0.6062678694725037, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0645, | |
| "step": 49000 | |
| }, | |
| { | |
| "epoch": 0.000491, | |
| "grad_norm": 0.6533873677253723, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0648, | |
| "step": 49100 | |
| }, | |
| { | |
| "epoch": 0.000492, | |
| "grad_norm": 0.6818357706069946, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0642, | |
| "step": 49200 | |
| }, | |
| { | |
| "epoch": 0.000493, | |
| "grad_norm": 0.5615854859352112, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0649, | |
| "step": 49300 | |
| }, | |
| { | |
| "epoch": 0.000494, | |
| "grad_norm": 0.5262526273727417, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0645, | |
| "step": 49400 | |
| }, | |
| { | |
| "epoch": 0.000495, | |
| "grad_norm": 0.5227097868919373, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0634, | |
| "step": 49500 | |
| }, | |
| { | |
| "epoch": 0.000496, | |
| "grad_norm": 0.5794950723648071, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0632, | |
| "step": 49600 | |
| }, | |
| { | |
| "epoch": 0.000497, | |
| "grad_norm": 0.5515991449356079, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0639, | |
| "step": 49700 | |
| }, | |
| { | |
| "epoch": 0.000498, | |
| "grad_norm": 0.5834317803382874, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0633, | |
| "step": 49800 | |
| }, | |
| { | |
| "epoch": 0.000499, | |
| "grad_norm": 0.6389098763465881, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0637, | |
| "step": 49900 | |
| }, | |
| { | |
| "epoch": 0.0005, | |
| "grad_norm": 0.6473069787025452, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0634, | |
| "step": 50000 | |
| }, | |
| { | |
| "epoch": 0.000501, | |
| "grad_norm": 0.5156600475311279, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0638, | |
| "step": 50100 | |
| }, | |
| { | |
| "epoch": 0.000502, | |
| "grad_norm": 0.6542375683784485, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0635, | |
| "step": 50200 | |
| }, | |
| { | |
| "epoch": 0.000503, | |
| "grad_norm": 0.8224967122077942, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0631, | |
| "step": 50300 | |
| }, | |
| { | |
| "epoch": 0.000504, | |
| "grad_norm": 0.6293924450874329, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0619, | |
| "step": 50400 | |
| }, | |
| { | |
| "epoch": 0.000505, | |
| "grad_norm": 0.7436028718948364, | |
| "learning_rate": 1e-05, | |
| "loss": 0.064, | |
| "step": 50500 | |
| }, | |
| { | |
| "epoch": 0.000506, | |
| "grad_norm": 0.660367488861084, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0639, | |
| "step": 50600 | |
| }, | |
| { | |
| "epoch": 0.000507, | |
| "grad_norm": 0.5511479377746582, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0625, | |
| "step": 50700 | |
| }, | |
| { | |
| "epoch": 0.000508, | |
| "grad_norm": 0.5846619009971619, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0634, | |
| "step": 50800 | |
| }, | |
| { | |
| "epoch": 0.000509, | |
| "grad_norm": 0.5902076959609985, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0637, | |
| "step": 50900 | |
| }, | |
| { | |
| "epoch": 0.00051, | |
| "grad_norm": 0.5104527473449707, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0627, | |
| "step": 51000 | |
| }, | |
| { | |
| "epoch": 0.000511, | |
| "grad_norm": 0.592365026473999, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0624, | |
| "step": 51100 | |
| }, | |
| { | |
| "epoch": 0.000512, | |
| "grad_norm": 0.7283549904823303, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0618, | |
| "step": 51200 | |
| }, | |
| { | |
| "epoch": 0.000513, | |
| "grad_norm": 0.6117008328437805, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0621, | |
| "step": 51300 | |
| }, | |
| { | |
| "epoch": 0.000514, | |
| "grad_norm": 0.6155059933662415, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0627, | |
| "step": 51400 | |
| }, | |
| { | |
| "epoch": 0.000515, | |
| "grad_norm": 0.6605076789855957, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0626, | |
| "step": 51500 | |
| }, | |
| { | |
| "epoch": 0.000516, | |
| "grad_norm": 0.7391318082809448, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0609, | |
| "step": 51600 | |
| }, | |
| { | |
| "epoch": 0.000517, | |
| "grad_norm": 0.5673928260803223, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0626, | |
| "step": 51700 | |
| }, | |
| { | |
| "epoch": 0.000518, | |
| "grad_norm": 0.7229452729225159, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0613, | |
| "step": 51800 | |
| }, | |
| { | |
| "epoch": 0.000519, | |
| "grad_norm": 0.6015135049819946, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0614, | |
| "step": 51900 | |
| }, | |
| { | |
| "epoch": 0.00052, | |
| "grad_norm": 3.3136706352233887, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0607, | |
| "step": 52000 | |
| }, | |
| { | |
| "epoch": 0.000521, | |
| "grad_norm": 0.5922873616218567, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0627, | |
| "step": 52100 | |
| }, | |
| { | |
| "epoch": 0.000522, | |
| "grad_norm": 0.6967010498046875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0611, | |
| "step": 52200 | |
| }, | |
| { | |
| "epoch": 0.000523, | |
| "grad_norm": 0.5986941456794739, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0618, | |
| "step": 52300 | |
| }, | |
| { | |
| "epoch": 0.000524, | |
| "grad_norm": 0.5476034879684448, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0614, | |
| "step": 52400 | |
| }, | |
| { | |
| "epoch": 0.000525, | |
| "grad_norm": 0.5859378576278687, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0614, | |
| "step": 52500 | |
| }, | |
| { | |
| "epoch": 0.000526, | |
| "grad_norm": 0.601116955280304, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0618, | |
| "step": 52600 | |
| }, | |
| { | |
| "epoch": 0.000527, | |
| "grad_norm": 0.5084663033485413, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0622, | |
| "step": 52700 | |
| }, | |
| { | |
| "epoch": 0.000528, | |
| "grad_norm": 0.5654129385948181, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0625, | |
| "step": 52800 | |
| }, | |
| { | |
| "epoch": 0.000529, | |
| "grad_norm": 0.5403587222099304, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0605, | |
| "step": 52900 | |
| }, | |
| { | |
| "epoch": 0.00053, | |
| "grad_norm": 0.5523150563240051, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0615, | |
| "step": 53000 | |
| }, | |
| { | |
| "epoch": 0.000531, | |
| "grad_norm": 0.6014654636383057, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0613, | |
| "step": 53100 | |
| }, | |
| { | |
| "epoch": 0.000532, | |
| "grad_norm": 0.6389763355255127, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0618, | |
| "step": 53200 | |
| }, | |
| { | |
| "epoch": 0.000533, | |
| "grad_norm": 0.6326813697814941, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0621, | |
| "step": 53300 | |
| }, | |
| { | |
| "epoch": 0.000534, | |
| "grad_norm": 0.5675824284553528, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0603, | |
| "step": 53400 | |
| }, | |
| { | |
| "epoch": 0.000535, | |
| "grad_norm": 0.6056302189826965, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0604, | |
| "step": 53500 | |
| }, | |
| { | |
| "epoch": 0.000536, | |
| "grad_norm": 0.7404552698135376, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0617, | |
| "step": 53600 | |
| }, | |
| { | |
| "epoch": 0.000537, | |
| "grad_norm": 0.5762139558792114, | |
| "learning_rate": 1e-05, | |
| "loss": 0.061, | |
| "step": 53700 | |
| }, | |
| { | |
| "epoch": 0.000538, | |
| "grad_norm": 0.6377224922180176, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0606, | |
| "step": 53800 | |
| }, | |
| { | |
| "epoch": 0.000539, | |
| "grad_norm": 0.6007105708122253, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0617, | |
| "step": 53900 | |
| }, | |
| { | |
| "epoch": 0.00054, | |
| "grad_norm": 0.679589033126831, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0609, | |
| "step": 54000 | |
| }, | |
| { | |
| "epoch": 0.000541, | |
| "grad_norm": 0.6322323679924011, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0611, | |
| "step": 54100 | |
| }, | |
| { | |
| "epoch": 0.000542, | |
| "grad_norm": 0.7151752710342407, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0594, | |
| "step": 54200 | |
| }, | |
| { | |
| "epoch": 0.000543, | |
| "grad_norm": 0.5888739228248596, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0608, | |
| "step": 54300 | |
| }, | |
| { | |
| "epoch": 0.000544, | |
| "grad_norm": 0.5529482364654541, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0616, | |
| "step": 54400 | |
| }, | |
| { | |
| "epoch": 0.000545, | |
| "grad_norm": 0.5086714625358582, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0599, | |
| "step": 54500 | |
| }, | |
| { | |
| "epoch": 0.000546, | |
| "grad_norm": 0.5248231887817383, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0611, | |
| "step": 54600 | |
| }, | |
| { | |
| "epoch": 0.000547, | |
| "grad_norm": 0.48391416668891907, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0603, | |
| "step": 54700 | |
| }, | |
| { | |
| "epoch": 0.000548, | |
| "grad_norm": 0.6535386443138123, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0599, | |
| "step": 54800 | |
| }, | |
| { | |
| "epoch": 0.000549, | |
| "grad_norm": 0.6315100193023682, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0599, | |
| "step": 54900 | |
| }, | |
| { | |
| "epoch": 0.00055, | |
| "grad_norm": 0.5279924273490906, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0601, | |
| "step": 55000 | |
| }, | |
| { | |
| "epoch": 0.000551, | |
| "grad_norm": 0.5455300807952881, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0601, | |
| "step": 55100 | |
| }, | |
| { | |
| "epoch": 0.000552, | |
| "grad_norm": 0.556695282459259, | |
| "learning_rate": 1e-05, | |
| "loss": 0.06, | |
| "step": 55200 | |
| }, | |
| { | |
| "epoch": 0.000553, | |
| "grad_norm": 0.5867908000946045, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0584, | |
| "step": 55300 | |
| }, | |
| { | |
| "epoch": 0.000554, | |
| "grad_norm": 0.6211426258087158, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0594, | |
| "step": 55400 | |
| }, | |
| { | |
| "epoch": 0.000555, | |
| "grad_norm": 0.6962873339653015, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0588, | |
| "step": 55500 | |
| }, | |
| { | |
| "epoch": 0.000556, | |
| "grad_norm": 0.5341864228248596, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0594, | |
| "step": 55600 | |
| }, | |
| { | |
| "epoch": 0.000557, | |
| "grad_norm": 0.5630548000335693, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0596, | |
| "step": 55700 | |
| }, | |
| { | |
| "epoch": 0.000558, | |
| "grad_norm": 0.6993235349655151, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0592, | |
| "step": 55800 | |
| }, | |
| { | |
| "epoch": 0.000559, | |
| "grad_norm": 0.5936434268951416, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0589, | |
| "step": 55900 | |
| }, | |
| { | |
| "epoch": 0.00056, | |
| "grad_norm": 0.6682338714599609, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0592, | |
| "step": 56000 | |
| }, | |
| { | |
| "epoch": 0.000561, | |
| "grad_norm": 0.5741124749183655, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0586, | |
| "step": 56100 | |
| }, | |
| { | |
| "epoch": 0.000562, | |
| "grad_norm": 0.5639105439186096, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0596, | |
| "step": 56200 | |
| }, | |
| { | |
| "epoch": 0.000563, | |
| "grad_norm": 0.6496306657791138, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0588, | |
| "step": 56300 | |
| }, | |
| { | |
| "epoch": 0.000564, | |
| "grad_norm": 0.6160135865211487, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0593, | |
| "step": 56400 | |
| }, | |
| { | |
| "epoch": 0.000565, | |
| "grad_norm": 0.6027793288230896, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0579, | |
| "step": 56500 | |
| }, | |
| { | |
| "epoch": 0.000566, | |
| "grad_norm": 0.6365297436714172, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0592, | |
| "step": 56600 | |
| }, | |
| { | |
| "epoch": 0.000567, | |
| "grad_norm": 0.6124427914619446, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0584, | |
| "step": 56700 | |
| }, | |
| { | |
| "epoch": 0.000568, | |
| "grad_norm": 0.5500183701515198, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0603, | |
| "step": 56800 | |
| }, | |
| { | |
| "epoch": 0.000569, | |
| "grad_norm": 0.6076985597610474, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0586, | |
| "step": 56900 | |
| }, | |
| { | |
| "epoch": 0.00057, | |
| "grad_norm": 0.5683192610740662, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0577, | |
| "step": 57000 | |
| }, | |
| { | |
| "epoch": 0.000571, | |
| "grad_norm": 0.6625038385391235, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0581, | |
| "step": 57100 | |
| }, | |
| { | |
| "epoch": 0.000572, | |
| "grad_norm": 0.40177464485168457, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0586, | |
| "step": 57200 | |
| }, | |
| { | |
| "epoch": 0.000573, | |
| "grad_norm": 0.6952741742134094, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0584, | |
| "step": 57300 | |
| }, | |
| { | |
| "epoch": 0.000574, | |
| "grad_norm": 0.6179869771003723, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0589, | |
| "step": 57400 | |
| }, | |
| { | |
| "epoch": 0.000575, | |
| "grad_norm": 0.5745118260383606, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0578, | |
| "step": 57500 | |
| }, | |
| { | |
| "epoch": 0.000576, | |
| "grad_norm": 0.4852728843688965, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0584, | |
| "step": 57600 | |
| }, | |
| { | |
| "epoch": 0.000577, | |
| "grad_norm": 0.6206620335578918, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0583, | |
| "step": 57700 | |
| }, | |
| { | |
| "epoch": 0.000578, | |
| "grad_norm": 0.6402736306190491, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0574, | |
| "step": 57800 | |
| }, | |
| { | |
| "epoch": 0.000579, | |
| "grad_norm": 0.5858712792396545, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0582, | |
| "step": 57900 | |
| }, | |
| { | |
| "epoch": 0.00058, | |
| "grad_norm": 0.5614802837371826, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0586, | |
| "step": 58000 | |
| }, | |
| { | |
| "epoch": 0.000581, | |
| "grad_norm": 0.6376156210899353, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0574, | |
| "step": 58100 | |
| }, | |
| { | |
| "epoch": 0.000582, | |
| "grad_norm": 0.5398702621459961, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0567, | |
| "step": 58200 | |
| }, | |
| { | |
| "epoch": 0.000583, | |
| "grad_norm": 0.6560328602790833, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0586, | |
| "step": 58300 | |
| }, | |
| { | |
| "epoch": 0.000584, | |
| "grad_norm": 0.48175305128097534, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0579, | |
| "step": 58400 | |
| }, | |
| { | |
| "epoch": 0.000585, | |
| "grad_norm": 0.47494786977767944, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0565, | |
| "step": 58500 | |
| }, | |
| { | |
| "epoch": 0.000586, | |
| "grad_norm": 0.6271668672561646, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0572, | |
| "step": 58600 | |
| }, | |
| { | |
| "epoch": 0.000587, | |
| "grad_norm": 0.5039101243019104, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0578, | |
| "step": 58700 | |
| }, | |
| { | |
| "epoch": 0.000588, | |
| "grad_norm": 0.5363636612892151, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0578, | |
| "step": 58800 | |
| }, | |
| { | |
| "epoch": 0.000589, | |
| "grad_norm": 0.6029368042945862, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0567, | |
| "step": 58900 | |
| }, | |
| { | |
| "epoch": 0.00059, | |
| "grad_norm": 0.5582793354988098, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0579, | |
| "step": 59000 | |
| }, | |
| { | |
| "epoch": 0.000591, | |
| "grad_norm": 0.5290389657020569, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0575, | |
| "step": 59100 | |
| }, | |
| { | |
| "epoch": 0.000592, | |
| "grad_norm": 0.5864163041114807, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0565, | |
| "step": 59200 | |
| }, | |
| { | |
| "epoch": 0.000593, | |
| "grad_norm": 0.49124574661254883, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0584, | |
| "step": 59300 | |
| }, | |
| { | |
| "epoch": 0.000594, | |
| "grad_norm": 0.5180615782737732, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0555, | |
| "step": 59400 | |
| }, | |
| { | |
| "epoch": 0.000595, | |
| "grad_norm": 0.5236871838569641, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0574, | |
| "step": 59500 | |
| }, | |
| { | |
| "epoch": 0.000596, | |
| "grad_norm": 0.7328921556472778, | |
| "learning_rate": 1e-05, | |
| "loss": 0.057, | |
| "step": 59600 | |
| }, | |
| { | |
| "epoch": 0.000597, | |
| "grad_norm": 0.5635091662406921, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0559, | |
| "step": 59700 | |
| }, | |
| { | |
| "epoch": 0.000598, | |
| "grad_norm": 0.5094209313392639, | |
| "learning_rate": 1e-05, | |
| "loss": 0.057, | |
| "step": 59800 | |
| }, | |
| { | |
| "epoch": 0.000599, | |
| "grad_norm": 0.5855716466903687, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0566, | |
| "step": 59900 | |
| }, | |
| { | |
| "epoch": 0.0006, | |
| "grad_norm": 0.6821003556251526, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0559, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.0006, | |
| "eval_loss": 0.045257568359375, | |
| "eval_runtime": 142.735, | |
| "eval_samples_per_second": 350.3, | |
| "eval_steps_per_second": 21.894, | |
| "step": 60000 | |
| }, | |
| { | |
| "epoch": 0.000601, | |
| "grad_norm": 0.5633527040481567, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0562, | |
| "step": 60100 | |
| }, | |
| { | |
| "epoch": 0.000602, | |
| "grad_norm": 0.5337314009666443, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0562, | |
| "step": 60200 | |
| }, | |
| { | |
| "epoch": 0.000603, | |
| "grad_norm": 0.5282440185546875, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0549, | |
| "step": 60300 | |
| }, | |
| { | |
| "epoch": 0.000604, | |
| "grad_norm": 0.5766568779945374, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0576, | |
| "step": 60400 | |
| }, | |
| { | |
| "epoch": 0.000605, | |
| "grad_norm": 0.5904074311256409, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0563, | |
| "step": 60500 | |
| }, | |
| { | |
| "epoch": 0.000606, | |
| "grad_norm": 0.6538689136505127, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0566, | |
| "step": 60600 | |
| }, | |
| { | |
| "epoch": 0.000607, | |
| "grad_norm": 0.45561283826828003, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0561, | |
| "step": 60700 | |
| }, | |
| { | |
| "epoch": 0.000608, | |
| "grad_norm": 0.47445598244667053, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0561, | |
| "step": 60800 | |
| }, | |
| { | |
| "epoch": 0.000609, | |
| "grad_norm": 0.7631045579910278, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0556, | |
| "step": 60900 | |
| }, | |
| { | |
| "epoch": 0.00061, | |
| "grad_norm": 0.5754849910736084, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0553, | |
| "step": 61000 | |
| }, | |
| { | |
| "epoch": 0.000611, | |
| "grad_norm": 0.6670407652854919, | |
| "learning_rate": 1e-05, | |
| "loss": 0.057, | |
| "step": 61100 | |
| }, | |
| { | |
| "epoch": 0.000612, | |
| "grad_norm": 0.5728887319564819, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0566, | |
| "step": 61200 | |
| }, | |
| { | |
| "epoch": 0.000613, | |
| "grad_norm": 0.5342495441436768, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0552, | |
| "step": 61300 | |
| }, | |
| { | |
| "epoch": 0.000614, | |
| "grad_norm": 0.5812315344810486, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0556, | |
| "step": 61400 | |
| }, | |
| { | |
| "epoch": 0.000615, | |
| "grad_norm": 0.5818805694580078, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0551, | |
| "step": 61500 | |
| }, | |
| { | |
| "epoch": 0.000616, | |
| "grad_norm": 0.6204677224159241, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0556, | |
| "step": 61600 | |
| }, | |
| { | |
| "epoch": 0.000617, | |
| "grad_norm": 0.5443527102470398, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0552, | |
| "step": 61700 | |
| }, | |
| { | |
| "epoch": 0.000618, | |
| "grad_norm": 0.49102166295051575, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0549, | |
| "step": 61800 | |
| }, | |
| { | |
| "epoch": 0.000619, | |
| "grad_norm": 0.557538628578186, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0543, | |
| "step": 61900 | |
| }, | |
| { | |
| "epoch": 0.00062, | |
| "grad_norm": 0.620365560054779, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0561, | |
| "step": 62000 | |
| }, | |
| { | |
| "epoch": 0.000621, | |
| "grad_norm": 0.6253044009208679, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0557, | |
| "step": 62100 | |
| }, | |
| { | |
| "epoch": 0.000622, | |
| "grad_norm": 0.7837327122688293, | |
| "learning_rate": 1e-05, | |
| "loss": 0.055, | |
| "step": 62200 | |
| }, | |
| { | |
| "epoch": 0.000623, | |
| "grad_norm": 0.5085681676864624, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0552, | |
| "step": 62300 | |
| }, | |
| { | |
| "epoch": 0.000624, | |
| "grad_norm": 0.608761191368103, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0532, | |
| "step": 62400 | |
| }, | |
| { | |
| "epoch": 0.000625, | |
| "grad_norm": 0.7588841915130615, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0552, | |
| "step": 62500 | |
| }, | |
| { | |
| "epoch": 0.000626, | |
| "grad_norm": 0.5510600209236145, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0551, | |
| "step": 62600 | |
| }, | |
| { | |
| "epoch": 0.000627, | |
| "grad_norm": 0.5801370739936829, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0542, | |
| "step": 62700 | |
| }, | |
| { | |
| "epoch": 0.000628, | |
| "grad_norm": 0.6703765988349915, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0569, | |
| "step": 62800 | |
| }, | |
| { | |
| "epoch": 0.000629, | |
| "grad_norm": 0.4344656467437744, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0549, | |
| "step": 62900 | |
| }, | |
| { | |
| "epoch": 0.00063, | |
| "grad_norm": 0.5678920745849609, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0555, | |
| "step": 63000 | |
| }, | |
| { | |
| "epoch": 0.000631, | |
| "grad_norm": 0.5048655271530151, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0547, | |
| "step": 63100 | |
| }, | |
| { | |
| "epoch": 0.000632, | |
| "grad_norm": 0.5324554443359375, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0551, | |
| "step": 63200 | |
| }, | |
| { | |
| "epoch": 0.000633, | |
| "grad_norm": 0.5735768675804138, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0551, | |
| "step": 63300 | |
| }, | |
| { | |
| "epoch": 0.000634, | |
| "grad_norm": 0.5694500803947449, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0542, | |
| "step": 63400 | |
| }, | |
| { | |
| "epoch": 0.000635, | |
| "grad_norm": 0.5009059906005859, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0538, | |
| "step": 63500 | |
| }, | |
| { | |
| "epoch": 0.000636, | |
| "grad_norm": 0.5886440277099609, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0545, | |
| "step": 63600 | |
| }, | |
| { | |
| "epoch": 0.000637, | |
| "grad_norm": 0.5673546195030212, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0548, | |
| "step": 63700 | |
| }, | |
| { | |
| "epoch": 0.000638, | |
| "grad_norm": 0.5466011762619019, | |
| "learning_rate": 1e-05, | |
| "loss": 0.054, | |
| "step": 63800 | |
| }, | |
| { | |
| "epoch": 0.000639, | |
| "grad_norm": 0.5927892923355103, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0548, | |
| "step": 63900 | |
| }, | |
| { | |
| "epoch": 0.00064, | |
| "grad_norm": 0.7305207252502441, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0536, | |
| "step": 64000 | |
| }, | |
| { | |
| "epoch": 0.000641, | |
| "grad_norm": 0.5603034496307373, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0536, | |
| "step": 64100 | |
| }, | |
| { | |
| "epoch": 0.000642, | |
| "grad_norm": 0.6965247988700867, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0546, | |
| "step": 64200 | |
| }, | |
| { | |
| "epoch": 0.000643, | |
| "grad_norm": 0.57351154088974, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0535, | |
| "step": 64300 | |
| }, | |
| { | |
| "epoch": 0.000644, | |
| "grad_norm": 0.511005163192749, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0548, | |
| "step": 64400 | |
| }, | |
| { | |
| "epoch": 0.000645, | |
| "grad_norm": 0.5340495705604553, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0543, | |
| "step": 64500 | |
| }, | |
| { | |
| "epoch": 0.000646, | |
| "grad_norm": 0.6858961582183838, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0538, | |
| "step": 64600 | |
| }, | |
| { | |
| "epoch": 0.000647, | |
| "grad_norm": 0.6375705599784851, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0536, | |
| "step": 64700 | |
| }, | |
| { | |
| "epoch": 0.000648, | |
| "grad_norm": 0.48544806241989136, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0529, | |
| "step": 64800 | |
| }, | |
| { | |
| "epoch": 0.000649, | |
| "grad_norm": 0.49595892429351807, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0529, | |
| "step": 64900 | |
| }, | |
| { | |
| "epoch": 0.00065, | |
| "grad_norm": 0.4976153075695038, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0538, | |
| "step": 65000 | |
| }, | |
| { | |
| "epoch": 0.000651, | |
| "grad_norm": 0.5489813089370728, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0528, | |
| "step": 65100 | |
| }, | |
| { | |
| "epoch": 0.000652, | |
| "grad_norm": 0.4820660650730133, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0536, | |
| "step": 65200 | |
| }, | |
| { | |
| "epoch": 0.000653, | |
| "grad_norm": 0.5546014308929443, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0529, | |
| "step": 65300 | |
| }, | |
| { | |
| "epoch": 0.000654, | |
| "grad_norm": 0.4900113344192505, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0535, | |
| "step": 65400 | |
| }, | |
| { | |
| "epoch": 0.000655, | |
| "grad_norm": 0.6061577796936035, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0533, | |
| "step": 65500 | |
| }, | |
| { | |
| "epoch": 0.000656, | |
| "grad_norm": 0.6450973749160767, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0542, | |
| "step": 65600 | |
| }, | |
| { | |
| "epoch": 0.000657, | |
| "grad_norm": 0.677505612373352, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0539, | |
| "step": 65700 | |
| }, | |
| { | |
| "epoch": 0.000658, | |
| "grad_norm": 0.48482370376586914, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0533, | |
| "step": 65800 | |
| }, | |
| { | |
| "epoch": 0.000659, | |
| "grad_norm": 0.49198102951049805, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0527, | |
| "step": 65900 | |
| }, | |
| { | |
| "epoch": 0.00066, | |
| "grad_norm": 0.47996985912323, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0544, | |
| "step": 66000 | |
| }, | |
| { | |
| "epoch": 0.000661, | |
| "grad_norm": 0.548791229724884, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0534, | |
| "step": 66100 | |
| }, | |
| { | |
| "epoch": 0.000662, | |
| "grad_norm": 0.6156114935874939, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0538, | |
| "step": 66200 | |
| }, | |
| { | |
| "epoch": 0.000663, | |
| "grad_norm": 0.5212823748588562, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0534, | |
| "step": 66300 | |
| }, | |
| { | |
| "epoch": 0.000664, | |
| "grad_norm": 0.5812687873840332, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0527, | |
| "step": 66400 | |
| }, | |
| { | |
| "epoch": 0.000665, | |
| "grad_norm": 0.4992978572845459, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0532, | |
| "step": 66500 | |
| }, | |
| { | |
| "epoch": 0.000666, | |
| "grad_norm": 0.5525248050689697, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0533, | |
| "step": 66600 | |
| }, | |
| { | |
| "epoch": 0.000667, | |
| "grad_norm": 0.6456683874130249, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0535, | |
| "step": 66700 | |
| }, | |
| { | |
| "epoch": 0.000668, | |
| "grad_norm": 0.6112907528877258, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0532, | |
| "step": 66800 | |
| }, | |
| { | |
| "epoch": 0.000669, | |
| "grad_norm": 0.543624222278595, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0539, | |
| "step": 66900 | |
| }, | |
| { | |
| "epoch": 0.00067, | |
| "grad_norm": 0.5512799024581909, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0513, | |
| "step": 67000 | |
| }, | |
| { | |
| "epoch": 0.000671, | |
| "grad_norm": 0.631289005279541, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0519, | |
| "step": 67100 | |
| }, | |
| { | |
| "epoch": 0.000672, | |
| "grad_norm": 0.47048887610435486, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0532, | |
| "step": 67200 | |
| }, | |
| { | |
| "epoch": 0.000673, | |
| "grad_norm": 0.5930091142654419, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0528, | |
| "step": 67300 | |
| }, | |
| { | |
| "epoch": 0.000674, | |
| "grad_norm": 0.7611256837844849, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0527, | |
| "step": 67400 | |
| }, | |
| { | |
| "epoch": 0.000675, | |
| "grad_norm": 0.49624642729759216, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0528, | |
| "step": 67500 | |
| }, | |
| { | |
| "epoch": 0.000676, | |
| "grad_norm": 0.6547495126724243, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0523, | |
| "step": 67600 | |
| }, | |
| { | |
| "epoch": 0.000677, | |
| "grad_norm": 0.635519802570343, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0521, | |
| "step": 67700 | |
| }, | |
| { | |
| "epoch": 0.000678, | |
| "grad_norm": 0.606388509273529, | |
| "learning_rate": 1e-05, | |
| "loss": 0.052, | |
| "step": 67800 | |
| }, | |
| { | |
| "epoch": 0.000679, | |
| "grad_norm": 0.4945245385169983, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0522, | |
| "step": 67900 | |
| }, | |
| { | |
| "epoch": 0.00068, | |
| "grad_norm": 0.4815261662006378, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0533, | |
| "step": 68000 | |
| }, | |
| { | |
| "epoch": 0.000681, | |
| "grad_norm": 0.47382187843322754, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0519, | |
| "step": 68100 | |
| }, | |
| { | |
| "epoch": 0.000682, | |
| "grad_norm": 0.549886167049408, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0518, | |
| "step": 68200 | |
| }, | |
| { | |
| "epoch": 0.000683, | |
| "grad_norm": 0.5204160213470459, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0519, | |
| "step": 68300 | |
| }, | |
| { | |
| "epoch": 0.000684, | |
| "grad_norm": 0.5802004933357239, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0517, | |
| "step": 68400 | |
| }, | |
| { | |
| "epoch": 0.000685, | |
| "grad_norm": 0.5576998591423035, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0519, | |
| "step": 68500 | |
| }, | |
| { | |
| "epoch": 0.000686, | |
| "grad_norm": 0.5708860158920288, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0523, | |
| "step": 68600 | |
| }, | |
| { | |
| "epoch": 0.000687, | |
| "grad_norm": 0.6270045042037964, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0502, | |
| "step": 68700 | |
| }, | |
| { | |
| "epoch": 0.000688, | |
| "grad_norm": 0.462593138217926, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0517, | |
| "step": 68800 | |
| }, | |
| { | |
| "epoch": 0.000689, | |
| "grad_norm": 0.4807493984699249, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0524, | |
| "step": 68900 | |
| }, | |
| { | |
| "epoch": 0.00069, | |
| "grad_norm": 0.5798048973083496, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0527, | |
| "step": 69000 | |
| }, | |
| { | |
| "epoch": 0.000691, | |
| "grad_norm": 0.44622689485549927, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0528, | |
| "step": 69100 | |
| }, | |
| { | |
| "epoch": 0.000692, | |
| "grad_norm": 0.5129225254058838, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0528, | |
| "step": 69200 | |
| }, | |
| { | |
| "epoch": 0.000693, | |
| "grad_norm": 0.5368632674217224, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0524, | |
| "step": 69300 | |
| }, | |
| { | |
| "epoch": 0.000694, | |
| "grad_norm": 0.559655487537384, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0525, | |
| "step": 69400 | |
| }, | |
| { | |
| "epoch": 0.000695, | |
| "grad_norm": 0.6121320128440857, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0507, | |
| "step": 69500 | |
| }, | |
| { | |
| "epoch": 0.000696, | |
| "grad_norm": 0.5470311045646667, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0511, | |
| "step": 69600 | |
| }, | |
| { | |
| "epoch": 0.000697, | |
| "grad_norm": 0.5142286419868469, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0516, | |
| "step": 69700 | |
| }, | |
| { | |
| "epoch": 0.000698, | |
| "grad_norm": 0.6724265217781067, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0517, | |
| "step": 69800 | |
| }, | |
| { | |
| "epoch": 0.000699, | |
| "grad_norm": 0.4707196354866028, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0511, | |
| "step": 69900 | |
| }, | |
| { | |
| "epoch": 0.0007, | |
| "grad_norm": 0.616026759147644, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0517, | |
| "step": 70000 | |
| }, | |
| { | |
| "epoch": 0.000701, | |
| "grad_norm": 0.5991165041923523, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0512, | |
| "step": 70100 | |
| }, | |
| { | |
| "epoch": 0.000702, | |
| "grad_norm": 0.5611563324928284, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0509, | |
| "step": 70200 | |
| }, | |
| { | |
| "epoch": 0.000703, | |
| "grad_norm": 0.46492424607276917, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0511, | |
| "step": 70300 | |
| }, | |
| { | |
| "epoch": 0.000704, | |
| "grad_norm": 0.5256513357162476, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0518, | |
| "step": 70400 | |
| }, | |
| { | |
| "epoch": 0.000705, | |
| "grad_norm": 0.499254435300827, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0501, | |
| "step": 70500 | |
| }, | |
| { | |
| "epoch": 0.000706, | |
| "grad_norm": 0.5403403043746948, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0509, | |
| "step": 70600 | |
| }, | |
| { | |
| "epoch": 0.000707, | |
| "grad_norm": 0.6283129453659058, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0519, | |
| "step": 70700 | |
| }, | |
| { | |
| "epoch": 0.000708, | |
| "grad_norm": 0.5229069590568542, | |
| "learning_rate": 1e-05, | |
| "loss": 0.051, | |
| "step": 70800 | |
| }, | |
| { | |
| "epoch": 0.000709, | |
| "grad_norm": 0.48306700587272644, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0504, | |
| "step": 70900 | |
| }, | |
| { | |
| "epoch": 0.00071, | |
| "grad_norm": 0.5926072597503662, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0506, | |
| "step": 71000 | |
| }, | |
| { | |
| "epoch": 0.000711, | |
| "grad_norm": 0.5640701651573181, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0506, | |
| "step": 71100 | |
| }, | |
| { | |
| "epoch": 0.000712, | |
| "grad_norm": 0.49134358763694763, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0512, | |
| "step": 71200 | |
| }, | |
| { | |
| "epoch": 0.000713, | |
| "grad_norm": 0.4878164231777191, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0512, | |
| "step": 71300 | |
| }, | |
| { | |
| "epoch": 0.000714, | |
| "grad_norm": 0.6183532476425171, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0508, | |
| "step": 71400 | |
| }, | |
| { | |
| "epoch": 0.000715, | |
| "grad_norm": 0.5065814852714539, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0505, | |
| "step": 71500 | |
| }, | |
| { | |
| "epoch": 0.000716, | |
| "grad_norm": 0.548599898815155, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0502, | |
| "step": 71600 | |
| }, | |
| { | |
| "epoch": 0.000717, | |
| "grad_norm": 0.4534250795841217, | |
| "learning_rate": 1e-05, | |
| "loss": 0.05, | |
| "step": 71700 | |
| }, | |
| { | |
| "epoch": 0.000718, | |
| "grad_norm": 0.5044461488723755, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0502, | |
| "step": 71800 | |
| }, | |
| { | |
| "epoch": 0.000719, | |
| "grad_norm": 0.5321183204650879, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0498, | |
| "step": 71900 | |
| }, | |
| { | |
| "epoch": 0.00072, | |
| "grad_norm": 0.4777474105358124, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0503, | |
| "step": 72000 | |
| }, | |
| { | |
| "epoch": 0.000721, | |
| "grad_norm": 0.6466835141181946, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0507, | |
| "step": 72100 | |
| }, | |
| { | |
| "epoch": 0.000722, | |
| "grad_norm": 0.5359812378883362, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0506, | |
| "step": 72200 | |
| }, | |
| { | |
| "epoch": 0.000723, | |
| "grad_norm": 0.4923792779445648, | |
| "learning_rate": 1e-05, | |
| "loss": 0.05, | |
| "step": 72300 | |
| }, | |
| { | |
| "epoch": 0.000724, | |
| "grad_norm": 0.5708417892456055, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0511, | |
| "step": 72400 | |
| }, | |
| { | |
| "epoch": 0.000725, | |
| "grad_norm": 0.5016763806343079, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0509, | |
| "step": 72500 | |
| }, | |
| { | |
| "epoch": 0.000726, | |
| "grad_norm": 0.4299620985984802, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0504, | |
| "step": 72600 | |
| }, | |
| { | |
| "epoch": 0.000727, | |
| "grad_norm": 0.387928307056427, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0493, | |
| "step": 72700 | |
| }, | |
| { | |
| "epoch": 0.000728, | |
| "grad_norm": 0.5286259651184082, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0508, | |
| "step": 72800 | |
| }, | |
| { | |
| "epoch": 0.000729, | |
| "grad_norm": 0.511677622795105, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0503, | |
| "step": 72900 | |
| }, | |
| { | |
| "epoch": 0.00073, | |
| "grad_norm": 0.4648519456386566, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0494, | |
| "step": 73000 | |
| }, | |
| { | |
| "epoch": 0.000731, | |
| "grad_norm": 0.4918229877948761, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0496, | |
| "step": 73100 | |
| }, | |
| { | |
| "epoch": 0.000732, | |
| "grad_norm": 0.49148622155189514, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0494, | |
| "step": 73200 | |
| }, | |
| { | |
| "epoch": 0.000733, | |
| "grad_norm": 0.5078290104866028, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0495, | |
| "step": 73300 | |
| }, | |
| { | |
| "epoch": 0.000734, | |
| "grad_norm": 0.591152012348175, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0506, | |
| "step": 73400 | |
| }, | |
| { | |
| "epoch": 0.000735, | |
| "grad_norm": 0.5350937843322754, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0499, | |
| "step": 73500 | |
| }, | |
| { | |
| "epoch": 0.000736, | |
| "grad_norm": 0.4960618019104004, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0495, | |
| "step": 73600 | |
| }, | |
| { | |
| "epoch": 0.000737, | |
| "grad_norm": 0.46348682045936584, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0493, | |
| "step": 73700 | |
| }, | |
| { | |
| "epoch": 0.000738, | |
| "grad_norm": 0.6859008073806763, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0506, | |
| "step": 73800 | |
| }, | |
| { | |
| "epoch": 0.000739, | |
| "grad_norm": 0.5936481952667236, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0504, | |
| "step": 73900 | |
| }, | |
| { | |
| "epoch": 0.00074, | |
| "grad_norm": 0.6398313045501709, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0498, | |
| "step": 74000 | |
| }, | |
| { | |
| "epoch": 0.000741, | |
| "grad_norm": 0.6062189936637878, | |
| "learning_rate": 1e-05, | |
| "loss": 0.05, | |
| "step": 74100 | |
| }, | |
| { | |
| "epoch": 0.000742, | |
| "grad_norm": 0.5730705261230469, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0498, | |
| "step": 74200 | |
| }, | |
| { | |
| "epoch": 0.000743, | |
| "grad_norm": 0.5183285474777222, | |
| "learning_rate": 1e-05, | |
| "loss": 0.05, | |
| "step": 74300 | |
| }, | |
| { | |
| "epoch": 0.000744, | |
| "grad_norm": 0.4582626521587372, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0493, | |
| "step": 74400 | |
| }, | |
| { | |
| "epoch": 0.000745, | |
| "grad_norm": 0.4545513987541199, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0497, | |
| "step": 74500 | |
| }, | |
| { | |
| "epoch": 0.000746, | |
| "grad_norm": 0.6823522448539734, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0494, | |
| "step": 74600 | |
| }, | |
| { | |
| "epoch": 0.000747, | |
| "grad_norm": 0.5017057061195374, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0498, | |
| "step": 74700 | |
| }, | |
| { | |
| "epoch": 0.000748, | |
| "grad_norm": 0.4436599910259247, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0507, | |
| "step": 74800 | |
| }, | |
| { | |
| "epoch": 0.000749, | |
| "grad_norm": 0.5471747517585754, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0491, | |
| "step": 74900 | |
| }, | |
| { | |
| "epoch": 0.00075, | |
| "grad_norm": 0.4700005352497101, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0493, | |
| "step": 75000 | |
| }, | |
| { | |
| "epoch": 0.000751, | |
| "grad_norm": 0.5744854211807251, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0494, | |
| "step": 75100 | |
| }, | |
| { | |
| "epoch": 0.000752, | |
| "grad_norm": 0.4908376634120941, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0493, | |
| "step": 75200 | |
| }, | |
| { | |
| "epoch": 0.000753, | |
| "grad_norm": 0.5889230966567993, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0497, | |
| "step": 75300 | |
| }, | |
| { | |
| "epoch": 0.000754, | |
| "grad_norm": 0.5542328953742981, | |
| "learning_rate": 1e-05, | |
| "loss": 0.049, | |
| "step": 75400 | |
| }, | |
| { | |
| "epoch": 0.000755, | |
| "grad_norm": 0.567498505115509, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0487, | |
| "step": 75500 | |
| }, | |
| { | |
| "epoch": 0.000756, | |
| "grad_norm": 0.4234246611595154, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0494, | |
| "step": 75600 | |
| }, | |
| { | |
| "epoch": 0.000757, | |
| "grad_norm": 0.7256674766540527, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0496, | |
| "step": 75700 | |
| }, | |
| { | |
| "epoch": 0.000758, | |
| "grad_norm": 0.6111962795257568, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0494, | |
| "step": 75800 | |
| }, | |
| { | |
| "epoch": 0.000759, | |
| "grad_norm": 0.5681432485580444, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0483, | |
| "step": 75900 | |
| }, | |
| { | |
| "epoch": 0.00076, | |
| "grad_norm": 0.44954606890678406, | |
| "learning_rate": 1e-05, | |
| "loss": 0.049, | |
| "step": 76000 | |
| }, | |
| { | |
| "epoch": 0.000761, | |
| "grad_norm": 0.5693077445030212, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0485, | |
| "step": 76100 | |
| }, | |
| { | |
| "epoch": 0.000762, | |
| "grad_norm": 0.47221890091896057, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0485, | |
| "step": 76200 | |
| }, | |
| { | |
| "epoch": 0.000763, | |
| "grad_norm": 0.5012596249580383, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0488, | |
| "step": 76300 | |
| }, | |
| { | |
| "epoch": 0.000764, | |
| "grad_norm": 0.5051250457763672, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0492, | |
| "step": 76400 | |
| }, | |
| { | |
| "epoch": 0.000765, | |
| "grad_norm": 0.45128434896469116, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0483, | |
| "step": 76500 | |
| }, | |
| { | |
| "epoch": 0.000766, | |
| "grad_norm": 0.48324739933013916, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0482, | |
| "step": 76600 | |
| }, | |
| { | |
| "epoch": 0.000767, | |
| "grad_norm": 0.6752970814704895, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0487, | |
| "step": 76700 | |
| }, | |
| { | |
| "epoch": 0.000768, | |
| "grad_norm": 0.4630663990974426, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0501, | |
| "step": 76800 | |
| }, | |
| { | |
| "epoch": 0.000769, | |
| "grad_norm": 0.4887773394584656, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0481, | |
| "step": 76900 | |
| }, | |
| { | |
| "epoch": 0.00077, | |
| "grad_norm": 0.4609774947166443, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0486, | |
| "step": 77000 | |
| }, | |
| { | |
| "epoch": 0.000771, | |
| "grad_norm": 0.6502612233161926, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0495, | |
| "step": 77100 | |
| }, | |
| { | |
| "epoch": 0.000772, | |
| "grad_norm": 0.563583254814148, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0493, | |
| "step": 77200 | |
| }, | |
| { | |
| "epoch": 0.000773, | |
| "grad_norm": 0.5242981314659119, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0485, | |
| "step": 77300 | |
| }, | |
| { | |
| "epoch": 0.000774, | |
| "grad_norm": 0.5238550901412964, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0482, | |
| "step": 77400 | |
| }, | |
| { | |
| "epoch": 0.000775, | |
| "grad_norm": 0.38637349009513855, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0482, | |
| "step": 77500 | |
| }, | |
| { | |
| "epoch": 0.000776, | |
| "grad_norm": 0.5395223498344421, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0482, | |
| "step": 77600 | |
| }, | |
| { | |
| "epoch": 0.000777, | |
| "grad_norm": 0.5965639352798462, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0482, | |
| "step": 77700 | |
| }, | |
| { | |
| "epoch": 0.000778, | |
| "grad_norm": 0.4685559868812561, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0474, | |
| "step": 77800 | |
| }, | |
| { | |
| "epoch": 0.000779, | |
| "grad_norm": 0.46465954184532166, | |
| "learning_rate": 1e-05, | |
| "loss": 0.049, | |
| "step": 77900 | |
| }, | |
| { | |
| "epoch": 0.00078, | |
| "grad_norm": 0.5408352017402649, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0487, | |
| "step": 78000 | |
| }, | |
| { | |
| "epoch": 0.000781, | |
| "grad_norm": 0.3893685042858124, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0479, | |
| "step": 78100 | |
| }, | |
| { | |
| "epoch": 0.000782, | |
| "grad_norm": 0.6658462285995483, | |
| "learning_rate": 1e-05, | |
| "loss": 0.048, | |
| "step": 78200 | |
| }, | |
| { | |
| "epoch": 0.000783, | |
| "grad_norm": 0.6283921003341675, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0488, | |
| "step": 78300 | |
| }, | |
| { | |
| "epoch": 0.000784, | |
| "grad_norm": 0.4658546447753906, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0486, | |
| "step": 78400 | |
| }, | |
| { | |
| "epoch": 0.000785, | |
| "grad_norm": 0.5362129807472229, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0488, | |
| "step": 78500 | |
| }, | |
| { | |
| "epoch": 0.000786, | |
| "grad_norm": 0.5157918334007263, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0482, | |
| "step": 78600 | |
| }, | |
| { | |
| "epoch": 0.000787, | |
| "grad_norm": 0.5089668035507202, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0485, | |
| "step": 78700 | |
| }, | |
| { | |
| "epoch": 0.000788, | |
| "grad_norm": 0.49590611457824707, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0476, | |
| "step": 78800 | |
| }, | |
| { | |
| "epoch": 0.000789, | |
| "grad_norm": 0.4500684440135956, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0482, | |
| "step": 78900 | |
| }, | |
| { | |
| "epoch": 0.00079, | |
| "grad_norm": 0.4456005096435547, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0479, | |
| "step": 79000 | |
| }, | |
| { | |
| "epoch": 0.000791, | |
| "grad_norm": 0.502184271812439, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0483, | |
| "step": 79100 | |
| }, | |
| { | |
| "epoch": 0.000792, | |
| "grad_norm": 0.4004657566547394, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0483, | |
| "step": 79200 | |
| }, | |
| { | |
| "epoch": 0.000793, | |
| "grad_norm": 0.6616214513778687, | |
| "learning_rate": 1e-05, | |
| "loss": 0.048, | |
| "step": 79300 | |
| }, | |
| { | |
| "epoch": 0.000794, | |
| "grad_norm": 0.5488511323928833, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0473, | |
| "step": 79400 | |
| }, | |
| { | |
| "epoch": 0.000795, | |
| "grad_norm": 0.5251606702804565, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0489, | |
| "step": 79500 | |
| }, | |
| { | |
| "epoch": 0.000796, | |
| "grad_norm": 0.43220826983451843, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0469, | |
| "step": 79600 | |
| }, | |
| { | |
| "epoch": 0.000797, | |
| "grad_norm": 0.5535863041877747, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0487, | |
| "step": 79700 | |
| }, | |
| { | |
| "epoch": 0.000798, | |
| "grad_norm": 0.4892144799232483, | |
| "learning_rate": 1e-05, | |
| "loss": 0.048, | |
| "step": 79800 | |
| }, | |
| { | |
| "epoch": 0.000799, | |
| "grad_norm": 0.443042516708374, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0484, | |
| "step": 79900 | |
| }, | |
| { | |
| "epoch": 0.0008, | |
| "grad_norm": 0.4258803725242615, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0476, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.0008, | |
| "eval_loss": 0.0394287109375, | |
| "eval_runtime": 147.8199, | |
| "eval_samples_per_second": 338.249, | |
| "eval_steps_per_second": 21.141, | |
| "step": 80000 | |
| }, | |
| { | |
| "epoch": 0.000801, | |
| "grad_norm": 0.5370935201644897, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0478, | |
| "step": 80100 | |
| }, | |
| { | |
| "epoch": 0.000802, | |
| "grad_norm": 0.5561772584915161, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0487, | |
| "step": 80200 | |
| }, | |
| { | |
| "epoch": 0.000803, | |
| "grad_norm": 0.5092744827270508, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0476, | |
| "step": 80300 | |
| }, | |
| { | |
| "epoch": 0.000804, | |
| "grad_norm": 0.4691084623336792, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0473, | |
| "step": 80400 | |
| }, | |
| { | |
| "epoch": 0.000805, | |
| "grad_norm": 0.5660099387168884, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0475, | |
| "step": 80500 | |
| }, | |
| { | |
| "epoch": 0.000806, | |
| "grad_norm": 0.5250957012176514, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0471, | |
| "step": 80600 | |
| }, | |
| { | |
| "epoch": 0.000807, | |
| "grad_norm": 0.5492421388626099, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0481, | |
| "step": 80700 | |
| }, | |
| { | |
| "epoch": 0.000808, | |
| "grad_norm": 0.7874831557273865, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0475, | |
| "step": 80800 | |
| }, | |
| { | |
| "epoch": 0.000809, | |
| "grad_norm": 0.6476261615753174, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0477, | |
| "step": 80900 | |
| }, | |
| { | |
| "epoch": 0.00081, | |
| "grad_norm": 0.557145357131958, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0477, | |
| "step": 81000 | |
| }, | |
| { | |
| "epoch": 0.000811, | |
| "grad_norm": 0.5536689758300781, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0475, | |
| "step": 81100 | |
| }, | |
| { | |
| "epoch": 0.000812, | |
| "grad_norm": 0.5005760788917542, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0472, | |
| "step": 81200 | |
| }, | |
| { | |
| "epoch": 0.000813, | |
| "grad_norm": 0.43560323119163513, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0473, | |
| "step": 81300 | |
| }, | |
| { | |
| "epoch": 0.000814, | |
| "grad_norm": 0.49981963634490967, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0468, | |
| "step": 81400 | |
| }, | |
| { | |
| "epoch": 0.000815, | |
| "grad_norm": 0.5209627151489258, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0476, | |
| "step": 81500 | |
| }, | |
| { | |
| "epoch": 0.000816, | |
| "grad_norm": 0.7528536319732666, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0471, | |
| "step": 81600 | |
| }, | |
| { | |
| "epoch": 0.000817, | |
| "grad_norm": 0.6212517023086548, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0476, | |
| "step": 81700 | |
| }, | |
| { | |
| "epoch": 0.000818, | |
| "grad_norm": 0.45106619596481323, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0475, | |
| "step": 81800 | |
| }, | |
| { | |
| "epoch": 0.000819, | |
| "grad_norm": 0.5259119868278503, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0473, | |
| "step": 81900 | |
| }, | |
| { | |
| "epoch": 0.00082, | |
| "grad_norm": 0.4737171232700348, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0478, | |
| "step": 82000 | |
| }, | |
| { | |
| "epoch": 0.000821, | |
| "grad_norm": 0.5119843482971191, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0467, | |
| "step": 82100 | |
| }, | |
| { | |
| "epoch": 0.000822, | |
| "grad_norm": 0.3932953178882599, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0465, | |
| "step": 82200 | |
| }, | |
| { | |
| "epoch": 0.000823, | |
| "grad_norm": 0.43303382396698, | |
| "learning_rate": 1e-05, | |
| "loss": 0.047, | |
| "step": 82300 | |
| }, | |
| { | |
| "epoch": 0.000824, | |
| "grad_norm": 0.5500777363777161, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0461, | |
| "step": 82400 | |
| }, | |
| { | |
| "epoch": 0.000825, | |
| "grad_norm": 0.5227336883544922, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0477, | |
| "step": 82500 | |
| }, | |
| { | |
| "epoch": 0.000826, | |
| "grad_norm": 0.5672751665115356, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0476, | |
| "step": 82600 | |
| }, | |
| { | |
| "epoch": 0.000827, | |
| "grad_norm": 0.5093204975128174, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0468, | |
| "step": 82700 | |
| }, | |
| { | |
| "epoch": 0.000828, | |
| "grad_norm": 0.47309496998786926, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0464, | |
| "step": 82800 | |
| }, | |
| { | |
| "epoch": 0.000829, | |
| "grad_norm": 0.4092000722885132, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0467, | |
| "step": 82900 | |
| }, | |
| { | |
| "epoch": 0.00083, | |
| "grad_norm": 0.42544227838516235, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0455, | |
| "step": 83000 | |
| }, | |
| { | |
| "epoch": 0.000831, | |
| "grad_norm": 0.5713441371917725, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0457, | |
| "step": 83100 | |
| }, | |
| { | |
| "epoch": 0.000832, | |
| "grad_norm": 0.5193179845809937, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0463, | |
| "step": 83200 | |
| }, | |
| { | |
| "epoch": 0.000833, | |
| "grad_norm": 0.43209248781204224, | |
| "learning_rate": 1e-05, | |
| "loss": 0.047, | |
| "step": 83300 | |
| }, | |
| { | |
| "epoch": 0.000834, | |
| "grad_norm": 0.5342600345611572, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0456, | |
| "step": 83400 | |
| }, | |
| { | |
| "epoch": 0.000835, | |
| "grad_norm": 0.592204213142395, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0472, | |
| "step": 83500 | |
| }, | |
| { | |
| "epoch": 0.000836, | |
| "grad_norm": 0.5118575692176819, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0467, | |
| "step": 83600 | |
| }, | |
| { | |
| "epoch": 0.000837, | |
| "grad_norm": 0.4781627058982849, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0463, | |
| "step": 83700 | |
| }, | |
| { | |
| "epoch": 0.000838, | |
| "grad_norm": 0.4500192403793335, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0468, | |
| "step": 83800 | |
| }, | |
| { | |
| "epoch": 0.000839, | |
| "grad_norm": 0.49369123578071594, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0463, | |
| "step": 83900 | |
| }, | |
| { | |
| "epoch": 0.00084, | |
| "grad_norm": 0.48518478870391846, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0466, | |
| "step": 84000 | |
| }, | |
| { | |
| "epoch": 0.000841, | |
| "grad_norm": 0.4960392117500305, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0464, | |
| "step": 84100 | |
| }, | |
| { | |
| "epoch": 0.000842, | |
| "grad_norm": 0.4881882667541504, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0461, | |
| "step": 84200 | |
| }, | |
| { | |
| "epoch": 0.000843, | |
| "grad_norm": 0.45837706327438354, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0462, | |
| "step": 84300 | |
| }, | |
| { | |
| "epoch": 0.000844, | |
| "grad_norm": 0.4866684675216675, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0456, | |
| "step": 84400 | |
| }, | |
| { | |
| "epoch": 0.000845, | |
| "grad_norm": 0.5094208121299744, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0466, | |
| "step": 84500 | |
| }, | |
| { | |
| "epoch": 0.000846, | |
| "grad_norm": 0.45124098658561707, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0474, | |
| "step": 84600 | |
| }, | |
| { | |
| "epoch": 0.000847, | |
| "grad_norm": 0.5730771422386169, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0465, | |
| "step": 84700 | |
| }, | |
| { | |
| "epoch": 0.000848, | |
| "grad_norm": 0.48597007989883423, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0463, | |
| "step": 84800 | |
| }, | |
| { | |
| "epoch": 0.000849, | |
| "grad_norm": 0.46603092551231384, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0465, | |
| "step": 84900 | |
| }, | |
| { | |
| "epoch": 0.00085, | |
| "grad_norm": 0.5534038543701172, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0469, | |
| "step": 85000 | |
| }, | |
| { | |
| "epoch": 0.000851, | |
| "grad_norm": 0.42876607179641724, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0465, | |
| "step": 85100 | |
| }, | |
| { | |
| "epoch": 0.000852, | |
| "grad_norm": 0.39502009749412537, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0466, | |
| "step": 85200 | |
| }, | |
| { | |
| "epoch": 0.000853, | |
| "grad_norm": 0.44408953189849854, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0468, | |
| "step": 85300 | |
| }, | |
| { | |
| "epoch": 0.000854, | |
| "grad_norm": 0.444979190826416, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0463, | |
| "step": 85400 | |
| }, | |
| { | |
| "epoch": 0.000855, | |
| "grad_norm": 0.4805260896682739, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0465, | |
| "step": 85500 | |
| }, | |
| { | |
| "epoch": 0.000856, | |
| "grad_norm": 0.552291750907898, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0462, | |
| "step": 85600 | |
| }, | |
| { | |
| "epoch": 0.000857, | |
| "grad_norm": 0.5068393349647522, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0461, | |
| "step": 85700 | |
| }, | |
| { | |
| "epoch": 0.000858, | |
| "grad_norm": 0.41845035552978516, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0461, | |
| "step": 85800 | |
| }, | |
| { | |
| "epoch": 0.000859, | |
| "grad_norm": 0.4751891493797302, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0458, | |
| "step": 85900 | |
| }, | |
| { | |
| "epoch": 0.00086, | |
| "grad_norm": 0.5280572175979614, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0458, | |
| "step": 86000 | |
| }, | |
| { | |
| "epoch": 0.000861, | |
| "grad_norm": 0.68556147813797, | |
| "learning_rate": 1e-05, | |
| "loss": 0.046, | |
| "step": 86100 | |
| }, | |
| { | |
| "epoch": 0.000862, | |
| "grad_norm": 0.5463889241218567, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0457, | |
| "step": 86200 | |
| }, | |
| { | |
| "epoch": 0.000863, | |
| "grad_norm": 0.44014325737953186, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0459, | |
| "step": 86300 | |
| }, | |
| { | |
| "epoch": 0.000864, | |
| "grad_norm": 0.5454211235046387, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0457, | |
| "step": 86400 | |
| }, | |
| { | |
| "epoch": 0.000865, | |
| "grad_norm": 0.5828255414962769, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0453, | |
| "step": 86500 | |
| }, | |
| { | |
| "epoch": 0.000866, | |
| "grad_norm": 0.4621482789516449, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0456, | |
| "step": 86600 | |
| }, | |
| { | |
| "epoch": 0.000867, | |
| "grad_norm": 0.4085827171802521, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0458, | |
| "step": 86700 | |
| }, | |
| { | |
| "epoch": 0.000868, | |
| "grad_norm": 0.504058301448822, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0459, | |
| "step": 86800 | |
| }, | |
| { | |
| "epoch": 0.000869, | |
| "grad_norm": 0.48852622509002686, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0454, | |
| "step": 86900 | |
| }, | |
| { | |
| "epoch": 0.00087, | |
| "grad_norm": 0.4814854860305786, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0457, | |
| "step": 87000 | |
| }, | |
| { | |
| "epoch": 0.000871, | |
| "grad_norm": 0.40433430671691895, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0463, | |
| "step": 87100 | |
| }, | |
| { | |
| "epoch": 0.000872, | |
| "grad_norm": 0.40531593561172485, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0452, | |
| "step": 87200 | |
| }, | |
| { | |
| "epoch": 0.000873, | |
| "grad_norm": 0.5245575308799744, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0449, | |
| "step": 87300 | |
| }, | |
| { | |
| "epoch": 0.000874, | |
| "grad_norm": 0.39926889538764954, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0459, | |
| "step": 87400 | |
| }, | |
| { | |
| "epoch": 0.000875, | |
| "grad_norm": 0.4549976587295532, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0464, | |
| "step": 87500 | |
| }, | |
| { | |
| "epoch": 0.000876, | |
| "grad_norm": 0.4379943013191223, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0458, | |
| "step": 87600 | |
| }, | |
| { | |
| "epoch": 0.000877, | |
| "grad_norm": 0.5028941035270691, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0462, | |
| "step": 87700 | |
| }, | |
| { | |
| "epoch": 0.000878, | |
| "grad_norm": 0.43268847465515137, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0459, | |
| "step": 87800 | |
| }, | |
| { | |
| "epoch": 0.000879, | |
| "grad_norm": 0.5015890002250671, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0449, | |
| "step": 87900 | |
| }, | |
| { | |
| "epoch": 0.00088, | |
| "grad_norm": 0.445121705532074, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0457, | |
| "step": 88000 | |
| }, | |
| { | |
| "epoch": 0.000881, | |
| "grad_norm": 0.49214833974838257, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0459, | |
| "step": 88100 | |
| }, | |
| { | |
| "epoch": 0.000882, | |
| "grad_norm": 0.4444495141506195, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0455, | |
| "step": 88200 | |
| }, | |
| { | |
| "epoch": 0.000883, | |
| "grad_norm": 0.49876669049263, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0459, | |
| "step": 88300 | |
| }, | |
| { | |
| "epoch": 0.000884, | |
| "grad_norm": 0.5114990472793579, | |
| "learning_rate": 1e-05, | |
| "loss": 0.045, | |
| "step": 88400 | |
| }, | |
| { | |
| "epoch": 0.000885, | |
| "grad_norm": 0.48783600330352783, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0461, | |
| "step": 88500 | |
| }, | |
| { | |
| "epoch": 0.000886, | |
| "grad_norm": 0.45137009024620056, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0451, | |
| "step": 88600 | |
| }, | |
| { | |
| "epoch": 0.000887, | |
| "grad_norm": 0.5109623074531555, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0453, | |
| "step": 88700 | |
| }, | |
| { | |
| "epoch": 0.000888, | |
| "grad_norm": 0.57321697473526, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0458, | |
| "step": 88800 | |
| }, | |
| { | |
| "epoch": 0.000889, | |
| "grad_norm": 0.4072723686695099, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0446, | |
| "step": 88900 | |
| }, | |
| { | |
| "epoch": 0.00089, | |
| "grad_norm": 0.5093070268630981, | |
| "learning_rate": 1e-05, | |
| "loss": 0.045, | |
| "step": 89000 | |
| }, | |
| { | |
| "epoch": 0.000891, | |
| "grad_norm": 0.5923020839691162, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0444, | |
| "step": 89100 | |
| }, | |
| { | |
| "epoch": 0.000892, | |
| "grad_norm": 0.4343903958797455, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0445, | |
| "step": 89200 | |
| }, | |
| { | |
| "epoch": 0.000893, | |
| "grad_norm": 0.6024598479270935, | |
| "learning_rate": 1e-05, | |
| "loss": 0.045, | |
| "step": 89300 | |
| }, | |
| { | |
| "epoch": 0.000894, | |
| "grad_norm": 0.5708175301551819, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0445, | |
| "step": 89400 | |
| }, | |
| { | |
| "epoch": 0.000895, | |
| "grad_norm": 0.42085763812065125, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0448, | |
| "step": 89500 | |
| }, | |
| { | |
| "epoch": 0.000896, | |
| "grad_norm": 0.4565168023109436, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0448, | |
| "step": 89600 | |
| }, | |
| { | |
| "epoch": 0.000897, | |
| "grad_norm": 0.4638221561908722, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0455, | |
| "step": 89700 | |
| }, | |
| { | |
| "epoch": 0.000898, | |
| "grad_norm": 0.3921230435371399, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0452, | |
| "step": 89800 | |
| }, | |
| { | |
| "epoch": 0.000899, | |
| "grad_norm": 0.5701455473899841, | |
| "learning_rate": 1e-05, | |
| "loss": 0.045, | |
| "step": 89900 | |
| }, | |
| { | |
| "epoch": 0.0009, | |
| "grad_norm": 0.5132615566253662, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0458, | |
| "step": 90000 | |
| }, | |
| { | |
| "epoch": 0.000901, | |
| "grad_norm": 0.43130597472190857, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0449, | |
| "step": 90100 | |
| }, | |
| { | |
| "epoch": 0.000902, | |
| "grad_norm": 0.4558640718460083, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0446, | |
| "step": 90200 | |
| }, | |
| { | |
| "epoch": 0.000903, | |
| "grad_norm": 0.4325823485851288, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0449, | |
| "step": 90300 | |
| }, | |
| { | |
| "epoch": 0.000904, | |
| "grad_norm": 0.5899006724357605, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0446, | |
| "step": 90400 | |
| }, | |
| { | |
| "epoch": 0.000905, | |
| "grad_norm": 0.6101588010787964, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0455, | |
| "step": 90500 | |
| }, | |
| { | |
| "epoch": 0.000906, | |
| "grad_norm": 0.5354421138763428, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0447, | |
| "step": 90600 | |
| }, | |
| { | |
| "epoch": 0.000907, | |
| "grad_norm": 0.4496416449546814, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0444, | |
| "step": 90700 | |
| }, | |
| { | |
| "epoch": 0.000908, | |
| "grad_norm": 0.40793660283088684, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0447, | |
| "step": 90800 | |
| }, | |
| { | |
| "epoch": 0.000909, | |
| "grad_norm": 0.5534836053848267, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0444, | |
| "step": 90900 | |
| }, | |
| { | |
| "epoch": 0.00091, | |
| "grad_norm": 0.4275030493736267, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0448, | |
| "step": 91000 | |
| }, | |
| { | |
| "epoch": 0.000911, | |
| "grad_norm": 0.5632148385047913, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0443, | |
| "step": 91100 | |
| }, | |
| { | |
| "epoch": 0.000912, | |
| "grad_norm": 0.43501216173171997, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0439, | |
| "step": 91200 | |
| }, | |
| { | |
| "epoch": 0.000913, | |
| "grad_norm": 0.54071444272995, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0449, | |
| "step": 91300 | |
| }, | |
| { | |
| "epoch": 0.000914, | |
| "grad_norm": 0.40895435214042664, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0451, | |
| "step": 91400 | |
| }, | |
| { | |
| "epoch": 0.000915, | |
| "grad_norm": 0.495510995388031, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0441, | |
| "step": 91500 | |
| }, | |
| { | |
| "epoch": 0.000916, | |
| "grad_norm": 0.3936554789543152, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0437, | |
| "step": 91600 | |
| }, | |
| { | |
| "epoch": 0.000917, | |
| "grad_norm": 0.4443312883377075, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0443, | |
| "step": 91700 | |
| }, | |
| { | |
| "epoch": 0.000918, | |
| "grad_norm": 0.5269384384155273, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0442, | |
| "step": 91800 | |
| }, | |
| { | |
| "epoch": 0.000919, | |
| "grad_norm": 0.43092164397239685, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0442, | |
| "step": 91900 | |
| }, | |
| { | |
| "epoch": 0.00092, | |
| "grad_norm": 0.498935729265213, | |
| "learning_rate": 1e-05, | |
| "loss": 0.044, | |
| "step": 92000 | |
| }, | |
| { | |
| "epoch": 0.000921, | |
| "grad_norm": 0.4460262656211853, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0448, | |
| "step": 92100 | |
| }, | |
| { | |
| "epoch": 0.000922, | |
| "grad_norm": 0.4452255964279175, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0441, | |
| "step": 92200 | |
| }, | |
| { | |
| "epoch": 0.000923, | |
| "grad_norm": 0.5646675229072571, | |
| "learning_rate": 1e-05, | |
| "loss": 0.044, | |
| "step": 92300 | |
| }, | |
| { | |
| "epoch": 0.000924, | |
| "grad_norm": 0.5320536494255066, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0439, | |
| "step": 92400 | |
| }, | |
| { | |
| "epoch": 0.000925, | |
| "grad_norm": 0.4475862681865692, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0432, | |
| "step": 92500 | |
| }, | |
| { | |
| "epoch": 0.000926, | |
| "grad_norm": 0.42607611417770386, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0448, | |
| "step": 92600 | |
| }, | |
| { | |
| "epoch": 0.000927, | |
| "grad_norm": 0.465669721364975, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0447, | |
| "step": 92700 | |
| }, | |
| { | |
| "epoch": 0.000928, | |
| "grad_norm": 0.47202736139297485, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0449, | |
| "step": 92800 | |
| }, | |
| { | |
| "epoch": 0.000929, | |
| "grad_norm": 0.45119792222976685, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0443, | |
| "step": 92900 | |
| }, | |
| { | |
| "epoch": 0.00093, | |
| "grad_norm": 0.4515833258628845, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0446, | |
| "step": 93000 | |
| }, | |
| { | |
| "epoch": 0.000931, | |
| "grad_norm": 0.43587127327919006, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0436, | |
| "step": 93100 | |
| }, | |
| { | |
| "epoch": 0.000932, | |
| "grad_norm": 0.4407802224159241, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0446, | |
| "step": 93200 | |
| }, | |
| { | |
| "epoch": 0.000933, | |
| "grad_norm": 0.4792422950267792, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0439, | |
| "step": 93300 | |
| }, | |
| { | |
| "epoch": 0.000934, | |
| "grad_norm": 0.5214342474937439, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0438, | |
| "step": 93400 | |
| }, | |
| { | |
| "epoch": 0.000935, | |
| "grad_norm": 0.5573062300682068, | |
| "learning_rate": 1e-05, | |
| "loss": 0.044, | |
| "step": 93500 | |
| }, | |
| { | |
| "epoch": 0.000936, | |
| "grad_norm": 0.5918563008308411, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0436, | |
| "step": 93600 | |
| }, | |
| { | |
| "epoch": 0.000937, | |
| "grad_norm": 0.48166489601135254, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0442, | |
| "step": 93700 | |
| }, | |
| { | |
| "epoch": 0.000938, | |
| "grad_norm": 0.4840247631072998, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0438, | |
| "step": 93800 | |
| }, | |
| { | |
| "epoch": 0.000939, | |
| "grad_norm": 0.44477516412734985, | |
| "learning_rate": 1e-05, | |
| "loss": 0.044, | |
| "step": 93900 | |
| }, | |
| { | |
| "epoch": 0.00094, | |
| "grad_norm": 0.5108721256256104, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0434, | |
| "step": 94000 | |
| }, | |
| { | |
| "epoch": 0.000941, | |
| "grad_norm": 0.5947906970977783, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0441, | |
| "step": 94100 | |
| }, | |
| { | |
| "epoch": 0.000942, | |
| "grad_norm": 0.4325408637523651, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0434, | |
| "step": 94200 | |
| }, | |
| { | |
| "epoch": 0.000943, | |
| "grad_norm": 0.5207073092460632, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0432, | |
| "step": 94300 | |
| }, | |
| { | |
| "epoch": 0.000944, | |
| "grad_norm": 0.4852275848388672, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0439, | |
| "step": 94400 | |
| }, | |
| { | |
| "epoch": 0.000945, | |
| "grad_norm": 0.5342420339584351, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0436, | |
| "step": 94500 | |
| }, | |
| { | |
| "epoch": 0.000946, | |
| "grad_norm": 0.6544240713119507, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0436, | |
| "step": 94600 | |
| }, | |
| { | |
| "epoch": 0.000947, | |
| "grad_norm": 0.456338107585907, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0437, | |
| "step": 94700 | |
| }, | |
| { | |
| "epoch": 0.000948, | |
| "grad_norm": 0.51591956615448, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0429, | |
| "step": 94800 | |
| }, | |
| { | |
| "epoch": 0.000949, | |
| "grad_norm": 0.5521871447563171, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0437, | |
| "step": 94900 | |
| }, | |
| { | |
| "epoch": 0.00095, | |
| "grad_norm": 0.46055886149406433, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0433, | |
| "step": 95000 | |
| }, | |
| { | |
| "epoch": 0.000951, | |
| "grad_norm": 0.5128651261329651, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0441, | |
| "step": 95100 | |
| }, | |
| { | |
| "epoch": 0.000952, | |
| "grad_norm": 0.5421969294548035, | |
| "learning_rate": 1e-05, | |
| "loss": 0.044, | |
| "step": 95200 | |
| }, | |
| { | |
| "epoch": 0.000953, | |
| "grad_norm": 0.4281409680843353, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0439, | |
| "step": 95300 | |
| }, | |
| { | |
| "epoch": 0.000954, | |
| "grad_norm": 0.3867093622684479, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0439, | |
| "step": 95400 | |
| }, | |
| { | |
| "epoch": 0.000955, | |
| "grad_norm": 0.39425021409988403, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0434, | |
| "step": 95500 | |
| }, | |
| { | |
| "epoch": 0.000956, | |
| "grad_norm": 0.45868080854415894, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0435, | |
| "step": 95600 | |
| }, | |
| { | |
| "epoch": 0.000957, | |
| "grad_norm": 0.38381725549697876, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0434, | |
| "step": 95700 | |
| }, | |
| { | |
| "epoch": 0.000958, | |
| "grad_norm": 0.5100952386856079, | |
| "learning_rate": 1e-05, | |
| "loss": 0.043, | |
| "step": 95800 | |
| }, | |
| { | |
| "epoch": 0.000959, | |
| "grad_norm": 0.45941147208213806, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0442, | |
| "step": 95900 | |
| }, | |
| { | |
| "epoch": 0.00096, | |
| "grad_norm": 0.3832944929599762, | |
| "learning_rate": 1e-05, | |
| "loss": 0.044, | |
| "step": 96000 | |
| }, | |
| { | |
| "epoch": 0.000961, | |
| "grad_norm": 0.3378923535346985, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0436, | |
| "step": 96100 | |
| }, | |
| { | |
| "epoch": 0.000962, | |
| "grad_norm": 0.41457870602607727, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0435, | |
| "step": 96200 | |
| }, | |
| { | |
| "epoch": 0.000963, | |
| "grad_norm": 0.49303749203681946, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0426, | |
| "step": 96300 | |
| }, | |
| { | |
| "epoch": 0.000964, | |
| "grad_norm": 0.3703688383102417, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0436, | |
| "step": 96400 | |
| }, | |
| { | |
| "epoch": 0.000965, | |
| "grad_norm": 0.3742707371711731, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0431, | |
| "step": 96500 | |
| }, | |
| { | |
| "epoch": 0.000966, | |
| "grad_norm": 0.4352927505970001, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0427, | |
| "step": 96600 | |
| }, | |
| { | |
| "epoch": 0.000967, | |
| "grad_norm": 0.4979144334793091, | |
| "learning_rate": 1e-05, | |
| "loss": 0.044, | |
| "step": 96700 | |
| }, | |
| { | |
| "epoch": 0.000968, | |
| "grad_norm": 0.38628560304641724, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0434, | |
| "step": 96800 | |
| }, | |
| { | |
| "epoch": 0.000969, | |
| "grad_norm": 0.5488578677177429, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0442, | |
| "step": 96900 | |
| }, | |
| { | |
| "epoch": 0.00097, | |
| "grad_norm": 0.3385869264602661, | |
| "learning_rate": 1e-05, | |
| "loss": 0.043, | |
| "step": 97000 | |
| }, | |
| { | |
| "epoch": 0.000971, | |
| "grad_norm": 0.3328537046909332, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0434, | |
| "step": 97100 | |
| }, | |
| { | |
| "epoch": 0.000972, | |
| "grad_norm": 0.3595049977302551, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0435, | |
| "step": 97200 | |
| }, | |
| { | |
| "epoch": 0.000973, | |
| "grad_norm": 0.4202601909637451, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0433, | |
| "step": 97300 | |
| }, | |
| { | |
| "epoch": 0.000974, | |
| "grad_norm": 0.47522690892219543, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0428, | |
| "step": 97400 | |
| }, | |
| { | |
| "epoch": 0.000975, | |
| "grad_norm": 0.4936007857322693, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0429, | |
| "step": 97500 | |
| }, | |
| { | |
| "epoch": 0.000976, | |
| "grad_norm": 0.40649285912513733, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0433, | |
| "step": 97600 | |
| }, | |
| { | |
| "epoch": 0.000977, | |
| "grad_norm": 0.4392286241054535, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0429, | |
| "step": 97700 | |
| }, | |
| { | |
| "epoch": 0.000978, | |
| "grad_norm": 0.38572990894317627, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0435, | |
| "step": 97800 | |
| }, | |
| { | |
| "epoch": 0.000979, | |
| "grad_norm": 0.5374602675437927, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0428, | |
| "step": 97900 | |
| }, | |
| { | |
| "epoch": 0.00098, | |
| "grad_norm": 0.4686330258846283, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0431, | |
| "step": 98000 | |
| }, | |
| { | |
| "epoch": 0.000981, | |
| "grad_norm": 0.44734638929367065, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0424, | |
| "step": 98100 | |
| }, | |
| { | |
| "epoch": 0.000982, | |
| "grad_norm": 0.47658222913742065, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0439, | |
| "step": 98200 | |
| }, | |
| { | |
| "epoch": 0.000983, | |
| "grad_norm": 0.73811274766922, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0428, | |
| "step": 98300 | |
| }, | |
| { | |
| "epoch": 0.000984, | |
| "grad_norm": 0.4593341648578644, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0429, | |
| "step": 98400 | |
| }, | |
| { | |
| "epoch": 0.000985, | |
| "grad_norm": 0.4732546806335449, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0433, | |
| "step": 98500 | |
| }, | |
| { | |
| "epoch": 0.000986, | |
| "grad_norm": 0.37035250663757324, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0424, | |
| "step": 98600 | |
| }, | |
| { | |
| "epoch": 0.000987, | |
| "grad_norm": 0.47103026509284973, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0435, | |
| "step": 98700 | |
| }, | |
| { | |
| "epoch": 0.000988, | |
| "grad_norm": 0.47766396403312683, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0431, | |
| "step": 98800 | |
| }, | |
| { | |
| "epoch": 0.000989, | |
| "grad_norm": 0.44070738554000854, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0431, | |
| "step": 98900 | |
| }, | |
| { | |
| "epoch": 0.00099, | |
| "grad_norm": 0.44191232323646545, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0429, | |
| "step": 99000 | |
| }, | |
| { | |
| "epoch": 0.000991, | |
| "grad_norm": 0.4926696717739105, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0426, | |
| "step": 99100 | |
| }, | |
| { | |
| "epoch": 0.000992, | |
| "grad_norm": 0.3758436143398285, | |
| "learning_rate": 1e-05, | |
| "loss": 0.042, | |
| "step": 99200 | |
| }, | |
| { | |
| "epoch": 0.000993, | |
| "grad_norm": 0.4165551960468292, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0435, | |
| "step": 99300 | |
| }, | |
| { | |
| "epoch": 0.000994, | |
| "grad_norm": 0.4664058983325958, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0436, | |
| "step": 99400 | |
| }, | |
| { | |
| "epoch": 0.000995, | |
| "grad_norm": 0.5242469906806946, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0431, | |
| "step": 99500 | |
| }, | |
| { | |
| "epoch": 0.000996, | |
| "grad_norm": 0.5722303986549377, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0433, | |
| "step": 99600 | |
| }, | |
| { | |
| "epoch": 0.000997, | |
| "grad_norm": 0.4828585684299469, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0425, | |
| "step": 99700 | |
| }, | |
| { | |
| "epoch": 0.000998, | |
| "grad_norm": 0.46811702847480774, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0429, | |
| "step": 99800 | |
| }, | |
| { | |
| "epoch": 0.000999, | |
| "grad_norm": 0.379393070936203, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0432, | |
| "step": 99900 | |
| }, | |
| { | |
| "epoch": 0.001, | |
| "grad_norm": 0.5672951340675354, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0435, | |
| "step": 100000 | |
| }, | |
| { | |
| "epoch": 0.001, | |
| "eval_loss": 0.035888671875, | |
| "eval_runtime": 147.9294, | |
| "eval_samples_per_second": 337.999, | |
| "eval_steps_per_second": 21.125, | |
| "step": 100000 | |
| } | |
| ], | |
| "logging_steps": 100, | |
| "max_steps": 100000000, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 9223372036854775807, | |
| "save_steps": 20000, | |
| "stateful_callbacks": { | |
| "EarlyStoppingCallback": { | |
| "args": { | |
| "early_stopping_patience": 200, | |
| "early_stopping_threshold": 0.0 | |
| }, | |
| "attributes": { | |
| "early_stopping_patience_counter": 0 | |
| } | |
| }, | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.951156666368e+18, | |
| "train_batch_size": 128, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |